Digging through it, this is what I see:

worker/caasunitprovisioner/worker.go:121 is a bare channel send that could easily block forever:

    w, ok := appWorkers[appId]
    if ok {
        // Before stopping the application worker, inform it that
        // the app is gone so it has a chance to clean up.
        // The worker will act on the removed prior to processing the
        // Stop() request.
        p.appRemoved <- struct{}{}
        if err := worker.Stop(w); err != nil {
            logger.Errorf("stopping application worker for %v: %v", appId, err)
        }
        delete(appWorkers, appId)
    }

We are also sending on a channel that is being listened to by *all* application workers. There is no guarantee that the worker responsible for appId will be the one that actually handles it, at application_worker.go:134:

    case <-aw.appRemoved:

So if you ever have 2 applications running, stopping one of them will kill one of them at random, since receiving that signal means that all of the worker's units are forcibly destroyed.

What *should* happen if we are trying to tear down at the same time? It's entirely plausible that a goroutine would be hitting "for _, appId := range apps" at exactly the same time that an application worker notices aw.catacomb.Dying(). And if applicationWorker.loop() notices catacomb.Dying before the caasunitprovisioner worker decides it needs to delete that app, then the caasunitprovisioner worker will block indefinitely.

*Something like* the following: delete p.appRemoved (it doesn't belong on the provisioner) and look directly at the application worker details. Either that, or get rid of appRemoved entirely and *only* ever trigger cleanup on Worker.Stop().

--- a/worker/caasunitprovisioner/application_worker.go
+++ b/worker/caasunitprovisioner/application_worker.go
@@ -48,7 +48,7 @@ func newApplicationWorker(
     applicationUpdater ApplicationUpdater,
     unitGetter UnitGetter,
     unitUpdater UnitUpdater,
-) (worker.Worker, error) {
+) (*applicationWorker, error) {
     w := &applicationWorker{
         application:      application,
         jujuManagedUnits: jujuManagedUnits,
diff --git a/worker/caasunitprovisioner/worker.go b/worker/caasunitprovisioner/worker.go
index 15f076dbf3..b661245259 100644
--- a/worker/caasunitprovisioner/worker.go
+++ b/worker/caasunitprovisioner/worker.go
@@ -73,8 +73,6 @@ func NewWorker(config Config) (worker.Worker, error) {
 type provisioner struct {
     catacomb catacomb.Catacomb
     config   Config
-
-    appRemoved chan struct{}
 }
 
 // Kill is part of the worker.Worker interface.
@@ -98,8 +96,7 @@ func (p *provisioner) loop() error {
     // The channel is unbuffered to that we block until
     // requests are processed.
-    p.appRemoved = make(chan struct{})
-    appWorkers := make(map[string]worker.Worker)
+    appWorkers := make(map[string]*applicationWorker)
     for {
         select {
         case <-p.catacomb.Dying():
@@ -124,7 +121,16 @@ func (p *provisioner) loop() error {
             // the app is gone so it has a chance to clean up.
             // The worker will act on the removed prior to processing the
             // Stop() request.
-            p.appRemoved <- struct{}{}
+            // We have to use a channel send here, rather than just closing the channel, otherwise we
+            // effectively send the Stop() at the same time as the appRemoved signal.
+            // By sending a message, we block until the worker at least starts that routine.
+            select {
+            case w.appRemoved <- struct{}{}:
+            case <-w.catacomb.Dying():
+                // If the catacomb is already dying, there is no guarantee that w.appRemoved will ever be
+                // seen. But we can still at least close the channel.
+                close(w.appRemoved)
+            }
             if err := worker.Stop(w); err != nil {
                 logger.Errorf("stopping application worker for %v: %v", appId, err)
             }
@@ -147,7 +153,7 @@ func (p *provisioner) loop() error {
         jujuManagedUnits := cfg.GetBool(caas.JujuManagedUnits, false)
         w, err := newApplicationWorker(
             appId,
-            p.appRemoved,
+            make(chan struct{}),
             jujuManagedUnits,
             p.config.ServiceBroker,
             p.config.ContainerBroker,
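
To make the delivery hazard concrete, here is a minimal standalone sketch (invented names, not Juju code) showing that a single send on a channel shared by several goroutines wakes an arbitrary one of them:

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        // One appRemoved channel shared by every "application worker",
        // mirroring the structure of the code quoted above.
        appRemoved := make(chan struct{})

        for _, app := range []string{"app-a", "app-b"} {
            app := app // capture the loop variable
            go func() {
                // Both workers block here; the runtime picks an
                // arbitrary receiver for each send.
                <-appRemoved
                fmt.Printf("%s: got appRemoved, force-destroying units\n", app)
            }()
        }

        // We only meant to remove app-a, but app-b may be the
        // goroutine that wakes up and destroys its units.
        appRemoved <- struct{}{}
        time.Sleep(100 * time.Millisecond)
    }

Run it a few times and either application can print the message, which is exactly why the patch above gives each applicationWorker its own channel.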