worker/caasunitprovisioner/worker.go:
:121 bare channel send that could easily block forever
w, ok := appWorkers[appId]
if ok {
// Before stopping the application worker, inform it that
// the app is gone so it has a chance to clean up.
// The worker will act on the removed prior to processing the
// Stop() request.
p.appRemoved <- struct{}{}
if err := worker.Stop(w); err != nil {
	logger.Errorf("stopping application worker for %v: %v", appId, err)
}
delete(appWorkers, appId)
}
We are also sending on a channel that is being listened to by *all* application workers. There is no guarantee that the worker responsible for appId will be the one that actually handles it in 'application_worker.go:134'
case <-aw.appRemoved:
So if you ever have 2 applications running, stopping one of them will randomly kill one of them, since receiving that signal means that all units are forcibly destroyed.
What *should* happen if we are trying to tear down at the same time? It's entirely plausible that a goroutine would be hitting for _, appId := range apps at exactly the same time that application_worker notices that aw.catacomb.Dying().
And if the applicationWorker.loop() notices catacomb.Dying before the caasunitprovisioner worker decides it needs to delete that app, then caasunitprovisioner worker will block indefinitely.
*Something like*
delete p.appRemoved, it doesn't belong on the provisioner, and just look directly at the application worker details.
Either that, or get rid of appRemoved and *only* ever trigger on Worker.Stop()
--- a/worker/caasunitprovisioner/application_worker.go
+++ b/worker/caasunitprovisioner/application_worker.go
@@ -48,7 +48,7 @@ func newApplicationWorker(
	applicationUpdater ApplicationUpdater,
unitGetter UnitGetter,
unitUpdater UnitUpdater,
-) (worker.Worker, error) {
+) (*applicationWorker, error) {
w := &applicationWorker{
	application:      application,
	jujuManagedUnits: jujuManagedUnits,
diff --git a/worker/caasunitprovisioner/worker.go b/worker/caasunitprovisioner/worker.go
index 15f076dbf3..b661245259 100644
--- a/worker/caasunitprovisioner/worker.go
+++ b/worker/caasunitprovisioner/worker.go
@@ -73,8 +73,6 @@ func NewWorker(config Config) (worker.Worker, error) {
type provisioner struct {
catacomb catacomb.Catacomb
config Config
-
- appRemoved chan struct{}
}
// Kill is part of the worker.Worker interface.
@@ -98,8 +96,7 @@ func (p *provisioner) loop() error {
// The channel is unbuffered to that we block until
// requests are processed.
- p.appRemoved = make(chan struct{})
- appWorkers := make(map[string]worker.Worker)
+ appWorkers := make(map[string]*applicationWorker)
for {
	select {
	case <-p.catacomb.Dying():
@@ -124,7 +121,16 @@ func (p *provisioner) loop() error {
// the app is gone so it has a chance to clean up.
// The worker will act on the removed prior to processing the
// Stop() request.
- p.appRemoved <- struct{}{}
+ // We have to use a channel send here, rather than just closing the select, otherwise we
+ // effectively send the Stop() at the same time as the appRemoved signal.
+ // By sending a message, we block until it at least starts that routine
+ select {
+ case w.appRemoved <- struct{}{}:
+ case <-w.catacomb.Dying():
+ // If the catacomb is already dying, there is no guarantee that w.appRemoved will ever be
+ // seen. But we can still at least close the channel
+ close(w.appRemoved)
+ }
if err := worker.Stop(w); err != nil {
	logger.Errorf("stopping application worker for %v: %v", appId, err)
}
@@ -147,7 +153,7 @@ func (p *provisioner) loop() error {
jujuManagedUnits := cfg.GetBool(caas.JujuManagedUnits, false)
w, err := newApplicationWorker(
	appId,
- p.appRemoved,
+ make(chan struct{}),
jujuManagedUnits,
p.config.ServiceBroker,
p.config.ContainerBroker,
Digging through it, this is what I see:
worker/caasunitprovisioner/worker.go:
:121 bare channel send that could easily block forever
w, ok := appWorkers[appId]
if ok {
	// Before stopping the application worker, inform it that
	// the app is gone so it has a chance to clean up.
	// The worker will act on the removed prior to processing the
	// Stop() request.
	p.appRemoved <- struct{}{}
	if err := worker.Stop(w); err != nil {
		logger.Errorf("stopping application worker for %v: %v", appId, err)
	}
	delete(appWorkers, appId)
}
We are also sending on a channel that is being listened to by *all* application workers. There is no guarantee that the worker responsible for appId will be the one that actually handles it in 'application_worker.go:134'
case <-aw.appRemoved:
So if you ever have 2 applications running, stopping one of them will randomly kill one of them, since receiving that signal means that all units are forcibly destroyed.
What *should* happen if we are trying to tear down at the same time? It's entirely plausible that a goroutine would be hitting for _, appId := range apps at exactly the same time that application_worker notices that aw.catacomb.Dying().
And if the applicationWorker.loop() notices catacomb.Dying before the caasunitprovisioner worker decides it needs to delete that app, then the caasunitprovisioner worker will block indefinitely.
*Something like*
delete p.appRemoved, it doesn't belong on the provisioner, and just look directly at the application worker details.
Either that, or get rid of appRemoved and *only* ever trigger on Worker.Stop()
--- a/worker/caasunitprovisioner/application_worker.go
+++ b/worker/caasunitprovisioner/application_worker.go
@@ -48,7 +48,7 @@ func newApplicationWorker(
 	applicationUpdater ApplicationUpdater,
 	unitGetter UnitGetter,
 	unitUpdater UnitUpdater,
-) (worker.Worker, error) {
+) (*applicationWorker, error) {
 	w := &applicationWorker{
 		application:      application,
 		jujuManagedUnits: jujuManagedUnits,
diff --git a/worker/caasunitprovisioner/worker.go b/worker/caasunitprovisioner/worker.go
index 15f076dbf3..b661245259 100644
--- a/worker/caasunitprovisioner/worker.go
+++ b/worker/caasunitprovisioner/worker.go
@@ -73,8 +73,6 @@ func NewWorker(config Config) (worker.Worker, error) {
 type provisioner struct {
 	catacomb catacomb.Catacomb
 	config   Config
-
-	appRemoved chan struct{}
 }

 // Kill is part of the worker.Worker interface.
@@ -98,8 +96,7 @@ func (p *provisioner) loop() error {
 	// The channel is unbuffered to that we block until
 	// requests are processed.
-	p.appRemoved = make(chan struct{})
-	appWorkers := make(map[string]worker.Worker)
+	appWorkers := make(map[string]*applicationWorker)
 	for {
 		select {
 		case <-p.catacomb.Dying():
@@ -124,7 +121,16 @@ func (p *provisioner) loop() error {
 			// the app is gone so it has a chance to clean up.
 			// The worker will act on the removed prior to processing the
 			// Stop() request.
-			p.appRemoved <- struct{}{}
+			// We have to use a channel send here, rather than just closing the select, otherwise we
+			// effectively send the Stop() at the same time as the appRemoved signal.
+			// By sending a message, we block until it at least starts that routine
+			select {
+			case w.appRemoved <- struct{}{}:
+			case <-w.catacomb.Dying():
+				// If the catacomb is already dying, there is no guarantee that w.appRemoved will ever be
+				// seen. But we can still at least close the channel
+				close(w.appRemoved)
+			}
 			if err := worker.Stop(w); err != nil {
 				logger.Errorf("stopping application worker for %v: %v", appId, err)
 			}
@@ -147,7 +153,7 @@ func (p *provisioner) loop() error {
 		jujuManagedUnits := cfg.GetBool(caas.JujuManagedUnits, false)
 		w, err := newApplicationWorker(
 			appId,
-			p.appRemoved,
+			make(chan struct{}),
 			jujuManagedUnits,
 			p.config.ServiceBroker,
 			p.config.ContainerBroker,