We seem to be missing log output in the failed node, in particular the error message from pg_basebackup. The code (taken from the live, failed unit - the correct code is being deployed) looks like this:
generate_pgpass()
if 'following' not in local_state: log("Fresh unit. I will clone {} and become a hot standby".format( master))
# Before we start destroying anything, ensure that the
# master is contactable. master_ip = hookenv.relation_get('private-address', master) wait_for_db(db='postgres', user='juju_replication', host=master_ip)
clone_database(master, master_ip)
[...]
try:
output = subprocess.check_output(cmd)
log(output, DEBUG)
# Debian by default expects SSL certificates in the datadir.
os.symlink( '/etc/ssl/certs/ssl-cert-snakeoil.pem', os.path.join(postgresql_cluster_dir, 'server.crt'))
os.symlink( '/etc/ssl/private/ssl-cert-snakeoil.key', os.path.join(postgresql_cluster_dir, 'server.key')) create_recovery_conf(master_host, local_state['replication_password'])
except subprocess.CalledProcessError, x:
# We failed, and this cluster is broken. Rebuild a
# working cluster so start/stop etc. works and we
# can retry hooks again. Even assuming the charm is
# functioning correctly, the clone may still fail
# due to eg. lack of disk space.
log("Clone failed, db cluster destroyed", ERROR) log(x.output, ERROR)
if os.path.exists(postgresql_cluster_dir): shutil.rmtree(postgresql_cluster_dir)
if os.path.exists(postgresql_config_dir): shutil.rmtree(postgresql_config_dir) run('pg_createcluster {} main'.format(version)) config_changed()
raise
finally: postgresql_start() wait_for_db()
The first part in the following quoted log is from the generate_pgpass() call. The rest of the log messages are missing up until the traceback is reported. The same log helper charmhelpers.core.hookenv.log() is being used everywhere:
2013-08-05 15:19:16 INFO juju server.go:105 worker/uniter/jujuc: running hook tool "juju-log" ["Writing file /var/lib/juju/agents/unit-postgresql-1/charm/pgpass postgres:postgres 400"]
2013-08-05 15:19:16 DEBUG juju server.go:106 worker/uniter/jujuc: hook context id "postgresql/1:replication-relation-changed:1008308750711802284"; dir "/var/lib/juju/agents/unit-postgresql-1/charm"
2013-08-05 15:19:16 INFO juju juju-log.go:64 postgresql/1 replication:0: Writing file /var/lib/juju/agents/unit-postgresql-1/charm/pgpass postgres:postgres 400
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK Traceback (most recent call last):
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK File "/var/lib/juju/agents/unit-postgresql-1/charm/hooks/replication-relation-changed", line 1860, in <module>
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK hooks.execute(sys.argv)
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK File "/var/lib/juju/agents/unit-postgresql-1/charm/hooks/charmhelpers/core/hookenv.py", line 317, in execute
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK self._hooks[hook_name]()
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK File "/var/lib/juju/agents/unit-postgresql-1/charm/hooks/replication-relation-changed", line 1503, in replication_relation_joined_changed
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK clone_database(master, master_ip)
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK File "/var/lib/juju/agents/unit-postgresql-1/charm/hooks/replication-relation-changed", line 1675, in clone_database
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK output = subprocess.check_output(cmd)
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK File "/usr/lib/python2.7/subprocess.py", line 544, in check_output
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK raise CalledProcessError(retcode, cmd, output=output)
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK subprocess.CalledProcessError: Command '['sudo', '-E', '-u', 'postgres', 'pg_basebackup', '-D', u'/var/lib/postgresql/9.1/main', '--xlog', '--checkpoint=fast', '--no-password', '-h', u'10.55.32.126', '-p', '5432', '--username=juju_replication']' returned non-zero exit status 1
2013-08-05 15:19:18 ERROR juju uniter.go:352 worker/uniter: hook failed: exit status 1
2013-08-05 15:19:18 DEBUG juju modes.go:407 worker/uniter: ModeAbide exiting
2013-08-05 15:19:18 INFO juju modes.go:405 worker/uniter: ModeHookError starting
2013-08-05 15:19:18 DEBUG juju filter.go:384 worker/uniter/filter: want resolved event
2013-08-05 15:19:18 DEBUG juju filter.go:378 worker/uniter/filter: want forced upgrade true
2013-08-05 15:19:18 DEBUG juju filter.go:482 worker/uniter/filter: no new charm event
We seem to be missing log output in the failed node, in particular the error message from pg_basebackup. The code (taken from the live, failed unit - the correct code is being deployed) looks like this:
if 'following' not in local_state:
log( "Fresh unit. I will clone {} and become a hot standby".format(
master) )
# Before we start destroying anything, ensure that the
master_ ip = hookenv. relation_ get('private- address' , master)
wait_ for_db( db='postgres' , user='juju_ replication' , host=master_ip)
# master is contactable.
[...]
try: check_output( cmd)
'/ etc/ssl/ certs/ssl- cert-snakeoil. pem',
os. path.join( postgresql_ cluster_ dir, 'server.crt'))
'/ etc/ssl/ private/ ssl-cert- snakeoil. key',
os. path.join( postgresql_ cluster_ dir, 'server.key'))
create_ recovery_ conf(master_ host, local_state[ 'replication_ password' ]) CalledProcessEr ror, x:
log(x. output, ERROR) exists( postgresql_ cluster_ dir):
shutil. rmtree( postgresql_ cluster_ dir) exists( postgresql_ config_ dir):
shutil. rmtree( postgresql_ config_ dir)
run('pg_ createcluster {} main'.format( version) )
config_ changed( )
postgresql_ start()
wait_for_ db()
output = subprocess.
log(output, DEBUG)
# Debian by default expects SSL certificates in the datadir.
os.symlink(
os.symlink(
except subprocess.
# We failed, and this cluster is broken. Rebuild a
# working cluster so start/stop etc. works and we
# can retry hooks again. Even assuming the charm is
# functioning correctly, the clone may still fail
# due to eg. lack of disk space.
log("Clone failed, db cluster destroyed", ERROR)
if os.path.
if os.path.
raise
finally:
The first part in the following quoted log is from the generate_pgpass() call. The rest of the log messages are missing up until the traceback is reported. The same log helper charmhelpers. core.hookenv. log() is being used everywhere:
2013-08-05 15:19:16 INFO juju server.go:105 worker/ uniter/ jujuc: running hook tool "juju-log" ["Writing file /var/lib/ juju/agents/ unit-postgresql -1/charm/ pgpass postgres:postgres 400"] uniter/ jujuc: hook context id "postgresql/ 1:replication- relation- changed: 100830875071180 2284"; dir "/var/lib/ juju/agents/ unit-postgresql -1/charm" juju/agents/ unit-postgresql -1/charm/ pgpass postgres:postgres 400 juju/agents/ unit-postgresql -1/charm/ hooks/replicati on-relation- changed" , line 1860, in <module> sys.argv) juju/agents/ unit-postgresql -1/charm/ hooks/charmhelp ers/core/ hookenv. py", line 317, in execute hook_name] () juju/agents/ unit-postgresql -1/charm/ hooks/replicati on-relation- changed" , line 1503, in replication_ relation_ joined_ changed master, master_ip) juju/agents/ unit-postgresql -1/charm/ hooks/replicati on-relation- changed" , line 1675, in clone_database check_output( cmd) python2. 7/subprocess. py", line 544, in check_output ror(retcode, cmd, output=output) CalledProcessEr ror: Command '['sudo', '-E', '-u', 'postgres', 'pg_basebackup', '-D', u'/var/ lib/postgresql/ 9.1/main' , '--xlog', '--checkpoint= fast', '--no-password', '-h', u'10.55.32.126', '-p', '5432', '--username= juju_replicatio n']' returned non-zero exit status 1 uniter/ filter: want resolved event uniter/ filter: want forced upgrade true uniter/ filter: no new charm event
2013-08-05 15:19:16 DEBUG juju server.go:106 worker/
2013-08-05 15:19:16 INFO juju juju-log.go:64 postgresql/1 replication:0: Writing file /var/lib/
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK Traceback (most recent call last):
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK File "/var/lib/
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK hooks.execute(
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK File "/var/lib/
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK self._hooks[
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK File "/var/lib/
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK clone_database(
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK File "/var/lib/
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK output = subprocess.
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK File "/usr/lib/
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK raise CalledProcessEr
2013-08-05 15:19:18 INFO juju context.go:235 worker/uniter: HOOK subprocess.
2013-08-05 15:19:18 ERROR juju uniter.go:352 worker/uniter: hook failed: exit status 1
2013-08-05 15:19:18 DEBUG juju modes.go:407 worker/uniter: ModeAbide exiting
2013-08-05 15:19:18 INFO juju modes.go:405 worker/uniter: ModeHookError starting
2013-08-05 15:19:18 DEBUG juju filter.go:384 worker/
2013-08-05 15:19:18 DEBUG juju filter.go:378 worker/
2013-08-05 15:19:18 DEBUG juju filter.go:482 worker/