Hi Chris, this is pretty simple to replicate. I'm pretty sure this gets called to restart all OSDs on each host: https://github.com/openstack/charm-ceph-osd/blob/55720fa087f3ddaddbd761d24c2ceb1ef72d70d3/lib/charms_ceph/utils.py#L2699
# bundle.yaml relations: - - ceph-osd:mon - ceph-mon:osd series: focal applications: ceph-mon: charm: cs:ceph-mon-73 num_units: 3 constraints: tags=ceph-mon bindings: "": site2-oam public: site2-ceph-public options: monitor-count: 3 source: cloud:focal-xena ceph-public-network: "172.17.104.0/23" ceph-osd: charm: cs:ceph-osd-513 num_units: 3 constraints: tags=ceph-osd options: osd-devices: "/dev/sda /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf" #osd-devices: "/dev/sda /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf /dev/sdg /dev/sdh /dev/sdi /dev/sdj /dev/sdk /dev/sdl /dev/sdm" source: cloud:focal-xena ceph-public-network: "172.17.104.0/23" ceph-cluster-network: "172.17.106.0/23" bluestore-db: "/dev/nvme0n1" bluestore-block-db-size: "266000000000" bindings: "": site2-oam public: site2-ceph-public cluster: site2-ceph-cluster
# Steps juju deploy ./bundle.yaml wait for deploy to finish and ceph health_ok juju config ceph-osd osd-devices="/dev/sda /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf /dev/sdg"
# Ceph monitor logs showing all OSDs reported down per host after changing osd-devices config flag root@hcc-admin24:/var/log/ceph# tail -f ceph.log | grep down 2022-04-04T13:05:37.800286+0000 mon.hcc-admin19 (mon.0) 1334 : cluster [WRN] Health check failed: 6 osds down (OSD_DOWN) 2022-04-04T13:05:37.800340+0000 mon.hcc-admin19 (mon.0) 1335 : cluster [WRN] Health check failed: 1 host (6 osds) down (OSD_HOST_DOWN) 2022-04-04T13:05:41.834425+0000 mon.hcc-admin19 (mon.0) 1359 : cluster [INF] Health check cleared: OSD_DOWN (was: 6 osds down) 2022-04-04T13:05:41.834483+0000 mon.hcc-admin19 (mon.0) 1360 : cluster [INF] Health check cleared: OSD_HOST_DOWN (was: 1 host (6 osds) down) 2022-04-04T13:06:04.373403+0000 mon.hcc-admin19 (mon.0) 1622 : cluster [WRN] Health check failed: 5 osds down (OSD_DOWN) 2022-04-04T13:06:04.373456+0000 mon.hcc-admin19 (mon.0) 1623 : cluster [WRN] Health check failed: 1 host (5 osds) down (OSD_HOST_DOWN) 2022-04-04T13:06:08.416227+0000 mon.hcc-admin19 (mon.0) 1648 : cluster [INF] Health check cleared: OSD_HOST_DOWN (was: 1 host (5 osds) down) 2022-04-04T13:08:55.647011+0000 mon.hcc-admin19 (mon.0) 2013 : cluster [WRN] Health check failed: 5 osds down (OSD_DOWN) 2022-04-04T13:08:55.647064+0000 mon.hcc-admin19 (mon.0) 2014 : cluster [WRN] Health check failed: 1 host (5 osds) down (OSD_HOST_DOWN) 2022-04-04T13:08:59.694032+0000 mon.hcc-admin19 (mon.0) 2038 : cluster [INF] Health check cleared: OSD_DOWN (was: 5 osds down) 2022-04-04T13:08:59.694076+0000 mon.hcc-admin19 (mon.0) 2039 : cluster [INF] Health check cleared: OSD_HOST_DOWN (was: 1 host (5 osds) down)
# Example of a few ceph OSD logs showing they all got restarted at same time root@hcc-store36:~# journalctl -u ceph-osd@3 -- Logs begin at Mon 2022-04-04 12:56:29 UTC, end at Mon 2022-04-04 13:11:27 UTC. -- Apr 04 13:01:36 hcc-store36 systemd[1]: Starting Ceph object storage daemon osd.3... Apr 04 13:01:36 hcc-store36 systemd[1]: Started Ceph object storage daemon osd.3. Apr 04 13:01:39 hcc-store36 ceph-osd[19395]: 2022-04-04T13:01:39.788+0000 7f169a26c2c0 -1 osd.3 0 log_to_monitors {default=true} Apr 04 13:01:41 hcc-store36 ceph-osd[19395]: 2022-04-04T13:01:41.416+0000 7f16877fe700 -1 osd.3 0 waiting for initial osdmap Apr 04 13:01:41 hcc-store36 ceph-osd[19395]: 2022-04-04T13:01:41.452+0000 7f16857fa700 -1 osd.3 24 set_numa_affinity unable to identify public interface '' numa node: (2) No such file or directory Apr 04 13:06:03 hcc-store36 ceph-osd[19395]: 2022-04-04T13:06:03.400+0000 7f1695ea9700 -1 received signal: Terminated from /sbin/init (PID: 1) UID: 0 Apr 04 13:06:03 hcc-store36 ceph-osd[19395]: 2022-04-04T13:06:03.400+0000 7f1695ea9700 -1 osd.3 118 *** Got signal Terminated *** Apr 04 13:06:03 hcc-store36 ceph-osd[19395]: 2022-04-04T13:06:03.400+0000 7f1695ea9700 -1 osd.3 118 *** Immediate shutdown (osd_fast_shutdown=true) *** Apr 04 13:06:03 hcc-store36 systemd[1]: Stopping Ceph object storage daemon osd.3... Apr 04 13:06:03 hcc-store36 systemd[1]: ceph-osd@3.service: Succeeded. Apr 04 13:06:03 hcc-store36 systemd[1]: Stopped Ceph object storage daemon osd.3. Apr 04 13:06:03 hcc-store36 systemd[1]: Starting Ceph object storage daemon osd.3... Apr 04 13:06:03 hcc-store36 systemd[1]: Started Ceph object storage daemon osd.3. Apr 04 13:06:07 hcc-store36 ceph-osd[32764]: 2022-04-04T13:06:07.156+0000 7fda19d8a2c0 -1 osd.3 118 log_to_monitors {default=true} Apr 04 13:06:07 hcc-store36 ceph-osd[32764]: 2022-04-04T13:06:07.428+0000 7fda04ff9700 -1 osd.3 118 set_numa_affinity unable to identify public interface '' numa node: (2) No such file or directory
root@hcc-store36:~# journalctl -u ceph-osd@6 -- Logs begin at Mon 2022-04-04 12:56:29 UTC, end at Mon 2022-04-04 13:11:27 UTC. -- Apr 04 13:02:00 hcc-store36 systemd[1]: Starting Ceph object storage daemon osd.6... Apr 04 13:02:00 hcc-store36 systemd[1]: Started Ceph object storage daemon osd.6. Apr 04 13:02:03 hcc-store36 ceph-osd[21448]: 2022-04-04T13:02:03.768+0000 7f05624722c0 -1 osd.6 0 log_to_monitors {default=true} Apr 04 13:02:05 hcc-store36 ceph-osd[21448]: 2022-04-04T13:02:05.616+0000 7f054ffff700 -1 osd.6 0 waiting for initial osdmap Apr 04 13:02:05 hcc-store36 ceph-osd[21448]: 2022-04-04T13:02:05.656+0000 7f054d7fa700 -1 osd.6 40 set_numa_affinity unable to identify public interface '' numa node: (2) No such file or directory Apr 04 13:06:03 hcc-store36 ceph-osd[21448]: 2022-04-04T13:06:03.400+0000 7f055e0af700 -1 received signal: Terminated from /sbin/init (PID: 1) UID: 0 Apr 04 13:06:03 hcc-store36 ceph-osd[21448]: 2022-04-04T13:06:03.400+0000 7f055e0af700 -1 osd.6 118 *** Got signal Terminated *** Apr 04 13:06:03 hcc-store36 ceph-osd[21448]: 2022-04-04T13:06:03.400+0000 7f055e0af700 -1 osd.6 118 *** Immediate shutdown (osd_fast_shutdown=true) *** Apr 04 13:06:03 hcc-store36 systemd[1]: Stopping Ceph object storage daemon osd.6... Apr 04 13:06:03 hcc-store36 systemd[1]: ceph-osd@6.service: Succeeded. Apr 04 13:06:03 hcc-store36 systemd[1]: Stopped Ceph object storage daemon osd.6. Apr 04 13:06:03 hcc-store36 systemd[1]: Starting Ceph object storage daemon osd.6... Apr 04 13:06:03 hcc-store36 systemd[1]: Started Ceph object storage daemon osd.6. Apr 04 13:06:07 hcc-store36 ceph-osd[32741]: 2022-04-04T13:06:07.084+0000 7fef467f32c0 -1 osd.6 118 log_to_monitors {default=true} Apr 04 13:06:07 hcc-store36 ceph-osd[32741]: 2022-04-04T13:06:07.428+0000 7fef397fa700 -1 osd.6 118 set_numa_affinity unable to identify public interface '' numa node: (2) No such file or directory
root@hcc-store36:~# journalctl -u ceph-osd@9 -- Logs begin at Mon 2022-04-04 12:56:29 UTC, end at Mon 2022-04-04 13:11:27 UTC. -- Apr 04 13:02:24 hcc-store36 systemd[1]: Starting Ceph object storage daemon osd.9... Apr 04 13:02:24 hcc-store36 systemd[1]: Started Ceph object storage daemon osd.9. Apr 04 13:02:27 hcc-store36 ceph-osd[23482]: 2022-04-04T13:02:27.928+0000 7fa11436e2c0 -1 osd.9 0 log_to_monitors {default=true} Apr 04 13:02:29 hcc-store36 ceph-osd[23482]: 2022-04-04T13:02:29.824+0000 7fa1097fa700 -1 osd.9 0 waiting for initial osdmap Apr 04 13:02:29 hcc-store36 ceph-osd[23482]: 2022-04-04T13:02:29.868+0000 7fa0faffd700 -1 osd.9 59 set_numa_affinity unable to identify public interface '' numa node: (2) No such file or directory Apr 04 13:06:03 hcc-store36 ceph-osd[23482]: 2022-04-04T13:06:03.400+0000 7fa10b7fe700 -1 received signal: Terminated from /sbin/init (PID: 1) UID: 0 Apr 04 13:06:03 hcc-store36 ceph-osd[23482]: 2022-04-04T13:06:03.400+0000 7fa10b7fe700 -1 osd.9 118 *** Got signal Terminated *** Apr 04 13:06:03 hcc-store36 ceph-osd[23482]: 2022-04-04T13:06:03.400+0000 7fa10b7fe700 -1 osd.9 118 *** Immediate shutdown (osd_fast_shutdown=true) *** Apr 04 13:06:03 hcc-store36 systemd[1]: Stopping Ceph object storage daemon osd.9... Apr 04 13:06:03 hcc-store36 systemd[1]: ceph-osd@9.service: Succeeded. Apr 04 13:06:03 hcc-store36 systemd[1]: Stopped Ceph object storage daemon osd.9. Apr 04 13:06:03 hcc-store36 systemd[1]: Starting Ceph object storage daemon osd.9... Apr 04 13:06:03 hcc-store36 systemd[1]: Started Ceph object storage daemon osd.9. Apr 04 13:06:07 hcc-store36 ceph-osd[32732]: 2022-04-04T13:06:07.064+0000 7fbb82a792c0 -1 osd.9 118 log_to_monitors {default=true} Apr 04 13:06:07 hcc-store36 ceph-osd[32732]: 2022-04-04T13:06:07.424+0000 7fbb757fa700 -1 osd.9 118 set_numa_affinity unable to identify public interface '' numa node: (2) No such file or directory
Hi Chris, this is pretty simple to replicate. I'm pretty sure this gets called to restart all OSDs on each host: https://github.com/openstack/charm-ceph-osd/blob/55720fa087f3ddaddbd761d24c2ceb1ef72d70d3/lib/charms_ceph/utils.py#L2699
# bundle.yaml
relations:
- - ceph-osd:mon
  - ceph-mon:osd
series: focal
applications:
  ceph-mon:
    charm: cs:ceph-mon-73
    num_units: 3
    constraints: tags=ceph-mon
    bindings:
      "": site2-oam
      public: site2-ceph-public
    options:
      monitor-count: 3
      source: cloud:focal-xena
      ceph-public-network: "172.17.104.0/23"
  ceph-osd:
    charm: cs:ceph-osd-513
    num_units: 3
    constraints: tags=ceph-osd
    options:
      osd-devices: "/dev/sda /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf"
      #osd-devices: "/dev/sda /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf /dev/sdg /dev/sdh /dev/sdi /dev/sdj /dev/sdk /dev/sdl /dev/sdm"
      source: cloud:focal-xena
      ceph-public-network: "172.17.104.0/23"
      ceph-cluster-network: "172.17.106.0/23"
      bluestore-db: "/dev/nvme0n1"
      bluestore-block-db-size: "266000000000"
    bindings:
      "": site2-oam
      public: site2-ceph-public
      cluster: site2-ceph-cluster
# Steps
juju deploy ./bundle.yaml
wait for deploy to finish and ceph health_ok
juju config ceph-osd osd-devices="/dev/sda /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf /dev/sdg"
# Ceph monitor logs showing all OSDs reported down per host after changing osd-devices config flag
root@hcc-admin24:/var/log/ceph# tail -f ceph.log | grep down
2022-04-04T13:05:37.800286+0000 mon.hcc-admin19 (mon.0) 1334 : cluster [WRN] Health check failed: 6 osds down (OSD_DOWN)
2022-04-04T13:05:37.800340+0000 mon.hcc-admin19 (mon.0) 1335 : cluster [WRN] Health check failed: 1 host (6 osds) down (OSD_HOST_DOWN)
2022-04-04T13:05:41.834425+0000 mon.hcc-admin19 (mon.0) 1359 : cluster [INF] Health check cleared: OSD_DOWN (was: 6 osds down)
2022-04-04T13:05:41.834483+0000 mon.hcc-admin19 (mon.0) 1360 : cluster [INF] Health check cleared: OSD_HOST_DOWN (was: 1 host (6 osds) down)
2022-04-04T13:06:04.373403+0000 mon.hcc-admin19 (mon.0) 1622 : cluster [WRN] Health check failed: 5 osds down (OSD_DOWN)
2022-04-04T13:06:04.373456+0000 mon.hcc-admin19 (mon.0) 1623 : cluster [WRN] Health check failed: 1 host (5 osds) down (OSD_HOST_DOWN)
2022-04-04T13:06:08.416227+0000 mon.hcc-admin19 (mon.0) 1648 : cluster [INF] Health check cleared: OSD_HOST_DOWN (was: 1 host (5 osds) down)
2022-04-04T13:08:55.647011+0000 mon.hcc-admin19 (mon.0) 2013 : cluster [WRN] Health check failed: 5 osds down (OSD_DOWN)
2022-04-04T13:08:55.647064+0000 mon.hcc-admin19 (mon.0) 2014 : cluster [WRN] Health check failed: 1 host (5 osds) down (OSD_HOST_DOWN)
2022-04-04T13:08:59.694032+0000 mon.hcc-admin19 (mon.0) 2038 : cluster [INF] Health check cleared: OSD_DOWN (was: 5 osds down)
2022-04-04T13:08:59.694076+0000 mon.hcc-admin19 (mon.0) 2039 : cluster [INF] Health check cleared: OSD_HOST_DOWN (was: 1 host (5 osds) down)
# Example of a few ceph OSD logs showing they all got restarted at same time
root@hcc-store36:~# journalctl -u ceph-osd@3
-- Logs begin at Mon 2022-04-04 12:56:29 UTC, end at Mon 2022-04-04 13:11:27 UTC. --
Apr 04 13:01:36 hcc-store36 systemd[1]: Starting Ceph object storage daemon osd.3...
Apr 04 13:01:36 hcc-store36 systemd[1]: Started Ceph object storage daemon osd.3.
Apr 04 13:01:39 hcc-store36 ceph-osd[19395]: 2022-04-04T13:01:39.788+0000 7f169a26c2c0 -1 osd.3 0 log_to_monitors {default=true}
Apr 04 13:01:41 hcc-store36 ceph-osd[19395]: 2022-04-04T13:01:41.416+0000 7f16877fe700 -1 osd.3 0 waiting for initial osdmap
Apr 04 13:01:41 hcc-store36 ceph-osd[19395]: 2022-04-04T13:01:41.452+0000 7f16857fa700 -1 osd.3 24 set_numa_affinity unable to identify public interface '' numa node: (2) No such file or directory
Apr 04 13:06:03 hcc-store36 ceph-osd[19395]: 2022-04-04T13:06:03.400+0000 7f1695ea9700 -1 received signal: Terminated from /sbin/init (PID: 1) UID: 0
Apr 04 13:06:03 hcc-store36 ceph-osd[19395]: 2022-04-04T13:06:03.400+0000 7f1695ea9700 -1 osd.3 118 *** Got signal Terminated ***
Apr 04 13:06:03 hcc-store36 ceph-osd[19395]: 2022-04-04T13:06:03.400+0000 7f1695ea9700 -1 osd.3 118 *** Immediate shutdown (osd_fast_shutdown=true) ***
Apr 04 13:06:03 hcc-store36 systemd[1]: Stopping Ceph object storage daemon osd.3...
Apr 04 13:06:03 hcc-store36 systemd[1]: ceph-osd@3.service: Succeeded.
Apr 04 13:06:03 hcc-store36 systemd[1]: Stopped Ceph object storage daemon osd.3.
Apr 04 13:06:03 hcc-store36 systemd[1]: Starting Ceph object storage daemon osd.3...
Apr 04 13:06:03 hcc-store36 systemd[1]: Started Ceph object storage daemon osd.3.
Apr 04 13:06:07 hcc-store36 ceph-osd[32764]: 2022-04-04T13:06:07.156+0000 7fda19d8a2c0 -1 osd.3 118 log_to_monitors {default=true}
Apr 04 13:06:07 hcc-store36 ceph-osd[32764]: 2022-04-04T13:06:07.428+0000 7fda04ff9700 -1 osd.3 118 set_numa_affinity unable to identify public interface '' numa node: (2) No such file or directory
root@hcc-store36:~# journalctl -u ceph-osd@6
-- Logs begin at Mon 2022-04-04 12:56:29 UTC, end at Mon 2022-04-04 13:11:27 UTC. --
Apr 04 13:02:00 hcc-store36 systemd[1]: Starting Ceph object storage daemon osd.6...
Apr 04 13:02:00 hcc-store36 systemd[1]: Started Ceph object storage daemon osd.6.
Apr 04 13:02:03 hcc-store36 ceph-osd[21448]: 2022-04-04T13:02:03.768+0000 7f05624722c0 -1 osd.6 0 log_to_monitors {default=true}
Apr 04 13:02:05 hcc-store36 ceph-osd[21448]: 2022-04-04T13:02:05.616+0000 7f054ffff700 -1 osd.6 0 waiting for initial osdmap
Apr 04 13:02:05 hcc-store36 ceph-osd[21448]: 2022-04-04T13:02:05.656+0000 7f054d7fa700 -1 osd.6 40 set_numa_affinity unable to identify public interface '' numa node: (2) No such file or directory
Apr 04 13:06:03 hcc-store36 ceph-osd[21448]: 2022-04-04T13:06:03.400+0000 7f055e0af700 -1 received signal: Terminated from /sbin/init (PID: 1) UID: 0
Apr 04 13:06:03 hcc-store36 ceph-osd[21448]: 2022-04-04T13:06:03.400+0000 7f055e0af700 -1 osd.6 118 *** Got signal Terminated ***
Apr 04 13:06:03 hcc-store36 ceph-osd[21448]: 2022-04-04T13:06:03.400+0000 7f055e0af700 -1 osd.6 118 *** Immediate shutdown (osd_fast_shutdown=true) ***
Apr 04 13:06:03 hcc-store36 systemd[1]: Stopping Ceph object storage daemon osd.6...
Apr 04 13:06:03 hcc-store36 systemd[1]: ceph-osd@6.service: Succeeded.
Apr 04 13:06:03 hcc-store36 systemd[1]: Stopped Ceph object storage daemon osd.6.
Apr 04 13:06:03 hcc-store36 systemd[1]: Starting Ceph object storage daemon osd.6...
Apr 04 13:06:03 hcc-store36 systemd[1]: Started Ceph object storage daemon osd.6.
Apr 04 13:06:07 hcc-store36 ceph-osd[32741]: 2022-04-04T13:06:07.084+0000 7fef467f32c0 -1 osd.6 118 log_to_monitors {default=true}
Apr 04 13:06:07 hcc-store36 ceph-osd[32741]: 2022-04-04T13:06:07.428+0000 7fef397fa700 -1 osd.6 118 set_numa_affinity unable to identify public interface '' numa node: (2) No such file or directory
root@hcc-store36:~# journalctl -u ceph-osd@9
-- Logs begin at Mon 2022-04-04 12:56:29 UTC, end at Mon 2022-04-04 13:11:27 UTC. --
Apr 04 13:02:24 hcc-store36 systemd[1]: Starting Ceph object storage daemon osd.9...
Apr 04 13:02:24 hcc-store36 systemd[1]: Started Ceph object storage daemon osd.9.
Apr 04 13:02:27 hcc-store36 ceph-osd[23482]: 2022-04-04T13:02:27.928+0000 7fa11436e2c0 -1 osd.9 0 log_to_monitors {default=true}
Apr 04 13:02:29 hcc-store36 ceph-osd[23482]: 2022-04-04T13:02:29.824+0000 7fa1097fa700 -1 osd.9 0 waiting for initial osdmap
Apr 04 13:02:29 hcc-store36 ceph-osd[23482]: 2022-04-04T13:02:29.868+0000 7fa0faffd700 -1 osd.9 59 set_numa_affinity unable to identify public interface '' numa node: (2) No such file or directory
Apr 04 13:06:03 hcc-store36 ceph-osd[23482]: 2022-04-04T13:06:03.400+0000 7fa10b7fe700 -1 received signal: Terminated from /sbin/init (PID: 1) UID: 0
Apr 04 13:06:03 hcc-store36 ceph-osd[23482]: 2022-04-04T13:06:03.400+0000 7fa10b7fe700 -1 osd.9 118 *** Got signal Terminated ***
Apr 04 13:06:03 hcc-store36 ceph-osd[23482]: 2022-04-04T13:06:03.400+0000 7fa10b7fe700 -1 osd.9 118 *** Immediate shutdown (osd_fast_shutdown=true) ***
Apr 04 13:06:03 hcc-store36 systemd[1]: Stopping Ceph object storage daemon osd.9...
Apr 04 13:06:03 hcc-store36 systemd[1]: ceph-osd@9.service: Succeeded.
Apr 04 13:06:03 hcc-store36 systemd[1]: Stopped Ceph object storage daemon osd.9.
Apr 04 13:06:03 hcc-store36 systemd[1]: Starting Ceph object storage daemon osd.9...
Apr 04 13:06:03 hcc-store36 systemd[1]: Started Ceph object storage daemon osd.9.
Apr 04 13:06:07 hcc-store36 ceph-osd[32732]: 2022-04-04T13:06:07.064+0000 7fbb82a792c0 -1 osd.9 118 log_to_monitors {default=true}
Apr 04 13:06:07 hcc-store36 ceph-osd[32732]: 2022-04-04T13:06:07.424+0000 7fbb757fa700 -1 osd.9 118 set_numa_affinity unable to identify public interface '' numa node: (2) No such file or directory