Comment 8 for bug 1866099

Revision history for this message
Tee Ngo (teewrs) wrote :

The bash shells belong to drbd demote processes, which were triggered by sm, e.g.:

Finding process name of pid 2665471 (a bash shell)
controller-0:~$ ps -efww|grep 2665471
root 2665471 103911 0 18:52 ? 00:00:18 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 4096011 2665471 0 19:39 ? 00:00:00 sleep 1
sysadmin 4096119 4080989 0 19:39 pts/13 00:00:00 grep --color=auto 2665471

Finding process name of pid 2665613 (another bash)
controller-0:~$ ps -efww|grep 2665613
root 2665613 103911 0 18:52 ? 00:00:18 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 4123722 2665613 0 19:40 ? 00:00:00 sleep 1
sysadmin 4123802 4080989 0 19:40 pts/13 00:00:00 grep --color=auto 2665613

Finding the parent of these drbd demote processes. By this time the parent (sm, pid 103911) had already issued a reboot:
controller-0:~$ ps -efww|grep 103911
root 103911 1 1 15:38 ? 00:04:20 /usr/bin/sm
root 103918 103911 0 15:38 ? 00:00:01 /usr/bin/sm
root 2665471 103911 0 18:52 ? 00:00:18 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 2665613 103911 0 18:52 ? 00:00:18 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 2665615 103911 0 18:52 ? 00:00:18 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 2665617 103911 0 18:52 ? 00:00:18 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 2665873 103911 0 18:52 ? 00:00:18 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 2670126 103911 0 18:53 ? 00:00:18 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 2670255 103911 0 18:53 ? 00:00:18 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 2670563 103911 0 18:53 ? 00:00:18 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 4114797 103911 0 19:39 ? 00:00:00 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 4114798 103911 0 19:39 ? 00:00:00 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 4114799 103911 0 19:39 ? 00:00:00 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 4114802 103911 0 19:39 ? 00:00:00 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 4114807 103911 0 19:39 ? 00:00:00 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 4114811 103911 0 19:39 ? 00:00:00 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 4114813 103911 0 19:39 ? 00:00:00 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 4114820 103911 0 19:39 ? 00:00:00 /bin/bash /usr/lib/ocf/resource.d/linbit/drbd demote
root 4127812 103911 0 19:40 ? 00:00:00 reboot
root 4132507 103911 0 19:40 ? 00:00:00 /bin/bash /etc/init.d/ceph-init-wrapper stop
root 4132518 103911 4 19:40 ? 00:00:00 /usr/bin/python /etc/init.d/mgr-restful-plugin stop
root 4132537 103911 0 19:40 ? 00:00:00 /bin/sh /etc/init.d/openldap stop
root 4133148 103911 0 19:40 ? 00:00:00 [IPaddr2] <defunct>

Below is the strace output (system call trace) of one of the drbd demote processes. Perhaps the content of /proc/drbd should be monitored for hints after issuing the host-swact command.

[pid 4133733] stat("/proc/drbd", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
[pid 4133733] brk(NULL) = 0xa86000
[pid 4133733] brk(0xaa9000) = 0xaa9000
[pid 4133733] brk(NULL) = 0xaa9000
[pid 4133733] socket(AF_NETLINK, SOCK_DGRAM, NETLINK_GENERIC) = 8
[pid 4133733] setsockopt(8, SOL_SOCKET, SO_SNDBUF, [2048], 4) = 0
[pid 4133733] setsockopt(8, SOL_SOCKET, SO_RCVBUF, [2048], 4) = 0
[pid 4133733] bind(8, {sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 0
[pid 4133733] getsockname(8, {sa_family=AF_NETLINK, pid=4133733, groups=00000000}, [12]) = 0
[pid 4133733] write(8, " \0\0\0\20\0\1\0\224\37`^e\23?\0\3\2\0\0\t\0\2\0drbd\0\0\0\0", 32) = 32
[pid 4133733] poll([{fd=8, events=POLLIN}], 1, 3000) = 1 ([{fd=8, revents=POLLIN}])
[pid 4133733] recvmsg(8, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{"\314\2\0\0\20\0\0\0\224\37`^e\23?\0\1\2\0\0\t\0\2\0drbd\0\0\0\0"..., 8192}], msg_controllen=0, msg_flags=0}, MSG_PEEK) = 716
[pid 4133733] poll([{fd=8, events=POLLIN}], 1, 3000) = 1 ([{fd=8, revents=POLLIN}])
[pid 4133733] recvmsg(8, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{"\314\2\0\0\20\0\0\0\224\37`^e\23?\0\1\2\0\0\t\0\2\0drbd\0\0\0\0"..., 8192}], msg_controllen=0, msg_flags=0}, 0) = 716
[pid 4133733] open("/var/lock/drbd-147-6", O_RDWR|O_CREAT, 0600) = 18
[pid 4133733] rt_sigaction(SIGALRM, {0x409a70, [], SA_RESTORER, 0x7fcdc18a9280}, {SIG_DFL, [], 0}, 8) = 0
[pid 4133733] alarm(1) = 0
[pid 4133733] fcntl(18, F_SETLKW, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = 0
[pid 4133733] alarm(0) = 1
[pid 4133733] rt_sigaction(SIGALRM, {SIG_DFL, [], SA_RESTORER, 0x7fcdc18a9280}, NULL, 8) = 0
[pid 4133733] write(8, "\34\0\0\0\34\0\1\0\225\37`^e\23?\0\17\1\0\0\6\0\0\0\0\0\0\0", 28) = 28
[pid 4133733] poll([{fd=8, events=POLLIN}], 1, 120000) = 1 ([{fd=8, revents=POLLIN}])
[pid 4133733] recvmsg(8, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{"\34\0\0\0\34\0\0\0\225\37`^e\23?\0\17\1\0\0\6\0\0\0\364\377\377\377", 8192}], msg_controllen=0, msg_flags=0}, MSG_PEEK) = 28
[pid 4133733] poll([{fd=8, events=POLLIN}], 1, 120000) = 1 ([{fd=8, revents=POLLIN}])
[pid 4133733] recvmsg(8, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{"\34\0\0\0\34\0\0\0\225\37`^e\23?\0\17\1\0\0\6\0\0\0\364\377\377\377", 8192}], msg_controllen=0, msg_flags=0}, 0) = 28
[pid 4133733] write(2, "6: State change failed: (-12) De"..., 61) = 61
[pid 4133733] close(18) = 0
[pid 4133733] exit_group(11) = ?
[pid 4133733] +++ exited with 11 +++