MySQL WSREP server fails to stop

Bug #1099742 reported by Vladimir Kuklin
8
This bug affects 1 person
Affects Status Importance Assigned to Milestone
MySQL patches by Codership
New
Undecided
Unassigned

Bug Description

Hi, guys

We are experiencing rather strange issue with mysql wsrep server. Sometimes when we run /etc/init.d/mysql stop it tries to stop but hangs due to unknown reason. The last message in log is:

130111 23:36:36 [Note] /usr/sbin/mysqld: Normal shutdown

130111 23:36:36 [Note] WSREP: rollbacker thread exiting
130111 23:36:36 [Note] Event Scheduler: Purging the queue. 0 events

strace -ttt -f -p `pidof mysqld` shows the following:

Process 4905 attached with 22 threads - interrupt to quit
[pid 4959] 1358242220.703215 futex(0x10550e4, FUTEX_WAIT_PRIVATE, 8, NULL <unfinished ...>
[pid 4928] 1358242220.703252 futex(0x21a358c, FUTEX_WAIT_PRIVATE, 6, NULL <unfinished ...>
[pid 4927] 1358242220.703265 futex(0x21a358c, FUTEX_WAIT_PRIVATE, 8, NULL <unfinished ...>
[pid 4925] 1358242220.703277 futex(0x3aa9ae4, FUTEX_WAIT_PRIVATE, 1, NULL <unfinished ...>
[pid 4924] 1358242220.703290 gettimeofday( <unfinished ...>
[pid 4923] 1358242220.703302 futex(0x215404c, FUTEX_WAIT_PRIVATE, 1, NULL <unfinished ...>
[pid 4922] 1358242220.703313 rt_sigtimedwait([HUP QUIT ALRM TERM TSTP], NULL, NULL, 8 <unfinished ...>
[pid 4921] 1358242220.703348 futex(0x18a2b14, FUTEX_WAIT_PRIVATE, 1, NULL <unfinished ...>
[pid 4920] 1358242220.703359 restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 4919] 1358242220.703368 restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 4918] 1358242220.703378 restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 4916] 1358242220.703388 io_getevents(140319102091264, 1, 256, <unfinished ...>
[pid 4915] 1358242220.703404 io_getevents(140319102103552, 1, 256, <unfinished ...>
[pid 4914] 1358242220.703416 io_getevents(140319102115840, 1, 256, <unfinished ...>
[pid 4913] 1358242220.703427 io_getevents(140319102128128, 1, 256, <unfinished ...>
[pid 4912] 1358242220.703438 io_getevents(140319102332928, 1, 256, <unfinished ...>
[pid 4911] 1358242220.703449 io_getevents(140319102345216, 1, 256, <unfinished ...>
[pid 4910] 1358242220.703460 io_getevents(140319102357504, 1, 256, <unfinished ...>
[pid 4909] 1358242220.703472 io_getevents(140319102369792, 1, 256, <unfinished ...>
[pid 4908] 1358242220.703482 io_getevents(140319102574592, 1, 256, <unfinished ...>
[pid 4907] 1358242220.703494 io_getevents(140319102586880, 1, 256, <unfinished ...>
[pid 4905] 1358242220.703505 futex(0x10550e4, FUTEX_WAIT_PRIVATE, 7, NULL <unfinished ...>
[pid 4928] 1358242220.703574 <... futex resumed> ) = -1 EAGAIN (Resource temporarily unavailable)
[pid 4924] 1358242220.703600 <... gettimeofday resumed> {1358242220, 703548}, NULL) = 0
[pid 4905] 1358242220.703616 <... futex resumed> ) = -1 EAGAIN (Resource temporarily unavailable)
[pid 4928] 1358242220.703885 futex(0x21a358c, FUTEX_WAIT_PRIVATE, 8, NULL <unfinished ...>
[pid 4924] 1358242220.703932 gettimeofday( <unfinished ...>
[pid 4905] 1358242220.703943 futex(0x10550e4, FUTEX_WAIT_PRIVATE, 8, NULL <unfinished ...>
[pid 4924] 1358242220.703972 <... gettimeofday resumed> {1358242220, 703955}, NULL) = 0
[pid 4924] 1358242220.704003 epoll_wait(36, {}, 128, 21) = 0
[pid 4924] 1358242220.725173 gettimeofday({1358242220, 725183}, NULL) = 0
[pid 4924] 1358242220.725217 gettimeofday({1358242220, 725225}, NULL) = 0
[pid 4924] 1358242220.725256 epoll_wait(36, {}, 128, 1) = 0
[pid 4924] 1358242220.726417 gettimeofday({1358242220, 726426}, NULL) = 0
[pid 4924] 1358242220.726471 clock_gettime(CLOCK_MONOTONIC, {268040, 685856629}) = 0
[pid 4924] 1358242220.726509 clock_gettime(CLOCK_MONOTONIC, {268040, 685893060}) = 0
[pid 4924] 1358242220.726546 clock_gettime(CLOCK_MONOTONIC, {268040, 685930159}) = 0
[pid 4924] 1358242220.726582 clock_gettime(CLOCK_MONOTONIC, {268040, 685965969}) = 0
[pid 4924] 1358242220.726618 clock_gettime(CLOCK_MONOTONIC, {268040, 686000480}) = 0
[pid 4924] 1358242220.726770 clock_gettime(CLOCK_MONOTONIC, {268040, 686154121}) = 0
[pid 4924] 1358242220.726806 gettimeofday({1358242220, 726812}, NULL) = 0
[pid 4924] 1358242220.726846 epoll_ctl(36, EPOLL_CTL_MOD, 37, {EPOLLIN|EPOLLERR|EPOLLET, {u32=61515008, u64=61515008}}) = 0
[pid 4924] 1358242220.726921 gettimeofday({1358242220, 726927}, NULL) = 0
[pid 4924] 1358242220.726957 epoll_wait(36, {{EPOLLIN, {u32=61515008, u64=61515008}}}, 128, 227) = 1
[pid 4924] 1358242220.726992 gettimeofday({1358242220, 726998}, NULL) = 0
[pid 4924] 1358242220.727025 gettimeofday({1358242220, 727031}, NULL) = 0
[pid 4924] 1358242220.727059 epoll_wait(36, {}, 128, 227) = 0
[pid 4924] 1358242220.954503 gettimeofday({1358242220, 954521}, NULL) = 0
[pid 4924] 1358242220.954595 gettimeofday({1358242220, 954609}, NULL) = 0
[pid 4924] 1358242220.954890 epoll_wait(36, {}, 128, 0) = 0
[pid 4924] 1358242220.955002 gettimeofday({1358242220, 955016}, NULL) = 0
[pid 4924] 1358242220.955081 clock_gettime(CLOCK_MONOTONIC, {268040, 914472743}) = 0
[pid 4924] 1358242220.955152 clock_gettime(CLOCK_MONOTONIC, {268040, 914543150}) = 0
[pid 4924] 1358242220.955227 clock_gettime(CLOCK_MONOTONIC, {268040, 914618052}) = 0
[pid 4924] 1358242220.955299 clock_gettime(CLOCK_MONOTONIC, {268040, 914689755}) = 0
[pid 4924] 1358242220.955373 clock_gettime(CLOCK_MONOTONIC, {268040, 914763809}) = 0
[pid 4924] 1358242220.955441 clock_gettime(CLOCK_MONOTONIC, {268040, 914832202}) = 0
[pid 4924] 1358242220.955510 clock_gettime(CLOCK_MONOTONIC, {268040, 914900886}) = 0
[pid 4924] 1358242220.955583 clock_gettime(CLOCK_MONOTONIC, {268040, 914973807}) = 0
[pid 4924] 1358242220.955824 clock_gettime(CLOCK_MONOTONIC, {268040, 915218523}) = 0
[pid 4924] 1358242220.955939 gettimeofday({1358242220, 955952}, NULL) = 0
[pid 4924] 1358242220.956016 epoll_ctl(36, EPOLL_CTL_MOD, 37, {EPOLLIN|EPOLLERR|EPOLLET, {u32=61515008, u64=61515008}}) = 0
[pid 4924] 1358242220.956093 gettimeofday({1358242220, 956105}, NULL) = 0
[pid 4924] 1358242220.956159 epoll_wait(36, {{EPOLLIN, {u32=61515008, u64=61515008}}}, 128, 271) = 1
[pid 4924] 1358242220.956236 gettimeofday({1358242220, 956251}, NULL) = 0
[pid 4924] 1358242220.956310 gettimeofday({1358242220, 956325}, NULL) = 0
[pid 4924] 1358242220.956386 epoll_wait(36, <unfinished ...>
[pid 4920] 1358242220.983830 <... restart_syscall resumed> ) = -1 ETIMEDOUT (Connection timed out)
[pid 4920] 1358242220.983902 futex(0x1b61df0, FUTEX_WAKE_PRIVATE, 1) = 0
[pid 4920] 1358242220.983977 gettimeofday({1358242220, 983991}, NULL) = 0
[pid 4920] 1358242220.984058 gettimeofday({1358242220, 984072}, NULL) = 0
[pid 4920] 1358242220.984135 gettimeofday({1358242220, 984147}, NULL) = 0
[pid 4920] 1358242220.984206 futex(0x1b61e34, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 106421, {1358242225, 984147000}, ffffffff <unfinished ...>
[pid 4916] 1358242221.203892 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4915] 1358242221.203954 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4914] 1358242221.203994 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4913] 1358242221.204032 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4912] 1358242221.204070 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4911] 1358242221.204107 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4910] 1358242221.204144 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4909] 1358242221.204185 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4908] 1358242221.204226 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4907] 1358242221.204265 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4916] 1358242221.204338 io_getevents(140319102091264, 1, 256, <unfinished ...>
[pid 4915] 1358242221.204377 io_getevents(140319102103552, 1, 256, <unfinished ...>
[pid 4914] 1358242221.204410 io_getevents(140319102115840, 1, 256, <unfinished ...>
[pid 4913] 1358242221.204442 io_getevents(140319102128128, 1, 256, <unfinished ...>
[pid 4912] 1358242221.204475 io_getevents(140319102332928, 1, 256, <unfinished ...>
[pid 4911] 1358242221.204508 io_getevents(140319102345216, 1, 256, <unfinished ...>
[pid 4910] 1358242221.204544 io_getevents(140319102357504, 1, 256, <unfinished ...>
[pid 4909] 1358242221.204578 io_getevents(140319102369792, 1, 256, <unfinished ...>
[pid 4908] 1358242221.204614 io_getevents(140319102574592, 1, 256, <unfinished ...>
[pid 4907] 1358242221.204937 io_getevents(140319102586880, 1, 256, <unfinished ...>
[pid 4924] 1358242221.227790 <... epoll_wait resumed> {}, 128, 271) = 0
[pid 4924] 1358242221.227826 gettimeofday({1358242221, 227834}, NULL) = 0
[pid 4924] 1358242221.227879 clock_gettime(CLOCK_MONOTONIC, {268041, 187263232}) = 0
[pid 4924] 1358242221.227913 clock_gettime(CLOCK_MONOTONIC, {268041, 187296544}) = 0
[pid 4924] 1358242221.227949 clock_gettime(CLOCK_MONOTONIC, {268041, 187331699}) = 0
[pid 4924] 1358242221.227981 clock_gettime(CLOCK_MONOTONIC, {268041, 187363740}) = 0
[pid 4924] 1358242221.228014 clock_gettime(CLOCK_MONOTONIC, {268041, 187397105}) = 0
[pid 4924] 1358242221.228046 clock_gettime(CLOCK_MONOTONIC, {268041, 187429046}) = 0
[pid 4924] 1358242221.228080 gettimeofday({1358242221, 228086}, NULL) = 0
[pid 4924] 1358242221.228119 epoll_ctl(36, EPOLL_CTL_MOD, 37, {EPOLLIN|EPOLLERR|EPOLLET, {u32=61515008, u64=61515008}}) = 0
[pid 4924] 1358242221.228160 gettimeofday({1358242221, 228166}, NULL) = 0
[pid 4924] 1358242221.228194 epoll_wait(36, {{EPOLLIN, {u32=61515008, u64=61515008}}}, 128, 227) = 1
[pid 4924] 1358242221.228229 gettimeofday({1358242221, 228235}, NULL) = 0
[pid 4924] 1358242221.228262 gettimeofday({1358242221, 228268}, NULL) = 0
[pid 4924] 1358242221.228296 epoll_wait(36, <unfinished ...>
[pid 4918] 1358242221.437732 <... restart_syscall resumed> ) = -1 ETIMEDOUT (Connection timed out)
[pid 4919] 1358242221.437795 <... restart_syscall resumed> ) = -1 ETIMEDOUT (Connection timed out)
[pid 4918] 1358242221.437815 futex(0x1b61d60, FUTEX_WAKE_PRIVATE, 1 <unfinished ...>
[pid 4919] 1358242221.437840 futex(0x1b61cd0, FUTEX_WAKE_PRIVATE, 1 <unfinished ...>
[pid 4918] 1358242221.437855 <... futex resumed> ) = 0
[pid 4919] 1358242221.437927 <... futex resumed> ) = 0
[pid 4918] 1358242221.437947 gettimeofday( <unfinished ...>
[pid 4919] 1358242221.437966 gettimeofday( <unfinished ...>
[pid 4918] 1358242221.437979 <... gettimeofday resumed> {1358242221, 437953}, NULL) = 0
[pid 4919] 1358242221.438005 <... gettimeofday resumed> {1358242221, 437971}, NULL) = 0
[pid 4918] 1358242221.438071 futex(0x1b61da4, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 532007, {1358242222, 437953000}, ffffffff <unfinished ...>
[pid 4919] 1358242221.438097 gettimeofday({1358242221, 438102}, NULL) = 0
[pid 4919] 1358242221.438130 futex(0x1b61d14, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 532003, {1358242222, 438102000}, ffffffff <unfinished ...>
[pid 4924] 1358242221.455653 <... epoll_wait resumed> {}, 128, 227) = 0
[pid 4924] 1358242221.455687 gettimeofday({1358242221, 455695}, NULL) = 0
[pid 4924] 1358242221.455732 gettimeofday({1358242221, 455739}, NULL) = 0
[pid 4924] 1358242221.455768 epoll_wait(36, {}, 128, 1) = 0
[pid 4924] 1358242221.456937 gettimeofday({1358242221, 456945}, NULL) = 0
[pid 4924] 1358242221.456982 clock_gettime(CLOCK_MONOTONIC, {268041, 416366137}) = 0
[pid 4924] 1358242221.457016 clock_gettime(CLOCK_MONOTONIC, {268041, 416399087}) = 0
[pid 4924] 1358242221.457051 clock_gettime(CLOCK_MONOTONIC, {268041, 416433674}) = 0
[pid 4924] 1358242221.457083 clock_gettime(CLOCK_MONOTONIC, {268041, 416465582}) = 0
[pid 4924] 1358242221.457117 clock_gettime(CLOCK_MONOTONIC, {268041, 416499772}) = 0
[pid 4924] 1358242221.457148 clock_gettime(CLOCK_MONOTONIC, {268041, 416531239}) = 0
[pid 4924] 1358242221.457181 clock_gettime(CLOCK_MONOTONIC, {268041, 416563747}) = 0
[pid 4924] 1358242221.457214 clock_gettime(CLOCK_MONOTONIC, {268041, 416597515}) = 0
[pid 4924] 1358242221.457247 clock_gettime(CLOCK_MONOTONIC, {268041, 416629795}) = 0
[pid 4924] 1358242221.457282 gettimeofday({1358242221, 457288}, NULL) = 0
[pid 4924] 1358242221.457320 epoll_ctl(36, EPOLL_CTL_MOD, 37, {EPOLLIN|EPOLLERR|EPOLLET, {u32=61515008, u64=61515008}}) = 0
[pid 4924] 1358242221.457391 gettimeofday({1358242221, 457405}, NULL) = 0
[pid 4924] 1358242221.457438 epoll_wait(36, {{EPOLLIN, {u32=61515008, u64=61515008}}}, 128, 199) = 1
[pid 4924] 1358242221.457474 gettimeofday({1358242221, 457480}, NULL) = 0
[pid 4924] 1358242221.457507 gettimeofday({1358242221, 457514}, NULL) = 0
[pid 4924] 1358242221.457541 epoll_wait(36, {}, 128, 199) = 0
[pid 4924] 1358242221.656911 gettimeofday({1358242221, 656919}, NULL) = 0
[pid 4924] 1358242221.656961 gettimeofday({1358242221, 656967}, NULL) = 0
[pid 4924] 1358242221.656996 epoll_wait(36, {}, 128, 0) = 0
[pid 4924] 1358242221.657027 gettimeofday({1358242221, 657034}, NULL) = 0
[pid 4924] 1358242221.657067 clock_gettime(CLOCK_MONOTONIC, {268041, 616451036}) = 0
[pid 4924] 1358242221.657101 clock_gettime(CLOCK_MONOTONIC, {268041, 616484207}) = 0
[pid 4924] 1358242221.657137 clock_gettime(CLOCK_MONOTONIC, {268041, 616519726}) = 0
[pid 4924] 1358242221.657169 clock_gettime(CLOCK_MONOTONIC, {268041, 616551862}) = 0
[pid 4924] 1358242221.657213 clock_gettime(CLOCK_MONOTONIC, {268041, 616596640}) = 0
[pid 4924] 1358242221.657274 clock_gettime(CLOCK_MONOTONIC, {268041, 616657376}) = 0
[pid 4924] 1358242221.657310 clock_gettime(CLOCK_MONOTONIC, {268041, 616693310}) = 0
[pid 4924] 1358242221.657351 clock_gettime(CLOCK_MONOTONIC, {268041, 616752249}) = 0
[pid 4924] 1358242221.657405 clock_gettime(CLOCK_MONOTONIC, {268041, 616787690}) = 0
[pid 4924] 1358242221.657438 clock_gettime(CLOCK_MONOTONIC, {268041, 616821061}) = 0
[pid 4924] 1358242221.657470 clock_gettime(CLOCK_MONOTONIC, {268041, 616852873}) = 0
[pid 4924] 1358242221.657503 gettimeofday({1358242221, 657509}, NULL) = 0
[pid 4924] 1358242221.657541 epoll_ctl(36, EPOLL_CTL_MOD, 37, {EPOLLIN|EPOLLERR|EPOLLET, {u32=61515008, u64=61515008}}) = 0
[pid 4924] 1358242221.657582 gettimeofday({1358242221, 657588}, NULL) = 0
[pid 4924] 1358242221.657616 epoll_wait(36, {{EPOLLIN, {u32=61515008, u64=61515008}}}, 128, 70) = 1
[pid 4924] 1358242221.657917 gettimeofday({1358242221, 657929}, NULL) = 0
[pid 4924] 1358242221.658041 gettimeofday({1358242221, 658051}, NULL) = 0
[pid 4924] 1358242221.658083 epoll_wait(36, <unfinished ...>
[pid 4916] 1358242221.704824 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4915] 1358242221.704862 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4914] 1358242221.704881 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4913] 1358242221.704900 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4912] 1358242221.704918 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4911] 1358242221.704936 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4910] 1358242221.704954 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4909] 1358242221.704972 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4916] 1358242221.705006 io_getevents(140319102091264, 1, 256, <unfinished ...>
[pid 4915] 1358242221.705026 io_getevents(140319102103552, 1, 256, <unfinished ...>
[pid 4914] 1358242221.705042 io_getevents(140319102115840, 1, 256, <unfinished ...>
[pid 4913] 1358242221.705058 io_getevents(140319102128128, 1, 256, <unfinished ...>
[pid 4912] 1358242221.705074 io_getevents(140319102332928, 1, 256, <unfinished ...>
[pid 4911] 1358242221.705089 io_getevents(140319102345216, 1, 256, <unfinished ...>
[pid 4910] 1358242221.705105 io_getevents(140319102357504, 1, 256, <unfinished ...>
[pid 4909] 1358242221.705120 io_getevents(140319102369792, 1, 256, <unfinished ...>
[pid 4908] 1358242221.705885 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4907] 1358242221.705922 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4908] 1358242221.706019 io_getevents(140319102574592, 1, 256, <unfinished ...>
[pid 4907] 1358242221.706043 io_getevents(140319102586880, 1, 256, <unfinished ...>
[pid 4924] 1358242221.728276 <... epoll_wait resumed> {}, 128, 70) = 0
[pid 4924] 1358242221.728311 gettimeofday({1358242221, 728318}, NULL) = 0
[pid 4924] 1358242221.728361 clock_gettime(CLOCK_MONOTONIC, {268041, 687746069}) = 0
[pid 4924] 1358242221.728396 clock_gettime(CLOCK_MONOTONIC, {268041, 687779289}) = 0
[pid 4924] 1358242221.728431 clock_gettime(CLOCK_MONOTONIC, {268041, 687813601}) = 0
[pid 4924] 1358242221.728463 clock_gettime(CLOCK_MONOTONIC, {268041, 687845292}) = 0
[pid 4924] 1358242221.728495 clock_gettime(CLOCK_MONOTONIC, {268041, 687878248}) = 0
[pid 4924] 1358242221.728527 clock_gettime(CLOCK_MONOTONIC, {268041, 687910492}) = 0
[pid 4924] 1358242221.728561 gettimeofday({1358242221, 728567}, NULL) = 0
[pid 4924] 1358242221.728599 epoll_ctl(36, EPOLL_CTL_MOD, 37, {EPOLLIN|EPOLLERR|EPOLLET, {u32=61515008, u64=61515008}}) = 0
[pid 4924] 1358242221.728843 gettimeofday({1358242221, 728856}, NULL) = 0
[pid 4924] 1358242221.728956 epoll_wait(36, {{EPOLLIN, {u32=61515008, u64=61515008}}}, 128, 228) = 1
[pid 4924] 1358242221.728998 gettimeofday({1358242221, 729004}, NULL) = 0
[pid 4924] 1358242221.729033 gettimeofday({1358242221, 729039}, NULL) = 0
[pid 4924] 1358242221.729067 epoll_wait(36, {}, 128, 228) = 0
[pid 4924] 1358242221.957501 gettimeofday({1358242221, 957509}, NULL) = 0
[pid 4924] 1358242221.957553 clock_gettime(CLOCK_MONOTONIC, {268041, 916937462}) = 0
[pid 4924] 1358242221.957587 clock_gettime(CLOCK_MONOTONIC, {268041, 916970183}) = 0
[pid 4924] 1358242221.957622 clock_gettime(CLOCK_MONOTONIC, {268041, 917177390}) = 0
[pid 4924] 1358242221.957837 clock_gettime(CLOCK_MONOTONIC, {268041, 917220473}) = 0
[pid 4924] 1358242221.957873 clock_gettime(CLOCK_MONOTONIC, {268041, 917288983}) = 0
[pid 4924] 1358242221.957942 clock_gettime(CLOCK_MONOTONIC, {268041, 917324892}) = 0
[pid 4924] 1358242221.957975 clock_gettime(CLOCK_MONOTONIC, {268041, 917357461}) = 0
[pid 4924] 1358242221.958009 clock_gettime(CLOCK_MONOTONIC, {268041, 917391614}) = 0
[pid 4924] 1358242221.958042 clock_gettime(CLOCK_MONOTONIC, {268041, 917425043}) = 0
[pid 4924] 1358242221.958079 gettimeofday({1358242221, 958085}, NULL) = 0
[pid 4924] 1358242221.958118 epoll_ctl(36, EPOLL_CTL_MOD, 37, {EPOLLIN|EPOLLERR|EPOLLET, {u32=61515008, u64=61515008}}) = 0
[pid 4924] 1358242221.958159 gettimeofday({1358242221, 958165}, NULL) = 0
[pid 4924] 1358242221.958203 epoll_wait(36, {{EPOLLIN, {u32=61515008, u64=61515008}}}, 128, 270) = 1
[pid 4924] 1358242221.958275 gettimeofday({1358242221, 958287}, NULL) = 0
[pid 4924] 1358242221.958319 gettimeofday({1358242221, 958325}, NULL) = 0
[pid 4924] 1358242221.958353 epoll_wait(36, <unfinished ...>
[pid 4916] 1358242222.205852 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4915] 1358242222.205907 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4914] 1358242222.205940 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4913] 1358242222.205977 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4912] 1358242222.206009 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4911] 1358242222.206041 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4910] 1358242222.206076 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4909] 1358242222.206113 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4916] 1358242222.206176 io_getevents(140319102091264, 1, 256, <unfinished ...>
[pid 4915] 1358242222.206211 io_getevents(140319102103552, 1, 256, <unfinished ...>
[pid 4914] 1358242222.206241 io_getevents(140319102115840, 1, 256, <unfinished ...>
[pid 4913] 1358242222.206269 io_getevents(140319102128128, 1, 256, <unfinished ...>
[pid 4912] 1358242222.206298 io_getevents(140319102332928, 1, 256, <unfinished ...>
[pid 4911] 1358242222.206330 io_getevents(140319102345216, 1, 256, <unfinished ...>
[pid 4910] 1358242222.206358 io_getevents(140319102357504, 1, 256, <unfinished ...>
[pid 4909] 1358242222.206385 io_getevents(140319102369792, 1, 256, <unfinished ...>
[pid 4908] 1358242222.206938 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4907] 1358242222.206978 <... io_getevents resumed> {}{0, 500000000}) = 0
[pid 4908] 1358242222.207065 io_getevents(140319102574592, 1, 256, <unfinished ...>
[pid 4907] 1358242222.207103 io_getevents(140319102586880, 1, 256, ^C <unfinished ...>

The third argument of futex call is permanently incrementing with the step of 2 except some TIDs.

Looks like there are some problems with race conditions.

 I searched for a while and found some bugs caused by `leap second` insertion. But setting the time and restarting ntpd did not help. Also I tried to change innodb_thread_concurrency to 32, but it did not help.

We experience this problem on both Centos 6.3 and Ubuntu 12.04.1 no matter if it is KVM VM (1GB of RAM) or real machine (96GB of RAM) . It is easily triggered when running deployment of our services using puppet which does some sequential restarts of mysql server.

The only thing that helps us is kill -9 <mysqld_pid>. But I do not think this is affordable solution.

I attached strace, mysqld and dmesg. Let me know if you need other logs.

Looking forward for your help.

Revision history for this message
Vladimir Kuklin (vkuklin) wrote :
Revision history for this message
Teemu Ollakka (teemu-ollakka) wrote :

Hi,

According to logs wsrep provider fails to even start shutting down for some reason. When this happens again, could you run

$ gdb --batch -ex "thr apply all bt" -p $(pidof mysqld) /path/to/mysqld > backtrace.out

and attach that output here.

Also, what is the version of mysql-server-wsrep?

Revision history for this message
Vladimir Kuklin (vkuklin) wrote :

Hi,

I attached backtrace and corresponding strace in order for you to easily distinguish which threads hold which futexes.

Our wsrep and galera versions are 5.5.28-23.7 and 23.2.2 We are using your vanilla galera and mysql-server-wsrep for debs and self-built (only spec changed, no source-modification) rpms.

We also tried to turn replication off gracefully by use of "set wsrep_on=off" in init script before stop of mysql. In this case this bug is 100% reproducible

Revision history for this message
Alex Yurchenko (ayurchen) wrote :

Could you also please post your my.cnf and other config files. That might be a configuration issue.

Revision history for this message
Vladimir Kuklin (vkuklin) wrote :
Download full text (3.6 KiB)

Hi

Config follows:

# This file contains wsrep-related mysqld options. It should be included
# in the main MySQL configuration file.
#
# Options that need to be customized:
# - wsrep_provider
# - wsrep_cluster_address
# - wsrep_sst_auth
# The rest of defaults should work out of the box.

##
## mysqld options _MANDATORY_ for correct opration of the cluster
##
[mysqld]

# (This must be substituted by wsrep_format)
binlog_format=ROW

# Currently only InnoDB storage engine is supported
default-storage-engine=innodb

# to avoid issues with 'bulk mode inserts' using autoinc
innodb_autoinc_lock_mode=2

# This is a must for paralell applying
innodb_locks_unsafe_for_binlog=1

#Try to fix futex loop
innodb_thread_concurrency = 32

# Query Cache is not supported with wsrep
query_cache_size=0
query_cache_type=0

# Override bind-address
# In some systems bind-address defaults to 127.0.0.1, and with mysqldump SST
# it will have (most likely) disastrous consequences on donor node
bind-address=10.0.176.3
port=3307

##
## WSREP options
##

# Full path to wsrep provider library or 'none'
wsrep_provider=/usr/lib/galera/libgalera_smm.so

# Provider specific configuration options

# See http://www.codership.com/wiki/doku.php?id=faq

wsrep_provider_options="pc.ignore_sb = no;ist.recv_addr=10.0.176.3;gmcast.listen_addr=tcp://10.0.176.3:4567"

# Logical cluster name. Should be the same for all nodes.
wsrep_cluster_name="openstack"
    wsrep_cluster_address="gcomm://"

# Human-readable node name (non-unique). Hostname by default.
#wsrep_node_name=

# Base replication <address|hostname>[:port] of the node.
# The values supplied will be used as defaults for state transfer receiving,
# listening ports and so on. Default: address of the first network interface.
wsrep_node_address=10.0.176.3

# Address for incoming client connections. Autodetect by default.
#wsrep_node_incoming_address=

# How many threads will process writesets from other nodes
wsrep_slave_threads=2

# DBUG options for wsrep provider
#wsrep_dbug_option

# Generate fake primary keys for non-PK tables (required for multi-master
# and parallel applying operation)
wsrep_certify_nonPK=1

# Maximum number of rows in write set
wsrep_max_ws_rows=131072

# Maximum size of write set
wsrep_max_ws_size=1073741824

# to enable debug level logging, set this to 1
wsrep_debug=0

# convert locking sessions into transactions
wsrep_convert_LOCK_to_trx=0

# how many times to retry deadlocked autocommits
wsrep_retry_autocommit=1

# change auto_increment_increment and auto_increment_offset automatically
wsrep_auto_increment_control=1

# retry autoinc insert, which failed for duplicate key error
wsrep_drupal_282555_workaround=0

# enable "strictly synchronous" semantics for read operations
wsrep_causal_reads=0

# Command to call when node status or cluster membership changes.
# Will be passed all or some of the following options:
# --status - new status of this node
# --uuid - UUID of the cluster
# --primary - whether the component is primary or not ("yes"/"no")
# --members - comma-separated list of members
# --index - index of this node in the list
wsrep_notify_cmd=

##
## WSREP State Transfer options
##

# S...

Read more...

Revision history for this message
Alex Yurchenko (ayurchen) wrote :

No-no ;) we need main my.cnf. There is nothing in wsrep options that can cause this.

Revision history for this message
Vladimir Kuklin (vkuklin) wrote :

#[mysqld]
#datadir=/var/lib/mysql
#socket=/var/lib/mysql/mysql.sock
#user=mysql
# Disabling symbolic-links is recommended to prevent assorted security risks
#symbolic-links=0

[mysqld_safe]
log-error=/var/log/mysqld.log
#pid-file=/var/run/mysqld.pid

!includedir /etc/mysql/conf.d/

Revision history for this message
Alex Yurchenko (ayurchen) wrote :

Are these all of your settings? You don't set innodb_buffer_pool_size and innodb_log_file_size?

What does

mysql> SHOW GLOBAL VARIABLES\G

say?

Revision history for this message
Vladimir Kuklin (vkuklin) wrote :
Revision history for this message
Alex Yurchenko (ayurchen) wrote :

Hi Vladimir,

Unfortunately the gdb backtrace didn't provide much information since mysqld binary was stripped. I tried to reproduce this with the settings you provided, but with no success. Perhaps it somehow also depends on your load, your OS configuration and the way you initialize shutdown. I still believe this is triggered by some mysqld option or a combination of those. I have rounded a list of suspects here (mostly because they are different from usual settings and so far you're the only one who experiences this bug):

innodb_log_file_size=5242880
innodb_support_xa=ON
innodb_thread_concurrency=32
myisam_recover_options=BACKUP
max_binlog_size=104857600

Could you try to reproduce the bug with the following settings instead:

innodb_log_file_size=128M
innodb_support_xa=OFF
innodb_thread_concurrency=0
myisam_recover_options=OFF
max_binlog_size=1G

Note that changing innodb_log_file_size requires graceful shutdown and removal of old innodb log files (ib_logfile*). Also it seems like you may have a number of configuration files in /etc/mysql/conf.d which may overwrite the settings from /etc/my.cnf, so please be careful to place these settings into the last file to be included (most likely wsrep.cnf).

Also, could you try to reproduce the bug with wsrep_provider='none' ?

Revision history for this message
Vladimir Kuklin (vkuklin) wrote : Re: [Bug 1099742] Re: MySQL WSREP server fails to stop
Download full text (24.0 KiB)

Could you please provide package with debug symbols?
18.01.2013 15:51 пользователь "Alex Yurchenko" <email address hidden>
написал:

> Hi Vladimir,
>
> Unfortunately the gdb backtrace didn't provide much information since
> mysqld binary was stripped. I tried to reproduce this with the settings
> you provided, but with no success. Perhaps it somehow also depends on
> your load, your OS configuration and the way you initialize shutdown. I
> still believe this is triggered by some mysqld option or a combination
> of those. I have rounded a list of suspects here (mostly because they
> are different from usual settings and so far you're the only one who
> experiences this bug):
>
> innodb_log_file_size=5242880
> innodb_support_xa=ON
> innodb_thread_concurrency=32
> myisam_recover_options=BACKUP
> max_binlog_size=104857600
>
> Could you try to reproduce the bug with the following settings instead:
>
> innodb_log_file_size=128M
> innodb_support_xa=OFF
> innodb_thread_concurrency=0
> myisam_recover_options=OFF
> max_binlog_size=1G
>
> Note that changing innodb_log_file_size requires graceful shutdown and
> removal of old innodb log files (ib_logfile*). Also it seems like you
> may have a number of configuration files in /etc/mysql/conf.d which may
> overwrite the settings from /etc/my.cnf, so please be careful to place
> these settings into the last file to be included (most likely
> wsrep.cnf).
>
> Also, could you try to reproduce the bug with wsrep_provider='none' ?
>
> --
> You received this bug notification because you are subscribed to the bug
> report.
> https://bugs.launchpad.net/bugs/1099742
>
> Title:
> MySQL WSREP server fails to stop
>
> Status in MySQL patches by Codership:
> New
>
> Bug description:
> Hi, guys
>
> We are experiencing rather strange issue with mysql wsrep server.
> Sometimes when we run /etc/init.d/mysql stop it tries to stop but
> hangs due to unknown reason. The last message in log is:
>
> 130111 23:36:36 [Note] /usr/sbin/mysqld: Normal shutdown
>
> 130111 23:36:36 [Note] WSREP: rollbacker thread exiting
> 130111 23:36:36 [Note] Event Scheduler: Purging the queue. 0 events
>
> strace -ttt -f -p `pidof mysqld` shows the following:
>
> Process 4905 attached with 22 threads - interrupt to quit
> [pid 4959] 1358242220.703215 futex(0x10550e4, FUTEX_WAIT_PRIVATE, 8,
> NULL <unfinished ...>
> [pid 4928] 1358242220.703252 futex(0x21a358c, FUTEX_WAIT_PRIVATE, 6,
> NULL <unfinished ...>
> [pid 4927] 1358242220.703265 futex(0x21a358c, FUTEX_WAIT_PRIVATE, 8,
> NULL <unfinished ...>
> [pid 4925] 1358242220.703277 futex(0x3aa9ae4, FUTEX_WAIT_PRIVATE, 1,
> NULL <unfinished ...>
> [pid 4924] 1358242220.703290 gettimeofday( <unfinished ...>
> [pid 4923] 1358242220.703302 futex(0x215404c, FUTEX_WAIT_PRIVATE, 1,
> NULL <unfinished ...>
> [pid 4922] 1358242220.703313 rt_sigtimedwait([HUP QUIT ALRM TERM TSTP],
> NULL, NULL, 8 <unfinished ...>
> [pid 4921] 1358242220.703348 futex(0x18a2b14, FUTEX_WAIT_PRIVATE, 1,
> NULL <unfinished ...>
> [pid 4920] 1358242220.703359 restart_syscall(<... resuming interrupted
> call ...> <unfinished ...>
> [pid 4919] 1358242220.703368 restart_syscal...

Revision history for this message
Vladimir Kuklin (vkuklin) wrote :

It looks like somehow my message was abandoned. I think the easiest way would be to use binaries with debug symbols. I could not get them from launchpad. May be you can provide it, I will install it and upload corresponding backtrace.

Revision history for this message
Seppo Jaakola (seppo-jaakola) wrote :

maybe mysqld cannot create "kill server thread". You have open files limit at 1024 and max_connections 151, are there many client connections, when trying to shutdown?

Revision history for this message
Vladimir Kuklin (vkuklin) wrote :

Nope, there is no connections at all: at this time server is only configured by puppet.

Revision history for this message
Alex Yurchenko (ayurchen) wrote :

Hi Vladimir,

I'm afraid we can't provide you with DEBs with unstripped mysqld at the moment. But RPMs have mysqld unstripped.

Revision history for this message
Vladimir Kuklin (vkuklin) wrote :

Guys, I ran backtrace on centos. Corresponding strace and gdb bt attached.

Revision history for this message
Alex Yurchenko (ayurchen) wrote :

Thanks, Vladimir!

Thread 2 (Thread 0x7f403c179700 (LWP 4979)):
#0 0x00007f403e99343c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x000000000051ee84 in inline_mysql_cond_wait (sig_ptr=0x0) at /builddir/build/BUILD/mysql-5.5.28/mysql-5.5.28/include/mysql/psi/mysql_thread.h:980
#2 close_connections (sig_ptr=0x0) at /builddir/build/BUILD/mysql-5.5.28/mysql-5.5.28/sql/mysqld.cc:1241
#3 kill_server (sig_ptr=0x0) at /builddir/build/BUILD/mysql-5.5.28/mysql-5.5.28/sql/mysqld.cc:1382
#4 0x000000000051f75e in kill_server_thread (arg=<value optimized out>) at /builddir/build/BUILD/mysql-5.5.28/mysql-5.5.28/sql/mysqld.cc:1413
#5 0x00007f403e98f851 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f403dc1567d in clone () from /lib64/libc.so.6

Revision history for this message
Alex Yurchenko (ayurchen) wrote :

And here's where it sits:

 /* All threads has now been aborted */
  DBUG_PRINT("quit",("Waiting for threads to die (count=%u)",thread_count));
  mysql_mutex_lock(&LOCK_thread_count);
  while (thread_count)
  {
    mysql_cond_wait(&COND_thread_count, &LOCK_thread_count); <-------- here
    DBUG_PRINT("quit",("One thread died (count=%u)",thread_count));
  }
  mysql_mutex_unlock(&LOCK_thread_count);

  close_active_mi();
  DBUG_PRINT("quit",("close_connections thread"));
  DBUG_VOID_RETURN;
}
waiting for slave threads to exit.

And here's where it misses to close wsrep backend:

#ifdef WITH_WSREP
  if (WSREP_ON) wsrep_stop_replication(NULL);
#endif

  close_connections();

where WSREP_ON expands to (global_system_variables.wsrep_on) which is in no way indicative of initialized wsrep backend.

Something else must be used here. E.g. checking for non-NULL wsrep pointer.

Workaround: avoid setting wsrep_on to 0.

Revision history for this message
Vladimir Kuklin (vkuklin) wrote :

This sometimes does not help. When we do restart of server frequently, e.g. by means of automatic deployment software (that is what puppet does when changes configuration files and restarts), we also hit this bug. Looks like there is some race condition which is triggered by fast start/stop/start/stop/start commands issued.

Revision history for this message
Alex Yurchenko (ayurchen) wrote :

Yes, this could also be the case. The check for WSREP_ON is simply not applicable there.

Revision history for this message
Vladimir Kuklin (vkuklin) wrote :

So what is the workaround and will the corresponding patch be included into future release ?

Revision history for this message
Alex Yurchenko (ayurchen) wrote :

Have you tried to put some short sleep between start and stop commands? We don't have any fix for that yet and it is of low priority so far.

Revision history for this message
Elena Stepanova (elenst) wrote :
Revision history for this message
Aleksey Sanin (aleksey-l) wrote :

I have a 100% repro test case (see mariadb bug from #23) on centos 5.8. Here is the output with wsrep_debug=1:

130212 3:42:48 [Note] WSREP: Shifting OPEN -> CLOSED (TO: 1745947)
130212 3:42:48 [Note] WSREP: New cluster view: global state: 91e773e6-725d-11e2-0800-1d679f8c5252:1745947, view# -1: non-Primary, number of nodes: 1, my index: 0, protocol version 2
130212 3:42:48 [Note] WSREP: Setting wsrep_ready to 0
130212 3:42:48 [Note] WSREP: RECV thread exiting 0: Success
130212 3:42:48 [Note] WSREP: wsrep_notify_cmd is not defined, skipping notification.
130212 3:42:48 [Note] WSREP: recv_thread() joined.
130212 3:42:48 [Note] WSREP: Closing slave action queue.
130212 3:42:48 [Note] WSREP: New cluster view: global state: 91e773e6-725d-11e2-0800-1d679f8c5252:1745947, view# -1: non-Primary, number of nodes: 0, my index: -1, protocol version 2
130212 3:42:48 [Note] WSREP: Setting wsrep_ready to 0
130212 3:42:48 [Note] WSREP: wsrep_notify_cmd is not defined, skipping notification.
130212 3:42:48 [Note] WSREP: applier thread exiting (code:0)
130212 3:42:48 [Note] WSREP: closing applier 5
130212 3:42:48 [Note] WSREP: applier thread exiting (code:5)
130212 3:42:48 [Note] WSREP: closing applier 2
130212 3:42:48 [Note] WSREP: applier thread exiting (code:5)
130212 3:42:48 [Note] WSREP: closing applier 4
130212 3:42:48 [Note] WSREP: applier thread exiting (code:5)
130212 3:42:48 [Note] WSREP: closing applier 3
130212 3:42:50 [Note] WSREP: waiting for client connections to close: 5
130212 3:42:50 [Note] WSREP: Informing thread 5 that it's time to die
130212 3:42:50 [Note] WSREP: Informing thread 4 that it's time to die
130212 3:42:50 [Note] WSREP: Informing thread 3 that it's time to die
130212 3:42:50 [Note] WSREP: Informing thread 2 that it's time to die
130212 3:42:50 [Note] WSREP: Informing thread 1 that it's time to die
130212 3:42:51 [Note] WSREP: Before Lock_thread_count

I will be happy to try any patches or code change suggestions.

Revision history for this message
Aleksey Sanin (aleksey-l) wrote :

Never mind, my problem was MariaDB specific.

To post a comment you must log in.
This report contains Public information  
Everyone can see this information.

Other bug subscribers

Remote bug watches

Bug watches keep track of this bug in other bug trackers.