Hi,
We have the same issue on our k8s cluster.
Description: ubuntu16.04.1 LTSx86_64 Release: 16.04.1
Kernel: 4.4.0-104-generic
the dump file can be downloaded via following way: wget http://129.226.115.161/dump.202006231820.tar.gz
I did some analysis, however i still didnot find the root cause:
Load the vmcore in crash (please refer to the hyperlink above). Crash should present details similar to the following: crash> bt PID: 11388 TASK: ffff880eb1f79e00 CPU: 29 COMMAND: "heartbeat" #0 [ffff8809131a7b08] machine_kexec at ffffffff8105c22b #1 [ffff8809131a7b68] crash_kexec at ffffffff8110e852 #2 [ffff8809131a7c38] oops_end at ffffffff81031c49 #3 [ffff8809131a7c60] die at ffffffff810320fb #4 [ffff8809131a7c90] do_trap at ffffffff8102f121 #5 [ffff8809131a7ce0] do_error_trap at ffffffff8102f4a9 #6 [ffff8809131a7da0] do_invalid_op at ffffffff8102fa10 #7 [ffff8809131a7db0] invalid_op at ffffffff8184638e [exception RIP: __fput+541] RIP: ffffffff812126ad RSP: ffff8809131a7e68 RFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff880ef6915700 RCX: 0000000365fb1705 RDX: 0000000000000001 RSI: ffff880fff55a020 RDI: 0000000000000000 RBP: ffff8809131a7ea0 R8: 000000000001a020 R9: ffffffff811b591d R10: ffffea002b69b300 R11: ffff880ef6915710 R12: 0000000000000010 R13: ffff880ed152aef8 R14: ffff8800bba18aa0 R15: ffff880ed1513a40 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #8 [ffff8809131a7e60] __fput at ffffffff812125ac #9 [ffff8809131a7ea8] ____fput at ffffffff812126ee #10 [ffff8809131a7eb8] task_work_run at ffffffff8109f101 #11 [ffff8809131a7ef8] exit_to_usermode_loop at ffffffff81003242 #12 [ffff8809131a7f30] syscall_return_slowpath at ffffffff81003c6e #13 [ffff8809131a7f50] int_ret_from_sys_call at ffffffff818449d0 RIP: 000000000047f704 RSP: 000000c423b77c98 RFLAGS: 00000246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000047f704 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000000000ca RBP: 000000c423b77ce0 R8: 0000000000000000 R9: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 000000c423b78ee0 R15: 0000000000000008 ORIG_RAX: 0000000000000003 CS: 0033 SS: 002b crash>
crash> log [19156101.592212] ------------[ cut here ]------------ [19156101.593103] kernel BUG at /build/linux-SwhOyu/linux-4.4.0/include/linux/fs.h:2582! [19156101.594385] invalid opcode: 0000 [#1] SMP [19156101.595083] Modules linked in: binfmt_misc af_packet_diag netlink_diag dccp_diag dccp tcp_diag udp_diag inet_diag unix_diag veth br_netfilter ipt_MASQUERADE nf_nat_masquerade_ipv4 xt_set xt_mark ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_bitmap_port ip_set_hash_ipport ip_set dummy xt_comment xt_addrtype iptable_nat nf_nat_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 ip_vs_sh ip_vs_wrr ip_vs_rr ip_vs xt_tcpudp bridge stp llc nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo iptable_filter ip_tables xt_conntrack x_tables nf_nat nf_conntrack aufs isofs ppdev crct10dif_pclmul parport_pc crc32_pclmul input_leds joydev ghash_clmulni_intel parport serio_raw ib_iser rdma_cm iw_cm ib_cm ib_sa ib_mad ib_core ib_addr sunrpc iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi autofs4 btrfs raid10 raid456 async_raid6_recov [19156101.606434] async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd psmouse floppy [19156101.609129] CPU: 29 PID: 11388 Comm: heartbeat Not tainted 4.4.0-104-generic #127-Ubuntu [19156101.610384] Hardware name: Smdbmds KVM, BIOS seabios-1.9.1-qemu-project.org 04/01/2014 [19156101.611637] task: ffff880eb1f79e00 ti: ffff8809131a4000 task.ti: ffff8809131a4000 [19156101.612905] RIP: 0010:[<ffffffff812126ad>] [<ffffffff812126ad>] __fput+0x21d/0x220 [19156101.614188] RSP: 0018:ffff8809131a7e68 EFLAGS: 00010246 [19156101.614989] RAX: 0000000000000000 RBX: ffff880ef6915700 RCX: 0000000365fb1705 [19156101.616143] RDX: 0000000000000001 RSI: ffff880fff55a020 RDI: 0000000000000000 [19156101.617285] RBP: ffff8809131a7ea0 R08: 000000000001a020 R09: ffffffff811b591d [19156101.618422] R10: ffffea002b69b300 R11: ffff880ef6915710 R12: 0000000000000010 [19156101.619574] R13: ffff880ed152aef8 R14: ffff8800bba18aa0 R15: ffff880ed1513a40 [19156101.620785] FS: 000000c42085bc90(0000) GS:ffff880fff540000(0000) knlGS:0000000000000000 [19156101.622074] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [19156101.622921] CR2: 00007f508b166b04 CR3: 0000000e0981b000 CR4: 00000000003406e0 [19156101.624062] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [19156101.625210] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [19156101.626349] Stack: [19156101.626765] ffff880ed152aef8 ffff880ef6915710 ffff880eb1f79e00 ffffffff8210ad50 [19156101.628018] ffff880ef6915700 0000000000000000 ffff880eb1f7a4a0 ffff8809131a7eb0 [19156101.629305] ffffffff812126ee ffff8809131a7ef0 ffffffff8109f101 ffff880eb1f7a4d4 [19156101.630568] Call Trace: [19156101.631036] [<ffffffff812126ee>] ____fput+0xe/0x10 [19156101.631804] [<ffffffff8109f101>] task_work_run+0x81/0xa0 [19156101.632612] [<ffffffff81003242>] exit_to_usermode_loop+0xc2/0xd0 [19156101.633497] [<ffffffff81003c6e>] syscall_return_slowpath+0x4e/0x60 [19156101.634402] [<ffffffff818449d0>] int_ret_from_sys_call+0x25/0x8f [19156101.635285] Code: 0f 84 cf fe ff ff 48 8b 43 28 48 8b 80 80 00 00 00 48 85 c0 0f 84 bb fe ff ff 31 d2 48 89 de bf ff ff ff ff ff d0 e9 aa fe ff ff <0f> 0b 90 0f 1f 44 00 00 31 ff 48 87 3d 8a 6e fc 00 48 85 ff 74 [19156101.639244] RIP [<ffffffff812126ad>] __fput+0x21d/0x220 [19156101.640049] RSP <ffff8809131a7e68>
Referencing the line above in the source code, fs.h:2582!, we see the panic is due to a BUG_ON:
static void __fput(struct file *file) { .... if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode);// .... }
static inline void i_readcount_dec(struct inode *inode) { BUG_ON(!atomic_read(&inode->i_readcount)); atomic_dec(&inode->i_readcount); }
And the corresponding dissasembly for the panic location:
crash> dis -r __fput+541 ... 0xffffffff812126a1 <__fput+529>: mov $0xffffffff,%edi 0xffffffff812126a6 <__fput+534>: callq *%rax 0xffffffff812126a8 <__fput+536>: jmpq 0xffffffff81212557 <__fput+199> 0xffffffff812126ad <__fput+541>: ud2 crash>
Jumped to the ud2 that caused the panic. Where did we jump from?
crash> dis __fput | grep __fput+541 0xffffffff81212638 <__fput+424>: je 0xffffffff812126ad <__fput+541> 0xffffffff812126ad <__fput+541>: ud2 crash>
And the assembly before the je:
crash> dis __fput | grep __fput+541 -B3 0xffffffff8121262e <__fput+414>: retq 0xffffffff8121262f <__fput+415>: mov 0x154(%r13),%eax 0xffffffff81212636 <__fput+422>: test %eax,%eax 0xffffffff81212638 <__fput+424>: je 0xffffffff812126ad <__fput+541>
Above r13 is likely the inode, so the 0x154(%r13) is inode.i_readcount:
crash> struct inode.i_readcount -xo struct inode { [0x154] atomic_t i_readcount; } crash>
The r13 is ffff880ed152aef8, so get the value of inode.i_readcount is 111:
crash> bt | grep R13 R13: ffff880ed152aef8 R14: ffff8800bba18aa0 R15: ffff880ed1513a40
crash> inode.i_readcount.counter ffff880ed152aef8 i_readcount.counter = 111 crash>
the inode.i_readcount.counter is not equal 0, why call BUG_ON?
Hi,
We have the same issue on our k8s cluster.
Description: ubuntu16.04.1 LTSx86_64
Release: 16.04.1
Kernel: 4.4.0-104-generic
the dump file can be downloaded via following way: 129.226. 115.161/ dump.2020062318 20.tar. gz
wget http://
I did some analysis, however i still didnot find the root cause:
Load the vmcore in crash (please refer to the hyperlink above). Crash should present details similar to the following: usermode_ loop at ffffffff81003242 return_ slowpath at ffffffff81003c6e from_sys_ call at ffffffff818449d0
crash> bt
PID: 11388 TASK: ffff880eb1f79e00 CPU: 29 COMMAND: "heartbeat"
#0 [ffff8809131a7b08] machine_kexec at ffffffff8105c22b
#1 [ffff8809131a7b68] crash_kexec at ffffffff8110e852
#2 [ffff8809131a7c38] oops_end at ffffffff81031c49
#3 [ffff8809131a7c60] die at ffffffff810320fb
#4 [ffff8809131a7c90] do_trap at ffffffff8102f121
#5 [ffff8809131a7ce0] do_error_trap at ffffffff8102f4a9
#6 [ffff8809131a7da0] do_invalid_op at ffffffff8102fa10
#7 [ffff8809131a7db0] invalid_op at ffffffff8184638e
[exception RIP: __fput+541]
RIP: ffffffff812126ad RSP: ffff8809131a7e68 RFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff880ef6915700 RCX: 0000000365fb1705
RDX: 0000000000000001 RSI: ffff880fff55a020 RDI: 0000000000000000
RBP: ffff8809131a7ea0 R8: 000000000001a020 R9: ffffffff811b591d
R10: ffffea002b69b300 R11: ffff880ef6915710 R12: 0000000000000010
R13: ffff880ed152aef8 R14: ffff8800bba18aa0 R15: ffff880ed1513a40
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#8 [ffff8809131a7e60] __fput at ffffffff812125ac
#9 [ffff8809131a7ea8] ____fput at ffffffff812126ee
#10 [ffff8809131a7eb8] task_work_run at ffffffff8109f101
#11 [ffff8809131a7ef8] exit_to_
#12 [ffff8809131a7f30] syscall_
#13 [ffff8809131a7f50] int_ret_
RIP: 000000000047f704 RSP: 000000c423b77c98 RFLAGS: 00000246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000047f704
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000000000ca
RBP: 000000c423b77ce0 R8: 0000000000000000 R9: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000000000 R14: 000000c423b78ee0 R15: 0000000000000008
ORIG_RAX: 0000000000000003 CS: 0033 SS: 002b
crash>
crash> log linux-SwhOyu/ linux-4. 4.0/include/ linux/fs. h:2582! masquerade_ ipv4 xt_set xt_mark ip_set_ hash_ipportnet ip_set_ hash_ipportip ip_set_bitmap_port ip_set_hash_ipport ip_set dummy xt_comment xt_addrtype iptable_nat nf_nat_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 ip_vs_sh ip_vs_wrr ip_vs_rr ip_vs xt_tcpudp bridge stp llc nf_conntrack_ netlink nfnetlink xfrm_user xfrm_algo iptable_filter ip_tables xt_conntrack x_tables nf_nat nf_conntrack aufs isofs ppdev crct10dif_pclmul parport_pc crc32_pclmul input_leds joydev ghash_clmulni_intel parport serio_raw ib_iser rdma_cm iw_cm ib_cm ib_sa ib_mad ib_core ib_addr sunrpc iscsi_tcp libiscsi_tcp libiscsi scsi_transport_ iscsi autofs4 btrfs raid10 raid456 async_raid6_recov 1.9.1-qemu- project. org 04/01/2014 ffffffff812126a d>] [<ffffffff81212 6ad>] __fput+0x21d/0x220 1a7e68 EFLAGS: 00010246 0(0000) GS:ffff880fff54 0000(0000) knlGS:000000000 0000000 6ee>] ____fput+0xe/0x10 101>] task_work_ run+0x81/ 0xa0 242>] exit_to_ usermode_ loop+0xc2/ 0xd0 c6e>] syscall_ return_ slowpath+ 0x4e/0x60 9d0>] int_ret_ from_sys_ call+0x25/ 0x8f 6ad>] __fput+0x21d/0x220
[19156101.592212] ------------[ cut here ]------------
[19156101.593103] kernel BUG at /build/
[19156101.594385] invalid opcode: 0000 [#1] SMP
[19156101.595083] Modules linked in: binfmt_misc af_packet_diag netlink_diag dccp_diag dccp tcp_diag udp_diag inet_diag unix_diag veth br_netfilter ipt_MASQUERADE nf_nat_
[19156101.606434] async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd psmouse floppy
[19156101.609129] CPU: 29 PID: 11388 Comm: heartbeat Not tainted 4.4.0-104-generic #127-Ubuntu
[19156101.610384] Hardware name: Smdbmds KVM, BIOS seabios-
[19156101.611637] task: ffff880eb1f79e00 ti: ffff8809131a4000 task.ti: ffff8809131a4000
[19156101.612905] RIP: 0010:[<
[19156101.614188] RSP: 0018:ffff880913
[19156101.614989] RAX: 0000000000000000 RBX: ffff880ef6915700 RCX: 0000000365fb1705
[19156101.616143] RDX: 0000000000000001 RSI: ffff880fff55a020 RDI: 0000000000000000
[19156101.617285] RBP: ffff8809131a7ea0 R08: 000000000001a020 R09: ffffffff811b591d
[19156101.618422] R10: ffffea002b69b300 R11: ffff880ef6915710 R12: 0000000000000010
[19156101.619574] R13: ffff880ed152aef8 R14: ffff8800bba18aa0 R15: ffff880ed1513a40
[19156101.620785] FS: 000000c42085bc9
[19156101.622074] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[19156101.622921] CR2: 00007f508b166b04 CR3: 0000000e0981b000 CR4: 00000000003406e0
[19156101.624062] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[19156101.625210] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[19156101.626349] Stack:
[19156101.626765] ffff880ed152aef8 ffff880ef6915710 ffff880eb1f79e00 ffffffff8210ad50
[19156101.628018] ffff880ef6915700 0000000000000000 ffff880eb1f7a4a0 ffff8809131a7eb0
[19156101.629305] ffffffff812126ee ffff8809131a7ef0 ffffffff8109f101 ffff880eb1f7a4d4
[19156101.630568] Call Trace:
[19156101.631036] [<ffffffff81212
[19156101.631804] [<ffffffff8109f
[19156101.632612] [<ffffffff81003
[19156101.633497] [<ffffffff81003
[19156101.634402] [<ffffffff81844
[19156101.635285] Code: 0f 84 cf fe ff ff 48 8b 43 28 48 8b 80 80 00 00 00 48 85 c0 0f 84 bb fe ff ff 31 d2 48 89 de bf ff ff ff ff ff d0 e9 aa fe ff ff <0f> 0b 90 0f 1f 44 00 00 31 ff 48 87 3d 8a 6e fc 00 48 85 ff 74
[19156101.639244] RIP [<ffffffff81212
[19156101.640049] RSP <ffff8809131a7e68>
Referencing the line above in the source code, fs.h:2582!, we see the panic is due to a BUG_ON:
static void __fput(struct file *file)
i_readcount_ dec(inode) ;//
{
....
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
....
}
static inline void i_readcount_ dec(struct inode *inode)
BUG_ON( !atomic_ read(&inode- >i_readcount) );
atomic_ dec(&inode- >i_readcount) ;
{
}
And the corresponding dissasembly for the panic location:
crash> dis -r __fput+541
...
0xffffffff812126a1 <__fput+529>: mov $0xffffffff,%edi
0xffffffff812126a6 <__fput+534>: callq *%rax
0xffffffff812126a8 <__fput+536>: jmpq 0xffffffff81212557 <__fput+199>
0xffffffff812126ad <__fput+541>: ud2
crash>
Jumped to the ud2 that caused the panic. Where did we jump from?
crash> dis __fput | grep __fput+541
0xffffffff81212638 <__fput+424>: je 0xffffffff812126ad <__fput+541>
0xffffffff812126ad <__fput+541>: ud2
crash>
And the assembly before the je:
crash> dis __fput | grep __fput+541 -B3
0xffffffff8121262e <__fput+414>: retq
0xffffffff8121262f <__fput+415>: mov 0x154(%r13),%eax
0xffffffff81212636 <__fput+422>: test %eax,%eax
0xffffffff81212638 <__fput+424>: je 0xffffffff812126ad <__fput+541>
Above r13 is likely the inode, so the 0x154(%r13) is inode.i_readcount:
crash> struct inode.i_readcount -xo
struct inode {
[0x154] atomic_t i_readcount;
}
crash>
The r13 is ffff880ed152aef8, so get the value of inode.i_readcount is 111:
crash> bt | grep R13
R13: ffff880ed152aef8 R14: ffff8800bba18aa0 R15: ffff880ed1513a40
crash> inode.i_ readcount. counter ffff880ed152aef8 counter = 111
i_readcount.
crash>
the inode.i_ readcount. counter is not equal 0, why call BUG_ON?