BUG() inside megaraid (megaraid_sas_fusion) driver
Affects | Status | Importance | Assigned to | Milestone |
---|---|---|---|---|
linux (Ubuntu) | Opinion | Medium | Rafael David Tinoco | |
Bug Description
The following kernel trace (together with a dump) was brought to me:
...
[ 8650.749804] SLUB: Unable to allocate memory on node -1 (gfp=0x2080020)
[ 8650.749809] cache: kmalloc-
[ 8650.749812] node 0: slabs: 83, objs: 5312, free: 0
[ 8650.749814] node 1: slabs: 59, objs: 3776, free: 0
[ 8650.749817] SLUB: Unable to allocate memory on node -1 (gfp=0x2080020)
[ 8650.749819] cache: kmalloc-
[ 8650.749821] node 0: slabs: 83, objs: 5312, free: 0
[ 8650.749823] node 1: slabs: 59, objs: 3776, free: 0
[ 8650.749825] DMAR: Allocating 2-page iova for 0000:02:00.0 failed
[ 8650.756414] ------------[ cut here ]------------
[ 8650.761768] kernel BUG at /build/
[ 8650.772638] invalid opcode: 0000 [#1] SMP
[ 8650.777226] Modules linked in: vport_gre ip_gre ip6_tables xt_set xt_multiport iptable_mangle iptable_raw ip_set_hash_ip ip_set_hash_net ip_set ipip tunnel4 ip_tunnel veth xt_statistic xt_nat xt_recent ipt_REJECT nf_reject_ipv4 xt_tcpudp gre openvswitch nf_defrag_ipv6 xt_comment xt_mark ipt_MASQUERADE nf_nat_
[ 8650.857496] async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear uas hid_generic ahci crct10dif_pclmul usb_storage crc32_pclmul ghash_clmulni_intel ixgbe igb aesni_intel usbhid aes_x86_64 dca mdio lrw vxlan gf128mul ip6_udp_tunnel glue_helper udp_tunnel ablk_helper ptp cryptd i2c_algo_bit libahci megaraid_sas pps_core hid scsi_dh_emc scsi_dh_rdac scsi_dh_alua dm_multipath
[ 8650.897195] CPU: 5 PID: 9720 Comm: in_tail.rb:276 Not tainted 4.4.0-112-generic #135-Ubuntu
[ 8650.906514] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.1.7 06/16/2016
[ 8650.915059] task: ffff881d91f88000 ti: ffff881d91f90000 task.ti: ffff881d91f90000
[ 8650.923409] RIP: 0010:[<
[ 8650.935073] RSP: 0000:ffff881d91
[ 8650.941000] RAX: 00000000fffffff4 RBX: ffff881fec2c0580 RCX: 0000000000000000
[ 8650.948960] RDX: 00000000fffffff4 RSI: 0000000000000246 RDI: 0000000000000246
[ 8650.956922] RBP: ffff881d91f93aa8 R08: 0000000000000005 R09: 0000000000000891
[ 8650.964883] R10: 0000000000000000 R11: 0000000000000891 R12: ffff883d72a79080
[ 8650.972845] R13: ffff881fec2c0500 R14: 00000000fffffff4 R15: ffff881fea100000
[ 8650.980806] FS: 00007f0d2d8f9ab
[ 8650.989835] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 8650.996246] CR2: 00007f0d335a6972 CR3: 0000003fec1fa000 CR4: 0000000000360670
[ 8651.004207] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 8651.012168] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 8651.020129] Stack:
[ 8651.022372] ffff881d91f93a88 ffffffff815bf054 ffff881fea91f8c0 ffff883d72a79080
[ 8651.030663] ffff881feb8607d8 ffff881ff0029000 ffff881feb8607d8 ffff881fea91f8c0
[ 8651.038954] ffff883d72a79080 0000000000001055 0000000000000005 ffff881d91f93ae0
[ 8651.047247] Call Trace:
[ 8651.049982] [<ffffffff815bf
[ 8651.057173] [<ffffffffc00c8
[ 8651.066787] [<ffffffffc00b8
[ 8651.075139] [<ffffffff815c6
[ 8651.081647] [<ffffffff815c9
[ 8651.088064] [<ffffffff813c7
[ 8651.094283] [<ffffffff813c7
[ 8651.100502] [<ffffffff813cd
[ 8651.107300] [<ffffffff813ce
[ 8651.113522] [<ffffffff8119f
[ 8651.120903] [<ffffffff81191
[ 8651.127704] [<ffffffff81193
[ 8651.133923] [<ffffffff811ce
[ 8651.140629] [<ffffffff812a5
[ 8651.147128] [<ffffffff811bf
[ 8651.152860] [<ffffffff811c3
[ 8651.159370] [<ffffffff81210
[ 8651.165688] [<ffffffff8106b
[ 8651.172089] [<ffffffff8106b
[ 8651.178116] [<ffffffff81849
[ 8651.183848] Code: 34 c1 e9 31 fe ff ff 41 0f b7 87 68 09 00 00 4c 89 e7 48 c1 e0 04 c6 44 03 ff 00 e8 a7 29 50 c1 85 c0 41 89 c6 0f 89 93 fd ff ff <0f> 0b 48 8b 45 b8 48 8b 7d c8 4c 8b 30 49 8b 04 24 48 8b 8f f8
[ 8651.205479] RIP [<ffffffffc00c8
...
crash-study.txt
/**
 * megasas_make_sgl_fusion - Prepares 32-bit SGL
 * @instance: Adapter soft state
 * @scp: SCSI command from the mid-layer
 * @sgl_ptr: SGL to be filled in
 * @cmd: cmd we are working on
 *
 * If successful, this function returns the number of SG elements.
 */
static int
megasas_make_sgl_fusion(struct megasas_instance *instance,
struct scsi_cmnd *scp,
struct MPI25_IEEE_SGE_CHAIN64 *sgl_ptr,
struct megasas_cmd_fusion *cmd)
{
...
sge_count = scsi_dma_map(scp);
BUG_ON(sge_count < 0); ----> FAILS HERE
if (sge_count > instance->max_num_sge || !sge_count)
return sge_count;
----
/**
* scsi_dma_map - perform DMA mapping against command's sg lists
* @cmd: scsi command
*
* Returns the number of sg lists actually used, zero if the sg lists
* is NULL, or -ENOMEM if the mapping failed.
*/
int scsi_dma_map(struct scsi_cmnd *cmd)
{
int nseg = 0;
if (scsi_sg_count(cmd)) {
struct device *dev = cmd->device->host->dma_dev;
nseg = dma_map_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd),
cmd->sc_data_direction);
if (unlikely(!nseg))
return -ENOMEM;
}
return nseg;
}
----
The only possible way for the BUG_ON in megasas_make_sgl_fusion to be triggered
is if nseg is 0 and -ENOMEM (-12) is returned. This means that dma_map_sg could
NOT map the scatter-gather buffers from the scsi_cmnd for the firmware?
----
#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL)
----
/*
 * dma_maps_sg_attrs returns 0 on error and > 0 on success.
 * It should never return a value < 0.
 */
static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir,
struct dma_attrs *attrs)
{
...
ents = ops->map_sg(dev, sg, nents, dir, attrs);
BUG_ON(ents < 0);
...
return ents;
}
----
Especially because dma_map_sg_attrs would have BUGed_ON if ents were negative.
So the only possible thing that could have happened is for ents to be zero, so
it BUGed_ON at megasas_make_sgl_fusion() instead.
----
ops->map_sg:
{init __mic_dma_ops}() : dma_map_ops
{init amd_iommu_dma_ops}() : dma_map_ops
{init calgary_dma_ops}() : dma_map_ops
{init gart_dma_ops}() : dma_map_ops
{init intel_dma_ops}() : dma_map_ops
{init nommu_dma_ops}() : dma_map_ops
{init sta2x11_dma_ops}() : dma_map_ops
{init swiotlb_dma_ops}() : dma_map_ops
{init xen_swiotlb_dma_ops}() : dma_map_ops
crash> dev -d
MAJOR GENDISK NAME REQUEST_QUEUE TOTAL ASYNC SYNC DRV
8 ffff881ff0142800 sdc ffff881fe92a9f50 0 0 0 1
11 ffff881ff06dd000 sr0 ffff881fe8ecb968 0 0 0 0
8 ffff881ff06de000 sdd ffff881fe8ecb430 0 0 0 0
8 ffff881ff0141800 sdb ffff881fe9c78a70 0 0 0 0
8 ffff881ff0140800 sda ffff881fe9c78538 0 0 0 0
8 ffff881ff06b2000 sde ffff881fe8dfbea0 0 0 0 0
crash> struct device.archdata ffff881fe8ecb968
archdata = {
dma_ops = 0xffff881ff06dc168,
iommu = 0x0
}
crash> struct device.archdata ffff881fe8ecb430
archdata = {
dma_ops = 0xffff881ff06dc968,
iommu = 0x0
}...