Comment 13 for bug 1788035

Revision history for this message
Brian Moyles (bmoyles) wrote :

We encountered an instance that had an NVMe failure very early in boot today. I've updated our internal Canonical case as well as our Amazon case on this, and I'm posting the relevant details here as well for consistency:

# uname -a
Linux XXX 4.4.0-1069-aws #79-Ubuntu SMP Mon Sep 24 15:01:41 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux

# cat /etc/lsb-release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=16.04
DISTRIB_CODENAME=xenial
DISTRIB_DESCRIPTION="Ubuntu 16.04.5 LTS"

# echo type $EC2_INSTANCE_TYPE
type m5.xlarge

# lsblk
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
nvme0n1 259:0 0 10G 0 disk /

# ls -al /dev/nvme* /dev/xvd* /dev/sd*
ls: cannot access '/dev/xvd*': No such file or directory
crw------- 1 root root 248, 0 Oct 31 15:02 /dev/nvme0
brw-rw---- 1 root disk 259, 0 Oct 31 15:02 /dev/nvme0n1
lrwxrwxrwx 1 root root 7 Oct 31 15:02 /dev/sda1 -> nvme0n1

# dmesg | grep '63\.'
[ 63.401466] nvme 0000:00:1f.0: I/O 0 QID 0 timeout, disable controller
[ 63.505790] nvme 0000:00:1f.0: Cancelling I/O 0 QID 0
[ 63.505812] nvme 0000:00:1f.0: Identify Controller failed (-4)
[ 63.507536] nvme 0000:00:1f.0: Removing after probe failure
[ 63.507604] iounmap: bad address ffffc90001b40000
[ 63.508941] CPU: 1 PID: 351 Comm: kworker/1:3 Tainted: P O 4.4.0-1069-aws #79-Ubuntu
[ 63.508943] Hardware name: Amazon EC2 m5.xlarge/, BIOS 1.0 10/16/2017
[ 63.508948] Workqueue: events nvme_remove_dead_ctrl_work [nvme]
[ 63.508950] 0000000000000286 3501e2639044a4d2 ffff8800372bfce0 ffffffff923ffe03
[ 63.508952] ffff88040dd878f0 ffffc90001b40000 ffff8800372bfd00 ffffffff9206d3af
[ 63.508954] ffff88040dd878f0 ffff88040dd87a58 ffff8800372bfd10 ffffffff9206d3ec
[ 63.508956] Call Trace:
[ 63.508961] [<ffffffff923ffe03>] dump_stack+0x63/0x90
[ 63.508965] [<ffffffff9206d3af>] iounmap.part.1+0x7f/0x90
[ 63.508967] [<ffffffff9206d3ec>] iounmap+0x2c/0x30
[ 63.508969] [<ffffffffc039abfa>] nvme_dev_unmap.isra.35+0x1a/0x30 [nvme]
[ 63.508972] [<ffffffffc039bd1e>] nvme_remove+0xce/0xe0 [nvme]
[ 63.508976] [<ffffffff92441e0e>] pci_device_remove+0x3e/0xc0
[ 63.508980] [<ffffffff9254f654>] __device_release_driver+0xa4/0x150
[ 63.508982] [<ffffffff9254f723>] device_release_driver+0x23/0x30
[ 63.508986] [<ffffffff9243abda>] pci_stop_bus_device+0x7a/0xa0
[ 63.508988] [<ffffffff9243ad3a>] pci_stop_and_remove_bus_device_locked+0x1a/0x30
[ 63.508990] [<ffffffffc039a62c>] nvme_remove_dead_ctrl_work+0x3c/0x50 [nvme]
[ 63.508994] [<ffffffff9209d86b>] process_one_work+0x16b/0x490
[ 63.508996] [<ffffffff9209dbdb>] worker_thread+0x4b/0x4d0
[ 63.508998] [<ffffffff9209db90>] ? process_one_work+0x490/0x490
[ 63.509001] [<ffffffff920a3e47>] kthread+0xe7/0x100
[ 63.509005] [<ffffffff92823301>] ? __schedule+0x301/0x7f0
[ 63.509007] [<ffffffff920a3d60>] ? kthread_create_on_node+0x1e0/0x1e0
[ 63.509009] [<ffffffff92827e35>] ret_from_fork+0x55/0x80
[ 63.509011] [<ffffffff920a3d60>] ? kthread_create_on_node+0x1e0/0x1e0
[ 63.509013] Trying to free nonexistent resource <00000000febf8000-00000000febfbfff>

# modinfo nvme
filename: /lib/modules/4.4.0-1069-aws/kernel/drivers/nvme/host/nvme.ko
version: 1.0
license: GPL
author: Matthew Wilcox <email address hidden>
srcversion: 5CF522443B009A8675C497B
alias: pci:v0000106Bd00002001sv*sd*bc*sc*i*
alias: pci:v*d*sv*sd*bc01sc08i02*
alias: pci:v0000144Dd0000A822sv*sd*bc*sc*i*
alias: pci:v0000144Dd0000A821sv*sd*bc*sc*i*
alias: pci:v00001C58d00000003sv*sd*bc*sc*i*
alias: pci:v00008086d00005845sv*sd*bc*sc*i*
alias: pci:v00008086d0000F1A5sv*sd*bc*sc*i*
alias: pci:v00008086d00000953sv*sd*bc*sc*i*
depends:
retpoline: Y
intree: Y
vermagic: 4.4.0-1069-aws SMP mod_unload modversions retpoline
parm: admin_timeout:timeout in seconds for admin commands (uint)
parm: io_timeout:timeout in seconds for I/O (uint)
parm: shutdown_timeout:timeout in seconds for controller shutdown (byte)
parm: use_threaded_interrupts:int
parm: use_cmb_sqes:use controller's memory buffer for I/O SQes (bool)
parm: nvme_major:int
parm: nvme_char_major:int
parm: default_ps_max_latency_us:max power saving latency for new devices; use PM QOS to change per device (ulong)

# systool -m nvme -va
Module = "nvme"

  Attributes:
    coresize = "65536"
    initsize = "0"
    initstate = "live"
    refcnt = "1"
    srcversion = "5CF522443B009A8675C497B"
    taint = ""
    uevent = <store method only>
    version = "1.0"

  Parameters:
    admin_timeout = "60"
    default_ps_max_latency_us= "100000"
    io_timeout = "4294967295"
    shutdown_timeout = "5"
    use_cmb_sqes = "Y"

  Sections:
    .bss = "0xffffffffc03a3780"
    .data = "0xffffffffc03a3000"
    .data.unlikely = "0xffffffffc03a33d8"
    .exit.text = "0xffffffffc03a0cea"
    .gnu.linkonce.this_module= "0xffffffffc03a3400"
    .init.text = "0xffffffffc03a8000"
    .note.gnu.build-id = "0xffffffffc03a1000"
    .parainstructions = "0xffffffffc03a1b88"
    .rodata = "0xffffffffc03a1060"
    .rodata.str1.1 = "0xffffffffc03a2349"
    .rodata.str1.8 = "0xffffffffc03a1d78"
    .smp_locks = "0xffffffffc03a1b28"
    .strtab = "0xffffffffc03abb08"
    .symtab = "0xffffffffc03a9000"
    .text = "0xffffffffc0397000"
    __bug_table = "0xffffffffc03a2be0"
    __kcrctab_gpl = "0xffffffffc03a1040"
    __ksymtab_gpl = "0xffffffffc03a1030"
    __ksymtab_strings = "0xffffffffc03a25d3"
    __mcount_loc = "0xffffffffc03a2730"
    __param = "0xffffffffc03a25f0"