Comment 13 for bug 1817713

Revision history for this message
In , bugproxy (bugproxy-redhat-bugs) wrote :

------- Comment From <email address hidden> 2017-10-31 09:49 EDT-------
Going deeper in the rabbit hole from IBM Comment 27 / RH Comment 12:

============================
grub-core/kern/disk.c +187
============================
187 grub_disk_t
188 grub_disk_open (const char *name)
189 {
...
224 for (dev = grub_disk_dev_list; dev; dev = dev->next)
225 {
226 if ((dev->open) (raw, disk) == GRUB_ERR_NONE)
227 break;
228 else if (grub_errno == GRUB_ERR_UNKNOWN_DEVICE)
229 grub_errno = GRUB_ERR_NONE;
230 else
231 goto fail;
232 }
233
234 if (! dev)
235 {
236 grub_error (GRUB_ERR_UNKNOWN_DEVICE, N_("disk `%s' not found"),
237 name);
238 goto fail;
239 }
============================

Using --metadata=1.0, `dev` comes out NULL after the for loop on line 224, whereas with 0.90 it is set to a valid device. Moreover, the grub_error() message on line 236 is the one being printed by grub2-probe.

=====
FAILURE grub2-probe - RAID using --metadata=1.0
=====
Breakpoint 1, grub_disk_open (name=0x10185290 "mduuid/0ef5c3920edae097657894d84aef753d") at grub-core/kern/disk.c:234
234 if (! dev)
(gdb) print dev
$1 = (grub_disk_dev_t) 0x0
(gdb) print *dev
Cannot access memory at address 0x0
(gdb) s
236 grub_error (GRUB_ERR_UNKNOWN_DEVICE, N_("disk `%s' not found"),
=====

=====
SUCCESS grub2-probe - RAID using --metadata=0.90
=====
Breakpoint 1, grub_disk_open (name=0x10185830 "mduuid/ebae38d5105eed037b777c24c48ad94b") at grub-core/kern/disk.c:234
234 if (! dev)
(gdb) print dev
$1 = (grub_disk_dev_t) 0x10165e80 <grub_diskfilter_dev>
(gdb) print *dev
$2 = {
name = 0x10146d50 "diskfilter",
id = GRUB_DISK_DEVICE_DISKFILTER_ID,
iterate = 0x101107cc <grub_diskfilter_iterate>,
open = 0x10110fd8 <grub_diskfilter_open>,
close = 0x10111120 <grub_diskfilter_close>,
read = 0x1011220c <grub_diskfilter_read>,
write = 0x1011227c <grub_diskfilter_write>,
memberlist = 0x10110950 <grub_diskfilter_memberlist>,
raidname = 0x10110df4 <grub_diskfilter_getname>,
next = 0x10165fb8 <grub_procfs_dev>
}
(gdb) s
240 if (disk->log_sector_size > GRUB_DISK_CACHE_BITS + GRUB_DISK_SECTOR_BITS
=====

Since `dev` is shared among the various device types in grub, and grub_disk_dev is effectively a C-style interface (a struct of function pointers), each device type provides its own implementations. In our case we are dealing with grub_disk_dev_t, and through gdb we can see that dev->open() on line 226 is actually grub_diskfilter_open():

============================
grub-core/disk/diskfilter.c +419
============================
419 static grub_err_t
420 grub_diskfilter_open (const char *name, grub_disk_t disk)
421 {
422 struct grub_diskfilter_lv *lv;
423
424 if (!is_valid_diskfilter_name (name))
425 return grub_error (GRUB_ERR_UNKNOWN_DEVICE, "unknown DISKFILTER device %s",
426 name);
427
428 lv = find_lv (name);
429
430 if (! lv)
431 {
432 scan_devices (name);
433 if (grub_errno)
434 {
435 grub_print_error ();
436 grub_errno = GRUB_ERR_NONE;
437 }
438 lv = find_lv (name);
439 }
440
441 if (!lv)
442 return grub_error (GRUB_ERR_UNKNOWN_DEVICE, "unknown DISKFILTER device %s",
443 name);
444
445 disk->id = lv->number;
446 disk->data = lv;
447
448 disk->total_sectors = lv->size;
449 disk->max_agglomerate = GRUB_DISK_MAX_MAX_AGGLOMERATE;
450 return 0;
============================

The is_valid_diskfilter_name() check on line 424 passes for both metadata versions 0.90 and 1.0.

However, if we break on line 441, a strange thing to note here - using --metadata=1.0 all my disk devices pass through the breakpoint, whereas using 0.90 only the raid device passes through the breakpoint on line 441.

=====
FAILURE grub2-probe - RAID using --metadata=1.0
=====
Breakpoint 1, grub_diskfilter_open (name=0x10185b90 "lvm/rhel-root", disk=0x101827d0) at grub-core/disk/diskfilter.c:441
441 if (!lv)
(gdb) c
Continuing.

Breakpoint 1, grub_diskfilter_open (name=0x101859f0 "lvm/rhel-home", disk=0x101827d0) at grub-core/disk/diskfilter.c:441
441 if (!lv)
(gdb) c
Continuing.

Breakpoint 1, grub_diskfilter_open (name=0x101857e0 "lvm/rhel-swap", disk=0x101827d0) at grub-core/disk/diskfilter.c:441
441 if (!lv)
(gdb) c
Continuing.

...

Breakpoint 2, grub_diskfilter_open (name=0x10185290 "mduuid/0ef5c3920edae097657894d84aef753d", disk=0x10182780) at grub-core/disk/diskfilter.c:441
441 if (!lv)
(gdb) print lv
$2 = (struct grub_diskfilter_lv *) 0x0
(gdb) print *lv
Cannot access memory at address 0x0
(gdb) s
442 return grub_error (GRUB_ERR_UNKNOWN_DEVICE, "unknown DISKFILTER device %s",
=====

=====
SUCCESS grub2-probe - RAID using --metadata=0.90
=====
Breakpoint 1, grub_diskfilter_open (name=0x10185830 "mduuid/c5e0adca3d6a76ef7b777c24c48ad94b", disk=0x101834a0) at grub-core/disk/diskfilter.c:441
441 if (!lv)
(gdb) print lv
$1 = (struct grub_diskfilter_lv *) 0x10183690
(gdb) print *lv
$2 = {
fullname = 0x10183580 "md/md1",
idname = 0x10183700 "mduuid/c5e0adca3d6a76ef7b777c24c48ad94b",
name = 0x10183580 "md/md1",
number = 0,
segment_count = 1,
segment_alloc = 0,
size = 20969344,
became_readable_at = 1,
scanned = 0,
visible = 1,
segments = 0x10183730,
vg = 0x10183530,
next = 0x0,
internal_id = 0x0
}
(gdb) s
445 disk->id = lv->number;
=====

So, `lv` on line 441 is coming out NULL. Let's back up a bit and break on line 430:

=====
FAILURE grub2-probe - RAID using --metadata=1.0
=====
Breakpoint 2, grub_diskfilter_open (name=0x10185290 "mduuid/2872dd311d2585e4690defc1d9ba07a7", disk=0x10182780) at grub-core/disk/diskfilter.c:430
430 if (! lv)
(gdb) print lv
$2 = (struct grub_diskfilter_lv *) 0x0
(gdb) print *lv
Cannot access memory at address 0x0
(gdb) c
Continuing.

...

Breakpoint 2, grub_diskfilter_open (name=0x10185b90 "lvm/rhel-root", disk=0x101827d0) at grub-core/disk/diskfilter.c:430
430 if (! lv)
(gdb) c
Continuing.

Breakpoint 2, grub_diskfilter_open (name=0x101859f0 "lvm/rhel-home", disk=0x101827d0) at grub-core/disk/diskfilter.c:430
430 if (! lv)
(gdb) c
Continuing.

Breakpoint 2, grub_diskfilter_open (name=0x101857e0 "lvm/rhel-swap", disk=0x101827d0) at grub-core/disk/diskfilter.c:430
430 if (! lv)
(gdb) c
Continuing.
/usr/sbin/grub2-probe: error: disk `mduuid/2872dd311d2585e4690defc1d9ba07a7' not found.
[Inferior 1 (process 19832) exited with code 01]
=====

=====
SUCCESS grub2-probe - RAID using --metadata=0.90
=====
Breakpoint 1, grub_diskfilter_open (name=0x10185830 "mduuid/a2f06ca0ad6cedbc7b777c24c48ad94b", disk=0x101834a0) at grub-core/disk/diskfilter.c:430
430 if (! lv)
(gdb) print lv
$1 = (struct grub_diskfilter_lv *) 0x10183690
(gdb) print *lv
$2 = {
fullname = 0x10183580 "md/md1",
idname = 0x10183700 "mduuid/a2f06ca0ad6cedbc7b777c24c48ad94b",
name = 0x10183580 "md/md1",
number = 0,
segment_count = 1,
segment_alloc = 0,
size = 20969344,
became_readable_at = 1,
scanned = 0,
visible = 1,
segments = 0x10183730,
vg = 0x10183530,
next = 0x0,
internal_id = 0x0
}
=====

Thus, apparently the culprit is now hiding in find_lv():

============================
grub-core/disk/diskfilter.c +401
============================
401 static struct grub_diskfilter_lv *
402 find_lv (const char *name)
403 {
404 struct grub_diskfilter_vg *vg;
405 struct grub_diskfilter_lv *lv = NULL;
406
407 for (vg = array_list; vg; vg = vg->next)
408 {
409 if (vg->lvs)
410 for (lv = vg->lvs; lv; lv = lv->next)
411 if (((lv->fullname && grub_strcmp (lv->fullname, name) == 0)
412 || (lv->idname && grub_strcmp (lv->idname, name) == 0))
413 && is_lv_readable (lv, 0))
414 return lv;
415 }
416 return NULL;
417 }
============================

=====
FAILURE grub2-probe - RAID using --metadata=1.0
=====
Breakpoint 1, find_lv (name=0x10185290 "mduuid/e5cf979ce818f58cc57b618bd78b4b86") at grub-core/disk/diskfilter.c:407
407 for (vg = array_list; vg; vg = vg->next)
(gdb) print array_list
$1 = (struct grub_diskfilter_vg *) 0x0
(gdb) print *array_list
Cannot access memory at address 0x0
(gdb) s
416 return NULL;
=====

=====
SUCCESS grub2-probe - RAID using --metadata=0.90
=====
Breakpoint 5, find_lv (name=0x10185830 "mduuid/a2f06ca0ad6cedbc7b777c24c48ad94b") at grub-core/disk/diskfilter.c:407
407 for (vg = array_list; vg; vg = vg->next)
(gdb) print array_list
$5 = (struct grub_diskfilter_vg *) 0x10183530
(gdb) print *array_list
$6 = {
uuid = 0x10183300 "\242\360l\240\255l\355\274{w|$?\331Ke/en_US.!",
uuid_len = 16,
name = 0x10183580 "md/md1",
extent_size = 1,
pvs = 0x10186090,
lvs = 0x10183690,
next = 0x0,
driver = 0x10160228 <grub_mdraid_dev>
}
(gdb) s
409 if (vg->lvs)
=====

Therefore, at this point we can infer that with RAID 1 using --metadata=1.0 on 4k-blocksize disks, array_list is never populated (it stays NULL), so find_lv() returns NULL and every failure downstream follows from that.

Riddle me this: why?

More gdb to come ... apparently array_list is populated by grub_diskfilter_vg_register() at grub-core/disk/diskfilter.c +838.

Will look into that next, and eventually on mdadm.