diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.0 mdadm-3.1.4/ANNOUNCE-2.0 --- mdadm-2.6.7.1/ANNOUNCE-2.0 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.0 1970-01-01 02:00:00.000000000 +0200 @@ -1,41 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.0 - A tool for managing Soft RAID under Linux - -I am (at last) please to announce the availability of - mdadm version 2.0 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - http://www.{countrycode}.kernel.org/pub/linux/utils/raid/mdadm/ - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.0 contains a substantial rewrite of various pieces of -functionality, particularly the --create option. This enables support -for a new style of superblock - the version-1 superblock. -Version-1 can support many more than 28 devices and can be easily -moved between hosts with different endian-ness. -Release 2.0 also contains support for the recent bitmap-intent-logging -which will appear in 2.6.13, and the RAID1 write-behind that will be -available in 2.6.14 (it is currently only in -mm). - -This release comes with a test-suite which has been used to verify that -mdadm-2.0 actually works in a number of common scenarios. Some of the -tests require a bleeding-edge kernel, so don't be surprised if some fail -on kernels prior to 2.6.14. - -Being a '.0' release, 2.0 should be treated with some caution. -However I believe it is quite stable and can safely be used on -production systems. - -Note that this release is "2.0", not "2.0.0". I never found a use for -the third number with 1.X.Y, so I dropped it. - - -Development of mdadm has moved from CSE@UNSW and is now sponsored by - SUSE Labs, Novell Inc. 
- -NeilBrown 26th August 2005 - diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.1 mdadm-3.1.4/ANNOUNCE-2.1 --- mdadm-2.6.7.1/ANNOUNCE-2.1 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.1 1970-01-01 02:00:00.000000000 +0200 @@ -1,37 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.1 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.1 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - http://www.{countrycode}.kernel.org/pub/linux/utils/raid/mdadm/ - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.1 fixes a few problems found after the release of 2.0. -If you are using raid10 or version-1 superblocks and upgrade is -recommended. - -Specifically: - - Fix assembling of raid10 array when devices are missing. - mdadm now correctly detects if a array is workable or not - depending on which devices are present, and so will correctly - handle "--assemble --force" if multiple devices have failed. - - Report raid10 layout in --examine output. - - Fix assembling of arrays that use the version-1 superblock and - have spares. Previously the spares would be ignored. - - Fix bug so that multiple drives can be re-added at once. - - Fix problem with hot-adding a bitmap to version-1-superblock - arrays. - - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. 
- -NeilBrown 12th September 2005 - - diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.2 mdadm-3.1.4/ANNOUNCE-2.2 --- mdadm-2.6.7.1/ANNOUNCE-2.2 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.2 1970-01-01 02:00:00.000000000 +0200 @@ -1,50 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.2 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.2 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - http://www.{countrycode}.kernel.org/pub/linux/utils/raid/mdadm/ - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.2 fixes a few small bugs and add as few small elements of -functionality. Possibly the most interesting is the addition of -'README.initramfs' and 'mkinitramfs'. Feedback on these would be -most welcome. - -Changelog Entries: - - Assorted bug fixes - - Support write-intent-bitmaps on raid10 - - Support little-endian (Rather than hostendian) bitmaps. - - Return correct error code from 'mdadm -S' - - Remove extra blank line from 'mdadm -Eb' output. - - Improve option parsing so that -a and -b do not have - optional arguements: the arg is either required or not - depending on context. - - Allow scanning of devices listed in /proc/partitions even - if they don't appear in /dev. - - Support --assume-clean in --create mode as well as --build - - Add support for --monitor to report to syslog: -y or --syslog. - Thanks to Ross Vandegrift - - --monitor now reports which device failed in a 'Fail' message - This broke with 2.6 - - Improve chance of array starting properly after a crash. - mdadm was insisting the event numbers were identical, but this - isn't needed, and is a problem if the crash was while the metadata - was being updated. - - Support --update==uuid - - Added README.initramfs and mkinitramfs to help people use an - initram for starting md arrays at boot. 
- - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 5th December 2005 - - diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.3 mdadm-3.1.4/ANNOUNCE-2.3 --- mdadm-2.6.7.1/ANNOUNCE-2.3 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.3 1970-01-01 02:00:00.000000000 +0200 @@ -1,49 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.3 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.3 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - http://www.{countrycode}.kernel.org/pub/linux/utils/raid/mdadm/ - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.3 fixes an assortment of bugs including the "--assemble --scan" -crash. It also makes mdadm better at handling very large arrays. -Finally, it adds some functionality to support some pending kernel -features such as raid5 reshaping. - -Changelog Entries: - - Try /etc/mdadm/mdadm.conf if /etc/mdadm.conf doesn't exist. - This provided compatability for Debian. - - Fixed for version-1 superblock: - report chunksize for raid6 and raid10 - make sure device size used is a multiple of chunksize - - Fix "--assemble --scan" crash. - - Fix completely failure to create array on ppc64 - - Fix memcmp in place of memcpy - - A few minor improvements to online help - - Clean up usage of 'long long' for used-size of devices, so - that it is possible to create a raid1 of 7TB devices! - - Make internal bitmaps work on 7TB raid1 arrays. - - Provide error message if --examine doesn't find any superblock. - - Report 'reshape' status in --examine - this depends on kernel - patches that are not yet finalised. - - Report bitmap status in --detail and --examine - - Default to v1 superblocks instead of v0.90 if the array - is too big for 0.90 to handle. 
- - Sort the output of "mdadm --detail --scan" so that it is - in a suitable order for assembling arrays. i.e. components come - before an array that they are part of. - - Print size of large reiserfs array properly went warning of - possible confilcts. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 2nd February 2006 - diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.3.1 mdadm-3.1.4/ANNOUNCE-2.3.1 --- mdadm-2.6.7.1/ANNOUNCE-2.3.1 2006-02-06 06:11:49.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-2.3.1 1970-01-01 02:00:00.000000000 +0200 @@ -1,31 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.3.1 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.3.1 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - http://www.{countrycode}.kernel.org/pub/linux/utils/raid/mdadm/ - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.3.1 primarily fixes a few compile problems with 2.3. Though -there are some functionality changes, they are extremely minor and you -probably won't notice them. - -Changelog Entries: - - Fixed -O2 compile so I could make and RPM. - - Type cast number to be printed %llu so it compiles on 64bit - machines. (Thanks Luca). - - Stop using asm/byteorder.h - to make Redhat happy :-( - - Require bitmap files to have a '/' in their name. - - Error-check a few syscalls - code from SuSE package. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. 
- -NeilBrown 6th February 2006 - diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.4 mdadm-3.1.4/ANNOUNCE-2.4 --- mdadm-2.6.7.1/ANNOUNCE-2.4 2006-03-30 07:24:09.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.4 1970-01-01 02:00:00.000000000 +0200 @@ -1,45 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.4 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.4 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - http://www.{countrycode}.kernel.org/pub/linux/utils/raid/mdadm/ - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.4 primarily adds support for increasing the number of -devices in a RAID5 array, which requires 2.6.17 (or some -rc or -mm -prerelease). -It also includes a number of minor functionality enhancements and -documentation updates. - -Changelog Entries: - - Rewrite 'reshape' support including performing a backup - of the critical region for a raid5 growth, and restoring that - backup after a crash. - - Put a 'canary' at each end of the backup so a corruption - can be more easily detected. - - Remove useless 'ident' arguement from ->getinfo_super method. - - Support --backup-file for backing-up critical section during - growth. - - Erase old superblocks (of different versions) when creating new - array. - - Allow --monitor to work with arrays with >28 devices - - Report reshape information in --detail - - Handle symlinks in /dev better - - Fix mess in --detail output which a device is missing. - - Manpage tidyup - - Support 'bitmap=' in mdadm.conf for auto-assembling arrays with - write-intent bitmaps in separate files. - - Updates to md.4 man page including section on RESTRIPING and SYSFS - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. 
- -NeilBrown 30th March 2006 - diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.4.1 mdadm-3.1.4/ANNOUNCE-2.4.1 --- mdadm-2.6.7.1/ANNOUNCE-2.4.1 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.4.1 1970-01-01 02:00:00.000000000 +0200 @@ -1,31 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.4.1 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.4.1 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - http://www.{countrycode}.kernel.org/pub/linux/utils/raid/mdadm/ - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -2.4.1 primarily fixes an alignment problem in the version-1 -superblock. This is an incompatible change relating to -raid5-reshape. Read the change log below. - -Changelog Entries: - - Honour --write-mostly when adding to an array without persistent - superblocks. - - Fix alignment problem in version-1 superblocks. - NOTE: This is an incompatable change affecting raid5 reshape. - If you want to reshape a raid5 using version-1 superblocks, - use 2.6.17-rc2 or later, and mdadm-2.4.1 or later. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 7th April 2006 - diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.4-pre1 mdadm-3.1.4/ANNOUNCE-2.4-pre1 --- mdadm-2.6.7.1/ANNOUNCE-2.4-pre1 2006-03-20 05:34:49.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-2.4-pre1 1970-01-01 02:00:00.000000000 +0200 @@ -1,24 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.4-pre1 - development release for testing only - -I am somewhat cautious in announcing the availability of - mdadm version 2.4-pre1 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - http://www.{countrycode}.kernel.org/pub/linux/utils/raid/mdadm/ - -pre-release 2.4-pre1 contains support for growing a raid5 array i.e. -adding extra drives and restriping the whole arrays. 
- -This requires a kernel more recent than 2.6.16-rc6-mm2. - -During a restripe, there is a crtical section at the start where a -system crash can leave the contents of the raid5 unrecoverable. This -version of mdadm will backup that section, and can restore it when -reassembling the array after a crash. - -Have fun, but use it carefully and report any problems. - -NeilBrown 20th March 2006 - diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.5 mdadm-3.1.4/ANNOUNCE-2.5 --- mdadm-2.6.7.1/ANNOUNCE-2.5 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.5 1970-01-01 02:00:00.000000000 +0200 @@ -1,67 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.5 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.5 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - http://www.{countrycode}.kernel.org/pub/linux/utils/raid/mdadm/ - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.5 adds a host of minor updates and one major update. - -The major update involves an "Auto Assemble" function which will, -with certainly limits, scan all available devices for anything that -looks like an md array, and will try to assemble it. -This code should be treated with some caution as it is very new, -and could be revised in future, though hopefully not very much. - -The main problem I have always had will auto-assembly is that it -is too easy for it to assemble thing that you don't want assembled, -e.g. if you have moved some devices from a different computer. -To remove this problem, arrays can now be tagged for the computer -that is their home (homehost) and only arrays with the correct -homehost will be auto-assembled. - -Feedback on the effectiveness and usefulness of this feature and it's -documentation is encouraged. 
- -Changelog Entries: - - Support 'mailfrom' line in mdadm.conf so the From: line in alert - emails can be explicitly set. - - Arrange that SparesMissing (which is similar in import to - DegradedArray) generates an Email. - - Assume "DEVICE partitions" if no DEVICE line is given. - - Support new 'offset' layout for raid10. - - When creating a bitmap file, choose a chunksize to limit number - of bitmap chunks to 2 million. More than this can cause kmalloc - failure. - - New 'CREATE' line in mdadm.conf for defaults such as owner, group, - mode and auto-flag - - --detail checks if array has been started or not and includes that - in report. - - When using --update=uuid on an array with a bitmap, update the - bitmap's uuid too. - - Add a copy of /proc/mdstat to the mail message sent by mdadm - --monitor. - - New flag --no-degraded to avoid starting arrays if there are - fewer devices available than last time the array was started. - This is only needed with --scan, as with --scan, that behaviour - is the default. - - Support for 'homehost' concept. This is a fairly major update. - It includes a configfile option and a command line option for - specifying a homehost, records that host in the superblock, - and reports the homehost where possible. - - Support for Auto Assembly. "mdadm -As" will, if provided with - the name of a homehost, try to assemble all arrays it can find - that were created for that homehost. See man pages for more details. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. 
- -NeilBrown 26th May 2006 - diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.5.1 mdadm-3.1.4/ANNOUNCE-2.5.1 --- mdadm-2.6.7.1/ANNOUNCE-2.5.1 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.5.1 1970-01-01 02:00:00.000000000 +0200 @@ -1,40 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.5.1 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.5.1 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - http://www.{countrycode}.kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/git/mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.5.1 adds multiple minor updates including a couple of bugfixes. - -Changelog Entries: - - Various fixes for gcc warnings - - uclibc warnings - - Makefile improvements for static linking/intalling - - Makefile uninstall target - - Really fix return status of --examine - - Typos - - Byteorder stuff (again) - - Don't try to create devices with --manage or --grow - - allow default metadata (superblock) type to be specified - in mdadm.conf - - Get --stop to list devices stopped but honour --quiet - - remove libssl dependency - - Avoid some misdetection of overlapping partitions - - Fix memory leak in --monitor mode - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 16th June 2006 - - diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.5.2 mdadm-3.1.4/ANNOUNCE-2.5.2 --- mdadm-2.6.7.1/ANNOUNCE-2.5.2 2006-06-27 13:54:54.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.5.2 1970-01-01 02:00:00.000000000 +0200 @@ -1,45 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.5.2 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.5.2 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - countrycode=xx. 
- http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.5.2 is primarily a bugfix release over 2.5.1. -It also contains a work-around for a kernel bug which affects -hot-adding to arrays with a version-1 superblock. - -Changelog Entries: - - Fix problem with compiling with gcc-2 compilers - - Fix compile problem of post-incrmenting a variable in a macro arg. - - Stop map_dev from returning [0:0], as that breaks things. - - Add 'Array Slot' line to --examine for version-1 superblocks - to make it a bit easier to see what is happening. - - Work around bug in --add handling for version-1 superblocks - in 2.6.17 (and prior). - - Make -assemble a bit more resilient to finding strange - information in superblocks. - - Don't claim newly added spares are InSync!! (don't know why that - code was ever in there) - - Work better when no 'ftw' is available, and check to see - if current uclibc provides ftw. - - Never use /etc/mdadm.conf if --config file is given (previously - some code used one, some used the other). - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 27th June 2006 - - diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.5.3 mdadm-3.1.4/ANNOUNCE-2.5.3 --- mdadm-2.6.7.1/ANNOUNCE-2.5.3 2006-08-07 04:37:56.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.5.3 1970-01-01 02:00:00.000000000 +0200 @@ -1,33 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.5.3 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.5.3 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - countrycode=xx. 
- http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.5.3 is a bugfix release over 2.5.2. - -Changelog Entries: - - Document v0.91 superblocks in md.4 - - Make GPL explicit in man pages. - - Fix recent breakage of starting degraded arrays. - - Tidyup automatic name choice for v-1 arrays: - /dev/md_d0 now becomes '0', not '_d0'. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 7th August 2006 - - diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.5.4 mdadm-3.1.4/ANNOUNCE-2.5.4 --- mdadm-2.6.7.1/ANNOUNCE-2.5.4 2006-10-13 02:58:03.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.5.4 1970-01-01 02:00:00.000000000 +0200 @@ -1,38 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.5.4 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.5.4 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - countrycode=xx. - http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.5.4 is a bugfix release over 2.5.3 - -Changelog Entries: - - When creating devices in /dev/md/ create matching symlinks - from /dev. e.g. /dev/md0 -> /dev/md/0. - Allow this to be disabled in mdadm.conf or on command line. - - Fix some endian-ness issues with version-1 superblocks (affects - bigendian only). - - Fix endian problem with 'bitmap' metadata - - Allow a number (of partitions) after the 'yes' option to --auto= - This is particularly useful in the 'create' line in mdadm.conf. 
- - Remove partitions from any whole device that is made part of - an md array. This is a work-around for annoying messages - when the first block on some drive accidentally looks like a - partition table. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 13th October 2006 diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.5.5 mdadm-3.1.4/ANNOUNCE-2.5.5 --- mdadm-2.6.7.1/ANNOUNCE-2.5.5 2006-10-23 08:53:27.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.5.5 1970-01-01 02:00:00.000000000 +0200 @@ -1,43 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.5.5 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.5.5 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - countrycode=xx. - http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.5.5 is a bugfix release over 2.5.4. -Hopefully the last before 2.6. - -Changelog Entries: - - Don't #include linux/blkpg.h as that isn't safe. Just - include the content literally. - - Reduce maximum bitmap usage when working with bitmap files, - so that a only single-page allocations are made, even on - 64bit hosts with 4K pages. - - Close stray fd in mdassemble so that it can assemble stacked - devices - - If mdassemble finds an array already assembled, it marks it - read-write. - - Remove error in md_open if array is already active. This isn't - needed and gets in the ways if an array was created e.g. in - initramfs, but device doesn't yet exist in /dev. - - When --assemble --scan is run, if all arrays that could be found - have already been started, don't report an error. - - Fix a couple of bugs related to raid10 and the new 'offset' layout. 
- - Improve error message when a wrong '--update' option is given. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 23rd October 2006 diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.5.6 mdadm-3.1.4/ANNOUNCE-2.5.6 --- mdadm-2.6.7.1/ANNOUNCE-2.5.6 2006-11-09 00:58:59.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-2.5.6 1970-01-01 02:00:00.000000000 +0200 @@ -1,32 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.5.6 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.5.6 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - countrycode=xx. - http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.5.6 is a bugfix release over 2.5.5, which it turns out -wasn't the last before 2.6. Maybe this one? - -Changelog Entries: - - Fix bug which meant "bitmap=xxx" in mdadm.conf was not handled - properly. - - Documentation updates. - - Fix bug that caused infinite loop when doing auto-assembly, - in certain cases where arrays couldn't be assembled. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 6th November 2006 diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.6 mdadm-3.1.4/ANNOUNCE-2.6 --- mdadm-2.6.7.1/ANNOUNCE-2.6 2006-12-21 08:11:48.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-2.6 1970-01-01 02:00:00.000000000 +0200 @@ -1,76 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.6 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.6 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - countrycode=xx. 
- http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.6 adds assorted fixes and improvements and a new major mode. -"Incremental Assembly" via -I or --incremental can be used to -assemble an array one device at a time. The idea is that you get -udev to run "mdadm -Iq devicename" on each new block device that it -finds. Anything that is part of an array gets included in an array as -appropriate. -Two special notes: - 1/ This is very new code and is probably buggy. It passes a few basic - tests, and helped me find some kernel bugs, but it is still fresh - and should not be considered 'stable'. Please test and provide - feedback. - 2/ There is a bug in the linux kernel that makes incremental assembly - not possible in general (you cannot safely remove a drive from an array - that has not yet been started. This is needed if an old device was - detected first). If mdadm detects a kernel which might have the - bug, it rejects --incremental requests. - The bug will hopefully be fixed in 2.6.20 and this mdadm release - contains patches for 2.6.18, 2.6.18.6 and 2.6.19. Apply the - appropriate patch to test --incremental. - -Changelog Entries: - - Fixed UUID printing in "--detail --brief" for version1 metadata. - - --update=resync did exactly the wrong thing for version1 metadata. - It caused a resync to not happen, rather than to happen. - - Allow --assemble --force to mark a raid6 clean when it has two - missing devices (which is needed else if won't assemble. - Without this fix it would only assemble if one or zero - missing devices. - - Support --update=devicesize for cases where the underlying device - can change size. 
- - Default to --auto=yes so the array devices with 'standard' names - get created automatically, as this is almost always what is wanted. - - Give useful message if raid4/5/6 cannot be started because it is - not clean and is also degraded. - - Increase raid456 stripe cache size if needed to --grow the array. - The setting used unfortunately requires intimate knowledge of the - kernel, and it not reset when the reshape finishes. - - Change 'Device Size' to 'Used Dev Size' because it only shows how - much of each device is actually used, not how big they are. - - --wait or -W will wait for resync activity to finish on the given - devices. - - Fix some problems with --update=uuid and add a test. - - If two drives in a raid5 disappear at the same time, then "-Af" - will add them both in rather than just one and forcing the array - to 'clean'. This is slightly safer in some cases. - - Check device is large enough before hot-add: this improves quality - of error message. - - Don't hold md device open for so long in --monitor mode - map_dev - can be slow and interferes with trying to stop the array. - - Support --uuid= with --create to choose your own UUID. - - New major more "--incremental" for incremental assemble of arrays, - intended for use with udev. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 21st December 2006 -Blessed Christmas to all. diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.6.1 mdadm-3.1.4/ANNOUNCE-2.6.1 --- mdadm-2.6.7.1/ANNOUNCE-2.6.1 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.6.1 1970-01-01 02:00:00.000000000 +0200 @@ -1,33 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.6.1 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.6.1 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - countrycode=xx. 
- http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.6.1 adds a few minor bug fixes to 2.6 including improved -support for growing a RAID6 array. - -Changelog Entries: - - --monitor was producing some meaningless warnings due to a bug. - - Fix some compiler warnings. - - Fully support --grow for raid6. If a reshape crashed during the - critical period, mdadm wouldn't restore the Q information - properly. - - Update documentation for --grow. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 22nd February 2007 diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.6.2 mdadm-3.1.4/ANNOUNCE-2.6.2 --- mdadm-2.6.7.1/ANNOUNCE-2.6.2 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.6.2 1970-01-01 02:00:00.000000000 +0200 @@ -1,37 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.6.2 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.6.2 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - countrycode=xx. - http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.6.2 adds a few minor bug fixes to 2.6.1 - -Changelog Entries: - - --fail detached and --remove faulty can be used to fail and - remove devices that are no longer physically present. - - --export option for --detail or present information in a format - that can be processed by udev. - - fix internal bitmap allocation problems with v1.1, v1.2 metadata. - - --help now goes to stdout so you can direct it to a pager. 
- - Various manpage updates. - - Make "--grow --add" for linear arrays really work. - - --auto-detect to trigger in-kernel autodetect. - - Make return code for "--detail --test" more reliable. Missing - devices as well as failed devices cause an error. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 21st May 2007 diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.6.3 mdadm-3.1.4/ANNOUNCE-2.6.3 --- mdadm-2.6.7.1/ANNOUNCE-2.6.3 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.6.3 1970-01-01 02:00:00.000000000 +0200 @@ -1,44 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.6.3 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.6.3 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - countrycode=xx. - http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.6.3 adds a few minor bug fixes to 2.6.2 - -Changelog Entries: - - allow --write-behind to be set for --grow. - - When adding new disk to an array, don't reserve so much bitmap - space that the disk cannot store the required data. (Needed when - 1.x array was created with older mdadm). - - When adding a drive that was a little too small, we did not get - the correct error message. - - Make sure that if --assemble find an array in the critical region - of a reshape, and cannot find the critical data to restart the - reshape, it gives an error message. - - Fix segfault with '--detail --export' and non-persistent - superblocks. - - Various manpage updates. 
- - Improved 'raid4' support (--assemble, --monitor) - - Option parsing fixes w.r.t -a - - Interpret "--assemble --metadata=1" to allow any version 1.x - metadata, and be more specific in the "metadata=" message printed - with --examine --brief - - Fix spare migration in --monitor. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 20th August 2007 diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.6.4 mdadm-3.1.4/ANNOUNCE-2.6.4 --- mdadm-2.6.7.1/ANNOUNCE-2.6.4 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.6.4 1970-01-01 02:00:00.000000000 +0200 @@ -1,31 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.6.4 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.6.4 - -It is available at the usual places: - http://www.cse.unsw.edu.au/~neilb/source/mdadm/ -and - countrycode=xx. - http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.6.4 adds a few minor bug fixes to 2.6.3 - -Changelog Entries: - - Make "--create --auto=mdp" work for non-standard device names. - - Fix restarting of a 'reshape' if it was stopped in the middle. - - Fix a segfault when using v1 superblock. - - Make --write-mostly effective when re-adding a device to an array. - - Various minor fixes - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. 
- -NeilBrown 19th October 2007 diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.6.5 mdadm-3.1.4/ANNOUNCE-2.6.5 --- mdadm-2.6.7.1/ANNOUNCE-2.6.5 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.6.5 1970-01-01 02:00:00.000000000 +0200 @@ -1,40 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.6.5 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.6.5 - -It is available at the usual places: - countrycode=xx. - http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.6.5 adds a few minor bug fixes to 2.6.4 - -Changelog Entries: - - Avoid segfault when parsing /proc/mdstat with auto-read-only - arrays. - - Fix problem with failing to add devices to v.large (>4TB) arrays, - cause by problems with device-size overflow. - - For v0.90 superblocks, print the 'Events' count as a real count, - not 2 numbers separated by a dot. - - Updates some URLs in the man page. - - Allow creation of a RAID6 with exactly one missing device. - - Use LOG_PID for syslog, so you get the pid of mdadm in the log - files. - - --export now works with --examine too (not just --detail) - - Improve auto-creation of device special file when using - --incremental - - Simple locking for --incremental so mdadm doesn't get confused - when run concurrently with itself. - - Make --incremental cope better with arrays that are being reshaped. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. 
- -NeilBrown 15th May 2007 diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.6.6 mdadm-3.1.4/ANNOUNCE-2.6.6 --- mdadm-2.6.7.1/ANNOUNCE-2.6.6 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.6.6 1970-01-01 02:00:00.000000000 +0200 @@ -1,27 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.6.6 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.6.6 - -It is available at the usual places: - countrycode=xx. - http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.6.6 fixes a compile problem in 2.6.5 and adds a small manpage update. - -Changelog Entries: - - "make everything" now make mdassemble.auto - - fix compile problem with mdassemble.auto - - Update FAQ URLs in man page again. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 19th May 2007 diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.6.7 mdadm-3.1.4/ANNOUNCE-2.6.7 --- mdadm-2.6.7.1/ANNOUNCE-2.6.7 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.6.7 1970-01-01 02:00:00.000000000 +0200 @@ -1,27 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.6.7 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.6.7 - -It is available at the usual places: - countrycode=xx. - http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.6.7 fixes a few bugs in 2.6.5 which 2.6.6 didn't fix. - -Changelog Entries: - - Avoid NULL reference calling free_super and elsewhere. 
- - Remove stray semicolon (Causes compile error with gcc-2.95) - - Fix autoassemble for stack arrays. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 6th June 2008 diff -Nru mdadm-2.6.7.1/ANNOUNCE-2.6.7.1 mdadm-3.1.4/ANNOUNCE-2.6.7.1 --- mdadm-2.6.7.1/ANNOUNCE-2.6.7.1 2008-10-15 08:29:37.000000000 +0300 +++ mdadm-3.1.4/ANNOUNCE-2.6.7.1 1970-01-01 02:00:00.000000000 +0200 @@ -1,27 +0,0 @@ -Subject: ANNOUNCE: mdadm 2.6.7.1 - A tool for managing Soft RAID under Linux - -I am pleased to announce the availability of - mdadm version 2.6.7.1 - -It is available at the usual places: - countrycode=xx. - http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ -and via git at - git://neil.brown.name/mdadm - http://neil.brown.name/git?p=mdadm - -mdadm is a tool for creating, managing and monitoring -device arrays using the "md" driver in Linux, also -known as Software RAID arrays. - -Release 2.6.7.1 fixes two bugs in 2.6.7 and is a special release -for Debian Lenny (which is too frozen to take 2.6.8). - -Changelog Entries: - - Manage: allow adding device that is just large enough to v1.x array. - - Fix bug in forced assemble. - -Development of mdadm is sponsored by - SUSE Labs, Novell Inc. - -NeilBrown 15th October 2008 diff -Nru mdadm-2.6.7.1/ANNOUNCE-3.0 mdadm-3.1.4/ANNOUNCE-3.0 --- mdadm-2.6.7.1/ANNOUNCE-3.0 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-3.0 2010-08-05 09:51:58.000000000 +0300 @@ -0,0 +1,98 @@ +Subject: ANNOUNCE: mdadm 3.0 - A tool for managing Soft RAID under Linux + +I am pleased to (finally) announce the availability of + mdadm version 3.0 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This is a major new version and as such should be treated with some +caution. 
However it has seen substantial testing and is considered +to be ready for wide use. + + +The significant change which justifies the new major version number is +that mdadm can now handle metadata updates entirely in userspace. +This allows mdadm to support metadata formats that the kernel knows +nothing about. + +Currently two such metadata formats are supported: + - DDF - The SNIA standard format + - Intel Matrix - The metadata used by recent Intel ICH controllers. + +Also the approach to device names has changed significantly. + +If udev is installed on the system, mdadm will not create any devices +in /dev. Rather it allows udev to manage those devices. For this to work +as expected, the included udev rules file should be installed. + +If udev is not installed, mdadm will still create devices and symlinks +as required, and will also remove them when the array is stopped. + +mdadm now requires all devices which do not have a standard name (mdX +or md_dX) to live in the directory /dev/md/. Names in this directory +will always be created as symlinks back to the standard name in /dev. + +The man pages contain some information about the new externally managed +metadata. However see below for a more condensed overview. + +Externally managed metadata introduces the concept of a 'container'. +A container is a collection of (normally) physical devices which have +a common set of metadata. A container is assembled as an md array, but +is left 'inactive'. + +A container can contain one or more data arrays. These are composed from +slices (partitions?) of various devices in the container. + +For example, a 5-device DDF set can contain a RAID1 using the first +half of two devices, a RAID0 using the first half of the remaining 3 devices, +and a RAID5 over the second half of all 5 devices. + +A container can be created with + + mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde] + +or "-e imsm" to use the Intel Matrix Storage Manager.
+ +An array can be created within a container either by giving the +container name as the only member: + + mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0 + +or by listing the component devices + + mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde] + +To assemble a container, it is easiest just to pass each device in turn to +mdadm -I + + for i in /dev/sd[abcde] + do mdadm -I $i + done + +This will assemble the container and the components. + +Alternately the container can be assembled explicitly + + mdadm -A /dev/md0 /dev/sd[abcde] + +Then the components can all be assembled with + + mdadm -I /dev/md0 + +For each container, mdadm will start a program called "mdmon" which will +monitor the array and effect any metadata updates needed. The array is +initially assembled readonly. It is up to "mdmon" to mark the metadata +as 'dirty' and switch the array to 'read-write'. + +The version 0.90 and 1.x metadata formats supported by previous +versions of mdadm are still supported and the kernel still performs +the same updates it used to. The new 'mdmon' approach is only used for +newly introduced metadata types. + +NeilBrown 2nd June 2009 diff -Nru mdadm-2.6.7.1/ANNOUNCE-3.0.1 mdadm-3.1.4/ANNOUNCE-3.0.1 --- mdadm-2.6.7.1/ANNOUNCE-3.0.1 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-3.0.1 2010-08-05 09:51:58.000000000 +0300 @@ -0,0 +1,22 @@ +Subject: ANNOUNCE: mdadm 3.0.1 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This contains only minor bug fixes over 3.0. If you are using +3.0, you could consider upgrading. + +The brief change log is: + - Fix various segfaults + - Fixes for --examine with containers + - Lots of other little fixes.
+ +NeilBrown 25th September 2009 diff -Nru mdadm-2.6.7.1/ANNOUNCE-3.0.2 mdadm-3.1.4/ANNOUNCE-3.0.2 --- mdadm-2.6.7.1/ANNOUNCE-3.0.2 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-3.0.2 2010-08-05 09:51:58.000000000 +0300 @@ -0,0 +1,21 @@ +Subject: ANNOUNCE: mdadm 3.0.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This just contains one bugfix over 3.0.1 - I was obviously a bit hasty +in releasing that one. + +The brief change log is: + - Fix crash when homehost is not set, as often happens in + early boot. + +NeilBrown 25th September 2009 diff -Nru mdadm-2.6.7.1/ANNOUNCE-3.0.3 mdadm-3.1.4/ANNOUNCE-3.0.3 --- mdadm-2.6.7.1/ANNOUNCE-3.0.3 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-3.0.3 2010-08-05 09:51:58.000000000 +0300 @@ -0,0 +1,29 @@ +Subject: ANNOUNCE: mdadm 3.0.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.3 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This contains a collection of bug fixes and minor enhancements over +3.0.1. + +The brief change log is: + - Improvements for creating arrays giving just a name, like 'foo', + rather than the full '/dev/md/foo'. + - Improvements for assembling member arrays of containers. + - Improvements to test suite + - Add option to change increment for RebuildNN messages reported + by "mdadm --monitor" + - Improvements to mdmon 'hand-over' from initrd to final root. + - Handle merging of devices that have left an IMSM array and are + being re-incorporated.
+ - Add missing space in "--detail --brief" output. + +NeilBrown 22nd October 2009 diff -Nru mdadm-2.6.7.1/ANNOUNCE-3.1 mdadm-3.1.4/ANNOUNCE-3.1 --- mdadm-2.6.7.1/ANNOUNCE-3.1 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-3.1 2010-08-26 05:24:15.000000000 +0300 @@ -0,0 +1,33 @@ +Subject: ANNOUNCE: mdadm 3.1 - A tool for managing Soft RAID under Linux + +Hot on the heels of 3.0.3 I am pleased to announce the availability of + mdadm version 3.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +It contains significant feature enhancements over 3.0.x + +The brief change log is: + - Support --grow to change the layout of RAID4/5/6 + - Support --grow to change the chunksize of raid 4/5/6 + - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and + back. + - Support --grow to reduce the number of devices in RAID4/5/6. + - Support restart of these grow options when assembling an array + which is partially grown. + - Assorted tests of this code, and of different RAID6 layouts. + +Note that a 2.6.31 or later kernel is needed to have access to these. +Reducing devices in a RAID4/5/6 requires 2.6.32. +Changing RAID5 to RAID1 requires 2.6.33. + +You should only upgrade if you need to use, or wish to test, these +features. + +NeilBrown 22nd October 2009 diff -Nru mdadm-2.6.7.1/ANNOUNCE-3.1.1 mdadm-3.1.4/ANNOUNCE-3.1.1 --- mdadm-2.6.7.1/ANNOUNCE-3.1.1 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-3.1.1 2010-08-26 05:24:15.000000000 +0300 @@ -0,0 +1,39 @@ +Subject: ANNOUNCE: mdadm 3.1.1 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.1 + +It is available at the usual places: + countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix release over 3.1, which was withdrawn due to serious +bugs. So it might be best to ignore 3.1 and say that this is a significant +feature release over 3.0.x + +Significant changes are: + - RAID level conversions between RAID1, RAID5, and RAID6 are + possible where the kernel supports it (2.6.32 at least) + - online chunksize and layout changing for RAID5 and RAID6 + where the kernel supports it. + - reduce the number of devices in a RAID4/5/6 array. + + - The default metadata is now v1.1. This metadata is stored at the + start of the device so is safer in many ways but could interfere with + boot loaders. The old default (0.90) is still available and fully + supported. + + - The default chunksize is now 512K rather than 64K. This seems more + appropriate for modern devices. + + - The default bitmap chunksize for internal bitmaps is now at least + 64Meg as fine grained bitmaps tend to impact performance more for + little extra gain. + +This release is believed to be stable and you should feel free to +upgrade to 3.1.1. + +NeilBrown 19th November 2009 diff -Nru mdadm-2.6.7.1/ANNOUNCE-3.1.2 mdadm-3.1.4/ANNOUNCE-3.1.2 --- mdadm-2.6.7.1/ANNOUNCE-3.1.2 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-3.1.2 2010-08-26 05:24:15.000000000 +0300 @@ -0,0 +1,46 @@ +Subject: ANNOUNCE: mdadm 3.1.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.1. + +Significant changes are: + - The default metadata has changed again (sorry about that). + It is now v1.2 and will hopefully stay that way.
It turned + out there were boot-block issues with v1.1 which make it + unsuitable for a default, though in many cases it is still + suitable to use. + - Stopping a container is not permitted when members are still + active + - Add 'homehost' to the valid words for the "AUTO" config file + line. When followed by "-all", this causes mdadm to + auto-assemble any array belonging to this host, but not + auto-assemble anything else. + - Fix some bugs with "--grow --chunksize=" for changing chunksize. + - VAR_RUN can be easily changed at compile time just like ALT_RUN. + This gives distros more flexibility in how to manage the + pid and sock files that mdmon needs. + - Various mdmon fixes + - Always make bitmap 4K-aligned if at all possible. + - If mdadm.conf lists arrays which have inter-dependencies, + they previously had to be listed in the "right" order. Now + any order should work. + - Fix --force assembly of v1.x arrays which are in the process + of recovering. + - Add section on 'scrubbing' to 'md' man page. + - Various command-line-option parsing improvements. + - ... and lots of other bug fixes. + + +This release is believed to be stable and you should feel free to +upgrade to 3.1.2 + +NeilBrown 10th March 2010 diff -Nru mdadm-2.6.7.1/ANNOUNCE-3.1.3 mdadm-3.1.4/ANNOUNCE-3.1.3 --- mdadm-2.6.7.1/ANNOUNCE-3.1.3 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-3.1.3 2010-08-26 05:24:15.000000000 +0300 @@ -0,0 +1,46 @@ +Subject: ANNOUNCE: mdadm 3.1.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.3 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.2 + +Significant changes are: + - mapfile now lives in a fixed location which defaults to + /dev/.mdadm/map but can be changed at compile time.
This + location is chosen as most distros provide it during early + boot and preserve it through. As long as /dev exists and is + writable, /dev/.mdadm will be created. + Other files for communication with mdmon live here too. + This fixes a bug reported by Debian and Gentoo users where + udev would spin in early-boot. + - IMSM and DDF metadata will not be recognised on partitions + as they should only be used on whole-disks. + - Various overflows caused by 2G drives have been addressed. + - A subarray of an IMSM container can now be killed with + --kill-subarray. Also subarrays can be renamed with + --update-subarray + - -If (or --incremental --fail) can be used from udev to + fail and remove from all arrays a device which has been + unplugged from the system. i.e. hot-unplug-support. + - "mdadm /dev/mdX --re-add missing" will look for any device + that looks like it should be a member of /dev/mdX but isn't + and will automatically --re-add it + - Now compile with -Wextra to get extra warnings. + - Lots of minor bug fixes, documentation improvements, etc. + +This release is believed to be stable and you should feel free to +upgrade to 3.1.3 + +It is expected that the next release will be 3.2 with a number of new +features. 3.1.4 will only happen if important bugs show up before 3.2 +is stable. + +NeilBrown 6th August 2010 diff -Nru mdadm-2.6.7.1/ANNOUNCE-3.1.4 mdadm-3.1.4/ANNOUNCE-3.1.4 --- mdadm-2.6.7.1/ANNOUNCE-3.1.4 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/ANNOUNCE-3.1.4 2010-08-31 10:21:29.000000000 +0300 @@ -0,0 +1,37 @@ +Subject: ANNOUNCE: mdadm 3.1.4 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.4 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.3.
+3.1.3 had a couple of embarrassing regressions and a couple of other +issues surfaced which had easy fixes so I decided to make a 3.1.4 +release after all. + +Two fixes related to configs that aren't using udev: + - Don't remove md devices with 'standard' names on --stop + - Allow dev_open to work on read-only /dev +And fixed regressions: + - Allow --incremental to add spares to an array + - Accept --no-degraded as a deprecated option rather than + throwing an error + - Return correct success status when --incremental assembling + a container which does not yet have enough devices. + - Don't link mdadm with pthreads, only mdmon needs it. + - Fix compiler warning due to bad use of snprintf + - Fix spare migration + +This release is believed to be stable and you should feel free to +upgrade to 3.1.4 + +It is expected that the next release will be 3.2 with a number of new +features. + +NeilBrown 31st August 2010 diff -Nru mdadm-2.6.7.1/Assemble.c mdadm-3.1.4/Assemble.c --- mdadm-2.6.7.1/Assemble.c 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/Assemble.c 2010-08-31 10:18:39.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays.
* - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,17 +19,11 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" #include -#include static int name_matches(char *found, char *required, char *homehost) { @@ -51,11 +45,36 @@ return 0; } -int Assemble(struct supertype *st, char *mddev, int mdfd, +static int is_member_busy(char *metadata_version) +{ + /* check if the given member array is active */ + struct mdstat_ent *mdstat = mdstat_read(1, 0); + struct mdstat_ent *ent; + int busy = 0; + + for (ent = mdstat; ent; ent = ent->next) { + if (ent->metadata_version == NULL) + continue; + if (strncmp(ent->metadata_version, "external:", 9) != 0) + continue; + if (!is_subarray(&ent->metadata_version[9])) + continue; + /* Skip first char - it can be '/' or '-' */ + if (strcmp(&ent->metadata_version[10], metadata_version+1) == 0) { + busy = 1; + break; + } + } + free_mdstat(mdstat); + + return busy; +} + +int Assemble(struct supertype *st, char *mddev, mddev_ident_t ident, mddev_dev_t devlist, char *backup_file, int readonly, int runstop, - char *update, char *homehost, + char *update, char *homehost, int require_homehost, int verbose, int force) { /* @@ -112,10 +131,13 @@ * START_ARRAY * */ - int clean = 0; - int must_close = 0; + int mdfd; + int clean; + int auto_assem = (mddev == NULL && !ident->uuid_set && + ident->super_minor == UnSet && ident->name[0] == 0 + && (ident->container == NULL || ident->member == NULL)); int old_linux = 0; - int vers = 0; /* Keep gcc quite - it really is initialised */ + int vers = vers; /* Keep gcc quite - it really is initialised */ struct { char *devname; int uptodate; /* set once we decide that this device 
is as @@ -124,45 +146,32 @@ struct mdinfo i; } *devices; int *best = NULL; /* indexed by raid_disk */ - unsigned int bestcnt = 0; + int bestcnt = 0; int devcnt = 0; - unsigned int okcnt, sparecnt; + unsigned int okcnt, sparecnt, rebuilding_cnt; unsigned int req_cnt; - unsigned int i; + int i; int most_recent = 0; int chosen_drive; int change = 0; int inargv = 0; + int report_missmatch; int bitmap_done; - int start_partial_ok = (runstop >= 0) && (force || devlist==NULL || mdfd < 0); + int start_partial_ok = (runstop >= 0) && + (force || devlist==NULL || auto_assem); unsigned int num_devs; mddev_dev_t tmpdev; struct mdinfo info; + struct mdinfo *content = NULL; char *avail; int nextspare = 0; + char *name = NULL; + int trustworthy; + char chosen_name[1024]; if (get_linux_version() < 2004000) old_linux = 1; - if (mdfd >= 0) { - vers = md_get_version(mdfd); - if (vers <= 0) { - fprintf(stderr, Name ": %s appears not to be an md device.\n", mddev); - return 1; - } - if (vers < 9000) { - fprintf(stderr, Name ": Assemble requires driver version 0.90.0 or later.\n" - " Upgrade your kernel or try --build\n"); - return 1; - } - - if (ioctl(mdfd, GET_ARRAY_INFO, &info.array)>=0) { - fprintf(stderr, Name ": device %s already active - cannot assemble it\n", - mddev); - return 1; - } - ioctl(mdfd, STOP_ARRAY, NULL); /* just incase it was started but has no content */ - } /* * If any subdevs are listed, then any that don't * match ident are discarded. Remainder must all match and @@ -173,18 +182,26 @@ if (!devlist && ident->uuid_set == 0 && - ident->super_minor < 0 && + (ident->super_minor < 0 || ident->super_minor == UnSet) && + ident->name[0] == 0 && + (ident->container == NULL || ident->member == NULL) && ident->devices == NULL) { fprintf(stderr, Name ": No identity information available for %s - cannot assemble.\n", mddev ? 
mddev : "further assembly"); return 1; } + if (devlist == NULL) devlist = conf_get_devs(); - else if (mdfd >= 0) + else if (mddev) inargv = 1; + report_missmatch = ((inargv && verbose >= 0) || verbose > 0); try_again: + /* We come back here when doing auto-assembly and attempting some + * set of devices failed. Those are now marked as ->used==2 and + * we ignore them and try again + */ tmpdev = devlist; num_devs = 0; while (tmpdev) { @@ -204,7 +221,7 @@ /* first walk the list of devices to find a consistent set * that match the criterea, if that is possible. - * We flag the one we like with 'used'. + * We flag the ones we like with 'used'. */ for (tmpdev = devlist; tmpdev; @@ -218,14 +235,14 @@ if (ident->devices && !match_oneof(ident->devices, devname)) { - if ((inargv && verbose>=0) || verbose > 0) + if (report_missmatch) fprintf(stderr, Name ": %s is not one of %s\n", devname, ident->devices); continue; } dfd = dev_open(devname, O_RDONLY|O_EXCL); if (dfd < 0) { - if ((inargv && verbose >= 0) || verbose > 0) + if (report_missmatch) fprintf(stderr, Name ": cannot open device %s: %s\n", devname, strerror(errno)); tmpdev->used = 2; @@ -239,72 +256,134 @@ devname); tmpdev->used = 2; } else if (!tst && (tst = guess_super(dfd)) == NULL) { - if ((inargv && verbose >= 0) || verbose > 0) + if (report_missmatch) fprintf(stderr, Name ": no recogniseable superblock on %s\n", devname); tmpdev->used = 2; } else if (tst->ss->load_super(tst,dfd, NULL)) { - if ((inargv && verbose >= 0) || verbose > 0) + if (report_missmatch) fprintf( stderr, Name ": no RAID superblock on %s\n", devname); + } else if (auto_assem && st == NULL && + !conf_test_metadata(tst->ss->name, + tst->ss->match_home(tst, homehost) == 1)) { + if (report_missmatch) + fprintf(stderr, Name ": %s has metadata type %s for which " + "auto-assembly is disabled\n", + devname, tst->ss->name); + tst->ss->free_super(tst); + tmpdev->used = 2; } else { - tst->ss->getinfo_super(tst, &info); + content = &info; + 
memset(content, 0, sizeof(*content)); + tst->ss->getinfo_super(tst, content); } if (dfd >= 0) close(dfd); + if (tst && tst->sb && tst->ss->container_content + && tst->loaded_container) { + /* tmpdev is a container. We need to be either + * looking for a member, or auto-assembling + */ + if (st) { + /* already found some components, this cannot + * be another one. + */ + if (report_missmatch) + fprintf(stderr, Name ": %s is a container, but we are looking for components\n", + devname); + goto loop; + } + + if (ident->container) { + if (ident->container[0] == '/' && + !same_dev(ident->container, devname)) { + if (report_missmatch) + fprintf(stderr, Name ": %s is not the container required (%s)\n", + devname, ident->container); + goto loop; + } + if (ident->container[0] != '/') { + /* we have a uuid */ + int uuid[4]; + if (!parse_uuid(ident->container, uuid) || + !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) { + if (report_missmatch) + fprintf(stderr, Name ": %s has wrong UUID to be required container\n", + devname); + goto loop; + } + } + } + /* It is worth looking inside this container. 
+ */ + if (verbose > 0) + fprintf(stderr, Name ": looking in container %s\n", + devname); + next_member: + if (tmpdev->content) + content = tmpdev->content; + else + content = tst->ss->container_content(tst); + if (!content) + goto loop; /* empty container */ + + tmpdev->content = content->next; + if (tmpdev->content == NULL) + tmpdev->used = 2; + + } else if (ident->container || ident->member) { + /* No chance of this matching if we don't have + * a container */ + if (report_missmatch) + fprintf(stderr, Name "%s is not a container, and one is required.\n", + devname); + goto loop; + } + if (ident->uuid_set && (!update || strcmp(update, "uuid")!= 0) && (!tst || !tst->sb || - same_uuid(info.uuid, ident->uuid, tst->ss->swapuuid)==0)) { - if ((inargv && verbose >= 0) || verbose > 0) + same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0)) { + if (report_missmatch) fprintf(stderr, Name ": %s has wrong uuid.\n", devname); goto loop; } if (ident->name[0] && (!update || strcmp(update, "name")!= 0) && (!tst || !tst->sb || - name_matches(info.name, ident->name, homehost)==0)) { - if ((inargv && verbose >= 0) || verbose > 0) + name_matches(content->name, ident->name, homehost)==0)) { + if (report_missmatch) fprintf(stderr, Name ": %s has wrong name.\n", devname); goto loop; } if (ident->super_minor != UnSet && (!tst || !tst->sb || - ident->super_minor != info.array.md_minor)) { - if ((inargv && verbose >= 0) || verbose > 0) + ident->super_minor != content->array.md_minor)) { + if (report_missmatch) fprintf(stderr, Name ": %s has wrong super-minor.\n", devname); goto loop; } if (ident->level != UnSet && (!tst || !tst->sb || - ident->level != info.array.level)) { - if ((inargv && verbose >= 0) || verbose > 0) + ident->level != content->array.level)) { + if (report_missmatch) fprintf(stderr, Name ": %s has wrong raid level.\n", devname); goto loop; } if (ident->raid_disks != UnSet && (!tst || !tst->sb || - ident->raid_disks!= info.array.raid_disks)) { - if ((inargv && 
verbose >= 0) || verbose > 0) + ident->raid_disks!= content->array.raid_disks)) { + if (report_missmatch) fprintf(stderr, Name ": %s requires wrong number of drives.\n", devname); goto loop; } - if (mdfd < 0) { + if (auto_assem) { if (tst == NULL || tst->sb == NULL) continue; - if (update == NULL && - tst->ss->match_home(tst, homehost)==0) { - if ((inargv && verbose >= 0) || verbose > 0) - fprintf(stderr, Name ": %s is not built for host %s.\n", - devname, homehost); - /* Auto-assemble, and this is not a usable host */ - /* if update != NULL, we are updating the host - * name... */ - goto loop; - } } /* If we are this far, then we are nearly commited to this device. * If the super_block doesn't exist, or doesn't match others, @@ -321,6 +400,54 @@ return 1; } + if (tst && tst->sb && tst->ss->container_content + && tst->loaded_container) { + /* we have the one container we need, don't keep + * looking. If the chosen member is active, skip. + */ + if (is_member_busy(content->text_version)) { + if (report_missmatch) + fprintf(stderr, Name ": member %s in %s is already assembled\n", + content->text_version, + devname); + skip: + if (tmpdev->content) + goto next_member; + tst->ss->free_super(tst); + tst = NULL; + content = NULL; + if (auto_assem) + goto loop; + return 1; + } + if (ident->member && ident->member[0]) { + char *s = strchr(content->text_version+1, '/'); + if (s == NULL) { + fprintf(stderr, Name ": badly formatted version: %s\n", + content->text_version); + goto skip; + } + if (strcmp(ident->member, s+1) != 0) { + if (report_missmatch) + fprintf(stderr, + Name ": skipping wrong member %s\n", + content->text_version); + goto skip; + } + } + st = tst; tst = NULL; + if (!auto_assem && inargv && tmpdev->next != NULL) { + fprintf(stderr, Name ": %s is a container, but is not " + "only device given: confused and aborting\n", + devname); + st->ss->free_super(st); + return 1; + } + if (verbose > 0) + fprintf(stderr, Name ": found match on member %s in %s\n", + 
content->text_version, devname); + break; + } if (st == NULL) st = dup_super(tst); if (st->minor_version == -1) @@ -333,21 +460,22 @@ * Or, if we are auto assembling, we just ignore the second * for now. */ - if (mdfd < 0) + if (auto_assem) goto loop; if (homehost) { int first = st->ss->match_home(st, homehost); int last = tst->ss->match_home(tst, homehost); - if (first+last == 1) { + if (first != last && + (first == 1 || last == 1)) { /* We can do something */ if (first) {/* just ignore this one */ - if ((inargv && verbose >= 0) || verbose > 0) + if (report_missmatch) fprintf(stderr, Name ": %s misses out due to wrong homehost\n", devname); goto loop; } else { /* reject all those sofar */ mddev_dev_t td; - if ((inargv && verbose >= 0) || verbose > 0) + if (report_missmatch) fprintf(stderr, Name ": %s overrides previous devices due to good homehost\n", devname); for (td=devlist; td != tmpdev; td=td->next) @@ -368,59 +496,102 @@ tmpdev->used = 1; loop: + if (tmpdev->content) + goto next_member; if (tst) tst->ss->free_super(tst); } - if (mdfd < 0) { - /* So... it is up to me to open the device. - * We create a name '/dev/md/XXX' based on the info in the - * superblock, and call open_mddev on that + if (!st || !st->sb || !content) + return 2; + + /* Now need to open the array device. 
Use create_mddev */ + if (content == &info) + st->ss->getinfo_super(st, content); + + trustworthy = FOREIGN; + name = content->name; + switch (st->ss->match_home(st, homehost) + ?: st->ss->match_home(st, "any")) { + case 1: + trustworthy = LOCAL; + name = strchr(content->name, ':'); + if (name) + name++; + else + name = content->name; + break; + } + if (!auto_assem) + /* If the array is listed in mdadm.conf or on + * command line, then we trust the name + * even if the array doesn't look local */ - mdu_array_info_t inf; - char *c; - if (!st || !st->sb) { - return 2; - } - st->ss->getinfo_super(st, &info); - c = strchr(info.name, ':'); - if (c) c++; else c= info.name; - if (isdigit(*c) && ((ident->autof & 7)==4 || (ident->autof&7)==6)) { - /* /dev/md/d0 style for partitionable */ - int ret = asprintf(&mddev, "/dev/md/d%s", c); - assert(ret >= 0); - } - else { - int ret = asprintf(&mddev, "/dev/md/%s", c); - assert(ret >= 0); - } - mdfd = open_mddev(mddev, ident->autof); - if (mdfd < 0) { - st->ss->free_super(st); - free(devices); + trustworthy = LOCAL; + + if (name[0] == 0 && + content->array.level == LEVEL_CONTAINER) { + name = content->text_version; + trustworthy = METADATA; + } + + if (name[0] && trustworthy != LOCAL && + ! 
require_homehost && + conf_name_is_free(name)) + trustworthy = LOCAL; + + if (trustworthy == LOCAL && + strchr(name, ':')) + /* Ignore 'host:' prefix of name */ + name = strchr(name, ':')+1; + + mdfd = create_mddev(mddev, name, ident->autof, trustworthy, + chosen_name); + if (mdfd < 0) { + st->ss->free_super(st); + free(devices); + if (auto_assem) goto try_again; - } - vers = md_get_version(mdfd); - if (ioctl(mdfd, GET_ARRAY_INFO, &inf)==0) { - for (tmpdev = devlist ; - tmpdev && tmpdev->used != 1; - tmpdev = tmpdev->next) - ; - fprintf(stderr, Name ": %s already active, cannot restart it!\n", mddev); - if (tmpdev) - fprintf(stderr, Name ": %s needed for %s...\n", - mddev, tmpdev->devname); - close(mdfd); - mdfd = -1; - st->ss->free_super(st); - free(devices); + return 1; + } + mddev = chosen_name; + vers = md_get_version(mdfd); + if (vers < 9000) { + fprintf(stderr, Name ": Assemble requires driver version 0.90.0 or later.\n" + " Upgrade your kernel or try --build\n"); + close(mdfd); + return 1; + } + if (mddev_busy(fd2devnum(mdfd))) { + fprintf(stderr, Name ": %s already active, cannot restart it!\n", + mddev); + for (tmpdev = devlist ; + tmpdev && tmpdev->used != 1; + tmpdev = tmpdev->next) + ; + if (tmpdev && auto_assem) + fprintf(stderr, Name ": %s needed for %s...\n", + mddev, tmpdev->devname); + close(mdfd); + mdfd = -3; + st->ss->free_super(st); + free(devices); + if (auto_assem) goto try_again; - } - must_close = 1; + return 1; } + ioctl(mdfd, STOP_ARRAY, NULL); /* just incase it was started but has no content */ +#ifndef MDASSEMBLE + if (content != &info) { + /* This is a member of a container. Try starting the array. 
*/ + return assemble_container_content(st, mdfd, content, runstop, + chosen_name, verbose); + } +#endif /* Ok, no bad inconsistancy, we can try updating etc */ bitmap_done = 0; + content->update_private = NULL; for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) if (tmpdev->used == 1) { char *devname = tmpdev->devname; struct stat stb; @@ -450,20 +621,27 @@ remove_partitions(dfd); tst = dup_super(st); - tst->ss->load_super(tst, dfd, NULL); - tst->ss->getinfo_super(tst, &info); + if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) { + fprintf(stderr, Name ": cannot re-read metadata from %s - aborting\n", + devname); + if (dfd >= 0) + close(dfd); + close(mdfd); + return 1; + } + tst->ss->getinfo_super(tst, content); - memcpy(info.uuid, ident->uuid, 16); - strcpy(info.name, ident->name); - info.array.md_minor = minor(stb2.st_rdev); + memcpy(content->uuid, ident->uuid, 16); + strcpy(content->name, ident->name); + content->array.md_minor = minor(stb2.st_rdev); - tst->ss->update_super(tst, &info, update, + tst->ss->update_super(tst, content, update, devname, verbose, ident->uuid_set, homehost); if (strcmp(update, "uuid")==0 && !ident->uuid_set) { ident->uuid_set = 1; - memcpy(ident->uuid, info.uuid, 16); + memcpy(ident->uuid, content->uuid, 16); } if (dfd < 0) fprintf(stderr, Name ": Cannot open %s for superblock update\n", @@ -477,7 +655,7 @@ if (strcmp(update, "uuid")==0 && ident->bitmap_fd >= 0 && !bitmap_done) { if (bitmap_update_uuid(ident->bitmap_fd, - info.uuid, + content->uuid, tst->ss->swapuuid) != 0) fprintf(stderr, Name ": Could not update uuid on external bitmap.\n"); else @@ -493,8 +671,15 @@ remove_partitions(dfd); - tst->ss->load_super(tst, dfd, NULL); - tst->ss->getinfo_super(tst, &info); + if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) { + fprintf(stderr, Name ": cannot re-read metadata from %s - aborting\n", + devname); + if (dfd >= 0) + close(dfd); + close(mdfd); + return 1; + } + tst->ss->getinfo_super(tst, content); 
tst->ss->free_super(tst); close(dfd); } @@ -503,10 +688,10 @@ if (verbose > 0) fprintf(stderr, Name ": %s is identified as a member of %s, slot %d.\n", - devname, mddev, info.disk.raid_disk); + devname, mddev, content->disk.raid_disk); devices[devcnt].devname = devname; devices[devcnt].uptodate = 0; - devices[devcnt].i = info; + devices[devcnt].i = *content; devices[devcnt].i.disk.major = major(stb.st_rdev); devices[devcnt].i.disk.minor = minor(stb.st_rdev); if (most_recent < devcnt) { @@ -514,25 +699,25 @@ > devices[most_recent].i.events) most_recent = devcnt; } - if (info.array.level == -4) + if (content->array.level == LEVEL_MULTIPATH) /* with multipath, the raid_disk from the superblock is meaningless */ i = devcnt; else i = devices[devcnt].i.disk.raid_disk; if (i+1 == 0) { - if (nextspare < info.array.raid_disks) - nextspare = info.array.raid_disks; + if (nextspare < content->array.raid_disks) + nextspare = content->array.raid_disks; i = nextspare++; } else { - if (i >= info.array.raid_disks && + if (i >= content->array.raid_disks && i >= nextspare) nextspare = i+1; } if (i < 10000) { if (i >= bestcnt) { - unsigned int newbestcnt = i+10; + int newbestcnt = i+10; int *newbest = malloc(sizeof(int)*newbestcnt); - unsigned int c; + int c; for (c=0; c < newbestcnt; c++) if (c < bestcnt) newbest[c] = best[c]; @@ -547,8 +732,8 @@ == devices[devcnt].i.events && (devices[best[i]].i.disk.minor != devices[devcnt].i.disk.minor) - && st->ss->major == 0 - && info.array.level != -4) { + && st->ss == &super0 + && content->array.level != LEVEL_MULTIPATH) { /* two different devices with identical superblock. * Could be a mis-detection caused by overlapping * partitions. fail-safe. @@ -563,7 +748,7 @@ inargv ? 
"the list" : "the\n DEVICE list in mdadm.conf" ); - if (must_close) close(mdfd); + close(mdfd); return 1; } if (best[i] == -1 @@ -573,30 +758,33 @@ } devcnt++; } + free(content->update_private); + content->update_private = NULL; if (devcnt == 0) { fprintf(stderr, Name ": no devices found for %s\n", mddev); if (st) st->ss->free_super(st); - if (must_close) close(mdfd); + close(mdfd); return 1; } if (update && strcmp(update, "byteorder")==0) st->minor_version = 90; - st->ss->getinfo_super(st, &info); - clean = info.array.state & 1; + st->ss->getinfo_super(st, content); + clean = content->array.state & 1; /* now we have some devices that might be suitable. * I wonder how many */ - avail = malloc(info.array.raid_disks); - memset(avail, 0, info.array.raid_disks); + avail = malloc(content->array.raid_disks); + memset(avail, 0, content->array.raid_disks); okcnt = 0; sparecnt=0; - for (i=0; i< bestcnt ;i++) { + rebuilding_cnt=0; + for (i=0; i< bestcnt; i++) { int j = best[i]; int event_margin = 1; /* always allow a difference of '1' * like the kernel does @@ -605,8 +793,8 @@ /* note: we ignore error flags in multipath arrays * as they don't make sense */ - if (info.array.level != -4) - if (!(devices[j].i.disk.state & (1<array.level != LEVEL_MULTIPATH) + if (!(devices[j].i.disk.state & (1<= devices[most_recent].i.events) { devices[j].uptodate = 1; - if (i < info.array.raid_disks) { - okcnt++; - avail[i]=1; + if (i < content->array.raid_disks) { + if (devices[j].i.recovery_start == MaxSector) { + okcnt++; + avail[i]=1; + } else + rebuilding_cnt++; } else sparecnt++; } } - while (force && !enough(info.array.level, info.array.raid_disks, - info.array.layout, 1, + while (force && !enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail, okcnt)) { /* Choose the newest best drive which is * not up-to-date, update the superblock @@ -631,13 +822,13 @@ */ int fd; struct supertype *tst; - long long current_events; + unsigned long long current_events; 
chosen_drive = -1; - for (i=0; iarray.raid_disks && i < bestcnt; i++) { int j = best[i]; if (j>=0 && !devices[j].uptodate && - devices[j].i.events > 0 && + devices[j].i.recovery_start == MaxSector && (chosen_drive < 0 || devices[j].i.events > devices[chosen_drive].i.events)) @@ -668,8 +859,8 @@ devices[chosen_drive].i.events = 0; continue; } - info.events = devices[most_recent].i.events; - tst->ss->update_super(tst, &info, "force-one", + content->events = devices[most_recent].i.events; + tst->ss->update_super(tst, content, "force-one", devices[chosen_drive].devname, verbose, 0, NULL); @@ -691,11 +882,10 @@ /* If there are any other drives of the same vintage, * add them in as well. We can't lose and we might gain */ - for (i=0; iarray.raid_disks && i < bestcnt ; i++) { int j = best[i]; if (j >= 0 && !devices[j].uptodate && - devices[j].i.events > 0 && devices[j].i.events == current_events) { chosen_drive = j; goto add_another; @@ -723,29 +913,32 @@ if ((fd=dev_open(devices[j].devname, O_RDONLY|O_EXCL))< 0) { fprintf(stderr, Name ": Cannot open %s: %s\n", devices[j].devname, strerror(errno)); - if (must_close) close(mdfd); + close(mdfd); return 1; } if (st->ss->load_super(st,fd, NULL)) { close(fd); fprintf(stderr, Name ": RAID superblock has disappeared from %s\n", devices[j].devname); - if (must_close) close(mdfd); + close(mdfd); return 1; } close(fd); } if (st->sb == NULL) { fprintf(stderr, Name ": No suitable drives found for %s\n", mddev); - if (must_close) close(mdfd); + close(mdfd); return 1; } - st->ss->getinfo_super(st, &info); + st->ss->getinfo_super(st, content); +#ifndef MDASSEMBLE + sysfs_init(content, mdfd, 0); +#endif for (i=0; iarray.raid_disks) desired_state = (1<ss->update_super(st, &devices[j].i, "assemble", NULL, verbose, 0, NULL)) { @@ -780,10 +975,10 @@ #endif } if (force && !clean && - !enough(info.array.level, info.array.raid_disks, - info.array.layout, clean, + !enough(content->array.level, content->array.raid_disks, + content->array.layout, 
clean, avail, okcnt)) { - change += st->ss->update_super(st, &info, "force-array", + change += st->ss->update_super(st, content, "force-array", devices[chosen_drive].devname, verbose, 0, NULL); clean = 1; @@ -795,14 +990,14 @@ if (fd < 0) { fprintf(stderr, Name ": Could not open %s for write - cannot Assemble array.\n", devices[chosen_drive].devname); - if (must_close) close(mdfd); + close(mdfd); return 1; } if (st->ss->store_super(st, fd)) { close(fd); fprintf(stderr, Name ": Could not re-write superblock on %s\n", devices[chosen_drive].devname); - if (must_close) close(mdfd); + close(mdfd); return 1; } close(fd); @@ -813,9 +1008,13 @@ * The code of doing this lives in Grow.c */ #ifndef MDASSEMBLE - if (info.reshape_active) { + if (content->reshape_active) { int err = 0; int *fdlist = malloc(sizeof(int)* bestcnt); + if (verbose > 0) + fprintf(stderr, Name ":%s has an active reshape - checking " + "if critical section needs to be restored\n", + chosen_name); for (i=0; i= 0) { @@ -830,14 +1029,16 @@ fdlist[i] = -1; } if (!err) - err = Grow_restart(st, &info, fdlist, bestcnt, backup_file); + err = Grow_restart(st, content, fdlist, bestcnt, backup_file, verbose > 0); while (i>0) { i--; if (fdlist[i]>=0) close(fdlist[i]); } if (err) { fprintf(stderr, Name ": Failed to restore critical section for reshape, sorry.\n"); - if (must_close) close(mdfd); + if (backup_file == NULL) + fprintf(stderr," Possibly you needed to specify the --backup-file\n"); + close(mdfd); return err; } } @@ -845,30 +1046,31 @@ /* count number of in-sync devices according to the superblock. 
* We must have this number to start the array without -s or -R */ - req_cnt = info.array.working_disks; + req_cnt = content->array.working_disks; /* Almost ready to actually *do* something */ if (!old_linux) { int rv; - if ((vers % 100) >= 1) { /* can use different versions */ - mdu_array_info_t inf; - memset(&inf, 0, sizeof(inf)); - inf.major_version = st->ss->major; - inf.minor_version = st->minor_version; - rv = ioctl(mdfd, SET_ARRAY_INFO, &inf); - } else - rv = ioctl(mdfd, SET_ARRAY_INFO, NULL); + /* First, fill in the map, so that udev can find our name + * as soon as we become active. + */ + map_update(NULL, fd2devnum(mdfd), content->text_version, + content->uuid, chosen_name); + + rv = set_array_info(mdfd, st, content); if (rv) { - fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\n", + fprintf(stderr, Name ": failed to set array info for %s: %s\n", mddev, strerror(errno)); - if (must_close) close(mdfd); + ioctl(mdfd, STOP_ARRAY, NULL); + close(mdfd); return 1; } if (ident->bitmap_fd >= 0) { if (ioctl(mdfd, SET_BITMAP_FILE, ident->bitmap_fd) != 0) { fprintf(stderr, Name ": SET_BITMAP_FILE failed.\n"); - if (must_close) close(mdfd); + ioctl(mdfd, STOP_ARRAY, NULL); + close(mdfd); return 1; } } else if (ident->bitmap_file) { @@ -877,13 +1079,15 @@ if (bmfd < 0) { fprintf(stderr, Name ": Could not open bitmap file %s\n", ident->bitmap_file); - if (must_close) close(mdfd); + ioctl(mdfd, STOP_ARRAY, NULL); + close(mdfd); return 1; } if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { fprintf(stderr, Name ": Failed to set bitmapfile for %s\n", mddev); close(bmfd); - if (must_close) close(mdfd); + ioctl(mdfd, STOP_ARRAY, NULL); + close(mdfd); return 1; } close(bmfd); @@ -900,14 +1104,15 @@ j = chosen_drive; if (j >= 0 /* && devices[j].uptodate */) { - if (ioctl(mdfd, ADD_NEW_DISK, - &devices[j].i.disk)!=0) { + rv = add_disk(mdfd, st, content, &devices[j].i); + + if (rv) { fprintf(stderr, Name ": failed to add " "%s to %s: %s\n", devices[j].devname, mddev, 
strerror(errno)); - if (i < info.array.raid_disks + if (i < content->array.raid_disks || i == bestcnt) okcnt--; else @@ -917,35 +1122,82 @@ "to %s as %d\n", devices[j].devname, mddev, devices[j].i.disk.raid_disk); - } else if (verbose > 0 && i < info.array.raid_disks) + } else if (verbose > 0 && i < content->array.raid_disks) fprintf(stderr, Name ": no uptodate device for " "slot %d of %s\n", i, mddev); } + if (content->array.level == LEVEL_CONTAINER) { + if (verbose >= 0) { + fprintf(stderr, Name ": Container %s has been " + "assembled with %d drive%s", + mddev, okcnt+sparecnt, okcnt+sparecnt==1?"":"s"); + if (okcnt < (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", + content->array.raid_disks); + fprintf(stderr, "\n"); + } + sysfs_uevent(content, "change"); + wait_for(chosen_name, mdfd); + close(mdfd); + return 0; + } + if (runstop == 1 || (runstop <= 0 && - ( enough(info.array.level, info.array.raid_disks, - info.array.layout, clean, avail, okcnt) && - (okcnt >= req_cnt || start_partial_ok) + ( enough(content->array.level, content->array.raid_disks, + content->array.layout, clean, avail, okcnt) && + (okcnt + rebuilding_cnt >= req_cnt || start_partial_ok) ))) { - if (ioctl(mdfd, RUN_ARRAY, NULL)==0) { + /* This array is good-to-go. + * If a reshape is in progress then we might need to + * continue monitoring it. In that case we start + * it read-only and let the grow code make it writable. 
+ */ + int rv; +#ifndef MDASSEMBLE + if (content->reshape_active && + content->delta_disks <= 0) + rv = Grow_continue(mdfd, st, content, backup_file); + else +#endif + rv = ioctl(mdfd, RUN_ARRAY, NULL); + if (rv == 0) { if (verbose >= 0) { fprintf(stderr, Name ": %s has been started with %d drive%s", mddev, okcnt, okcnt==1?"":"s"); - if (okcnt < info.array.raid_disks) - fprintf(stderr, " (out of %d)", info.array.raid_disks); + if (okcnt < (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", content->array.raid_disks); + if (rebuilding_cnt) + fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt); if (sparecnt) fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s"); fprintf(stderr, ".\n"); } - if (must_close) { + if (content->reshape_active && + content->array.level >= 4 && + content->array.level <= 6) { + /* might need to increase the size + * of the stripe cache - default is 256 + */ + if (256 < 4 * (content->array.chunk_size/4096)) { + struct mdinfo *sra = sysfs_read(mdfd, 0, 0); + if (sra) + sysfs_set_num(sra, NULL, + "stripe_cache_size", + (4 * content->array.chunk_size / 4096) + 1); + } + } + wait_for(mddev, mdfd); + close(mdfd); + if (auto_assem) { int usecs = 1; - close(mdfd); /* There is a nasty race with 'mdadm --monitor'. * If it opens this device before we close it, * it gets an incomplete open on which IO - * doesn't work and the capacity if wrong. + * doesn't work and the capacity is + * wrong. * If we reopen (to check for layered devices) * before --monitor closes, we loose. 
* @@ -970,59 +1222,59 @@ fprintf(stderr, Name ": failed to RUN_ARRAY %s: %s\n", mddev, strerror(errno)); - if (!enough(info.array.level, info.array.raid_disks, - info.array.layout, 1, avail, okcnt)) + if (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail, okcnt)) fprintf(stderr, Name ": Not enough devices to " "start the array.\n"); - else if (!enough(info.array.level, - info.array.raid_disks, - info.array.layout, clean, + else if (!enough(content->array.level, + content->array.raid_disks, + content->array.layout, clean, avail, okcnt)) fprintf(stderr, Name ": Not enough devices to " "start the array while not clean " "- consider --force.\n"); - if (must_close) { + if (auto_assem) ioctl(mdfd, STOP_ARRAY, NULL); - close(mdfd); - } + close(mdfd); return 1; } if (runstop == -1) { fprintf(stderr, Name ": %s assembled from %d drive%s", mddev, okcnt, okcnt==1?"":"s"); - if (okcnt != info.array.raid_disks) - fprintf(stderr, " (out of %d)", info.array.raid_disks); + if (okcnt != (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", content->array.raid_disks); fprintf(stderr, ", but not started.\n"); - if (must_close) close(mdfd); + close(mdfd); return 0; } if (verbose >= -1) { fprintf(stderr, Name ": %s assembled from %d drive%s", mddev, okcnt, okcnt==1?"":"s"); + if (rebuilding_cnt) + fprintf(stderr, "%s %d rebuilding", sparecnt?", ":" and ", rebuilding_cnt); if (sparecnt) fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s"); - if (!enough(info.array.level, info.array.raid_disks, - info.array.layout, 1, avail, okcnt)) + if (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail, okcnt)) fprintf(stderr, " - not enough to start the array.\n"); - else if (!enough(info.array.level, - info.array.raid_disks, - info.array.layout, clean, + else if (!enough(content->array.level, + content->array.raid_disks, + content->array.layout, clean, avail, okcnt)) fprintf(stderr, " - not 
enough to start the " "array while not clean - consider " "--force.\n"); else { - if (req_cnt == info.array.raid_disks) + if (req_cnt == (unsigned)content->array.raid_disks) fprintf(stderr, " - need all %d to start it", req_cnt); else - fprintf(stderr, " - need %d of %d to start", req_cnt, info.array.raid_disks); + fprintf(stderr, " - need %d of %d to start", req_cnt, content->array.raid_disks); fprintf(stderr, " (use --run to insist).\n"); } } - if (must_close) { + if (auto_assem) ioctl(mdfd, STOP_ARRAY, NULL); - close(mdfd); - } + close(mdfd); return 1; } else { /* The "chosen_drive" is a good choice, and if necessary, the superblock has @@ -1038,6 +1290,95 @@ } } - if (must_close) close(mdfd); + close(mdfd); return 0; } + +#ifndef MDASSEMBLE +int assemble_container_content(struct supertype *st, int mdfd, + struct mdinfo *content, int runstop, + char *chosen_name, int verbose) +{ + struct mdinfo *dev, *sra; + int working = 0, preexist = 0; + struct map_ent *map = NULL; + + sysfs_init(content, mdfd, 0); + + sra = sysfs_read(mdfd, 0, GET_VERSION); + if (sra == NULL || strcmp(sra->text_version, content->text_version) != 0) + if (sysfs_set_array(content, md_get_version(mdfd)) != 0) { + close(mdfd); + return 1; + } + if (sra) + sysfs_free(sra); + + for (dev = content->devs; dev; dev = dev->next) + if (sysfs_add_disk(content, dev, 1) == 0) + working++; + else if (errno == EEXIST) + preexist++; + if (working == 0) { + close(mdfd); + return 1;/* Nothing new, don't try to start */ + } + + map_update(&map, fd2devnum(mdfd), + content->text_version, + content->uuid, chosen_name); + + if (runstop > 0 || + (working + preexist) >= content->array.working_disks) { + int err; + + switch(content->array.level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + err = sysfs_set_str(content, NULL, "array_state", + "active"); + break; + default: + err = sysfs_set_str(content, NULL, "array_state", + "readonly"); + /* start mdmon if needed. 
*/ + if (!err) { + if (!mdmon_running(st->container_dev)) + start_mdmon(st->container_dev); + ping_monitor(devnum2devname(st->container_dev)); + } + break; + } + if (!err) + sysfs_set_safemode(content, content->safe_mode_delay); + if (verbose >= 0) { + if (err) + fprintf(stderr, Name + ": array %s now has %d devices", + chosen_name, working + preexist); + else + fprintf(stderr, Name + ": Started %s with %d devices", + chosen_name, working + preexist); + if (preexist) + fprintf(stderr, " (%d new)", working); + fprintf(stderr, "\n"); + } + if (!err) + wait_for(chosen_name, mdfd); + close(mdfd); + return 0; + /* FIXME should have an O_EXCL and wait for read-auto */ + } else { + if (verbose >= 0) + fprintf(stderr, Name + ": %s assembled with %d devices but " + "not started\n", + chosen_name, working); + close(mdfd); + return 1; + } +} +#endif + diff -Nru mdadm-2.6.7.1/bitmap.c mdadm-3.1.4/bitmap.c --- mdadm-2.6.7.1/bitmap.c 2008-10-15 08:29:37.000000000 +0300 +++ mdadm-3.1.4/bitmap.c 2010-08-26 05:24:15.000000000 +0300 @@ -20,8 +20,6 @@ #include "mdadm.h" -#define min(a,b) (((a) < (b)) ? 
(a) : (b)) - inline void sb_le_to_cpu(bitmap_super_t *sb) { sb->magic = __le32_to_cpu(sb->magic); @@ -131,11 +129,13 @@ */ unsigned long long total_bits = 0, read_bits = 0, dirty_bits = 0; bitmap_info_t *info; - char *buf, *unaligned; - int n, skip; + void *buf; + unsigned int n, skip; - unaligned = malloc(8192*2); - buf = (char*) ((unsigned long)unaligned | 8191)+1; + if (posix_memalign(&buf, 512, 8192) != 0) { + fprintf(stderr, Name ": failed to allocate 8192 bytes\n"); + return NULL; + } n = read(fd, buf, 8192); info = malloc(sizeof(*info)); @@ -154,7 +154,6 @@ fprintf(stderr, Name ": failed to read superblock of bitmap " "file: %s\n", strerror(errno)); free(info); - free(unaligned); return NULL; } memcpy(&info->sb, buf, sizeof(info->sb)); @@ -162,7 +161,7 @@ sb_le_to_cpu(&info->sb); /* convert superblock to CPU byte ordering */ - if (brief || info->sb.sync_size == 0) + if (brief || info->sb.sync_size == 0 || info->sb.chunksize == 0) goto out; /* read the rest of the file counting total bits and dirty bits -- @@ -228,9 +227,13 @@ if (!st) { /* just look at device... 
*/ lseek(fd, 0, 0); - } else { + } else if (!st->ss->locate_bitmap) { + fprintf(stderr, Name ": No bitmap possible with %s metadata\n", + st->ss->name); + return NULL; + } else st->ss->locate_bitmap(st, fd); - } + ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */ *stp = st; } else { @@ -270,6 +273,7 @@ int rv = 1; char buf[64]; int swap; + __u32 uuid32[4]; info = bitmap_file_read(filename, brief, &st); if (!info) @@ -297,19 +301,20 @@ #else swap = 1; #endif - if (swap) { - printf(" UUID : %08x:%08x:%08x:%08x\n", - swapl(*(__u32 *)(sb->uuid+0)), - swapl(*(__u32 *)(sb->uuid+4)), - swapl(*(__u32 *)(sb->uuid+8)), - swapl(*(__u32 *)(sb->uuid+12))); - } else { - printf(" UUID : %08x:%08x:%08x:%08x\n", - *(__u32 *)(sb->uuid+0), - *(__u32 *)(sb->uuid+4), - *(__u32 *)(sb->uuid+8), - *(__u32 *)(sb->uuid+12)); - } + memcpy(uuid32, sb->uuid, 16); + if (swap) + printf(" UUID : %08x:%08x:%08x:%08x\n", + swapl(uuid32[0]), + swapl(uuid32[1]), + swapl(uuid32[2]), + swapl(uuid32[3])); + else + printf(" UUID : %08x:%08x:%08x:%08x\n", + uuid32[0], + uuid32[1], + uuid32[2], + uuid32[3]); + printf(" Events : %llu\n", (unsigned long long)sb->events); printf(" Events Cleared : %llu\n", (unsigned long long)sb->events_cleared); printf(" State : %s\n", bitmap_state(sb->state)); @@ -368,7 +373,7 @@ */ chunksize = DEFAULT_BITMAP_CHUNK; /* <<20 for 2^20 chunks, >>9 to convert bytes to sectors */ - while (array_size > (chunksize << (20-9))) + while (array_size > ((unsigned long long)chunksize << (20-9))) chunksize <<= 1; } diff -Nru mdadm-2.6.7.1/Build.c mdadm-3.1.4/Build.c --- mdadm-2.6.7.1/Build.c 2008-10-15 08:04:09.000000000 +0300 +++ mdadm-3.1.4/Build.c 2010-08-05 09:51:58.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,12 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" @@ -33,10 +28,10 @@ #define START_MD _IO (MD_MAJOR, 2) #define STOP_MD _IO (MD_MAJOR, 3) -int Build(char *mddev, int mdfd, int chunk, int level, int layout, - int raiddisks, - mddev_dev_t devlist, int assume_clean, - char *bitmap_file, int bitmap_chunk, int write_behind, int delay, int verbose) +int Build(char *mddev, int chunk, int level, int layout, + int raiddisks, mddev_dev_t devlist, int assume_clean, + char *bitmap_file, int bitmap_chunk, int write_behind, + int delay, int verbose, int autof, unsigned long long size) { /* Build a linear or raid0 arrays without superblocks * We cannot really do any checks, we just do it. @@ -57,8 +52,11 @@ int subdevs = 0, missing_disks = 0; mddev_dev_t dv; int bitmap_fd; - unsigned long long size = ~0ULL; unsigned long long bitmapsize; + int mdfd; + char chosen_name[1024]; + int uuid[4] = {0,0,0,0}; + struct map_ent *map = NULL; /* scan all devices, make sure they really are block devices */ for (dv = devlist; dv; dv=dv->next) { @@ -112,6 +110,18 @@ break; } + /* We need to create the device. It can have no name. 
*/ + map_lock(&map); + mdfd = create_mddev(mddev, NULL, autof, LOCAL, + chosen_name); + if (mdfd < 0) { + map_unlock(&map); + return 1; + } + mddev = chosen_name; + + map_update(&map, fd2devnum(mdfd), "none", uuid, chosen_name); + map_unlock(&map); vers = md_get_version(mdfd); @@ -119,7 +129,7 @@ if (vers >= 9000) { mdu_array_info_t array; array.level = level; - array.size = 0; + array.size = size; array.nr_disks = raiddisks; array.raid_disks = raiddisks; array.md_minor = 0; @@ -140,17 +150,17 @@ if (ioctl(mdfd, SET_ARRAY_INFO, &array)) { fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\n", mddev, strerror(errno)); - return 1; + goto abort; } } else if (bitmap_file) { fprintf(stderr, Name ": bitmaps not supported with this kernel\n"); - return 1; + goto abort; } if (bitmap_file && level <= 0) { fprintf(stderr, Name ": bitmaps not meaningful with level %s\n", map_num(pers, level)?:"given"); - return 1; + goto abort; } /* now add the devices */ for ((i=0), (dv = devlist) ; dv ; i++, dv=dv->next) { @@ -178,12 +188,12 @@ (size == 0 || dsize < size)) size = dsize; close(fd); - if (vers>= 9000) { + if (vers >= 9000) { mdu_disk_info_t disk; disk.number = i; disk.raid_disk = i; disk.state = (1<writemostly) + if (dv->writemostly == 1) disk.state |= 1<>9; /* FIXME wrong for RAID10 */ if (CreateBitmap(bitmap_file, 1, NULL, bitmap_chunk, delay, write_behind, bitmapsize, major)) { - return 1; + goto abort; } bitmap_fd = open(bitmap_file, O_RDWR); if (bitmap_fd < 0) { fprintf(stderr, Name ": %s cannot be openned.", bitmap_file); - return 1; + goto abort; } } if (bitmap_fd >= 0) { if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) { fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n", mddev, strerror(errno)); - return 1; + goto abort; } } } @@ -265,6 +275,8 @@ if (verbose >= 0) fprintf(stderr, Name ": array %s built and started.\n", mddev); + wait_for(mddev, mdfd); + close(mdfd); return 0; abort: @@ -272,5 +284,6 @@ ioctl(mdfd, STOP_ARRAY, 0); else ioctl(mdfd, 
STOP_MD, 0); + close(mdfd); return 1; } diff -Nru mdadm-2.6.7.1/ChangeLog mdadm-3.1.4/ChangeLog --- mdadm-2.6.7.1/ChangeLog 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/ChangeLog 2010-08-31 10:21:13.000000000 +0300 @@ -1,766 +1,115 @@ -Changes Prior to 2.6.7 release - - Avoid NULL reference calling free_super and elsewhere. - - Remove stray semicolon (Causes compile error with gcc-2.95) - - Fix autoassemble for stack arrays. +Please see git logs for detailed change log. +This file just contains highlight. -Changes Prior to 2.6.6 release - - "make everything" now make mdassemble.auto - - fix compile problem with mdassemble.auto - - Update FAQ URLs in man page again. - -Changes Prior to 2.6.5 release - - Avoid segfault when parsing /proc/mdstat with auto-read-only - arrays. - - Fix problem with failing to add devices to v.large (>4TB) arrays, - cause by problems with device-size overflow. - - For v0.90 superblocks, print the 'Events' count as a real count, - not 2 numbers separated by a dot. - - Updates some URLs in the man page. - - Allow creation of a RAID6 with exactly one missing device. - - Use LOG_PID for syslog, so you get the pid of mdadm in the log - files. - - --export now works with --examine too (not just --detail) - - Improve auto-creation of device special file when using - --incremental - - Simple locking for --incremental so mdadm doesn't get confused - when run concurrently with itself. - - Make --incremental cope better with arrays that are being reshaped. - -Changes Prior to 2.6.4 release - - Make "--create --auto=mdp" work for non-standard device names. - - Fix restarting of a 'reshape' if it was stopped in the middle. - - Fix a segfault when using v1 superblock. - - Make --write-mostly effective when re-adding a device to an array. - - Various minor fixes - -Changes Prior to 2.6.3 release - - allow --write-behind to be set for --grow. 
- - When adding new disk to an array, don't reserve so much bitmap - space that the disk cannot store the required data. (Needed when - 1.x array was created with older mdadm). - - When adding a drive that was a little too small, we did not get - the correct error message. - - Make sure that if --assemble find an array in the critical region - of a reshape, and cannot find the critical data to restart the - reshape, it gives an error message. - - Fix segfault with '--detail --export' and non-persistent - superblocks. - - Various manpage updates. - - Improved 'raid4' support (--assemble, --monitor) - - Option parsing fixes w.r.t -a - - Interpret "--assemble --metadata=1" to allow any version 1.x - metadata, and be more specific in the "metadata=" message printed - with --examine --brief - - Fix spare migration in --monitor. - -Changes Prior to 2.6.2 release - - --fail detached and --remove faulty can be used to fail and - remove devices that are no longer physically present. - - --export option for --detail or present information in a format - that can be processed by udev. - - fix internal bitmap allocation problems with v1.1, v1.2 metadata. - - --help now goes to stdout so you can direct it to a pager. - - Various manpage updates. - - Make "--grow --add" for linear arrays really work. - - --auto-detect to trigger in-kernel autodetect. - - Make return code for "--detail --test" more reliable. Missing - devices as well as failed devices cause an error. - -Changes Prior to 2.6.1 release - - --monitor was producing some meaningless warnings due to a bug. - - Fix some compiler warnings. - - Fully support --grow for raid6. If a reshape crashed during the - critical period, mdadm wouldn't restore the Q information - properly. - - Update documentation for --grow. - -Changes Prior to 2.6 release - - Fixed UUID printing in "--detail --brief" for version1 metadata. - - --update=resync did exactly the wrong thing for version1 metadata. 
- It caused a resync to not happen, rather than to happen. - - Allow --assemble --force to mark a raid6 clean when it has two - missing devices (which is needed else if won't assemble. - Without this fix it would only assemble if one or zero - missing devices. - - Support --update=devicesize for cases where the underlying device - can change size. - - Default to --auto=yes so the array devices with 'standard' names - get created automatically, as this is almost always what is wanted. - - Give useful message if raid4/5/6 cannot be started because it is - not clean and is also degraded. - - Increase raid456 stripe cache size if needed to --grow the array. - The setting used unfortunately requires intimate knowledge of the - kernel, and it not reset when the reshape finishes. - - Change 'Device Size' to 'Used Dev Size' because it only shows how - much of each device is actually used, not how big they are. - - --wait or -W will wait for resync activity to finish on the given - devices. - - Fix some problems with --update=uuid and add a test. - - If two drives in a raid5 disappear at the same time, then "-Af" - will add them both in rather than just one and forcing the array - to 'clean'. This is slightly safer in some cases. - - Check device is large enough before hot-add: this improves quality - of error message. - - Don't hold md device open for so long in --monitor mode - map_dev - can be slow and interferes with trying to stop the array. - - Support --uuid= with --create to choose your own UUID. - - New major more "--incremental" for incremental assemble of arrays, - intended for use with udev. - -Changes Prior to 2.5.6 release - - Fix bug which meant "bitmap=xxx" in mdadm.conf was not handled - properly. - - Documentation updates. - - Fix bug that caused infinite loop when doing auto-assembly, - in certain cases where arrays couldn't be assembled. - -Changes Prior to 2.5.5 release - - Don't #include linux/blkpg.h as that isn't safe. 
Just - include the content literally. - - Reduce maximum bitmap usage when working with bitmap files, - so that a only single-page allocations are made, even on - 64bit hosts with 4K pages. - - Close stray fd in mdassemble so that it can assemble stacked - devices - - If mdassemble finds an array already assembled, it marks it - read-write. - - Remove error in md_open if array is already active. This isn't - needed and gets in the ways if an array was created e.g. in - initramfs, but device doesn't yet exist in /dev. - - When --assemble --scan is run, if all arrays that could be found - have already been started, don't report an error. - - Fix a couple of bugs related to raid10 and the new 'offset' layout. - - Improve error message when a wrong '--update' option is given. - -Changes Prior to 2.5.4 release - - When creating devices in /dev/md/ create matching symlinks - from /dev. e.g. /dev/md0 -> /dev/md/0. - Allow this to be disabled in mdadm.conf or on command line. - - Fix some endian-ness issues with version-1 superblocks (affects - bigendian only). - - Fix endian problem with 'bitmap' metadata - - Allow a number (of partitions) after the 'yes' option to --auto= - This is particularly useful in the 'create' line in mdadm.conf. - - Remove partitions from any whole device that is made part of - an md array. This is a work-around for annoying messages - when the first block on some drive accidentally looks like a - partition table. - -Changes Prior to 2.5.3 release - - Document v0.91 superblocks in md.4 - - Make GPL explicit in man pages. - - Fix recent breakage of starting degraded arrays. - - Tidyup automatic name choice for v-1 arrays: - /dev/md_d0 now becomes '0', not '_d0'. - -Changes Prior to 2.5.2 release - - Fix problem with compiling with gcc-2 compilers - - Fix compile problem of post-incrmenting a variable in a macro arg. - - Stop map_dev from returning [0:0], as that breaks things. 
- - Add 'Array Slot' line to --examine for version-1 superblocks - to make it a bit easier to see what is happening. - - Work around bug in --add handling for version-1 superblocks - in 2.6.17 (and prior). - - Make -assemble a bit more resilient to finding strange - information in superblocks. - - Don't claim newly added spares are InSync!! (don't know why that - code was ever in there) - - Work better when no 'ftw' is available, and check to see - if current uclibc provides ftw. - - Never use /etc/mdadm.conf if --config file is given (previously - some code used one, some used the other). - -Changes Prior to 2.5.1 release - - Various fixes for gcc warnings - - uclibc warnings - - Makefile improvements for static linking/intalling - - Makefile uninstall target - - Really fix return status of --examine - - Typos - - Byteorder stuff (again) - - Don't try to create devices with --manage or --grow - - allow default metadata (superblock) type to be specified - in mdadm.conf - - Get --stop to list devices stopped but honour --quiet - - remove libssl dependency - - Avoid some misdetection of overlapping partitions - - Fix memory leak in --monitor mode - -Changes Prior to 2.5 release - - Support 'mailfrom' line in mdadm.conf so the From: line in alert - emails can be explicitly set. - - Arrange that SparesMissing (which is similar in import to - DegradedArray) generates an Email. - - Assume "DEVICE partitions" if no DEVICE line is given. - - Support new 'offset' layout for raid10. - - When creating a bitmap file, choose a chunksize to limit number - of bitmap chunks to 2 million. More than this can cause kmalloc - failure. - - New 'CREATE' line in mdadm.conf for defaults such as owner, group, - mode and auto-flag - - --detail checks if array has been started or not and includes that - in report. - - When using --update=uuid on an array with a bitmap, update the - bitmap's uuid too. - - Add a copy of /proc/mdstat to the mail message sent by mdadm - --monitor. 
- - New flag --no-degraded to avoid starting arrays if there are - fewer devices available than last time the array was started. - This is only needed with --scan, as with --scan, that behaviour - is the default. - - Support for 'homehost' concept. This is a fairly major update. - It includes a configfile option and a command line option for - specifying a homehost, records that host in the superblock, - and reports the homehost where possible. - - Support for Auto Assembly. "mdadm -As" will, if provided with - the name of a homehost, try to assemble all arrays it can find - that were created for that homehost. See man pages for more details. - -Changes Prior to 2.4.1 release - - Honour --write-mostly when adding to an array without persistent - superblocks. - - Fix alignment problem in version-1 superblocks. - NOTE: This is an incompatable change affecting raid5 reshape. - If you want to reshape a raid5 using version-1 superblocks, - use 2.6.17-rc2 or later, and mdadm-2.4.1 or later. - -Changes Prior to 2.4 release - - Rewrite 'reshape' support including performing a backup - of the critical region for a raid5 growth, and restoring that - backup after a crash. - - Put a 'canary' at each end of the backup so a corruption - can be more easily detected. - - Remove useless 'ident' arguement from ->getinfo_super method. - - Support --backup-file for backing-up critical section during - growth. - - Erase old superblocks (of different versions) when creating new - array. - - Allow --monitor to work with arrays with >28 devices - - Report reshape information in --detail - - Handle symlinks in /dev better - - Fix mess in --detail output which a device is missing. - - Manpage tidyup - - Support 'bitmap=' in mdadm.conf for auto-assembling arrays with - write-intent bitmaps in separate files. - - Updates to md.4 man page including section on RESTRIPING and SYSFS - -Changes Prior to 2.3.1 release - - Fixed -O2 compile so I could make and RPM. 
- - Type cast number to be printed %llu so it compiles on 64bit - machines. (Thanks Luca). - - Stop using asm/byteorder.h - to make Redhat happy :-( - - Require bitmap files to have a '/' in their name. - - Error-check a few syscalls - code from SuSE package. - -Changes Prior to 2.3 release - - Try /etc/mdadm/mdadm.conf if /etc/mdadm.conf doesn't exist. - This provided compatability for Debian. - - Fixed for version-1 superblock: - report chunksize for raid6 and raid10 - make sure device size used is a multiple of chunksize - - Fix "--assemble --scan" crash. - - Fix completely failure to create array on ppc64 - - Fix memcmp in place of memcpy - - A few minor improvements to online help - - Clean up usage of 'long long' for used-size of devices, so - that it is possible to create a raid1 of 7TB devices! - - Make internal bitmaps work on 7TB raid1 arrays. - - Provide error message if --examine doesn't find any superblock. - - Report 'reshape' status in --examine - this depends on kernel - patches that are not yet finalised. - - Report bitmap status in --detail and --examine - - Default to v1 superblocks instead of v0.90 if the array - is too big for 0.90 to handle. - - Sort the output of "mdadm --detail --scan" so that it is - in a suitable order for assembling arrays. i.e. components come - before an array that they are part of. - - Print size of large reiserfs array properly went warning of - possible confilcts. - -Changes Prior to 2.2 release - - Assorted bug fixes - - Support write-intent-bitmaps on raid10 - - Support little-endian (Rather than hostendian) bitmaps. - - Return correct error code from 'mdadm -S' - - Remove extra blank line from 'mdadm -Eb' output. - - Improve option parsing so that -a and -b do not have - optional arguements: the arg is either required or not - depending on context. - - Allow scanning of devices listed in /proc/partitions even - if they don't appear in /dev. 
- - Support --assume-clean in --create mode as well as --build - - Add support for --monitor to report to syslog: -y or --syslog. - Thanks to Ross Vandegrift - - --monitor now reports which device failed in a 'Fail' message - This broke with 2.6 - - Improve chance of array starting properly after a crash. - mdadm was insisting the event numbers were identical, but this - isn't needed, and is a problem if the crash was while the metadata - was being updated. - - Support --update==uuid - - Added README.initramfs and mkinitramfs to help people use an - initram for starting md arrays at boot. - -Changes Prior to 2.1 release - - Fix assembling of raid10 array when devices are missing. - mdadm now correctly detects if a array is workable or not - depending on which devices are present, and so will correctly - handle "--assemble --force" if multiple devices have failed. - - Report raid10 layout in --examine output. - - Fix assembling of arrays that use the version-1 superblock and - have spares. Previously the spares would be ignored. - - Fix bug so that multiple drives can be re-added at once. - - Fix problem with hot-adding a bitmap to version-1-superblock - arrays. - -Changes Prior to 2.0 - - Support assembling from byte-swapped superblocks - metadata type "0.swap" and --update=byteorder - - write-mostly and write-behind support for raid1. - - Support --name= and 'name=' config entry for identifying - arrays be name. - - RAID10 in man pages. - - Lot of minor manpage updates - -Changes Prior to 2.0-devel-3 release - - Assorted fixes for multiple bugs... - - Add test suite - -Changes Prior to 1.12.0 release - Several of these are backported from the Debian package - - Don't use 'lstat' to check for blockdevices, use stat. 
- - Document --size=max option for --grow - - Document SparesMissing event and DeviceDisappeared/WrongLevel - - --stop --scan repeatly cycles until no more progress can be made - so that stack devices are stopped properly - - Minor type rationalisation for ident->uuid - now always 'int[]' - - Fix type in online help for --grow (was -F, now -G) - - Allow --auto command line option to set default auto= - value when running "--assemble --scan". Previously - --auto was ignored if --scan was given - - Fix a few type casts - - Fix parsing of /dev/md/N in is_standard - - Fix rounding errors in human_size() - - Fix silly example in mdadm.conf-examples - - When finding a /dev name for a device, prefer shorter names - - Suppress listing of devices= in --brief output of -D or -E, - unless -v is given (-vv gives the old behaviour with -Dsv). - This is because the device list can change and so is not a - stable aspect of the array - - Allow --force with --grow so '-Gfn1' works (on raid1) - - Replace sprintf calls with snprintf (to quiet diet-libc) - - Add man page for mdassemble - - Fix compilation with tinyc - -Changes Prior to 1.11.0 release - - Fix embarassing bug which causes --add to always fail. - -Changes Prior to 1.10.0 release - - Fix bug with --config=partitions - - Open sub-devices with O_EXCL to detect if already in use - - Make sure superblock updates are flushed directly to disk. - -Changes Prior to 2.0-devel-1 release - - Support for version-1 superblock. See --metadata option. - - Support for bitmap based intent logging. - - Minor fixes. - -Changes Prior to 1.9.0 release - - Fix rpm build problem (stray %) - - Minor manpage updates - - Change "dirty" status to "active" as it was confusing people. - - --assemble --auto recognises 'standard' name and insists on using - the appropriate major/minor number for them. 
- - Remove underscore from partition names, so partitions of - "foo" are "foo1", "foo2" etc (unchanged) and partitions of - "f00" are "f00p1", "f00p2" etc rather than "f00_p1"... - - Use "major", "minor", "makedev" macros instead of - "MAJOR", "MINOR", "MKDEV" so that large device numbers work - on 2.6 (providing you have glibc 2.3.3 or later). - - Add some missing closes of open file descriptors. - - Reread /proc/partition for every array assembled when using - it to find devices, rather than only once. - - Make "mdadm -Ss" stop stacked devices properly, by reversing the - order in which arrays are stopped. - - Improve some error messages. - - Allow device name to appear before first option, so e.g. - mdadm /dev/md0 -A /dev/sd[ab] - works. - - Assume '-Q' if just a device is given, rather than being silent. - - Change "dirty" status to "active" as it was confusing people. - -Changes Prior to 1.8.0 release - - Makefile cleanup from Luca Berra - - --pid-file (-i) to set a pid file to use with --monitor --daemonise - - Fix typo in mdadm man page - - Fix coredump when "-s" used with no config file present. - - Support new "faulty" personality which can inject synthetic - faults. (Not in kernel.org yet at 1Nov2004) - - Support raid0/linear on devices > 2 Terabytes - - Make sure raid6 resyncs when created with one missing device - -Changes Prior to 1.7.0 release - - Support "--grow --add" to add a device to a linear array, if the - kernel supports it. Not documented yet. - - Restore support for uclibc which was broken recently. - - Several improvements to the output of --detail, including - reporting "resyncing" or "recovering" in the state. - - Close filedescriptor at end of --detail (exit would have closed it - anyway, so this isn't abig deal). - - Report "Sync checkpoint" in --examine output if appropriate. - - Add --update=resync for --assemble mode to for a resync when the - array is assembled. - - Add support for "raid10", which is under development in 2.6. 
- Not documented yet. - - --monitor now reads spare-group and spares info from config file - even when names of arrays to scan are given on the command line - -Changes Prior to 1.6.0 release - - Device name given in -Eb is determined by examining /dev rather - than assuming /dev/md%d - - Fix bug in --monitor where an array could be held open an so - could not be stopped without killing mdadm. - - Add --grow mode. Currently only --size and --raid-disks can be - changed. Both require kernel support which, at the time of - writing, is not in a release kernel yet. - - Don't print out "errors" or "no-errors" in -D and -E, as the bit - is never set or used. - - Use md event notification in 2.6.??? to make --monitor mode - respond instantly to events. - - Add --auto= option and auto= configfile entry to tell mdadm to - create device files as needed. This is particularly useful - with partitioned arrays where the major device number can change. - - When generating --brief listing, if the standard name doesn't - exist, search /dev for one rather than using a temp name. - - Allow --build to build raid1 and multipath arrays. - - Add "--assume-clean" for Create and Build, particularly for raid1 - Note: this is dangerous. Only use it if you are certain. - - Fix bug so that Rebuild status monitoring works again. - - Add "degraded" and "recovering" options to the "Status:" - entry for --detail - -Changes Prior to 1.5.0 release - - new commands "mdassemble" which is a stripped-down equivalent of - "mdadm -As", that can be compiled with dietlibc. - Thanks to Luca Berra . - It can be using in an initramfs or initrd. - - Fix compiling error with BLKGETSIZE64 and some signed/unsigned - comparison warnings. - - Add Rebuild Status (% complete) to --detail output. - - Support "--monitor --test" which will generate a test alert - for each array once, to test notification paths. - - Generate RebuildFinished event when rebuild finishes. 
- - Support for raid6 as found in 2.6.2 - thanks to - H. Peter Anvin - - Support partitioned md arrays with a different major number and - naming scheme (md_dX in /proc/mdstat, /dev/md/dXpY in /dev). - -Changes Prior to 1.4.0 release - - Document fact that creating a raid5 array really creates a - degraded array with a spare. - - Add "spares=" tag to config file and generate it wit --detail and - --examine - - Add "SparesMissing" event when --monitor first sees an array and - it doesn't have the enough spare devices. - - Add --update=summaries for --assemble to update summary - information in superblock, and correct other inconsistancies in - the superblock. - - Add --test option to --detail to set a meaningful exit status. - -Changes Prior to 1.3.0 release - - Make 'size' and unsigned long in Create to allow creation of - larger arrays. - - Explicitly flag spare devices as 'spare' in --detail and --examine - output. Previously they simply had no flags lists. - - Make MailCmd (for monitor) configurable in Makefile, and default - to "/usr/sbin/sendmail -t". Also split out the warning related - flags into CWFLAGS for easier build configurability. - - Minor bugfix in Manage code. - - --monitor now notices and reports degraded arrays at startup using - "DegradedArray" event, and also has a --oneshot option to only - report DegradedArrays, and then exit. - - Small man-page clarification w.r.t. raid levels and raid4 in - particular. - - Disallow creation of arrays with only one device as this is - probably a mistake. --force will override this check. - - Correct some misleading documentation in the "mdadm --create --help" - message. - - Ignore chunksize if raid1 or multipath. - - Explicit statement in man page that raid-disks cannot be changed - after array is created. - - Improve message when attempting to start an array with - insufficient devices. Instead of required the array to be full, - we only require it has as many active devices as last time. 
- -Changes Prior to 1.2.0 release - - Fix bug where --daemonise required an argument. - - In --assemble --verbose, print appropriate message if device is - not in devices= list - - Updated mdadm.conf.5 to reflect fact that device= takes wildcards - - Typos: componenet -> component - - Reduce size of "--help" message put excess into "--help-options" - - Fix bug introduced when MD_SB_DISKS dependancy removed, and which - caused spares not be assembled properly. - - Print appropriate message if --monitor --scan decides not to - monitor anything. -Changes Prior to 1.1.0 release - - add --deamonise flag for --monitor - forks and prints pid to stdout - - Fix bug so we REALLY clear dirty flag with -Af - - -Db now prints a 'devices=' word for each array. - - "mdadm -A /dev/md0" will get info from configfile, even without scan - - When assembling multipath arrays, ignore devices which are flagged - as having errors. - - take --super-minor=dev to mean "use the minor number of the mddev - being assembled. - - take --config=none to mean "completely ignore config file" - - Make --monitor require --scan or a device list. -Changes Prior to 1.0.9 release - - Documentation updates including kernel parameters documented - in md.4 - - --assemble --force for raid4/5 will mark clean, needed for 2.5 - - --detail prints out the events counter as well - - flush device before reading superblock to be sure to get - current data - - added mdadm.static target to makefile for static linking - - --monitor was ignoring /dev/md0 due to off-by-one error - - Fix assorted typos - - Fix printing of Gibibytes - calc was wrong. - - Fix printing of Array Size in --detail when very big. - - --monitor no longer tries to work for raid0 or linear as these - have nothing to be monitored. 
- - The word 'partitions' on a DEVICE line will cause all partitions - listed in /proc/partitions to be considered - - If the config file is called 'partitions' then it will be treated - as though it contained exactly 'device partitions' so e.g. - mdadm -Ebsc partitions - will find all raid partitions easily. - - successfully assemble multipath devices by ignoring raid_disk - value from superblock (it is always the same). - - --assemble not tied to MD_SB_DISKS limit quite so much - - Support compiling with tcc - - Support compiling with uclibc - just skip scan of /dev - - Add --update= option for Assemble mode. Either sparc2.2 - or super-minor updates are possible. See mdadm.8 - -Changes Prior to 1.0.1 release - - Round off MB/GiB etc values instead of round down. - - Add --sparc2.2 option to examine to shift superblock around - and --sparc2.2update to rewrite the superblock - - Fix assorted typos in online help - -Changes Prior to 1.0.0 release - - Allow --config with Misc mode (for --examine --scan) - - Add $(CXFLAGS) to end of CFLAGS in makefile - - When making an N disk raid5 array, the Nth drive - is moved to the end of the array as a spare rather than - being shifted up one place. This means that when the - kernel builds onto the last spare and inserts it, - the devices will be in the expected order. - - Man page improvements -Changes Prior to 0.8.2 release - - Correct spelling of persist[ae]nce/persist[ae]nt. - - Change "disk" to "device" in options and config file - - convert array size to "long long" *before* shift-left in -D and -Q - -Changes Prior to 0.8.1 release - - Add "INSTALL" file. - - Fix some "i" variables that were not being set properly - - Initialise minsize and maxsize so that compilers don't complain. 
- - Tidy up Makefile and mdadm.spec installations - - Add "multipath" to documentation of valid levels - -Changes Prior to 0.8 release - - Fix another bug in Assemble.c due to confusing 'i' with 'j' - - Minimal, untested, support for multipath - - re-write of argument parsing to have more coherent modes, - - add --query,-Q option - - Update mdadm.8 to reflect arg processing change and --query - - Change "long" to "unsigned long" for device sizes - - Handle "mailaddr" and "program" lines in config file for follow/scan mode. - - --follow --scan will exit if no program or mail found - - Add MAILADDR and PROGRAM to mdadm.conf-example - - Spell check man pages - - consistently use "component devices" instead of "subdevices" - - Make -Wall -Werror really work and fix lots of errors. - - --detail and --stop can have --scan which chooses devices from /proc/mdstat - - --monitor detects 20% changes in resync, failed spares, - disappearing arrays, - - --monitor --scan will automatically add any devices found in /proc/mdstat - - --monitor will move spares between arrays with same spare-group if necessary - - Documentation for Monitor Mode - - --query notes if the array containing the given device is active or not - - Finished md.4 man page. - -Changes Prior to 0.7.2 release - - mdadm.spec updates and ifdef BLKGETSIZE64 from Luca Berra -- bluca@comedia.it - - more mdadm.spec updates from Gregory Leblanc - - make directory for mdadm.conf configurable in Makefile - - Finished mdadm.conf.5. Removed details of conf file from - mdadm.8 leaving a reference to mdadm.conf.5. - - Fix bug in Assemble.c, thanks to Junaid Rizvi - - Get --assemble --force to make sure old major/minor numbers are - consistant, as md.c worries about this :-( - - -Changes Prior to 0.7.1 release - - update mdadm.spec - - use BLKGETSIZE64 if available for array size - - give human readable as GiB/MiB and GB and MB, with 2 decimal point precision - - Only warn about size variation for raid1/4/5. 
- - Started md.4 man page - - Started mdadm.conf.5 man page - -Changes Prior to 0.7 release - - - Fix makefile to install binary at /sbin and not /sbin/sbin - Also install man page. - - Add --zero-superblock based on --destroywithextremeprejudice - from Dale Stephenson - - change name to mdadm. It is palandromic, and much nicer to pronouce. - -Changes Prior to 0.6 release - - - Remove the limit on the number of device names that can be - given on the command line. - - Fix bug in --assemble --force where it would only update a - single superblock. - - Fix bogus printing of big numbers not being block devices - when given names of devices that don't exist. - - When --assemble --force, consider superblocks with an event - count that is 1 behind as out-of-date. Normally they are - considered up-to-date (as the kernel assumes this too). - - When marking drives as not-failed in the superblock, - we also mark them as ACTIVE and SYNC. - - Don't start arrays for which not all drives are available unless: - --scan which implies that all drives were found automatically - --run which means the user knows what they want - --force which means that we are fixing something broken - - Make sure all device numbers passed as 3rd arg of ioctl - are passed as unsigned lock, so that it works on SPARC - - If HOT_ADD_DISK failes for -a, then only try ADD_NEW_DISK - if we cannot read from the array, i.e. if the array is - not started yet. - - man page update - - Taught Examine to handle --scan. It examines all devices listed - on DEVICE lines in the config file. - - Added --brief (-b) flag for Examine and Detail to print out - and mdctl.conf compatible description with uuid=, level=, - disks= and - for Examine - devices= - --examine --brief collects all devices the make the one array and - list them as one entry. - - Added level= and disks= options to ARRAY lines in config files - so --brief output could be used as-is. 
- - Make parity style ({left,right}-{,a}symmetric) consistantly use -, - never _. - - Add "Array Size" to --detail output - - Change "Size" to "Device Size" and exclude from Detail of arrays - that do not have a consistent device size. - - Add Human readable MiB or GiB value on size lines of Detail and Examine - - --assemble --scan doesn't complain about active drives - - require number of spares given in -x to be listed. - - Made --build actually work. -Changes Prior to 0.5 release - - --assemble: - spare drives are handled properly. - - --force can be used to recover from 2-drive failures on RAID5 - If you belive that /dev/hda1 /dev/hdb1 /dev/hdc1 /dev/hdd1 should - make a raid5 array, but it has experienced multiple failures and - wont start, then - - mdctl --assemble --force /dev/md0 /dev/hd[abcd]1 - - Should update the superblock on the newest failed drive and - restart the array in degraded mode. You should then remove the - remaining failed drive and re-add it (if you are happy that it - might work). - - Ofcourse whenever you have a 2-drive failure, you have a risk - of corruption in data that hasn't be changed for a long time. So - this doesn't give you your array back all nice and happy, but it - does allow you to recover data that might not be corrupt. - - More flexibility in identifying a RAID array in the mdctl.conf - e.g. - array /dev/md4 super-minor=4 - - assembles /dev/md4 from all devices found that have a raid - superblock that says the minor number of the array is 4. - If the blocks with the right minor number do not all have the - same UUID, an error is flags and no assembly happens. - - array /dev/md3 devices=/dev/hd[abc]2 - - Assembles /dev/md3 drom /dev/hda2 /dev/hdb2 and/dev/hdc2. All - devices must exist and have raid superblock with the same uuid. 
- - If two identity specifiers are used, only devices that match all - of them are considered, so - - array /dev/md2 devices=/dev/hd?2 super-minor=2 - - will assemble /dev/md2 using all /dev/hd?2 devices which have a - raid superblock with minor number 2. - - --create: - When listing devices for --create, the word "missing" can be - used to indicate that the respective slot does not have a - working drive currently. This is similar to the "failed-disk" - directive in mkraid/raidtab. - e.g. - mdctl --create --level=5 -raid-disks=4 --spare-disks=2 - /dev/md0 /dev/sda /dev/sdb missing /dev/sdc /dev/sdd /dev/sde - - will create a raid5 array with the third slot empty, and two - spares. - - By default, raid5 arrays are created with the last slot empty - and drive listed for the last slot added as a spare. If a - "missing" slot is given, or if --force is given, then --create - does exactly what you ask and doesn't try to be clever. - - - --follow / --monitor: - - This is a new mode. I couldn't stop my self from picking a name - starting with F (as current modes start A,B,C,D,E) but I - relented and provided an alternate name that is somewhat more - meaningful. - - In this mode, mdctl does not exit, but runs continuously and - periodically polls all the md devices to see if they have had - any interested state change. - The changes that it currently notices are: - Fail - an active disc fails - FailSpare - a spare, that was presumably being build, fails - ActiveSpare - a spare becomes active, presumably after a rebuild. - - Options: - --mail mailaddress - send Email on any Fail* event - --program program - run the program on any event. - Args are: eventname mddevice subdevice(if-known) - --delay seconds - change from the default 60second pause - between polls. - - I plan to add functionality to this mode to allow sharing of - spare drives. 
If an array is marks "spare-group=fred", and it - has a failed drive and no spares, and if some other array is - also "spare-group=fred" and it has no failed drives, but does - have a spare drive that is big enough, the spare will be moved - to the first array. - - I also have the idea of adding a --grow mode which will re-organise - the data on an N disk raid0/4/5 array to be on an N+M disk array. - I have no concrete plans for this though. - - I got rid of the "v" in the archive file name, and include the - version number in the directory created by the archive. - - There is now a man page and mdctl.spec (for rpm) thanks to - Danilo Godec . - - Ofcourse, the man page is now out of date and despite being based on - the --help output, is not wholy correct. After I get --follow - working properly, I plan to revise the various documentation and/or - the code to make sure the two match. +Changes Prior to release 3.1.4 + Two fixes related to configs that aren't using udev: + - Don't remove md devices with 'standard' names on --stop + - Allow dev_open to work on read-only /dev + And fixed regressions: + - Allow --incremental to add spares to an array + - Accept --no-degraded as a deprecated option rather than + throwing an error + - Return correct success status when --incremental assembling + a container which does not yet have enough devices. + - Don't link mdadm with pthreads, only mdmon needs it. + - Fix compiler warning due to bad use of snprintf + +Changes Prior to release 3.1.3 + - mapfile now lives in a fixed location which defaults to + /dev/.mdadm/map but can be changed at compile time. This + location is chosen as most distros provide it during early + boot and preserve it through. As long as /dev exists and is + writable, /dev/.mdadm will be created. + Other files for communication with mdmon live here too. + This fixes a bug reported by Debian and Gentoo users where + udev would spin in early-boot.
+ - IMSM and DDF metadata will not be recognised on partitions + as they should only be used on whole-disks. + - Various overflows caused by 2G drives have been addressed. + - A subarray of an IMSM container can now be killed with + --kill-subarray. Also subarrays can be renamed with + --update-subarray + - -If (or --incremental --fail) can be used from udev to + fail and remove from all arrays a device which has been + unplugged from the system. i.e. hot-unplug-support. + - "mdadm /dev/mdX --re-add missing" will look for any device + that looks like it should be a member of /dev/mdX but isn't + and will automatically --re-add it + - Now compile with -Wextra to get extra warnings. + - Lots of minor bug fixes, documentation improvements, etc. + +Changes Prior to release 3.1.2 + - The default metadata has changed again (sorry about that). + It is now v1.2 and will hopefully stay that way. It turned + out there were boot-block issues with v1.1 which make it + unsuitable for a default, though in many cases it is still + suitable to use. + - Stopping a container is not permitted when members are still + active + - Add 'homehost' to the valid words for the "AUTO" config file + line. When followed by "-all", this causes mdadm to + auto-assemble any array belonging to this host, but not + auto-assemble anything else. + - Fix some bugs with "--grow --chunksize=" for changing chunksize. + - VAR_RUN can be easily changed at compile time just like ALT_RUN. + This gives distros more flexibility in how to manage the + pid and sock files that mdmon needs. + - Various mdmon fixes + - Always make bitmap 4K-aligned if at all possible. + - If mdadm.conf lists arrays which have inter-dependencies, + they previously had to be listed in the "right" order. Now + any order should work. + - Fix --force assembly of v1.x arrays which are in the process + of recovering. + - Add section on 'scrubbing' to 'md' man page. + - Various command-line-option parsing improvements. + - ...
and lots of other bug fixes. + +Changes Prior to release 3.1.1 + - Multiple fixes for new --grow levels including fixes for + serious data corruption problems. + - Change default metadata to v1.1 + - Change default chunk size to 512K + - Change default bitmap chunk size to 64Meg + - When --re-add is used, don't fall back to + --add if --re-add fails as this can destroy data. + +Changes Prior to release 3.1 + - Support --grow to change the layout of RAID4/5/6 + - Support --grow to change the chunksize of raid 4/5/6 + - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and + back. + - Support --grow to reduce the number of devices in RAID4/5/6. + - Support restart of these grow options when assembling an array + which is partially grown. + - Assorted tests of this code, and of different RAID6 layouts. + +Changes Prior to release 3.0.3 + - Improvements for creating arrays giving just a name, like 'foo', + rather than the full '/dev/md/foo'. + - Improvements for assembling member arrays of containers. + - Improvements to test suite + - Add option to change increment for RebuildNN messages reported + by "mdadm --monitor" + - Improvements to mdmon 'hand-over' from initrd to final root. + - Handle merging of devices that have left an IMSM array and are + being re-incorporated. + - Add missing space in "--detail --brief" output. + +Changes Prior to release 3.0.2 + - Fix crash when homehost is not set, as often happens in + early boot. + +Changes Prior to release 3.0.1 + - Fix various segfaults + - Fixes for --examine with containers + - Lots of other little fixes. + +Changes Prior to release 3.0 + - Support for externally managed metadata, specifically DDF and IMSM. + - Depend on udev to create entries in /dev, rather than creating them + ourselves. + - remove --auto-update-home-hosts + - new config file line "auto" + - new "" and "any" options for "homehost" + - numerous bug fixes and minor enhancements.
diff -Nru mdadm-2.6.7.1/check.d/_numbers mdadm-3.1.4/check.d/_numbers --- mdadm-2.6.7.1/check.d/_numbers 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/check.d/_numbers 1970-01-01 02:00:00.000000000 +0200 @@ -1 +0,0 @@ -07 root_on_raid diff -Nru mdadm-2.6.7.1/check.d/root_on_raid mdadm-3.1.4/check.d/root_on_raid --- mdadm-2.6.7.1/check.d/root_on_raid 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/check.d/root_on_raid 1970-01-01 02:00:00.000000000 +0200 @@ -1,35 +0,0 @@ -#!/bin/sh - -. /lib/preseed/preseed.sh -. /lib/partman/lib/base.sh - -# Prompt for BOOT_DEGRADED=true|false if / or /boot is on a /dev/md* -root_on_raid () { - prompt=$( - for i in /lib/partman/fstab.d/*; do - [ -x "$i" ] || continue - $i - done | - while read fs mp type options dump pass; do - if mdadm --detail "$fs" 2>/dev/null | grep -qsi " raid1$"; then - if [ "$mp" = "/" ] || [ "$mp" = "/boot" ]; then - echo "true" - fi - fi - done - ) - prompt="$(echo "$prompt" | head -n1)" - case $prompt in - true) - db_input critical mdadm/boot_degraded || true - db_go || true - db_get mdadm/boot_degraded - # write to preseed log so that /target knows about it - echo mdadm mdadm/boot_degraded boolean "$RET" >> "$logfile" - - ;; - esac - exit 0 -} - -root_on_raid diff -Nru mdadm-2.6.7.1/config.c mdadm-3.1.4/config.c --- mdadm-2.6.7.1/config.c 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/config.c 2010-08-31 10:18:39.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,12 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" @@ -35,7 +30,6 @@ #include #include #include -#include /* * Read the config file @@ -57,7 +51,7 @@ * with a key word, and not be indented, or must start with a * non-key-word and must be indented. * - * Keywords are DEVICE and ARRAY + * Keywords are DEVICE and ARRAY ... and several others. * DEV{ICE} introduces some devices that might contain raid components. * e.g. * DEV style=0 /dev/sda* /dev/hd* @@ -80,7 +74,8 @@ char DefaultConfFile[] = CONFFILE; char DefaultAltConfFile[] = CONFFILE2; -enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev, Homehost, LTEnd }; +enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev, + Homehost, AutoMode, LTEnd }; char *keywords[] = { [Devices] = "devices", [Array] = "array", @@ -89,6 +84,7 @@ [Program] = "program", [CreateDev]= "create", [Homehost] = "homehost", + [AutoMode] = "auto", [LTEnd] = NULL }; @@ -262,12 +258,44 @@ d->devname = strdup(name); d->next = rv; d->used = 0; + d->content = NULL; rv = d; } fclose(f); return rv; } +mddev_dev_t load_containers(void) +{ + struct mdstat_ent *mdstat = mdstat_read(1, 0); + struct mdstat_ent *ent; + mddev_dev_t d; + mddev_dev_t rv = NULL; + + if (!mdstat) + return NULL; + + for (ent = mdstat; ent; ent = ent->next) + if (ent->metadata_version && + strncmp(ent->metadata_version, "external:", 9) == 0 && + !is_subarray(&ent->metadata_version[9])) { + d = malloc(sizeof(*d)); + if (!d) + continue; + if (asprintf(&d->devname, "/dev/%s", ent->dev) < 0) { + free(d); + continue; + } + d->next = rv; + d->used = 0; + 
d->content = NULL; + rv = d; + } + free_mdstat(mdstat); + + return rv; +} + struct createinfo createinfo = { .autof = 2, /* by default, create devices with standard names */ .symlinks = 1, @@ -399,7 +427,8 @@ struct conf_dev *cd; for (w=dl_next(line); w != line; w=dl_next(w)) { - if (w[0] == '/' || strcasecmp(w, "partitions") == 0) { + if (w[0] == '/' || strcasecmp(w, "partitions") == 0 || + strcasecmp(w, "containers") == 0) { cd = malloc(sizeof(*cd)); cd->name = strdup(w); cd->next = cdevlist; @@ -414,6 +443,17 @@ mddev_ident_t mddevlist = NULL; mddev_ident_t *mddevlp = &mddevlist; +static int is_number(char *w) +{ + /* check if there are 1 or more digits and nothing else */ + int digits = 0; + while (*w && isdigit(*w)) { + digits++; + w++; + } + return (digits && ! *w); +} + void arrayline(char *line) { char *w; @@ -435,13 +475,39 @@ mis.bitmap_fd = -1; mis.bitmap_file = NULL; mis.name[0] = 0; + mis.container = NULL; + mis.member = NULL; for (w=dl_next(line); w!=line; w=dl_next(w)) { - if (w[0] == '/') { - if (mis.devname) - fprintf(stderr, Name ": only give one device per ARRAY line: %s and %s\n", - mis.devname, w); - else mis.devname = w; + if (w[0] == '/' || strchr(w, '=') == NULL) { + /* This names the device, or is ''. + * The rules match those in create_mddev. 
+ * 'w' must be: + * /dev/md/{anything} + * /dev/mdNN + * /dev/md_dNN + * + * or anything that doesn't start '/' or '<' + */ + if (strcasecmp(w, "") == 0 || + strncmp(w, "/dev/md/", 8) == 0 || + (w[0] != '/' && w[0] != '<') || + (strncmp(w, "/dev/md", 7) == 0 && + is_number(w+7)) || + (strncmp(w, "/dev/md_d", 9) == 0 && + is_number(w+9)) + ) { + /* This is acceptable */; + if (mis.devname) + fprintf(stderr, Name ": only give one " + "device per ARRAY line: %s and %s\n", + mis.devname, w); + else + mis.devname = w; + }else { + fprintf(stderr, Name ": %s is an invalid name for " + "an md device - ignored.\n", w); + } } else if (strncasecmp(w, "uuid=", 5)==0 ) { if (mis.uuid_set) fprintf(stderr, Name ": only specify uuid once, %s ignored.\n", @@ -458,12 +524,13 @@ w); else { char *endptr; - mis.super_minor= strtol(w+12, &endptr, 10); - if (w[12]==0 || endptr[0]!=0 || mis.super_minor < 0) { + int minor = strtol(w+12, &endptr, 10); + + if (w[12]==0 || endptr[0]!=0 || minor < 0) fprintf(stderr, Name ": invalid super-minor number: %s\n", w); - mis.super_minor = UnSet; - } + else + mis.super_minor = minor; } } else if (strncasecmp(w, "name=", 5)==0) { if (mis.name[0]) @@ -517,19 +584,26 @@ } else if (strncasecmp(w, "auto=", 5) == 0 ) { /* whether to create device special files as needed */ mis.autof = parse_auto(w+5, "auto type", 0); + } else if (strncasecmp(w, "member=", 7) == 0) { + /* subarray within a container */ + mis.member = strdup(w+7); + } else if (strncasecmp(w, "container=", 10) == 0) { + /* the container holding this subarray. 
Either a device name + * or a uuid */ + mis.container = strdup(w+10); } else { fprintf(stderr, Name ": unrecognised word on ARRAY line: %s\n", w); } } - if (mis.devname == NULL) - fprintf(stderr, Name ": ARRAY line with no device\n"); - else if (mis.uuid_set == 0 && mis.devices == NULL && mis.super_minor == UnSet && mis.name[0] == 0) + if (mis.uuid_set == 0 && mis.devices == NULL && + mis.super_minor == UnSet && mis.name[0] == 0 && + (mis.container == NULL || mis.member == NULL)) fprintf(stderr, Name ": ARRAY line %s has no identity information.\n", mis.devname); else { mi = malloc(sizeof(*mi)); *mi = mis; - mi->devname = strdup(mis.devname); + mi->devname = mis.devname ? strdup(mis.devname) : NULL; mi->next = NULL; *mddevlp = mi; mddevlp = &mi->next; @@ -559,11 +633,12 @@ if (alert_mail_from == NULL) alert_mail_from = strdup(w); else { - char *t= NULL; - int ret = asprintf(&t, "%s %s", alert_mail_from, w); - assert(ret >= 0); - free(alert_mail_from); - alert_mail_from = t; + char *t = NULL; + + if (xasprintf(&t, "%s %s", alert_mail_from, w) > 0) { + free(alert_mail_from); + alert_mail_from = t; + } } } } @@ -584,12 +659,15 @@ } static char *home_host = NULL; +static int require_homehost = 1; void homehostline(char *line) { char *w; for (w=dl_next(line); w != line ; w=dl_next(w)) { - if (home_host == NULL) + if (strcasecmp(w, "")==0) + require_homehost = 0; + else if (home_host == NULL) home_host = strdup(w); else fprintf(stderr, Name ": excess host name on HOMEHOST line: %s - ignored\n", @@ -597,6 +675,25 @@ } } +static char *auto_options = NULL; +void autoline(char *line) +{ + char *w; + + if (auto_options) { + fprintf(stderr, Name ": AUTO line may only be give once." 
+ " Subsequent lines ignored\n"); + return; + } + + auto_options = dl_strdup(line); + dl_init(auto_options); + + for (w=dl_next(line); w != line ; w=dl_next(w)) { + char *w2 = dl_strdup(w); + dl_add(auto_options, w2); + } +} int loaded = 0; @@ -667,6 +764,9 @@ case Homehost: homehostline(line); break; + case AutoMode: + autoline(line); + break; default: fprintf(stderr, Name ": Unknown keyword %s\n", line); } @@ -696,9 +796,11 @@ return alert_program; } -char *conf_get_homehost(void) +char *conf_get_homehost(int *require_homehostp) { load_conffile(); + if (require_homehostp) + *require_homehostp = require_homehost; return home_host; } @@ -713,11 +815,19 @@ mddev_ident_t rv; load_conffile(); rv = mddevlist; - while (dev && rv && strcmp(dev, rv->devname)!=0) + while (dev && rv && (rv->devname == NULL + || !devname_matches(dev, rv->devname))) rv = rv->next; return rv; } +static void append_dlist(mddev_dev_t *dlp, mddev_dev_t list) +{ + while (*dlp) + dlp = &(*dlp)->next; + *dlp = list; +} + mddev_dev_t conf_get_devs() { glob_t globbuf; @@ -735,13 +845,17 @@ load_conffile(); - if (cdevlist == NULL) - /* default to 'partitions */ + if (cdevlist == NULL) { + /* default to 'partitions' and 'containers' */ dlist = load_partitions(); + append_dlist(&dlist, load_containers()); + } for (cd=cdevlist; cd; cd=cd->next) { - if (strcasecmp(cd->name, "partitions")==0 && dlist == NULL) - dlist = load_partitions(); + if (strcasecmp(cd->name, "partitions")==0) + append_dlist(&dlist, load_partitions()); + else if (strcasecmp(cd->name, "containers")==0) + append_dlist(&dlist, load_containers()); else { glob(cd->name, flags, NULL, &globbuf); flags |= GLOB_APPEND; @@ -753,6 +867,7 @@ t->devname = strdup(globbuf.gl_pathv[i]); t->next = dlist; t->used = 0; + t->content = NULL; dlist = t; /* printf("one dev is %s\n", t->devname);*/ } @@ -777,6 +892,66 @@ return 0; } +int conf_test_metadata(const char *version, int is_homehost) +{ + /* Check if the given metadata version is allowed + * to be 
auto-assembled. + * The default is 'yes' but the 'auto' line might over-ride that. + * Words in auto_options are processed in order with the first + * match winning. + * word can be: + * +version - that version can be assembled + * -version - that version cannot be auto-assembled + * yes or +all - any other version can be assembled + * no or -all - no other version can be assembled. + * homehost - any array associated by 'homehost' to this + * host can be assembled. + * + * Thus: + * +ddf -0.90 homehost -all + * will auto-assemble any ddf array, no 0.90 array, and + * any other array (imsm, 1.x) if and only if it is identified + * as belonging to this host. + */ + char *w; + load_conffile(); + if (!auto_options) + return 1; + for (w = dl_next(auto_options); w != auto_options; w = dl_next(w)) { + int rv; + if (strcasecmp(w, "yes") == 0) + return 1; + if (strcasecmp(w, "no") == 0) + return 0; + if (strcasecmp(w, "homehost") == 0) { + if (is_homehost) + return 1; + else + continue; + } + if (w[0] == '+') + rv = 1; + else if (w[0] == '-') + rv = 0; + else continue; + + if (strcasecmp(w+1, "all") == 0) + return rv; + if (strcasecmp(w+1, version) == 0) + return rv; + /* allow '0' to match version '0.90' + * and 1 or 1.whatever to match version '1.x' + */ + if (version[1] == '.' && + strlen(w+1) == 1 && + w[1] == version[0]) + return rv; + if (version[1] == '.' && version[2] == 'x' && + strncmp(w+1, version, 2) == 0) + return rv; + } + return 1; +} int match_oneof(char *devices, char *devname) { @@ -802,3 +977,128 @@ } return 0; } + +int devname_matches(char *name, char *match) +{ + /* See if the given array name matches the + * given match from config file. 
+ * + * First strip and /dev/md/ or /dev/, then + * see if there might be a numeric match of + * mdNN with NN + * then just strcmp + */ + if (strncmp(name, "/dev/md/", 8) == 0) + name += 8; + else if (strncmp(name, "/dev/", 5) == 0) + name += 5; + + if (strncmp(match, "/dev/md/", 8) == 0) + match += 8; + else if (strncmp(match, "/dev/", 5) == 0) + match += 5; + + + if (strncmp(name, "md", 2) == 0 && + isdigit(name[2])) + name += 2; + if (strncmp(match, "md", 2) == 0 && + isdigit(match[2])) + match += 2; + + return (strcmp(name, match) == 0); +} + +int conf_name_is_free(char *name) +{ + /* Check if this name is already take by an ARRAY entry in + * the config file. + * It can be taken either by a match on devname, name, or + * even super-minor. + */ + mddev_ident_t dev; + + load_conffile(); + for (dev = mddevlist; dev; dev = dev->next) { + char nbuf[100]; + if (dev->devname && devname_matches(name, dev->devname)) + return 0; + if (dev->name[0] && devname_matches(name, dev->name)) + return 0; + sprintf(nbuf, "%d", dev->super_minor); + if (dev->super_minor != UnSet && + devname_matches(name, nbuf)) + return 0; + } + return 1; +} + +struct mddev_ident_s *conf_match(struct mdinfo *info, struct supertype *st) +{ + struct mddev_ident_s *array_list, *match; + int verbose = 0; + char *devname = NULL; + array_list = conf_get_ident(NULL); + match = NULL; + for (; array_list; array_list = array_list->next) { + if (array_list->uuid_set && + same_uuid(array_list->uuid, info->uuid, st->ss->swapuuid) + == 0) { + if (verbose >= 2 && array_list->devname) + fprintf(stderr, Name + ": UUID differs from %s.\n", + array_list->devname); + continue; + } + if (array_list->name[0] && + strcasecmp(array_list->name, info->name) != 0) { + if (verbose >= 2 && array_list->devname) + fprintf(stderr, Name + ": Name differs from %s.\n", + array_list->devname); + continue; + } + if (array_list->devices && devname && + !match_oneof(array_list->devices, devname)) { + if (verbose >= 2 && 
array_list->devname) + fprintf(stderr, Name + ": Not a listed device for %s.\n", + array_list->devname); + continue; + } + if (array_list->super_minor != UnSet && + array_list->super_minor != info->array.md_minor) { + if (verbose >= 2 && array_list->devname) + fprintf(stderr, Name + ": Different super-minor to %s.\n", + array_list->devname); + continue; + } + if (!array_list->uuid_set && + !array_list->name[0] && + !array_list->devices && + array_list->super_minor == UnSet) { + if (verbose >= 2 && array_list->devname) + fprintf(stderr, Name + ": %s doesn't have any identifying information.\n", + array_list->devname); + continue; + } + /* FIXME, should I check raid_disks and level too?? */ + + if (match) { + if (verbose >= 0) { + if (match->devname && array_list->devname) + fprintf(stderr, Name + ": we match both %s and %s - cannot decide which to use.\n", + match->devname, array_list->devname); + else + fprintf(stderr, Name + ": multiple lines in mdadm.conf match\n"); + } + return NULL; + } + match = array_list; + } + return match; +} diff -Nru mdadm-2.6.7.1/crc32.c mdadm-3.1.4/crc32.c --- mdadm-2.6.7.1/crc32.c 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/crc32.c 2010-03-22 08:08:42.000000000 +0200 @@ -0,0 +1,340 @@ +/* crc32.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2003 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Thanks to Rodney Brown for his contribution of faster + * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing + * tables for updating the shift register in one step with three exclusive-ors + * instead of four steps with four exclusive-ors. This results about a factor + * of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. + */ + +/* @(#) $Id$ */ + +/* + Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore + protection on the static variables used to control the first-use generation + of the crc tables. 
Therefore, if you #define DYNAMIC_CRC_TABLE, you should + first call get_crc_table() to initialize the tables before allowing more than + one thread to use crc32(). + */ + +#ifdef MAKECRCH +# include +# ifndef DYNAMIC_CRC_TABLE +# define DYNAMIC_CRC_TABLE +# endif /* !DYNAMIC_CRC_TABLE */ +#endif /* MAKECRCH */ + +/* #include "zutil.h" / * for STDC and FAR definitions */ +#define STDC +#define FAR +#define Z_NULL ((void*)0) +#define OF(X) X +#define ZEXPORT +typedef long ptrdiff_t; +#define NOBYFOUR + +#define local static + +/* Find a four-byte integer type for crc32_little() and crc32_big(). */ +#ifndef NOBYFOUR +# ifdef STDC /* need ANSI C limits.h to determine sizes */ +# include +# define BYFOUR +# if (UINT_MAX == 0xffffffffUL) + typedef unsigned int u4; +# else +# if (ULONG_MAX == 0xffffffffUL) + typedef unsigned long u4; +# else +# if (USHRT_MAX == 0xffffffffUL) + typedef unsigned short u4; +# else +# undef BYFOUR /* can't find a four-byte integer type! */ +# endif +# endif +# endif +# endif /* STDC */ +#endif /* !NOBYFOUR */ + +/* Definitions for doing the crc four data bytes at a time. */ +#ifdef BYFOUR +# define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \ + (((w)&0xff00)<<8)+(((w)&0xff)<<24)) + local unsigned long crc32_little OF((unsigned long, + const unsigned char FAR *, unsigned)); + local unsigned long crc32_big OF((unsigned long, + const unsigned char FAR *, unsigned)); +# define TBLS 8 +#else +# define TBLS 1 +#endif /* BYFOUR */ + +#ifdef DYNAMIC_CRC_TABLE + +local volatile int crc_table_empty = 1; +local unsigned long FAR crc_table[TBLS][256]; +local void make_crc_table OF((void)); +#ifdef MAKECRCH + local void write_table OF((FILE *, const unsigned long FAR *)); +#endif /* MAKECRCH */ + +/* + Generate tables for a byte-wise 32-bit CRC calculation on the polynomial: + x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1. 
+ + Polynomials over GF(2) are represented in binary, one bit per coefficient, + with the lowest powers in the most significant bit. Then adding polynomials + is just exclusive-or, and multiplying a polynomial by x is a right shift by + one. If we call the above polynomial p, and represent a byte as the + polynomial q, also with the lowest power in the most significant bit (so the + byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p, + where a mod b means the remainder after dividing a by b. + + This calculation is done using the shift-register method of multiplying and + taking the remainder. The register is initialized to zero, and for each + incoming bit, x^32 is added mod p to the register if the bit is a one (where + x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by + x (which is shifting right by one and adding x^32 mod p if the bit shifted + out is a one). We start with the highest power (least significant bit) of + q and repeat for all eight bits of q. + + The first table is simply the CRC of all possible eight bit values. This is + all the information needed to generate CRCs on data a byte at a time for all + combinations of CRC register values and incoming bytes. The remaining tables + allow for word-at-a-time CRC calculation for both big-endian and little- + endian machines, where a word is four bytes. 
+*/ +local void make_crc_table() +{ + unsigned long c; + int n, k; + unsigned long poly; /* polynomial exclusive-or pattern */ + /* terms of polynomial defining this crc (except x^32): */ + static volatile int first = 1; /* flag to limit concurrent making */ + static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26}; + + /* See if another task is already doing this (not thread-safe, but better + than nothing -- significantly reduces duration of vulnerability in + case the advice about DYNAMIC_CRC_TABLE is ignored) */ + if (first) { + first = 0; + + /* make exclusive-or pattern from polynomial (0xedb88320UL) */ + poly = 0UL; + for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++) + poly |= 1UL << (31 - p[n]); + + /* generate a crc for every 8-bit value */ + for (n = 0; n < 256; n++) { + c = (unsigned long)n; + for (k = 0; k < 8; k++) + c = c & 1 ? poly ^ (c >> 1) : c >> 1; + crc_table[0][n] = c; + } + +#ifdef BYFOUR + /* generate crc for each value followed by one, two, and three zeros, + and then the byte reversal of those as well as the first table */ + for (n = 0; n < 256; n++) { + c = crc_table[0][n]; + crc_table[4][n] = REV(c); + for (k = 1; k < 4; k++) { + c = crc_table[0][c & 0xff] ^ (c >> 8); + crc_table[k][n] = c; + crc_table[k + 4][n] = REV(c); + } + } +#endif /* BYFOUR */ + + crc_table_empty = 0; + } + else { /* not first */ + /* wait for the other guy to finish (not efficient, but rare) */ + while (crc_table_empty) + ; + } + +#ifdef MAKECRCH + /* write out CRC tables to crc32.h */ + { + FILE *out; + + out = fopen("crc32.h", "w"); + if (out == NULL) return; + fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n"); + fprintf(out, " * Generated automatically by crc32.c\n */\n\n"); + fprintf(out, "local const unsigned long FAR "); + fprintf(out, "crc_table[TBLS][256] =\n{\n {\n"); + write_table(out, crc_table[0]); +# ifdef BYFOUR + fprintf(out, "#ifdef BYFOUR\n"); + for (k = 1; k < 8; k++) { + fprintf(out, " },\n {\n"); + 
write_table(out, crc_table[k]); + } + fprintf(out, "#endif\n"); +# endif /* BYFOUR */ + fprintf(out, " }\n};\n"); + fclose(out); + } +#endif /* MAKECRCH */ +} + +#ifdef MAKECRCH +local void write_table(out, table) + FILE *out; + const unsigned long FAR *table; +{ + int n; + + for (n = 0; n < 256; n++) + fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ", table[n], + n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", ")); +} +#endif /* MAKECRCH */ + +#else /* !DYNAMIC_CRC_TABLE */ +/* ======================================================================== + * Tables of CRC-32s of all single-byte values, made by make_crc_table(). + */ +#include "crc32.h" +#endif /* DYNAMIC_CRC_TABLE */ + +/* ========================================================================= + * This function can be used by asm versions of crc32() + */ +const unsigned long FAR * ZEXPORT get_crc_table(void) +{ +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + return (const unsigned long FAR *)crc_table; +} + +/* ========================================================================= */ +#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8) +#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 + +/* ========================================================================= */ +unsigned long ZEXPORT crc32( + unsigned long crc, + const unsigned char FAR *buf, + unsigned len) +{ + if (buf == Z_NULL) return 0UL; + +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + +#ifdef BYFOUR + if (sizeof(void *) == sizeof(ptrdiff_t)) { + u4 endian; + + endian = 1; + if (*((unsigned char *)(&endian))) + return crc32_little(crc, buf, len); + else + return crc32_big(crc, buf, len); + } +#endif /* BYFOUR */ +/* crc = crc ^ 0xffffffffUL;*/ + while (len >= 8) { + DO8; + len -= 8; + } + if (len) do { + DO1; + } while (--len); + return crc /* ^ 0xffffffffUL*/; +} + +#ifdef BYFOUR + +/* 
========================================================================= */ +#define DOLIT4 c ^= *buf4++; \ + c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ + crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] +#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 + +/* ========================================================================= */ +local unsigned long crc32_little(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = (u4)crc; + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + len--; + } + + buf4 = (const u4 FAR *)buf; + while (len >= 32) { + DOLIT32; + len -= 32; + } + while (len >= 4) { + DOLIT4; + len -= 4; + } + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + } while (--len); + c = ~c; + return (unsigned long)c; +} + +/* ========================================================================= */ +#define DOBIG4 c ^= *++buf4; \ + c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ + crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] +#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 + +/* ========================================================================= */ +local unsigned long crc32_big(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = REV((u4)crc); + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + len--; + } + + buf4 = (const u4 FAR *)buf; + buf4--; + while (len >= 32) { + DOBIG32; + len -= 32; + } + while (len >= 4) { + DOBIG4; + len -= 4; + } + buf4++; + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + } while (--len); + c = ~c; + return 
(unsigned long)(REV(c)); +} + +#endif /* BYFOUR */ diff -Nru mdadm-2.6.7.1/crc32.h mdadm-3.1.4/crc32.h --- mdadm-2.6.7.1/crc32.h 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/crc32.h 2010-03-22 08:08:42.000000000 +0200 @@ -0,0 +1,441 @@ +/* crc32.h -- tables for rapid CRC calculation + * Generated automatically by crc32.c + */ + +local const unsigned long FAR crc_table[TBLS][256] = +{ + { + 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL, + 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL, + 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL, + 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL, + 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL, + 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL, + 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL, + 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL, + 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL, + 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL, + 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL, + 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL, + 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL, + 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL, + 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL, + 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL, + 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL, + 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL, + 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL, + 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL, + 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL, + 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL, + 0xaa0a4c5fUL, 
0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL, + 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL, + 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL, + 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL, + 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL, + 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL, + 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL, + 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL, + 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL, + 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL, + 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL, + 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL, + 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL, + 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL, + 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL, + 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL, + 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL, + 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL, + 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL, + 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL, + 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL, + 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL, + 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL, + 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL, + 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL, + 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL, + 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL, + 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL, 
+ 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL, + 0x2d02ef8dUL +#ifdef BYFOUR + }, + { + 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL, + 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL, + 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL, + 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL, + 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL, + 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL, + 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL, + 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL, + 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL, + 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL, + 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL, + 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL, + 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL, + 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL, + 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL, + 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL, + 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL, + 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL, + 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL, + 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL, + 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL, + 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL, + 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL, + 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL, + 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL, + 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL, + 0x09b5fac9UL, 
0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL, + 0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL, + 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL, + 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL, + 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL, + 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL, + 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL, + 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL, + 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL, + 0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL, + 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL, + 0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL, + 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL, + 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL, + 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL, + 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL, + 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL, + 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL, + 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL, + 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL, + 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL, + 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL, + 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL, + 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL, + 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL, + 0x9324fd72UL + }, + { + 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL, + 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL, + 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 
0x08d89353UL, 0x0a9e2d0aUL, + 0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL, + 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL, + 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL, + 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL, + 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL, + 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL, + 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL, + 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL, + 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL, + 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL, + 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL, + 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL, + 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL, + 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL, + 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL, + 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL, + 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL, + 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL, + 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL, + 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL, + 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL, + 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL, + 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL, + 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL, + 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL, + 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL, + 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL, + 0xf99ec442UL, 
0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL, + 0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL, + 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL, + 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL, + 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL, + 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL, + 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL, + 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL, + 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL, + 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL, + 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL, + 0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL, + 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL, + 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL, + 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL, + 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL, + 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL, + 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL, + 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL, + 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL, + 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL, + 0xbe9834edUL + }, + { + 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL, + 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL, + 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL, + 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL, + 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL, + 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL, + 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 
0x188ec85bUL, 0x0a3b67b5UL, + 0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL, + 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL, + 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL, + 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL, + 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL, + 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL, + 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL, + 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL, + 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL, + 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL, + 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL, + 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL, + 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL, + 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL, + 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL, + 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL, + 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL, + 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL, + 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL, + 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL, + 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL, + 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL, + 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL, + 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL, + 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL, + 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL, + 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL, + 0x22d6f961UL, 
0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL, + 0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL, + 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL, + 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL, + 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL, + 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL, + 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL, + 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL, + 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL, + 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL, + 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL, + 0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL, + 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL, + 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL, + 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL, + 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL, + 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL, + 0xde0506f1UL + }, + { + 0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL, + 0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL, + 0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL, + 0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL, + 0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL, + 0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL, + 0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL, + 0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL, + 0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL, + 0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL, + 0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 
0x23c4b356UL, 0x9995bacfUL, + 0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL, + 0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL, + 0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL, + 0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL, + 0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL, + 0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL, + 0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL, + 0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL, + 0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL, + 0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL, + 0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL, + 0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL, + 0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL, + 0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL, + 0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL, + 0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL, + 0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL, + 0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL, + 0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL, + 0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL, + 0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL, + 0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL, + 0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL, + 0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL, + 0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL, + 0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL, + 0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL, + 0x8b9ed92cUL, 
0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL, + 0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL, + 0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL, + 0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL, + 0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL, + 0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL, + 0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL, + 0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL, + 0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL, + 0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL, + 0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL, + 0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL, + 0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL, + 0x8def022dUL + }, + { + 0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL, + 0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL, + 0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL, + 0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL, + 0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL, + 0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL, + 0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL, + 0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL, + 0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL, + 0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL, + 0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL, + 0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL, + 0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL, + 0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL, + 0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 
0x4cf4bb21UL, 0x8fa7960aUL, + 0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL, + 0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL, + 0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL, + 0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL, + 0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL, + 0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL, + 0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL, + 0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL, + 0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL, + 0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL, + 0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL, + 0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL, + 0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL, + 0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL, + 0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL, + 0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL, + 0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL, + 0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL, + 0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL, + 0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL, + 0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL, + 0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL, + 0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL, + 0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL, + 0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL, + 0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL, + 0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL, + 0x9da70eb3UL, 
0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL, + 0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL, + 0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL, + 0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL, + 0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL, + 0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL, + 0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL, + 0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL, + 0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL, + 0x72fd2493UL + }, + { + 0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL, + 0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL, + 0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL, + 0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL, + 0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL, + 0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL, + 0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL, + 0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL, + 0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL, + 0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL, + 0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL, + 0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL, + 0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL, + 0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL, + 0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL, + 0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL, + 0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL, + 0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL, + 0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 
0xe3bd6464UL, 0xba032266UL, + 0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL, + 0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL, + 0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL, + 0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL, + 0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL, + 0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL, + 0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL, + 0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL, + 0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL, + 0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL, + 0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL, + 0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL, + 0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL, + 0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL, + 0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL, + 0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL, + 0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL, + 0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL, + 0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL, + 0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL, + 0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL, + 0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL, + 0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL, + 0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL, + 0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL, + 0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL, + 0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL, + 0x12ac6fadUL, 
0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL, + 0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL, + 0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL, + 0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL, + 0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL, + 0xed3498beUL + }, + { + 0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL, + 0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL, + 0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL, + 0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL, + 0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL, + 0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL, + 0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL, + 0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL, + 0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL, + 0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL, + 0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL, + 0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL, + 0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL, + 0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL, + 0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL, + 0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL, + 0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL, + 0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL, + 0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL, + 0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL, + 0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL, + 0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL, + 0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 
0xf9c783d3UL, 0x176836c1UL, + 0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL, + 0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL, + 0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL, + 0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL, + 0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL, + 0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL, + 0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL, + 0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL, + 0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL, + 0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL, + 0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL, + 0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL, + 0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL, + 0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL, + 0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL, + 0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL, + 0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL, + 0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL, + 0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL, + 0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL, + 0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL, + 0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL, + 0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL, + 0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL, + 0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL, + 0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL, + 0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL, + 0xc3f6dbe9UL, 
0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL, + 0xf10605deUL +#endif + } +}; diff -Nru mdadm-2.6.7.1/Create.c mdadm-3.1.4/Create.c --- mdadm-2.6.7.1/Create.c 2008-10-15 08:04:09.000000000 +0300 +++ mdadm-3.1.4/Create.c 2010-08-26 05:24:15.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,12 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" @@ -32,12 +27,50 @@ #include "md_p.h" #include -int Create(struct supertype *st, char *mddev, int mdfd, +static int default_layout(struct supertype *st, int level, int verbose) +{ + int layout = UnSet; + + if (st && st->ss->default_layout) + layout = st->ss->default_layout(level); + + if (layout == UnSet) + switch(level) { + default: /* no layout */ + layout = 0; + break; + case 10: + layout = 0x102; /* near=2, far=1 */ + if (verbose > 0) + fprintf(stderr, + Name ": layout defaults to n2\n"); + break; + case 5: + case 6: + layout = map_name(r5layout, "default"); + if (verbose > 0) + fprintf(stderr, + Name ": layout defaults to %s\n", map_num(r5layout, layout)); + break; + case LEVEL_FAULTY: + layout = map_name(faultylayout, "default"); + + if (verbose > 0) + fprintf(stderr, + Name ": layout defaults to %s\n", map_num(faultylayout, layout)); + break; + } + + return layout; +} + + +int Create(struct supertype *st, char *mddev, int chunk, int level, int layout, unsigned long long size, int raiddisks, int sparedisks, char *name, char *homehost, int *uuid, int subdevs, mddev_dev_t devlist, int runstop, int verbose, int force, int assume_clean, - char *bitmap_file, int bitmap_chunk, int 
write_behind, int delay) + char *bitmap_file, int bitmap_chunk, int write_behind, int delay, int autof) { /* * Create a new raid array. @@ -55,6 +88,7 @@ * if runstop==run, or raiddisks disks were used, * RUN_ARRAY */ + int mdfd; unsigned long long minsize=0, maxsize=0; char *mindisc = NULL; char *maxdisc = NULL; @@ -66,31 +100,35 @@ int second_missing = subdevs * 2; int missing_disks = 0; int insert_point = subdevs * 2; /* where to insert a missing drive */ + int total_slots; int pass; int vers; int rv; int bitmap_fd; + int have_container = 0; + int container_fd = -1; + int need_mdmon = 0; unsigned long long bitmapsize; - struct mdinfo info; + struct mdinfo info, *infos; + int did_default = 0; + int do_default_layout = 0; + unsigned long safe_mode_delay = 0; + char chosen_name[1024]; + struct map_ent *map = NULL; + unsigned long long newsize; int major_num = BITMAP_MAJOR_HI; memset(&info, 0, sizeof(info)); - vers = md_get_version(mdfd); - if (vers < 9000) { - fprintf(stderr, Name ": Create requires md driver version 0.90.0 or later\n"); - return 1; - } else { - mdu_array_info_t inf; - memset(&inf, 0, sizeof(inf)); - ioctl(mdfd, GET_ARRAY_INFO, &inf); - if (inf.working_disks != 0) { - fprintf(stderr, Name ": another array by this name" - " is already running.\n"); - return 1; - } + if (level == UnSet) { + /* "ddf" and "imsm" metadata only supports one level - should possibly + * push this into metadata handler?? 
+ */ + if (st && (st->ss == &super_ddf || st->ss == &super_imsm)) + level = LEVEL_CONTAINER; } + if (level == UnSet) { fprintf(stderr, Name ": a RAID level is needed to create an array.\n"); @@ -116,11 +154,55 @@ Name ": This level does not support spare devices\n"); return 1; } + + if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) { + /* If given a single device, it might be a container, and we can + * extract a device list from there + */ + mdu_array_info_t inf; + int fd; + + memset(&inf, 0, sizeof(inf)); + fd = open(devlist->devname, O_RDONLY); + if (fd >= 0 && + ioctl(fd, GET_ARRAY_INFO, &inf) == 0 && + inf.raid_disks == 0) { + /* yep, looks like a container */ + if (st) { + rv = st->ss->load_super(st, fd, + devlist->devname); + if (rv == 0) + have_container = 1; + } else { + st = guess_super(fd); + if (st && !(rv = st->ss-> + load_super(st, fd, + devlist->devname))) + have_container = 1; + else + st = NULL; + } + if (have_container) { + subdevs = raiddisks; + first_missing = subdevs * 2; + second_missing = subdevs * 2; + insert_point = subdevs * 2; + } + } + if (fd >= 0) + close(fd); + } + if (st && st->ss->external && sparedisks) { + fprintf(stderr, + Name ": This metadata type does not support " + "spare disks at create time\n"); + return 1; + } if (subdevs > raiddisks+sparedisks) { fprintf(stderr, Name ": You have listed more devices (%d) than are in the array(%d)!\n", subdevs, raiddisks+sparedisks); return 1; } - if (subdevs < raiddisks+sparedisks) { + if (!have_container && subdevs < raiddisks+sparedisks) { fprintf(stderr, Name ": You haven't given enough devices (real or missing) to create this array\n"); return 1; } @@ -131,32 +213,12 @@ } /* now set some defaults */ - if (layout == UnSet) - switch(level) { - default: /* no layout */ - layout = 0; - break; - case 10: - layout = 0x102; /* near=2, far=1 */ - if (verbose > 0) - fprintf(stderr, - Name ": layout defaults to n1\n"); - break; - case 5: - case 6: - layout = map_name(r5layout, 
"default"); - if (verbose > 0) - fprintf(stderr, - Name ": layout defaults to %s\n", map_num(r5layout, layout)); - break; - case LEVEL_FAULTY: - layout = map_name(faultylayout, "default"); - if (verbose > 0) - fprintf(stderr, - Name ": layout defaults to %s\n", map_num(faultylayout, layout)); - break; - } + + if (layout == UnSet) { + do_default_layout = 1; + layout = default_layout(st, level, verbose); + } if (level == 10) /* check layout fits in array*/ @@ -172,8 +234,19 @@ case 10: case 6: case 0: - case LEVEL_LINEAR: /* linear */ if (chunk == 0) { + if (st && st->ss->default_chunk) + chunk = st->ss->default_chunk(st); + + chunk = chunk ? : 512; + + if (verbose > 0) + fprintf(stderr, Name ": chunk size defaults to %dK\n", chunk); + } + break; + case LEVEL_LINEAR: + /* a chunksize of zero 0s perfectly valid (and preferred) since 2.6.16 */ + if (get_linux_version() < 2006016 && chunk == 0) { chunk = 64; if (verbose > 0) fprintf(stderr, Name ": chunk size defaults to 64K\n"); @@ -182,6 +255,7 @@ case 1: case LEVEL_FAULTY: case LEVEL_MULTIPATH: + case LEVEL_CONTAINER: if (chunk) { chunk = 0; if (verbose > 0) @@ -192,15 +266,27 @@ fprintf(stderr, Name ": unknown level %d\n", level); return 1; } + + if (size && chunk) + size &= ~(unsigned long long)(chunk - 1); + newsize = size * 2; + if (st && ! 
st->ss->validate_geometry(st, level, layout, raiddisks, + chunk, size*2, NULL, &newsize, verbose>=0)) + return 1; + if (size == 0) { + size = newsize / 2; + if (size && verbose > 0) + fprintf(stderr, Name ": setting size to %lluK\n", + (unsigned long long)size); + } /* now look at the subdevs */ info.array.active_disks = 0; info.array.working_disks = 0; dnum = 0; - for (dv=devlist; dv; dv=dv->next, dnum++) { + for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) { char *dname = dv->devname; - unsigned long long ldsize, freesize; - int fd; + unsigned long long freesize; if (strcasecmp(dname, "missing")==0) { if (first_missing > dnum) first_missing = dnum; @@ -212,18 +298,6 @@ info.array.working_disks++; if (dnum < raiddisks) info.array.active_disks++; - fd = open(dname, O_RDONLY|O_EXCL, 0); - if (fd <0 ) { - fprintf(stderr, Name ": Cannot open %s: %s\n", - dname, strerror(errno)); - fail=1; - continue; - } - if (!get_dev_size(fd, dname, &ldsize)) { - fail = 1; - close(fd); - continue; - } if (st == NULL) { struct createinfo *ci = conf_get_create_info(); if (ci) @@ -231,33 +305,46 @@ } if (st == NULL) { /* Need to choose a default metadata, which is different - * depending on the sizes of devices + * depending on geometry of array. 
*/ int i; char *name = "default"; - if (level >= 1 && ldsize > (0x7fffffffULL<<10)) - name = "default/large"; - for(i=0; !st && superlist[i]; i++) + for(i=0; !st && superlist[i]; i++) { st = superlist[i]->match_metadata_desc(name); + if (do_default_layout) + layout = default_layout(st, level, verbose); + if (st && !st->ss->validate_geometry + (st, level, layout, raiddisks, + chunk, size*2, dname, &freesize, + verbose > 0)) + st = NULL; + } if (!st) { - fprintf(stderr, Name ": internal error - no default metadata style\n"); + fprintf(stderr, Name ": device %s not suitable " + "for any style of array\n", + dname); exit(2); } - if (st->ss->major != 0 || + if (st->ss != &super0 || st->minor_version != 90) - fprintf(stderr, Name ": Defaulting to version" - " %d.%d metadata\n", - st->ss->major, - st->minor_version); - } - freesize = st->ss->avail_size(st, ldsize >> 9); - if (freesize == 0) { - fprintf(stderr, Name ": %s is too small: %luK\n", - dname, (unsigned long)(ldsize>>10)); - fail = 1; - close(fd); - continue; + did_default = 1; + } else { + if (do_default_layout) + layout = default_layout(st, level, verbose); + if (!st->ss->validate_geometry(st, level, layout, + raiddisks, + chunk, size*2, dname, + &freesize, + verbose >= 0)) { + + fprintf(stderr, + Name ": %s is not suitable for " + "this array.\n", + dname); + fail = 1; + continue; + } } freesize /= 2; /* convert to K */ @@ -267,10 +354,10 @@ } if (size && freesize < size) { - fprintf(stderr, Name ": %s is smaller that given size." - " %lluK < %lluK + superblock\n", dname, freesize, size); + fprintf(stderr, Name ": %s is smaller than given size." 
+ " %lluK < %lluK + metadata\n", + dname, freesize, size); fail = 1; - close(fd); continue; } if (maxdisc == NULL || (maxdisc && freesize > maxsize)) { @@ -282,24 +369,60 @@ minsize = freesize; } if (runstop != 1 || verbose >= 0) { + int fd = open(dname, O_RDONLY); + if (fd <0 ) { + fprintf(stderr, Name ": Cannot open %s: %s\n", + dname, strerror(errno)); + fail=1; + continue; + } warn |= check_ext2(fd, dname); warn |= check_reiser(fd, dname); warn |= check_raid(fd, dname); + if (strcmp(st->ss->name, "1.x") == 0 && + st->minor_version >= 1) + /* metadata at front */ + warn |= check_partitions(fd, dname, 0); + else if (level == 1 || level == LEVEL_CONTAINER) + /* partitions could be meaningful */ + warn |= check_partitions(fd, dname, freesize*2); + else + /* partitions cannot be meaningful */ + warn |= check_partitions(fd, dname, 0); + if (strcmp(st->ss->name, "1.x") == 0 && + st->minor_version >= 1 && + did_default && + level == 1 && + (warn & 1024) == 0) { + warn |= 1024; + fprintf(stderr, Name ": Note: this array has metadata at the start and\n" + " may not be suitable as a boot device. 
If you plan to\n" + " store '/boot' on this device please ensure that\n" + " your boot-loader understands md/v1.x metadata, or use\n" + " --metadata=0.90\n"); + } + close(fd); } - close(fd); } + if (have_container) + info.array.working_disks = raiddisks; if (fail) { fprintf(stderr, Name ": create aborted\n"); return 1; } if (size == 0) { - if (mindisc == NULL) { + if (mindisc == NULL && !have_container) { fprintf(stderr, Name ": no size and no drives given - aborting create.\n"); return 1; } - if (level > 0 || level == LEVEL_MULTIPATH || level == LEVEL_FAULTY) { + if (level > 0 || level == LEVEL_MULTIPATH + || level == LEVEL_FAULTY + || st->ss->external ) { /* size is meaningful */ - if (minsize > 0x100000000ULL && st->ss->major == 0) { + if (!st->ss->validate_geometry(st, level, layout, + raiddisks, + chunk, minsize*2, + NULL, NULL, 0)) { fprintf(stderr, Name ": devices too large for RAID level %d\n", level); return 1; } @@ -308,13 +431,21 @@ fprintf(stderr, Name ": size set to %lluK\n", size); } } - if (level > 0 && ((maxsize-size)*100 > maxsize)) { + if (!have_container && level > 0 && ((maxsize-size)*100 > maxsize)) { if (runstop != 1 || verbose >= 0) - fprintf(stderr, Name ": largest drive (%s) exceed size (%lluK) by more than 1%%\n", + fprintf(stderr, Name ": largest drive (%s) exceeds size (%lluK) by more than 1%%\n", maxdisc, size); warn = 1; } + if (st->ss->detail_platform && st->ss->detail_platform(0, 1) != 0) { + if (runstop != 1 || verbose >= 0) + fprintf(stderr, Name ": %s unable to enumerate platform support\n" + " array may not be compatible with hardware/firmware\n", + st->ss->name); + warn = 1; + } + if (warn) { if (runstop!= 1) { if (!ask("Continue creating array? ")) { @@ -331,7 +462,8 @@ * as missing, so that a reconstruct happens (faster than re-parity) * FIX: Can we do this for raid6 as well? 
*/ - if (assume_clean==0 && force == 0 && first_missing >= raiddisks) { + if (st->ss->external == 0 && + assume_clean==0 && force == 0 && first_missing >= raiddisks) { switch ( level ) { case 4: case 5: @@ -348,6 +480,7 @@ * into a spare, else the create will fail */ if (assume_clean == 0 && force == 0 && first_missing < raiddisks && + st->ss->external == 0 && second_missing >= raiddisks && level == 6) { insert_point = raiddisks - 1; if (insert_point == first_missing) @@ -357,12 +490,34 @@ missing_disks++; } - if (level <= 0 && first_missing != subdevs * 2) { + if (level <= 0 && first_missing < subdevs * 2) { fprintf(stderr, Name ": This level does not support missing devices\n"); return 1; } + /* We need to create the device */ + map_lock(&map); + mdfd = create_mddev(mddev, name, autof, LOCAL, chosen_name); + if (mdfd < 0) + return 1; + mddev = chosen_name; + + vers = md_get_version(mdfd); + if (vers < 9000) { + fprintf(stderr, Name ": Create requires md driver version 0.90.0 or later\n"); + goto abort; + } else { + mdu_array_info_t inf; + memset(&inf, 0, sizeof(inf)); + ioctl(mdfd, GET_ARRAY_INFO, &inf); + if (inf.working_disks != 0) { + fprintf(stderr, Name ": another array by this name" + " is already running.\n"); + goto abort; + } + } + /* Ok, lets try some ioctls */ info.array.level = level; @@ -382,12 +537,16 @@ ( level == 6 && (insert_point < raiddisks || second_missing < raiddisks)) || + ( level <= 0 ) + || assume_clean - ) + ) { info.array.state = 1; /* clean, but one+ drive will be missing*/ - else + info.resync_start = MaxSector; + } else { info.array.state = 0; /* not clean, but no errors */ - + info.resync_start = 0; + } if (level == 10) { /* for raid10, the bitmap size is the capacity of the array, * which is array.size * raid_disks / ncopies; @@ -424,7 +583,6 @@ + info.array.failed_disks; info.array.layout = layout; info.array.chunk_size = chunk*1024; - info.array.major_version = st->ss->major; if (name == NULL || *name == 0) { /* base name on 
mddev */ @@ -435,6 +593,7 @@ * /dev/md/home -> home * /dev/mdhome -> home */ + /* FIXME compare this with rules in create_mddev */ name = strrchr(mddev, '/'); if (name) { name++; @@ -451,7 +610,37 @@ } } if (!st->ss->init_super(st, &info.array, size, name, homehost, uuid)) - return 1; + goto abort; + + total_slots = info.array.nr_disks; + sysfs_init(&info, mdfd, 0); + st->ss->getinfo_super(st, &info); + + if (did_default && verbose >= 0) { + if (is_subarray(info.text_version)) { + int dnum = devname2devnum(info.text_version+1); + char *path; + int mdp = get_mdp_major(); + struct mdinfo *mdi; + if (dnum > 0) + path = map_dev(MD_MAJOR, dnum, 1); + else + path = map_dev(mdp, (-1-dnum)<< 6, 1); + + mdi = sysfs_read(-1, dnum, GET_VERSION); + + fprintf(stderr, Name ": Creating array inside " + "%s container %s\n", + mdi?mdi->text_version:"managed", path); + sysfs_free(mdi); + } else + fprintf(stderr, Name ": Defaulting to version" + " %s metadata\n", info.text_version); + } + + map_update(&map, fd2devnum(mdfd), info.text_version, + info.uuid, chosen_name); + map_unlock(&map); if (bitmap_file && vers < 9003) { major_num = BITMAP_MAJOR_HOSTENDIAN; @@ -464,31 +653,60 @@ if (bitmap_file && strcmp(bitmap_file, "internal")==0) { if ((vers%100) < 2) { fprintf(stderr, Name ": internal bitmaps not supported by this kernel.\n"); - return 1; + goto abort; + } + if (!st->ss->add_internal_bitmap) { + fprintf(stderr, Name ": internal bitmaps not supported with %s metadata\n", + st->ss->name); + goto abort; } if (!st->ss->add_internal_bitmap(st, &bitmap_chunk, delay, write_behind, bitmapsize, 1, major_num)) { fprintf(stderr, Name ": Given bitmap chunk size not supported.\n"); - return 1; + goto abort; } bitmap_file = NULL; } + sysfs_init(&info, mdfd, 0); - if ((vers % 100) >= 1) { /* can use different versions */ - mdu_array_info_t inf; - memset(&inf, 0, sizeof(inf)); - inf.major_version = st->ss->major; - inf.minor_version = st->minor_version; - rv = ioctl(mdfd, SET_ARRAY_INFO, &inf); 
- } else - rv = ioctl(mdfd, SET_ARRAY_INFO, NULL); + if (st->ss->external && st->subarray[0]) { + /* member */ + + /* When creating a member, we need to be careful + * to negotiate with mdmon properly. + * If it is already running, we cannot write to + * the devices and must ask it to do that part. + * If it isn't running, we write to the devices, + * and then start it. + * We hold an exclusive open on the container + * device to make sure mdmon doesn't exit after + * we checked that it is running. + * + * For now, fail if it is already running. + */ + container_fd = open_dev_excl(st->container_dev); + if (container_fd < 0) { + fprintf(stderr, Name ": Cannot get exclusive " + "open on container - weird.\n"); + goto abort; + } + if (mdmon_running(st->container_dev)) { + if (verbose) + fprintf(stderr, Name ": reusing mdmon " + "for %s.\n", + devnum2devname(st->container_dev)); + st->update_tail = &st->updates; + } else + need_mdmon = 1; + } + rv = set_array_info(mdfd, st, &info); if (rv) { - fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\n", + fprintf(stderr, Name ": failed to set array info for %s: %s\n", mddev, strerror(errno)); - return 1; + goto abort; } if (bitmap_file) { @@ -499,22 +717,22 @@ delay, write_behind, bitmapsize, major_num)) { - return 1; + goto abort; } bitmap_fd = open(bitmap_file, O_RDWR); if (bitmap_fd < 0) { fprintf(stderr, Name ": weird: %s cannot be openned\n", bitmap_file); - return 1; + goto abort; } if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) { fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n", mddev, strerror(errno)); - return 1; + goto abort; } } - + infos = malloc(sizeof(*infos) * total_slots); for (pass=1; pass <=2 ; pass++) { mddev_dev_t moved_disk = NULL; /* the disk that was moved out of the insert point */ @@ -523,76 +741,191 @@ dv=(dv->next)?(dv->next):moved_disk, dnum++) { int fd; struct stat stb; + struct mdinfo *inf = &infos[dnum]; - info.disk.number = dnum; + if (dnum >= total_slots) + abort(); if 
(dnum == insert_point) { moved_disk = dv; + continue; } - info.disk.raid_disk = info.disk.number; - if (info.disk.raid_disk < raiddisks) - info.disk.state = (1<devname, "missing")==0) + continue; + if (have_container) + moved_disk = NULL; + if (have_container && dnum < info.array.raid_disks - 1) + /* repeatedly use the container */ + moved_disk = dv; + + switch(pass) { + case 1: + *inf = info; + + inf->disk.number = dnum; + inf->disk.raid_disk = dnum; + if (inf->disk.raid_disk < raiddisks) + inf->disk.state = (1<writemostly) - info.disk.state |= (1<devname, "missing")==0) { - info.disk.major = 0; - info.disk.minor = 0; - info.disk.state = (1<devname, O_RDONLY|O_EXCL, 0); - if (fd < 0) { - fprintf(stderr, Name ": failed to open %s after earlier success - aborting\n", - dv->devname); - return 1; + else + inf->disk.state = 0; + + if (dv->writemostly == 1) + inf->disk.state |= (1<ss->external && st->subarray[0]) + fd = open(dv->devname, O_RDWR); + else + fd = open(dv->devname, O_RDWR|O_EXCL); + + if (fd < 0) { + fprintf(stderr, Name ": failed to open %s " + "after earlier success - aborting\n", + dv->devname); + goto abort; + } + fstat(fd, &stb); + inf->disk.major = major(stb.st_rdev); + inf->disk.minor = minor(stb.st_rdev); + } + if (fd >= 0) + remove_partitions(fd); + if (st->ss->add_to_super(st, &inf->disk, + fd, dv->devname)) { + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + st->ss->getinfo_super(st, inf); + safe_mode_delay = inf->safe_mode_delay; + + if (have_container && verbose > 0) + fprintf(stderr, Name ": Using %s for device %d\n", + map_dev(inf->disk.major, + inf->disk.minor, + 0), dnum); + + if (!have_container) { + /* getinfo_super might have lost these ... 
*/ + inf->disk.major = major(stb.st_rdev); + inf->disk.minor = minor(stb.st_rdev); } - fstat(fd, &stb); - info.disk.major = major(stb.st_rdev); - info.disk.minor = minor(stb.st_rdev); - remove_partitions(fd); - close(fd); - } - switch(pass){ - case 1: - st->ss->add_to_super(st, &info.disk); break; case 2: - if (info.disk.state == 1) break; - Kill(dv->devname, 0, 1); /* Just be sure it is clean */ - Kill(dv->devname, 0, 1); /* and again, there could be two superblocks */ - st->ss->write_init_super(st, &info.disk, - dv->devname); + inf->errors = 0; + rv = 0; + + rv = add_disk(mdfd, st, &info, inf); - if (ioctl(mdfd, ADD_NEW_DISK, &info.disk)) { - fprintf(stderr, Name ": ADD_NEW_DISK for %s failed: %s\n", + if (rv) { + fprintf(stderr, + Name ": ADD_NEW_DISK for %s " + "failed: %s\n", dv->devname, strerror(errno)); st->ss->free_super(st); - return 1; + goto abort; } - break; } - if (dv == moved_disk && dnum != insert_point) break; + if (!have_container && + dv == moved_disk && dnum != insert_point) break; + } + if (pass == 1) { + struct mdinfo info_new; + struct map_ent *me = NULL; + + /* check to see if the uuid has changed due to these + * metadata changes, and if so update the member array + * and container uuid. Note ->write_init_super clears + * the subarray cursor such that ->getinfo_super once + * again returns container info. 
+ */ + map_lock(&map); + st->ss->getinfo_super(st, &info_new); + if (st->ss->external && level != LEVEL_CONTAINER && + !same_uuid(info_new.uuid, info.uuid, 0)) { + map_update(&map, fd2devnum(mdfd), + info_new.text_version, + info_new.uuid, chosen_name); + me = map_by_devnum(&map, st->container_dev); + } + + st->ss->write_init_super(st); + + /* update parent container uuid */ + if (me) { + char *path = strdup(me->path); + + st->ss->getinfo_super(st, &info_new); + map_update(&map, st->container_dev, + info_new.text_version, + info_new.uuid, path); + free(path); + } + map_unlock(&map); + + flush_metadata_updates(st); } } + free(infos); st->ss->free_super(st); - /* param is not actually used */ - if (runstop == 1 || subdevs >= raiddisks) { - mdu_param_t param; - if (ioctl(mdfd, RUN_ARRAY, ¶m)) { - fprintf(stderr, Name ": RUN_ARRAY failed: %s\n", - strerror(errno)); - Manage_runstop(mddev, mdfd, -1, 0); - return 1; + if (level == LEVEL_CONTAINER) { + /* No need to start. But we should signal udev to + * create links */ + sysfs_uevent(&info, "change"); + if (verbose >= 0) + fprintf(stderr, Name ": container %s prepared.\n", mddev); + wait_for(chosen_name, mdfd); + } else if (runstop == 1 || subdevs >= raiddisks) { + if (st->ss->external) { + switch(level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + sysfs_set_str(&info, NULL, "array_state", + "active"); + need_mdmon = 0; + break; + default: + sysfs_set_str(&info, NULL, "array_state", + "readonly"); + break; + } + sysfs_set_safemode(&info, safe_mode_delay); + } else { + /* param is not actually used */ + mdu_param_t param; + if (ioctl(mdfd, RUN_ARRAY, ¶m)) { + fprintf(stderr, Name ": RUN_ARRAY failed: %s\n", + strerror(errno)); + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } } if (verbose >= 0) fprintf(stderr, Name ": array %s started.\n", mddev); + if (st->ss->external && st->subarray[0]) { + if (need_mdmon) + start_mdmon(st->container_dev); + + ping_monitor(devnum2devname(st->container_dev)); + 
close(container_fd); + } + wait_for(chosen_name, mdfd); } else { fprintf(stderr, Name ": not starting array - not enough devices.\n"); } + close(mdfd); return 0; + + abort: + map_lock(&map); + map_remove(&map, fd2devnum(mdfd)); + map_unlock(&map); + + if (mdfd >= 0) + close(mdfd); + return 1; } diff -Nru mdadm-2.6.7.1/davidpashley.com_blog_2008_07_12#rebuilding-raid.html mdadm-3.1.4/davidpashley.com_blog_2008_07_12#rebuilding-raid.html --- mdadm-2.6.7.1/davidpashley.com_blog_2008_07_12#rebuilding-raid.html 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/davidpashley.com_blog_2008_07_12#rebuilding-raid.html 1970-01-01 02:00:00.000000000 +0200 @@ -1,364 +0,0 @@ - - - - - - JD : /2008/07/12 - - - - - - - - - - -
- - -
- -
-
-

Sat, 12 Jul 2008

-

Rebuilding a RAID array

I recently had a failed drive in my RAID1 array. I've just installed -the replacement drive and thought I'd share the method.

Let's look at the current situation:

-root@ace:~# cat /proc/mdstat 
-Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5] [raid4] [raid10] 
-md1 : active raid1 sda3[1]
-      483403776 blocks [2/1] [_U]
-      
-md0 : active raid1 sda1[1]
-      96256 blocks [2/1] [_U]
-      
-unused devices: <none>
-

So we can see we have two mirrored arrays with one drive missing in both.

Let's see that we've recognised the second drive:

-root@ace:~# dmesg | grep sd
-[   21.465395] Driver 'sd' needs updating - please use bus_type methods
-[   21.465486] sd 2:0:0:0: [sda] 976773168 512-byte hardware sectors (500108 MB)
-[   21.465496] sd 2:0:0:0: [sda] Write Protect is off
-[   21.465498] sd 2:0:0:0: [sda] Mode Sense: 00 3a 00 00
-[   21.465512] sd 2:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
-[   21.465562] sd 2:0:0:0: [sda] 976773168 512-byte hardware sectors (500108 MB)
-[   21.465571] sd 2:0:0:0: [sda] Write Protect is off
-[   21.465573] sd 2:0:0:0: [sda] Mode Sense: 00 3a 00 00
-[   21.465587] sd 2:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
-[   21.465590]  sda: sda1 sda2 sda3
-[   21.487248] sd 2:0:0:0: [sda] Attached SCSI disk
-[   21.487303] sd 2:0:1:0: [sdb] 976773168 512-byte hardware sectors (500108 MB)
-[   21.487314] sd 2:0:1:0: [sdb] Write Protect is off
-[   21.487317] sd 2:0:1:0: [sdb] Mode Sense: 00 3a 00 00
-[   21.487331] sd 2:0:1:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
-[   21.487371] sd 2:0:1:0: [sdb] 976773168 512-byte hardware sectors (500108 MB)
-[   21.487381] sd 2:0:1:0: [sdb] Write Protect is off
-[   21.487382] sd 2:0:1:0: [sdb] Mode Sense: 00 3a 00 00
-[   21.487403] sd 2:0:1:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
-[   21.487407]  sdb: unknown partition table
-[   21.502763] sd 2:0:1:0: [sdb] Attached SCSI disk
-[   21.506690] sd 2:0:0:0: Attached scsi generic sg0 type 0
-[   21.506711] sd 2:0:1:0: Attached scsi generic sg1 type 0
-[   21.793835] md: bind<sda1>
-[   21.858027] md: bind<sda3>
-

So, sda has three partitions, sda1, sda2 and sda3, and sdb has no partition -table. Let's give it one the same as sda. The easiest way to do this is using -sfdisk:

-root@ace:~# sfdisk -d /dev/sda | sfdisk /dev/sdb
-Checking that no-one is using this disk right now ...
-OK
-
-Disk /dev/sdb: 60801 cylinders, 255 heads, 63 sectors/track
-
-sfdisk: ERROR: sector 0 does not have an MSDOS signature
- /dev/sdb: unrecognised partition table type
-Old situation:
-No partitions found
-New situation:
-Units = sectors of 512 bytes, counting from 0
-
-   Device Boot    Start       End   #sectors  Id  System
-/dev/sdb1   *        63    192779     192717  fd  Linux RAID autodetect
-/dev/sdb2        192780   9960299    9767520  82  Linux swap / Solaris
-/dev/sdb3       9960300 976768064  966807765  fd  Linux RAID autodetect
-/dev/sdb4             0         -          0   0  Empty
-Successfully wrote the new partition table
-
-Re-reading the partition table ...
-
-If you created or changed a DOS partition, /dev/foo7, say, then use dd(1)
-to zero the first 512 bytes:  dd if=/dev/zero of=/dev/foo7 bs=512 count=1
-(See fdisk(8).)
-

If we check dmesg now to check it's worked, we'll see:

-root@ace:~# dmesg | grep sd
-...
-[  224.246102] sd 2:0:1:0: [sdb] 976773168 512-byte hardware sectors (500108 MB)
-[  224.246322] sd 2:0:1:0: [sdb] Write Protect is off
-[  224.246325] sd 2:0:1:0: [sdb] Mode Sense: 00 3a 00 00
-[  224.246547] sd 2:0:1:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
-[  224.246686]  sdb: unknown partition table
-[  227.326278] sd 2:0:1:0: [sdb] 976773168 512-byte hardware sectors (500108 MB)
-[  227.326504] sd 2:0:1:0: [sdb] Write Protect is off
-[  227.326507] sd 2:0:1:0: [sdb] Mode Sense: 00 3a 00 00
-[  227.326703] sd 2:0:1:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
-[  227.326708]  sdb: sdb1 sdb2 sdb3
-

So, now we have identical partition tables. The next thing to do is to add the new partitions to the array:

-root@ace:~# mdadm /dev/md0 --add /dev/sdb1
-mdadm: added /dev/sdb1
-root@ace:~# mdadm /dev/md1 --add /dev/sdb3
-mdadm: added /dev/sdb3
-

Everything looks good. Let's check dmesg:

-[  323.941542] md: bind<sdb1>
-[  324.038183] RAID1 conf printout:
-[  324.038189]  --- wd:1 rd:2
-[  324.038192]  disk 0, wo:1, o:1, dev:sdb1
-[  324.038195]  disk 1, wo:0, o:1, dev:sda1
-[  324.038300] md: recovery of RAID array md0
-[  324.038303] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
-[  324.038305] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for recovery.
-[  324.038310] md: using 128k window, over a total of 96256 blocks.
-[  325.417219] md: md0: recovery done.
-[  325.453629] RAID1 conf printout:
-[  325.453632]  --- wd:2 rd:2
-[  325.453634]  disk 0, wo:0, o:1, dev:sdb1
-[  325.453636]  disk 1, wo:0, o:1, dev:sda1
-[  347.970105] md: bind<sdb3>
-[  348.004566] RAID1 conf printout:
-[  348.004571]  --- wd:1 rd:2
-[  348.004573]  disk 0, wo:1, o:1, dev:sdb3
-[  348.004574]  disk 1, wo:0, o:1, dev:sda3
-[  348.004657] md: recovery of RAID array md1
-[  348.004659] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
-[  348.004660] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for recovery.
-[  348.004664] md: using 128k window, over a total of 483403776 blocks.
-

Everything still looks good. Let's sit back and watch it rebuild using the wonderfully useful watch command:

-root@ace:~# watch -n 1 cat /proc/mdstat
-Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5] [raid4] [raid10] 
-md1 : active raid1 sdb3[2] sda3[1]
-      483403776 blocks [2/1] [_U]
-      [=====>...............]  recovery = 26.0% (126080960/483403776) finish=96.2min speed=61846K/sec
-      
-md0 : active raid1 sdb1[0] sda1[1]
-      96256 blocks [2/2] [UU]
-      
-unused devices: <none>
-

The Ubuntu and Debian installers will allow you to create RAID1 arrays -with fewer drives than you actually have, so you can use this technique -if you plan to add an additional drive after you've installed the -system. Just tell it the eventual number of drives, but only select the -available partitions during RAID setup. I used this method when a new machine recently -didn't have enough SATA power cables and I had to wait for an adaptor to -be delivered.

(Why did no one tell me about watch until recently? I wonder -how many more incredibly useful programs I've not discovered even after 10 -years of using Linux.)

- [linux,mdadm,RAID] | # Read Comments (0) | -
- -
-
- -
- -
-

Comments

-
-
-
-
-

- Name:
-
-
- E-mail:
-
-
- URL:
- -
-
- Comment:
-
-
- Please enter "fudge" to prove you are a human - -
- - -

-
-
-
-
-
- -
-
- - diff -Nru mdadm-2.6.7.1/debian/bugscript mdadm-3.1.4/debian/bugscript --- mdadm-2.6.7.1/debian/bugscript 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/bugscript 1970-01-01 02:00:00.000000000 +0200 @@ -1,104 +0,0 @@ -#!/bin/bash -# -# mdadm bug submission control script -# -# allows Debian's bug tools to include relevant information in bug reports. -# -# Copyright © martin f. krafft -# distributed under the terms of the Artistic Licence 2.0 -# -# we need /bin/bash for readline and -n capabalities in the prompt(s) -# -set -eu - -if ! command -v yesno >/dev/null; then - if [ -r /usr/share/reportbug/handle_bugscript ]; then - exec /usr/share/reportbug/handle_bugscript ". $0" /dev/stdout - fi - yesno() { - read -n1 -p"$1" REPLY - case "$REPLY" in - [yY]) REPLY=yep;; - [nN]) REPLY=nop;; - ('') REPLY="$2";; - esac - } - exec 3>&1 -fi - -if [ ! -r /proc/mdstat ]; then - echo "The local system does not have MD (RAID) support: no drivers loaded." - echo "Without MD support, I cannot collect as much information as I'd like." - - #yesno "Are you sure you want to report a bug at this time? " yep - yesno "Hit any key to continue..." yep - #[ "$REPLY" = yep ] || exit 1 -fi - -echo "--- mount output" >&3 -mount >&3 -echo >&3 - -echo "--- mdadm.conf" >&3 -if [ -r /etc/mdadm/mdadm.conf ]; then - cat /etc/mdadm/mdadm.conf >&3 -else - echo no mdadm.conf file. 
>&3 -fi -echo >&3 - -echo "--- /proc/mdstat:" >&3 -cat /proc/mdstat >&3 2>&3 || : -echo >&3 - -echo "--- /proc/partitions:" >&3 -cat /proc/partitions >&3 2>&3 || : -echo >&3 - -echo "--- initrd.img-$(uname -r):" >&3 -if [ -r /boot/initrd.img-$(uname -r) ]; then - zcat /boot/initrd.img-$(uname -r) 2>&3 | cpio -t 2>&3 | egrep '/md[a/]' >&3 -fi -echo >&3 - -if [ -r /proc/modules ]; then - echo "--- /proc/modules:" >&3 - egrep '(dm_|raid|linear|multipath|faulty)' < /proc/modules >&3 || : - echo >&3 -fi - -if [ -r /var/log/syslog ]; then - echo "--- /var/log/syslog:" >&3 - egrep "^\w{3} [ :[:digit:]]{11} ($(hostname)|localhost) (kernel: md|mdadm): " /var/log/syslog >&3 || : - echo >&3 -fi - -echo "--- volume detail:" >&3 -for dev in /dev/[hs]d[a-z]*; do mdadm -E $dev 2>/dev/null && echo -- || :; done >&3 -echo >&3 - -if [ -r /proc/cmdline ]; then - echo "--- /proc/cmdline" >&3 - cat /proc/cmdline >&3 - echo >&3 -fi - -if [ -f /boot/grub/menu.lst ]; then - echo "--- grub:" >&3 - if [ -r /boot/grub/menu.lst ]; then - grep '^[^#].*root=' /boot/grub/menu.lst >&3 || : - else - echo menu.lst file not readable. >&3 - fi - echo >&3 -fi - -if [ -f /etc/lilo.conf ]; then - echo "--- lilo:" >&3 - if [ -r /etc/lilo.conf ]; then - egrep '^([^#].*)?root=' /etc/lilo.conf >&3 || : - else - echo lilo.conf file not readable. >&3 - fi - echo >&3 -fi diff -Nru mdadm-2.6.7.1/debian/bugscript.in mdadm-3.1.4/debian/bugscript.in --- mdadm-2.6.7.1/debian/bugscript.in 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/bugscript.in 2010-09-03 11:44:09.000000000 +0300 @@ -0,0 +1,222 @@ +#!/bin/bash +# +# mdadm bug submission control script +# +# allows Debian's bug tools to include relevant information in bug reports. +# +# Copyright © martin f. krafft +# distributed under the terms of the Artistic Licence 2.0 +# +# we need /bin/bash for readline and -n capabalities in the prompt(s) +# + +# maximise information output even in the case of errors +set +eu + +VERSION=%VERSION% + +if ! 
command -v yesno >/dev/null; then + if [ -r /usr/share/reportbug/handle_bugscript ]; then + exec /usr/share/reportbug/handle_bugscript ". $0" /dev/stdout + fi + yesno() { + read -n1 -p"$1" REPLY + case "$REPLY" in + [yY]) REPLY=yep;; + [nN]) REPLY=nop;; + ('') REPLY="$2";; + esac + } + exec 3>&1 +fi + +# do not let people ctrl-c out of the bugscript +trap : INT + +if [ $(id -u) != 0 ]; then + if [ -x "$(command -v sudo)" ]; then + yesno "Gather system information as root using sudo? (Y/n) " yep + if [ "$REPLY" = yep ]; then + echo running sudo "$0" "$@"... + sudo "$0" "$@" >&3 && exit 0 + echo "sudo invocation failed, trying /bin/su..." + fi + fi + + yesno "Gather system information as root using su? (Y/n) " yep + if [ "$REPLY" = yep ]; then + ARGS= + for i in "$@"; do ARGS="${ARGS:+$ARGS }'$1'"; shift; done + echo "running su root -s '/bin/sh -c $0${ARGS:+ $ARGS}'..." + su root -s /bin/sh -c "$0 $ARGS" >&3 && exit 0 + unset ARGS + echo "su invocation failed." + fi + + # arrive here only if neither sudo nor su worked: + yesno "Will you provide system information in the bug report yourself? (N/y) " nop + if [ "$REPLY" = yep ]; then + cat <<_eof >&3 + +IMPORTANT: + please do not forget to include all relevant system information with this + bug report. You could run + /usr/share/bug/mdadm/script 3>&1 + as root and attach or include the output. + +_eof + exit 0 + fi + + # try our best + cat <<_eof >&3 + +WARNING: + the following output was not generated by the root user. If you can, please + replace the following up until "-- System Information:" with the output of + /usr/share/bug/mdadm/script 3>&1 + run as root. Thanks! + +_eof +fi + +if [ ! -r /proc/mdstat ]; then + echo "The local system does not have MD (RAID) support: no drivers loaded." + echo "Without MD support, I cannot collect as much information as I'd like." + + #yesno "Are you sure you want to report a bug at this time? " yep + yesno "Hit any key to continue..." 
yep + #[ "$REPLY" = yep ] || exit 1 +fi + +echo "--- mdadm.conf" >&3 +if [ -r /etc/mdadm/mdadm.conf ]; then + grep '^[^#]' /etc/mdadm/mdadm.conf >&3 +elif [ -r /etc/mdadm.conf ]; then + grep '^[^#]' /etc/mdadm.conf >&3 +else + echo no mdadm.conf file. >&3 +fi +echo >&3 + +echo "--- /etc/default/mdadm" >&3 +if [ -r /etc/default/mdadm ]; then + grep '^[^#]' /etc/default/mdadm >&3 +else + echo no /etc/default/mdadm file. >&3 +fi +echo >&3 + +echo "--- /proc/mdstat:" >&3 +cat /proc/mdstat >&3 2>&3 || : +echo >&3 + +echo "--- /proc/partitions:" >&3 +cat /proc/partitions >&3 2>&3 || : +echo >&3 + +echo "--- LVM physical volumes:" >&3 +if [ -x "$(command -v pvs)" ]; then + pvs >&3 +else + echo "LVM does not seem to be used." >&3 +fi + +echo "--- mount output" >&3 +mount >&3 +echo >&3 + +echo "--- initrd.img-$(uname -r):" >&3 +if [ -r /boot/initrd.img-$(uname -r) ]; then + TEMPDIR=$(mktemp -d) + OLDPWD="$PWD" + cd "$TEMPDIR" + zcat /boot/initrd.img-$(uname -r) 2>&3 | cpio -i 2>&3 + find -regex '.*/md[a/].+' -type f -exec md5sum {} \; >&3 + + echo >&3 + echo "--- initrd's /conf/conf.d/md:" >&3 + if [ -r conf/conf.d/md ]; then + cat conf/conf.d/md >&3 + else + echo "no conf/md file." >&3 + fi + + cd "$OLDPWD" + rm -rf "$TEMPDIR" + unset TEMPDIR +else + echo "no initrd.img-$(uname -r) found." >&3 +fi +echo >&3 + +if [ -r /proc/modules ]; then + echo "--- /proc/modules:" >&3 + egrep '(dm_|raid|linear|multipath|faulty)' < /proc/modules >&3 || : + echo >&3 +fi + +if [ -f /var/log/syslog ]; then + if [ -r /var/log/syslog ]; then + echo "--- /var/log/syslog:" >&3 + egrep "^\w{3} [ :[:digit:]]{11} ($(hostname)|localhost) (kernel: md|mdadm): " /var/log/syslog >&3 || : + echo >&3 + else + echo "syslog not readable by user." >&3 + fi +fi + +echo "--- volume detail:" >&3 +for dev in /dev/[hsv]d[a-z]*; do + [ ! -r $dev ] && echo "$dev not readable by user." && continue + mdadm -E $dev 2>/dev/null && echo -- || echo "$dev is not recognised by mdadm." 
+done >&3 +echo >&3 + +if [ -r /proc/cmdline ]; then + echo "--- /proc/cmdline" >&3 + cat /proc/cmdline >&3 + echo >&3 +fi + +if [ -f /boot/grub/grub.cfg ]; then + echo "--- grub2:" >&3 + if [ -r /boot/grub/grub.cfg ]; then + egrep '^[^#].*\<(root=|raid)' /boot/grub/grub.cfg >&3 || : + else + echo grub.cfg file not readable. >&3 + fi + echo >&3 +fi + +if [ -f /boot/grub/menu.lst ]; then + echo "--- grub legacy:" >&3 + if [ -r /boot/grub/menu.lst ]; then + grep '^[^#].*\&3 || : + else + echo menu.lst file not readable. >&3 + fi + echo >&3 +fi + +if [ -f /etc/lilo.conf ]; then + echo "--- lilo:" >&3 + if [ -r /etc/lilo.conf ]; then + egrep '^([^#].*)?root=' /etc/lilo.conf >&3 || : + else + echo lilo.conf file not readable. >&3 + fi + echo >&3 +fi + +echo "--- udev:" >&3 +COLUMNS=70 dpkg -l udev | grep '\' >&3 +md5sum /etc/udev/rules.d/*md* /lib/udev/rules.d/*md* >&3 2>/dev/null +echo >&3 + +echo "--- /dev:" >&3 +ls -l /dev/md* /dev/disk/by-* >&3 +echo >&3 + +echo "Auto-generated on $(date -R)" >&3 +echo "by mdadm bugscript $VERSION" >&3 diff -Nru mdadm-2.6.7.1/debian/changelog mdadm-3.1.4/debian/changelog --- mdadm-2.6.7.1/debian/changelog 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/changelog 2010-10-18 14:09:22.000000000 +0300 @@ -1,3 +1,284 @@ +mdadm (3.1.4-1+8efb9d1ubuntu1) UNRELEASED; urgency=low + + * Merge from debian unstable. (LP: #603582) + * Remaining changes + - Assemble.c, config.c: upgraded to the mdadm-3.1.4 version of these files + from Debian. + - debian/control: we need udev and util-linux in the right version. We + default to postfix as our mta + - debian/initramfs/hook: kept the Ubuntus version for handling the absence + of active raid arrays in /etc/mdadm/mdadm.conf + - debian/initramfs/script.local-top.DEBIAN, debian/mdadm-startall, + debian/mdadm.raid.DEBIAN: removed. udev does its job now instead. 
+ - debian/mdadm-startall.sgml, debian/mdadm-startall.8: documentation of + unused startall script + - debian/mdadm.config, debian/mdadm.postinst - let udev do the handling + instead. Resolved merge conflict by keeping Ubuntu's version. + - debian/rules: kept debian's switch to using dh_lintian + - debian/mdadm.links, debian/mdadm.manpages: dropped owing to the fact + that these are not used in Ubuntu. Also dropped the build-dep on docbook + to man) + - debian/mdadm.postinst, debian/mdadm.config, initramfs/init-premount: + boot-degraded enablement; maintain udev starting of RAID devices; + init-premount hook script for the initramfs, to provide information at + boot + - debian/mkconf.in is the older mkconf. Kept the Ubuntus version. + - debian/rules: Kept Ubuntus version for installing apport hooks, not + installing un-used startall script and for adding a udev rule + corresponding to mdadm. + - debian/install-rc, check.d/_numbers, check.d/root_on_raid: Ubuntu partman + installer changes + - debian/presubj: Dropped this unused bug reporting file. Instead use + source_mdadm.py act as an apport hook for bug handling. + + -- Surbhi Palande Thu, 30 Sep 2010 17:46:19 +0100 + +mdadm (3.1.4-1+8efb9d1) unstable; urgency=low + + * Added patch with Makefile fix from upstream (commit 8efb9d1) to fix + compiler/linker problem on non-x86 architectures (closes: #595290). + + -- martin f. krafft Fri, 03 Sep 2010 10:45:01 +0200 + +mdadm (3.1.4-1) unstable; urgency=low + + * New upstream release, which closes:#595039 and addresses the following + issues too: + - reverts move of incremental map (closes: #585015). + - fixes mdadm monitor in the case of an inactive (or start-failed) raid0 + or linear array (closes: #539154). + - prevent --remove faulty from skipping renumbered devices + (closes: #587550). + - fixed overflow when growing a RAID6 (closes: #589493). 
+ * However, disable the incremental assembly upstream turned on in 3.1.3 for + now, this will have to wait until after the squeeze release. + * initramfs/hook: make sure configuration file exists before accessing it; + thanks to Michael Prokop for the fix and NMU (closes: #589836). + * initramfs/hook: Match UUID case-insensitive while checking for running + arrays not listed in mdadm.conf; thanks to Mario 'BitKoenig' Holbe for the + patch (closes: #583545). + * Fix URL in the bug reporting preamble (presubj) (closes: #589833). + * Add I/O rescheduling functionality to the checkarray script and make the + cronjob use the idle priority; this should now minimise the impact of the + monthly re-check on the running system; Florian Heigl had the idea + (closes: #592149, #508123). + + -- martin f. krafft Sun, 29 Aug 2010 13:44:59 +0200 + +mdadm (3.1.2-2) unstable; urgency=low + + * Fix logcheck regexp to cure "egrep: Unmatched [ or [^" message + (closes: #583376). + * Cherry-pick 94fcb80 from upstream to fix compiler error due to argument + type error (at least on ia64, sparc, powerpc) (closes: #583495). + + -- martin f. krafft Fri, 28 May 2010 09:35:42 +0200 + +mdadm (3.1.2-1) unstable; urgency=low + + * New upstream release (closes: #567167). + * Ignore lintian error about not stopping in runlevel 1. + * Include more information about the configuration, initrd, and LVM in + bugscript output. + * Check active devices against configuration file based on UUID, not device + name (closes: #553896). + * When preparing the list of devices, treat /dev/mdX and /dev/md/X equally. + * Bump Standards-Version to 3.8.4 without having to make changes. + * Fix logcheck rule with patch from Frédéric Brière, since Rebuild events + are now arbitrary, no longer multiples of 20 (closes: #570315). + * checkarray: do not exit non-zero when there are no arrays found (see + #582360). + * Do not exit checkarray cronjob non-zero when the tests whether to run + fail (closes: #580825). 
This is related to a recent change in behaviour in + cron 3.0pl1-110 (see e.g. #581612). Thanks to everyone for their + suggestions! + * Call MAKEDEV to create device nodes only if MAKEDEV is installed + (closes: #569360). + + -- martin f. krafft Thu, 27 May 2010 09:34:24 +0200 + +mdadm (3.1.1-1) unstable; urgency=low + + * New upstream release. + * Retire fixed/blkid-dev branch. + * Reword warning about unbootable system when mdadm is purged + (closes: #544558). + * Updated FAQ to include s2ram as one of the reasons that can prevent an + array from being stopped; thanks to Pascal Hambourg for writing in. + * Install udev rules into udeb package (closes: #558823). + * Update mdadd.sh (formerly /usr/share/doc/mdadm/examples/newdisk.gz) from + Arno's website, refactoring the script into its own branch/patch + (closes: #539103). + * Do not single-quote homehost in initramfs script (closes: #549083). + + -- martin f. krafft Wed, 27 Jan 2010 10:14:25 +1300 + +mdadm (3.0.3-2) unstable; urgency=low + + * Bumped Standards-Version to 3.8.3 without having to make changes. + * Fixed init dependencies of mdadm daemon init.d script; thanks Petter + Reinholdtsen (closes: #541396). + * Switched source package to v3-quilt format. + + -- martin f. krafft Fri, 06 Nov 2009 10:06:03 +0100 + +mdadm (3.0.3-1) unstable; urgency=low + + * New upstream release. + * Acknowledge 3.0-3.1 NMU by Christian Kujau (patch by Marco d'Itri) + (closes: #541884), and add util-linux dependency. + * Copy udev rules into initramfs, which udev stopped doing + (closes: #549535, #549083, #538843, #538143). + * Fix the bug script to write debug information to the correct file + descriptor (closes: #537734). + * Switch to TopGit and split Debian diff into topic branches; add + README.source. + * Drop the outdated rootraiddoc.97.html document. + + -- martin f. krafft Tue, 27 Oct 2009 18:06:13 +0100 + +mdadm (3.0-3.1) unstable; urgency=medium + + * Non-maintainer upload. 
+ * use blkid instead of vold_id in udev-md-raid.rules (closes: #541884) + + -- christian kujau Mon, 14 Sep 2009 10:15:21 +0200 + +mdadm (3.0-2) unstable; urgency=low + + * Fixed initramfs script with patch from Steffen Hau: it was still using + --auto-update-homehost, which has been removed and replaced by a better + heuristic: arrays created for a different "homehost" will now be + assembled read-only, rather than shoehorned into the system with + --auto-update-homehost (closes: #537820). + * Add version stamps to bugscript and mkconf scripts to facilitate + debugging. + + -- martin f. krafft Tue, 21 Jul 2009 10:33:30 +0200 + +mdadm (3.0-1) unstable; urgency=low + + * New stable upstream release. + * Add information about udev and device links in /dev to bugscript output. + * Add pointer to FAQ and in particular rootdelay to the bug script + pre-subject file, which is displayed by tools like reportbug and thus + hopefully reduces the numbers of duplicated bugs. + * Patch from Frédéric Brière to make logcheck rules printk_time aware + (closes: #537460). + * Updated German translation due to typos and old spelling rules; thanks to + Helge Kreutzmann for the patch (closes: #534663). + * Bumped Standard-Version to 3.8.2; no changes necessary. + + -- martin f. krafft Mon, 20 Jul 2009 16:12:41 +0200 + +mdadm (3.0~devel3-43-g2800528-1) experimental; urgency=low + + * Merge tip of upstream's devel-3.0 branch at commit 2800528. + * Drop our own udev rules in favour of upstream's. If + /etc/udev/rules.d/65_mdadm.vol_id.rules has not been modified (md5sum + check), it is automatically removed; else, a warning is emitted. + * Add information about udev and device links in /dev to bugscript output. + + -- martin f. krafft Tue, 05 May 2009 15:10:46 +0200 + +mdadm (2.6.9-3) unstable; urgency=low + + * Fix the multipath prereq patch (#516605) and make it exit after printing + the prerequisites (closes: #526793). 
+ * Change my previous recommendation for postfix over to the new virtual + package default-mta (see #522300 and #508644). + * Enhance bugscript, which now asks to run as root (sudo/su) if invoked by + a normal user. + * Include MD5 sums of md-related files in initrd in bug reports. + * Add grub2 information retrieval to bugscript. + * Trap SIGINT and thus prevent ctrl-c from terminating the bugscript + prematurely. + + -- martin f. krafft Tue, 05 May 2009 11:46:22 +0200 + +mdadm (3.0~devel3-1) experimental; urgency=low + + * Initial release of DEVELOPMENT BRANCH 3.0 to experimental. + + -- martin f. krafft Thu, 30 Apr 2009 11:51:39 +0200 + +mdadm (2.6.9-2) unstable; urgency=low + + * Fix the check of whether mdadm.conf defines all devices known to the + system; thanks Cristian Ionescu-Idbohrn (closes: #525655). + * No longer pass -k to modprobe, which has been deprecated for a long time; + thanks to Jan Hudec (closes: #519999). + * Remove Mario Joußen from the uploaders list, since his email started + bouncing. + * Prepare mdadm source to use quilt, with the long-term goal to switch to + TopGit, once I find the time. + * Cherry-pick caa0f6c & 667e66d from Neil into a quilt patch to fix gcc-4.4 + compiler issues (closes: #505375). + + -- martin f. krafft Sun, 26 Apr 2009 16:08:28 +0200 + +mdadm (2.6.9-1) unstable; urgency=low + + * New upstream release. + * Do not set -eu in the bugscript to maximise information output in the case + of errors. + * Make initramfs script depend on multipath to ensure its script is run + before ours (closes: #516605). + * Provide an alternative (postfix) for mail-transport-agent (closes: + #522300). I chose postfix because that's the only one I could recommend, + and since the alternative does not affect people who already have an MTA + installed, or have a preference, it won't affect them. + * Honour debconf pre-selection of mdadm/initrdstart (closes: #516802). 
+ * Incorporate patch from Adrian Bridgett: the initramfs hook now checks to + see if all known arrays are listed in mdadm.conf and issues a warning if + this is not the case (closes: #519328). + * Make checkarray skip over arrays still marked auto-read-only + (closes: #510641). + * Add cron.daily snippet from Paul Slootman to run one-shot scans every day + to ensure that failed arrays don't go unnoticed (closes: #497949). + * Bumped Standards-Version to 3.8.1; no changes necessary. + + -- martin f. krafft Sat, 25 Apr 2009 19:04:47 +0200 + +mdadm (2.6.8-12-gb47dff6-2) unstable; urgency=low + + * Brown paper bag release: I built from the wrong branch which caused some + Debian-specific changes not to get into the package. Thus build fixes it. + + -- martin f. krafft Mon, 16 Feb 2009 12:15:37 +0100 + +mdadm (2.6.8-12-gb47dff6-1) unstable; urgency=low + + * New upstream release. + - better checks asprintf() return codes, thanks to patch from Dustin + Kirkland (closes: #509167). + * Fix start/stop runlevels in header of mdadm monitor init.d script + (closes: #514923). + * Use modprobe -q instead of --syslog from initramfs (closes: #502988). + + -- martin f. krafft Mon, 16 Feb 2009 11:07:18 +0100 + +mdadm (2.6.7.2-1) unstable; urgency=low + + * New upstream release, created for Debian lenny: + - fixes assembly of arrays that are being reshaped (closes: #512475) + - this bug was also responsible for other assembly problems + (closes: #498505, #499643, #496334) + Again, many thanks to Neil Brown for being such an awesome upstream. + + * Documentation updates: + - Actually install David Pashley's blog post added in 2.6.7.1-1, and + register it with doc-base. + - Update md.txt to version 2.6.26 (the lenny kernel). + - Add a dump of a website detailing md superblock formats. + - Register FAQ, md.txt, RAID5-vs-RAID10, README.recipes with doc-base + - Cherry-picked UID/UUID typo in mdadm.conf(5) manpage fix (commit + 0e69da7) (closes: #506245). 
+ + * Added Italian debconf translation; thanks Luca Monducci (closes: #506572). + + -- martin f. krafft Tue, 03 Feb 2009 21:28:34 +0100 + mdadm (2.6.7.1-1ubuntu16) maverick; urgency=low * debian/initramfs/hook: Added following code (invoked on update-initramfs) @@ -1834,7 +2115,7 @@ mdadm (1.0.1-4) unstable; urgency=low - * Changed mdrun so that it can deal with partition statistics in + * Changed mdrun so that it can deal with partition statistics in /proc/partitions. (closes: Bug#174000, Bug#175130) * Added russian (ru) debconf template translation. Thanks to Sergey @@ -1884,7 +2165,7 @@ mdctl (0.5-4) unstable; urgency=low * Added debconf template to ask the user if the init script links - should be updated. + should be updated. -- Mario Joussen Mon, 4 Mar 2002 22:53:37 +0100 @@ -1892,7 +2173,7 @@ * Splitted up init script in two parts. One starts the md array and the other starts the raid monitor daemon. - (closes: Bug#136184) + (closes: Bug#136184) -- Mario Joussen Thu, 28 Feb 2002 22:45:57 +0100 @@ -1905,7 +2186,7 @@ * Added an init script, which can start md arrays and the raid monitor daemon * MD devices are now created under /dev if necessary - * Added a sample configuration file + * Added a sample configuration file -- Mario Joussen Sun, 13 Jan 2002 23:43:40 +0100 @@ -1915,3 +2196,4 @@ (closes: Bug#126610) -- Mario Joussen Wed, 26 Dec 2001 17:07:09 +0100 + diff -Nru mdadm-2.6.7.1/debian/checkarray mdadm-3.1.4/debian/checkarray --- mdadm-2.6.7.1/debian/checkarray 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/checkarray 2010-09-30 19:46:18.000000000 +0300 @@ -12,7 +12,6 @@ about() { echo "$PROGNAME -- MD array (RAID) redundancy checker tool" - echo '$Id$' echo "Copyright © martin f. krafft " echo "Released under the terms of the Artistic Licence 2.0" } @@ -26,9 +25,13 @@ echo "Valid options are:" cat <<-_eof | column -s\& -t -a|--all & check all assembled arrays (check /proc/mdstat). - -c|--cron & honour AUTOCHECK setting in /etc/default/mdadm. 
-s|--status & print redundancy check status of devices. -x|--cancel & queue a request to cancel a running redundancy check. + -i|--idle & perform check in a lowest I/O scheduling class (idle). + -l|--slow & perform check in a lower-than-standard I/O scheduling class. + -f|--fast & perform check in higher-than-standard I/O scheduling class. + --realtime & perform check in real-time I/O scheduling class (DANGEROUS!). + -c|--cron & honour AUTOCHECK setting in /etc/default/mdadm. -q|--quiet & suppress informational messages. -Q|--real-quiet & suppress all output messages, including warnings and errors. -h|--help & show this output. @@ -36,7 +39,7 @@ _eof echo echo "Examples:" - echo " $PROGNAME --all" + echo " $PROGNAME --all --idle" echo " $PROGNAME --quiet /dev/md[123]" echo " $PROGNAME -sa" echo " $PROGNAME -x --all" @@ -50,8 +53,8 @@ echo "You can also control the status of a check with /proc/mdstat ." } -SHORTOPTS=achVqQsx -LONGOPTS=all,cron,help,version,quiet,real-quiet,status,cancel +SHORTOPTS=achVqQsxilf +LONGOPTS=all,cron,help,version,quiet,real-quiet,status,cancel,idle,slow,fast,realtime eval set -- $(getopt -o $SHORTOPTS -l $LONGOPTS -n $PROGNAME -- "$@") @@ -61,16 +64,21 @@ quiet=0 status=0 action=check +ionice= for opt in $@; do case "$opt" in -a|--all) all=1;; - -c|--cron) cron=1;; -s|--status) action=status;; -x|--cancel) action=idle;; - -h|--help) usage; exit 0;; + -i|--idle) ionice=idle;; + -l|--slow) ionice=low;; + -f|--fast) ionice=high;; + --realtime) ionice=realtime;; + -c|--cron) cron=1;; -q|--quiet) quiet=1;; -Q|--real-quiet) quiet=2;; + -h|--help) usage; exit 0;; -V|--version) about; exit 0;; /dev/md/*|md/*) arrays="${arrays:+$arrays }md${opt#*md/}";; /dev/md*|md*) arrays="${arrays:+$arrays }${opt#/dev/}";; @@ -110,7 +118,7 @@ echo "$PROGNAME: W: no active MD arrays found." 
>&2 echo "$PROGNAME: W: (maybe uninstall the mdadm package?)" >&2 fi - exit 5 + exit 0 fi if [ -z "$(ls /sys/block/md*/md/level 2>/dev/null)" ]; then @@ -150,6 +158,11 @@ exit 4 fi + if [ "$(cat /sys/block/$array/md/array_state)" = 'read-auto' ]; then + echo "$PROGNAME: W: array $array in auto-read-only state, skipping..." >&2 + continue + fi + case "$action" in idle) echo $action > $SYNC_ACTION_CTL @@ -166,6 +179,26 @@ # are properly queued so as to not kill one of the array. echo $action > $SYNC_ACTION_CTL [ $quiet -lt 1 ] && echo "$PROGNAME: I: check queued for array $array." >&2 + + case "$ionice" in + idle) arg='-c3';; + low) arg='-c2 -n7';; + high) arg='-c2 -n0';; + realtime) arg='-c1 -n4';; + *) break;; + esac + + resync_pid= wait=5 + while [ $wait -gt 0 ]; do + wait=$((wait - 1)) + resync_pid=$(ps -ef | awk -v dev=$array 'BEGIN { pattern = "^\\[" dev "_resync]$" } $8 ~ pattern { print $2 }') + if [ -n "$resync_pid" ]; then + echo "$PROGNAME: I: selecting $ionice I/O scheduling class for resync of $array." >&2 + ionice -p "$resync_pid" $arg + break + fi + sleep 1 + done ;; esac diff -Nru mdadm-2.6.7.1/debian/control mdadm-3.1.4/debian/control --- mdadm-2.6.7.1/debian/control 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/control 2010-10-18 14:11:31.000000000 +0300 @@ -3,9 +3,9 @@ Priority: optional XSBC-Original-Maintainer: Debian mdadm maintainers Maintainer: Ubuntu Core Developers -Uploaders: martin f. krafft , Mario Joussen +Uploaders: martin f. 
krafft Build-Depends: debhelper (>= 7.4.2), po-debconf, groff-base -Standards-Version: 3.8.0 +Standards-Version: 3.8.4 Vcs-Git: git://git.debian.org/git/pkg-mdadm/mdadm Vcs-Browser: http://git.debian.org/?p=pkg-mdadm/mdadm.git Homepage: http://neil.brown.name/blog/mdadm diff -Nru mdadm-2.6.7.1/debian/copyright mdadm-3.1.4/debian/copyright --- mdadm-2.6.7.1/debian/copyright 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/copyright 2010-09-30 19:46:18.000000000 +0300 @@ -1,6 +1,6 @@ This package was debianized by Mario Jou/3en on Wed, 26 Dec 2001 17:07:09 +0100. -Martin F. Krafft took over on +Martin F. Krafft took over on Tue, 16 May 2006 13:21:06 -0500 The mdadm source was downloaded from @@ -17,5 +17,5 @@ the Free Software Foundation; either version 2 of the License, or (at your option) any later version. -On Debian GNU/Linux systems, the complete text of the GNU General +On Debian GNU/Linux systems, the complete text of the GNU General Public License can be found in '/usr/share/common-licenses/GPL'. diff -Nru mdadm-2.6.7.1/debian/FAQ mdadm-3.1.4/debian/FAQ --- mdadm-2.6.7.1/debian/FAQ 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/FAQ 2010-09-30 19:46:18.000000000 +0300 @@ -4,7 +4,7 @@ Also see /usr/share/doc/mdadm/README.recipes.gz . The latest version of this FAQ is available here: - http://svn.debian.org/wsvn/pkg-mdadm/mdadm/trunk/debian/FAQ?op=file&rev=0&sc=0 + http://git.debian.org/?p=pkg-mdadm/mdadm.git;a=blob;f=debian/FAQ;hb=HEAD 0. What does MD stand for? ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -59,11 +59,11 @@ mdadm --detail /dev/mdX | sed -ne 's,.*Version : ,,p' to determine the superblock version of a running array, or - + mdadm --examine /dev/sdXY | sed -ne 's,.*Version : ,,p' to determine the superblock version from a component device of an array. - + Version 0 superblocks (00.90.XX) '''''''''''''''''''''''''''''''' You need to know the preferred minor number stored in the superblock, @@ -111,7 +111,7 @@ space. 
For example, if you have disks of size X, then in order to get 2X storage, you need 3 disks for RAID5, but 4 if you use RAID10 or RAID1+0 (or RAID6). - + This gain in usable space comes at a price: performance; RAID1/10 can be up to four times faster than RAID4/5/6. @@ -206,7 +206,7 @@ RAID1+0/10 has a greater chance to survive two disk failures, its performance suffers less when in degraded state, and it resyncs faster after replacing a failed disk. - + See http://aput.net/~jheiss/raid10/ for more details. 7. Which RAID10 layout scheme should I use @@ -239,7 +239,7 @@ hdd4 Bb1 Db1 .... Ba2 Da2 Where the second set start half-way through the drives. - + The advantage of far= is that you can easily spread a long sequential read across the drives. The cost is more seeking for writes. offset= can possibly get similar benefits with large enough chunk size. Neither upstream @@ -265,8 +265,10 @@ * LVM * dm-crypt * EVMS + * The array contains a swap partition used for suspend-to-ram + (check /etc/initramfs-tools/conf.d/resume) * The array is used by a process (check with `lsof') - + 9. Should I use RAID0 (or linear)? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ No. Unless you know what you're doing and keep backups, or use it for data @@ -290,7 +292,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In certain configurations, especially if your last partition extends all the way to the end of the disk, mdadm may display a warning like: - + mdadm: WARNING /dev/hdc3 and /dev/hdc appear to have very similar superblocks. If they are really different, please --zero the superblock on one. If they are the same or overlap, please remove one from the DEVICE @@ -313,7 +315,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In almost all cases, mdadm updates the super-minor field in an array's superblock when assembling the array. It does *not* do this for RAID0 - arrays. Thus, you may end up seeing something like this when you run + arrays. 
Thus, you may end up seeing something like this when you run mdadm -E or mkconf: ARRAY /dev/md0 level=raid0 num-devices=2 UUID=abcd... @@ -333,16 +335,19 @@ 13. Can a MD array be partitioned? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - For a MD array to be able to hold partitions, it must be created as - a "partitionable array", using the configuration auto=part on the command - line or in the configuration file, or by using the standard naming scheme - (md_d* or md/d*) for partitionable arrays: + Since kernel 2.6.28, MD arrays can be partitioned like any other block + device. + + Prior to 2.6.28, for a MD array to be able to hold partitions, it must be + created as a "partitionable array", using the configuration auto=part on the + command line or in the configuration file, or by using the standard naming + scheme (md_d* or md/d*) for partitionable arrays: mdadm --create --auto=yes ... /dev/md_d0 ... # see mdadm(8) manpage about the values of the --auto keyword -14. When would I use partitionable arrays? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +14. When would I partition an array? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This answer by Doug Ledford is shamelessly adapted from [0] (with permission): @@ -438,7 +443,7 @@ 2 0/0 1/1 1/1 1/1 3 0/0 1/1 2/2 2/2 4 0/0 1/2 2/2 3/3 - 5 0/0 1/2 2/2 3/3 + 5 0/0 1/2 2/2 3/3 6 0/0 1/3 2/3 3/3 7 0/0 1/3 2/3 3/3 8 0/0 1/4 2/3 3/4 @@ -450,7 +455,7 @@ 19. What should I do if a disk fails? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Replace it as soon as possible: - + mdadm --remove /dev/md0 /dev/sda1 halt @@ -460,7 +465,7 @@ array? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Did you read the previous question and its answer? - + For cases when you have two copies of each block, the question is easily answered by looking at the output of /proc/mdstat. For instance on a four disk array: @@ -546,4 +551,36 @@ Or make use of /usr/share/mdadm/startall . - -- martin f. krafft Wed, 02 Jul 2008 11:32:51 +0200 +25. 
Why are my arrays marked auto-read-only in /proc/mdstat? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Arrays are kept read-only until the first write occurs. This allows md to + skip lengthy resynchronisation for arrays that have not been properly shut + down, but which also have not changed. + +26. Why doesn't mdadm find arrays specified in the config file and causes the + boot to fail? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + My boot process dies at an early stage and drops me into the busybox shell. + The last relevant output seems to be from mdadm and is something like + + "/dev/md2 does not exist" + + or + + "No devices listed in conf file found" + + Why does mdadm break my system? + + Short answer: It doesn't, the underlying devices aren't yet available + when mdadm runs during the early boot process. + + Long answer: It doesn't, but the drivers of those devices incorrectly + communicate to the kernel that the devices are ready, when in fact they are + not. I consider this a bug in those drivers. Please consider reporting it. + + Workaround: there is nothing mdadm can or will do against this. Fortunately + though, initramfs provides a method, documented at + http://wiki.debian.org/InitramfsDebug. Please append rootdelay=10 to the + kernel command line and check whether the boot now works. + + -- martin f. 
krafft Wed, 13 May 2009 09:59:53 +0200 diff -Nru mdadm-2.6.7.1/debian/initramfs/hook mdadm-3.1.4/debian/initramfs/hook --- mdadm-2.6.7.1/debian/initramfs/hook 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/initramfs/hook 2010-10-18 12:46:15.000000000 +0300 @@ -13,7 +13,7 @@ echo "$PREREQ" } -case ${1:-} in +case "${1:-}" in prereqs) prereqs exit 0 @@ -22,10 +22,10 @@ is_true() { - case "${1:-}" in - [Yy]es|[Yy]|1|[Tt]rue|[Tt]) return 0;; - *) return 1;; - esac + case "${1:-}" in + [Yy]es|[Yy]|1|[Tt]rue|[Tt]) return 0;; + *) return 1;; + esac } write() @@ -103,3 +103,4 @@ for module in linear multipath raid0 raid1 raid456 raid5 raid6 raid10; do force_load $module done + diff -Nru mdadm-2.6.7.1/debian/mdadm.config mdadm-3.1.4/debian/mdadm.config --- mdadm-2.6.7.1/debian/mdadm.config 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/mdadm.config 2010-10-18 12:55:09.000000000 +0300 @@ -16,7 +16,7 @@ CONFIG=/etc/mdadm/mdadm.conf ALTCONFIG=/etc/mdadm.conf -[ ! -f $CONFIG ] && [ -f $ALTCONFIG ] && CONFIG=$ALTCONFIG +[ ! -f $CONFIG ] && [ -f $ALTCONFIG ] && CONFIG=$ALTCONFIG DEBIANCONFIG=/etc/default/mdadm DEBIANCONFIG_OLD=/etc/mdadm/debian.conf @@ -27,7 +27,8 @@ fi fi -INITRDSTART='' +db_get mdadm/initrdstart || : +INITRDSTART="$RET" if [ -s $DEBIANCONFIG ] ; then AUTOCHECK=true diff -Nru mdadm-2.6.7.1/debian/mdadm.cron.d mdadm-3.1.4/debian/mdadm.cron.d --- mdadm-2.6.7.1/debian/mdadm.cron.d 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/mdadm.cron.d 2010-09-30 19:46:18.000000000 +0300 @@ -9,4 +9,4 @@ # the month is less than or equal to 7. Thus, only run on the first Sunday of # each month. crontab(5) sucks, unfortunately, in this regard; therefore this # hack (see #380425). 
-57 0 * * 0 root [ -x /usr/share/mdadm/checkarray ] && [ $(date +\%d) -le 7 ] && /usr/share/mdadm/checkarray --cron --all --quiet +57 0 * * 0 root if [ -x /usr/share/mdadm/checkarray ] && [ $(date +\%d) -le 7 ]; then /usr/share/mdadm/checkarray --cron --all --idle --quiet; fi diff -Nru mdadm-2.6.7.1/debian/mdadm.cron.daily mdadm-3.1.4/debian/mdadm.cron.daily --- mdadm-2.6.7.1/debian/mdadm.cron.daily 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/mdadm.cron.daily 2010-09-03 11:44:09.000000000 +0300 @@ -0,0 +1,18 @@ +#!/bin/sh +# +# cron.daily/mdadm -- daily check that MD devices are functional +# +# Copyright © 2008 Paul Slootman +# distributed under the terms of the Artistic Licence 2.0 + +# As recommended by the manpage, run +# mdadm --monitor --scan --oneshot +# every day to ensure that any degraded MD devices don't go unnoticed. +# Email will go to the address specified in /etc/mdadm/mdadm.conf . +# +set -eu + +MDADM=/sbin/mdadm +[ -x $MDADM ] || exit 0 # package may be removed but not purged + +exec $MDADM --monitor --scan --oneshot diff -Nru mdadm-2.6.7.1/debian/mdadm.doc-base mdadm-3.1.4/debian/mdadm.doc-base --- mdadm-2.6.7.1/debian/mdadm.doc-base 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/mdadm.doc-base 1970-01-01 02:00:00.000000000 +0200 @@ -1,9 +0,0 @@ -Document: mdadm -Title: Convert Root System to Bootable Software RAID1 (Debian) -Author: Lucas Albers -Abstract: How to convert a Debian system to bootable Software RAID 1 with a second hard drive, 'mdadm' and a few standard UNIX tools -Section: System/Administration - -Format: HTML -Index: /usr/share/doc/mdadm/rootraiddoc.97.html -Files: /usr/share/doc/mdadm/rootraiddoc.97.html diff -Nru mdadm-2.6.7.1/debian/mdadm.doc-base.faq mdadm-3.1.4/debian/mdadm.doc-base.faq --- mdadm-2.6.7.1/debian/mdadm.doc-base.faq 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/mdadm.doc-base.faq 2010-09-03 11:44:09.000000000 +0300 @@ -0,0 +1,9 @@ +Document: mdadm-faq +Title: mdadm Debian 
FAQ +Author: martin f. krafft +Abstract: The document answers frequently asked questions about Debian's mdadm +Section: System/Administration + +Format: text +Index: /usr/share/doc/mdadm/FAQ.gz +Files: /usr/share/doc/mdadm/FAQ.gz diff -Nru mdadm-2.6.7.1/debian/mdadm.doc-base.jd-rebuild-raid mdadm-3.1.4/debian/mdadm.doc-base.jd-rebuild-raid --- mdadm-2.6.7.1/debian/mdadm.doc-base.jd-rebuild-raid 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/mdadm.doc-base.jd-rebuild-raid 2010-09-03 11:44:09.000000000 +0300 @@ -0,0 +1,9 @@ +Document: mdadm-jd-rebuild-raid +Title: Rebuilding a RAID array +Author: David Pashley +Abstract: The document describes how to replace a failed component in a RAID(1) array +Section: System/Administration + +Format: HTML +Index: /usr/share/doc/mdadm/rebuilding-raid.html +Files: /usr/share/doc/mdadm/rebuilding-raid.html diff -Nru mdadm-2.6.7.1/debian/mdadm.doc-base.md-txt mdadm-3.1.4/debian/mdadm.doc-base.md-txt --- mdadm-2.6.7.1/debian/mdadm.doc-base.md-txt 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/mdadm.doc-base.md-txt 2010-09-03 11:44:09.000000000 +0300 @@ -0,0 +1,11 @@ +Document: mdadm-md-txt +Title: Linux kernel documentation on the md driver (2.6.26) +Author: Neil Brown +Abstract: The document comes from the Linux kernel sources' Documentation/ + directory and contains notes and other information about the md kernel + driver (which mdadm uses). It is current for version 2.6.26 of the kernel. +Section: System/Administration + +Format: text +Index: /usr/share/doc/mdadm/md.txt.gz +Files: /usr/share/doc/mdadm/md.txt.gz diff -Nru mdadm-2.6.7.1/debian/mdadm.doc-base.raid5-vs-10 mdadm-3.1.4/debian/mdadm.doc-base.raid5-vs-10 --- mdadm-2.6.7.1/debian/mdadm.doc-base.raid5-vs-10 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/mdadm.doc-base.raid5-vs-10 2010-09-03 11:44:09.000000000 +0300 @@ -0,0 +1,11 @@ +Document: mdadm-raid5-vs-10 +Title: Comparison of RAID5 and RAID10 +Author: Art S. 
Kagel +Abstract: This document compares RAID5 and RAID10 and strongly argues against + RAID5. The Debian package maintainer does not agree in full but includes the + document to allow people to make up their own mind. +Section: System/Administration + +Format: text +Index: /usr/share/doc/mdadm/RAID5_versus_RAID10.txt.gz +Files: /usr/share/doc/mdadm/RAID5_versus_RAID10.txt.gz diff -Nru mdadm-2.6.7.1/debian/mdadm.doc-base.recipes mdadm-3.1.4/debian/mdadm.doc-base.recipes --- mdadm-2.6.7.1/debian/mdadm.doc-base.recipes 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/mdadm.doc-base.recipes 2010-09-03 11:44:09.000000000 +0300 @@ -0,0 +1,9 @@ +Document: mdadm-readme-recipes +Title: mdadm Debian recipes +Author: David Pashley +Abstract: The document contains some common recipes for mdadm usage on Debian +Section: System/Administration + +Format: text +Index: /usr/share/doc/mdadm/README.recipes.gz +Files: /usr/share/doc/mdadm/README.recipes.gz diff -Nru mdadm-2.6.7.1/debian/mdadm.doc-base.superblock-formats mdadm-3.1.4/debian/mdadm.doc-base.superblock-formats --- mdadm-2.6.7.1/debian/mdadm.doc-base.superblock-formats 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/mdadm.doc-base.superblock-formats 2010-09-03 11:44:09.000000000 +0300 @@ -0,0 +1,10 @@ +Document: mdadm-superblock-formats +Title: Description of md superblock formats +Author: GrangerX +Abstract: This document details the different md superblock formats and their + disk layouts +Section: System/Administration + +Format: text +Index: /usr/share/doc/mdadm/md_superblock_formats.txt.gz +Files: /usr/share/doc/mdadm/md_superblock_formats.txt.gz diff -Nru mdadm-2.6.7.1/debian/mdadm.docs mdadm-3.1.4/debian/mdadm.docs --- mdadm-2.6.7.1/debian/mdadm.docs 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/mdadm.docs 2010-09-30 19:46:18.000000000 +0300 @@ -1,7 +1,5 @@ +docs/* TODO debian/README.recipes debian/README.checkarray debian/FAQ -md.txt -rootraiddoc.97.html -RAID5_versus_RAID10.txt 
diff -Nru mdadm-2.6.7.1/debian/mdadm.init mdadm-3.1.4/debian/mdadm.init --- mdadm-2.6.7.1/debian/mdadm.init 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/mdadm.init 2010-09-30 19:46:18.000000000 +0300 @@ -3,16 +3,15 @@ # Start the MD monitor daemon for all active MD arrays if desired. # # Copyright © 2001-2005 Mario Jou/3en -# Copyright © 2005-2008 Martin F. Krafft +# Copyright © 2005-2009 Martin F. Krafft # Distributable under the terms of the GNU GPL version 2. # ### BEGIN INIT INFO # Provides: mdadm -# Required-Start: checkroot -# Required-Stop: umountroot -# Should-Start: module-init-tools -# Default-Start: S -# Default-Stop: 0 6 +# Required-Start: $local_fs $syslog mdadm-raid +# Required-Stop: $local_fs $syslog mdadm-raid +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 # Short-Description: MD monitoring daemon # Description: mdadm provides a monitor mode, in which it will scan for # problems with the MD devices. If a problem is found, the diff -Nru mdadm-2.6.7.1/debian/mdadm.lintian-overrides mdadm-3.1.4/debian/mdadm.lintian-overrides --- mdadm-2.6.7.1/debian/mdadm.lintian-overrides 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/mdadm.lintian-overrides 2010-09-30 19:46:18.000000000 +0300 @@ -1 +1,2 @@ mdadm: virtual-package-depends-without-real-package-depends recommends: mail-transport-agent +mdadm: init.d-script-possible-missing-stop /etc/init.d/mdadm-raid 1 diff -Nru mdadm-2.6.7.1/debian/mdadm.logcheck.ignore.server mdadm-3.1.4/debian/mdadm.logcheck.ignore.server --- mdadm-2.6.7.1/debian/mdadm.logcheck.ignore.server 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/mdadm.logcheck.ignore.server 2010-09-30 19:46:18.000000000 +0300 @@ -1,23 +1,23 @@ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: md driver [.[:digit:]]+ MAX_MD_DEVS=[[:digit:]]+, MD_SB_DISKS=[[:digit:]]+$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: bitmap version [.[:digit:]]+$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: md[[:digit:]]+ stopped\.$ 
-^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: md[[:digit:]]+ still in use\.$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: cannot remove active disk [[:alnum:]]+ from md[[:digit:]]+ \.\.\. ?$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: raid([01456]|456|10) personality registered for level ([01456]|10)$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: (data-check|requested-resync|resync|reshape|recovery) of RAID array md[[:digit:]]+$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: resuming (data-check|requested-resync|resync|reshape|recovery) of md[[:digit:]]+ from checkpoint\.$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: md[[:digit:]]+: (data-check|requested-resync|resync|reshape|recovery) done\.$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: minimum _guaranteed_ ?speed: [[:digit:]]+ KB/sec/disk\.$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: using maximum available idle IO bandwidth \(but not more than [[:digit:]]+ KB/sec\) for (data-check|requested-resync|resync|reshape|recovery)\.$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: delaying (data-check|requested-resync|resync|reshape|recovery) of md[[:digit:]]+ until md[[:digit:]]+ has finished \(they share one or more physical units\)$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: using [[:digit:]]+k window, over a total of [[:digit:]]+ blocks\.$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: (un)?bind<[^>]+>$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: export_rdev\([^)]+\)$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: raid[[:digit:]]+: raid set [[:alnum:]]+ active with [[:digit:]]+ out of [[:digit:]]+ mirrors$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: RAID([01456]|10) conf printout:$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:[[:space:]]+---( [wrf]d:[[:digit:]]+){2,3}$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:[[:space:]]+disk [[:digit:]]+,( wo:[[:digit:]]+,)? 
o:[[:digit:]]+, dev:[[:alnum:]]+$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: Rebuild((Start|Finish)ed|[2468]0) event detected on md device /dev/[-_./[:alnum:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: md driver [.[:digit:]]+ MAX_MD_DEVS=[[:digit:]]+, MD_SB_DISKS=[[:digit:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: bitmap version [.[:digit:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: md[[:digit:]]+ stopped\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: md[[:digit:]]+ still in use\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: cannot remove active disk [[:alnum:]]+ from md[[:digit:]]+ \.\.\. ?$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: raid([01456]|456|10) personality registered for level ([01456]|10)$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: (data-check|requested-resync|resync|reshape|recovery) of RAID array md[[:digit:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: resuming (data-check|requested-resync|resync|reshape|recovery) of md[[:digit:]]+ from checkpoint\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: md[[:digit:]]+: (data-check|requested-resync|resync|reshape|recovery) done\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: minimum _guaranteed_ ?speed: [[:digit:]]+ KB/sec/disk\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: using maximum available idle IO bandwidth \(but not more than [[:digit:]]+ KB/sec\) for (data-check|requested-resync|resync|reshape|recovery)\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? 
md: delaying (data-check|requested-resync|resync|reshape|recovery) of md[[:digit:]]+ until md[[:digit:]]+ has finished \(they share one or more physical units\)$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: using [[:digit:]]+k window, over a total of [[:digit:]]+ blocks\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: (un)?bind<[^>]+>$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: export_rdev\([^)]+\)$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? raid[[:digit:]]+: raid set [[:alnum:]]+ active with [[:digit:]]+ out of [[:digit:]]+ mirrors$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? RAID([01456]|10) conf printout:$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])?[[:space:]]+---( [wrf]d:[[:digit:]]+){2,3}$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])?[[:space:]]+disk [[:digit:]]+,( wo:[[:digit:]]+,)? 
o:[[:digit:]]+, dev:[[:alnum:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: Rebuild((Start|Finish)ed|[[:digit:]]+) event detected on md device /dev/[-_./[:alnum:]]+$ ^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: SpareActive event detected on md device /dev/[-_./[:alnum:]]+, component device /dev/[-_./[:alnum:]]+$ ^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: (New|Degraded)Array event detected on md device /dev/[-_./[:alnum:]]+$ ^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: DeviceDisappeared event detected on md device /dev/[-_./[:alnum:]]+$ diff -Nru mdadm-2.6.7.1/debian/mdadm.logcheck.violations mdadm-3.1.4/debian/mdadm.logcheck.violations --- mdadm-2.6.7.1/debian/mdadm.logcheck.violations 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/mdadm.logcheck.violations 2010-09-30 19:46:18.000000000 +0300 @@ -1,3 +1,3 @@ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: md: kicking non-fresh [[:alnum:]]+ from array!$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel: raid[[:digit:]]+: Disk failure on [[:alnum:]]+, disabling device\.$ -^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:[[:space:]]+Operation continuing on [[:digit:]]+ devices?$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: kicking non-fresh [[:alnum:]]+ from array!$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? 
raid[[:digit:]]+: Disk failure on [[:alnum:]]+, disabling device\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])?[[:space:]]+Operation continuing on [[:digit:]]+ devices?$ diff -Nru mdadm-2.6.7.1/debian/mdadm.manpages mdadm-3.1.4/debian/mdadm.manpages --- mdadm-2.6.7.1/debian/mdadm.manpages 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/mdadm.manpages 1970-01-01 02:00:00.000000000 +0200 @@ -1,3 +0,0 @@ -mdadm.8 -md.4 -mdadm.conf.5 diff -Nru mdadm-2.6.7.1/debian/mdadm.postinst mdadm-3.1.4/debian/mdadm.postinst --- mdadm-2.6.7.1/debian/mdadm.postinst 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/mdadm.postinst 2010-10-18 12:56:14.000000000 +0300 @@ -12,19 +12,21 @@ configure|reconfigure) if [ ! -f /proc/mdstat ] && [ -x $(command -v modprobe 2>/dev/null) ]; then - modprobe -k md >/dev/null 2>&1 || : + modprobe md >/dev/null 2>&1 || : fi if [ ! -f /proc/mdstat ]; then echo 'W: mdadm: failed to load MD subsystem.' >&2 fi + MAKEDEV=/dev/MAKEDEV if [ ! -e /dev/md15 ] \ && [ ! -e /dev/.static/dev/md15 ] \ - && [ ! -e /dev/.devfsd ]; then + && [ ! -e /dev/.devfsd ] \ + && [ -x $MAKEDEV ]; then echo -n 'Generating array device nodes... ' >&2 cd /dev - if /dev/MAKEDEV md >&2 >/dev/null; then + if $MAKEDEV md >&2 >/dev/null; then echo 'done.' >&2 else echo 'failed.' 
>&2 diff -Nru mdadm-2.6.7.1/debian/mdadm.preinst mdadm-3.1.4/debian/mdadm.preinst --- mdadm-2.6.7.1/debian/mdadm.preinst 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/mdadm.preinst 2010-09-30 19:46:18.000000000 +0300 @@ -4,6 +4,26 @@ # set -eu +# based on idea from http://www.dpkg.org/dpkg/ConffileHandling +rm_conffile() { + local conffile md5sum package old_md5sum + + conffile="$1" + if [ -e "$conffile" ]; then + md5sum=$(md5sum "$conffile" | cut -d' ' -f1) + package=$(dpkg -S "$conffile" | cut -d: -f1) + old_md5sum=$(dpkg -s $package | sed -rne "s,[[:space:]]+${conffile}[[:space:]]+,,p") + if [ "$md5sum" != "$old_md5sum" ]; then + echo "Obsolete conffile $conffile has been modified by you." + echo "Saving as ${conffile}.dpkg-bak ..." + mv -f "$conffile" "$conffile".dpkg-bak + else + echo "Removing obsolete conffile $conffile ..." + rm -f "$conffile" + fi + fi +} + case "$1" in upgrade|install) @@ -32,6 +52,8 @@ echo DEVICE partitions > /var/backups/mdadm-Es_v1.dump $MDADM -Esc /var/backups/mdadm-Es_v1.dump >> /var/backups/mdadm-Es_v1.dump || : fi + + rm_conffile /etc/udev/rules.d/65_mdadm.vol_id.rules ;; *) :;; diff -Nru mdadm-2.6.7.1/debian/mkconf mdadm-3.1.4/debian/mkconf --- mdadm-2.6.7.1/debian/mkconf 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/mkconf 1970-01-01 02:00:00.000000000 +0200 @@ -1,109 +0,0 @@ -#!/bin/sh -# -# mkconf -- outputs valid mdadm.conf contents for the local system -# -# Copyright © martin f. krafft -# distributed under the terms of the Artistic Licence 2.0 -# -set -eu - -ME="${0##*/}" -MDADM=/sbin/mdadm -DEBIANCONFIG=/etc/default/mdadm -CONFIG=/etc/mdadm/mdadm.conf - -PROGRAM= - -test -r $DEBIANCONFIG && . 
$DEBIANCONFIG - -if [ -n "${MDADM_MAILADDR__:-}" ]; then - # honour MAILADDR from the environment (from postinst) - MAILADDR="$MDADM_MAILADDR__" -else - # preserve existing MAILADDR - MAILADDR="$(sed -ne 's/^MAILADDR //p' $CONFIG 2>/dev/null)" || : -fi - -# save existing values as defaults -if [ -r "$CONFIG" ]; then - DEVICE="$(sed -ne 's/^DEVICE //p' $CONFIG)" - CREATE="$(sed -ne 's/^CREATE //p' $CONFIG)" - HOMEHOST="$(sed -ne 's/^HOMEHOST //p' $CONFIG)" - PROGRAM="$(sed -ne 's/^PROGRAM //p' $CONFIG)" -fi - -generate=0 -[ "${1:-}" = force-generate ] && rm -f $CONFIG -case "${1:-}" in - generate|force-generate) - [ -n "${2:-}" ] && CONFIG=$2 - # only barf if the config file specifies anything else than MAILADDR - if egrep -qv '^(MAILADDR.*|#.*|)$' $CONFIG 2>/dev/null; then - echo "E: $ME: $CONFIG already exists." >&2 - exit 255 - fi - - mkdir --parent ${CONFIG%/*} - exec >$CONFIG - generate=1 - ;; -esac - -cat <<_eof -# mdadm.conf -# -# Please refer to mdadm.conf(5) for information about this file. -# - -# by default, scan all partitions (/proc/partitions) for MD superblocks. -# alternatively, specify devices to scan, using wildcards if desired. -DEVICE ${DEVICE:-partitions} - -# auto-create devices with Debian standard permissions -CREATE ${CREATE:-owner=root group=disk mode=0660 auto=yes} - -# automatically tag new arrays as belonging to the local system -HOMEHOST ${HOMEHOST:-} - -# instruct the monitoring daemon where to send mail alerts -MAILADDR ${MAILADDR:-root} - -_eof - -if [ -n "${PROGRAM:-}" ]; then - cat <<-_eof - # program to run when mdadm monitor detects potentially interesting events - PROGRAM ${PROGRAM} - - _eof -fi - -error=0 -if [ ! -r /proc/mdstat ]; then - echo W: $ME: MD subsystem is not loaded, thus I cannot scan for arrays. >&2 - error=1 -elif [ ! -r /proc/partitions ]; then - echo W: $ME: /proc/partitions cannot be read, thus I cannot scan for arrays. >&2 - error=2 -else - echo "# definitions of existing MD arrays" - if ! 
$MDADM --examine --scan --config=partitions; then - error=$(($? + 128)) - echo W: $ME: failed to scan for partitions. >&2 - echo "### WARNING: scan failed." - else - echo - fi -fi - -if [ $generate -eq 1 ]; then - cat <<_eof -# This file was auto-generated on $(date -R) -# by mkconf \$Id$ -_eof - - mkdir -p /var/lib/mdadm - md5sum $CONFIG > /var/lib/mdadm/mdadm.conf-generated -fi - -exit $error diff -Nru mdadm-2.6.7.1/debian/mkconf.in mdadm-3.1.4/debian/mkconf.in --- mdadm-2.6.7.1/debian/mkconf.in 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/mkconf.in 2010-09-30 19:46:16.000000000 +0300 @@ -0,0 +1,109 @@ +#!/bin/sh +# +# mkconf -- outputs valid mdadm.conf contents for the local system +# +# Copyright © martin f. krafft +# distributed under the terms of the Artistic Licence 2.0 +# +set -eu + +ME="${0##*/}" +MDADM=/sbin/mdadm +DEBIANCONFIG=/etc/default/mdadm +CONFIG=/etc/mdadm/mdadm.conf + +PROGRAM= + +test -r $DEBIANCONFIG && . $DEBIANCONFIG + +if [ -n "${MDADM_MAILADDR__:-}" ]; then + # honour MAILADDR from the environment (from postinst) + MAILADDR="$MDADM_MAILADDR__" +else + # preserve existing MAILADDR + MAILADDR="$(sed -ne 's/^MAILADDR //p' $CONFIG 2>/dev/null)" || : +fi + +# save existing values as defaults +if [ -r "$CONFIG" ]; then + DEVICE="$(sed -ne 's/^DEVICE //p' $CONFIG)" + CREATE="$(sed -ne 's/^CREATE //p' $CONFIG)" + HOMEHOST="$(sed -ne 's/^HOMEHOST //p' $CONFIG)" + PROGRAM="$(sed -ne 's/^PROGRAM //p' $CONFIG)" +fi + +generate=0 +[ "${1:-}" = force-generate ] && rm -f $CONFIG +case "${1:-}" in + generate|force-generate) + [ -n "${2:-}" ] && CONFIG=$2 + # only barf if the config file specifies anything else than MAILADDR + if egrep -qv '^(MAILADDR.*|#.*|)$' $CONFIG 2>/dev/null; then + echo "E: $ME: $CONFIG already exists." >&2 + exit 255 + fi + + mkdir --parent ${CONFIG%/*} + exec >$CONFIG + generate=1 + ;; +esac + +cat <<_eof +# mdadm.conf +# +# Please refer to mdadm.conf(5) for information about this file. 
+# + +# by default, scan all partitions (/proc/partitions) for MD superblocks. +# alternatively, specify devices to scan, using wildcards if desired. +DEVICE ${DEVICE:-partitions} + +# auto-create devices with Debian standard permissions +CREATE ${CREATE:-owner=root group=disk mode=0660 auto=yes} + +# automatically tag new arrays as belonging to the local system +HOMEHOST ${HOMEHOST:-} + +# instruct the monitoring daemon where to send mail alerts +MAILADDR ${MAILADDR:-root} + +_eof + +if [ -n "${PROGRAM:-}" ]; then + cat <<-_eof + # program to run when mdadm monitor detects potentially interesting events + PROGRAM ${PROGRAM} + + _eof +fi + +error=0 +if [ ! -r /proc/mdstat ]; then + echo W: $ME: MD subsystem is not loaded, thus I cannot scan for arrays. >&2 + error=1 +elif [ ! -r /proc/partitions ]; then + echo W: $ME: /proc/partitions cannot be read, thus I cannot scan for arrays. >&2 + error=2 +else + echo "# definitions of existing MD arrays" + if ! $MDADM --examine --scan --config=partitions; then + error=$(($? + 128)) + echo W: $ME: failed to scan for partitions. >&2 + echo "### WARNING: scan failed." 
+ else + echo + fi +fi + +if [ $generate -eq 1 ]; then + cat <<_eof +# This file was auto-generated on $(date -R) +# by mkconf \$Id$ +_eof + + mkdir -p /var/lib/mdadm + md5sum $CONFIG > /var/lib/mdadm/mdadm.conf-generated +fi + +exit $error diff -Nru mdadm-2.6.7.1/debian/newdisk mdadm-3.1.4/debian/newdisk --- mdadm-2.6.7.1/debian/newdisk 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/newdisk 1970-01-01 02:00:00.000000000 +0200 @@ -1,243 +0,0 @@ -#!/bin/sh - -MY_VERSION="1.40" -# ---------------------------------------------------------------------------------------------------------------------- -# Linux MD (Soft)RAID Add Script - Add a (new) harddisk to another multi MD-array harddisk -# Last update: July 15, 2008 -# (C) Copyright 2005-2008 by Arno van Amersfoort -# Homepage : http://rocky.eld.leidenuniv.nl/ -# Email : a r n o v a AT r o c k y DOT e l d DOT l e i d e n u n i v DOT n l -# (note: you must remove all spaces and substitute the @ and the . at the proper locations!) -# ---------------------------------------------------------------------------------------------------------------------- -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# version 2 as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
-# ---------------------------------------------------------------------------------------------------------------------- - -show_help() -{ - echo "Bad or missing parameter(s)" - echo "Usage: $(basename $0) [ source_disk ] [ target_disk ] [ options ]" - echo "Options:" - echo "--force = Even proceed if target device does not appear empty" - echo "--noptupdate = Do NOT update the partition table on the target device (EXPERT!)" - echo "--nombrupdate = Do NOT update the MBR boot-loader on the target device (EXPERT!)" -} - - -echo "MDadd for SoftRAID-MDADM v$MY_VERSION" -echo "Written by Arno van Amersfoort" -echo "--------------------------------" - -if [ "$UID" != "0" ]; then - printf "\033[40m\033[1;31mERROR: Root check FAILED (you MUST be root to use this script)! Quitting...\n\033[0m" - exit 1 -fi - -if ! which mdadm 2>&1 >/dev/null; then - printf "\033[40m\033[1;31mERROR: Unable to find mdadm-binary! Quitting...\n\033[0m" - exit 2 -fi - -if ! which sfdisk 2>&1 >/dev/null; then - printf "\033[40m\033[1;31mERROR: Unable to find sfdisk-binary! Quitting...\n\033[0m" - exit 2 -fi - -if ! which dd 2>&1 >/dev/null; then - printf "\033[40m\033[1;31mERROR: Unable to find dd-binary! Quitting...\n\033[0m" - exit 2 -fi - -# Set environment variables to default -FORCE=0 -NOPTUPDATE=0 -NOMBRUPDATE=0 -SOURCE="" -TARGET="" - -# Check arguments -for arg in $*; do - ARGNAME="$(echo "$arg" |cut -d= -f1)" - ARGVAL="$(echo "$arg" |cut -d= -f2)" - - if ! 
echo "$ARGNAME" |grep -q "^-"; then - if [ -z "$SOURCE" ]; then - SOURCE="$ARGVAL" - else - if [ -z "$TARGET" ]; then - TARGET="$ARGVAL" - else - show_help; - exit 3 - fi - fi - else - case "$ARGNAME" in - --force|-force|-f) FORCE=1;; - --noptupdate|-noptupdate|--noptu|-noptu) NOPTUPDATE=1;; - --nombrupdate|-nombrupdate|--nombru|nombru) NOMBRUPDATE=1;; - --help) show_help; - exit 0;; - *) echo "ERROR: Bad argument: $ARGNAME"; - show_help; - exit 3;; - esac - fi -done - -if [ -z "$SOURCE" ] || [ -z "$TARGET" ]; then - echo "ERROR: Bad or missing argument(s)" - show_help; - exit 4 -fi - -if ! echo "$SOURCE" |grep -q '^/dev/'; then - printf "\033[40m\033[1;31mERROR: Source device $SOURCE does not start with /dev/! Quitting...\n\033[0m" - exit 5 -fi - -if ! echo "$TARGET" |grep -q '^/dev/'; then - printf "\033[40m\033[1;31mERROR: Target device $TARGET does not start with /dev/! Quitting...\n\033[0m" - exit 6 -fi - -if echo "$SOURCE" |grep -q 'md'; then - printf "\033[40m\033[1;31mERROR: The source device specified is an md-device! Quitting...\n\033[0m" - echo "A physical drive (part of the md-array('s)) is required as source device (ie. /dev/hda)!" - exit 7 -fi - -# We also want variables without /dev/ : -SOURCE_NODEV="$(echo "$SOURCE" |sed s,'^/dev/',,)" -TARGET_NODEV="$(echo "$TARGET" |sed s,'^/dev/',,)" - -if ! grep -q -e " $TARGET_NODEV " -e " $TARGET_NODEV$" /proc/partitions; then - printf "\033[40m\033[1;31mERROR: Target device $TARGET is NOT a valid target drive! Quitting...\n\033[0m" - exit 8 -fi - -if ! grep -q -e " $SOURCE_NODEV " -e " $SOURCE_NODEV$" /proc/partitions; then - printf "\033[40m\033[1;31mERROR: Source device $SOURCE is NOT a valid source drive! Quitting...\n\033[0m" - exit 9 -fi - -if ! grep -q -e " $SOURCE_NODEV[p,1..9]" /proc/partitions; then - printf "\033[40m\033[1;31mERROR: Source device $SOURCE does not contain any partitions!? 
Quitting...\n\033[0m" - exit 10 -fi - -if grep -q -e " $TARGET_NODEV[p,1..9]" /proc/partitions && [ "$FORCE" != "1" ]; then - printf "\033[40m\033[1;31mERROR: Target device $TARGET is NOT empty! Use --force to override. Quitting...\n\033[0m" - exit 11 -fi - -if grep -q -e " $TARGET_NODEV" /proc/mdstat; then - grep " $TARGET_NODEV" /proc/mdstat - printf "\033[40m\033[1;31mWARNING: Target device is already in use by an MD RAID array!\nPress any key to continue or CTRL-C to abort...\n\033[0m" - read -n 1 -fi - -# Create backup of partition table: -echo "--> Backing up partition table of target device $TARGET to /tmp/partitions.$TARGET_NODEV..." -sfdisk -d "$TARGET" >"/tmp/partitions.$TARGET_NODEV" - -# Disable all swaps on this disk -echo "--> Disabling any swap partitions on target device $TARGET" -grep "^$TARGET" /proc/swaps |awk '{ print $1 }' |while read SWAP; do - swapoff $SWAP 2>&1 >/dev/null -done - -#echo "--> Copying source device $SOURCE to target device $TARGET:" - - -if [ "$NOMBRUPDATE" != "1" ]; then - echo "--> Copying track0(containing MBR)..." - dd if="$SOURCE" of="$TARGET" bs=65536 count=1 -fi - -if [ "$NOPTUPDATE" != "1" ]; then - echo "--> Copying partition table from $SOURCE to $TARGET..." - sfdisk -d "$SOURCE" |sfdisk --force "$TARGET" -else - echo "--> Restoring partition table from /tmp/partitions.$TARGET_NODEV to $TARGET..." - sfdisk -d "$SOURCE" |sfdisk --force "$TARGET" -fi - -mdadm --detail --scan --verbose >/tmp/mdadm-detail-scan.txt -retval=$? 
-if [ "$retval" != "0" ]; then - printf "\033[40m\033[1;31mERROR: MDADM returned an error($retval) while determining detail information!\n\033[0m" - exit 12 -fi - -# Copy/build all md devices that exist on the source drive: -BOOT=0 -NO_ADD=1 -while read STRING; do - if echo "$STRING" |grep -q "^ARRAY "; then - MD_DEV=$(echo "$STRING" |awk '{ print $2 }') - fi - - if echo "$STRING" |grep -q "devices=.*$SOURCE"; then - NO_ADD=0 - PARTITION_NR="$(echo "$STRING" |sed -e s:".*devices=.*$SOURCE":"": -e s:",.*":"":)" - - if [ -z "$PARTITION_NR" ]; then - printf "\033[40m\033[1;31mERROR: Unable to retrieve detail information for $SOURCE from $MD_DEV!\n\033[0m" - exit 13 - fi - - if grep -q -e "^$MD_DEV.*/boot" -e "^$MD_DEV.*/.*1$" /etc/fstab; then - BOOT=1 - fi - - echo "" - echo "--> Adding $TARGET$PARTITION_NR to RAID array $MD_DEV:" - printf "\033[40m\033[1;31m" - mdadm --add "$MD_DEV" "$TARGET""$PARTITION_NR" - printf "\033[0m" - fi -done < /tmp/mdadm-detail-scan.txt - -echo "" - -# Create swapspace on partitions with ID=82 -echo "--> Creating swapspace on target device (if any swap partitions exist):" -sfdisk -d "$TARGET" |grep -i "Id=82" |awk '{ print $1 }' |while read SWAP_DEVICE; do - mkswap "$SWAP_DEVICE" - swapon "$SWAP_DEVICE" - - if ! grep -q "$SWAP_DEVICE.*none.*swap" /etc/fstab; then - printf "\033[40m\033[1;31mWARNING: /etc/fstab does NOT contain a (valid) swap entry for $SWAP_DEVICE\n\033[0m" - fi -done - -#echo "--> Showing current mdadm detail-scan (you may need to update your mdadm.conf (manually):" -#mdadm --detail --scan - -echo "--> Showing current /proc/mdstat (you may need to update your mdadm.conf (manually):" -cat /proc/mdstat -echo "" - -if [ "$NO_ADD" = "1" ]; then - printf "\033[40m\033[1;31mWARNING: No mdadm --add actions were performed, please investigate!\n\033[0m" -fi - -if [ "$BOOT" = "1" ]; then - printf "\033[40m\033[1;31mNOTE: Boot and/or root partition detected.\n You *MAY* need to reinstall your boot loader (ie. 
GRUB) on this device!\n\033[0m" -fi - -# TODO?: -# sanity check nopt (check if target device has a partition table)? -# detect if device has superblock (mdadm --examine /dev/sda1; echo $?)? -# continue ask (show what will be done):? diff -Nru mdadm-2.6.7.1/debian/NEWS mdadm-3.1.4/debian/NEWS --- mdadm-2.6.7.1/debian/NEWS 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/NEWS 2010-09-30 19:46:18.000000000 +0300 @@ -40,16 +40,16 @@ mdadm (2.4.1-5) unstable; urgency=low - This version drops the automatic generation of the /etc/mdadm/mdadm.conf - file on every boot (if it was missing). This means that you need to ensure + This version drops the automatic generation of the /etc/mdadm/mdadm.conf + file on every boot (if it was missing). This means that you need to ensure that you have a valid configuration file. If none is present during package configuration, mdadm *will* try to generate one, but it will only contain - information about arrays that were running at the time of package - configuration. Arrays not listed in the configuration file will *not* be + information about arrays that were running at the time of package + configuration. Arrays not listed in the configuration file will *not* be started automatically after boot (with the exception of the root partition). 
- If you want to recreate your configuration file, either figure out what it - should contain from the mdadm.conf(5) manpage, or simply assemble and run + If you want to recreate your configuration file, either figure out what it + should contain from the mdadm.conf(5) manpage, or simply assemble and run all the arrays the way you like it, then run /usr/share/mdadm/mkconf force-generate /etc/mdadm/mdadm.conf diff -Nru mdadm-2.6.7.1/debian/patches/contrib/docs/jd-rebuilding-raid.diff mdadm-3.1.4/debian/patches/contrib/docs/jd-rebuilding-raid.diff --- mdadm-2.6.7.1/debian/patches/contrib/docs/jd-rebuilding-raid.diff 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/patches/contrib/docs/jd-rebuilding-raid.diff 2010-09-03 12:06:50.000000000 +0300 @@ -0,0 +1,578 @@ +From: martin f. krafft +Subject: Article from David Pashley on rebuilding RAID +Origin: http://www.davidpashley.com/blog/linux/rebuilding-raid +Signed-off-by: martin f. krafft + +--- + docs/rebuilding-raid.html | 561 +++++++++++++++++++++++++++++++++++++++++++++ + 1 files changed, 561 insertions(+), 0 deletions(-) + +diff --git a/docs/rebuilding-raid.html b/docs/rebuilding-raid.html +new file mode 100644 +index 0000000..1d7b8c0 +--- /dev/null ++++ b/docs/rebuilding-raid.html +@@ -0,0 +1,561 @@ ++ ++ ++ ++ ++ ++ JD : /linux/rebuilding-raid ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++
++ ++ ++ ++
++ ++
++
++

Sat, 12 Jul 2008

++

Rebuilding a RAID array

I recently had a failed drive in my RAID1 array. I've just installed ++the replacement drive and thought I'd share the method.

Let's look at the current situation:

++root@ace:~# cat /proc/mdstat 
++Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5] [raid4] [raid10] 
++md1 : active raid1 sda3[1]
++      483403776 blocks [2/1] [_U]
++      
++md0 : active raid1 sda1[1]
++      96256 blocks [2/1] [_U]
++      
++unused devices: <none>
++

So we can see we have two mirrored arrays with one drive missing in both.

Let's see that we've recognised the second drive:

++root@ace:~# dmesg | grep sd
++[   21.465395] Driver 'sd' needs updating - please use bus_type methods
++[   21.465486] sd 2:0:0:0: [sda] 976773168 512-byte hardware sectors (500108 MB)
++[   21.465496] sd 2:0:0:0: [sda] Write Protect is off
++[   21.465498] sd 2:0:0:0: [sda] Mode Sense: 00 3a 00 00
++[   21.465512] sd 2:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
++[   21.465562] sd 2:0:0:0: [sda] 976773168 512-byte hardware sectors (500108 MB)
++[   21.465571] sd 2:0:0:0: [sda] Write Protect is off
++[   21.465573] sd 2:0:0:0: [sda] Mode Sense: 00 3a 00 00
++[   21.465587] sd 2:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
++[   21.465590]  sda: sda1 sda2 sda3
++[   21.487248] sd 2:0:0:0: [sda] Attached SCSI disk
++[   21.487303] sd 2:0:1:0: [sdb] 976773168 512-byte hardware sectors (500108 MB)
++[   21.487314] sd 2:0:1:0: [sdb] Write Protect is off
++[   21.487317] sd 2:0:1:0: [sdb] Mode Sense: 00 3a 00 00
++[   21.487331] sd 2:0:1:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
++[   21.487371] sd 2:0:1:0: [sdb] 976773168 512-byte hardware sectors (500108 MB)
++[   21.487381] sd 2:0:1:0: [sdb] Write Protect is off
++[   21.487382] sd 2:0:1:0: [sdb] Mode Sense: 00 3a 00 00
++[   21.487403] sd 2:0:1:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
++[   21.487407]  sdb: unknown partition table
++[   21.502763] sd 2:0:1:0: [sdb] Attached SCSI disk
++[   21.506690] sd 2:0:0:0: Attached scsi generic sg0 type 0
++[   21.506711] sd 2:0:1:0: Attached scsi generic sg1 type 0
++[   21.793835] md: bind<sda1>
++[   21.858027] md: bind<sda3>
++

So, sda has three partitions, sda1, sda2 and sda3, and sdb has no partition ++table. Let's give it one the same as sda. The easiest way to do this is using ++sfdisk:

++root@ace:~# sfdisk -d /dev/sda | sfdisk /dev/sdb
++Checking that no-one is using this disk right now ...
++OK
++
++Disk /dev/sdb: 60801 cylinders, 255 heads, 63 sectors/track
++
++sfdisk: ERROR: sector 0 does not have an MSDOS signature
++ /dev/sdb: unrecognised partition table type
++Old situation:
++No partitions found
++New situation:
++Units = sectors of 512 bytes, counting from 0
++
++   Device Boot    Start       End   #sectors  Id  System
++/dev/sdb1   *        63    192779     192717  fd  Linux RAID autodetect
++/dev/sdb2        192780   9960299    9767520  82  Linux swap / Solaris
++/dev/sdb3       9960300 976768064  966807765  fd  Linux RAID autodetect
++/dev/sdb4             0         -          0   0  Empty
++Successfully wrote the new partition table
++
++Re-reading the partition table ...
++
++If you created or changed a DOS partition, /dev/foo7, say, then use dd(1)
++to zero the first 512 bytes:  dd if=/dev/zero of=/dev/foo7 bs=512 count=1
++(See fdisk(8).)
++

If we look at dmesg now to confirm that it worked, we'll see:

++root@ace:~# dmesg | grep sd
++...
++[  224.246102] sd 2:0:1:0: [sdb] 976773168 512-byte hardware sectors (500108 MB)
++[  224.246322] sd 2:0:1:0: [sdb] Write Protect is off
++[  224.246325] sd 2:0:1:0: [sdb] Mode Sense: 00 3a 00 00
++[  224.246547] sd 2:0:1:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
++[  224.246686]  sdb: unknown partition table
++[  227.326278] sd 2:0:1:0: [sdb] 976773168 512-byte hardware sectors (500108 MB)
++[  227.326504] sd 2:0:1:0: [sdb] Write Protect is off
++[  227.326507] sd 2:0:1:0: [sdb] Mode Sense: 00 3a 00 00
++[  227.326703] sd 2:0:1:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
++[  227.326708]  sdb: sdb1 sdb2 sdb3
++

So, now we have identical partition tables. The next thing to do is to add the new partitions to the array:

++root@ace:~# mdadm /dev/md0 --add /dev/sdb1
++mdadm: added /dev/sdb1
++root@ace:~# mdadm /dev/md1 --add /dev/sdb3
++mdadm: added /dev/sdb3
++

Everything looks good. Let's check dmesg:

++[  323.941542] md: bind<sdb1>
++[  324.038183] RAID1 conf printout:
++[  324.038189]  --- wd:1 rd:2
++[  324.038192]  disk 0, wo:1, o:1, dev:sdb1
++[  324.038195]  disk 1, wo:0, o:1, dev:sda1
++[  324.038300] md: recovery of RAID array md0
++[  324.038303] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
++[  324.038305] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for recovery.
++[  324.038310] md: using 128k window, over a total of 96256 blocks.
++[  325.417219] md: md0: recovery done.
++[  325.453629] RAID1 conf printout:
++[  325.453632]  --- wd:2 rd:2
++[  325.453634]  disk 0, wo:0, o:1, dev:sdb1
++[  325.453636]  disk 1, wo:0, o:1, dev:sda1
++[  347.970105] md: bind<sdb3>
++[  348.004566] RAID1 conf printout:
++[  348.004571]  --- wd:1 rd:2
++[  348.004573]  disk 0, wo:1, o:1, dev:sdb3
++[  348.004574]  disk 1, wo:0, o:1, dev:sda3
++[  348.004657] md: recovery of RAID array md1
++[  348.004659] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
++[  348.004660] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for recovery.
++[  348.004664] md: using 128k window, over a total of 483403776 blocks.
++

Everything still looks good. Let's sit back and watch it rebuild using the wonderfully useful watch command:

++root@ace:~# watch -n 1 cat /proc/mdstat
++Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5] [raid4] [raid10] 
++md1 : active raid1 sdb3[2] sda3[1]
++      483403776 blocks [2/1] [_U]
++      [=====>...............]  recovery = 26.0% (126080960/483403776) finish=96.2min speed=61846K/sec
++      
++md0 : active raid1 sdb1[0] sda1[1]
++      96256 blocks [2/2] [UU]
++      
++unused devices: <none>
++

The Ubuntu and Debian installers will allow you to create RAID1 arrays ++with fewer drives than the array will eventually contain, so you can use this technique ++if you plan to add an additional drive after you've installed the ++system. Just tell it the eventual number of drives, but only select the ++available partitions during RAID setup. I used this method when a new machine recently ++didn't have enough SATA power cables and I had to wait for an adaptor to ++be delivered.

(Why did no one tell me about watch until recently? I wonder ++how many more incredibly useful programs I've not discovered even after 10 ++years of using Linux.)

++ [, , ] | # Read Comments (3) | ++ ++
++ ++
++ ++
++

Comments

++
++
++ One extra step that I do is install an MBR on the new disk, to make it bootable:
++
++install-mbr /dev/sdb
++ Posted by Marius Gedminas at Sun Jul 13 07:51:06 2008 ++
++
++ Great article!
++Maybe it would be even more useful if merged here:
++http://linux-raid.osdl.org/index.php/Reconstruction
++ Posted by Eduardo Pérez Ureta at Sun Jul 13 18:54:07 2008 ++
++
++ Semi-Tangential note about performance:  On my home (== partly "play") machine, I made the experience that "mdadm --manage .. --fail"-ing the root partition before doing lots of package upgrades (installing KDE 4/experimental and lots of other updates in my case, on a mostly etch system.  Dual screen support sucks if the screens don't have the same size, btw!) speeds up apt considerably, while the subsequent reconstruct step (--remove and then --add the partition) doesn't slow down the system much during light desktop workload.
++
++My system is a few years old (no SATA, probably not too much cache on the disks, too) and has only 512M RAM, so maybe a better equipped system would make this less noticeable.
++
++(... and no, I probably wouldn't force-fail part of my /home partition for any length of time :-)
++ Posted by cmot at Mon Jul 14 12:30:50 2008 ++
++
++
++
++

++ Name:
++
++
++ E-mail:
++
++
++ URL:
++ ++
++
++ Comment:
++
++
++ Please enter "fudge" to prove you are a human ++ ++
++ ++ ++

++
++
++
++
++
++ ++
++
++ ++ +-- +tg: (d28c1a7..) contrib/docs/jd-rebuilding-raid (depends on: mdadm-3.0.3) diff -Nru mdadm-2.6.7.1/debian/patches/contrib/docs/md.txt.diff mdadm-3.1.4/debian/patches/contrib/docs/md.txt.diff --- mdadm-2.6.7.1/debian/patches/contrib/docs/md.txt.diff 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/patches/contrib/docs/md.txt.diff 2010-09-03 12:06:50.000000000 +0300 @@ -0,0 +1,531 @@ +From: martin f. krafft +Subject: Import md.txt from the kernel + +This patch imports md.txt from the kernel repository. + +Origin: http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=blob_plain;f=Documentation/md.txt;hb=HEAD +Acked-by: martin f. krafft + +--- + docs/md.txt | 511 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 files changed, 511 insertions(+), 0 deletions(-) + +diff --git a/docs/md.txt b/docs/md.txt +new file mode 100644 +index 0000000..727238d +--- /dev/null ++++ b/docs/md.txt +@@ -0,0 +1,511 @@ ++# From: http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=blob_plain;f=Documentation/md.txt;hb=v2.6.31 ++ ++Tools that manage md devices can be found at ++ http://www..kernel.org/pub/linux/utils/raid/.... ++ ++ ++Boot time assembly of RAID arrays ++--------------------------------- ++ ++You can boot with your md device with the following kernel command ++lines: ++ ++for old raid arrays without persistent superblocks: ++ md=,,,,dev0,dev1,...,devn ++ ++for raid arrays with persistent superblocks ++ md=,dev0,dev1,...,devn ++or, to assemble a partitionable array: ++ md=d,dev0,dev1,...,devn ++ ++md device no. = the number of the md device ... ++ 0 means md0, ++ 1 md1, ++ 2 md2, ++ 3 md3, ++ 4 md4 ++ ++raid level = -1 linear mode ++ 0 striped mode ++ other modes are only supported with persistent super blocks ++ ++chunk size factor = (raid-0 and raid-1 only) ++ Set the chunk size as 4k << n. ++ ++fault level = totally ignored ++ ++dev0-devn: e.g. 
/dev/hda1,/dev/hdc1,/dev/sda1,/dev/sdb1 ++ ++A possible loadlin line (Harald Hoyer ) looks like this: ++ ++e:\loadlin\loadlin e:\zimage root=/dev/md0 md=0,0,4,0,/dev/hdb2,/dev/hdc3 ro ++ ++ ++Boot time autodetection of RAID arrays ++-------------------------------------- ++ ++When md is compiled into the kernel (not as module), partitions of ++type 0xfd are scanned and automatically assembled into RAID arrays. ++This autodetection may be suppressed with the kernel parameter ++"raid=noautodetect". As of kernel 2.6.9, only drives with a type 0 ++superblock can be autodetected and run at boot time. ++ ++The kernel parameter "raid=partitionable" (or "raid=part") means ++that all auto-detected arrays are assembled as partitionable. ++ ++Boot time assembly of degraded/dirty arrays ++------------------------------------------- ++ ++If a raid5 or raid6 array is both dirty and degraded, it could have ++undetectable data corruption. This is because the fact that it is ++'dirty' means that the parity cannot be trusted, and the fact that it ++is degraded means that some datablocks are missing and cannot reliably ++be reconstructed (due to no parity). ++ ++For this reason, md will normally refuse to start such an array. This ++requires the sysadmin to take action to explicitly start the array ++despite possible corruption. This is normally done with ++ mdadm --assemble --force .... ++ ++This option is not really available if the array has the root ++filesystem on it. In order to support this booting from such an ++array, md supports a module parameter "start_dirty_degraded" which, ++when set to 1, bypassed the checks and will allows dirty degraded ++arrays to be started. ++ ++So, to boot with a root filesystem of a dirty degraded raid[56], use ++ ++ md-mod.start_dirty_degraded=1 ++ ++ ++Superblock formats ++------------------ ++ ++The md driver can support a variety of different superblock formats. 
++Currently, it supports superblock formats "0.90.0" and the "md-1" format ++introduced in the 2.5 development series. ++ ++The kernel will autodetect which format superblock is being used. ++ ++Superblock format '0' is treated differently to others for legacy ++reasons - it is the original superblock format. ++ ++ ++General Rules - apply for all superblock formats ++------------------------------------------------ ++ ++An array is 'created' by writing appropriate superblocks to all ++devices. ++ ++It is 'assembled' by associating each of these devices with an ++particular md virtual device. Once it is completely assembled, it can ++be accessed. ++ ++An array should be created by a user-space tool. This will write ++superblocks to all devices. It will usually mark the array as ++'unclean', or with some devices missing so that the kernel md driver ++can create appropriate redundancy (copying in raid1, parity ++calculation in raid4/5). ++ ++When an array is assembled, it is first initialized with the ++SET_ARRAY_INFO ioctl. This contains, in particular, a major and minor ++version number. The major version number selects which superblock ++format is to be used. The minor number might be used to tune handling ++of the format, such as suggesting where on each device to look for the ++superblock. ++ ++Then each device is added using the ADD_NEW_DISK ioctl. This ++provides, in particular, a major and minor number identifying the ++device to add. ++ ++The array is started with the RUN_ARRAY ioctl. ++ ++Once started, new devices can be added. They should have an ++appropriate superblock written to them, and then passed be in with ++ADD_NEW_DISK. ++ ++Devices that have failed or are not yet active can be detached from an ++array using HOT_REMOVE_DISK. ++ ++ ++Specific Rules that apply to format-0 super block arrays, and ++ arrays with no superblock (non-persistent). 
++------------------------------------------------------------- ++ ++An array can be 'created' by describing the array (level, chunksize ++etc) in a SET_ARRAY_INFO ioctl. This must has major_version==0 and ++raid_disks != 0. ++ ++Then uninitialized devices can be added with ADD_NEW_DISK. The ++structure passed to ADD_NEW_DISK must specify the state of the device ++and it's role in the array. ++ ++Once started with RUN_ARRAY, uninitialized spares can be added with ++HOT_ADD_DISK. ++ ++ ++ ++MD devices in sysfs ++------------------- ++md devices appear in sysfs (/sys) as regular block devices, ++e.g. ++ /sys/block/md0 ++ ++Each 'md' device will contain a subdirectory called 'md' which ++contains further md-specific information about the device. ++ ++All md devices contain: ++ level ++ a text file indicating the 'raid level'. e.g. raid0, raid1, ++ raid5, linear, multipath, faulty. ++ If no raid level has been set yet (array is still being ++ assembled), the value will reflect whatever has been written ++ to it, which may be a name like the above, or may be a number ++ such as '0', '5', etc. ++ ++ raid_disks ++ a text file with a simple number indicating the number of devices ++ in a fully functional array. If this is not yet known, the file ++ will be empty. If an array is being resized this will contain ++ the new number of devices. ++ Some raid levels allow this value to be set while the array is ++ active. This will reconfigure the array. Otherwise it can only ++ be set while assembling an array. ++ A change to this attribute will not be permitted if it would ++ reduce the size of the array. To reduce the number of drives ++ in an e.g. raid5, the array size must first be reduced by ++ setting the 'array_size' attribute. ++ ++ chunk_size ++ This is the size in bytes for 'chunks' and is only relevant to ++ raid levels that involve striping (0,4,5,6,10). 
The address space ++ of the array is conceptually divided into chunks and consecutive ++ chunks are striped onto neighbouring devices. ++ The size should be at least PAGE_SIZE (4k) and should be a power ++ of 2. This can only be set while assembling an array ++ ++ layout ++ The "layout" for the array for the particular level. This is ++ simply a number that is interpretted differently by different ++ levels. It can be written while assembling an array. ++ ++ array_size ++ This can be used to artificially constrain the available space in ++ the array to be less than is actually available on the combined ++ devices. Writing a number (in Kilobytes) which is less than ++ the available size will set the size. Any reconfiguration of the ++ array (e.g. adding devices) will not cause the size to change. ++ Writing the word 'default' will cause the effective size of the ++ array to be whatever size is actually available based on ++ 'level', 'chunk_size' and 'component_size'. ++ ++ This can be used to reduce the size of the array before reducing ++ the number of devices in a raid4/5/6, or to support external ++ metadata formats which mandate such clipping. ++ ++ reshape_position ++ This is either "none" or a sector number within the devices of ++ the array where "reshape" is up to. If this is set, the three ++ attributes mentioned above (raid_disks, chunk_size, layout) can ++ potentially have 2 values, an old and a new value. If these ++ values differ, reading the attribute returns ++ new (old) ++ and writing will effect the 'new' value, leaving the 'old' ++ unchanged. ++ ++ component_size ++ For arrays with data redundancy (i.e. not raid0, linear, faulty, ++ multipath), all components must be the same size - or at least ++ there must a size that they all provide space for. This is a key ++ part or the geometry of the array. It is measured in sectors ++ and can be read from here. 
Writing to this value may resize ++ the array if the personality supports it (raid1, raid5, raid6), ++ and if the component drives are large enough. ++ ++ metadata_version ++ This indicates the format that is being used to record metadata ++ about the array. It can be 0.90 (traditional format), 1.0, 1.1, ++ 1.2 (newer format in varying locations) or "none" indicating that ++ the kernel isn't managing metadata at all. ++ Alternately it can be "external:" followed by a string which ++ is set by user-space. This indicates that metadata is managed ++ by a user-space program. Any device failure or other event that ++ requires a metadata update will cause array activity to be ++ suspended until the event is acknowledged. ++ ++ resync_start ++ The point at which resync should start. If no resync is needed, ++ this will be a very large number. At array creation it will ++ default to 0, though starting the array as 'clean' will ++ set it much larger. ++ ++ new_dev ++ This file can be written but not read. The value written should ++ be a block device number as major:minor. e.g. 8:0 ++ This will cause that device to be attached to the array, if it is ++ available. It will then appear at md/dev-XXX (depending on the ++ name of the device) and further configuration is then possible. ++ ++ safe_mode_delay ++ When an md array has seen no write requests for a certain period ++ of time, it will be marked as 'clean'. When another write ++ request arrives, the array is marked as 'dirty' before the write ++ commences. This is known as 'safe_mode'. ++ The 'certain period' is controlled by this file which stores the ++ period as a number of seconds. The default is 200msec (0.200). ++ Writing a value of 0 disables safemode. ++ ++ array_state ++ This file contains a single word which describes the current ++ state of the array. 
In many cases, the state can be set by ++ writing the word for the desired state, however some states ++ cannot be explicitly set, and some transitions are not allowed. ++ ++ Select/poll works on this file. All changes except between ++ active_idle and active (which can be frequent and are not ++ very interesting) are notified. active->active_idle is ++ reported if the metadata is externally managed. ++ ++ clear ++ No devices, no size, no level ++ Writing is equivalent to STOP_ARRAY ioctl ++ inactive ++ May have some settings, but array is not active ++ all IO results in error ++ When written, doesn't tear down array, but just stops it ++ suspended (not supported yet) ++ All IO requests will block. The array can be reconfigured. ++ Writing this, if accepted, will block until array is quiessent ++ readonly ++ no resync can happen. no superblocks get written. ++ write requests fail ++ read-auto ++ like readonly, but behaves like 'clean' on a write request. ++ ++ clean - no pending writes, but otherwise active. ++ When written to inactive array, starts without resync ++ If a write request arrives then ++ if metadata is known, mark 'dirty' and switch to 'active'. ++ if not known, block and switch to write-pending ++ If written to an active array that has pending writes, then fails. ++ active ++ fully active: IO and resync can be happening. ++ When written to inactive array, starts with resync ++ ++ write-pending ++ clean, but writes are blocked waiting for 'active' to be written. ++ ++ active-idle ++ like active, but no writes have been seen for a while (safe_mode_delay). ++ ++ ++As component devices are added to an md array, they appear in the 'md' ++directory as new directories named ++ dev-XXX ++where XXX is a name that the kernel knows for the device, e.g. hdb1. ++Each directory contains: ++ ++ block ++ a symlink to the block device in /sys/block, e.g. 
++ /sys/block/md0/md/dev-hdb1/block -> ../../../../block/hdb/hdb1 ++ ++ super ++ A file containing an image of the superblock read from, or ++ written to, that device. ++ ++ state ++ A file recording the current state of the device in the array ++ which can be a comma separated list of ++ faulty - device has been kicked from active use due to ++ a detected fault ++ in_sync - device is a fully in-sync member of the array ++ writemostly - device will only be subject to read ++ requests if there are no other options. ++ This applies only to raid1 arrays. ++ blocked - device has failed, metadata is "external", ++ and the failure hasn't been acknowledged yet. ++ Writes that would write to this device if ++ it were not faulty are blocked. ++ spare - device is working, but not a full member. ++ This includes spares that are in the process ++ of being recovered to ++ This list may grow in future. ++ This can be written to. ++ Writing "faulty" simulates a failure on the device. ++ Writing "remove" removes the device from the array. ++ Writing "writemostly" sets the writemostly flag. ++ Writing "-writemostly" clears the writemostly flag. ++ Writing "blocked" sets the "blocked" flag. ++ Writing "-blocked" clear the "blocked" flag and allows writes ++ to complete. ++ ++ This file responds to select/poll. Any change to 'faulty' ++ or 'blocked' causes an event. ++ ++ errors ++ An approximate count of read errors that have been detected on ++ this device but have not caused the device to be evicted from ++ the array (either because they were corrected or because they ++ happened while the array was read-only). When using version-1 ++ metadata, this value persists across restarts of the array. ++ ++ This value can be written while assembling an array thus ++ providing an ongoing count for arrays with metadata managed by ++ userspace. ++ ++ slot ++ This gives the role that the device has in the array. It will ++ either be 'none' if the device is not active in the array ++ (i.e. 
is a spare or has failed) or an integer less than the ++ 'raid_disks' number for the array indicating which position ++ it currently fills. This can only be set while assembling an ++ array. A device for which this is set is assumed to be working. ++ ++ offset ++ This gives the location in the device (in sectors from the ++ start) where data from the array will be stored. Any part of ++ the device before this offset us not touched, unless it is ++ used for storing metadata (Formats 1.1 and 1.2). ++ ++ size ++ The amount of the device, after the offset, that can be used ++ for storage of data. This will normally be the same as the ++ component_size. This can be written while assembling an ++ array. If a value less than the current component_size is ++ written, it will be rejected. ++ ++ ++An active md device will also contain and entry for each active device ++in the array. These are named ++ ++ rdNN ++ ++where 'NN' is the position in the array, starting from 0. ++So for a 3 drive array there will be rd0, rd1, rd2. ++These are symbolic links to the appropriate 'dev-XXX' entry. ++Thus, for example, ++ cat /sys/block/md*/md/rd*/state ++will show 'in_sync' on every line. ++ ++ ++ ++Active md devices for levels that support data redundancy (1,4,5,6) ++also have ++ ++ sync_action ++ a text file that can be used to monitor and control the rebuild ++ process. It contains one word which can be one of: ++ resync - redundancy is being recalculated after unclean ++ shutdown or creation ++ recover - a hot spare is being built to replace a ++ failed/missing device ++ idle - nothing is happening ++ check - A full check of redundancy was requested and is ++ happening. This reads all block and checks ++ them. A repair may also happen for some raid ++ levels. ++ repair - A full check and repair is happening. This is ++ similar to 'resync', but was requested by the ++ user, and the write-intent bitmap is NOT used to ++ optimise the process. 
++ ++ This file is writable, and each of the strings that could be ++ read are meaningful for writing. ++ ++ 'idle' will stop an active resync/recovery etc. There is no ++ guarantee that another resync/recovery may not be automatically ++ started again, though some event will be needed to trigger ++ this. ++ 'resync' or 'recovery' can be used to restart the ++ corresponding operation if it was stopped with 'idle'. ++ 'check' and 'repair' will start the appropriate process ++ providing the current state is 'idle'. ++ ++ This file responds to select/poll. Any important change in the value ++ triggers a poll event. Sometimes the value will briefly be ++ "recover" if a recovery seems to be needed, but cannot be ++ achieved. In that case, the transition to "recover" isn't ++ notified, but the transition away is. ++ ++ degraded ++ This contains a count of the number of devices by which the ++ arrays is degraded. So an optimal array with show '0'. A ++ single failed/missing drive will show '1', etc. ++ This file responds to select/poll, any increase or decrease ++ in the count of missing devices will trigger an event. ++ ++ mismatch_count ++ When performing 'check' and 'repair', and possibly when ++ performing 'resync', md will count the number of errors that are ++ found. The count in 'mismatch_cnt' is the number of sectors ++ that were re-written, or (for 'check') would have been ++ re-written. As most raid levels work in units of pages rather ++ than sectors, this my be larger than the number of actual errors ++ by a factor of the number of sectors in a page. ++ ++ bitmap_set_bits ++ If the array has a write-intent bitmap, then writing to this ++ attribute can set bits in the bitmap, indicating that a resync ++ would need to check the corresponding blocks. Either individual ++ numbers or start-end pairs can be written. Multiple numbers ++ can be separated by a space. ++ Note that the numbers are 'bit' numbers, not 'block' numbers. 
++ They should be scaled by the bitmap_chunksize. ++ ++ sync_speed_min ++ sync_speed_max ++ This are similar to /proc/sys/dev/raid/speed_limit_{min,max} ++ however they only apply to the particular array. ++ If no value has been written to these, of if the word 'system' ++ is written, then the system-wide value is used. If a value, ++ in kibibytes-per-second is written, then it is used. ++ When the files are read, they show the currently active value ++ followed by "(local)" or "(system)" depending on whether it is ++ a locally set or system-wide value. ++ ++ sync_completed ++ This shows the number of sectors that have been completed of ++ whatever the current sync_action is, followed by the number of ++ sectors in total that could need to be processed. The two ++ numbers are separated by a '/' thus effectively showing one ++ value, a fraction of the process that is complete. ++ A 'select' on this attribute will return when resync completes, ++ when it reaches the current sync_max (below) and possibly at ++ other times. ++ ++ sync_max ++ This is a number of sectors at which point a resync/recovery ++ process will pause. When a resync is active, the value can ++ only ever be increased, never decreased. The value of 'max' ++ effectively disables the limit. ++ ++ ++ sync_speed ++ This shows the current actual speed, in K/sec, of the current ++ sync_action. It is averaged over the last 30 seconds. ++ ++ suspend_lo ++ suspend_hi ++ The two values, given as numbers of sectors, indicate a range ++ within the array where IO will be blocked. This is currently ++ only supported for raid4/5/6. ++ ++ ++Each active md device may also have attributes specific to the ++personality module that manages it. ++These are specific to the implementation of the module and could ++change substantially if the implementation changes. ++ ++These currently include ++ ++ stripe_cache_size (currently raid5 only) ++ number of entries in the stripe cache. 
This is writable, but ++ there are upper and lower limits (32768, 16). Default is 128. ++ strip_cache_active (currently raid5 only) ++ number of active entries in the stripe cache ++ preread_bypass_threshold (currently raid5 only) ++ number of times a stripe requiring preread will be bypassed by ++ a stripe that does not require preread. For fairness defaults ++ to 1. Setting this to 0 disables bypass accounting and ++ requires preread stripes to wait until all full-width stripe- ++ writes are complete. Valid values are 0 to stripe_cache_size. +-- +tg: (d28c1a7..) contrib/docs/md.txt (depends on: mdadm-3.0.3) diff -Nru mdadm-2.6.7.1/debian/patches/contrib/docs/raid5-vs-raid10.diff mdadm-3.1.4/debian/patches/contrib/docs/raid5-vs-raid10.diff --- mdadm-2.6.7.1/debian/patches/contrib/docs/raid5-vs-raid10.diff 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/patches/contrib/docs/raid5-vs-raid10.diff 2010-09-03 12:06:50.000000000 +0300 @@ -0,0 +1,197 @@ +From: martin f. krafft +Subject: RAID5 versus RAID10 (anti-RAID5) document + +A document presenting a case against RAID[345]. + +Signed-off-by: martin f. krafft +Origin: http://www.miracleas.com/BAARF/RAID5_versus_RAID10.txt + +--- + docs/RAID5_versus_RAID10.txt | 177 ++++++++++++++++++++++++++++++++++++++++++ + 1 files changed, 177 insertions(+), 0 deletions(-) + +diff --git a/docs/RAID5_versus_RAID10.txt b/docs/RAID5_versus_RAID10.txt +new file mode 100644 +index 0000000..ac39afb +--- /dev/null ++++ b/docs/RAID5_versus_RAID10.txt +@@ -0,0 +1,177 @@ ++# from http://www.miracleas.com/BAARF/RAID5_versus_RAID10.txt ++# also see http://www.miracleas.com/BAARF/BAARF2.html ++# ++# Note: I, the Debian maintainer, do not agree with some of the arguments, ++# especially not with the total condemning of RAID5. Anyone who talks about ++# data loss and blames the RAID system should spend time reading up on Backups ++# instead of trying to evangelise, but that's only my opinion. 
RAID5 has its ++# merits and its shortcomings, just like any other method. However, the author ++# of this argument puts forth a good case and thus I am including the ++# document. Remember that you're the only one that can decide which RAID level ++# to use. ++# ++ ++RAID5 versus RAID10 (or even RAID3 or RAID4) ++ ++First let's get on the same page so we're all talking about apples. ++ ++What is RAID5? ++ ++OK here is the deal, RAID5 uses ONLY ONE parity drive per stripe and many ++RAID5 arrays are 5 (if your counts are different adjust the calculations ++appropriately) drives (4 data and 1 parity though it is not a single drive ++that is holding all of the parity as in RAID 3 & 4 but read on). If you ++have 10 drives or say 20GB each for 200GB RAID5 will use 20% for parity ++(assuming you set it up as two 5 drive arrays) so you will have 160GB of ++storage. Now since RAID10, like mirroring (RAID1), uses 1 (or more) mirror ++drive for each primary drive you are using 50% for redundancy so to get the ++same 160GB of storage you will need 8 pairs or 16 - 20GB drives, which is ++why RAID5 is so popular. This intro is just to put things into ++perspective. ++ ++RAID5 is physically a stripe set like RAID0 but with data recovery ++included. RAID5 reserves one disk block out of each stripe block for ++parity data. The parity block contains an error correction code which can ++correct any error in the RAID5 block, in effect it is used in combination ++with the remaining data blocks to recreate any single missing block, gone ++missing because a drive has failed. The innovation of RAID5 over RAID3 & ++RAID4 is that the parity is distributed on a round robin basis so that ++there can be independent reading of different blocks from the several ++drives. This is why RAID5 became more popular than RAID3 & RAID4 which ++must sychronously read the same block from all drives together. 
So, if ++Drive2 fails blocks 1,2,4,5,6 & 7 are data blocks on this drive and blocks ++3 and 8 are parity blocks on this drive. So that means that the parity on ++Drive5 will be used to recreate the data block from Disk2 if block 1 is ++requested before a new drive replaces Drive2 or during the rebuilding of ++the new Drive2 replacement. Likewise the parity on Drive1 will be used to ++repair block 2 and the parity on Drive3 will repair block4, etc. For block ++2 all the data is safely on the remaining drives but during the rebuilding ++of Drive2's replacement a new parity block will be calculated from the ++block 2 data and will be written to Drive 2. ++ ++Now when a disk block is read from the array the RAID software/firmware ++calculates which RAID block contains the disk block, which drive the disk ++block is on and which drive contains the parity block for that RAID block ++and reads ONLY the one data drive. It returns the data block. If you ++later modify the data block it recalculates the parity by subtracting the ++old block and adding in the new version then in two separate operations it ++writes the data block followed by the new parity block. To do this it must ++first read the parity block from whichever drive contains the parity for ++that stripe block and reread the unmodified data for the updated block from ++the original drive. This read-read-write-write is known as the RAID5 write ++penalty since these two writes are sequential and synchronous the write ++system call cannot return until the reread and both writes complete, for ++safety, so writing to RAID5 is up to 50% slower than RAID0 for an array of ++the same capacity. (Some software RAID5's avoid the re-read by keeping an ++unmodified copy of the orginal block in memory.) ++ ++Now what is RAID10: ++ ++RAID10 is one of the combinations of RAID1 (mirroring) and RAID0 ++(striping) which are possible. 
There used to be confusion about what ++RAID10 or RAID01 meant and different RAID vendors defined them ++differently. About five years or so ago I proposed the following standard ++language which seems to have taken hold. When N mirrored pairs are ++striped together this is called RAID10 because the mirroring (RAID1) is ++applied before striping (RAID0). The other option is to create two stripe ++sets and mirror them one to the other, this is known as RAID01 (because ++the RAID0 is applied first). In either a RAID01 or RAID10 system each and ++every disk block is completely duplicated on its drive's mirror. ++Performance-wise both RAID01 and RAID10 are functionally equivalent. The ++difference comes in during recovery where RAID01 suffers from some of the ++same problems I will describe affecting RAID5 while RAID10 does not. ++ ++Now if a drive in the RAID5 array dies, is removed, or is shut off data is ++returned by reading the blocks from the remaining drives and calculating ++the missing data using the parity, assuming the defunct drive is not the ++parity block drive for that RAID block. Note that it takes 4 physical ++reads to replace the missing disk block (for a 5 drive array) for four out ++of every five disk blocks leading to a 64% performance degradation until ++the problem is discovered and a new drive can be mapped in to begin ++recovery. Performance is degraded further during recovery because all ++drives are being actively accessed in order to rebuild the replacement ++drive (see below). ++ ++If a drive in the RAID10 array dies data is returned from its mirror drive ++in a single read with only minor (6.25% on average for a 4 pair array as a ++whole) performance reduction when two non-contiguous blocks are needed from ++the damaged pair (since the two blocks cannot be read in parallel from both ++drives) and none otherwise. 
++ ++One begins to get an inkling of what is going on and why I dislike RAID5, ++but, as they say on late night info-mercials, there's more. ++ ++What's wrong besides a bit of performance I don't know I'm missing? ++ ++OK, so that brings us to the final question of the day which is: What is ++the problem with RAID5? It does recover a failed drive right? So writes ++are slower, I don't do enough writing to worry about it and the cache ++helps a lot also, I've got LOTS of cache! The problem is that despite the ++improved reliability of modern drives and the improved error correction ++codes on most drives, and even despite the additional 8 bytes of error ++correction that EMC puts on every Clariion drive disk block (if you are ++lucky enough to use EMC systems), it is more than a little possible that a ++drive will become flaky and begin to return garbage. This is known as ++partial media failure. Now SCSI controllers reserve several hundred disk ++blocks to be remapped to replace fading sectors with unused ones, but if ++the drive is going these will not last very long and will run out and SCSI ++does NOT report correctable errors back to the OS! Therefore you will not ++know the drive is becoming unstable until it is too late and there are no ++more replacement sectors and the drive begins to return garbage. [Note ++that the recently popular IDE/ATA drives do not (TMK) include bad sector ++remapping in their hardware so garbage is returned that much sooner.] ++When a drive returns garbage, since RAID5 does not EVER check parity on ++read (RAID3 & RAID4 do BTW and both perform better for databases than ++RAID5 to boot) when you write the garbage sector back garbage parity will ++be calculated and your RAID5 integrity is lost! Similarly if a drive ++fails and one of the remaining drives is flaky the replacement will be ++rebuilt with garbage also propagating the problem to two blocks instead of ++just one. ++ ++Need more? 
During recovery, read performance for a RAID5 array is ++degraded by as much as 80%. Some advanced arrays let you configure the ++preference more toward recovery or toward performance. However, doing so ++will increase recovery time and increase the likelihood of losing a second ++drive in the array before recovery completes resulting in catastrophic ++data loss. RAID10 on the other hand will only be recovering one drive out ++of 4 or more pairs with performance ONLY of reads from the recovering pair ++degraded making the performance hit to the array overall only about 20%! ++Plus there is no parity calculation time used during recovery - it's a ++straight data copy. ++ ++What about that thing about losing a second drive? Well with RAID10 there ++is no danger unless the one mirror that is recovering also fails and ++that's 80% or more less likely than that any other drive in a RAID5 array ++will fail! And since most multiple drive failures are caused by ++undetected manufacturing defects you can make even this possibility ++vanishingly small by making sure to mirror every drive with one from a ++different manufacturer's lot number. ("Oh", you say, "this scenario does ++not seem likely!" Pooh, we lost 50 drives over two weeks when a batch of ++200 IBM drives began to fail. IBM discovered that the single lot of ++drives would have their spindle bearings freeze after so many hours of ++operation. Fortunately due in part to RAID10 and in part to a herculean ++effort by DG techs and our own people over 2 weeks no data was lost. ++HOWEVER, one RAID5 filesystem was a total loss after a second drive failed ++during recovery. Fortunately everything was on tape.) ++ ++Conclusion? For safety and performance favor RAID10 first, RAID3 second, ++RAID4 third, and RAID5 last! The original reason for the RAID2-5 specs ++was that the high cost of disks was making RAID1, mirroring, impractical. ++That is no longer the case!
Drives are commodity priced, even the biggest ++fastest drives are cheaper in absolute dollars than drives were then and ++cost per MB is a tiny fraction of what it was. Does RAID5 make ANY sense ++anymore? Obviously I think not. ++ ++To put things into perspective: If a drive costs $1000US (and most are far ++less expensive than that) then switching from a 4 pair RAID10 array to a 5 ++drive RAID5 array will save 3 drives or $3000US. What is the cost of ++overtime, wear and tear on the technicians, DBAs, managers, and customers ++of even a recovery scare? What is the cost of reduced performance and ++possibly reduced customer satisfaction? Finally what is the cost of lost ++business if data is unrecoverable? I maintain that the drives are FAR ++cheaper! Hence my mantra: ++ ++NO RAID5! NO RAID5! NO RAID5! NO RAID5! NO RAID5! NO RAID5! NO RAID5! ++ ++Art S. Kagel ++ +-- +tg: (d28c1a7..) contrib/docs/raid5-vs-raid10 (depends on: mdadm-3.0.3) diff -Nru mdadm-2.6.7.1/debian/patches/contrib/docs/superblock_formats.diff mdadm-3.1.4/debian/patches/contrib/docs/superblock_formats.diff --- mdadm-2.6.7.1/debian/patches/contrib/docs/superblock_formats.diff 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/patches/contrib/docs/superblock_formats.diff 2010-09-03 12:06:50.000000000 +0300 @@ -0,0 +1,555 @@ +From: martin f. krafft +Subject: Document on RAID superblock formats + +This document describes the three different superblock formats in use by Linux +md. + +Acked-by: martin f. 
krafft +Origin: http://linux-raid.osdl.org/index.php/RAID_superblock_formats + +--- + docs/md_superblock_formats.txt | 534 ++++++++++++++++++++++++++++++++++++++++ + 1 files changed, 534 insertions(+), 0 deletions(-) + +diff --git a/docs/md_superblock_formats.txt b/docs/md_superblock_formats.txt +new file mode 100644 +index 0000000..f9c3eb8 +--- /dev/null ++++ b/docs/md_superblock_formats.txt +@@ -0,0 +1,534 @@ ++# From: http://linux-raid.osdl.org/index.php/RAID_superblock_formats ++ ++RAID superblock formats ++ ++From Linux-raid ++ ++Jump to: navigation, search ++ ++Contents ++ ++ • 1 RAID superblock formats ++ â–¡ 1.1 The version-0.90 Superblock Format ++ â–¡ 1.2 The version-1 Superblock Format ++ â–¡ 1.3 Sub-versions of the version-1 superblock ++ â–¡ 1.4 The version-1 superblock format on-disk layout ++ ☆ 1.4.1 Total Size of superblock ++ ☆ 1.4.2 Section: Superblock/"Magic-Number" Identification area ++ ☆ 1.4.3 Section: Per-Array Identification & Configuration area ++ ☆ 1.4.4 Section: RAID-Reshape In-Process Metadata Storage/Recovery ++ area ++ ☆ 1.4.5 Section: This-Component-Device Information area ++ ☆ 1.4.6 Section: Array-State Information area ++ ☆ 1.4.7 Section: Device-Roles (Positions-in-Array) area ++ ++[edit] ++ ++RAID superblock formats ++ ++Currently, the Linux RAID subsystem recognizes two distinct variant ++superblocks. ++ ++They are known as "version-0.90" and "version-1" Superblock formats. ++ ++[edit] ++ ++The version-0.90 Superblock Format ++ ++The version-0.90 superblock format has several limitations. It limits the ++number of component devices within an array to 28, and limits each component ++device to a maximum size of 2TB. ++ ++[edit] ++ ++The version-1 Superblock Format ++ ++The version-1 superblock format represents a more-expandable format, capable of ++supporting arrays with 384+ devices, with 64-bit sector lengths. 
++ ++[edit] ++ ++Sub-versions of the version-1 superblock ++ ++The "version-1" superblock format is currently used in three different ++"sub-versions". ++ ++The sub-versions differ primarily (solely?) in the location on each component ++device at which they actually store the superblock. ++ ++┌───────────┬───────────────────────────────────┠++│Sub-Version│ Superblock Position on Device │ ++├───────────┼───────────────────────────────────┤ ++│1.0 │At the end of the device │ ++├───────────┼───────────────────────────────────┤ ++│1.1 │At the beginning of the device │ ++├───────────┼───────────────────────────────────┤ ++│1.2 │4K from the beginning of the device│ ++└───────────┴───────────────────────────────────┘ ++[edit] ++ ++The version-1 superblock format on-disk layout ++ ++[edit] ++ ++Total Size of superblock ++ ++Total Size of superblock: 256 Bytes, plus 2 bytes per device in the array ++ ++[edit] ++ ++Section: Superblock/"Magic-Number" Identification area ++ ++16 Bytes, Offset 0-15 (0x00 - 0x0F) ++ ++┌──────┬──────┬──────┬─────────────┬───────────┬─────┬──────────────────────────┬───────┠++│Offset│Offset│Length│ │ Usage/ │Data │ │ │ ++│(Hex) │(Dec) │ (in │ Field Name │ Meaning │Type │ Data Value │ Notes │ ++│ │ │bytes)│ │ │ │ │ │ ++├──────┼──────┼──────┼─────────────┼───────────┼─────┼──────────────────────────┼───────┤ ++│ │ │ │ │"Magic │ │ │ │ ++│0x00 -│0 - 3 │4 │magic │Number" │__u32│0xa92b4efc │ │ ++│0x03 │ │ │ │(Superblock│ │(little-endian) │ │ ++│ │ │ │ │ID) │ │ │ │ ++├──────┼──────┼──────┼─────────────┼───────────┼─────┼──────────────────────────┼───────┤ ++│ │ │ │ │Major │ │ │ │ ++│0x04 -│4 - 7 │4 │major_version│Version │__u32│1 │ │ ++│0x07 │ │ │ │of the │ │ │ │ ++│ │ │ │ │Superblock │ │ │ │ ++├──────┼──────┼──────┼─────────────┼───────────┼─────┼──────────────────────────┼───────┤ ++│ │ │ │ │ │ │0 │ │ ++│ │ │ │ │ │ │Bit-Mapped Field │ │ ++│ │ │ │ │ │ │ │ │ ++│ │ │ │ │ │ │┌─────┬──────────────────â”│ │ ++│ │ │ │ │ │ ││ Bit │ Meaning ││ │ ++│ │ │ │ │ │ 
││Value│ ││ │ ++│ │ │ │ │ │ │├─────┼──────────────────┤│ │ ++│ │ │ │ │ │ ││1 │RAID Bitmap is ││ │ ++│ │ │ │ │ │ ││ │used ││ │ ++│ │ │ │ │ │ │├─────┼──────────────────┤│ │ ++│ │ │ │ │Feature Map│ ││ │RAID Recovery is ││ │ ++│ │ │ │ │- which │ ││2 │in progress ││ │ ++│ │ │ │ │extended │ ││ │(See ││ │ ++│ │ │ │ │features │ ││ │"recovery_offset")││ │ ++│ │ │ │ │(such as │ │├─────┼──────────────────┤│ │ ++│0x08 -│ │ │ │volume │ ││4 │RAID Reshape is in││ │ ++│0x0B │8 - 11│4 │feature_map │bitmaps, │__u32││ │progress ││ │ ++│ │ │ │ │recovery, │ │├─────┼──────────────────┤│ │ ++│ │ │ │ │or reshape)│ ││8 │undefined/reserved││ │ ++│ │ │ │ │are in use │ ││ │(0) ││ │ ++│ │ │ │ │on this │ │├─────┼──────────────────┤│ │ ++│ │ │ │ │array │ ││16 │undefined/reserved││ │ ++│ │ │ │ │ │ ││ │(0) ││ │ ++│ │ │ │ │ │ │├─────┼──────────────────┤│ │ ++│ │ │ │ │ │ ││32 │undefined/reserved││ │ ++│ │ │ │ │ │ ││ │(0) ││ │ ++│ │ │ │ │ │ │├─────┼──────────────────┤│ │ ++│ │ │ │ │ │ ││64 │undefined/reserved││ │ ++│ │ │ │ │ │ ││ │(0) ││ │ ++│ │ │ │ │ │ │├─────┼──────────────────┤│ │ ++│ │ │ │ │ │ ││128 │undefined/reserved││ │ ++│ │ │ │ │ │ ││ │(0) ││ │ ++│ │ │ │ │ │ │└─────┴──────────────────┘│ │ ++├──────┼──────┼──────┼─────────────┼───────────┼─────┼──────────────────────────┼───────┤ ++│ │ │ │ │ │ │ │Always │ ++│0x0C -│12 - │ │ │Padding │ │ │set to │ ++│0x0F │15 │4 │pad0 │Block 0 │__u32│0 │zero │ ++│ │ │ │ │ │ │ │when │ ++│ │ │ │ │ │ │ │writing│ ++└──────┴──────┴──────┴─────────────┴───────────┴─────┴──────────────────────────┴───────┘ ++ ++ ++[edit] ++ ++Section: Per-Array Identification & Configuration area ++ ++48 Bytes, Offset 16-63 (0x10 - 0x3F) ++ ++┌──────┬──────┬──────┬─────────────┬──────────┬─────┬────────────────┬───────────┠++│Offset│Offset│Length│ │ Usage/ │Data │ │ │ ++│(Hex) │(Dec) │ (in │ Field Name │ Meaning │Type │ Data Value │ Notes │ ++│ │ │bytes)│ │ │ │ │ │ ++├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ ++│0x10 -│16 - │ │ │UUID for │__u8 
│Set by │ │ ++│0x1F │31 │16 │set_uuid │the Array │[16] │user-space │ │ ++│ │ │ │ │(?) │ │formatting util │ │ ++├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ ++│0x20 -│32 - │ │ │Name for │char │Set and used by │ │ ++│0x3F │63 │32 │set_name │the Array │[32] │user-space utils│Nt │ ++│ │ │ │ │(?) │ │ │ │ ++├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ ++│ │ │ │ │ │ │low 40-bits are │ │ ++│0x40 -│64 - │8 │ctime │Creation │__u64│seconds │ │ ++│0x47 │71 │ │ │Time(?) │ │high 24-bits are│ │ ++│ │ │ │ │ │ │uSeconds │ │ ++├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ ++│ │ │ │ │ │ │┌──┬───────────â”│ │ ++│ │ │ │ │ │ ││-4│Multi-Path ││ │ ++│ │ │ │ │ │ │├──┼───────────┤│ │ ++│ │ │ │ │ │ ││-1│Linear ││ │ ++│ │ │ │ │ │ │├──┼───────────┤│ │ ++│ │ │ │ │ │ ││0 │RAID-0 ││ │ ++│ │ │ │ │ │ ││ │(Striped) ││ │ ++│ │ │ │ │ │ │├──┼───────────┤│ │ ++│ │ │ │ │ │ ││1 │RAID-1 ││ │ ++│ │ │ │ │ │ ││ │(Mirrored) ││mdadm │ ++│ │ │ │ │ │ │├──┼───────────┤│versions │ ++│ │ │ │ │ │ ││ │RAID-4 ││(as of │ ++│ │ │ │ │ │ ││ │(Striped ││v2.6.4) │ ++│0x48 -│72 - │ │ │RAID Level│ ││4 │with ││limit │ ++│0x4B │75 │4 │level │of the │__u32││ │Dedicated ││RAID-6 │ ++│ │ │ │ │Array │ ││ │Block-Level││(creation) │ ++│ │ │ │ │ │ ││ │Parity) ││to 256 │ ++│ │ │ │ │ │ │├──┼───────────┤│disks or │ ++│ │ │ │ │ │ ││ │RAID-5 ││less │ ++│ │ │ │ │ │ ││ │(Striped ││ │ ++│ │ │ │ │ │ ││5 │with ││ │ ++│ │ │ │ │ │ ││ │Distributed││ │ ++│ │ │ │ │ │ ││ │Parity) ││ │ ++│ │ │ │ │ │ │├──┼───────────┤│ │ ++│ │ │ │ │ │ ││ │RAID-6 ││ │ ++│ │ │ │ │ │ ││6 │(Striped ││ │ ++│ │ │ │ │ │ ││ │with Dual ││ │ ++│ │ │ │ │ │ ││ │Parity) ││ │ ++│ │ │ │ │ │ │└──┴───────────┘│ │ ++├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ ++│ │ │ │ │ │ │┌─┬────────────â”│ │ ++│ │ │ │ │ │ ││0│left ││ │ ++│ │ │ │ │ │ ││ │asymmetric ││ │ ++│ │ │ │ │ │ │├─┼────────────┤│Controls │ ++│ │ │ │ │ │ ││1│right ││the │ ++│ │ │ │ 
│layout of │ ││ │asymmetric ││relative │ ++│0x4C -│76 - │4 │layout │array │__u32│├─┼────────────┤│arrangement│ ++│0x4F │79 │ │ │(RAID5(and│ ││ │left ││of data and│ ++│ │ │ │ │6?) only) │ ││2│symmetric ││parity │ ++│ │ │ │ │ │ ││ │(default) ││blocks on │ ++│ │ │ │ │ │ │├─┼────────────┤│the disks. │ ++│ │ │ │ │ │ ││3│right ││ │ ++│ │ │ │ │ │ ││ │symmetric ││ │ ++│ │ │ │ │ │ │└─┴────────────┘│ │ ++├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ ++│ │ │ │ │ │ │size of │ │ ++│ │ │ │ │used-size │ │component │ │ ++│0x50 -│80 - │8 │size │of │__u64│devices │ │ ++│0x57 │87 │ │ │component │ │(in # of │ │ ++│ │ │ │ │devices │ │512-byte │ │ ++│ │ │ │ │ │ │sectors) │ │ ++├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ ++│ │ │ │ │ │ │ │default is │ ++│ │ │ │ │ │ │ │64K? for │ ++│ │ │ │ │ │ │ │raid levels│ ++│ │ │ │ │ │ │ │0, 10, 4, │ ++│ │ │ │ │ │ │ │5, and 6 │ ++│ │ │ │ │ │ │ │chunksize │ ++│ │ │ │ │ │ │ │not used in│ ++│ │ │ │ │ │ │ │raid levels│ ++│ │ │ │ │ │ │chunk-size of │1, linear, │ ++│ │ │ │ │chunk-size│ │the array │and │ ++│0x58 -│88 - │4 │chunksize │of the │__u32│(in # of │multi-path │ ++│0x5B │91 │ │ │array │ │512-byte │ │ ++│ │ │ │ │ │ │sectors) │Note: │ ++│ │ │ │ │ │ │ │During │ ++│ │ │ │ │ │ │ │creation │ ++│ │ │ │ │ │ │ │this │ ++│ │ │ │ │ │ │ │appears to │ ++│ │ │ │ │ │ │ │be created │ ++│ │ │ │ │ │ │ │as a │ ++│ │ │ │ │ │ │ │multiple of│ ++│ │ │ │ │ │ │ │1024 rather│ ++│ │ │ │ │ │ │ │than 512. │ ++├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ ++│ │ │ │ │ │ │ │raid4 │ ++│ │ │ │ │ │ │ │requires a │ ++│ │ │ │ │ │ │ │minimum of │ ++│ │ │ │ │ │ │ │2 member │ ++│ │ │ │ │ │ │ │devs │ ++│ │ │ │ │ │ │ │raid5 │ ++│ │ │ │ │ │ │ │requires a │ ++│ │ │ │ │ │ │ │minimum of │ ++│ │ │ │ │(?)number │ │ │2 member │ ++│0x5C -│92 - │4 │raid_disks │of disks │__u32│# │devs │ ++│0x5F │95 │ │ │in array │ │ │raid6 │ ++│ │ │ │ │(?) 
│ │ │requires a │ ++│ │ │ │ │ │ │ │minimum of │ ++│ │ │ │ │ │ │ │4 member │ ++│ │ │ │ │ │ │ │devs │ ++│ │ │ │ │ │ │ │raid6 │ ++│ │ │ │ │ │ │ │limited to │ ++│ │ │ │ │ │ │ │a max of │ ++│ │ │ │ │ │ │ │256 member │ ++│ │ │ │ │ │ │ │devs │ ++├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ ++│ │ │ │ │ │ │ │This is │ ++│ │ │ │ │# of │ │ │only valid │ ++│ │ │ │ │sectors │ │ │if │ ++│ │ │ │ │after │ │ │feature_map│ ++│ │ │ │ │superblock│ │ │[1] is set │ ++│ │ │ │ │that │ │ │ │ ++│0x60 -│96 - │4 │bitmap_offset│bitmap │__u32│(signed) │Signed │ ++│0x63 │99 │ │ │starts │ │ │value │ ++│ │ │ │ │(See note │ │ │allows │ ++│ │ │ │ │about │ │ │bitmap │ ++│ │ │ │ │signed │ │ │to appear │ ++│ │ │ │ │value) │ │ │before │ ++│ │ │ │ │ │ │ │superblock │ ++│ │ │ │ │ │ │ │on the disk│ ++└──────┴──────┴──────┴─────────────┴──────────┴─────┴────────────────┴───────────┘ ++ ++ ++[edit] ++ ++Section: RAID-Reshape In-Process Metadata Storage/Recovery area ++ ++64 Bytes, Offset 100-163 (0x64 - 0x7F) ++(Note: Only contains valid data if feature_map bit '4' is set) ++ ++┌──────┬──────┬──────┬────────────────┬───────────┬─────┬─────────────┬───────┠++│Offset│Offset│Length│ │ Usage/ │Data │ │ │ ++│(Hex) │(Dec) │ (in │ Field Name │ Meaning │Type │ Data Value │ Notes │ ++│ │ │bytes)│ │ │ │ │ │ ++├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ ++│ │ │ │ │the new │ │ │ │ ++│0x64 -│100 - │4 │new_level │RAID level │__u32│see level │ │ ++│0x67 │103 │ │ │being │ │field (above)│ │ ++│ │ │ │ │reshaped-to│ │ │ │ ++├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ ++│ │ │ │ │Next │ │current │ │ ++│0x68 -│104 - │8 │reshape_position│address of │__u64│position of │ │ ++│0x6F │111 │ │ │the array │ │the reshape │ │ ++│ │ │ │ │to reshape │ │operation │ │ ++├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ ++├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ ++│ │ 
│ │ │this holds │ │ │ │ ++│0x70 -│112 - │4 │delta_disks │the change │__u32│change in # │ │ ++│0x73 │115 │ │ │in # of │ │of raid disks│ │ ++│ │ │ │ │raid disks │ │ │ │ ++├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ ++│0x74 -│116 - │4 │new_layout │new layout │__u32│see layout │ │ ++│0x77 │119 │ │ │for array │ │field (above)│ │ ++├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ ++│0x78 -│120 - │4 │new_chunk │new chunk │__u32│see chunksize│ │ ++│0x7B │123 │ │ │size │ │field (above)│ │ ++├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ ++│ │ │ │ │ │ │ │Always │ ++│0x7C -│124 - │ │ │Padding │__u8 │ │set to │ ++│0x7F │127 │4 │pad1 │Block #1 │[4] │0 │zero │ ++│ │ │ │ │ │ │ │when │ ++│ │ │ │ │ │ │ │writing│ ++└──────┴──────┴──────┴────────────────┴───────────┴─────┴─────────────┴───────┘ ++ ++ ++ ++[edit] ++ ++Section: This-Component-Device Information area ++ ++64 Bytes, Offset 128-191 (0x80 - 0xbf) ++ ++┌──────┬──────┬──────┬──────────────────┬────────────┬─────┬────────────────────┬────────────┠++│Offset│Offset│Length│ │ Usage/ │Data │ │ │ ++│(Hex) │(Dec) │ (in │ Field Name │ Meaning │Type │ Data Value │ Notes │ ++│ │ │bytes)│ │ │ │ │ │ ++├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ ++│0x80 -│128 - │ │ │the sector #│ │sector # where data │ │ ++│0x87 │135 │8 │data_offset │upon which │__u64│begins │ │ ++│ │ │ │ │data starts │ │(Often 0) │ │ ++├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ ++│ │ │ │ │sectors in │ │ │ │ ++│0x88 -│136 - │ │ │the device │ │# of sectors that │ │ ++│0x8F │143 │8 │data_size │that are │__u64│can be used for data│ │ ++│ │ │ │ │used for │ │ │ │ ++│ │ │ │ │data │ │ │ │ ++├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ ++│ │ │ │ │# of the │ │ │ │ ++│0x90 -│144 - │ │ │sector upon │ │# of the sector upon│ │ ++│0x97 
│151 │8 │super_offset │which this │__u64│which this │ │ ++│ │ │ │ │superblock │ │superblock starts │ │ ++│ │ │ │ │starts │ │ │ │ ++├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ ++│ │ │ │ │sectors │ │ │ │ ++│ │ │ │ │before this │ │ │ │ ++│0x98 -│152 - │ │ │offset │ │ │ │ ++│0x9F │159 │8 │recovery_offset │(from │__u64│sector # │ │ ++│ │ │ │ │data_offset)│ │ │ │ ++│ │ │ │ │have been │ │ │ │ ++│ │ │ │ │recovered │ │ │ │ ++├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ ++│0xA0 -│160 - │ │ │ │ │Permanent identifier│ │ ++│0xA3 │163 │4 │dev_number │Fm │__u32│of this device (Not │ │ ++│ │ │ │ │ │ │its role in RAID(?))│ │ ++├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ ++│ │ │ │ │Number of │ │ │ │ ++│0xA4 -│164 - │ │ │read-errors │ │ │ │ ++│0xA7 │167 │4 │cnt_corrected_read│that were │__u32│Dv │ │ ++│ │ │ │ │corrected by│ │ │ │ ++│ │ │ │ │re-writing │ │ │ │ ++├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ ++│ │ │ │ │UUID of the │ │ │Set by │ ++│0xA8 -│168 - │16 │device_uuid │component │__u8 │ │User-Space │ ++│0xB7 │183 │ │ │device │[16] │ │Ignored by │ ++│ │ │ │ │ │ │ │kernel │ ++├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ ++│ │ │ │ │ │ │Bit-Mapped Field │ │ ++│ │ │ │ │ │ │ │ │ ++│ │ │ │ │ │ │┌─────┬────────────â”│ │ ++│ │ │ │ │ │ ││ Bit │ Meaning ││ │ ++│ │ │ │ │ │ ││Value│ ││WriteMostly1│ ++│ │ │ │ │ │ │├─────┼────────────┤│indicates │ ++│ │ │ │ │ │ ││1 │WriteMostly1││that this │ ++│ │ │ │ │ │ │├─────┼────────────┤│device │ ++│ │ │ │ │ │ ││2 │(?) ││should only │ ++│ │ │ │ │Per-Device │ │├─────┼────────────┤│be updated │ ++│0xB8 │184 │1 │devflags │Flags │__u8 ││4 │(?) ││on writes, │ ++│ │ │ │ │(Bit-Mapped │ │├─────┼────────────┤│not read │ ++│ │ │ │ │Field) │ ││8 │(?) ││from. 
│ ++│ │ │ │ │ │ │├─────┼────────────┤│(Useful with│ ++│ │ │ │ │ │ ││16 │(?) ││slow devices│ ++│ │ │ │ │ │ │├─────┼────────────┤│in RAID1 │ ++│ │ │ │ │ │ ││32 │(?) ││arrays?) │ ++│ │ │ │ │ │ │├─────┼────────────┤│ │ ++│ │ │ │ │ │ ││64 │(?) ││ │ ++│ │ │ │ │ │ │├─────┼────────────┤│ │ ++│ │ │ │ │ │ ││128 │(?) ││ │ ++│ │ │ │ │ │ │└─────┴────────────┘│ │ ++├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ ++│ │ │ │ │ │ │ │Always set │ ++│0xB9 -│185 - │7 │pad2 │Padding │__u8 │0 │to │ ++│0xBF │191 │ │ │block 2 │[7] │ │zero when │ ++│ │ │ │ │ │ │ │writing │ ++└──────┴──────┴──────┴──────────────────┴────────────┴─────┴────────────────────┴────────────┘ ++ ++ ++[edit] ++ ++Section: Array-State Information area ++ ++64 Bytes, Offset 192-255 (0xC0 - 0xFF) ++ ++┌──────┬──────┬──────┬─────────────┬─────────────┬─────┬────────┬─────────────┠++│Offset│Offset│Length│ │ │Data │ Data │ │ ++│(Hex) │(Dec) │ (in │ Field Name │Usage/Meaning│Type │ Value │ Notes │ ++│ │ │bytes)│ │ │ │ │ │ ++├──────┼──────┼──────┼─────────────┼─────────────┼─────┼────────┼─────────────┤ ++│ │ │ │ │ │ │low │ │ ++│ │ │ │ │ │ │40-bits │ │ ++│ │ │ │ │ │ │are │ │ ++│0xC0 -│192 - │8 │utime │Fm │__u64│seconds │Nt │ ++│0xC7 │199 │ │ │ │ │high │ │ ++│ │ │ │ │ │ │24-bits │ │ ++│ │ │ │ │ │ │are │ │ ++│ │ │ │ │ │ │uSeconds│ │ ++├──────┼──────┼──────┼─────────────┼─────────────┼─────┼────────┼─────────────┤ ++│ │ │ │ │ │ │ │Updated │ ++│ │ │ │ │ │ │ │whenever the │ ++│ │ │ │ │ │ │ │superblock is│ ++│ │ │ │ │ │ │ │updated. │ ++│ │ │ │ │ │ │ │Used by mdadm│ ++│0xC8 -│200 - │8 │events │Event Count │__u64│# │in │ ++│0xCF │207 │ │ │for the Array│ │ │re-assembly │ ++│ │ │ │ │ │ │ │to detect │ ++│ │ │ │ │ │ │ │failed/ │ ++│ │ │ │ │ │ │ │out-of-sync │ ++│ │ │ │ │ │ │ │component │ ++│ │ │ │ │ │ │ │devices. 
│ ++├──────┼──────┼──────┼─────────────┼─────────────┼─────┼────────┼─────────────┤ ++│ │ │ │ │Offsets │ │ │ │ ++│ │ │ │ │before this │ │ │ │ ++│ │ │ │ │one (starting│ │ │ │ ++│0xD0 -│208 - │8 │resync_offset│from │__u64│offset #│ │ ++│0xD7 │215 │ │ │data_offset) │ │ │ │ ++│ │ │ │ │are 'known' │ │ │ │ ++│ │ │ │ │to be in │ │ │ │ ++│ │ │ │ │sync. │ │ │ │ ++├──────┼──────┼──────┼─────────────┼─────────────┼─────┼────────┼─────────────┤ ++│ │ │ │ │ │ │ │This value │ ++│ │ │ │ │Checksum of │ │ │will be │ ++│0xD8 -│216 - │ │ │this │ │ │different for│ ++│0xDB │219 │4 │sb_csum │superblock up│__u32│# │each │ ++│ │ │ │ │to devs │ │ │component │ ++│ │ │ │ │[max_dev] │ │ │device's │ ++│ │ │ │ │ │ │ │superblock. │ ++├──────┼──────┼──────┼─────────────┼─────────────┼─────┼────────┼─────────────┤ ++│ │ │ │ │How many │ │ │ │ ++│0xDC -│220 - │ │ │devices are │ │ │ │ ++│0xDF │223 │4 │max_dev │part of (or │__u32│# │ │ ++│ │ │ │ │related to) │ │ │ │ ++│ │ │ │ │the array │ │ │ │ ++├──────┼──────┼──────┼─────────────┼─────────────┼─────┼────────┼─────────────┤ ++│0xE0 -│224 - │ │ │Padding Block│__u8 │ │Always set to│ ++│0xFF │255 │32 │pad3 │3 │[32] │0 │zero when │ ++│ │ │ │ │ │ │ │writing │ ++└──────┴──────┴──────┴─────────────┴─────────────┴─────┴────────┴─────────────┘ ++ ++ ++[edit] ++ ++Section: Device-Roles (Positions-in-Array) area ++ ++Length: Variable number of bytes (but at least 768 bytes?) 
++2 Bytes per device in the array, including both spare-devices and ++faulty-devices ++ ++┌──────────────────────────────────────────────────────────────────────────────┠++│ Section: Device-Roles (Positions-in-Array) area │ ++├──────────────────────────────────────────────────────────────────────────────┤ ++│(Variable length - 2 Bytes per Device in Array (including Spares/Faulty-Devs) │ ++├──────────────────────────────────────────────────────────────────────────────┤ ++│ │ ++├────────┬───────┬──────┬─────────┬────────┬─────┬───────────────────────┬─────┤ ++│ Offset │Offset │Length│ Field │ Usage/ │Data │ │ │ ++│ (Hex) │ (Dec) │ (in │ Name │Meaning │Type │ Data Value │Notes│ ++│ │ │bytes)│ │ │ │ │ │ ++├────────┴───────┴──────┴─────────┴────────┴─────┴───────────────────────┴─────┤ ++│ ?? Bytes, Offset 256-??? (0x100 - 0x???) │ ++├────────┬───────┬──────┬─────────┬────────┬─────┬───────────────────────┬─────┤ ++│ │ │ │ │ │ │Role or Position of │ │ ++│0x100 - │256 │? │dev_roles│Fm │__u16│device in the array. │ │ ++│0x??? │- ??? │ │ │ │ │0xFFFF means "spare". │ │ ++│ │ │ │ │ │ │0xFFFE means "faulty". │ │ ++└────────┴───────┴──────┴─────────┴────────┴─────┴───────────────────────┴─────┘ ++Retrieved from "http://linux-raid.osdl.org/index.php/RAID_superblock_formats" ++ ++Views ++ ++ • Article ++ • Discussion ++ • Edit ++ • History ++ ++Personal tools ++ ++ • Log in / create account ++ ++ ++ ++Navigation ++ ++ • Linux Raid ++ • Community portal ++ • Current events ++ • Recent changes ++ • Random page ++ • Help ++ • Donations ++ ++Search ++ ++[ ] [Go] [Search] ++Toolbox ++ ++ • What links here ++ • Related changes ++ • Special pages ++ • Printable version ++ • Permanent link ++ ++MediaWiki ++GNU Free Documentation License 1.2 ++ ++ • This page was last modified 04:50, 3 June 2008. ++ • This page has been accessed 5,723 times. ++ • Content is available under GNU Free Documentation License 1.2. 
++ • Privacy policy ++ • About Linux-raid ++ • Disclaimers ++ +-- +tg: (d28c1a7..) contrib/docs/superblock_formats (depends on: mdadm-3.0.3) diff -Nru mdadm-2.6.7.1/debian/patches/contrib/scripts/mdadd.diff mdadm-3.1.4/debian/patches/contrib/scripts/mdadd.diff --- mdadm-2.6.7.1/debian/patches/contrib/scripts/mdadd.diff 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/patches/contrib/scripts/mdadd.diff 2010-09-03 12:06:49.000000000 +0300 @@ -0,0 +1,332 @@ +From: Arno van Amersfoort +Subject: Script to add a harddisk to multi MD-array + +Signed-off-by: martin f. krafft +URL: http://rocky.eld.leidenuniv.nl/scripts/mdadd.sh + +--- + contrib/mdadd.sh | 314 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 files changed, 314 insertions(+), 0 deletions(-) + +diff --git a/contrib/mdadd.sh b/contrib/mdadd.sh +new file mode 100644 +index 0000000..da5464f +--- /dev/null ++++ b/contrib/mdadd.sh +@@ -0,0 +1,314 @@ ++#!/bin/bash ++ ++MY_VERSION="1.46" ++# ---------------------------------------------------------------------------------------------------------------------- ++# Linux MD (Soft)RAID Add Script - Add a (new) harddisk to another multi MD-array harddisk ++# Last update: June 9, 2009 ++# (C) Copyright 2005-2009 by Arno van Amersfoort ++# Homepage : http://rocky.eld.leidenuniv.nl/ ++# Email : a r n o v a AT r o c k y DOT e l d DOT l e i d e n u n i v DOT n l ++# (note: you must remove all spaces and substitute the @ and the . at the proper locations!) ++# ---------------------------------------------------------------------------------------------------------------------- ++# This program is free software; you can redistribute it and/or ++# modify it under the terms of the GNU General Public License ++# version 2 as published by the Free Software Foundation. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write to the Free Software ++# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ++# ---------------------------------------------------------------------------------------------------------------------- ++ ++TAB=$(printf "\t") ++EOL=' ++' ++ ++show_help() ++{ ++ echo "Bad or missing parameter(s)" ++ echo "Usage: $(basename $0) [ source_disk ] [ target_disk ] [ options ]" ++ echo "Options:" ++ echo "--force = Even proceed if target device does not appear empty" ++ echo "--noptupdate = Do NOT update the partition table on the target device (EXPERT!)" ++ echo "--nombrupdate = Do NOT update the MBR boot-loader on the target device (EXPERT!)" ++} ++ ++ ++get_partitions() ++{ ++ cat /proc/partitions |awk '{ print $NF }' |sed -e '1,2d' -e 's,^/dev/,,' ++} ++ ++ ++check_binary() ++{ ++ if ! which "$1" >/dev/null 2>&1; then ++ printf "\033[40m\033[1;31mERROR: Binary \"$1\" does not exist or is not executable!\033[0m\n" >&2 ++ printf "\033[40m\033[1;31m Please, make sure that it is (properly) installed!\033[0m\n" >&2 ++ exit 2 ++ fi ++} ++ ++ ++sanity_check() ++{ ++ if [ "$UID" != "0" ]; then ++ printf "\033[40m\033[1;31mERROR: Root check FAILED (you MUST be root to use this script)! Quitting...\n\033[0m" ++ exit 1 ++ fi ++ ++ check_binary mdadm ++ check_binary sfdisk ++ check_binary fdisk ++ check_binary dd ++ check_binary awk ++ check_binary grep ++ check_binary sed ++ check_binary cat ++ ++ if [ -z "$SOURCE" ] || [ -z "$TARGET" ]; then ++ echo "ERROR: Bad or missing argument(s)" ++ show_help; ++ exit 4 ++ fi ++ ++ if ! echo "$SOURCE" |grep -q '^/dev/'; then ++ printf "\033[40m\033[1;31mERROR: Source device $SOURCE does not start with /dev/! Quitting...\n\033[0m" ++ exit 5 ++ fi ++ ++ if ! 
echo "$TARGET" |grep -q '^/dev/'; then ++ printf "\033[40m\033[1;31mERROR: Target device $TARGET does not start with /dev/! Quitting...\n\033[0m" ++ exit 5 ++ fi ++ ++ if echo "$SOURCE" |grep -q 'md[0-9]'; then ++ printf "\033[40m\033[1;31mERROR: The source device specified is an md-device! Quitting...\n\033[0m" ++ echo "A physical drive (part of the md-array('s)) is required as source device (ie. /dev/hda)!" ++ exit 5 ++ fi ++ ++ # We also want variables without /dev/ : ++ SOURCE_NODEV="$(echo "$SOURCE" |sed 's,^/dev/,,')" ++ TARGET_NODEV="$(echo "$TARGET" |sed 's,^/dev/,,')" ++ ++ if ! get_partitions |grep -E -q -x "$SOURCE_NODEV""p?[0-9]+"; then ++ printf "\033[40m\033[1;31mERROR: Source device $SOURCE does not contain any partitions!? Quitting...\n\033[0m" ++ exit 7 ++ fi ++ ++ if get_partitions |grep -E -q -x "$TARGET_NODEV""p?[0-9]+" && [ "$FORCE" != "1" ]; then ++ printf "\033[40m\033[1;31mERROR: Target device $TARGET is NOT empty! Use --force to override. Quitting...\n\033[0m" ++ exit 8 ++ fi ++ ++ echo "--> Saving mdadm detail scan to /tmp/mdadm-detail-scan.txt..." ++ mdadm --detail --scan --verbose >/tmp/mdadm-detail-scan.txt ++ retval=$? ++ if [ "$retval" != "0" ]; then ++ printf "\033[40m\033[1;31mERROR: mdadm returned an error($retval) while determining detail information!\n\033[0m" ++ exit 10 ++ fi ++ ++ echo "--> Saving partition table of target device $TARGET to /tmp/partitions.$TARGET_NODEV..." ++ sfdisk -d "$TARGET" >"/tmp/partitions.$TARGET_NODEV" ++ retval=$? ++ if [ "$retval" != "0" ]; then ++ printf "\033[40m\033[1;31mERROR: sfdisk returned an error($retval) while reading the partition table!\n\033[0m" ++ exit 9 ++ fi ++ ++ echo "--> Saving partition table of source device $SOURCE to /tmp/partitions.$SOURCE_NODEV..." ++ sfdisk -d "$SOURCE" >"/tmp/partitions.$SOURCE_NODEV" ++ retval=$? 
++ if [ "$retval" != "0" ]; then ++ printf "\033[40m\033[1;31mERROR: sfdisk returned an error($retval) while reading the partition table!\n\033[0m" ++ exit 9 ++ fi ++ ++ MD_DEV="" ++ IFS=$EOL ++ for MDSTAT_LINE in `cat /proc/mdstat`; do ++ if echo "$MDSTAT_LINE" |grep -q '^md'; then ++ MD_DEV_LINE="$MDSTAT_LINE" ++ MD_DEV="$(echo "$MDSTAT_LINE" |awk '{ print $1 }')" ++ ++ unset IFS ++ for part_nodev in `cat "/tmp/partitions.$TARGET_NODEV" |grep '^/dev/' |grep -v 'Id= 0' |awk '{ print $1 }' |sed 's,^/dev/,,'`; do ++ if echo "$MD_DEV_LINE" |grep -E -q "[[:blank:]]$part_nodev\["; then ++ printf "\033[40m\033[1;31mWARNING: Partition /dev/$part_nodev on target device is already in use by array /dev/$MD_DEV!\nPress enter to continue or CTRL-C to abort...\n\033[0m" ++ read ++ fi ++ done ++ fi ++ ++ if echo "$MDSTAT_LINE" |grep -E -q '[[:blank:]]blocks[[:blank:]]' && ! echo "$MDSTAT_LINE" |grep -q '_'; then ++ # This array is NOT degraded so now check whether we want to add devices to it: ++ unset IFS ++ #FIXME! ++ for part_nodev in `cat "/tmp/partitions.$SOURCE_NODEV" |grep '^/dev/' |grep -v 'Id= 0' |awk '{ print $1 }' |sed 's,^/dev/,,'`; do ++ if echo "$MD_DEV_LINE" |grep -E -q "[[:blank:]]$part_nodev\["; then ++ printf "\033[40m\033[1;31mWARNING: Array $MD_DEV is NOT degraded, target device $TARGET$(echo "$part_nodev" |sed "s,$SOURCE_NODEV,,") will become a hotspare!\nPress enter to continue or CTRL-C to abort...\n\033[0m" echo "WARNING: Array is not degraded: $LINE" ++ read ++ fi ++ done ++ fi ++ done ++} ++ ++ ++# Program entry point ++echo "MDadd for SoftRAID-MDADM v$MY_VERSION" ++echo "Written by Arno van Amersfoort" ++echo "--------------------------------" ++ ++# Set environment variables to default ++FORCE=0 ++NOPTUPDATE=0 ++NOMBRUPDATE=0 ++SOURCE="" ++TARGET="" ++ ++# Check arguments ++for arg in $*; do ++ ARGNAME="$(echo "$arg" |cut -d= -f1)" ++ ARGVAL="$(echo "$arg" |cut -d= -f2)" ++ ++ if ! 
echo "$ARGNAME" |grep -q '^-'; then ++ if [ -z "$SOURCE" ]; then ++ SOURCE="$ARGVAL" ++ else ++ if [ -z "$TARGET" ]; then ++ TARGET="$ARGVAL" ++ else ++ show_help; ++ exit 3 ++ fi ++ fi ++ else ++ case "$ARGNAME" in ++ --force|-force|-f) FORCE=1;; ++ --noptupdate|-noptupdate|--noptu|-noptu) NOPTUPDATE=1;; ++ --nombrupdate|-nombrupdate|--nombru|nombru) NOMBRUPDATE=1;; ++ --help) show_help; ++ exit 0;; ++ *) echo "ERROR: Bad argument: $ARGNAME"; ++ show_help; ++ exit 3;; ++ esac ++ fi ++done ++ ++# Make sure everything is sane: ++sanity_check; ++ ++# Disable all swaps on target disk ++echo "--> Disabling any swap partitions on target device $TARGET" ++IFS=$EOL ++for SWAP in `grep -E "^$TARGET""p?[0-9]+" /proc/swaps |awk '{ print $1 }'`; do ++ swapoff $SWAP >/dev/null 2>&1 ++done ++ ++# Update track0 on target disk ++if [ "$NOMBRUPDATE" != "1" ]; then ++ echo "--> Copying track0(containing MBR) from $SOURCE to $TARGET..." ++ dd if="$SOURCE" of="$TARGET" bs=65536 count=1 ++ retval=$? ++ if [ "$retval" != "0" ]; then ++ printf "\033[40m\033[1;31mERROR: dd returned an error($retval) while copying track0!\n\033[0m" ++ exit 9 ++ fi ++fi ++ ++if [ "$NOPTUPDATE" != "1" ]; then ++ echo "--> Restoring partition table from /tmp/partitions.$SOURCE_NODEV to $TARGET..." ++ cat "/tmp/partitions.$SOURCE_NODEV" |sfdisk --force "$TARGET" ++ retval=$? ++ if [ "$retval" != "0" ]; then ++ printf "\033[40m\033[1;31mERROR: sfdisk returned an error($retval) while writing the partition table!\n\033[0m" ++ exit 9 ++ fi ++else ++ echo "--> Restoring partition table from /tmp/partitions.$TARGET_NODEV to $TARGET..." ++ cat "/tmp/partitions.$TARGET_NODEV" |sfdisk --force "$TARGET" ++ retval=$? 
++ if [ "$retval" != "0" ]; then ++ printf "\033[40m\033[1;31mERROR: sfdisk returned an error($retval) while writing the partition table!\n\033[0m" ++ exit 9 ++ fi ++fi ++ ++ ++# Copy/build all md devices that exist on the source drive: ++BOOT=0 ++NO_ADD=1 ++IFS=$EOL ++for LINE in `cat /tmp/mdadm-detail-scan.txt`; do ++ if echo "$LINE" |grep -E -q '^ARRAY[[:blank:]]'; then ++ MD_DEV=$(echo "$LINE" |awk '{ print $2 }') ++ fi ++ ++ if echo "$LINE" |grep -q "devices=.*$SOURCE"; then ++ NO_ADD=0 ++ PARTITION_NR="$(echo "$LINE" |sed -e "s:.*devices=.*$SOURCE::" -e "s:,.*::")" ++ ++ if [ -z "$PARTITION_NR" ]; then ++ printf "\033[40m\033[1;31mERROR: Unable to retrieve detail information for $SOURCE from $MD_DEV!\n\033[0m" ++ exit 11 ++ fi ++ ++ # Check whether we're a root or boot partition ++ if grep -E -q -e "^$MD_DEV[[:blank:]]*/boot[[:blank:]]" -e "$MD_DEV[[:blank:]]*/[[:blank:]]" /etc/fstab; then ++ BOOT=1 ++ fi ++ ++ echo "" ++ echo "--> Adding $TARGET$PARTITION_NR to RAID array $MD_DEV:" ++ printf "\033[40m\033[1;31m" ++ mdadm --add "$MD_DEV" "$TARGET""$PARTITION_NR" ++ retval=$? ++ if [ "$retval" != "0" ]; then ++ printf "\033[40m\033[1;31mERROR: mdadm returned an error($retval) while adding device!\n\033[0m" ++ exit 12 ++ fi ++ printf "\033[0m" ++ fi ++done ++ ++echo "" ++ ++# Create swapspace on partitions with ID=82 ++echo "--> Creating swapspace on target device (if any swap partitions exist):" ++IFS=$EOL ++for SWAP_DEVICE in `sfdisk -d "$TARGET" |grep -i 'Id=82' |awk '{ print $1 }'`; do ++ mkswap "$SWAP_DEVICE" ++ swapon "$SWAP_DEVICE" ++ ++ if ! 
grep -E -q "^$SWAP_DEVICE[[:blank:]]*none[[:blank:]]*swap[[:blank:]]" /etc/fstab; then ++ printf "\033[40m\033[1;31mWARNING: /etc/fstab does NOT contain a (valid) swap entry for $SWAP_DEVICE\n\033[0m" ++ fi ++done ++ ++#echo "--> Showing current mdadm detail-scan (you may need to update your mdadm.conf (manually):" ++#mdadm --detail --scan ++ ++echo "--> Showing current /proc/mdstat (you may need to update your mdadm.conf (manually):" ++cat /proc/mdstat ++echo "" ++ ++if [ "$NO_ADD" = "1" ]; then ++ printf "\033[40m\033[1;31mWARNING: No mdadm --add actions were performed, please investigate!\n\033[0m" ++fi ++ ++if [ "$BOOT" = "1" ]; then ++ printf "\033[40m\033[1;31mNOTE: Boot and/or root partition detected.\n You *MAY* need to reinstall your boot loader (ie. GRUB) on this device!\n\033[0m" ++fi ++ ++# TODO?: ++# sanity check nopt (check if target device has a partition table)? ++# detect if device has superblock (mdadm --examine /dev/sda1; echo $?)? ++# continue ask (show what will be done):? +-- +tg: (d28c1a7..) contrib/scripts/mdadd (depends on: mdadm-3.0.3) diff -Nru mdadm-2.6.7.1/debian/patches/debian/conffile-location.diff mdadm-3.1.4/debian/patches/debian/conffile-location.diff --- mdadm-2.6.7.1/debian/patches/debian/conffile-location.diff 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/patches/debian/conffile-location.diff 2010-09-03 12:06:50.000000000 +0300 @@ -0,0 +1,128 @@ +From: martin f. krafft +Subject: Set /etc/mdadm/mdadm.conf as primary config file location + +On Debian, the configuration file resides primarily in /etc/mdadm/mdadm.conf, +/etc/mdadm.conf is only used as a backup. + +This is a Debian-specific patch. + +Forwarded: not-needed +Reviewed-by: martin f. 
krafft + +--- + Makefile | 4 ++-- + ReadMe.c | 2 +- + mdadm.8.in | 14 ++++++-------- + mdadm.conf.5 | 2 +- + mdassemble.8 | 2 +- + 5 files changed, 11 insertions(+), 13 deletions(-) + +diff --git a/Makefile b/Makefile +index e2c65a5..0e85ccb 100644 +--- a/Makefile ++++ b/Makefile +@@ -60,8 +60,8 @@ else + endif + + SYSCONFDIR = /etc +-CONFFILE = $(SYSCONFDIR)/mdadm.conf +-CONFFILE2 = $(SYSCONFDIR)/mdadm/mdadm.conf ++CONFFILE = $(SYSCONFDIR)/mdadm/mdadm.conf ++CONFFILE2 = $(SYSCONFDIR)/mdadm.conf + MAILCMD =/usr/sbin/sendmail -t + CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\" + # Both MAP_DIR and MDMON_DIR should be somewhere that persists across the +diff --git a/ReadMe.c b/ReadMe.c +index b97c55e..aa26032 100644 +--- a/ReadMe.c ++++ b/ReadMe.c +@@ -560,7 +560,7 @@ char Help_incr[] = + ; + + char Help_config[] = +-"The /etc/mdadm.conf config file:\n\n" ++"The /etc/mdadm/mdadm.conf config file:\n\n" + " The config file contains, apart from blank lines and comment lines that\n" + " start with a hash(#), four sorts of configuration lines: array lines, \n" + " device lines, mailaddr lines and program lines.\n" +diff --git a/mdadm.8.in b/mdadm.8.in +index d911cb3..d267b8c 100644 +--- a/mdadm.8.in ++++ b/mdadm.8.in +@@ -261,9 +261,9 @@ the exact meaning of this option in different contexts. + .TP + .BR \-c ", " \-\-config= + Specify the config file. Default is to use +-.BR /etc/mdadm.conf , +-or if that is missing then +-.BR /etc/mdadm/mdadm.conf . ++.BR /etc/mdadm/mdadm.conf , ++or if that is missing, then ++.BR /etc/mdadm.conf . + If the config file given is + .B "partitions" + then nothing will be read, but +@@ -1429,8 +1429,6 @@ The config file is only used if explicitly named with + or requested with (a possibly implicit) + .BR \-\-scan . + In the later case, +-.B /etc/mdadm.conf +-or + .B /etc/mdadm/mdadm.conf + is used. 
+ +@@ -1771,7 +1769,7 @@ or + .B \-\-scan + will cause the output to be less detailed and the format to be + suitable for inclusion in +-.BR /etc/mdadm.conf . ++.BR /etc/mdadm/mdadm.conf . + The exit status of + .I mdadm + will normally be 0 unless +@@ -1848,7 +1846,7 @@ or + is given, then multiple devices that are components of the one array + are grouped together and reported in a single entry suitable + for inclusion in +-.BR /etc/mdadm.conf . ++.BR /etc/mdadm/mdadm.conf . + + Having + .B \-\-scan +@@ -2545,7 +2543,7 @@ uses this to find arrays when + is given in Misc mode, and to monitor array reconstruction + on Monitor mode. + +-.SS /etc/mdadm.conf ++.SS /etc/mdadm/mdadm.conf + + The config file lists which devices may be scanned to see if + they contain MD super block, and gives identifying information +diff --git a/mdadm.conf.5 b/mdadm.conf.5 +index e677ba9..648d26a 100644 +--- a/mdadm.conf.5 ++++ b/mdadm.conf.5 +@@ -8,7 +8,7 @@ + .SH NAME + mdadm.conf \- configuration for management of Software RAID with mdadm + .SH SYNOPSIS +-/etc/mdadm.conf ++/etc/mdadm/mdadm.conf + .SH DESCRIPTION + .PP + .I mdadm +diff --git a/mdassemble.8 b/mdassemble.8 +index 986432c..d06c201 100644 +--- a/mdassemble.8 ++++ b/mdassemble.8 +@@ -40,7 +40,7 @@ There are no options to + + .SH FILES + +-.SS /etc/mdadm.conf ++.SS /etc/mdadm/mdadm.conf + + The config file lists which devices may be scanned to see if + they contain MD super block, and gives identifying information +-- +tg: (972ee72..) debian/conffile-location (depends on: upstream) diff -Nru mdadm-2.6.7.1/debian/patches/debian/disable-incremental.diff mdadm-3.1.4/debian/patches/debian/disable-incremental.diff --- mdadm-2.6.7.1/debian/patches/debian/disable-incremental.diff 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/patches/debian/disable-incremental.diff 2010-09-03 12:06:50.000000000 +0300 @@ -0,0 +1,47 @@ +From: martin f. 
krafft +Subject: Disable udev incremental assembly + +Upstream enabled incremental assembly in the udev rules by default for 3.1.3, +but this is too early given the pending release of squeeze. Hence, this patch +simply reverts the change. + +Signed-off-by: martin f. krafft + +--- + udev-md-raid.rules | 13 ++++++------- + 1 files changed, 6 insertions(+), 7 deletions(-) + +diff --git a/udev-md-raid.rules b/udev-md-raid.rules +index f9607f3..c9a4f0e 100644 +--- a/udev-md-raid.rules ++++ b/udev-md-raid.rules +@@ -1,13 +1,13 @@ + # do not edit this file, it will be overwritten on update + + SUBSYSTEM!="block", GOTO="md_end" +- +-# handle potential components of arrays +-ENV{ID_FS_TYPE}=="linux_raid_member", ACTION=="remove", RUN+="/sbin/mdadm -If $name" +-ENV{ID_FS_TYPE}=="linux_raid_member", ACTION=="add", RUN+="/sbin/mdadm --incremental $env{DEVNAME}" +- +-# handle md arrays + ACTION!="add|change", GOTO="md_end" ++ACTION=="change", GOTO="md_no_incr" ++ ++# import data from a raid member and activate it ++#ENV{ID_FS_TYPE}=="linux_raid_member", IMPORT{program}="/sbin/mdadm --examine --export $tempnode", RUN+="/sbin/mdadm --incremental $env{DEVNAME}" ++# import data from a raid set ++LABEL="md_no_incr" + KERNEL!="md*", GOTO="md_end" + + # partitions have no md/{array_state,metadata_version}, but should not +@@ -32,7 +32,6 @@ ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNA + + IMPORT{program}="/sbin/blkid -o udev -p $tempnode" + OPTIONS+="link_priority=100" +-OPTIONS+="watch" + ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}" + ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}" + +-- +tg: (deb573e..) 
debian/disable-incremental (depends on: master) diff -Nru mdadm-2.6.7.1/debian/patches/debian-changes-3.1.4-1+8efb9d1ubuntu1 mdadm-3.1.4/debian/patches/debian-changes-3.1.4-1+8efb9d1ubuntu1 --- mdadm-2.6.7.1/debian/patches/debian-changes-3.1.4-1+8efb9d1ubuntu1 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/patches/debian-changes-3.1.4-1+8efb9d1ubuntu1 2010-10-18 14:12:14.000000000 +0300 @@ -0,0 +1,127 @@ +Description: Upstream changes introduced in version 3.1.4-1+8efb9d1ubuntu1 + This patch has been created by dpkg-source during the package build. + Here's the last changelog entry, hopefully it gives details on why + those changes were made: + . + mdadm (3.1.4-1+8efb9d1ubuntu1) UNRELEASED; urgency=low + . + * Merge from debian unstable. (LP: #603582) + * Remaining changes + - Assemble.c, config.c: upgraded to the mdadm-3.1.4 version of these files + from Debian. + - debian/control: we need udev and util-linux in the right version. We + default to postfix as our mta + - debian/initramfs/hook: kept the Ubuntus version for handling the absence + of active raid arrays in /etc/mdadm/mdadm.conf + - debian/initramfs/script.local-top.DEBIAN, debian/mdadm-startall, + debian/mdadm.raid.DEBIAN: removed. udev does its job now instead. + - debian/mdadm-startall.sgml, debian/mdadm-startall.8: documentation of + unused startall script + - debian/mdadm.config, debian/mdadm.postinst - let udev do the handling + instead. Resolved merge conflict by keeping Ubuntu's version. + - debian/rules: kept debian's switch to using dh_lintian + - debian/mdadm.links, debian/mdadm.manpages: dropped owing to the fact + that these are not used in Ubuntu. Also dropped the build-dep on docbook + to man) + - debian/mdadm.postinst, debian/mdadm.config, initramfs/init-premount: + boot-degraded enablement; maintain udev starting of RAID devices; + init-premount hook script for the initramfs, to provide information at + boot + - debian/mkconf.in is the older mkconf. Kept the Ubuntus version. 
+ - debian/rules: Kept Ubuntus version for installing apport hooks, not + installing un-used startall script and for adding a udev rule + corresponding to mdadm. + - debian/install-rc, check.d/_numbers, check.d/root_on_raid: Ubuntu partman + installer changes + - debian/presubj: Dropped this unused bug reporting file. Instead use + source_mdadm.py act as an apport hook for bug handling. + . + The person named in the Author field signed this changelog entry. +Author: Surbhi Palande +Bug-Ubuntu: https://bugs.launchpad.net/bugs/603582 + +--- +The information above should follow the Patch Tagging Guidelines, please +checkout http://dep.debian.net/deps/dep3/ to learn about the format. Here +are templates for supplementary fields that you might want to add: + +Origin: , +Bug: +Bug-Debian: http://bugs.debian.org/ +Forwarded: +Reviewed-By: +Last-Update: + +--- mdadm-3.1.4.orig/config.c ++++ mdadm-3.1.4/config.c +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + + /* + * Read the config file +--- mdadm-3.1.4.orig/Assemble.c ++++ mdadm-3.1.4/Assemble.c +@@ -24,6 +24,7 @@ + + #include "mdadm.h" + #include ++#include + + static int name_matches(char *found, char *required, char *homehost) + { +--- mdadm-3.1.4.orig/ReadMe.c ++++ mdadm-3.1.4/ReadMe.c +@@ -366,7 +366,7 @@ char Help_build[] = + "\n" + " This usage is similar to --create. The difference is that it creates\n" + " a legacy array without a superblock. With these arrays there is no\n" +-" different between initially creating the array and subsequently\n" ++" difference between initially creating the array and subsequently\n" + " assembling the array, except that hopefully there is useful data\n" + " there in the second case.\n" + "\n" +--- /dev/null ++++ mdadm-3.1.4/check.d/root_on_raid +@@ -0,0 +1,35 @@ ++#!/bin/sh ++ ++. /lib/preseed/preseed.sh ++. 
/lib/partman/lib/base.sh ++ ++# Prompt for BOOT_DEGRADED=true|false if / or /boot is on a /dev/md* ++root_on_raid () { ++ prompt=$( ++ for i in /lib/partman/fstab.d/*; do ++ [ -x "$i" ] || continue ++ $i ++ done | ++ while read fs mp type options dump pass; do ++ if mdadm --detail "$fs" 2>/dev/null | grep -qsi " raid1$"; then ++ if [ "$mp" = "/" ] || [ "$mp" = "/boot" ]; then ++ echo "true" ++ fi ++ fi ++ done ++ ) ++ prompt="$(echo "$prompt" | head -n1)" ++ case $prompt in ++ true) ++ db_input critical mdadm/boot_degraded || true ++ db_go || true ++ db_get mdadm/boot_degraded ++ # write to preseed log so that /target knows about it ++ echo mdadm mdadm/boot_degraded boolean "$RET" >> "$logfile" ++ ++ ;; ++ esac ++ exit 0 ++} ++ ++root_on_raid +--- /dev/null ++++ mdadm-3.1.4/check.d/_numbers +@@ -0,0 +1 @@ ++07 root_on_raid diff -Nru mdadm-2.6.7.1/debian/patches/fixes/linker-flag-z-now.diff mdadm-3.1.4/debian/patches/fixes/linker-flag-z-now.diff --- mdadm-2.6.7.1/debian/patches/fixes/linker-flag-z-now.diff 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/patches/fixes/linker-flag-z-now.diff 2010-09-03 12:06:51.000000000 +0300 @@ -0,0 +1,28 @@ +From: martin f. krafft +Subject: Fix compiler/linker flag in Makefile + +Cherry-pick 8efb9d1 from upstream to replace -z now with -Wl,-z,now + +Debian bug: #595290 + +Signed-off-by: martin f. krafft + +--- + Makefile | 2 +- + 1 files changed, 1 insertions(+), 1 deletions(-) + +diff --git a/Makefile b/Makefile +index e2c65a5..0cc9a87 100644 +--- a/Makefile ++++ b/Makefile +@@ -167,7 +167,7 @@ mdmon.O2 : $(MON_SRCS) mdadm.h mdmon.h + + # use '-z now' to guarantee no dynamic linker interactions with the monitor thread + mdmon : $(MON_OBJS) +- $(CC) $(LDFLAGS) $(MON_LDFLAGS) -z now -o mdmon $(MON_OBJS) $(LDLIBS) ++ $(CC) $(LDFLAGS) $(MON_LDFLAGS) -Wl,-z,now -o mdmon $(MON_OBJS) $(LDLIBS) + msg.o: msg.c msg.h + + test_stripe : restripe.c mdadm.h +-- +tg: (ef9f23f..) 
fixes/linker-flag-z-now (depends on: master) diff -Nru mdadm-2.6.7.1/debian/patches/series mdadm-3.1.4/debian/patches/series --- mdadm-2.6.7.1/debian/patches/series 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/patches/series 2010-10-18 13:57:40.000000000 +0300 @@ -0,0 +1,9 @@ +contrib/scripts/mdadd.diff -p1 +contrib/docs/raid5-vs-raid10.diff -p1 +contrib/docs/superblock_formats.diff -p1 +contrib/docs/md.txt.diff -p1 +contrib/docs/jd-rebuilding-raid.diff -p1 +debian/conffile-location.diff -p1 +debian/disable-incremental.diff -p1 +fixes/linker-flag-z-now.diff -p1 +debian-changes-3.1.4-1+8efb9d1ubuntu1 diff -Nru mdadm-2.6.7.1/debian/po/cs.po mdadm-3.1.4/debian/po/cs.po --- mdadm-2.6.7.1/debian/po/cs.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/cs.po 2010-10-18 13:57:39.000000000 +0300 @@ -15,14 +15,14 @@ msgstr "" "Project-Id-Version: mdadm\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: 2008-02-17 21:10+0100\n" "Last-Translator: Miroslav Kure \n" "Language-Team: Czech \n" -"Language: cs\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" +"Language: cs\n" #. Type: boolean #. 
Description diff -Nru mdadm-2.6.7.1/debian/po/de.po mdadm-3.1.4/debian/po/de.po --- mdadm-2.6.7.1/debian/po/de.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/de.po 2010-10-18 13:57:39.000000000 +0300 @@ -11,23 +11,23 @@ # msgid "" msgstr "" -"Project-Id-Version: mdadm\n" +"Project-Id-Version: mdadm 2.6.9-3\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" -"PO-Revision-Date: 2008-02-22 10:58+0100\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" +"PO-Revision-Date: 2009-06-24 17:35+0200\n" "Last-Translator: Mario Joussen \n" "Language-Team: German \n" -"Language: de\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=ISO-8859-1\n" "Content-Transfer-Encoding: 8bit\n" +"Language: de\n" #. Type: boolean #. Description #: ../mdadm.templates:1001 msgid "Should mdadm run monthly redundancy checks of the MD arrays?" msgstr "" -"Soll mdadm monatlich die Redundanzüberprüfung auf den RAID Verbünden " +"Soll mdadm monatlich die Redundanzüberprüfung auf den RAID-Verbünden " "ausführen?" #. Type: boolean @@ -42,12 +42,12 @@ "may result in write access to the media." msgstr "" "Falls Ihr Kernel es unterstützt (Versionen größer als 2.6.14) kann mdadm " -"regelmäßig die Redundanz Ihrer MD Verbünde (RAID) überprüfen. Dies kann " +"regelmäßig die Redundanz Ihrer MD-Verbünde (RAID) überprüfen. Dies kann " "abhängig von Ihrer Installation ein resourcenintensiver Vorgang sein, der " "aber helfen kann, seltene Fälle von Datenverlust zu vermeiden. Bitte " -"beachten Sie, daß diese Überprüfung nur lesend erfolgt, solange keine Fehler " -"gefunden werden. Falls Fehler gefunden werden, wird mdadm versuchen diese zu " -"beheben, was zu schreibendem Zugriff auf das Medium führen kann." +"beachten Sie, dass diese Überprüfung nur lesend erfolgt, solange keine " +"Fehler gefunden werden. Falls Fehler gefunden werden, wird mdadm versuchen, " +"diese zu beheben, was zu schreibendem Zugriff auf das Medium führen kann." #. 
Type: boolean #. Description @@ -63,7 +63,7 @@ #. Description #: ../mdadm.templates:2001 msgid "Do you want to start the MD monitoring daemon?" -msgstr "Möchten Sie den RAID Überwachungsdämon starten?" +msgstr "Möchten Sie den RAID-Überwachungsdämon starten?" #. Type: boolean #. Description @@ -72,8 +72,8 @@ "The MD (RAID) monitor daemon sends email notifications in response to " "important MD events (such as a disk failure)." msgstr "" -"Der MD (RAID) Überwachungsdämon verschickt Benachrichtigungen als Reaktion " -"auf wichtige RAID Ereignisse (wie zum Beispiel Festplattenfehler)." +"Der MD- (RAID-)Überwachungsdämon verschickt Benachrichtigungen als Reaktion " +"auf wichtige RAID-Ereignisse (wie zum Beispiel Festplattenfehler)." #. Type: boolean #. Description @@ -85,7 +85,7 @@ #. Description #: ../mdadm.templates:3001 msgid "Recipient for email notifications:" -msgstr "Empfänger der Emailbenachrichtungen:" +msgstr "Empfänger der E-Mail-Benachrichtungen:" #. Type: string #. Description @@ -94,8 +94,8 @@ "Please enter the email address of the user who should get the email " "notifications for important MD events." msgstr "" -"Geben Sie bitte die Emailadresse des Benutzers an, der die " -"Emailbenachrichtigung für wichtige MD Ereignisse erhalten soll." +"Geben Sie bitte die E-Mail-Adresse des Benutzers an, der die E-Mail-" +"Benachrichtigung für wichtigen MD-Ereignisse erhalten soll." #. Type: boolean #. Description @@ -128,15 +128,15 @@ msgstr "" #~ msgid "MD arrays needed for the root file system:" -#~ msgstr "Für das Wurzeldateisystem benötigte MD Verbünde:" +#~ msgstr "Für das Wurzeldateisystem benötigte MD folgende Verbünde:" #~ msgid "" #~ "Please enter 'all', 'none', or a space-separated list of devices such as " #~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." 
#~ msgstr "" -#~ "Bitte geben Sie \"all\", \"none\" oder eine leerzeichenseparierte " -#~ "Geräteliste wie zum Beispiel \"md0 md1\" oder \"md/1 md/d0\" ein (das " -#~ "führende \"/dev\" kann weggelassen werden)." +#~ "Bitte geben Sie »all«, »none« oder eine leerzeichenseparierte Geräteliste " +#~ "wie zum Beispiel »md0 md1« oder »md/1 md/d0« ein (das führende »/dev« kann " +#~ "weggelassen werden)." #~ msgid "for internal use - only the long description is needed." #~ msgstr "" @@ -149,8 +149,8 @@ #~ "logical volume (LVM), which is on MD, all constituent arrays need to be " #~ "started." #~ msgstr "" -#~ "Wenn das Wurzeldateisystem Ihres Systems auf einem MD Verbund (RAID) " -#~ "liegt, muß es frühzeitig während des Bootvorgangs gestartet werden. Wenn " +#~ "Wenn das Wurzeldateisystem Ihres Systems auf einem MD-Verbund (RAID) " +#~ "liegt, muss es frühzeitig während des Bootvorgangs gestartet werden. Wenn " #~ "sich Ihr Wurzeldateisystem auf einem logischen Laufwerk (LVM) befindet, " #~ "das sich wiederum auf einem MD Verbund befindet, müssen alle zugehörigen " #~ "Verbünde gestartet werden." @@ -161,21 +161,21 @@ #~ "point in the boot sequence, enter the arrays to start here. " #~ "Alternatively, enter 'all' to simply start all available arrays." #~ msgstr "" -#~ "Wenn Sie genau wissen welche Verbünde benötigt werden, um das " +#~ "Wenn Sie genau wissen, welche Verbünde benötigt werden, um das " #~ "Wurzeldateisystem zu starten, und Sie den Start der anderen Verbünde auf " #~ "einen späteren Zeitpunkt in der Bootreihenfolge verschieben wollen, geben " -#~ "Sie die zu startenden Verbünde hier ein. Alternativ geben Sie \"all\" " -#~ "ein, um alle verfügbaren Verbünde zu starten." +#~ "Sie die zu startenden Verbünde hier ein. Alternativ geben Sie »all« ein, " +#~ "um alle verfügbaren Verbünde zu starten." #~ msgid "" #~ "If you do not need or want to start any arrays for the root file system, " #~ "leave the answer blank (or enter 'none'). 
This may be the case if you are " #~ "using kernel autostart or do not need any arrays to boot." #~ msgstr "" -#~ "Falls Sie keine RAID Verbünde für das Wurzeldateisystem benötigen oder " -#~ "starten wollen, lassen Sie die Antwort leer (oder geben \"none\" ein). " -#~ "Dies könnte der Fall sein, wenn Sie entweder die Autostartfunktion des " -#~ "Kernels verwenden oder keine Verbünde zum Booten benötigen." +#~ "Falls Sie keine RAID-Verbünde für das Wurzeldateisystem benötigen oder " +#~ "starten wollen, lassen Sie die Antwort leer (oder geben »none« ein). Dies " +#~ "könnte der Fall sein, wenn Sie entweder die Autostartfunktion des Kernels " +#~ "verwenden oder keine Verbünde zum Booten benötigen." #~ msgid "An error occurred: device node does not exist" #~ msgstr "Ein Fehler ist aufgetreten: Geräteknoten existiert nicht" @@ -184,7 +184,7 @@ #~ msgstr "Ein Fehler ist aufgetreten: kein Blockgerät" #~ msgid "An error occurred: not an MD array" -#~ msgstr "Ein Fehler ist aufgetreten: kein RAID Verbund" +#~ msgstr "Ein Fehler ist aufgetreten: kein RAID-Verbund" #~ msgid "An error occurred: array not listed in mdadm.conf file" #~ msgstr "" @@ -201,7 +201,7 @@ #~ msgstr "" #~ "Der angegebene Verbund (${array}) ist in der Konfigurationsdatei " #~ "${config} nicht aufgeführt. Deshalb kann er während des Bootvorgangs " -#~ "nicht gestartet werden, es sei denn Sie korrigieren die " +#~ "nicht gestartet werden, es sei denn, Sie korrigieren die " #~ "Konfigurationsdatei und erzeugen die initiale Ramdisk neu." #~ msgid "" @@ -219,16 +219,16 @@ #~ "and enter 'none' when prompted which arrays to start from the initial " #~ "ramdisk." #~ msgstr "" -#~ "Diese Warnung ist nur von Bedeutung wenn Sie RAID Verbünde, die von der " +#~ "Diese Warnung ist nur von Bedeutung, wenn Sie RAID-Verbünde, die von der " #~ "initialen Ramdisk gestartet werden, benötigen, um booten zu können. 
Falls " -#~ "Sie die Autostartfunktion des Kernels verwenden oder kein RAID Verbund " +#~ "Sie die Autostartfunktion des Kernels verwenden oder kein RAID-Verbund " #~ "zum frühen Zeitpunkt des Ladens der initialen Ramdisk gestartet werden " -#~ "muß, können Sie einfach fortfahren. Alternativ wählen Sie nicht " -#~ "fortfahren und geben 'none' ein, wenn Sie gefragt werden, welche RAID " +#~ "muss, können Sie einfach fortfahren. Alternativ wählen Sie, nicht " +#~ "fortzufahren und geben »none« ein, wenn Sie gefragt werden, welche RAID-" #~ "Verbünde von der initialen Ramdisk gestartet werden sollen." #~ msgid "Do you want to start MD arrays automatically?" -#~ msgstr "Möchten Sie die RAID Verbünde automatisch starten?" +#~ msgstr "Möchten Sie die RAID-Verbünde automatisch starten?" #~ msgid "" #~ "Once the base system has booted, mdadm can start all MD arrays (RAIDs) " @@ -239,9 +239,9 @@ #~ "kernel)." #~ msgstr "" #~ "Sobald das Grundsystem hochgefahren ist, kann mdadm alle in /etc/mdadm/" -#~ "mdadm.conf angegebenen MD Verbünde (RAID) starten, die noch nicht " -#~ "gestartet wurden. Dies ist empfohlen, es sei denn die MD Unterstützung " -#~ "wurde in den Kernel einkompiliert und alle Partitionen, die zu MD " +#~ "mdadm.conf angegebenen MD-Verbünde (RAID) starten, die noch nicht " +#~ "gestartet wurden. Dies ist empfohlen, es sei denn, die MD-Unterstützung " +#~ "wurde in den Kernel einkompiliert und alle Partitionen, die zu MD-" #~ "Verbünden gehören, wurden mit dem Typ 0xfd markiert (weil diese und nur " #~ "diese automatisch vom Kernel gestartet werden)." @@ -257,9 +257,9 @@ #~ "earlier installations in different RAID arrays, you MUST zero each " #~ "superblock *before* activating the autostart feature." #~ msgstr "" -#~ "WARNUNG! Wenn Sie Festplatten verwenden, die bereits einen md Superblock " -#~ "von einer vorherigen Installation in einem anderen RAID Verbund besitzen, " -#~ "so MÜSSEN Sie diesen löschen, *bevor* Sie die Autostart Funktion " +#~ "WARNUNG! 
Wenn Sie Festplatten verwenden, die bereits einen md-Superblock " +#~ "von einer vorherigen Installation in einem anderen RAID-Verbund besitzen, " +#~ "so MÜSSEN Sie diesen löschen, *bevor* Sie die Autostart-Funktion " #~ "aktivieren." #~ msgid "" @@ -267,9 +267,9 @@ #~ "superblock (mdadm --zero-superblock /dev/mdX). Next, use `dpkg-" #~ "reconfigure mdadm` to reactivate the autostart feature." #~ msgstr "" -#~ "Dazu starten Sie die RAID Laufwerke nicht automatisch und löschen dann " -#~ "erst den Superblock (mdadm --zero-superblock /dev/mdX). Danach können " -#~ "Sie mit \"dpkg-reconfigure mdadm\" die Autostart Funktion aktivieren." +#~ "Dazu starten Sie die RAID-Laufwerke nicht automatisch und löschen dann " +#~ "erst den Superblock (mdadm --zero-superblock /dev/mdX). Danach können Sie " +#~ "mit »dpkg-reconfigure mdadm« die Autostart-Funktion aktivieren." #~ msgid "" #~ "You have the option to start all other arrays (those not needed for the " @@ -277,11 +277,11 @@ #~ "greater control over the arrays with the mdadm configuration file. " #~ "Starting all arrays at boot-time may be safer though." #~ msgstr "" -#~ "Sie haben die Option alle anderen Verbünde (diese die nicht für das " -#~ "Wurzeldateisystem benötigt werden) später während des Bootvorgangs " -#~ "zu starten. Damit haben Sie größere Kontrolle über die Verbünde " -#~ "mit Hilfe der mdadm Konfigurationsdatei. Es ist jedoch sicherer, alle " -#~ "Verbünde beim Booten zu starten." +#~ "Sie haben die Option, alle anderen Verbünde (diese die nicht für das " +#~ "Wurzeldateisystem benötigt werden) später während des Bootvorgangs zu " +#~ "starten. Damit haben Sie größere Kontrolle über die Verbünde mit Hilfe " +#~ "der mdadm-Konfigurationsdatei. Es ist jedoch sicherer, alle Verbünde beim " +#~ "Booten zu starten." 
#~ msgid "" #~ "If RAID devices are started automatically, all RAID devices are " @@ -290,18 +290,18 @@ #~ "compiled into your kernel, the automatic startup will be performed at " #~ "boot time by the kernel and therefore you should not choose this option." #~ msgstr "" -#~ "Wenn die RAID Laufwerke automatisch gestartet werden, werden alle RAID " +#~ "Wenn die RAID-Laufwerke automatisch gestartet werden, werden alle RAID-" #~ "Laufwerke beim Systemstart automatisch gefunden und gestartet. Diese " -#~ "Option sollte nur benutzt werden, falls der md Treiber als Modul " -#~ "kompiliert wurde. Falls er in den Kernel einkompiliert wurde, führt der " +#~ "Option sollte nur benutzt werden, falls der md-Treiber als Modul " +#~ "kompiliert wurde. Falls er in den Kernel einkompiliert wurde, führt der " #~ "Kernel den automatischen Start beim Booten durch und Sie sollten diese " -#~ "Option deshalb nicht auswählen." +#~ "Option deshalb nicht auswählen." #~ msgid "" #~ "When the RAID monitor daemon runs, email notifications are sent when a " #~ "disk belonging to a RAID array fails or changes its status for some " #~ "reason." #~ msgstr "" -#~ "Wird der RAID-Überwachungsdämon gestartet, so werden Email-" -#~ "Benachrichtigungen verschickt, falls ein zum RAID gehörendes Laufwerk " -#~ "ausfällt oder den Status ändert." +#~ "Wird der RAID-Überwachungsdaemon gestartet, so werden E-Mail-" +#~ "Benachrichtigungen verschickt, falls ein zum RAID gehörendes Laufwerk " +#~ "ausfällt oder den Status ändert." 
diff -Nru mdadm-2.6.7.1/debian/po/es.po mdadm-3.1.4/debian/po/es.po --- mdadm-2.6.7.1/debian/po/es.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/es.po 2010-10-18 13:57:39.000000000 +0300 @@ -9,7 +9,7 @@ # Fernando Cerezal # # -# Traductores, si no conoce el formato PO, merece la pena leer la +# Traductores, si no conoce el formato PO, merece la pena leer la # documentación de gettext, especialmente las secciones dedicadas a este # formato, por ejemplo ejecutando: # info -n '(gettext)PO Files' @@ -39,14 +39,14 @@ msgstr "" "Project-Id-Version: mdadm 2.5.6-6\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: 2008-04-25 17:47+0200\n" "Last-Translator: Javier Fernández-Sanguino \n" "Language-Team: Debian Spanish \n" -"Language: \n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=ISO-8859-15\n" "Content-Transfer-Encoding: 8bit\n" +"Language: \n" #. Type: boolean #. Description @@ -166,8 +166,7 @@ #~ msgstr "" #~ "Introduzca «all» (todos), «none» (ninguno) o una lista de dispositivos " #~ "separados por espacios como por ejemplo puede sólo introducir «md0 md1» o " -#~ "«md/1 md/d0» (no tiene que preceder los nombres de dispositivos con «/" -#~ "dev»)." +#~ "«md/1 md/d0» (no tiene que preceder los nombres de dispositivos con «/dev»)." #~ msgid "for internal use - only the long description is needed." #~ msgstr "para uso interno. Sólo se utiliza la descripción larga." @@ -236,8 +235,8 @@ #~ "Please refer to /usr/share/doc/mdadm/README.upgrading-2.5.3.gz if you " #~ "intend to continue." #~ msgstr "" -#~ "Si desea continuar debería leer «/usr/share/doc/mdadm/README." -#~ "upgrading-2.5.3.gz»." +#~ "Si desea continuar debería leer «/usr/share/doc/mdadm/README.upgrading-" +#~ "2.5.3.gz»." 
#~ msgid "" #~ "This warning is only relevant if you need arrays to be started from the " @@ -268,9 +267,9 @@ #~ "Mdadm puede iniciar todos los arrays MD (RAIDs) especificados en «/etc/" #~ "mdadm/mdadm.conf» una vez se haya arrancado el sistema base, siempre que " #~ "éstos no se hayan arrancado aún. Esto es probablemente lo que desea a no " -#~ "ser que haya compilado el soporte de dispositivos múltiples (MD, " -#~ "«multiple devices») en el núcleo y marcado todas las particiones que son " -#~ "parte de los arrays MD con el tipo «0xfd» (ya que éstas, y sólo éstas, se " +#~ "ser que haya compilado el soporte de dispositivos múltiples (MD, «multiple " +#~ "devices») en el núcleo y marcado todas las particiones que son parte de " +#~ "los arrays MD con el tipo «0xfd» (ya que éstas, y sólo éstas, se " #~ "arrancarán de forma automática por el núcleo)." #~ msgid "${msg}" diff -Nru mdadm-2.6.7.1/debian/po/eu.po mdadm-3.1.4/debian/po/eu.po --- mdadm-2.6.7.1/debian/po/eu.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/eu.po 2010-10-18 13:57:39.000000000 +0300 @@ -7,14 +7,14 @@ msgstr "" "Project-Id-Version: mdadm-debconf\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: Y2008-04-30 11:00+0100\n" "Last-Translator: Piarres Beobide \n" "Language-Team: Euskara \n" -"Language: \n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" +"Language: \n" #. Type: boolean #. 
Description diff -Nru mdadm-2.6.7.1/debian/po/fi.po mdadm-3.1.4/debian/po/fi.po --- mdadm-2.6.7.1/debian/po/fi.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/fi.po 2010-10-18 13:57:39.000000000 +0300 @@ -2,14 +2,14 @@ msgstr "" "Project-Id-Version: mdadm\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: 2008-02-14 11:24+0200\n" "Last-Translator: Esko Arajärvi \n" "Language-Team: Finnish \n" -"Language: fi\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" +"Language: fi\n" "X-Poedit-Language: Finnish\n" "X-Poedit-Country: FINLAND\n" diff -Nru mdadm-2.6.7.1/debian/po/fr.po mdadm-3.1.4/debian/po/fr.po --- mdadm-2.6.7.1/debian/po/fr.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/fr.po 2010-10-18 13:57:39.000000000 +0300 @@ -10,14 +10,14 @@ msgstr "" "Project-Id-Version: mdadm\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: 2008-02-15 20:42+0100\n" "Last-Translator: Florentin Duneau \n" "Language-Team: French \n" -"Language: fr\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" +"Language: fr\n" "Plural-Forms: nplurals=2; plural=(n > 1);\n" "X-Generator: KBabel 1.11.4\n" @@ -131,9 +131,9 @@ #~ "Please enter 'all', 'none', or a space-separated list of devices such as " #~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." #~ msgstr "" -#~ "Veuillez indiquer « all », « none » ou une liste de périphériques, " -#~ "séparés par des espaces, par exemple, « md0 md1 » ou « md/1 md/d0 » (vous " -#~ "pouvez omettre « /dev/ »)." 
+#~ "Veuillez indiquer « all », « none » ou une liste de périphériques, séparés " +#~ "par des espaces, par exemple, « md0 md1 » ou « md/1 md/d0 » (vous pouvez " +#~ "omettre « /dev/ »)." #~ msgid "for internal use - only the long description is needed." #~ msgstr "" diff -Nru mdadm-2.6.7.1/debian/po/gl.po mdadm-3.1.4/debian/po/gl.po --- mdadm-2.6.7.1/debian/po/gl.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/gl.po 2010-10-18 13:57:39.000000000 +0300 @@ -6,14 +6,14 @@ msgstr "" "Project-Id-Version: mdadm\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: 2008-02-06 23:45+0000\n" "Last-Translator: Jacobo Tarrio \n" "Language-Team: Galician \n" -"Language: gl\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" +"Language: gl\n" #. Type: boolean #. Description @@ -127,8 +127,8 @@ #~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." #~ msgstr "" #~ "Introduza \"all\" (todos), \"none\" (ningún) ou unha lista de " -#~ "dispositivos separados por espazos, tales coma \"md0 md1\" ou \"md/1 " -#~ "md/0\" (pódese omitir o \"/dev/\" do principio)." +#~ "dispositivos separados por espazos, tales coma \"md0 md1\" ou \"md/1 md/0" +#~ "\" (pódese omitir o \"/dev/\" do principio)." #~ msgid "for internal use - only the long description is needed." #~ msgstr "para uso interno - só se precisa da descrición longa." diff -Nru mdadm-2.6.7.1/debian/po/it.po mdadm-3.1.4/debian/po/it.po --- mdadm-2.6.7.1/debian/po/it.po 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/po/it.po 2010-10-18 13:57:39.000000000 +0300 @@ -0,0 +1,232 @@ +# Italian (it) translation of debconf templates for mdadm +# Copyright (C) 2008 Software in the Public Interest +# This file is distributed under the same license as the mdadm package. +# Luca Monducci , 2008. 
+# +msgid "" +msgstr "" +"Project-Id-Version: mdadm italian debconf\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" +"PO-Revision-Date: 2008-11-19 11:02+0100\n" +"Last-Translator: Luca Monducci \n" +"Language-Team: Italian \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Language: it\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:1001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Far eseguire a mdadm i controlli mensili di ridondanza sugli array MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:1001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Se il kernel lo supporta (tutte le versioni successive la 2.6.14), mdadm può " +"effettuare delle verifiche periodiche sulla ridondanza degli array MD " +"(RAID). Questo è un processo che potrebbe richiedere molte risorse, in base " +"alle impostazioni locali, ma può prevenire i rari casi di perdita di dati. " +"Notare che questa verifica è di sola-lettura tranne quando riscontra degli " +"errori; quando ci sono errori, mdadm prova a correggerli e potrebbe accedere " +"in scrittura al supporto." + +#. Type: boolean +#. Description +#: ../mdadm.templates:1001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Se attivo, la configurazione predefinita prevede che il controllo sia " +"eseguito la prima domenica di ogni mese alle 01.06." + +#. Type: boolean +#. 
Description +#: ../mdadm.templates:2001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Avviare il demone di monitoraggio MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Il demone di monitoraggio MD (RAID) invia delle notifiche via email quando " +"si verificano eventi importanti (come la rottura di un disco)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Enabling this option is recommended." +msgstr "Si raccomanda l'attivazione di questa funzione." + +#. Type: string +#. Description +#: ../mdadm.templates:3001 +msgid "Recipient for email notifications:" +msgstr "Destinatario delle email di notifica:" + +#. Type: string +#. Description +#: ../mdadm.templates:3001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Inserire l'indirizzo email dell'utente che deve ricevere le notifiche di " +"eventi importanti legati al MD." + +#. Type: boolean +#. Description +#: ../mdadm-udeb.templates:1001 +msgid "Do you want to boot your system if your RAID becomes degraded?" +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm-udeb.templates:1001 +msgid "" +"If your root filesystem is on a RAID, and a disk is missing at boot, it can " +"either boot with the degraded array, or hold the system at a recovery shell." +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm-udeb.templates:1001 +msgid "" +"Running a system with a degraded RAID could result in permanent data loss if " +"it suffers another hardware fault." +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm-udeb.templates:1001 +msgid "" +"If you do not have access to the server console to use the recovery shell, " +"you might answer \"yes\" to enable the system to boot unattended." 
+msgstr "" + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Array MD necessari per il file system di root:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Inserire \"all\", \"none\" oppure un elenco dei device separati da uno " +#~ "spazio, per esempio \"md0 md1\" o \"md/1 md/d0\" (il \"/dev/\" iniziale " +#~ "può essere omesso)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "uso interno - è necessaria solo la descrizione lunga." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Se il file system di root è su un array MD (RAID), è necessario attivare " +#~ "tale array all'inizio della sequenza d'avvio. Se è su un volume logico " +#~ "(LVM), il quale è su un MD, è necessario attivare tutti gli array che " +#~ "costituiscono il volume." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Se si conoscono esattamente quali sono gli array da attivare per il file " +#~ "system di root e si vuole rimandare l'attivazione di tutti gli altri " +#~ "array a una fase successiva della sequenza d'avvio, inserire adesso gli " +#~ "array da attivare. In alternativa, inserire \"all\" per attivare tutti " +#~ "gli array disponibili." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). 
This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Se non si ha bisogno o non si vuole attivare nessun array per il file " +#~ "system di root, lasciare la risposta in bianco (oppure inserire \"none" +#~ "\"). Questo potrebbe essere il caso se si utilizza l'attivazione " +#~ "automatica da kernel oppure se non si ha bisogno di alcun array per " +#~ "l'avvio." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Errore: il nodo del device non esiste" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Errore: non è un device a blocchi" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Errore: non è un array MD" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "Errore: array non elencato nel file mdadm.conf" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Avviare gli array non elencati in mdadm.conf?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "L'array specificato (${array}) non è presente nel file di configurazione " +#~ "(${config}): quindi non può essere attivato durante l'avvio senza " +#~ "correggere il file di configurazione e ricreare il ramdisk iniziale." + +#~ msgid "" +#~ "Please refer to /usr/share/doc/mdadm/README.upgrading-2.5.3.gz if you " +#~ "intend to continue." +#~ msgstr "" +#~ "Fare riferimento a /usr/share/doc/mdadm/README.upgrading-2.5.3.gz se si " +#~ "intende proseguire." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. 
Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Questo avviso è pertinente solo se è necessario attivare gli array dal " +#~ "ramdisk iniziale per permettere l'avvio. Con l'avvio automatico da kernel " +#~ "o se non è necessario attivare gli array così presto come al caricamento " +#~ "del ramdisk iniziale, si può proseguire. In alternativa, scegliere di non " +#~ "continuare e inserire \"none\" quando viene chiesto quali array attivare " +#~ "dal ramdisk iniziale." + +#~ msgid "Do you want to start MD arrays automatically?" +#~ msgstr "Avviare gli array MD automaticamente?" + +#~ msgid "" +#~ "Once the base system has booted, mdadm can start all MD arrays (RAIDs) " +#~ "specified in /etc/mdadm/mdadm.conf which have not yet been started. This " +#~ "is recommended unless multiple device (MD) support is compiled into the " +#~ "kernel and all partitions are marked as belonging to MD arrays, with type " +#~ "0xfd (as those and only those will be started automatically by the " +#~ "kernel)." +#~ msgstr "" +#~ "Una volta avviato il sistema di base, mdadm può attivare tutti gli array " +#~ "MD (RAID) specificati in /etc/mdadm/mdadm.conf non ancora attivi. Questa " +#~ "è la configurazione consigliata tranne quando il supporto per i device " +#~ "multidisco (MD) è compilato nel kernel e tutte le partizioni appartenenti " +#~ "agli array MD sono marcate con tipo 0xfd (come quelle che sono attivate " +#~ "automaticamente dal kernel)." 
diff -Nru mdadm-2.6.7.1/debian/po/ja.po mdadm-3.1.4/debian/po/ja.po --- mdadm-2.6.7.1/debian/po/ja.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/ja.po 2010-10-18 13:57:39.000000000 +0300 @@ -16,14 +16,14 @@ msgstr "" "Project-Id-Version: mdadm 2.6.3+200709292116+4450e59-4\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: 2008-02-07 05:52+0900\n" "Last-Translator: Hideki Yamane (Debian-JP) \n" "Language-Team: Japanese \n" -"Language: ja\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" +"Language: ja\n" #. Type: boolean #. Description diff -Nru mdadm-2.6.7.1/debian/po/nl.po mdadm-3.1.4/debian/po/nl.po --- mdadm-2.6.7.1/debian/po/nl.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/nl.po 2010-10-18 13:57:39.000000000 +0300 @@ -16,14 +16,14 @@ msgstr "" "Project-Id-Version: mdadm_2.6.3+200709292116+4450e59-4\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: 2008-02-19 14:04+0100\n" "Last-Translator: Frans Pop \n" "Language-Team: Dutch \n" -"Language: nl\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" +"Language: nl\n" "X-Generator: KBabel 1.11.4\n" #. 
Type: boolean diff -Nru mdadm-2.6.7.1/debian/po/pt_BR.po mdadm-3.1.4/debian/po/pt_BR.po --- mdadm-2.6.7.1/debian/po/pt_BR.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/pt_BR.po 2010-10-18 13:57:39.000000000 +0300 @@ -15,14 +15,14 @@ msgstr "" "Project-Id-Version: mdadm\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: 2006-09-24 19:22-0300\n" "Last-Translator: Felipe Augusto van de Wiel (faw) \n" "Language-Team: l10n portuguese \n" -"Language: \n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" +"Language: \n" "pt_BR utf-8\n" #. Type: boolean @@ -33,13 +33,6 @@ "O mdadm deve, mensalmente, executar checagens de redundância dos " "dispositivos MD?" -# | msgid "" -# | "If your kernel supports it (>> 2.6.14), mdadm can periodically check the " -# | "redundancy of your MD arrays (RAIDs). This may be a resource-intensive " -# | "process, depending on your setup, but it could help prevent rare cases of " -# | "data loss. Note that this is a read-only check unless errors are found; " -# | "if errors are found, mdadm will try to correct them, which may result in " -# | "write access to the media." #. Type: boolean #. Description #: ../mdadm.templates:1001 @@ -60,9 +53,6 @@ "encontrados, mdadm tentará corrigí-los, o que poderá resultar em acesso de " "escrita na mídia." -# | msgid "" -# | "The default, if turned on, is to run the checks on the first Sunday of " -# | "every month at 01:06 o'clock." #. Type: boolean #. Description #: ../mdadm.templates:1001 @@ -80,10 +70,6 @@ msgid "Do you want to start the MD monitoring daemon?" msgstr "Você deseja iniciar o \"daemon\" de monitoramento MD?" -# | msgid "" -# | "The MD (RAID) monitor daemon sends email notifications in response to " -# | "important MD events (such as a disk failure). You probably want to enable " -# | "it." #. 
Type: boolean #. Description #: ../mdadm.templates:2001 @@ -108,9 +94,6 @@ msgid "Recipient for email notifications:" msgstr "Destinatário para os e-mails de notificações:" -# | msgid "" -# | "Please enter the email address of the user who should get the email " -# | "notification for important MD events." #. Type: string #. Description #: ../mdadm.templates:3001 @@ -152,15 +135,10 @@ "you might answer \"yes\" to enable the system to boot unattended." msgstr "" -# | msgid "MD arrays needed for the root filesystem:" #, fuzzy #~ msgid "MD arrays needed for the root file system:" #~ msgstr "Dispositivos MD necessários para o sistema de arquivos raiz:" -# | msgid "" -# | "Please enter a space-separated list of devices, 'all', or 'none'. You may " -# | "omit the leading '/dev/' and just enter e.g. \"md0 md1\", or \"md/1 md/d0" -# | "\"." #, fuzzy #~ msgid "" #~ "Please enter 'all', 'none', or a space-separated list of devices such as " @@ -173,11 +151,6 @@ #~ msgid "for internal use - only the long description is needed." #~ msgstr "para uso interno - apenas a descrição longa é necessária." -# | msgid "" -# | "If your system has its root filesystem on an MD array (RAID), it needs to " -# | "be started early during the boot sequence. If your root filesystem is on " -# | "a logical volume (LVM), which is on MD, all constituent arrays need to be " -# | "started." #, fuzzy #~ msgid "" #~ "If the system's root file system is located on an MD array (RAID), it " @@ -191,11 +164,6 @@ #~ "(LVM), que está em um MD, todos os dispositivos que o constituem precisam " #~ "ser iniciados." -# | msgid "" -# | "If you know exactly which arrays are needed to bring up the root " -# | "filesystem, and you want to postpone starting all other arrays to a later " -# | "point in the boot sequence, enter the arrays to start here. " -# | "Alternatively, enter 'all' to simply start all available arrays." 
#, fuzzy #~ msgid "" #~ "If you know exactly which arrays are needed to bring up the root file " @@ -209,10 +177,6 @@ #~ "informe os dispositivos a serem iniciados aqui. Como alternativa, informe " #~ "'all' para simplesmente iniciar todos os dispositivos disponíveis." -# | msgid "" -# | "If you do not need or want to start any arrays for the root filesystem, " -# | "leave the answer blank (or enter 'none'). This may be the case if you are " -# | "using kernel autostart or do not need any arrays to boot." #, fuzzy #~ msgid "" #~ "If you do not need or want to start any arrays for the root file system, " @@ -236,16 +200,10 @@ #~ msgid "An error occurred: array not listed in mdadm.conf file" #~ msgstr "Um erro ocorreu: dispositivo não listado no arquivo mdadm.conf" -# | msgid "Proceed with starting arrays not listed in mdadm.conf?" #, fuzzy #~ msgid "Start arrays not listed in mdadm.conf?" #~ msgstr "Continuar com o início de dispositivos não listados no mdadm.conf?" -# | msgid "" -# | "The array you have specified (${array}) is not listed in the " -# | "configuration file ${config}. Therefore it cannot be started during boot, " -# | "unless you correct the configuration file and recreate the initial " -# | "ramdisk." #, fuzzy #~ msgid "" #~ "The specified array (${array}) is not listed in the configuration file " @@ -283,13 +241,6 @@ #~ msgid "Do you want to start MD arrays automatically?" #~ msgstr "Você deseja iniciar os dispositivos MD automaticamente?" -# | msgid "" -# | "Once the base system has come up, mdadm can start all MD arrays (RAIDs) " -# | "specified in /etc/mdadm/mdadm.conf, which have not yet been started. " -# | "Unless you have compiled multiple device (MD) support into the kernel and " -# | "marked all partitions part of MD arrays with type 0xfd (as those and only " -# | "those will be started automatically by the kernel), this is probably what " -# | "you want." 
#, fuzzy #~ msgid "" #~ "Once the base system has booted, mdadm can start all MD arrays (RAIDs) " diff -Nru mdadm-2.6.7.1/debian/po/pt.po mdadm-3.1.4/debian/po/pt.po --- mdadm-2.6.7.1/debian/po/pt.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/pt.po 2010-10-18 13:57:39.000000000 +0300 @@ -7,14 +7,14 @@ msgstr "" "Project-Id-Version: mdadm 2.6.3+200709292116+4450e59-4\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: 2008-02-21 00:15+0000\n" "Last-Translator: Pedro Ribeiro \n" "Language-Team: Portuguese \n" -"Language: pt\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" +"Language: pt\n" #. Type: boolean #. Description diff -Nru mdadm-2.6.7.1/debian/po/ru.po mdadm-3.1.4/debian/po/ru.po --- mdadm-2.6.7.1/debian/po/ru.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/ru.po 2010-10-18 13:57:39.000000000 +0300 @@ -15,17 +15,17 @@ msgstr "" "Project-Id-Version: 2.6.3+200709292116+4450e59-4\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: 2008-02-07 21:02+0300\n" "Last-Translator: Yuri Kozlov \n" "Language-Team: Russian \n" -"Language: ru\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" +"Language: ru\n" "X-Generator: KBabel 1.11.4\n" -"Plural-Forms: nplurals=3; plural=(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n" -"%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2);\n" +"Plural-Forms: nplurals=3; plural=(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%" +"10<=4 && (n%100<10 || n%100>=20) ? 1 : 2);\n" #. Type: boolean #. 
Description diff -Nru mdadm-2.6.7.1/debian/po/sv.po mdadm-3.1.4/debian/po/sv.po --- mdadm-2.6.7.1/debian/po/sv.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/sv.po 2010-10-18 13:57:39.000000000 +0300 @@ -14,14 +14,14 @@ msgstr "" "Project-Id-Version: mdadm_2.6.7-3_sv\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: 2008-07-23 18:34+0200\n" "Last-Translator: Martin Ågren \n" "Language-Team: Swedish \n" -"Language: sv\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=ISO-8859-1\n" "Content-Transfer-Encoding: 8bit\n" +"Language: sv\n" "X-Generator: KBabel 1.11.4\n" "Plural-Forms: nplurals=2; plural=(n != 1);\n" diff -Nru mdadm-2.6.7.1/debian/po/templates.pot mdadm-3.1.4/debian/po/templates.pot --- mdadm-2.6.7.1/debian/po/templates.pot 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/templates.pot 2010-10-18 13:57:39.000000000 +0300 @@ -8,7 +8,7 @@ msgstr "" "Project-Id-Version: PACKAGE VERSION\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" diff -Nru mdadm-2.6.7.1/debian/po/vi.po mdadm-3.1.4/debian/po/vi.po --- mdadm-2.6.7.1/debian/po/vi.po 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/po/vi.po 2010-10-18 13:57:39.000000000 +0300 @@ -6,14 +6,14 @@ msgstr "" "Project-Id-Version: mdadm 2.6.3+200709292116+4450e59-4\n" "Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" -"POT-Creation-Date: 2008-12-18 19:35-0600\n" +"POT-Creation-Date: 2010-10-18 13:57+0300\n" "PO-Revision-Date: 2008-02-23 17:40+1030\n" "Last-Translator: Clytie Siddall \n" "Language-Team: Vietnamese \n" -"Language: vi\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" +"Language: 
vi\n" "Plural-Forms: nplurals=1; plural=0;\n" "X-Generator: LocFactoryEditor 1.7b3\n" @@ -197,8 +197,8 @@ #~ "Please refer to /usr/share/doc/mdadm/README.upgrading-2.5.3.gz if you " #~ "intend to continue." #~ msgstr "" -#~ "Hãy tham chiếu đến tài liệu Äá»c Äi « /usr/share/doc/mdadm/README." -#~ "upgrading-2.5.3.gz » nếu bạn định tiếp tục." +#~ "Hãy tham chiếu đến tài liệu Äá»c Äi « /usr/share/doc/mdadm/README.upgrading-" +#~ "2.5.3.gz » nếu bạn định tiếp tục." #~ msgid "" #~ "This warning is only relevant if you need arrays to be started from the " diff -Nru mdadm-2.6.7.1/debian/README.checkarray mdadm-3.1.4/debian/README.checkarray --- mdadm-2.6.7.1/debian/README.checkarray 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/README.checkarray 2010-09-30 19:46:18.000000000 +0300 @@ -7,6 +7,18 @@ /etc/cron.d/mdadm, but then only running the script when the day of the month is less than or equal to 7. See #380425. +Cron will try to run the check at "idle I/O priority" (see ionice(1)), so that +the check does not overload the system too much. Note that this will only +work if all the component devices of the array employ the (default) "cfq" I/O +scheduler. See the kernel documentation[0] for information on how to verify +and modify the scheduler. checkarray does not verify this for you. + + 0. http://www.kernel.org/doc/Documentation/block/switching-sched.txt + +If you manually invoke checkarray, it runs with default I/O priority. Should +you need to run a check at a higher (or lower) I/O priority, then have a look +at the --idle, --slow, --fast, and --realtime options. + 'check' is a read-only operation, even though the kernel logs may suggest otherwise (e.g. /proc/mdstat and several kernel messages will mention "resync"). Please also see question 21 of the FAQ. @@ -18,4 +30,4 @@ You can cancel a running array check with the -x option to checkarray. - -- martin f. krafft Mon, 08 Jan 2007 02:07:19 +0100 + -- martin f. 
krafft Thu, 02 Sep 2010 10:27:29 +0200 diff -Nru mdadm-2.6.7.1/debian/README.recipes mdadm-3.1.4/debian/README.recipes --- mdadm-2.6.7.1/debian/README.recipes 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/README.recipes 2010-09-30 19:46:18.000000000 +0300 @@ -9,7 +9,7 @@ Enjoy. Submissions welcome. The latest version of this document is available here: - http://svn.debian.org/wsvn/pkg-mdadm/mdadm/trunk/debian/README.recipes?op=file&rev=0&sc=0 + http://git.debian.org/?p=pkg-mdadm/mdadm.git;a=blob;f=debian/README.recipes;hb=HEAD 0. create a new array ~~~~~~~~~~~~~~~~~~~~~ @@ -86,7 +86,7 @@ 10. convert existing filesystem to RAID 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # The idea is to create a degraded RAID 1 on the second partition, move - # data, then hot add the first. This seems safer to me than simply to + # data, then hot add the first. This seems safer to me than simply to # force-add a superblock to the existing filesystem. # # Assume /dev/sda1 holds the data (and let's assume it's mounted on diff -Nru mdadm-2.6.7.1/debian/README.source mdadm-3.1.4/debian/README.source --- mdadm-2.6.7.1/debian/README.source 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/README.source 2010-09-03 11:44:09.000000000 +0300 @@ -0,0 +1,18 @@ +Building mdadm for Debian +------------------------- + +The mdadm source package uses quilt to apply and remove its patches. Please +refer to /usr/share/doc/quilt/README.source for information about how to use +quilt for source packages. + +The quilt series is generated from the Git repository, using TopGit. +This process is documented in /usr/share/doc/topgit/HOWTO-tg2quilt.gz . + +The mdadm packages uses the following branch layout: + + fixes/* patches destined to go upstream + contrib/* contributed content + contrib/docs/* additional documentation + debian/* debian-specific changes + + -- martin f. 
krafft Tue, 27 Oct 2009 18:51:13 +0100 diff -Nru mdadm-2.6.7.1/debian/rules mdadm-3.1.4/debian/rules --- mdadm-2.6.7.1/debian/rules 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/rules 2010-10-18 13:29:41.000000000 +0300 @@ -40,18 +40,25 @@ $(MAKE) $(FLAGS) all .PHONY: mdadm +INTERPOLATED_FILES = debian/bugscript debian/mkconf + clean: dh_testdir dh_testroot + rm -f $(INTERPOLATED_FILES) rm -f build-stamp [ ! -f Makefile ] || $(MAKE) clean rm -f mdadm.udeb mdadm dh_clean debconf-updatepo +debian/%: VERSION=$(shell dpkg-parsechangelog | sed -ne 's,^Version: ,,p') +debian/%: debian/%.in + sed -re 's,%VERSION%,$(VERSION),g' < $< > $@ + install: DESTDIR=$(CURDIR)/debian/mdadm install: DESTDIR_UDEB=$(DESTDIR)-udeb -install: build +install: build $(INTERPOLATED_FILES) dh_testdir dh_clean -k dh_installdirs @@ -72,9 +79,8 @@ install -m0755 debian/checkarray $(DESTDIR)/usr/share/mdadm install -m0755 debian/bugscript $(DESTDIR)/usr/share/bug/mdadm/script install -m0755 debian/source_mdadm.py $(DESTDIR)/usr/share/apport/package-hooks/ - - install -m0644 debian/mdadm.lintian-overrides \ - $(DESTDIR)/usr/share/lintian/overrides/mdadm + install -m0755 mdadm.udeb $(DESTDIR_UDEB)/sbin/mdadm + install -D -m0644 udev-md-raid.rules $(DESTDIR_UDEB)/lib/udev/rules.d/64-md-raid.rules binary-indep: build install @@ -85,7 +91,7 @@ echo >> debian/mdadm/DEBIAN/templates po2debconf debian/mdadm-udeb.templates >> debian/mdadm/DEBIAN/templates dh_installdocs - dh_installexamples debian/newdisk + dh_installexamples contrib/mdadd.sh dh_installinit -- defaults 25 dh_installudev --priority=85 dh_installudev --priority=65 --name=mdadm-blkid @@ -95,7 +101,7 @@ dh_installlogcheck dh_link dh_strip - dh_compress + dh_compress -Xmdadd.sh dh_fixperms dh_installdeb dh_shlibdeps diff -Nru mdadm-2.6.7.1/debian/source/format mdadm-3.1.4/debian/source/format --- mdadm-2.6.7.1/debian/source/format 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/debian/source/format 2010-10-18 14:20:55.927166901 
+0300 @@ -0,0 +1 @@ +3.0 (quilt) diff -Nru mdadm-2.6.7.1/debian/watch mdadm-3.1.4/debian/watch --- mdadm-2.6.7.1/debian/watch 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/debian/watch 2010-09-30 19:46:18.000000000 +0300 @@ -1,2 +1,2 @@ version=3 -http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-([.[:digit:]]+).tgz +http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-([.[:digit:]]+).tar.gz diff -Nru mdadm-2.6.7.1/Detail.c mdadm-3.1.4/Detail.c --- mdadm-2.6.7.1/Detail.c 2008-10-15 06:34:28.000000000 +0300 +++ mdadm-3.1.4/Detail.c 2010-08-26 05:24:15.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,17 +19,13 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" #include "md_p.h" #include "md_u.h" +#include int Detail(char *dev, int brief, int export, int test, char *homehost) { @@ -38,7 +34,7 @@ * GET_ARRAY_INFO and GET_DISK_INFO ioctl calls */ - int fd = open(dev, O_RDONLY, 0); + int fd = open(dev, O_RDONLY); int vers; mdu_array_info_t array; mdu_disk_info_t *disks; @@ -52,10 +48,12 @@ int is_26 = get_linux_version() >= 2006000; int is_rebuilding = 0; int failed = 0; - struct supertype *st = NULL; + struct supertype *st; int max_disks = MD_SB_DISKS; /* just a default */ struct mdinfo info; struct mdinfo *sra; + char *member = NULL; + char *container = NULL; int rv = test ? 
4 : 1; int avail_disks = 0; @@ -96,12 +94,28 @@ stb.st_rdev = 0; rv = 0; - if (st) max_disks = st->max_devs; + if (st) + max_disks = st->max_devs; + + if (sra && is_subarray(sra->text_version) && + strchr(sra->text_version+1, '/')) { + /* This is a subarray of some container. + * We want the name of the container, and the member + */ + char *s = strchr(sra->text_version+1, '/'); + int dn; + *s++ = '\0'; + member = s; + dn = devname2devnum(sra->text_version+1); + container = map_dev(dev2major(dn), dev2minor(dn), 1); + } /* try to load a superblock */ - for (d= 0; dsb) && - (disk.state & (1<=0 && st && - st->ss->load_super(st, fd2, NULL) == 0) { - st->ss->getinfo_super(st, &info); - if (info.array.ctime != array.ctime || - info.array.level != array.level) - st->ss->free_super(st); - } - if (fd2 >= 0) close(fd2); - } + + if (array.raid_disks > 0 && + (disk.state & (1 << MD_DISK_ACTIVE)) == 0) + continue; + + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + + fd2 = dev_open(dv, O_RDONLY); + if (fd2 < 0) + continue; + + if (st->sb) + st->ss->free_super(st); + + err = st->ss->load_super(st, fd2, NULL); + close(fd2); + if (err) + continue; + st->ss->getinfo_super(st, &info); + + if (array.raid_disks != 0 && /* container */ + (info.array.ctime != array.ctime || + info.array.level != array.level)) { + st->ss->free_super(st); + continue; + } + /* some formats (imsm) have free-floating-spares + * with a uuid of uuid_match_any, they don't + * have very good info about the rest of the + * container, so keep searching when + * encountering such a device. Otherwise, stop + * after the first successful call to + * ->load_super. + */ + if (memcmp(uuid_match_any, + info.uuid, + sizeof(uuid_match_any)) == 0) { + st->ss->free_super(st); + continue; } + break; } /* Ok, we have some info to print... 
*/ c = map_num(pers, array.level); if (export) { - if (c) - printf("MD_LEVEL=%s\n", c); - printf("MD_DEVICES=%d\n", array.raid_disks); - if (sra && sra->array.major_version < 0) - printf("MD_METADATA=%s\n", sra->text_version); - else - printf("MD_METADATA=%02d.%02d\n", - array.major_version, array.minor_version); - - if (st && st->sb) - st->ss->export_detail_super(st); + if (array.raid_disks) { + if (c) + printf("MD_LEVEL=%s\n", c); + printf("MD_DEVICES=%d\n", array.raid_disks); + } else { + printf("MD_LEVEL=container\n"); + printf("MD_DEVICES=%d\n", array.nr_disks); + } + if (container) { + printf("MD_CONTAINER=%s\n", container); + printf("MD_MEMBER=%s\n", member); + } else { + if (sra && sra->array.major_version < 0) + printf("MD_METADATA=%s\n", sra->text_version); + else + printf("MD_METADATA=%d.%d\n", + array.major_version, array.minor_version); + } + + if (st && st->sb) { + char nbuf[64]; + struct map_ent *mp, *map = NULL; + + fname_from_uuid(st, &info, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf+5); + mp = map_by_uuid(&map, info.uuid); + if (mp && mp->path && + strncmp(mp->path, "/dev/md/", 8) == 0) + printf("MD_DEVNAME=%s\n", mp->path+8); + + if (st->ss->export_detail_super) + st->ss->export_detail_super(st); + } else { + struct map_ent *mp, *map = NULL; + char nbuf[64]; + mp = map_by_devnum(&map, fd2devnum(fd)); + if (mp) { + __fname_from_uuid(mp->uuid, 0, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf+5); + } + if (mp && mp->path && + strncmp(mp->path, "/dev/md/", 8) == 0) + printf("MD_DEVNAME=%s\n", mp->path+8); + } goto out; } + disks = malloc(max_disks * sizeof(mdu_disk_info_t)); + for (d=0; d= 0 && disk.raid_disk < array.raid_disks) + disks[disk.raid_disk] = disk; + else if (next < max_disks) + disks[next++] = disk; + } + + avail = calloc(array.raid_disks, 1); + + for (d= 0; d < array.raid_disks; d++) { + mdu_disk_info_t disk = disks[d]; + + if ((disk.state & (1<array.major_version < 0) - printf(" metadata=%s", sra->text_version); - else - printf(" 
metadata=%02d.%02d", - array.major_version, array.minor_version); + mdu_bitmap_file_t bmf; + printf("ARRAY %s", dev); + if (brief > 1) { + if (array.raid_disks) + printf(" level=%s num-devices=%d", + c?c:"-unknown-", + array.raid_disks ); + else + printf(" level=container num-devices=%d", + array.nr_disks); + } + if (container) { + printf(" container=%s", container); + printf(" member=%s", member); + } else { + if (sra && sra->array.major_version < 0) + printf(" metadata=%s", sra->text_version); + else + printf(" metadata=%d.%d", + array.major_version, array.minor_version); + } + + /* Only try GET_BITMAP_FILE for 0.90.01 and later */ + if (vers >= 9001 && + ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 && + bmf.pathname[0]) { + printf(" bitmap=%s", bmf.pathname); + } } else { mdu_bitmap_file_t bmf; unsigned long long larray_size; struct mdstat_ent *ms = mdstat_read(0, 0); struct mdstat_ent *e; int devnum = array.md_minor; - if (major(stb.st_rdev) != MD_MAJOR) + if (major(stb.st_rdev) == (unsigned)get_mdp_major()) devnum = -1 - devnum; for (e=ms; e; e=e->next) @@ -172,14 +301,19 @@ printf("%s:\n", dev); + if (container) + printf(" Container : %s, member %s\n", container, member); + else { if (sra && sra->array.major_version < 0) printf(" Version : %s\n", sra->text_version); else - printf(" Version : %02d.%02d\n", + printf(" Version : %d.%d\n", array.major_version, array.minor_version); + } atime = array.ctime; - printf(" Creation Time : %.24s\n", ctime(&atime)); + if (atime) + printf(" Creation Time : %.24s\n", ctime(&atime)); if (array.raid_disks == 0) c = "container"; printf(" Raid Level : %s\n", c?c:"-unknown-"); if (larray_size) @@ -190,17 +324,21 @@ unsigned long long dsize = get_component_size(fd); if (dsize > 0) printf(" Used Dev Size : %llu%s\n", - dsize, - human_size((long long)dsize<<10)); + dsize/2, + human_size((long long)dsize<<9)); else printf(" Used Dev Size : unknown\n"); } else printf(" Used Dev Size : %d%s\n", array.size, human_size((long 
long)array.size<<10)); } - printf(" Raid Devices : %d\n", array.raid_disks); + if (array.raid_disks) + printf(" Raid Devices : %d\n", array.raid_disks); printf(" Total Devices : %d\n", array.nr_disks); - printf("Preferred Minor : %d\n", array.md_minor); + if (!container && + ((sra == NULL && array.major_version == 0) || + (sra && sra->array.major_version == 0))) + printf("Preferred Minor : %d\n", array.md_minor); if (sra == NULL || sra->array.major_version >= 0) printf(" Persistence : Superblock is %spersistent\n", array.not_persistent?"not ":""); @@ -214,26 +352,45 @@ } else if (array.state & (1<percent < 0) ? "" : - (e->resync) ? ", resyncing": ", recovering", - larray_size ? "": ", Not Started"); - printf(" Active Devices : %d\n", array.active_disks); + if (atime) + printf(" Update Time : %.24s\n", ctime(&atime)); + if (array.raid_disks) { + char *st; + if (avail_disks == array.raid_disks) + st = ""; + else if (!enough(array.level, array.raid_disks, + array.layout, 1, avail, avail_disks)) + st = ", FAILED"; + else + st = ", degraded"; + + printf(" State : %s%s%s%s\n", + (array.state&(1<percent < 0) ? "" : + (e->resync) ? ", resyncing": ", recovering", + larray_size ? 
"": ", Not Started"); + } + if (array.raid_disks) + printf(" Active Devices : %d\n", array.active_disks); printf("Working Devices : %d\n", array.working_disks); - printf(" Failed Devices : %d\n", array.failed_disks); - printf(" Spare Devices : %d\n", array.spare_disks); + if (array.raid_disks) { + printf(" Failed Devices : %d\n", array.failed_disks); + printf(" Spare Devices : %d\n", array.spare_disks); + } printf("\n"); if (array.level == 5) { c = map_num(r5layout, array.layout); printf(" Layout : %s\n", c?c:"-unknown-"); } + if (array.level == 6) { + c = map_num(r6layout, array.layout); + printf(" Layout : %s\n", c?c:"-unknown-"); + } if (array.level == 10) { - printf(" Layout : near=%d, %s=%d\n", - array.layout&255, (array.layout&0x10000)?"offset":"far", - (array.layout>>8)&255); + printf(" Layout :"); + print_r10_layout(array.layout); + printf("\n"); } switch (array.level) { case 0: @@ -264,7 +421,7 @@ #if 0 This is pretty boring printf(" Reshape pos'n : %llu%s\n", (unsigned long long) info.reshape_progress<<9, - human_size(info.reshape_progress<<9)); + human_size((unsigned long long)info.reshape_progress<<9)); #endif if (info.delta_disks > 0) printf(" Delta Devices : %d, (%d->%d)\n", @@ -283,6 +440,11 @@ printf(" New Layout : %s\n", c?c:"-unknown-"); } + if (info.new_level == 6) { + char *c = map_num(r6layout, info.new_layout); + printf(" New Layout : %s\n", + c?c:"-unknown-"); + } if (info.new_level == 10) { printf(" New Layout : near=%d, %s=%d\n", info.new_layout&255, @@ -298,34 +460,47 @@ if (st && st->sb) st->ss->detail_super(st, homehost); - printf(" Number Major Minor RaidDevice State\n"); - } - disks = malloc(max_disks * sizeof(mdu_disk_info_t)); - for (d=0; darray.major_version == -1 + && sra->array.minor_version == -2 && sra->text_version[0] != '/') { + /* This looks like a container. Find any active arrays + * That claim to be a member. 
+ */ + DIR *dir = opendir("/sys/block"); + struct dirent *de; + + printf(" Member Arrays :"); + + while (dir && (de = readdir(dir)) != NULL) { + char path[200]; + char vbuf[1024]; + int nlen = strlen(sra->sys_name); + int dn; + if (de->d_name[0] == '.') + continue; + sprintf(path, "/sys/block/%s/md/metadata_version", + de->d_name); + if (load_sys(path, vbuf) < 0) + continue; + if (strncmp(vbuf, "external:", 9) != 0 || + !is_subarray(sra->sys_name+9) || + strncmp(vbuf+10, sra->sys_name, nlen) != 0 || + vbuf[10+nlen] != '/') + continue; + dn = devname2devnum(de->d_name); + printf(" %s", map_dev(dev2major(dn), + dev2minor(dn), 1)); + } + if (dir) + closedir(dir); + printf("\n\n"); } - if (disk.major == 0 && disk.minor == 0) - continue; - if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks) - disks[disk.raid_disk] = disk; - else if (next < max_disks) - disks[next++] = disk; + + if (array.raid_disks) + printf(" Number Major Minor RaidDevice State\n"); + else + printf(" Number Major Minor RaidDevice\n"); } - avail = calloc(array.raid_disks, 1); for (d= 0; d < max_disks; d++) { char *dv; mdu_disk_info_t disk = disks[d]; @@ -342,6 +517,9 @@ else printf(" %5d %5d %5d %5d ", disk.number, disk.major, disk.minor, disk.raid_disk); + } + if (!brief && array.raid_disks) { + if (disk.state & (1<sb) st->ss->brief_detail_super(st); st->ss->free_super(st); @@ -405,7 +578,49 @@ 1, avail, avail_disks)) rv = 2; + free(disks); out: close(fd); return rv; } + +int Detail_Platform(struct superswitch *ss, int scan, int verbose) +{ + /* display platform capabilities for the given metadata format + * 'scan' in this context means iterate over all metadata types + */ + int i; + int err = 1; + + if (ss && ss->detail_platform) + err = ss->detail_platform(verbose, 0); + else if (ss) { + if (verbose) + fprintf(stderr, Name ": %s metadata is platform independent\n", + ss->name ? 
: "[no name]"); + } else if (!scan) { + if (verbose) + fprintf(stderr, Name ": specify a metadata type or --scan\n"); + } + + if (!scan) + return err; + + for (i = 0; superlist[i]; i++) { + struct superswitch *meta = superlist[i]; + + if (meta == ss) + continue; + if (verbose) + fprintf(stderr, Name ": checking metadata %s\n", + meta->name ? : "[no name]"); + if (!meta->detail_platform) { + if (verbose) + fprintf(stderr, Name ": %s metadata is platform independent\n", + meta->name ? : "[no name]"); + } else + err |= meta->detail_platform(verbose, 0); + } + + return err; +} diff -Nru mdadm-2.6.7.1/Examine.c mdadm-3.1.4/Examine.c --- mdadm-2.6.7.1/Examine.c 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/Examine.c 2010-08-31 10:18:39.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,12 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" @@ -68,7 +63,7 @@ } *arrays = NULL; for (; devlist ; devlist=devlist->next) { - struct supertype *st = forcest; + struct supertype *st; fd = dev_open(devlist->devname, O_RDONLY); if (fd < 0) { @@ -80,7 +75,9 @@ err = 1; } else { - if (!st) + if (forcest) + st = dup_super(forcest); + else st = guess_super(fd); if (st) err = st->ss->load_super(st, fd, @@ -119,16 +116,16 @@ ap->st = st; arrays = ap; st->ss->getinfo_super(st, &ap->info); - } else { + } else st->ss->getinfo_super(st, &ap->info); - st->ss->free_super(st); - } - if (!(ap->info.disk.state & MD_DISK_SYNC)) + if (!st->loaded_container && + !(ap->info.disk.state & (1<spares++; d = dl_strdup(devlist->devname); dl_add(ap->devs, d); } else if 
(export) { - st->ss->export_examine_super(st); + if (st->ss->export_examine_super) + st->ss->export_examine_super(st); } else { printf("%s:\n",devlist->devname); st->ss->examine_super(st, homehost); @@ -140,15 +137,23 @@ for (ap=arrays; ap; ap=ap->next) { char sep='='; char *d; - ap->st->ss->brief_examine_super(ap->st); - if (ap->spares) printf(" spares=%d", ap->spares); + int newline = 0; + + ap->st->ss->brief_examine_super(ap->st, brief > 1); + if (ap->spares) + newline += printf(" spares=%d", ap->spares); if (brief > 1) { - printf(" devices"); + newline += printf(" devices"); for (d=dl_next(ap->devs); d!= ap->devs; d=dl_next(d)) { printf("%c%s", sep, d); sep=','; } } + if (ap->st->ss->brief_examine_subarrays) { + if (newline) + printf("\n"); + ap->st->ss->brief_examine_subarrays(ap->st, brief > 1); + } ap->st->ss->free_super(ap->st); /* FIXME free ap */ if (ap->spares || brief > 1) diff -Nru mdadm-2.6.7.1/.gitignore mdadm-3.1.4/.gitignore --- mdadm-2.6.7.1/.gitignore 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/.gitignore 2010-03-22 08:08:42.000000000 +0200 @@ -3,3 +3,7 @@ /*-stamp /mdadm /mdadm.udeb +/mdmon +/swap_super +/test_stripe +/TAGS diff -Nru mdadm-2.6.7.1/Grow.c mdadm-3.1.4/Grow.c --- mdadm-2.6.7.1/Grow.c 2008-10-15 06:34:28.000000000 +0300 +++ mdadm-3.1.4/Grow.c 2010-08-26 05:24:15.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,15 +19,11 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" #include "dlink.h" +#include #if ! defined(__BIG_ENDIAN) && ! 
defined(__LITTLE_ENDIAN) #error no endian defined @@ -35,6 +31,10 @@ #include "md_u.h" #include "md_p.h" +#ifndef offsetof +#define offsetof(t,f) ((size_t)&(((t*)0)->f)) +#endif + int Grow_Add_device(char *devname, int fd, char *newdev) { /* Add a device to an active array. @@ -69,7 +69,7 @@ return 1; } - nfd = open(newdev, O_RDWR|O_EXCL); + nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT); if (nfd < 0) { fprintf(stderr, Name ": cannot open %s\n", newdev); return 1; @@ -288,6 +288,11 @@ return 1; } else if (strcmp(file, "internal") == 0) { int d; + if (st->ss->add_internal_bitmap == NULL) { + fprintf(stderr, Name ": Internal bitmaps not supported " + "with %s metadata\n", st->ss->name); + return 1; + } for (d=0; d< st->max_devs; d++) { mdu_disk_info_t disk; char *dv; @@ -381,24 +386,30 @@ /* * When reshaping an array we might need to backup some data. * This is written to all spares with a 'super_block' describing it. - * The superblock goes 1K form the end of the used space on the + * The superblock goes 4K from the end of the used space on the * device. * It if written after the backup is complete. * It has the following structure. */ -struct mdp_backup_super { - char magic[16]; /* md_backup_data-1 */ +static struct mdp_backup_super { + char magic[16]; /* md_backup_data-1 or -2 */ __u8 set_uuid[16]; __u64 mtime; /* start/sizes in 512byte sectors */ - __u64 devstart; + __u64 devstart; /* address on backup device/file of data */ __u64 arraystart; __u64 length; __u32 sb_csum; /* csum of preceeding bytes. */ -}; + __u32 pad1; + __u64 devstart2; /* offset in to data of second section */ + __u64 arraystart2; + __u64 length2; + __u32 sb_csum2; /* csum of preceeding bytes. 
*/ + __u8 pad[512-68-32]; +} __attribute__((aligned(512))) bsb, bsb2; -int bsb_csum(char *buf, int len) +__u32 bsb_csum(char *buf, int len) { int i; int csum = 0; @@ -407,33 +418,107 @@ return __cpu_to_le32(csum); } +static int child_grow(int afd, struct mdinfo *sra, unsigned long blocks, + int *fds, unsigned long long *offsets, + int disks, int chunk, int level, int layout, int data, + int dests, int *destfd, unsigned long long *destoffsets); +static int child_shrink(int afd, struct mdinfo *sra, unsigned long blocks, + int *fds, unsigned long long *offsets, + int disks, int chunk, int level, int layout, int data, + int dests, int *destfd, unsigned long long *destoffsets); +static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks, + int *fds, unsigned long long *offsets, + unsigned long long start, + int disks, int chunk, int level, int layout, int data, + int dests, int *destfd, unsigned long long *destoffsets); + +int freeze_array(struct mdinfo *sra) +{ + /* Try to freeze resync on this array. + * Return -1 if the array is busy, + * return 0 if this kernel doesn't support 'frozen' + * return 1 if it worked. 
+ */ + char buf[20]; + if (sysfs_get_str(sra, NULL, "sync_action", buf, 20) <= 0) + return 0; + if (strcmp(buf, "idle\n") != 0 && + strcmp(buf, "frozen\n") != 0) + return -1; + if (sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0) + return 0; + return 1; +} + +void unfreeze_array(struct mdinfo *sra, int frozen) +{ + /* If 'frozen' is 1, unfreeze the array */ + if (frozen > 0) + sysfs_set_str(sra, NULL, "sync_action", "idle"); +} + +void wait_reshape(struct mdinfo *sra) +{ + int fd = sysfs_get_fd(sra, NULL, "sync_action"); + char action[20]; + + do { + fd_set rfds; + FD_ZERO(&rfds); + FD_SET(fd, &rfds); + select(fd+1, NULL, NULL, &rfds, NULL); + + if (sysfs_fd_get_str(fd, action, 20) < 0) { + close(fd); + return; + } + } while (strncmp(action, "reshape", 7) == 0); +} + + int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, long long size, - int level, int layout, int chunksize, int raid_disks) + int level, char *layout_str, int chunksize, int raid_disks) { /* Make some changes in the shape of an array. * The kernel must support the change. - * Different reshapes have subtly different meaning for different - * levels, so we need to check the current state of the array - * and go from there. + * + * There are three different changes. Each can trigger + * a resync or recovery so we freeze that until we have + * requested everything (if kernel supports freezing - 2.6.30). + * The steps are: + * - change size (i.e. component_size) + * - change level + * - change layout/chunksize/ndisks + * + * The last can require a reshape. It is different on different + * levels so we need to check the level before actioning it. + * Some times the level change needs to be requested after the + * reshape (e.g. 
raid6->raid5, raid5->raid0) + * */ - struct mdu_array_info_s array; + struct mdu_array_info_s array, orig; char *c; - - struct mdp_backup_super bsb; + int rv = 0; struct supertype *st; - int nlevel, olevel; int nchunk, ochunk; int nlayout, olayout; int ndisks, odisks; - int ndata, odata; - unsigned long long nstripe, ostripe, last_block; + unsigned int ndata, odata; + int orig_level = UnSet; + char alt_layout[40]; int *fdlist; unsigned long long *offsets; - int d, i, spares; + int d, i; int nrdisks; int err; + int frozen; + unsigned long a,b, blocks, stripes; + unsigned long cache; + unsigned long long array_size; + int changed = 0; + int done; struct mdinfo *sra; struct mdinfo *sd; @@ -443,127 +528,332 @@ devname); return 1; } + + if (size >= 0 && + (chunksize || level!= UnSet || layout_str || raid_disks)) { + fprintf(stderr, Name ": cannot change component size at the same time " + "as other changes.\n" + " Change size first, then check data is intact before " + "making other changes.\n"); + return 1; + } + + if (raid_disks && raid_disks < array.raid_disks && array.level > 1 && + get_linux_version() < 2006032 && + !check_env("MDADM_FORCE_FEWER")) { + fprintf(stderr, Name ": reducing the number of devices is not safe before Linux 2.6.32\n" + " Please use a newer kernel\n"); + return 1; + } + sra = sysfs_read(fd, 0, GET_LEVEL); + if (sra) + frozen = freeze_array(sra); + else { + fprintf(stderr, Name ": failed to read sysfs parameters for %s\n", + devname); + return 1; + } + if (frozen < 0) { + fprintf(stderr, Name ": %s is performing resync/recovery and cannot" + " be reshaped\n", devname); + return 1; + } + + /* ========= set size =============== */ + if (size >= 0 && (size == 0 || size != array.size)) { + array.size = size; + if (array.size != size) { + /* got truncated to 32bit, write to + * component_size instead + */ + if (sra) + rv = sysfs_set_num(sra, NULL, + "component_size", size); + else + rv = -1; + } else + rv = ioctl(fd, SET_ARRAY_INFO, &array); + if 
(rv != 0) { + int err = errno; + fprintf(stderr, Name ": Cannot set device size for %s: %s\n", + devname, strerror(err)); + if (err == EBUSY && + (array.state & (1<4,5,6 1->5 4->5,6 5->1,6 + * Level changes that need a layout change first are: + * 6->5,4,0 : need a -6 layout, or parity-last + * 5->4,0 : need parity-last + */ + if ((array.level == 6 || array.level == 5) && + (level == 5 || level == 4 || level == 0)) { + /* Don't change level yet, but choose intermediate + * layout + */ + if (level == 5) { + if (layout_str == NULL) + switch (array.layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_ROTATING_N_RESTART: + layout_str = "left-asymmetric-6"; + break; + case ALGORITHM_LEFT_SYMMETRIC: + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_ROTATING_N_CONTINUE: + layout_str = "left-symmetric-6"; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + case ALGORITHM_ROTATING_ZERO_RESTART: + layout_str = "right-asymmetric-6"; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + case ALGORITHM_RIGHT_SYMMETRIC_6: + layout_str = "right-symmetric-6"; + break; + case ALGORITHM_PARITY_0: + case ALGORITHM_PARITY_0_6: + layout_str = "parity-first-6"; + break; + case ALGORITHM_PARITY_N: + layout_str = "parity-last"; + break; + default: + fprintf(stderr, Name ": %s: cannot" + "convert layout to RAID5 equivalent\n", + devname); + rv = 1; + goto release; + } + else { + int l = map_name(r5layout, layout_str); + if (l == UnSet) { + fprintf(stderr, Name ": %s: layout '%s' not recognised\n", + devname, layout_str); + rv = 1; + goto release; + } + if (l != ALGORITHM_PARITY_N) { + /* need the -6 version */ + char *ls = map_num(r5layout, l); + strcat(strcpy(alt_layout, ls), + "-6"); + layout_str = alt_layout; + } + } + if (raid_disks) + /* The final raid6->raid5 conversion + * will reduce the number of disks, + * so now we need to aim higher + */ + raid_disks++; + } else + layout_str = "parity-last"; + } else { + c = 
map_num(pers, level); + if (c == NULL) { + rv = 1;/* not possible */ + goto release; + } + err = sysfs_set_str(sra, NULL, "level", c); + if (err) { + err = errno; + fprintf(stderr, Name ": %s: could not set level to %s\n", + devname, c); + if (err == EBUSY && + (array.state & (1<= 16) { + fprintf(stderr, Name + ": %s has a non-standard layout. If you wish to preserve this\n" + " during the reshape, please specify --layout=preserve\n" + " If you want to change it, specify a layout or use --layout=normalise\n", + devname); + rv = 1; + goto release; + } + if (strcmp(layout_str, "normalise") == 0 || + strcmp(layout_str, "normalize") == 0) { + char *hyphen; + strcpy(alt_layout, map_num(r6layout, array.layout)); + hyphen = strrchr(alt_layout, '-'); + if (hyphen && strcmp(hyphen, "-6") == 0) { + *hyphen = 0; + layout_str = alt_layout; + } + } + + if (array.layout == map_name(r6layout, layout_str)) + layout_str = NULL; + if (layout_str && strcmp(layout_str, "preserve") == 0) + layout_str = NULL; + break; + } + if (layout_str == NULL + && (chunksize == 0 || chunksize*1024 == array.chunk_size) + && (raid_disks == 0 || raid_disks == array.raid_disks)) { + rv = 0; + if (level != UnSet && level != array.level) { + /* Looks like this level change doesn't need + * a reshape after all. 
+ */ + c = map_num(pers, level); + if (c) { + rv = sysfs_set_str(sra, NULL, "level", c); + if (rv) { + int err = errno; + fprintf(stderr, Name ": %s: could not set level to %s\n", + devname, c); + if (err == EBUSY && + (array.state & (1<= 0) { - fprintf(stderr, Name ": %s: Cannot change size of a 'faulty' array\n", - devname); - return 1; - } - if (level != UnSet && level != LEVEL_FAULTY) { - fprintf(stderr, Name ": %s: Cannot change RAID level of a 'faulty' array\n", - devname); - return 1; - } if (chunksize || raid_disks) { fprintf(stderr, Name ": %s: Cannot change chunksize or disks of a 'faulty' array\n", devname); - return 1; + rv = 1; + break; } - if (layout == UnSet) - return 0; /* nothing to do.... */ + if (layout_str == NULL) + break; /* nothing to do.... */ - array.layout = layout; + array.layout = parse_layout_faulty(layout_str); + if (array.layout < 0) { + fprintf(stderr, Name ": %s: layout %s not understood for 'faulty' array\n", + devname, layout_str); + rv = 1; + break; + } if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { fprintf(stderr, Name ": Cannot set layout for %s: %s\n", devname, strerror(errno)); - return 1; - } - if (!quiet) + rv = 1; + } else if (!quiet) printf("layout for %s set to %d\n", devname, array.layout); - return 0; + break; - case 1: /* raid_disks and size can each be changed. They are independant */ + case 1: /* only raid_disks can each be changed. */ - if (level != UnSet && level != 1) { - fprintf(stderr, Name ": %s: Cannot change RAID level of a RAID1 array.\n", - devname); - return 1; - } - if (chunksize || layout != UnSet) { - fprintf(stderr, Name ": %s: Cannot change chunk size of layout for a RAID1 array.\n", + if (chunksize || layout_str != NULL) { + fprintf(stderr, Name ": %s: Cannot change chunk size or layout for a RAID1 array.\n", devname); - return 1; + rv = 1; + break; } - - /* Each can trigger a resync/recovery which will block the - * other from happening. 
Later we could block - * resync for the duration via 'sync_action'... - */ if (raid_disks > 0) { array.raid_disks = raid_disks; if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { fprintf(stderr, Name ": Cannot set raid-devices for %s: %s\n", devname, strerror(errno)); - return 1; + rv = 1; } } - if (size >= 0) { - array.size = size; - if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { - fprintf(stderr, Name ": Cannot set device size for %s: %s\n", - devname, strerror(errno)); - return 1; - } - } - return 0; + break; case 4: case 5: case 6: - st = super_by_fd(fd); - /* size can be changed independently. - * layout/chunksize/raid_disks/level can be changed + /* + * layout/chunksize/raid_disks can be changed * though the kernel may not support it all. - * If 'suspend_lo' is not present in devfs, then - * these cannot be changed. */ - if (size >= 0) { - /* Cannot change other details as well.. */ - if (layout != UnSet || - chunksize != 0 || - raid_disks != 0 || - level != UnSet) { - fprintf(stderr, Name ": %s: Cannot change shape as well as size of a %s array.\n", - devname, c); - return 1; - } - array.size = size; - if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { - fprintf(stderr, Name ": Cannot set device size/shape for %s: %s\n", - devname, strerror(errno)); - return 1; - } - return 0; - } - /* Ok, just change the shape. This can be awkward. - * There are three possibilities. - * 1/ The array will shrink. We don't support this - * possibility. Maybe one day... - * 2/ The array will not change size. This is easy enough - * to do, but not reliably. If the process is aborted - * the array *will* be corrupted. So maybe we can allow - * this but only if the user is really certain. e.g. - * --really-risk-everything - * 3/ The array will grow. This can be reliably achieved. + st = super_by_fd(fd); + + /* + * There are three possibilities. + * 1/ The array will shrink. + * We need to ensure the reshape will pause before reaching + * the 'critical section'. 
We also need to fork and wait for + * that to happen. When it does we + * suspend/backup/complete/unfreeze + * + * 2/ The array will not change size. + * This requires that we keep a backup of a sliding window + * so that we can restore data after a crash. So we need + * to fork and monitor progress. + * + * 3/ The array will grow. This is relatively easy. * However the kernel's restripe routines will cheerfully * overwrite some early data before it is safe. So we * need to make a backup of the early parts of the array * and be ready to restore it if rebuild aborts very early. * - * We backup data by writing it to all spares (there must be - * at least 1, so even raid6->raid5 requires a spare to be - * present). + * We backup data by writing it to one spare, or to a + * file which was given on command line. * + * [FOLLOWING IS OLD AND PARTLY WRONG] * So: we enumerate the devices in the array and * make sure we can open all of them. * Then we freeze the early part of the array and @@ -573,78 +863,131 @@ * and finally invalidate the copied data and unfreeze the * start of the array. * - * Before we can do this we need to decide: - * - will the array grow? Just calculate size - * - how much needs to be saved: count stripes. - * - where to save data... good question. - * + * In each case, we first make sure that storage is available + * for the required backup. + * Then we: + * - request the shape change. + * - for to handle backup etc. 
*/ - nlevel = olevel = array.level; nchunk = ochunk = array.chunk_size; nlayout = olayout = array.layout; ndisks = odisks = array.raid_disks; - if (level != UnSet) nlevel = level; - if (chunksize) nchunk = chunksize; - if (layout != UnSet) nlayout = layout; + if (chunksize) { + nchunk = chunksize * 1024; + if (size % chunksize) { + fprintf(stderr, Name ": component size %lluK is not" + " a multiple of chunksize %dK\n", + size, chunksize); + break; + } + } + if (layout_str != NULL) + switch(array.level) { + case 4: /* ignore layout */ + break; + case 5: + nlayout = map_name(r5layout, layout_str); + if (nlayout == UnSet) { + fprintf(stderr, Name ": layout %s not understood for raid5.\n", + layout_str); + rv = 1; + goto release; + } + break; + + case 6: + nlayout = map_name(r6layout, layout_str); + if (nlayout == UnSet) { + fprintf(stderr, Name ": layout %s not understood for raid6.\n", + layout_str); + rv = 1; + goto release; + } + break; + } if (raid_disks) ndisks = raid_disks; odata = odisks-1; - if (olevel == 6) odata--; /* number of data disks */ ndata = ndisks-1; - if (nlevel == 6) ndata--; - - if (ndata < odata) { - fprintf(stderr, Name ": %s: Cannot reduce number of data disks (yet).\n", - devname); - return 1; - } - if (ndata == odata) { - fprintf(stderr, Name ": %s: Cannot reshape array without increasing size (yet).\n", - devname); - return 1; - } - /* Well, it is growing... so how much do we need to backup. 
- * Need to backup a full number of new-stripes, such that the - * last one does not over-write any place that it would be read - * from + if (array.level == 6) { + odata--; /* number of data disks */ + ndata--; + } + + if (odata == ndata && + get_linux_version() < 2006032) { + fprintf(stderr, Name ": in-place reshape is not safe before 2.6.32, sorry.\n"); + break; + } + + /* Check that we can hold all the data */ + get_dev_size(fd, NULL, &array_size); + if (ndata * (unsigned long long)size < (array_size/1024)) { + fprintf(stderr, Name ": this change will reduce the size of the array.\n" + " use --grow --array-size first to truncate array.\n" + " e.g. mdadm --grow %s --array-size %llu\n", + devname, ndata * size); + rv = 1; + break; + } + + /* So how much do we need to backup. + * We need an amount of data which is both a whole number of + * old stripes and a whole number of new stripes. + * So LCM for (chunksize*datadisks). */ - nstripe = ostripe = 0; - while (nstripe >= ostripe) { - nstripe += nchunk/512; - last_block = nstripe * ndata; - ostripe = last_block / odata / (ochunk/512) * (ochunk/512); + a = (ochunk/512) * odata; + b = (nchunk/512) * ndata; + /* Find GCD */ + while (a != b) { + if (a < b) + b -= a; + if (b < a) + a -= b; } - printf("mdadm: Need to backup %lluK of critical section..\n", last_block/2); + /* LCM == product / GCD */ + blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a; + sysfs_free(sra); sra = sysfs_read(fd, 0, GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE| GET_CACHE); + if (!sra) { fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n", devname); - return 1; + rv = 1; + break; } - if (last_block >= sra->component_size/2) { + if (ndata == odata) { + /* Make 'blocks' bigger for better throughput, but + * not so big that we reject it below. 
+ * Try for 16 megabytes + */ + while (blocks * 32 < sra->component_size && + blocks < 16*1024*2) + blocks *= 2; + } else + fprintf(stderr, Name ": Need to backup %luK of critical " + "section..\n", blocks/2); + + if (blocks >= sra->component_size/2) { fprintf(stderr, Name ": %s: Something wrong - reshape aborted\n", devname); - return 1; - } - if (sra->array.spare_disks == 0 && backup_file == NULL) { - fprintf(stderr, Name ": %s: Cannot grow - need a spare or backup-file to backup critical section\n", - devname); - return 1; + rv = 1; + break; } - - nrdisks = array.nr_disks + sra->array.spare_disks; + nrdisks = array.raid_disks + sra->array.spare_disks; /* Now we need to open all these devices so we can read/write. */ fdlist = malloc((1+nrdisks) * sizeof(int)); offsets = malloc((1+nrdisks) * sizeof(offsets[0])); if (!fdlist || !offsets) { fprintf(stderr, Name ": malloc failed: grow aborted\n"); - return 1; + rv = 1; + break; } for (d=0; d <= nrdisks; d++) fdlist[d] = -1; @@ -657,204 +1000,675 @@ sd->disk.minor, 1); fdlist[sd->disk.raid_disk] = dev_open(dn, O_RDONLY); - offsets[sd->disk.raid_disk] = sd->data_offset; + offsets[sd->disk.raid_disk] = sd->data_offset*512; if (fdlist[sd->disk.raid_disk] < 0) { fprintf(stderr, Name ": %s: cannot open component %s\n", devname, dn?dn:"-unknown-"); - goto abort; + rv = 1; + goto release; } - } else { + } else if (backup_file == NULL) { /* spare */ char *dn = map_dev(sd->disk.major, sd->disk.minor, 1); fdlist[d] = dev_open(dn, O_RDWR); - offsets[d] = sd->data_offset; + offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512; if (fdlist[d]<0) { fprintf(stderr, Name ": %s: cannot open component %s\n", devname, dn?dn:"-unknown"); - goto abort; + rv = 1; + goto release; } d++; } } - for (i=0 ; iarray.spare_disks; - if (backup_file) { - fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL, 0600); + if (backup_file == NULL) { + if (ndata <= odata) { + fprintf(stderr, Name ": %s: Cannot grow - need backup-file\n", + 
devname); + rv = 1; + break; + } else if (sra->array.spare_disks == 0) { + fprintf(stderr, Name ": %s: Cannot grow - need a spare or " + "backup-file to backup critical section\n", + devname); + rv = 1; + break; + } + if (d == array.raid_disks) { + fprintf(stderr, Name ": %s: No spare device for backup\n", + devname); + rv = 1; + break; + } + } else { + /* need to check backup file is large enough */ + char buf[512]; + fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL, + S_IRUSR | S_IWUSR); + offsets[d] = 8 * 512; if (fdlist[d] < 0) { fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n", devname, backup_file, strerror(errno)); - goto abort; + rv = 1; + break; + } + memset(buf, 0, 512); + for (i=0; i < (signed)blocks + 1 ; i++) { + if (write(fdlist[d], buf, 512) != 512) { + fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + rv = 1; + break; + } + } + if (fsync(fdlist[d]) != 0) { + fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + rv = 1; + break; } - offsets[d] = 8; d++; - spares++; } - if (fdlist[array.raid_disks] < 0) { - fprintf(stderr, Name ": %s: failed to find a spare and no backup-file given - --grow aborted\n", - devname); - goto abort; + + /* lastly, check that the internal stripe cache is + * large enough, or it won't work. + */ + + cache = (nchunk < ochunk) ? ochunk : nchunk; + cache = cache * 4 / 4096; + if (cache < blocks / 8 / odisks + 16) + /* Make it big enough to hold 'blocks' */ + cache = blocks / 8 / odisks + 16; + if (sra->cache_size < cache) + sysfs_set_num(sra, NULL, "stripe_cache_size", + cache+1); + /* Right, everything seems fine. Let's kick things off. + * If only changing raid_disks, use ioctl, else use + * sysfs. 
+ */ + if (ochunk == nchunk && olayout == nlayout) { + array.raid_disks = ndisks; + if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + int err = errno; + rv = 1; + fprintf(stderr, Name ": Cannot set device shape for %s: %s\n", + devname, strerror(errno)); + if (ndisks < odisks && + get_linux_version() < 2006030) + fprintf(stderr, Name ": linux 2.6.30 or later required\n"); + if (err == EBUSY && + (array.state & (1<ss->load_super(st, fdlist[0], NULL)) { + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int devfd; + int ok; + if (sd->disk.state & (1<disk.major, sd->disk.minor, 1); + devfd = dev_open(dn, O_RDONLY); + if (devfd < 0) + continue; + ok = st->ss->load_super(st, devfd, NULL); + close(devfd); + if (ok >= 0) + break; + } + if (!sd) { fprintf(stderr, Name ": %s: Cannot find a superblock\n", devname); - goto abort; + rv = 1; + break; } - + memset(&bsb, 0, 512); memcpy(bsb.magic, "md_backup_data-1", 16); st->ss->uuid_from_super(st, (int*)&bsb.set_uuid); bsb.mtime = __cpu_to_le64(time(0)); - bsb.arraystart = 0; - bsb.length = __cpu_to_le64(last_block); - - /* Decide offset for the backup, llseek the spares, and write - * a leading superblock 4K earlier. + bsb.devstart2 = blocks; + stripes = blocks / (ochunk/512) / odata; + /* Now we just need to kick off the reshape and watch, while + * handling backups of the data... + * This is all done by a forked background process. 
*/ - for (i=array.raid_disks; icomponent_size - last_block - 8; - if (lseek64(fdlist[i], (offsets[i]<<9) - 4096, 0) - != (offsets[i]<<9) - 4096) { - fprintf(stderr, Name ": could not seek...\n"); - goto abort; - } - memset(buf, 0, sizeof(buf)); - bsb.devstart = __cpu_to_le64(offsets[i]); - bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)); - memcpy(buf, &bsb, sizeof(bsb)); - if (write(fdlist[i], buf, 4096) != 4096) { - fprintf(stderr, Name ": could not write leading superblock\n"); - goto abort; - } - } - array.level = nlevel; - array.raid_disks = ndisks; - array.chunk_size = nchunk; - array.layout = nlayout; - if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { - if (errno == ENOSPC) { - /* stripe cache is not big enough. - * It needs to be 4 times chunksize_size, - * and we assume pagesize is 4K + switch(fork()) { + case 0: + close(fd); + if (check_env("MDADM_GROW_VERIFY")) + fd = open(devname, O_RDONLY | O_DIRECT); + else + fd = -1; + mlockall(MCL_FUTURE); + + if (odata < ndata) + done = child_grow(fd, sra, stripes, + fdlist, offsets, + odisks, ochunk, array.level, olayout, odata, + d - odisks, fdlist+odisks, offsets+odisks); + else if (odata > ndata) + done = child_shrink(fd, sra, stripes, + fdlist, offsets, + odisks, ochunk, array.level, olayout, odata, + d - odisks, fdlist+odisks, offsets+odisks); + else + done = child_same_size(fd, sra, stripes, + fdlist, offsets, + 0, + odisks, ochunk, array.level, olayout, odata, + d - odisks, fdlist+odisks, offsets+odisks); + if (backup_file && done) + unlink(backup_file); + if (level != UnSet && level != array.level) { + /* We need to wait for the reshape to finish + * (which will have happened unless odata < ndata) + * and then set the level */ - if (sra->cache_size < 4 * (nchunk/4096)) { - sysfs_set_num(sra, NULL, - "stripe_cache_size", - 4 * (nchunk/4096) +1); - if (ioctl(fd, SET_ARRAY_INFO, - &array) == 0) - goto ok; - } - } - fprintf(stderr, Name ": Cannot set device size/shape for %s: %s\n", - 
devname, strerror(errno)); - goto abort; - } - ok: ; - /* suspend the relevant region */ - sysfs_set_num(sra, NULL, "suspend_hi", 0); /* just in case */ - if (sysfs_set_num(sra, NULL, "suspend_lo", 0) < 0 || - sysfs_set_num(sra, NULL, "suspend_hi", last_block) < 0) { - fprintf(stderr, Name ": %s: failed to suspend device.\n", - devname); - goto abort_resume; + c = map_num(pers, level); + if (c == NULL) + exit(0);/* not possible */ + + if (odata < ndata) + wait_reshape(sra); + err = sysfs_set_str(sra, NULL, "level", c); + if (err) + fprintf(stderr, Name ": %s: could not set level to %s\n", + devname, c); + } + exit(0); + case -1: + fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n", + strerror(errno)); + rv = 1; + break; + default: + /* The child will take care of unfreezing the array */ + frozen = 0; + break; } + break; + } - err = save_stripes(fdlist, offsets, - odisks, ochunk, olevel, olayout, - spares, fdlist+odisks, - 0ULL, last_block*512); + release: + if (rv && orig_level != UnSet && sra) { + c = map_num(pers, orig_level); + if (c && sysfs_set_str(sra, NULL, "level", c) == 0) + fprintf(stderr, Name ": aborting level change\n"); + } + if (sra) + unfreeze_array(sra, frozen); + return rv; +} - /* abort if there was an error */ - if (err < 0) { - fprintf(stderr, Name ": %s: failed to save critical region\n", - devname); - goto abort_resume; - } +/* + * We run a child process in the background which performs the following + * steps: + * - wait for resync to reach a certain point + * - suspend io to the following section + * - backup that section + * - allow resync to proceed further + * - resume io + * - discard the backup. + * + * When are combined in slightly different ways in the three cases. 
+ * Grow: + * - suspend/backup/allow/wait/resume/discard + * Shrink: + * - allow/wait/suspend/backup/allow/wait/resume/discard + * same-size: + * - wait/resume/discard/suspend/backup/allow + * + * suspend/backup/allow always come together + * wait/resume/discard do too. + * For the same-size case we have two backups to improve flow. + * + */ - for (i=odisks; i= 4) + odata--; + if (level == 6) + odata--; + sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * (chunk/512)) * odata); + /* Check that array hasn't become degraded, else we might backup the wrong data */ + sysfs_get_ll(sra, NULL, "degraded", &ll); + new_degraded = (int)ll; + if (new_degraded != *degraded) { + /* check each device to ensure it is still working */ + struct mdinfo *sd; + for (sd = sra->devs ; sd ; sd = sd->next) { + if (sd->disk.state & (1<disk.state & (1<disk.state = (1<disk.raid_disk >= 0 && + sources[sd->disk.raid_disk] >= 0) { + close(sources[sd->disk.raid_disk]); + sources[sd->disk.raid_disk] = -1; + } + } } } + *degraded = new_degraded; + } + if (part) { + bsb.arraystart2 = __cpu_to_le64(offset * odata); + bsb.length2 = __cpu_to_le64(stripes * (chunk/512) * odata); + } else { + bsb.arraystart = __cpu_to_le64(offset * odata); + bsb.length = __cpu_to_le64(stripes * (chunk/512) * odata); + } + if (part) + bsb.magic[15] = '2'; + for (i = 0; i < dests; i++) + if (part) + lseek64(destfd[i], destoffsets[i] + __le64_to_cpu(bsb.devstart2)*512, 0); + else + lseek64(destfd[i], destoffsets[i], 0); - /* start the reshape happening */ - if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) { - fprintf(stderr, Name ": %s: failed to initiate reshape\n", - devname); - goto abort_resume; - } - /* wait for reshape to pass the critical region */ - while(1) { - unsigned long long comp; - if (sysfs_get_ll(sra, NULL, "sync_completed", &comp)<0) { - sleep(5); + rv = save_stripes(sources, offsets, + disks, chunk, level, layout, + dests, destfd, + offset*512*odata, stripes * chunk * odata, + buf); + 
+ if (rv) + return rv; + bsb.mtime = __cpu_to_le64(time(0)); + for (i = 0; i < dests; i++) { + bsb.devstart = __cpu_to_le64(destoffsets[i]/512); + + bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)); + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) + bsb.sb_csum2 = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum2)-((char*)&bsb)); + + rv = -1; + if ((unsigned long long)lseek64(destfd[i], destoffsets[i] - 4096, 0) + != destoffsets[i] - 4096) + break; + if (write(destfd[i], &bsb, 512) != 512) + break; + if (destoffsets[i] > 4096) { + if ((unsigned long long)lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) != + destoffsets[i]+stripes*chunk*odata) break; - } - if (comp >= nstripe) + if (write(destfd[i], &bsb, 512) != 512) break; - sleep(1); } + fsync(destfd[i]); + rv = 0; + } - /* invalidate superblocks */ - memset(&bsb, 0, sizeof(bsb)); - for (i=odisks; i 0 && + strncmp(action, "reshape", 7) != 0) + break; + } while (completed < offset + blocks); + close(fd); + + if (part) { + bsb.arraystart2 = __cpu_to_le64(0); + bsb.length2 = __cpu_to_le64(0); + } else { + bsb.arraystart = __cpu_to_le64(0); + bsb.length = __cpu_to_le64(0); + } + bsb.mtime = __cpu_to_le64(time(0)); + rv = 0; + for (i = 0; i < dests; i++) { + bsb.devstart = __cpu_to_le64(destoffsets[i]/512); + bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)); + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) + bsb.sb_csum2 = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum2)-((char*)&bsb)); + if ((unsigned long long)lseek64(destfd[i], destoffsets[i]-4096, 0) != + destoffsets[i]-4096) + rv = -1; + if (rv == 0 && + write(destfd[i], &bsb, 512) != 512) + rv = -1; + fsync(destfd[i]); + } + return rv; +} - /* unsuspend. */ - sysfs_set_num(sra, NULL, "suspend_lo", last_block); +static void fail(char *msg) +{ + int rv; + rv = (write(2, msg, strlen(msg)) != (int)strlen(msg)); + rv |= (write(2, "\n", 1) != 1); + exit(rv ? 
1 : 2); +} - for (i=0; i= 0) - close(fdlist[i]); - free(fdlist); - free(offsets); - if (backup_file) - unlink(backup_file); +static char *abuf, *bbuf; +static unsigned long long abuflen; +static void validate(int afd, int bfd, unsigned long long offset) +{ + /* check that the data in the backup against the array. + * This is only used for regression testing and should not + * be used while the array is active + */ + if (afd < 0) + return; + lseek64(bfd, offset - 4096, 0); + if (read(bfd, &bsb2, 512) != 512) + fail("cannot read bsb"); + if (bsb2.sb_csum != bsb_csum((char*)&bsb2, + ((char*)&bsb2.sb_csum)-((char*)&bsb2))) + fail("first csum bad"); + if (memcmp(bsb2.magic, "md_backup_data", 14) != 0) + fail("magic is bad"); + if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 && + bsb2.sb_csum2 != bsb_csum((char*)&bsb2, + ((char*)&bsb2.sb_csum2)-((char*)&bsb2))) + fail("second csum bad"); + + if (__le64_to_cpu(bsb2.devstart)*512 != offset) + fail("devstart is wrong"); + + if (bsb2.length) { + unsigned long long len = __le64_to_cpu(bsb2.length)*512; + + if (abuflen < len) { + free(abuf); + free(bbuf); + abuflen = len; + if (posix_memalign((void**)&abuf, 4096, abuflen) || + posix_memalign((void**)&bbuf, 4096, abuflen)) { + abuflen = 0; + /* just stop validating on mem-alloc failure */ + return; + } + } - printf(Name ": ... 
critical section passed.\n"); - break; + lseek64(bfd, offset, 0); + if ((unsigned long long)read(bfd, bbuf, len) != len) { + //printf("len %llu\n", len); + fail("read first backup failed"); + } + lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0); + if ((unsigned long long)read(afd, abuf, len) != len) + fail("read first from array failed"); + if (memcmp(bbuf, abuf, len) != 0) { + #if 0 + int i; + printf("offset=%llu len=%llu\n", + (unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len); + for (i=0; i= 0) - close(fdlist[i]); - free(fdlist); - free(offsets); - if (backup_file) - unlink(backup_file); +static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes, + int *fds, unsigned long long *offsets, + int disks, int chunk, int level, int layout, int data, + int dests, int *destfd, unsigned long long *destoffsets) +{ + char *buf; + int degraded = 0; + + if (posix_memalign((void**)&buf, 4096, disks * chunk)) + /* Don't start the 'reshape' */ + return 0; + sysfs_set_num(sra, NULL, "suspend_hi", 0); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + grow_backup(sra, 0, stripes, + fds, offsets, disks, chunk, level, layout, + dests, destfd, destoffsets, + 0, °raded, buf); + validate(afd, destfd[0], destoffsets[0]); + wait_backup(sra, 0, stripes * (chunk / 512), stripes * (chunk / 512), + dests, destfd, destoffsets, + 0); + sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data); + free(buf); + /* FIXME this should probably be numeric */ + sysfs_set_str(sra, NULL, "sync_max", "max"); return 1; +} + +static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes, + int *fds, unsigned long long *offsets, + int disks, int chunk, int level, int layout, int data, + int dests, int *destfd, unsigned long long *destoffsets) +{ + char *buf; + unsigned long long start; + int rv; + int degraded = 0; + + if (posix_memalign((void**)&buf, 4096, disks * chunk)) + return 0; + start = sra->component_size - stripes * (chunk/512); + 
sysfs_set_num(sra, NULL, "sync_max", start); + sysfs_set_str(sra, NULL, "sync_action", "reshape"); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + rv = wait_backup(sra, 0, start - stripes * (chunk/512), stripes * (chunk/512), + dests, destfd, destoffsets, 0); + if (rv < 0) + return 0; + grow_backup(sra, 0, stripes, + fds, offsets, + disks, chunk, level, layout, + dests, destfd, destoffsets, + 0, °raded, buf); + validate(afd, destfd[0], destoffsets[0]); + wait_backup(sra, start, stripes*(chunk/512), 0, + dests, destfd, destoffsets, 0); + sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data); + free(buf); + /* FIXME this should probably be numeric */ + sysfs_set_str(sra, NULL, "sync_max", "max"); + return 1; +} + +static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes, + int *fds, unsigned long long *offsets, + unsigned long long start, + int disks, int chunk, int level, int layout, int data, + int dests, int *destfd, unsigned long long *destoffsets) +{ + unsigned long long size; + unsigned long tailstripes = stripes; + int part; + char *buf; + unsigned long long speed; + int degraded = 0; + + + if (posix_memalign((void**)&buf, 4096, disks * chunk)) + return 0; + + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + + sysfs_get_ll(sra, NULL, "sync_speed_min", &speed); + sysfs_set_num(sra, NULL, "sync_speed_min", 200000); + grow_backup(sra, start, stripes, + fds, offsets, + disks, chunk, level, layout, + dests, destfd, destoffsets, + 0, °raded, buf); + grow_backup(sra, (start + stripes) * (chunk/512), stripes, + fds, offsets, + disks, chunk, level, layout, + dests, destfd, destoffsets, + 1, °raded, buf); + validate(afd, destfd[0], destoffsets[0]); + part = 0; + start += stripes * 2; /* where to read next */ + size = sra->component_size / (chunk/512); + while (start < size) { + if (wait_backup(sra, (start-stripes*2)*(chunk/512), + 
stripes*(chunk/512), 0, + dests, destfd, destoffsets, + part) < 0) + return 0; + sysfs_set_num(sra, NULL, "suspend_lo", start*(chunk/512) * data); + if (start + stripes > size) + tailstripes = (size - start); + + grow_backup(sra, start*(chunk/512), tailstripes, + fds, offsets, + disks, chunk, level, layout, + dests, destfd, destoffsets, + part, °raded, buf); + start += stripes; + part = 1 - part; + validate(afd, destfd[0], destoffsets[0]); + } + if (wait_backup(sra, (start-stripes*2) * (chunk/512), stripes * (chunk/512), 0, + dests, destfd, destoffsets, + part) < 0) + return 0; + sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*(chunk/512)) * data); + wait_backup(sra, (start-stripes) * (chunk/512), tailstripes * (chunk/512), 0, + dests, destfd, destoffsets, + 1-part); + sysfs_set_num(sra, NULL, "suspend_lo", (size*(chunk/512)) * data); + sysfs_set_num(sra, NULL, "sync_speed_min", speed); + free(buf); + return 1; } /* @@ -862,28 +1676,35 @@ * write that data into the array and update the super blocks with * the new reshape_progress */ -int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt, char *backup_file) +int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt, + char *backup_file, int verbose) { int i, j; int old_disks; unsigned long long *offsets; - unsigned long long nstripe, ostripe, last_block; + unsigned long long nstripe, ostripe; int ndata, odata; - if (info->delta_disks < 0) - return 1; /* cannot handle a shrink */ - if (info->new_level != info->array.level || - info->new_layout != info->array.layout || - info->new_chunk != info->array.chunk_size) - return 1; /* Can only handle change in disks */ + if (info->new_level != info->array.level) + return 1; /* Cannot handle level changes (they are instantaneous) */ + + odata = info->array.raid_disks - info->delta_disks - 1; + if (info->array.level == 6) odata--; /* number of data disks */ + ndata = info->array.raid_disks - 1; + if (info->new_level == 
6) ndata--; old_disks = info->array.raid_disks - info->delta_disks; + if (info->delta_disks <= 0) + /* Didn't grow, so the backup file must have + * been used + */ + old_disks = cnt; for (i=old_disks-(backup_file?1:0); iuuid, 16) != 0) + } + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 && + bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) { + if (verbose) + fprintf(stderr, Name ": Bad backup-metadata checksum2 on %s\n", devname); + continue; /* Bad second checksum */ + } + if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) { + if (verbose) + fprintf(stderr, Name ": Wrong uuid on backup-metadata on %s\n", devname); continue; /* Wrong uuid */ + } - if (info->array.utime > __le64_to_cpu(bsb.mtime) + 3600 || - info->array.utime < __le64_to_cpu(bsb.mtime)) - continue; /* time stamp is too bad */ - - if (__le64_to_cpu(bsb.arraystart) != 0) - continue; /* Can only handle backup from start of array */ - if (__le64_to_cpu(bsb.length) < - info->reshape_progress) - continue; /* No new data here */ + /* array utime and backup-mtime should be updated at much the same time, but it seems that + * sometimes they aren't... So allow considerable flexability in matching, and allow + * this test to be overridden by an environment variable. 
+ */ + if (info->array.utime > (int)__le64_to_cpu(bsb.mtime) + 2*60*60 || + info->array.utime < (int)__le64_to_cpu(bsb.mtime) - 10*60) { + if (check_env("MDADM_GROW_ALLOW_OLD")) { + fprintf(stderr, Name ": accepting backup with timestamp %lu " + "for array with timestamp %lu\n", + (unsigned long)__le64_to_cpu(bsb.mtime), + (unsigned long)info->array.utime); + } else { + if (verbose) + fprintf(stderr, Name ": too-old timestamp on " + "backup-metadata on %s\n", devname); + continue; /* time stamp is too bad */ + } + } - if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) + if (bsb.magic[15] == '1') { + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) < + info->reshape_progress) { + nonew: + if (verbose) + fprintf(stderr, Name ": backup-metadata found on %s but is not needed\n", devname); + continue; /* No new data here */ + } + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress) + goto nonew; /* No new data here */ + } + } else { + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) < + info->reshape_progress && + __le64_to_cpu(bsb.arraystart2) + __le64_to_cpu(bsb.length2) < + info->reshape_progress) + goto nonew; /* No new data here */ + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress && + __le64_to_cpu(bsb.arraystart2) >= + info->reshape_progress) + goto nonew; /* No new data here */ + } + } + if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) { + second_fail: + if (verbose) + fprintf(stderr, Name ": Failed to verify secondary backup-metadata block on %s\n", + devname); continue; /* Cannot seek */ + } /* There should be a duplicate backup superblock 4k before here */ if (lseek64(fd, -4096, 1) < 0 || - read(fd, buf, 4096) != 4096 || - memcmp(buf, &bsb, sizeof(bsb)) != 0) - 
continue; /* Cannot find leading superblock */ + read(fd, &bsb2, sizeof(bsb2)) != sizeof(bsb2)) + goto second_fail; /* Cannot find leading superblock */ + if (bsb.magic[15] == '1') + bsbsize = offsetof(struct mdp_backup_super, pad1); + else + bsbsize = offsetof(struct mdp_backup_super, pad); + if (memcmp(&bsb2, &bsb, bsbsize) != 0) + goto second_fail; /* Cannot find leading superblock */ /* Now need the data offsets for all devices. */ offsets = malloc(sizeof(*offsets)*info->array.raid_disks); @@ -948,7 +1848,7 @@ continue; st->ss->getinfo_super(st, &dinfo); st->ss->free_super(st); - offsets[j] = dinfo.data_offset; + offsets[j] = dinfo.data_offset * 512; } printf(Name ": restoring critical section\n"); @@ -958,47 +1858,263 @@ info->new_level, info->new_layout, fd, __le64_to_cpu(bsb.devstart)*512, - 0, __le64_to_cpu(bsb.length)*512)) { + __le64_to_cpu(bsb.arraystart)*512, + __le64_to_cpu(bsb.length)*512)) { + /* didn't succeed, so giveup */ + if (verbose) + fprintf(stderr, Name ": Error restoring backup from %s\n", + devname); + return 1; + } + + if (bsb.magic[15] == '2' && + restore_stripes(fdlist, offsets, + info->array.raid_disks, + info->new_chunk, + info->new_level, + info->new_layout, + fd, __le64_to_cpu(bsb.devstart)*512 + + __le64_to_cpu(bsb.devstart2)*512, + __le64_to_cpu(bsb.arraystart2)*512, + __le64_to_cpu(bsb.length2)*512)) { /* didn't succeed, so giveup */ + if (verbose) + fprintf(stderr, Name ": Error restoring second backup from %s\n", + devname); return 1; } + /* Ok, so the data is restored. Let's update those superblocks. 
*/ + if (info->delta_disks >= 0) { + info->reshape_progress = __le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length); + if (bsb.magic[15] == '2') { + unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2); + if (p2 > info->reshape_progress) + info->reshape_progress = p2; + } + } else { + info->reshape_progress = __le64_to_cpu(bsb.arraystart); + if (bsb.magic[15] == '2') { + unsigned long long p2 = __le64_to_cpu(bsb.arraystart2); + if (p2 < info->reshape_progress) + info->reshape_progress = p2; + } + } for (j=0; jarray.raid_disks; j++) { if (fdlist[j] < 0) continue; if (st->ss->load_super(st, fdlist[j], NULL)) continue; st->ss->getinfo_super(st, &dinfo); - dinfo.reshape_progress = __le64_to_cpu(bsb.length); + dinfo.reshape_progress = info->reshape_progress; st->ss->update_super(st, &dinfo, "_reshape_progress", NULL,0, 0, NULL); st->ss->store_super(st, fdlist[j]); st->ss->free_super(st); } - - /* And we are done! */ return 0; } /* Didn't find any backup data, try to see if any * was needed. */ - nstripe = ostripe = 0; - odata = info->array.raid_disks - info->delta_disks - 1; - if (info->array.level == 6) odata--; /* number of data disks */ - ndata = info->array.raid_disks - 1; - if (info->new_level == 6) ndata--; - last_block = 0; - while (nstripe >= ostripe) { - nstripe += info->new_chunk / 512; - last_block = nstripe * ndata; - ostripe = last_block / odata / (info->array.chunk_size/512) * - (info->array.chunk_size/512); + if (info->delta_disks < 0) { + /* When shrinking, the critical section is at the end. + * So see if we are before the critical section. 
+ */ + unsigned long long first_block; + nstripe = ostripe = 0; + first_block = 0; + while (ostripe >= nstripe) { + ostripe += info->array.chunk_size / 512; + first_block = ostripe * odata; + nstripe = first_block / ndata / (info->new_chunk/512) * + (info->new_chunk/512); + } + + if (info->reshape_progress >= first_block) + return 0; } + if (info->delta_disks > 0) { + /* See if we are beyond the critical section. */ + unsigned long long last_block; + nstripe = ostripe = 0; + last_block = 0; + while (nstripe >= ostripe) { + nstripe += info->new_chunk / 512; + last_block = nstripe * ndata; + ostripe = last_block / odata / (info->array.chunk_size/512) * + (info->array.chunk_size/512); + } - if (info->reshape_progress >= last_block) - return 0; + if (info->reshape_progress >= last_block) + return 0; + } /* needed to recover critical section! */ + if (verbose) + fprintf(stderr, Name ": Failed to find backup of critical section\n"); return 1; } + +int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, + char *backup_file) +{ + /* Array is assembled and ready to be started, but + * monitoring is probably required. 
+ * So: + * - start read-only + * - set upper bound for resync + * - initialise the 'suspend' boundaries + * - switch to read-write + * - fork and continue monitoring + */ + int err; + int backup_list[1]; + unsigned long long backup_offsets[1]; + int odisks, ndisks, ochunk, nchunk,odata,ndata; + unsigned long a,b,blocks,stripes; + int backup_fd; + int *fds; + unsigned long long *offsets; + int d; + struct mdinfo *sra, *sd; + int rv; + unsigned long cache; + int done = 0; + + err = sysfs_set_str(info, NULL, "array_state", "readonly"); + if (err) + return err; + + /* make sure reshape doesn't progress until we are ready */ + sysfs_set_str(info, NULL, "sync_max", "0"); + sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */ + + sra = sysfs_read(-1, devname2devnum(info->sys_name), + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE| + GET_CACHE); + if (!sra) + return 1; + + /* ndisks is not growing, so raid_disks is old and +delta is new */ + odisks = info->array.raid_disks; + ndisks = odisks + info->delta_disks; + odata = odisks - 1; + ndata = ndisks - 1; + if (info->array.level == 6) { + odata--; + ndata--; + } + ochunk = info->array.chunk_size; + nchunk = info->new_chunk; + + a = (ochunk/512) * odata; + b = (nchunk/512) * ndata; + /* Find GCD */ + while (a != b) { + if (a < b) + b -= a; + if (b < a) + a -= b; + } + /* LCM == product / GCD */ + blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a; + + if (ndata == odata) + while (blocks * 32 < sra->component_size && + blocks < 16*1024*2) + blocks *= 2; + stripes = blocks / (info->array.chunk_size/512) / odata; + + /* check that the internal stripe cache is + * large enough, or it won't work. + */ + cache = (nchunk < ochunk) ? 
ochunk : nchunk; + cache = cache * 4 / 4096; + if (cache < blocks / 8 / odisks + 16) + /* Make it big enough to hold 'blocks' */ + cache = blocks / 8 / odisks + 16; + if (sra->cache_size < cache) + sysfs_set_num(sra, NULL, "stripe_cache_size", + cache+1); + + memset(&bsb, 0, 512); + memcpy(bsb.magic, "md_backup_data-1", 16); + memcpy(&bsb.set_uuid, info->uuid, 16); + bsb.mtime = __cpu_to_le64(time(0)); + bsb.devstart2 = blocks; + + backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR); + backup_list[0] = backup_fd; + backup_offsets[0] = 8 * 512; + fds = malloc(odisks * sizeof(fds[0])); + offsets = malloc(odisks * sizeof(offsets[0])); + for (d=0; ddevs; sd; sd = sd->next) { + if (sd->disk.state & (1<disk.state & (1<disk.major, + sd->disk.minor, 1); + fds[sd->disk.raid_disk] + = dev_open(dn, O_RDONLY); + offsets[sd->disk.raid_disk] = sd->data_offset*512; + if (fds[sd->disk.raid_disk] < 0) { + fprintf(stderr, Name ": %s: cannot open component %s\n", + info->sys_name, dn?dn:"-unknown-"); + rv = 1; + goto release; + } + free(dn); + } + } + + switch(fork()) { + case 0: + close(mdfd); + mlockall(MCL_FUTURE); + if (info->delta_disks < 0) + done = child_shrink(-1, info, stripes, + fds, offsets, + info->array.raid_disks, + info->array.chunk_size, + info->array.level, info->array.layout, + odata, + 1, backup_list, backup_offsets); + else if (info->delta_disks == 0) { + /* The 'start' is a per-device stripe number. + * reshape_progress is a per-array sector number. 
+ * So divide by ndata * chunk_size + */ + unsigned long long start = info->reshape_progress / ndata; + start /= (info->array.chunk_size/512); + done = child_same_size(-1, info, stripes, + fds, offsets, + start, + info->array.raid_disks, + info->array.chunk_size, + info->array.level, info->array.layout, + odata, + 1, backup_list, backup_offsets); + } + if (backup_file && done) + unlink(backup_file); + /* FIXME should I intuit a level change */ + exit(0); + case -1: + fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n", + strerror(errno)); + return 1; + default: + break; + } +release: + return 0; +} + + diff -Nru mdadm-2.6.7.1/Incremental.c mdadm-3.1.4/Incremental.c --- mdadm-2.6.7.1/Incremental.c 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/Incremental.c 2010-08-31 10:18:39.000000000 +0300 @@ -2,7 +2,7 @@ * Incremental.c - support --incremental. Part of: * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2006 Neil Brown + * Copyright (C) 2006-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -37,10 +37,11 @@ char *array_name); int Incremental(char *devname, int verbose, int runstop, - struct supertype *st, char *homehost, int autof) + struct supertype *st, char *homehost, int require_homehost, + int autof) { /* Add this device to an array, creating the array if necessary - * and starting the array if sensibe or - if runstop>0 - if possible. + * and starting the array if sensible or - if runstop>0 - if possible. * * This has several steps: * @@ -48,7 +49,8 @@ * 2/ Find metadata, reject if none appropriate (check * version/name from args) * 3/ Check if there is a match in mdadm.conf - * 3a/ if not, check for homehost match. If no match, reject. + * 3a/ if not, check for homehost match. If no match, assemble as + * a 'foreign' array. * 4/ Determine device number. 
* - If in mdadm.conf with std name, use that * - UUID in /var/run/mdadm.map use that @@ -56,6 +58,7 @@ * - Choose a free, high number. * - Use a partitioned device unless strong suggestion not to. * e.g. auto=md + * Don't choose partitioned for containers. * 5/ Find out if array already exists * 5a/ if it does not * - choose a name, from mdadm.conf or 'name' field in array. @@ -67,6 +70,7 @@ * - add the device * 6/ Make sure /var/run/mdadm.map contains this array. * 7/ Is there enough devices to possibly start the array? + * For a container, this means running Incremental_container. * 7a/ if not, finish with success. * 7b/ if yes, * - read all metadata and arrange devices like -A does @@ -74,23 +78,22 @@ * start the array (auto-readonly). */ struct stat stb; - struct mdinfo info, info2; + struct mdinfo info; struct mddev_ident_s *array_list, *match; char chosen_name[1024]; int rv; - int devnum; struct map_ent *mp, *map = NULL; int dfd, mdfd; char *avail; int active_disks; - + int trustworthy = FOREIGN; + char *name_to_use; + mdu_array_info_t ainf; struct createinfo *ci = conf_get_create_info(); - if (autof == 0) - autof = ci->autof; - /* 1/ Check if devices is permitted by mdadm.conf */ + /* 1/ Check if device is permitted by mdadm.conf */ if (!conf_test_dev(devname)) { if (verbose >= 0) @@ -140,9 +143,10 @@ close(dfd); return 1; } - st->ss->getinfo_super(st, &info); close (dfd); + memset(&info, 0, sizeof(info)); + st->ss->getinfo_super(st, &info); /* 3/ Check if there is a match in mdadm.conf */ array_list = conf_get_ident(NULL); @@ -151,7 +155,7 @@ if (array_list->uuid_set && same_uuid(array_list->uuid, info.uuid, st->ss->swapuuid) == 0) { - if (verbose >= 2) + if (verbose >= 2 && array_list->devname) fprintf(stderr, Name ": UUID differs from %s.\n", array_list->devname); @@ -159,7 +163,7 @@ } if (array_list->name[0] && strcasecmp(array_list->name, info.name) != 0) { - if (verbose >= 2) + if (verbose >= 2 && array_list->devname) fprintf(stderr, Name ": Name 
differs from %s.\n", array_list->devname); @@ -167,7 +171,7 @@ } if (array_list->devices && !match_oneof(array_list->devices, devname)) { - if (verbose >= 2) + if (verbose >= 2 && array_list->devname) fprintf(stderr, Name ": Not a listed device for %s.\n", array_list->devname); @@ -175,7 +179,7 @@ } if (array_list->super_minor != UnSet && array_list->super_minor != info.array.md_minor) { - if (verbose >= 2) + if (verbose >= 2 && array_list->devname) fprintf(stderr, Name ": Different super-minor to %s.\n", array_list->devname); @@ -185,7 +189,7 @@ !array_list->name[0] && !array_list->devices && array_list->super_minor == UnSet) { - if (verbose >= 2) + if (verbose >= 2 && array_list->devname) fprintf(stderr, Name ": %s doesn't have any identifying information.\n", array_list->devname); @@ -194,113 +198,142 @@ /* FIXME, should I check raid_disks and level too?? */ if (match) { - if (verbose >= 0) - fprintf(stderr, Name + if (verbose >= 0) { + if (match->devname && array_list->devname) + fprintf(stderr, Name ": we match both %s and %s - cannot decide which to use.\n", - match->devname, array_list->devname); + match->devname, array_list->devname); + else + fprintf(stderr, Name + ": multiple lines in mdadm.conf match\n"); + } return 2; } match = array_list; } - /* 3a/ if not, check for homehost match. If no match, reject. */ - if (!match) { - if (homehost == NULL || - st->ss->match_home(st, homehost) == 0) { - if (verbose >= 0) - fprintf(stderr, Name - ": not found in mdadm.conf and not identified by homehost.\n"); - return 2; - } + if (match && match->devname + && strcasecmp(match->devname, "") == 0) { + if (verbose >= 0) + fprintf(stderr, Name ": array containing %s is explicitly" + " ignored by mdadm.conf\n", + devname); + return 1; } - /* 4/ Determine device number. */ - /* - If in mdadm.conf with std name, use that */ - /* - UUID in /var/run/mdadm.map use that */ - /* - If name is suggestive, use that. unless in use with */ - /* different uuid. 
*/ - /* - Choose a free, high number. */ - /* - Use a partitioned device unless strong suggestion not to. */ - /* e.g. auto=md */ - if (match && is_standard(match->devname, &devnum)) - /* We have devnum now */; - else if ((mp = map_by_uuid(&map, info.uuid)) != NULL) - devnum = mp->devnum; - else { - /* Have to guess a bit. */ - int use_partitions = 1; - char *np, *ep; - if ((autof&7) == 3 || (autof&7) == 5) - use_partitions = 0; - np = strchr(info.name, ':'); - if (np) - np++; - else - np = info.name; - devnum = strtoul(np, &ep, 10); - if (ep > np && *ep == 0) { - /* This is a number. Let check that it is unused. */ - if (mddev_busy(use_partitions ? (-1-devnum) : devnum)) - devnum = -1; - } else - devnum = -1; - - if (devnum < 0) { - /* Haven't found anything yet, choose something free */ - devnum = find_free_devnum(use_partitions); - if (devnum == NoMdDev) { - fprintf(stderr, Name - ": No spare md devices!!\n"); - return 2; - } - } else - devnum = use_partitions ? (-1-devnum) : devnum; + /* 3a/ if not, check for homehost match. If no match, continue + * but don't trust the 'name' in the array. Thus a 'random' minor + * number will be assigned, and the device name will be based + * on that. */ + if (match) + trustworthy = LOCAL; + else if (st->ss->match_home(st, homehost) == 1) + trustworthy = LOCAL; + else if (st->ss->match_home(st, "any") == 1) + trustworthy = LOCAL_ANY; + else + trustworthy = FOREIGN; + + + if (!match && !conf_test_metadata(st->ss->name, + (trustworthy == LOCAL))) { + if (verbose >= 1) + fprintf(stderr, Name + ": %s has metadata type %s for which " + "auto-assembly is disabled\n", + devname, st->ss->name); + return 1; } - mdfd = open_mddev_devnum(match ? match->devname : NULL, - devnum, - info.name, - chosen_name, autof >> 3); + if (trustworthy == LOCAL_ANY) + trustworthy = LOCAL; + + /* There are three possible sources for 'autof': command line, + * ARRAY line in mdadm.conf, or CREATE line in mdadm.conf. 
+ * ARRAY takes precedence, then command line, then + * CREATE. + */ + if (match && match->autof) + autof = match->autof; + if (autof == 0) + autof = ci->autof; + + if (st->ss->container_content && st->loaded_container) { + if ((runstop > 0 && info.container_enough >= 0) || + info.container_enough > 0) + /* pass */; + else { + if (verbose) + fprintf(stderr, Name ": not enough devices to start the container\n"); + return 0; + } + + /* This is a pre-built container array, so we do something + * rather different. + */ + return Incremental_container(st, devname, verbose, runstop, + autof, trustworthy); + } + + name_to_use = info.name; + if (name_to_use[0] == 0 && + info.array.level == LEVEL_CONTAINER && + trustworthy == LOCAL) { + name_to_use = info.text_version; + trustworthy = METADATA; + } + if (name_to_use[0] && trustworthy != LOCAL && + ! require_homehost && + conf_name_is_free(name_to_use)) + trustworthy = LOCAL; + + /* strip "hostname:" prefix from name if we have decided + * to treat it as LOCAL + */ + if (trustworthy == LOCAL && strchr(name_to_use, ':') != NULL) + name_to_use = strchr(name_to_use, ':')+1; + + /* 4/ Check if array exists. + */ + if (map_lock(&map)) + fprintf(stderr, Name ": failed to get exclusive lock on " + "mapfile\n"); + mp = map_by_uuid(&map, info.uuid); + if (mp) + mdfd = open_dev(mp->devnum); + else + mdfd = -1; + if (mdfd < 0) { - fprintf(stderr, Name ": failed to open %s: %s.\n", - chosen_name, strerror(errno)); - return 2; - } - /* 5/ Find out if array already exists */ - if (! mddev_busy(devnum)) { - /* 5a/ if it does not */ - /* - choose a name, from mdadm.conf or 'name' field in array. 
*/ - /* - create the array */ - /* - add the device */ - mdu_array_info_t ainf; - mdu_disk_info_t disk; - char md[20]; struct mdinfo *sra; + struct mdinfo dinfo; - memset(&ainf, 0, sizeof(ainf)); - ainf.major_version = st->ss->major; - ainf.minor_version = st->minor_version; - if (ioctl(mdfd, SET_ARRAY_INFO, &ainf) != 0) { - fprintf(stderr, Name - ": SET_ARRAY_INFO failed for %s: %s\b", + /* Couldn't find an existing array, maybe make a new one */ + mdfd = create_mddev(match ? match->devname : NULL, + name_to_use, autof, trustworthy, chosen_name); + + if (mdfd < 0) + return 1; + + sysfs_init(&info, mdfd, 0); + + if (set_array_info(mdfd, st, &info) != 0) { + fprintf(stderr, Name ": failed to set array info for %s: %s\n", chosen_name, strerror(errno)); close(mdfd); return 2; } - sprintf(md, "%d.%d\n", st->ss->major, st->minor_version); - sra = sysfs_read(mdfd, devnum, GET_VERSION); - sysfs_set_str(sra, NULL, "metadata_version", md); - memset(&disk, 0, sizeof(disk)); - disk.major = major(stb.st_rdev); - disk.minor = minor(stb.st_rdev); - sysfs_free(sra); - if (ioctl(mdfd, ADD_NEW_DISK, &disk) != 0) { + + dinfo = info; + dinfo.disk.major = major(stb.st_rdev); + dinfo.disk.minor = minor(stb.st_rdev); + if (add_disk(mdfd, st, &info, &dinfo) != 0) { fprintf(stderr, Name ": failed to add %s to %s: %s.\n", devname, chosen_name, strerror(errno)); ioctl(mdfd, STOP_ARRAY, 0); close(mdfd); return 2; } - sra = sysfs_read(mdfd, devnum, GET_DEVS); + sra = sysfs_read(mdfd, fd2devnum(mdfd), GET_DEVS); if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) { /* It really should be 'none' - must be old buggy * kernel, and mdadm -I may not be able to complete. @@ -314,6 +347,12 @@ sysfs_free(sra); return 2; } + info.array.working_disks = 1; + sysfs_free(sra); + /* 6/ Make sure /var/run/mdadm.map contains this array. 
*/ + map_update(&map, fd2devnum(mdfd), + info.text_version, + info.uuid, chosen_name); } else { /* 5b/ if it does */ /* - check one drive in array to make sure metadata is a reasonably */ @@ -321,60 +360,88 @@ /* - add the device */ char dn[20]; int dfd2; - mdu_disk_info_t disk; int err; struct mdinfo *sra; struct supertype *st2; - sra = sysfs_read(mdfd, devnum, (GET_VERSION | GET_DEVS | - GET_STATE)); + struct mdinfo info2, *d; - if (sra->array.major_version != st->ss->major || - sra->array.minor_version != st->minor_version) { - if (verbose >= 0) + if (mp->path) + strcpy(chosen_name, mp->path); + else + strcpy(chosen_name, devnum2devname(mp->devnum)); + + /* It is generally not OK to add non-spare drives to a + * running array as they are probably missing because + * they failed. However if runstop is 1, then the + * array was possibly started early and our best be is + * to add this anyway. It would probably be good to + * allow explicit policy statement about this. + */ + if ((info.disk.state & (1<ss->external) { + char *devname = devnum2devname(fd2devnum(mdfd)); + + active = devname && is_container_active(devname); + free(devname); + } else if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) + active = 1; + if (active) { fprintf(stderr, Name - ": %s has different metadata to chosen array %s %d.%d %d.%d.\n", - devname, chosen_name, - sra->array.major_version, - sra->array.minor_version, - st->ss->major, st->minor_version); - close(mdfd); - return 1; - } - sprintf(dn, "%d:%d", sra->devs->disk.major, - sra->devs->disk.minor); - dfd2 = dev_open(dn, O_RDONLY); - st2 = dup_super(st); - if (st2->ss->load_super(st2, dfd2, NULL)) { - fprintf(stderr, Name - ": Strange error loading metadata for %s.\n", - chosen_name); - close(mdfd); - close(dfd2); - return 2; + ": not adding %s to active array (without --run) %s\n", + devname, chosen_name); + close(mdfd); + return 2; + } } - close(dfd2); - st2->ss->getinfo_super(st2, &info2); - st2->ss->free_super(st2); - if (info.array.level 
!= info2.array.level || - memcmp(info.uuid, info2.uuid, 16) != 0 || - info.array.raid_disks != info2.array.raid_disks) { - fprintf(stderr, Name - ": unexpected difference between %s and %s.\n", - chosen_name, devname); - close(mdfd); + sra = sysfs_read(mdfd, fd2devnum(mdfd), (GET_DEVS | GET_STATE)); + if (!sra) return 2; + + if (sra->devs) { + sprintf(dn, "%d:%d", sra->devs->disk.major, + sra->devs->disk.minor); + dfd2 = dev_open(dn, O_RDONLY); + st2 = dup_super(st); + if (st2->ss->load_super(st2, dfd2, NULL) || + st->ss->compare_super(st, st2) != 0) { + fprintf(stderr, Name + ": metadata mismatch between %s and " + "chosen array %s\n", + devname, chosen_name); + close(mdfd); + close(dfd2); + return 2; + } + close(dfd2); + memset(&info2, 0, sizeof(info2)); + st2->ss->getinfo_super(st2, &info2); + st2->ss->free_super(st2); + if (info.array.level != info2.array.level || + memcmp(info.uuid, info2.uuid, 16) != 0 || + info.array.raid_disks != info2.array.raid_disks) { + fprintf(stderr, Name + ": unexpected difference between %s and %s.\n", + chosen_name, devname); + close(mdfd); + return 2; + } } - memset(&disk, 0, sizeof(disk)); - disk.major = major(stb.st_rdev); - disk.minor = minor(stb.st_rdev); - err = ioctl(mdfd, ADD_NEW_DISK, &disk); + info2.disk.major = major(stb.st_rdev); + info2.disk.minor = minor(stb.st_rdev); + /* add disk needs to know about containers */ + if (st->ss->external) + sra->array.level = LEVEL_CONTAINER; + err = add_disk(mdfd, st, sra, &info2); if (err < 0 && errno == EBUSY) { /* could be another device present with the same * disk.number. 
Find and reject any such */ find_reject(mdfd, st, sra, info.disk.number, info.events, verbose, chosen_name); - err = ioctl(mdfd, ADD_NEW_DISK, &disk); + err = add_disk(mdfd, st, sra, &info2); } if (err < 0) { fprintf(stderr, Name ": failed to add %s to %s: %s.\n", @@ -382,15 +449,33 @@ close(mdfd); return 2; } + info.array.working_disks = 0; + for (d = sra->devs; d; d=d->next) + info.array.working_disks ++; + } - /* 6/ Make sure /var/run/mdadm.map contains this array. */ - map_update(&map, devnum, - info.array.major_version, - info.array.minor_version, - info.uuid, chosen_name); /* 7/ Is there enough devices to possibly start the array? */ /* 7a/ if not, finish with success. */ + if (info.array.level == LEVEL_CONTAINER) { + /* Try to assemble within the container */ + map_unlock(&map); + sysfs_uevent(&info, "change"); + if (verbose >= 0) + fprintf(stderr, Name + ": container %s now has %d devices\n", + chosen_name, info.array.working_disks); + wait_for(chosen_name, mdfd); + close(mdfd); + rv = Incremental(chosen_name, verbose, runstop, + NULL, homehost, require_homehost, autof); + if (rv == 1) + /* Don't fail the whole -I if a subarray didn't + * have enough devices to start yet + */ + rv = 0; + return rv; + } avail = NULL; active_disks = count_active(st, mdfd, &avail, &info); if (enough(info.array.level, info.array.raid_disks, @@ -401,6 +486,7 @@ fprintf(stderr, Name ": %s attached to %s, not enough to start (%d).\n", devname, chosen_name, active_disks); + map_unlock(&map); close(mdfd); return 0; } @@ -411,18 +497,18 @@ /* are enough, */ /* + add any bitmap file */ /* + start the array (auto-readonly). 
*/ -{ - mdu_array_info_t ainf; if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) { if (verbose >= 0) fprintf(stderr, Name ": %s attached to %s which is already active.\n", devname, chosen_name); - close (mdfd); + close(mdfd); + map_unlock(&map); return 0; } -} + + map_unlock(&map); if (runstop > 0 || active_disks >= info.array.working_disks) { struct mdinfo *sra; /* Let's try to start it */ @@ -445,8 +531,9 @@ } close(bmfd); } - sra = sysfs_read(mdfd, devnum, 0); - if (sra == NULL || active_disks >= info.array.working_disks) + sra = sysfs_read(mdfd, fd2devnum(mdfd), 0); + if ((sra == NULL || active_disks >= info.array.working_disks) + && trustworthy != FOREIGN) rv = ioctl(mdfd, RUN_ARRAY, NULL); else rv = sysfs_set_str(sra, NULL, @@ -457,6 +544,7 @@ ": %s attached to %s, which has been started.\n", devname, chosen_name); rv = 0; + wait_for(chosen_name, mdfd); } else { fprintf(stderr, Name ": %s attached to %s, but failed to start: %s.\n", @@ -528,6 +616,9 @@ struct mdinfo *sra = sysfs_read(mdfd, -1, GET_DEVS | GET_STATE); char *avail = NULL; + if (!sra) + return 0; + for (d = sra->devs ; d ; d = d->next) { char dn[30]; int dfd; @@ -590,53 +681,6 @@ return cnt + cnt1; } -void RebuildMap(void) -{ - struct mdstat_ent *mdstat = mdstat_read(0, 0); - struct mdstat_ent *md; - struct map_ent *map = NULL; - int mdp = get_mdp_major(); - - for (md = mdstat ; md ; md = md->next) { - struct mdinfo *sra = sysfs_read(-1, md->devnum, GET_DEVS); - struct mdinfo *sd; - - for (sd = sra->devs ; sd ; sd = sd->next) { - char dn[30]; - int dfd; - int ok; - struct supertype *st; - char *path; - struct mdinfo info; - - sprintf(dn, "%d:%d", sd->disk.major, sd->disk.minor); - dfd = dev_open(dn, O_RDONLY); - if (dfd < 0) - continue; - st = guess_super(dfd); - if ( st == NULL) - ok = -1; - else - ok = st->ss->load_super(st, dfd, NULL); - close(dfd); - if (ok != 0) - continue; - st->ss->getinfo_super(st, &info); - if (md->devnum > 0) - path = map_dev(MD_MAJOR, md->devnum, 0); - else - path = 
map_dev(mdp, (-1-md->devnum)<< 6, 0); - map_add(&map, md->devnum, st->ss->major, - st->minor_version, - info.uuid, path ? : "/unknown"); - st->ss->free_super(st); - break; - } - } - map_write(map); - map_free(map); -} - int IncrementalScan(int verbose) { /* look at every device listed in the 'map' file. @@ -654,12 +698,11 @@ devs = conf_get_ident(NULL); for (me = mapl ; me ; me = me->next) { - char path[1024]; mdu_array_info_t array; mdu_bitmap_file_t bmf; struct mdinfo *sra; - int mdfd = open_mddev_devnum(me->path, me->devnum, - NULL, path, 0); + int mdfd = open_dev(me->devnum); + if (mdfd < 0) continue; if (ioctl(mdfd, GET_ARRAY_INFO, &array) == 0 || @@ -669,7 +712,8 @@ } /* Ok, we can try this one. Maybe it needs a bitmap */ for (mddev = devs ; mddev ; mddev = mddev->next) - if (strcmp(mddev->devname, me->path) == 0) + if (mddev->devname && me->path + && devname_matches(mddev->devname, me->path)) break; if (mddev && mddev->bitmap_file) { /* @@ -703,14 +747,182 @@ if (verbose >= 0) fprintf(stderr, Name ": started array %s\n", - me->path); + me->path ?: devnum2devname(me->devnum)); } else { fprintf(stderr, Name ": failed to start array %s: %s\n", - me->path, strerror(errno)); + me->path ?: devnum2devname(me->devnum), + strerror(errno)); rv = 1; } } } return rv; } + +static char *container2devname(char *devname) +{ + char *mdname = NULL; + + if (devname[0] == '/') { + int fd = open(devname, O_RDONLY); + if (fd >= 0) { + mdname = devnum2devname(fd2devnum(fd)); + close(fd); + } + } else { + int uuid[4]; + struct map_ent *mp, *map = NULL; + + if (!parse_uuid(devname, uuid)) + return mdname; + mp = map_by_uuid(&map, uuid); + if (mp) + mdname = devnum2devname(mp->devnum); + map_free(map); + } + + return mdname; +} + +int Incremental_container(struct supertype *st, char *devname, int verbose, + int runstop, int autof, int trustworthy) +{ + /* Collect the contents of this container and for each + * array, choose a device name and assemble the array. 
+ */ + + struct mdinfo *list = st->ss->container_content(st); + struct mdinfo *ra; + struct map_ent *map = NULL; + + if (map_lock(&map)) + fprintf(stderr, Name ": failed to get exclusive lock on " + "mapfile\n"); + + for (ra = list ; ra ; ra = ra->next) { + int mdfd; + char chosen_name[1024]; + struct map_ent *mp; + struct mddev_ident_s *match = NULL; + + mp = map_by_uuid(&map, ra->uuid); + + if (mp) { + mdfd = open_dev(mp->devnum); + if (mp->path) + strcpy(chosen_name, mp->path); + else + strcpy(chosen_name, devnum2devname(mp->devnum)); + } else { + + /* Check in mdadm.conf for container == devname and + * member == ra->text_version after second slash. + */ + char *sub = strchr(ra->text_version+1, '/'); + struct mddev_ident_s *array_list; + if (sub) { + sub++; + array_list = conf_get_ident(NULL); + } else + array_list = NULL; + for(; array_list ; array_list = array_list->next) { + char *dn; + if (array_list->member == NULL || + array_list->container == NULL) + continue; + if (strcmp(array_list->member, sub) != 0) + continue; + if (array_list->uuid_set && + !same_uuid(ra->uuid, array_list->uuid, st->ss->swapuuid)) + continue; + dn = container2devname(array_list->container); + if (dn == NULL) + continue; + if (strncmp(dn, ra->text_version+1, + strlen(dn)) != 0 || + ra->text_version[strlen(dn)+1] != '/') { + free(dn); + continue; + } + free(dn); + /* we have a match */ + match = array_list; + if (verbose>0) + fprintf(stderr, Name ": match found for member %s\n", + array_list->member); + break; + } + + if (match && match->devname && + strcasecmp(match->devname, "") == 0) { + if (verbose > 0) + fprintf(stderr, Name ": array %s/%s is " + "explicitly ignored by mdadm.conf\n", + match->container, match->member); + return 2; + } + if (match) + trustworthy = LOCAL; + + mdfd = create_mddev(match ? 
match->devname : NULL, + ra->name, + autof, + trustworthy, + chosen_name); + } + + if (mdfd < 0) { + fprintf(stderr, Name ": failed to open %s: %s.\n", + chosen_name, strerror(errno)); + return 2; + } + + assemble_container_content(st, mdfd, ra, runstop, + chosen_name, verbose); + } + map_unlock(&map); + return 0; +} + +/* + * IncrementalRemove - Attempt to see if the passed in device belongs to any + * raid arrays, and if so first fail (if needed) and then remove the device. + * + * @devname - The device we want to remove + * + * Note: the device name must be a kernel name like "sda", so + * that we can find it in /proc/mdstat + */ +int IncrementalRemove(char *devname, int verbose) +{ + int mdfd; + int rv; + struct mdstat_ent *ent; + struct mddev_dev_s devlist; + + if (strchr(devname, '/')) { + fprintf(stderr, Name ": incremental removal requires a " + "kernel device name, not a file: %s\n", devname); + return 1; + } + ent = mdstat_by_component(devname); + if (!ent) { + fprintf(stderr, Name ": %s does not appear to be a component " + "of any array\n", devname); + return 1; + } + mdfd = open_dev(ent->devnum); + if (mdfd < 0) { + fprintf(stderr, Name ": Cannot open array %s!!\n", ent->dev); + return 1; + } + memset(&devlist, 0, sizeof(devlist)); + devlist.devname = devname; + devlist.disposition = 'f'; + Manage_subdevs(ent->dev, mdfd, &devlist, verbose, 0); + devlist.disposition = 'r'; + rv = Manage_subdevs(ent->dev, mdfd, &devlist, verbose, 0); + close(mdfd); + return rv; +} diff -Nru mdadm-2.6.7.1/inventory mdadm-3.1.4/inventory --- mdadm-2.6.7.1/inventory 2008-10-15 08:29:37.000000000 +0300 +++ mdadm-3.1.4/inventory 2010-08-31 10:21:13.000000000 +0300 @@ -1,28 +1,13 @@ -ANNOUNCE-2.0 -ANNOUNCE-2.1 -ANNOUNCE-2.2 -ANNOUNCE-2.3 -ANNOUNCE-2.3.1 -ANNOUNCE-2.4 -ANNOUNCE-2.4.1 -ANNOUNCE-2.4-pre1 -ANNOUNCE-2.5 -ANNOUNCE-2.5.1 -ANNOUNCE-2.5.2 -ANNOUNCE-2.5.3 -ANNOUNCE-2.5.4 -ANNOUNCE-2.5.5 -ANNOUNCE-2.5.6 -ANNOUNCE-2.6 -ANNOUNCE-2.6.1 -ANNOUNCE-2.6.2 -ANNOUNCE-2.6.3 
-ANNOUNCE-2.6.4 -ANNOUNCE-2.6.5 -ANNOUNCE-2.6.6 -ANNOUNCE-2.6.7 -ANNOUNCE-2.6.7.1 +ANNOUNCE-3.0 +ANNOUNCE-3.0.1 +ANNOUNCE-3.0.2 +ANNOUNCE-3.0.3 +ANNOUNCE-3.1 +ANNOUNCE-3.1.1 +ANNOUNCE-3.1.2 +ANNOUNCE-3.1.3 +ANNOUNCE-3.1.4 Assemble.c bitmap.c bitmap.h @@ -30,6 +15,8 @@ ChangeLog config.c COPYING +crc32.c +crc32.h Create.c Detail.c dlink.c @@ -43,14 +30,17 @@ kernel-patch-2.6.18 kernel-patch-2.6.18.6 kernel-patch-2.6.19 +kernel-patch-2.6.25 +kernel-patch-2.6.27 Kill.c makedist Makefile Manage.c +managemon.c mapfile.c md.4 md5.h -mdadm.8 +mdadm.8.in mdadm.c mdadm.conf.5 mdadm.conf-example @@ -58,6 +48,9 @@ mdadm.spec mdassemble.8 mdassemble.c +mdmon.8 +mdmon.c +mdmon.h mdopen.c md_p.h mdstat.c @@ -65,17 +58,27 @@ misc/ misc/syslog-events mkinitramfs +monitor.c Monitor.c +msg.c +msg.h +platform-intel.c +platform-intel.h +probe_roms.c +probe_roms.h pwgr.c Query.c raid5extend.c ReadMe.c README.initramfs restripe.c +sg_io.c sha1.c sha1.h super0.c super1.c +super-ddf.c +super-intel.c swap_super.c sysfs.c test @@ -90,6 +93,8 @@ tests/00raid6 tests/01r1fail tests/01r5fail +tests/01r5integ +tests/01raid6integ tests/02lineargrow tests/02r1add tests/02r1grow @@ -121,9 +126,21 @@ tests/06wrmostly tests/07autoassemble tests/07autodetect +tests/07changelevelintr +tests/07changelevels +tests/07layouts +tests/07reshape5intr tests/07testreshape5 +tests/08imsm-overlap +tests/09imsm-assemble +tests/09imsm-create-fail-rebuild +tests/10ddf-create tests/check +tests/env-08imsm-overlap +tests/env-09imsm-assemble +tests/env-09imsm-create-fail-rebuild tests/testdev tests/ToTest TODO +udev-md-raid.rules util.c diff -Nru mdadm-2.6.7.1/kernel-patch-2.6.25 mdadm-3.1.4/kernel-patch-2.6.25 --- mdadm-2.6.7.1/kernel-patch-2.6.25 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/kernel-patch-2.6.25 2010-03-22 08:08:42.000000000 +0200 @@ -0,0 +1,199 @@ +Status: ok + +Support adding a spare to a live md array with external metadata. + +i.e. 
extend the 'md/dev-XXX/slot' attribute so that you can +tell a device to fill an vacant slot in an and md array. + + +Signed-off-by: Neil Brown + +### Diffstat output + ./drivers/md/md.c | 44 ++++++++++++++++++++++++++++++++++++++++---- + ./drivers/md/multipath.c | 7 ++++++- + ./drivers/md/raid1.c | 7 ++++++- + ./drivers/md/raid10.c | 10 ++++++++-- + ./drivers/md/raid5.c | 10 ++++++++-- + 5 files changed, 68 insertions(+), 10 deletions(-) + +diff .prev/drivers/md/md.c ./drivers/md/md.c +--- .prev/drivers/md/md.c 2008-06-05 09:19:56.000000000 +1000 ++++ ./drivers/md/md.c 2008-06-10 10:41:21.000000000 +1000 +@@ -1932,7 +1932,7 @@ slot_store(mdk_rdev_t *rdev, const char + slot = -1; + else if (e==buf || (*e && *e!= '\n')) + return -EINVAL; +- if (rdev->mddev->pers) { ++ if (rdev->mddev->pers && slot == -1) { + /* Setting 'slot' on an active array requires also + * updating the 'rd%d' link, and communicating + * with the personality with ->hot_*_disk. +@@ -1940,8 +1940,6 @@ slot_store(mdk_rdev_t *rdev, const char + * failed/spare devices. This normally happens automatically, + * but not when the metadata is externally managed. + */ +- if (slot != -1) +- return -EBUSY; + if (rdev->raid_disk == -1) + return -EEXIST; + /* personality does all needed checks */ +@@ -1955,6 +1953,44 @@ slot_store(mdk_rdev_t *rdev, const char + sysfs_remove_link(&rdev->mddev->kobj, nm); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); ++ } else if (rdev->mddev->pers) { ++ mdk_rdev_t *rdev2; ++ struct list_head *tmp; ++ /* Activating a spare .. or possibly reactivating ++ * if we every get bitmaps working here. 
++ */ ++ ++ if (rdev->raid_disk != -1) ++ return -EBUSY; ++ ++ if (rdev->mddev->pers->hot_add_disk == NULL) ++ return -EINVAL; ++ ++ rdev_for_each(rdev2, tmp, rdev->mddev) ++ if (rdev2->raid_disk == slot) ++ return -EEXIST; ++ ++ rdev->raid_disk = slot; ++ if (test_bit(In_sync, &rdev->flags)) ++ rdev->saved_raid_disk = slot; ++ else ++ rdev->saved_raid_disk = -1; ++ err = rdev->mddev->pers-> ++ hot_add_disk(rdev->mddev, rdev); ++ if (err != 1) { ++ rdev->raid_disk = -1; ++ if (err == 0) ++ return -EEXIST; ++ return err; ++ } ++ sprintf(nm, "rd%d", rdev->raid_disk); ++ if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) ++ printk(KERN_WARNING ++ "md: cannot register " ++ "%s for %s\n", ++ nm, mdname(rdev->mddev)); ++ ++ /* don't wakeup anyone, leave that to userspace. */ + } else { + if (slot >= rdev->mddev->raid_disks) + return -ENOSPC; +@@ -4205,7 +4241,7 @@ static int add_new_disk(mddev_t * mddev, + super_types[mddev->major_version]. + validate_super(mddev, rdev); + err = mddev->pers->hot_add_disk(mddev, rdev); +- if (err) ++ if (err < 0) + unbind_rdev_from_array(rdev); + } + if (err) + +diff .prev/drivers/md/multipath.c ./drivers/md/multipath.c +--- .prev/drivers/md/multipath.c 2008-05-30 14:49:31.000000000 +1000 ++++ ./drivers/md/multipath.c 2008-06-10 10:35:03.000000000 +1000 +@@ -284,10 +284,15 @@ static int multipath_add_disk(mddev_t *m + int found = 0; + int path; + struct multipath_info *p; ++ int first = 0; ++ int last = mddev->raid_disks - 1; ++ ++ if (rdev->raid_disk >= 0) ++ first = last = rdev->raid_disk; + + print_multipath_conf(conf); + +- for (path=0; pathraid_disks; path++) ++ for (path = first; path <= last; path++) + if ((p=conf->multipaths+path)->rdev == NULL) { + q = rdev->bdev->bd_disk->queue; + blk_queue_stack_limits(mddev->queue, q); + +diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c +--- .prev/drivers/md/raid10.c 2008-05-30 14:49:31.000000000 +1000 ++++ ./drivers/md/raid10.c 2008-06-10 10:28:53.000000000 +1000 +@@ -1116,6 
+1116,8 @@ static int raid10_add_disk(mddev_t *mdde + int found = 0; + int mirror; + mirror_info_t *p; ++ int first = 0; ++ int last = mddev->raid_disks - 1; + + if (mddev->recovery_cp < MaxSector) + /* only hot-add to in-sync arrays, as recovery is +@@ -1125,12 +1127,16 @@ static int raid10_add_disk(mddev_t *mdde + if (!enough(conf)) + return 0; + ++ if (rdev->raid_disk) ++ first = last = rdev->raid_disk; ++ + if (rdev->saved_raid_disk >= 0 && ++ rdev->saved_raid_disk >= first && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + mirror = rdev->saved_raid_disk; + else +- mirror = 0; +- for ( ; mirror < mddev->raid_disks; mirror++) ++ mirror = first; ++ for ( ; mirror <= last ; mirror++) + if ( !(p=conf->mirrors+mirror)->rdev) { + + blk_queue_stack_limits(mddev->queue, + +diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c +--- .prev/drivers/md/raid1.c 2008-05-30 14:49:31.000000000 +1000 ++++ ./drivers/md/raid1.c 2008-06-10 10:41:00.000000000 +1000 +@@ -1103,8 +1103,13 @@ static int raid1_add_disk(mddev_t *mddev + int found = 0; + int mirror = 0; + mirror_info_t *p; ++ int first = 0; ++ int last = mddev->raid_disks - 1; + +- for (mirror=0; mirror < mddev->raid_disks; mirror++) ++ if (rdev->raid_disk >= 0) ++ first = last = rdev->raid_disk; ++ ++ for (mirror = first; mirror <= last; mirror++) + if ( !(p=conf->mirrors+mirror)->rdev) { + + blk_queue_stack_limits(mddev->queue, + +diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c +--- .prev/drivers/md/raid5.c 2008-05-30 14:49:35.000000000 +1000 ++++ ./drivers/md/raid5.c 2008-06-10 10:27:51.000000000 +1000 +@@ -4399,21 +4399,27 @@ static int raid5_add_disk(mddev_t *mddev + int found = 0; + int disk; + struct disk_info *p; ++ int first = 0; ++ int last = conf->raid_disks - 1; + + if (mddev->degraded > conf->max_degraded) + /* no point adding a device */ + return 0; + ++ if (rdev->raid_disk >= 0) ++ first = last = rdev->raid_disk; ++ + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. 
+ */ + if (rdev->saved_raid_disk >= 0 && ++ rdev->saved_raid_disk >= first && + conf->disks[rdev->saved_raid_disk].rdev == NULL) + disk = rdev->saved_raid_disk; + else +- disk = 0; +- for ( ; disk < conf->raid_disks; disk++) ++ disk = first; ++ for ( ; disk <= last ; disk++) + if ((p=conf->disks + disk)->rdev == NULL) { + clear_bit(In_sync, &rdev->flags); + rdev->raid_disk = disk; diff -Nru mdadm-2.6.7.1/kernel-patch-2.6.27 mdadm-3.1.4/kernel-patch-2.6.27 --- mdadm-2.6.7.1/kernel-patch-2.6.27 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/kernel-patch-2.6.27 2010-03-22 08:08:42.000000000 +0200 @@ -0,0 +1,36 @@ +touch_mnt_namespace when the mount flags change + +From: Dan Williams + +Daemons that need to be launched while the rootfs is read-only can now +poll /proc/mounts to be notified when their O_RDWR requests may no +longer end in EROFS. + +Cc: Kay Sievers +Cc: Neil Brown +Signed-off-by: Dan Williams +--- + + fs/namespace.c | 7 ++++++- + 1 files changed, 6 insertions(+), 1 deletions(-) + + +diff --git a/fs/namespace.c b/fs/namespace.c +index 6e283c9..1bd5ba2 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -1553,8 +1553,13 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, + if (!err) + nd->path.mnt->mnt_flags = mnt_flags; + up_write(&sb->s_umount); +- if (!err) ++ if (!err) { + security_sb_post_remount(nd->path.mnt, flags, data); ++ ++ spin_lock(&vfsmount_lock); ++ touch_mnt_namespace(nd->path.mnt->mnt_ns); ++ spin_unlock(&vfsmount_lock); ++ } + return err; + } + diff -Nru mdadm-2.6.7.1/Kill.c mdadm-3.1.4/Kill.c --- mdadm-2.6.7.1/Kill.c 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/Kill.c 2010-08-31 10:18:39.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,12 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: * * Added by Dale Stephenson * steph@snapserver.com @@ -34,39 +29,42 @@ #include "md_u.h" #include "md_p.h" -int Kill(char *dev, int force, int quiet) +int Kill(char *dev, struct supertype *st, int force, int quiet, int noexcl) { /* * Nothing fancy about Kill. It just zeroes out a superblock * Definitely not safe. + * Returns: + * 0 - a zero superblock was successfully written out + * 1 - failed to write the zero superblock + * 2 - failed to open the device or find a superblock. */ int fd, rv = 0; - struct supertype *st; - fd = open(dev, O_RDWR|O_EXCL); + if (force) + noexcl = 1; + fd = open(dev, O_RDWR|(noexcl ? 
0 : O_EXCL)); if (fd < 0) { if (!quiet) fprintf(stderr, Name ": Couldn't open %s for write - not zeroing\n", dev); - close(fd); - return 1; + return 2; } - st = guess_super(fd); + if (st == NULL) + st = guess_super(fd); if (st == NULL) { if (!quiet) fprintf(stderr, Name ": Unrecognised md component device - %s\n", dev); close(fd); - return 1; + return 2; } rv = st->ss->load_super(st, fd, dev); if (force && rv >= 2) rv = 0; /* ignore bad data in superblock */ if (rv== 0 || (force && rv >= 2)) { - mdu_array_info_t info; - info.major_version = -1; /* zero superblock */ st->ss->free_super(st); - st->ss->init_super(st, &info, 0, "", NULL, NULL); + st->ss->init_super(st, NULL, 0, "", NULL, NULL); if (st->ss->store_super(st, fd)) { if (!quiet) fprintf(stderr, Name ": Could not zero superblock on %s\n", @@ -81,3 +79,81 @@ close(fd); return rv; } + +int Kill_subarray(char *dev, char *subarray, int quiet) +{ + /* Delete a subarray out of a container, the subarry must be + * inactive. The subarray string must be a subarray index + * number. 
+ * + * 0 = successfully deleted subarray from all container members + * 1 = failed to sync metadata to one or more devices + * 2 = failed to find the container, subarray, or other resource + * issue + */ + struct supertype supertype, *st = &supertype; + int fd, rv = 2; + + memset(st, 0, sizeof(*st)); + + if (snprintf(st->subarray, sizeof(st->subarray), "%s", subarray) >= + (int)sizeof(st->subarray)) { + if (!quiet) + fprintf(stderr, + Name ": Input overflow for subarray '%s' > %zu bytes\n", + subarray, sizeof(st->subarray) - 1); + return 2; + } + + fd = open_subarray(dev, st, quiet); + if (fd < 0) + return 2; + + if (!st->ss->kill_subarray) { + if (!quiet) + fprintf(stderr, + Name ": Operation not supported for %s metadata\n", + st->ss->name); + goto free_super; + } + + if (is_subarray_active(subarray, st->devname)) { + if (!quiet) + fprintf(stderr, + Name ": Subarray-%s still active, aborting\n", + subarray); + goto free_super; + } + + if (mdmon_running(st->devnum)) + st->update_tail = &st->updates; + + /* ok we've found our victim, drop the axe */ + rv = st->ss->kill_subarray(st); + if (rv) { + if (!quiet) + fprintf(stderr, + Name ": Failed to delete subarray-%s from %s\n", + subarray, dev); + goto free_super; + } + + /* FIXME these routines do not report success/failure */ + if (st->update_tail) + flush_metadata_updates(st); + else + st->ss->sync_metadata(st); + + if (!quiet) + fprintf(stderr, + Name ": Deleted subarray-%s from %s, UUIDs may have changed\n", + subarray, dev); + + rv = 0; + + free_super: + st->ss->free_super(st); + close(fd); + + return rv; +} diff -Nru mdadm-2.6.7.1/makedist mdadm-3.1.4/makedist --- mdadm-2.6.7.1/makedist 2007-02-22 06:04:00.000000000 +0200 +++ mdadm-3.1.4/makedist 2010-08-26 05:24:15.000000000 +0300 @@ -14,9 +14,14 @@ fi set `grep '^char Version' ReadMe.c ` version=`echo $7 | sed 's/v//'` -grep "^.TH MDADM 8 .. v$version" mdadm.8 > /dev/null 2>&1 || +grep "^.TH MDADM 8 .. 
v$version" mdadm.8.in > /dev/null 2>&1 || { - echo mdadm.8 does not mention verion $version. + echo mdadm.8.in does not mention version $version. + exit 1 + } +grep "^.TH MDMON 8 .. v$version" mdmon.8 > /dev/null 2>&1 || + { + echo mdmon.8 does not mention version $version. exit 1 } rpmv=`echo $version | tr - _` diff -Nru mdadm-2.6.7.1/Makefile mdadm-3.1.4/Makefile --- mdadm-2.6.7.1/Makefile 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/Makefile 2010-08-31 10:18:39.000000000 +0300 @@ -31,7 +31,10 @@ # e.g. make CXFLAGS=-O to optimise TCC = tcc UCLIBC_GCC = $(shell for nm in i386-uclibc-linux-gcc i386-uclibc-gcc; do which $$nm > /dev/null && { echo $$nm ; exit; } ; done; echo false No uclibc found ) -DIET_GCC = diet gcc +#DIET_GCC = diet gcc +# sorry, but diet-libc doesn't know about posix_memalign, +# so we cannot use it any more. +DIET_GCC = gcc -DHAVE_STDINT_H KLIBC=/home/src/klibc/klibc-0.77 @@ -39,20 +42,46 @@ CC = $(CROSS_COMPILE)gcc CXFLAGS = -ggdb -CWFLAGS = -Wall -Werror -Wstrict-prototypes +CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter +ifdef WARN_UNUSED +CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O +endif ifdef DEBIAN -CPPFLAGS= -DDEBIAN +CPPFLAGS := -DDEBIAN +else +CPPFLAGS := +endif +ifdef DEFAULT_OLD_METADATA + CPPFLAG += -DDEFAULT_OLD_METADATA + DEFAULT_METADATA=0.90 else -CPPFLAGS= + DEFAULT_METADATA=1.2 endif SYSCONFDIR = /etc -CONFFILE = $(SYSCONFDIR)/mdadm/mdadm.conf -CONFFILE2 = $(SYSCONFDIR)/mdadm.conf +CONFFILE = $(SYSCONFDIR)/mdadm.conf +CONFFILE2 = $(SYSCONFDIR)/mdadm/mdadm.conf MAILCMD =/usr/sbin/sendmail -t CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\" -CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) +# Both MAP_DIR and MDMON_DIR should be somewhere that persists across the +# pivotroot from early boot to late boot. +# /dev is an odd place to put this, but it is the only directory that +# meets the requirements. 
+MAP_DIR=/dev/.mdadm +MAP_FILE = map +MDMON_DIR = /dev/.mdadm +DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\" +DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\" +CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) + +# The glibc TLS ABI requires applications that call clone(2) to set up +# TLS data structures, use pthreads until mdmon implements this support +USE_PTHREADS = 1 +ifdef USE_PTHREADS +CFLAGS += -DUSE_PTHREADS +MON_LDFLAGS += -pthread +endif # If you want a static binary, you might uncomment these # LDFLAGS = -static @@ -69,31 +98,48 @@ OBJS = mdadm.o config.o mdstat.o ReadMe.o util.o Manage.o Assemble.o Build.o \ Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \ Incremental.o \ - mdopen.o super0.o super1.o bitmap.o restripe.o sysfs.o sha1.o \ - mapfile.o + mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \ + restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o \ + platform-intel.o probe_roms.o + SRCS = mdadm.c config.c mdstat.c ReadMe.c util.c Manage.c Assemble.c Build.c \ Create.c Detail.c Examine.c Grow.c Monitor.c dlink.c Kill.c Query.c \ Incremental.c \ - mdopen.c super0.c super1.c bitmap.c restripe.c sysfs.c sha1.c \ - mapfile.c + mdopen.c super0.c super1.c super-ddf.c super-intel.c bitmap.c \ + restripe.c sysfs.c sha1.c mapfile.c crc32.c sg_io.c msg.c \ + platform-intel.c probe_roms.c + +MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o \ + Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \ + super-ddf.o sha1.o crc32.o msg.o bitmap.o \ + platform-intel.o probe_roms.o + +MON_SRCS = mdmon.c monitor.c managemon.c util.c mdstat.c sysfs.c config.c \ + Kill.c sg_io.c dlink.c ReadMe.c super0.c super1.c super-intel.c \ + super-ddf.c sha1.c crc32.c msg.c bitmap.c \ + platform-intel.c probe_roms.c STATICSRC = pwgr.c STATICOBJS = pwgr.o ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c dlink.c util.c \ - super0.c super1.c sha1.c 
-ASSEMBLE_AUTO_SRCS := mdopen.c mdstat.c sysfs.c + super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c mdstat.c \ + platform-intel.c probe_roms.c sysfs.c +ASSEMBLE_AUTO_SRCS := mdopen.c ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE ifdef MDASSEMBLE_AUTO ASSEMBLE_SRCS += $(ASSEMBLE_AUTO_SRCS) ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO endif -all : mdadm mdadm.man md.man mdadm.conf.man +all : mdadm mdmon mdadm.man md.man mdadm.conf.man mdmon.man everything: all mdadm.static swap_super test_stripe \ mdassemble mdassemble.auto mdassemble.static mdassemble.man \ mdadm.Os mdadm.O2 +everything-test: all mdadm.static swap_super test_stripe \ + mdassemble.auto mdassemble.static mdassemble.man \ + mdadm.Os mdadm.O2 # mdadm.uclibc and mdassemble.uclibc don't work on x86-64 # mdadm.tcc doesn't work.. @@ -106,18 +152,23 @@ mdadm.tcc : $(SRCS) mdadm.h $(TCC) -o mdadm.tcc $(SRCS) -dadm.uclibc : $(SRCS) mdadm.h - $(UCLIBC_GCC) -DUCLIBC -DHAVE_STDINT_H -o mdadm.uclibc $(SRCS) $(STATICSRC) - mdadm.klibc : $(SRCS) mdadm.h rm -f $(OBJS) - gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS) + $(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS) mdadm.Os : $(SRCS) mdadm.h - gcc -o mdadm.Os $(CFLAGS) -DHAVE_STDINT_H -Os $(SRCS) + $(CC) -o mdadm.Os $(CFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS) + +mdadm.O2 : $(SRCS) mdadm.h mdmon.O2 + $(CC) -o mdadm.O2 $(CFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS) -mdadm.O2 : $(SRCS) mdadm.h - gcc -o mdadm.O2 $(CFLAGS) -DHAVE_STDINT_H -O2 $(SRCS) +mdmon.O2 : $(MON_SRCS) mdadm.h mdmon.h + $(CC) -o mdmon.O2 $(CFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS) + +# use '-z now' to guarantee no dynamic linker interactions with the monitor thread +mdmon : $(MON_OBJS) 
+ $(CC) $(LDFLAGS) $(MON_LDFLAGS) -z now -o mdmon $(MON_OBJS) $(LDLIBS) +msg.o: msg.c msg.h test_stripe : restripe.c mdadm.h $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c @@ -144,9 +195,15 @@ rm -f $(OBJS) $(KLIBC_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS) +mdadm.8 : mdadm.8.in + sed -e 's/{DEFAULT_METADATA}/$(DEFAULT_METADATA)/g' mdadm.8.in > mdadm.8 + mdadm.man : mdadm.8 nroff -man mdadm.8 > mdadm.man +mdmon.man : mdmon.8 + nroff -man mdmon.8 > mdmon.man + md.man : md.4 nroff -man md.4 > md.man @@ -156,13 +213,15 @@ mdassemble.man : mdassemble.8 nroff -man mdassemble.8 > mdassemble.man -$(OBJS) : mdadm.h bitmap.h +$(OBJS) : mdadm.h mdmon.h bitmap.h +$(MON_OBJS) : mdadm.h mdmon.h bitmap.h sha1.o : sha1.c sha1.h md5.h $(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c -install : mdadm install-man +install : mdadm mdmon install-man install-udev $(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm + $(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon install-static : mdadm.static install-man $(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm @@ -176,29 +235,41 @@ install-klibc : mdadm.klibc install-man $(INSTALL) -D $(STRIP) -m 755 mdadm.klibc $(DESTDIR)$(BINDIR)/mdadm -install-man: mdadm.8 md.4 mdadm.conf.5 +install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8 $(INSTALL) -D -m 644 mdadm.8 $(DESTDIR)$(MAN8DIR)/mdadm.8 + $(INSTALL) -D -m 644 mdmon.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 $(INSTALL) -D -m 644 md.4 $(DESTDIR)$(MAN4DIR)/md.4 $(INSTALL) -D -m 644 mdadm.conf.5 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 +install-udev: udev-md-raid.rules + $(INSTALL) -D -m 644 udev-md-raid.rules $(DESTDIR)/lib/udev/rules.d/64-md-raid.rules + uninstall: - rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 md.4 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm + rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm -test: 
mdadm test_stripe swap_super +test: mdadm mdmon test_stripe swap_super @echo "Please run 'sh ./test' as root" clean : - rm -f mdadm $(OBJS) $(STATICOBJS) core *.man mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \ - mdadm.Os mdadm.O2 \ + rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \ + mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt .merge_file_* \ + mdadm.Os mdadm.O2 mdmon.O2 \ mdassemble mdassemble.static mdassemble.auto mdassemble.uclibc \ mdassemble.klibc swap_super \ - init.cpio.gz mdadm.uclibc.static test_stripe + init.cpio.gz mdadm.uclibc.static test_stripe mdmon \ + mdadm.8 dist : clean ./makedist -testdist : everything clean +testdist : everything-test clean ./makedist test TAGS : etags *.h *.c + +DISTRO_MAKEFILE := $(wildcard distropkg/Makefile) +ifdef DISTRO_MAKEFILE +include $(DISTRO_MAKEFILE) +endif + diff -Nru mdadm-2.6.7.1/Manage.c mdadm-3.1.4/Manage.c --- mdadm-2.6.7.1/Manage.c 2008-10-15 08:29:37.000000000 +0300 +++ mdadm-3.1.4/Manage.c 2010-08-31 10:21:11.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,17 +19,13 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" #include "md_u.h" #include "md_p.h" +#include #define REGISTER_DEV _IO (MD_MAJOR, 1) #define START_MD _IO (MD_MAJOR, 2) @@ -45,11 +41,57 @@ * */ mdu_array_info_t array; +#ifndef MDASSEMBLE + struct mdinfo *mdi; +#endif if (md_get_version(fd) < 9000) { fprintf(stderr, Name ": need md driver version 0.90.0 or later\n"); return 1; } +#ifndef MDASSEMBLE + /* If this is an externally-manage array, we need to modify the + * metadata_version so that mdmon doesn't undo our change. + */ + mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION); + if (mdi && + mdi->array.major_version == -1 && + mdi->array.level > 0 && + is_subarray(mdi->text_version)) { + char vers[64]; + strcpy(vers, "external:"); + strcat(vers, mdi->text_version); + if (readonly > 0) { + int rv; + /* We set readonly ourselves. 
*/ + vers[9] = '-'; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + + close(fd); + rv = sysfs_set_str(mdi, NULL, "array_state", "readonly"); + + if (rv < 0) { + fprintf(stderr, Name ": failed to set readonly for %s: %s\n", + devname, strerror(errno)); + + vers[9] = mdi->text_version[0]; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + return 1; + } + } else { + char *cp; + /* We cannot set read/write - must signal mdmon */ + vers[9] = '/'; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + + cp = strchr(vers+10, '/'); + if (*cp) + *cp = 0; + ping_monitor(vers+10); + } + return 0; + } +#endif if (ioctl(fd, GET_ARRAY_INFO, &array)) { fprintf(stderr, Name ": %s does not appear to be active.\n", devname); @@ -74,17 +116,69 @@ #ifndef MDASSEMBLE +static void remove_devices(int devnum, char *path) +{ + /* + * Remove names at 'path' - possibly with + * partition suffixes - which link to the 'standard' + * name for devnum. These were probably created + * by mdadm when the array was assembled. + */ + char base[40]; + char *path2; + char link[1024]; + int n; + int part; + char *be; + char *pe; + + if (!path) + return; + + if (devnum >= 0) + sprintf(base, "/dev/md%d", devnum); + else + sprintf(base, "/dev/md_d%d", -1-devnum); + be = base + strlen(base); + + path2 = malloc(strlen(path)+20); + strcpy(path2, path); + pe = path2 + strlen(path2); + + for (part = 0; part < 16; part++) { + if (part) { + sprintf(be, "p%d", part); + + if (isdigit(pe[-1])) + sprintf(pe, "p%d", part); + else + sprintf(pe, "%d", part); + } + n = readlink(path2, link, sizeof(link)); + if (n && (int)strlen(base) == n && + strncmp(link, base, n) == 0) + unlink(path2); + } + free(path2); +} + + int Manage_runstop(char *devname, int fd, int runstop, int quiet) { /* Run or stop the array. array must already be configured * required >= 0.90.0 + * Only print failure messages if quiet == 0; + * quiet > 0 means really be quiet + * quiet < 0 means we will try again if it fails. 
*/ mdu_param_t param; /* unused */ if (runstop == -1 && md_get_version(fd) < 9000) { if (ioctl(fd, STOP_MD, 0)) { - if (!quiet) fprintf(stderr, Name ": stopping device %s failed: %s\n", - devname, strerror(errno)); + if (quiet == 0) fprintf(stderr, + Name ": stopping device %s " + "failed: %s\n", + devname, strerror(errno)); return 1; } } @@ -111,24 +205,100 @@ } else if (runstop < 0){ struct map_ent *map = NULL; struct stat stb; - if (ioctl(fd, STOP_ARRAY, NULL)) { - if (quiet==0) - fprintf(stderr, Name ": fail to stop array %s: %s\n", + struct mdinfo *mdi; + int devnum; + /* If this is an mdmon managed array, just write 'inactive' + * to the array state and let mdmon clear up. + */ + devnum = fd2devnum(fd); + mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION); + if (mdi && + mdi->array.level > 0 && + is_subarray(mdi->text_version)) { + /* This is mdmon managed. */ + close(fd); + if (sysfs_set_str(mdi, NULL, + "array_state", "inactive") < 0) { + if (quiet == 0) + fprintf(stderr, Name + ": failed to stop array %s: %s\n", + devname, strerror(errno)); + return 1; + } + + /* Give monitor a chance to act */ + ping_monitor(mdi->text_version); + + fd = open(devname, O_RDONLY); + } else if (mdi && + mdi->array.major_version == -1 && + mdi->array.minor_version == -2 && + !is_subarray(mdi->text_version)) { + struct mdstat_ent *mds, *m; + /* container, possibly mdmon-managed. 
+ * Make sure mdmon isn't opening it, which + * would interfere with the 'stop' + */ + ping_monitor(mdi->sys_name); + + /* now check that there are no existing arrays + * which are members of this array + */ + mds = mdstat_read(0, 0); + for (m=mds; m; m=m->next) + if (m->metadata_version && + strncmp(m->metadata_version, "external:", 9)==0 && + is_subarray(m->metadata_version+9) && + devname2devnum(m->metadata_version+10) == devnum) { + if (!quiet) + fprintf(stderr, Name + ": Cannot stop container %s: " + "member %s still active\n", + devname, m->dev); + free_mdstat(mds); + if (mdi) + sysfs_free(mdi); + return 1; + } + } + + if (fd >= 0 && ioctl(fd, STOP_ARRAY, NULL)) { + if (quiet == 0) { + fprintf(stderr, Name + ": failed to stop array %s: %s\n", devname, strerror(errno)); + if (errno == EBUSY) + fprintf(stderr, "Perhaps a running " + "process, mounted filesystem " + "or active volume group?\n"); + } + if (mdi) + sysfs_free(mdi); return 1; } + /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array + * was stopped, so We'll do it here just to be sure. Drop any + * partitions as well... + */ + if (fd >= 0) + ioctl(fd, BLKRRPART, 0); + if (mdi) + sysfs_uevent(mdi, "change"); + + + if (devnum != NoMdDev && + (stat("/dev/.udev", &stb) != 0 || + check_env("MDADM_NO_UDEV"))) { + struct map_ent *mp = map_by_devnum(&map, devnum); + remove_devices(devnum, mp ? 
mp->path : NULL); + } + + if (quiet <= 0) fprintf(stderr, Name ": stopped %s\n", devname); - if (fstat(fd, &stb) == 0) { - int devnum; - if (major(stb.st_rdev) == MD_MAJOR) - devnum = minor(stb.st_rdev); - else - devnum = -1-(minor(stb.st_rdev)>>6); - map_delete(&map, devnum); - map_write(map); - map_free(map); - } + map_lock(&map); + map_remove(&map, devnum); + map_unlock(&map); } return 0; } @@ -153,26 +323,8 @@ return 0; } -int Manage_reconfig(char *devname, int fd, int layout) -{ - mdu_array_info_t info; - if (ioctl(fd, GET_ARRAY_INFO, &info) != 0) { - fprintf(stderr, Name ": Cannot get array information for %s: %s\n", - devname, strerror(errno)); - return 1; - } - info.layout = layout; - printf("layout set to %d\n", info.layout); - if (ioctl(fd, SET_ARRAY_INFO, &info) != 0) { - fprintf(stderr, Name ": Cannot set layout for %s: %s\n", - devname, strerror(errno)); - return 1; - } - return 0; -} - int Manage_subdevs(char *devname, int fd, - mddev_dev_t devlist, int verbose) + mddev_dev_t devlist, int verbose, int test) { /* do something to each dev. * devmode can be @@ -185,17 +337,23 @@ * 'f' - set the device faulty SET_DISK_FAULTY * device can be 'detached' in which case any device that * is inaccessible will be marked faulty. + * For 'f' and 'r', the device can also be a kernel-internal + * name such as 'sdb'. 
*/ + mddev_dev_t add_devlist = NULL; mdu_array_info_t array; mdu_disk_info_t disc; unsigned long long array_size; mddev_dev_t dv, next = NULL; struct stat stb; int j, jnext = 0; - int tfd; + int tfd = -1; struct supertype *st, *tst; int duuid[4]; int ouuid[4]; + int lfd = -1; + int sysfd = -1; + int count = 0; /* number of actions taken */ if (ioctl(fd, GET_ARRAY_INFO, &array)) { fprintf(stderr, Name ": cannot get array info for %s\n", @@ -218,10 +376,13 @@ return 1; } + stb.st_rdev = 0; for (dv = devlist, j=0 ; dv; dv = next, j = jnext) { unsigned long long ldsize; char dvname[20]; char *dnprintable = dv->devname; + char *add_dev = dv->devname; + int err; next = dv->next; jnext = 0; @@ -235,6 +396,7 @@ return 1; } for (; j < array.raid_disks + array.nr_disks ; j++) { + unsigned dev; disc.number = j; if (ioctl(fd, GET_DISK_INFO, &disc)) continue; @@ -242,9 +404,15 @@ continue; if ((disc.state & 1) == 0) /* faulty */ continue; - stb.st_rdev = makedev(disc.major, disc.minor); + dev = makedev(disc.major, disc.minor); + if (stb.st_rdev == dev) + /* already did that one */ + continue; + stb.st_rdev = dev; next = dv; - jnext = j+1; + /* same slot again next time - things might + * have reshuffled */ + jnext = j; sprintf(dvname,"%d:%d", disc.major, disc.minor); dnprintable = dvname; break; @@ -260,6 +428,7 @@ } for (; j < array.raid_disks + array.nr_disks; j++) { int sfd; + unsigned dev; disc.number = j; if (ioctl(fd, GET_DISK_INFO, &disc)) continue; @@ -276,21 +445,93 @@ continue; if (errno != ENXIO) continue; - stb.st_rdev = makedev(disc.major, disc.minor); + dev = makedev(disc.major, disc.minor); + if (stb.st_rdev == dev) + /* already did that one */ + continue; + stb.st_rdev = dev; next = dv; - jnext = j+1; + /* same slot again next time - things might + * have reshuffled */ + jnext = j; dnprintable = dvname; break; } if (jnext == 0) continue; + } else if (strcmp(dv->devname, "missing") == 0) { + if (dv->disposition != 'a' || dv->re_add == 0) { + fprintf(stderr, Name 
": 'missing' only meaningful " + "with --re-add\n"); + return 1; + } + if (add_devlist == NULL) + add_devlist = conf_get_devs(); + if (add_devlist == NULL) { + fprintf(stderr, Name ": no devices to scan for missing members."); + continue; + } + add_dev = add_devlist->devname; + add_devlist = add_devlist->next; + if (add_devlist != NULL) + next = dv; + if (stat(add_dev, &stb) < 0) + continue; + } else if (strchr(dv->devname, '/') == NULL && + strchr(dv->devname, ':') == NULL && + strlen(dv->devname) < 50) { + /* Assume this is a kernel-internal name like 'sda1' */ + int found = 0; + char dname[55]; + if (dv->disposition != 'r' && dv->disposition != 'f') { + fprintf(stderr, Name ": %s only meaningful " + "with -r or -f, not -%c\n", + dv->devname, dv->disposition); + return 1; + } + + sprintf(dname, "dev-%s", dv->devname); + sysfd = sysfs_open(fd2devnum(fd), dname, "block/dev"); + if (sysfd >= 0) { + char dn[20]; + int mj,mn; + if (sysfs_fd_get_str(sysfd, dn, 20) > 0 && + sscanf(dn, "%d:%d", &mj,&mn) == 2) { + stb.st_rdev = makedev(mj,mn); + found = 1; + } + close(sysfd); + sysfd = -1; + } + if (!found) { + sysfd = sysfs_open(fd2devnum(fd), dname, "state"); + if (sysfd < 0) { + fprintf(stderr, Name ": %s does not appear " + "to be a component of %s\n", + dv->devname, devname); + return 1; + } + } } else { j = 0; - if (stat(dv->devname, &stb)) { - fprintf(stderr, Name ": cannot find %s: %s\n", - dv->devname, strerror(errno)); - return 1; + tfd = dev_open(dv->devname, O_RDONLY); + if (tfd < 0 && dv->disposition == 'r' && + lstat(dv->devname, &stb) == 0) + /* Be happy, the lstat worked, that is + * enough for --remove + */ + ; + else { + if (tfd < 0 || fstat(tfd, &stb) != 0) { + fprintf(stderr, Name ": cannot find %s: %s\n", + dv->devname, strerror(errno)); + if (tfd >= 0) + close(tfd); + return 1; + } + close(tfd); + tfd = -1; } if ((stb.st_mode & S_IFMT) != S_IFBLK) { fprintf(stderr, Name ": %s is not a " @@ -306,48 +547,68 @@ return 1; case 'a': /* add the device */ - 
+ if (tst->subarray[0]) { + fprintf(stderr, Name ": Cannot add disks to a" + " \'member\' array, perform this" + " operation on the parent container\n"); + return 1; + } /* Make sure it isn't in use (in 2.6 or later) */ - tfd = open(dv->devname, O_RDONLY|O_EXCL); + tfd = dev_open(add_dev, O_RDONLY|O_EXCL|O_DIRECT); + if (tfd < 0 && add_dev != dv->devname) + continue; if (tfd < 0) { fprintf(stderr, Name ": Cannot open %s: %s\n", dv->devname, strerror(errno)); return 1; } - remove_partitions(tfd); st = dup_super(tst); if (array.not_persistent==0) st->ss->load_super(st, tfd, NULL); - if (!get_dev_size(tfd, dv->devname, &ldsize)) { + if (add_dev == dv->devname) { + if (!get_dev_size(tfd, dv->devname, &ldsize)) { + close(tfd); + return 1; + } + } else if (!get_dev_size(tfd, NULL, &ldsize)) { close(tfd); - return 1; + tfd = -1; + continue; } - close(tfd); - if (array.major_version == 0 && + if (!tst->ss->external && + array.major_version == 0 && md_get_version(fd)%100 < 2) { + close(tfd); + tfd = -1; if (ioctl(fd, HOT_ADD_DISK, (unsigned long)stb.st_rdev)==0) { if (verbose >= 0) fprintf(stderr, Name ": hot added %s\n", - dv->devname); + add_dev); continue; } fprintf(stderr, Name ": hot add failed for %s: %s\n", - dv->devname, strerror(errno)); + add_dev, strerror(errno)); return 1; } - if (array.not_persistent == 0) { + if (array.not_persistent == 0 || tst->ss->external) { /* need to find a sample superblock to copy, and - * a spare slot to use + * a spare slot to use. + * For 'external' array (well, container based), + * We can just load the metadata for the array. 
*/ - for (j = 0; j < tst->max_devs; j++) { + if (tst->sb) + /* already loaded */; + else if (tst->ss->external) { + tst->ss->load_super(tst, fd, NULL); + } else for (j = 0; j < tst->max_devs; j++) { char *dev; int dfd; disc.number = j; @@ -369,7 +630,9 @@ close(dfd); break; } + /* FIXME this is a bad test to be using */ if (!tst->sb) { + close(tfd); fprintf(stderr, Name ": cannot find valid superblock in this array - HELP\n"); return 1; } @@ -377,6 +640,10 @@ /* Make sure device is large enough */ if (tst->ss->avail_size(tst, ldsize/512) < array_size) { + close(tfd); + tfd = -1; + if (add_dev != dv->devname) + continue; fprintf(stderr, Name ": %s not large enough to join array\n", dv->devname); return 1; @@ -408,16 +675,51 @@ disc.number = mdi.disk.number; disc.raid_disk = mdi.disk.raid_disk; disc.state = mdi.disk.state; - if (dv->writemostly) + if (dv->writemostly == 1) disc.state |= 1 << MD_DISK_WRITEMOSTLY; - if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) { + if (dv->writemostly == 2) + disc.state &= ~(1 << MD_DISK_WRITEMOSTLY); + remove_partitions(tfd); + close(tfd); + tfd = -1; + /* don't even try if disk is marked as faulty */ + errno = 0; + if ((disc.state & 1) == 0 && + ioctl(fd, ADD_NEW_DISK, &disc) == 0) { if (verbose >= 0) - fprintf(stderr, Name ": re-added %s\n", dv->devname); + fprintf(stderr, Name ": re-added %s\n", add_dev); + count++; continue; } + if (errno == ENOMEM || errno == EROFS) { + fprintf(stderr, Name ": add new device failed for %s: %s\n", + add_dev, strerror(errno)); + if (add_dev != dv->devname) + continue; + return 1; + } /* fall back on normal-add */ } } + if (add_dev != dv->devname) { + if (verbose > 0) + fprintf(stderr, Name + ": --re-add for %s to %s is not possible\n", + add_dev, devname); + if (tfd >= 0) { + close(tfd); + tfd = -1; + } + continue; + } + if (dv->re_add) { + if (tfd >= 0) + close(tfd); + fprintf(stderr, Name + ": --re-add for %s to %s is not possible\n", + dv->devname, devname); + return 1; + } } else { /* 
non-persistent. Must ensure that new drive * is at least array.size big. @@ -425,9 +727,17 @@ if (ldsize/512 < array_size) { fprintf(stderr, Name ": %s not large enough to join array\n", dv->devname); + if (tfd >= 0) + close(tfd); return 1; } } + /* committed to really trying this device now*/ + if (tfd >= 0) { + remove_partitions(tfd); + close(tfd); + tfd = -1; + } /* in 2.6.17 and earlier, version-1 superblocks won't * use the number we write, but will choose a free number. * we must choose the same free number, which requires @@ -446,12 +756,21 @@ disc.minor = minor(stb.st_rdev); disc.number =j; disc.state = 0; - if (array.not_persistent==0) { - if (dv->writemostly) + if (array.not_persistent==0 || tst->ss->external) { + int dfd; + if (dv->writemostly == 1) disc.state |= 1 << MD_DISK_WRITEMOSTLY; - tst->ss->add_to_super(tst, &disc); - if (tst->ss->write_init_super(tst, &disc, - dv->devname)) + dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); + if (tst->ss->add_to_super(tst, &disc, dfd, + dv->devname)) { + close(dfd); + return 1; + } + /* write_init_super will close 'dfd' */ + if (tst->ss->external) + /* mdmon will write the metadata */ + close(dfd); + else if (tst->ss->write_init_super(tst)) return 1; } else if (dv->re_add) { /* this had better be raid1. 
@@ -481,10 +800,57 @@ disc.state |= (1<writemostly) + if (dv->writemostly == 1) disc.state |= (1 << MD_DISK_WRITEMOSTLY); - if (ioctl(fd,ADD_NEW_DISK, &disc)) { + if (tst->ss->external) { + /* add a disk to an external metadata container + * only if mdmon is around to see it + */ + struct mdinfo new_mdi; + struct mdinfo *sra; + int container_fd; + int devnum = fd2devnum(fd); + + container_fd = open_dev_excl(devnum); + if (container_fd < 0) { + fprintf(stderr, Name ": add failed for %s:" + " could not get exclusive access to container\n", + dv->devname); + return 1; + } + + if (!mdmon_running(devnum)) { + fprintf(stderr, Name ": add failed for %s: mdmon not running\n", + dv->devname); + close(container_fd); + return 1; + } + + sra = sysfs_read(container_fd, -1, 0); + if (!sra) { + fprintf(stderr, Name ": add failed for %s: sysfs_read failed\n", + dv->devname); + close(container_fd); + return 1; + } + sra->array.level = LEVEL_CONTAINER; + /* Need to set data_offset and component_size */ + tst->ss->getinfo_super(tst, &new_mdi); + new_mdi.disk.major = disc.major; + new_mdi.disk.minor = disc.minor; + new_mdi.recovery_start = 0; + if (sysfs_add_disk(sra, &new_mdi, 0) != 0) { + fprintf(stderr, Name ": add new device to external metadata" + " failed for %s\n", dv->devname); + close(container_fd); + return 1; + } + ping_monitor(devnum2devname(devnum)); + sysfs_free(sra); + close(container_fd); + } else if (ioctl(fd, ADD_NEW_DISK, &disc)) { fprintf(stderr, Name ": add new device failed for %s as %d: %s\n", dv->devname, j, strerror(errno)); return 1; @@ -495,33 +861,143 @@ case 'r': /* hot remove */ + if (tst->subarray[0]) { + fprintf(stderr, Name ": Cannot remove disks from a" + " \'member\' array, perform this" + " operation on the parent container\n"); + if (sysfd >= 0) + close(sysfd); + return 1; + } + if (tst->ss->external) { + /* To remove a device from a container, we must + * check that it isn't in use in an array. 
+ * This involves looking in the 'holders' + * directory - there must be just one entry, + * the container. + * To ensure that it doesn't get used as a + * hold spare while we are checking, we + * get an O_EXCL open on the container + */ + int dnum = fd2devnum(fd); + lfd = open_dev_excl(dnum); + if (lfd < 0) { + fprintf(stderr, Name + ": Cannot get exclusive access " + " to container - odd\n"); + if (sysfd >= 0) + close(sysfd); + return 1; + } + /* in the detached case it is not possible to + * check if we are the unique holder, so just + * rely on the 'detached' checks + */ + if (strcmp(dv->devname, "detached") == 0 || + sysfd >= 0 || + sysfs_unique_holder(dnum, stb.st_rdev)) + /* pass */; + else { + fprintf(stderr, Name + ": %s is %s, cannot remove.\n", + dnprintable, + errno == EEXIST ? "still in use": + "not a member"); + close(lfd); + return 1; + } + } /* FIXME check that it is a current member */ - if (ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev)) { + if (sysfd >= 0) { + /* device has been removed and we don't know + * the major:minor number + */ + int n = write(sysfd, "remove", 6); + if (n != 6) + err = -1; + else + err = 0; + close(sysfd); + sysfd = -1; + } else { + err = ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev); + if (err && errno == ENODEV) { + /* Old kernels rejected this if no personality + * registered */ + struct mdinfo *sra = sysfs_read(fd, 0, GET_DEVS); + struct mdinfo *dv = NULL; + if (sra) + dv = sra->devs; + for ( ; dv ; dv=dv->next) + if (dv->disk.major == (int)major(stb.st_rdev) && + dv->disk.minor == (int)minor(stb.st_rdev)) + break; + if (dv) + err = sysfs_set_str(sra, dv, + "state", "remove"); + else + err = -1; + if (sra) + sysfs_free(sra); + } + } + if (err) { fprintf(stderr, Name ": hot remove failed " "for %s: %s\n", dnprintable, strerror(errno)); + if (lfd >= 0) + close(lfd); return 1; } + if (tst->ss->external) { + /* + * Before dropping our exclusive open we make an + * attempt at preventing mdmon from seeing 
an + * 'add' event before reconciling this 'remove' + * event. + */ + char *name = devnum2devname(fd2devnum(fd)); + + if (!name) { + fprintf(stderr, Name ": unable to get container name\n"); + return 1; + } + + ping_manager(name); + free(name); + } + if (lfd >= 0) + close(lfd); + count++; if (verbose >= 0) - fprintf(stderr, Name ": hot removed %s\n", - dnprintable); + fprintf(stderr, Name ": hot removed %s from %s\n", + dnprintable, devname); break; case 'f': /* set faulty */ /* FIXME check current member */ - if (ioctl(fd, SET_DISK_FAULTY, (unsigned long) stb.st_rdev)) { + if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) || + (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY, + (unsigned long) stb.st_rdev))) { fprintf(stderr, Name ": set device faulty failed for %s: %s\n", dnprintable, strerror(errno)); + if (sysfd >= 0) + close(sysfd); return 1; } + if (sysfd >= 0) + close(sysfd); + sysfd = -1; + count++; if (verbose >= 0) fprintf(stderr, Name ": set %s faulty in %s\n", dnprintable, devname); break; } } + if (test && count == 0) + return 2; return 0; - } int autodetect(void) @@ -536,4 +1012,57 @@ } return rv; } + +int Update_subarray(char *dev, char *subarray, char *update, mddev_ident_t ident, int quiet) +{ + struct supertype supertype, *st = &supertype; + int fd, rv = 2; + + memset(st, 0, sizeof(*st)); + if (snprintf(st->subarray, sizeof(st->subarray), "%s", subarray) >= + (signed)sizeof(st->subarray)) { + if (!quiet) + fprintf(stderr, + Name ": Input overflow for subarray '%s' > %zu bytes\n", + subarray, sizeof(st->subarray) - 1); + return 2; + } + + fd = open_subarray(dev, st, quiet); + if (fd < 0) + return 2; + + if (!st->ss->update_subarray) { + if (!quiet) + fprintf(stderr, + Name ": Operation not supported for %s metadata\n", + st->ss->name); + goto free_super; + } + + if (mdmon_running(st->devnum)) + st->update_tail = &st->updates; + + rv = st->ss->update_subarray(st, update, ident); + + if (rv) { + if (!quiet) + fprintf(stderr, Name ": Failed to update %s of 
subarray-%s in %s\n", + update, subarray, dev); + } else if (st->update_tail) + flush_metadata_updates(st); + else + st->ss->sync_metadata(st); + + if (rv == 0 && strcmp(update, "name") == 0 && !quiet) + fprintf(stderr, + Name ": Updated subarray-%s name from %s, UUIDs may have changed\n", + subarray, dev); + + free_super: + st->ss->free_super(st); + close(fd); + + return rv; +} #endif diff -Nru mdadm-2.6.7.1/managemon.c mdadm-3.1.4/managemon.c --- mdadm-2.6.7.1/managemon.c 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/managemon.c 2010-08-26 05:24:15.000000000 +0300 @@ -0,0 +1,713 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * The management thread for monitoring active md arrays. + * This thread does things which might block such as memory + * allocation. + * In particular: + * + * - Find out about new arrays in this container. + * Allocate the data structures and open the files. + * + * For this we watch /proc/mdstat and find new arrays with + * metadata type that confirms sharing. e.g. "md4" + * When we find a new array we slip it into the list of + * arrays and signal 'monitor' by writing to a pipe. 
+ * + * - Respond to reshape requests by allocating new data structures + * and opening new files. + * + * These come as a change to raid_disks. We allocate a new + * version of the data structures and slip it into the list. + * 'monitor' will notice and release the old version. + * Changes to level, chunksize, layout.. do not need re-allocation. + * Reductions in raid_disks don't really either, but we handle + * them the same way for consistency. + * + * - When a device is added to the container, we add it to the metadata + * as a spare. + * + * - Deal with degraded array + * We only do this when first noticing the array is degraded. + * This can be when we first see the array, when sync completes or + * when recovery completes. + * + * Check if number of failed devices suggests recovery is needed, and + * skip if not. + * Ask metadata to allocate a spare device + * Add device as not in_sync and give a role + * Update metadata. + * Open sysfs files and pass to monitor. + * Make sure that monitor Starts recovery.... + * + * - Pass on metadata updates from external programs such as + * mdadm creating a new array. + * + * This is most-messy. + * It might involve adding a new array or changing the status of + * a spare, or any reconfig that the kernel doesn't get involved in. + * + * The required updates are received via a named pipe. There will + * be one named pipe for each container. Each message contains a + * sync marker: 0x5a5aa5a5, A byte count, and the message. This is + * passed to the metadata handler which will interpret and process it. + * For 'DDF' messages are internal data blocks with the leading + * 'magic number' signifying what sort of data it is. + * + */ + +/* + * We select on /proc/mdstat and the named pipe. + * We create new arrays or updated version of arrays and slip + * them into the head of the list, then signal 'monitor' via a pipe write. + * 'monitor' will notice and place the old array on a return list. 
+ * Metadata updates are placed on a queue just like they arrive + * from the named pipe. + * + * When new arrays are found based on correct metadata string, we + * need to identify them with an entry in the metadata. Maybe we require + * the metadata to be mdX/NN when NN is the index into an appropriate table. + * + */ + +/* + * List of tasks: + * - Watch for spares to be added to the container, and write updated + * metadata to them. + * - Watch for new arrays using this container, confirm they match metadata + * and if so, start monitoring them + * - Watch for spares being added to monitored arrays. This shouldn't + * happen, as we should do all the adding. Just remove them. + * - Watch for change in raid-disks, chunk-size, etc. Update metadata and + * start a reshape. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "mdadm.h" +#include "mdmon.h" +#include +#include +#include + +static void close_aa(struct active_array *aa) +{ + struct mdinfo *d; + + for (d = aa->info.devs; d; d = d->next) { + close(d->recovery_fd); + close(d->state_fd); + } + + close(aa->action_fd); + close(aa->info.state_fd); + close(aa->resync_start_fd); +} + +static void free_aa(struct active_array *aa) +{ + /* Note that this doesn't close fds if they are being used + * by a clone. 
->container will be set for a clone + */ + dprintf("%s: devnum: %d\n", __func__, aa->devnum); + if (!aa->container) + close_aa(aa); + while (aa->info.devs) { + struct mdinfo *d = aa->info.devs; + aa->info.devs = d->next; + free(d); + } + free(aa); +} + +static struct active_array *duplicate_aa(struct active_array *aa) +{ + struct active_array *newa = malloc(sizeof(*newa)); + struct mdinfo **dp1, **dp2; + + *newa = *aa; + newa->next = NULL; + newa->replaces = NULL; + newa->info.next = NULL; + + dp2 = &newa->info.devs; + + for (dp1 = &aa->info.devs; *dp1; dp1 = &(*dp1)->next) { + struct mdinfo *d; + if ((*dp1)->state_fd < 0) + continue; + + d = malloc(sizeof(*d)); + *d = **dp1; + *dp2 = d; + dp2 = & d->next; + } + *dp2 = NULL; + + return newa; +} + +static void wakeup_monitor(void) +{ + /* tgkill(getpid(), mon_tid, SIGUSR1); */ + int pid = getpid(); + syscall(SYS_tgkill, pid, mon_tid, SIGUSR1); +} + +static void remove_old(void) +{ + if (discard_this) { + discard_this->next = NULL; + free_aa(discard_this); + if (pending_discard == discard_this) + pending_discard = NULL; + discard_this = NULL; + wakeup_monitor(); + } +} + +static void replace_array(struct supertype *container, + struct active_array *old, + struct active_array *new) +{ + /* To replace an array, we add it to the top of the list + * marked with ->replaces to point to the original. + * 'monitor' will take the original out of the list + * and put it on 'discard_this'. We take it from there + * and discard it. 
+ */ + remove_old(); + while (pending_discard) { + while (discard_this == NULL) + sleep(1); + remove_old(); + } + pending_discard = old; + new->replaces = old; + new->next = container->arrays; + container->arrays = new; + wakeup_monitor(); +} + +struct metadata_update *update_queue = NULL; +struct metadata_update *update_queue_handled = NULL; +struct metadata_update *update_queue_pending = NULL; + +static void free_updates(struct metadata_update **update) +{ + while (*update) { + struct metadata_update *this = *update; + + *update = this->next; + free(this->buf); + free(this->space); + free(this); + } +} + +void check_update_queue(struct supertype *container) +{ + free_updates(&update_queue_handled); + + if (update_queue == NULL && + update_queue_pending) { + update_queue = update_queue_pending; + update_queue_pending = NULL; + wakeup_monitor(); + } +} + +static void queue_metadata_update(struct metadata_update *mu) +{ + struct metadata_update **qp; + + qp = &update_queue_pending; + while (*qp) + qp = & ((*qp)->next); + *qp = mu; +} + +static void add_disk_to_container(struct supertype *st, struct mdinfo *sd) +{ + int dfd; + char nm[20]; + struct supertype *st2; + struct metadata_update *update = NULL; + struct mdinfo info; + mdu_disk_info_t dk = { + .number = -1, + .major = sd->disk.major, + .minor = sd->disk.minor, + .raid_disk = -1, + .state = 0, + }; + + dprintf("%s: add %d:%d to container\n", + __func__, sd->disk.major, sd->disk.minor); + + sd->next = st->devs; + st->devs = sd; + + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, O_RDWR); + if (dfd < 0) + return; + + /* Check the metadata and see if it is already part of this + * array + */ + st2 = dup_super(st); + if (st2->ss->load_super(st2, dfd, NULL) == 0) { + st2->ss->getinfo_super(st, &info); + if (st->ss->compare_super(st, st2) == 0 && + info.disk.raid_disk >= 0) { + /* Looks like a good member of array. + * Just accept it. 
+ * mdadm will incorporate any parts into + * active arrays. + */ + st2->ss->free_super(st2); + return; + } + } + st2->ss->free_super(st2); + + st->update_tail = &update; + st->ss->add_to_super(st, &dk, dfd, NULL); + st->ss->write_init_super(st); + queue_metadata_update(update); + st->update_tail = NULL; +} + +static void manage_container(struct mdstat_ent *mdstat, + struct supertype *container) +{ + /* The only thing of interest here is if a new device + * has been added to the container. We add it to the + * array ignoring any metadata on it. + * FIXME should we look for compatible metadata and take hints + * about spare assignment.... probably not. + */ + if (mdstat->devcnt != container->devcnt) { + struct mdinfo **cdp, *cd, *di, *mdi; + int found; + + /* read /sys/block/NAME/md/dev-??/block/dev to find out + * what is there, and compare with container->info.devs + * To see what is removed and what is added. + * These need to be remove from, or added to, the array + */ + mdi = sysfs_read(-1, mdstat->devnum, GET_DEVS); + if (!mdi) { + /* invalidate the current count so we can try again */ + container->devcnt = -1; + return; + } + + /* check for removals */ + for (cdp = &container->devs; *cdp; ) { + found = 0; + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (*cdp)->disk.major && + di->disk.minor == (*cdp)->disk.minor) { + found = 1; + break; + } + if (!found) { + cd = *cdp; + *cdp = (*cdp)->next; + free(cd); + } else + cdp = &(*cdp)->next; + } + + /* check for additions */ + for (di = mdi->devs; di; di = di->next) { + for (cd = container->devs; cd; cd = cd->next) + if (di->disk.major == cd->disk.major && + di->disk.minor == cd->disk.minor) + break; + if (!cd) { + struct mdinfo *newd = malloc(sizeof(*newd)); + + if (!newd) { + container->devcnt = -1; + continue; + } + *newd = *di; + add_disk_to_container(container, newd); + } + } + sysfs_free(mdi); + container->devcnt = mdstat->devcnt; + } +} + +static int disk_init_and_add(struct mdinfo *disk, 
struct mdinfo *clone, + struct active_array *aa) +{ + if (!disk || !clone) + return -1; + + *disk = *clone; + disk->recovery_fd = sysfs_open(aa->devnum, disk->sys_name, "recovery_start"); + disk->state_fd = sysfs_open(aa->devnum, disk->sys_name, "state"); + disk->prev_state = read_dev_state(disk->state_fd); + disk->curr_state = disk->prev_state; + disk->next = aa->info.devs; + aa->info.devs = disk; + + return 0; +} + +static void manage_member(struct mdstat_ent *mdstat, + struct active_array *a) +{ + /* Compare mdstat info with known state of member array. + * We do not need to look for device state changes here, that + * is dealt with by the monitor. + * + * We just look for changes which suggest that a reshape is + * being requested. + * Unfortunately decreases in raid_disks don't show up in + * mdstat until the reshape completes FIXME. + * + * Actually, we also want to handle degraded arrays here by + * trying to find and assign a spare. + * We do that whenever the monitor tells us too. + */ + // FIXME + a->info.array.raid_disks = mdstat->raid_disks; + a->info.array.chunk_size = mdstat->chunk_size; + // MORE + + if (a->check_degraded) { + struct metadata_update *updates = NULL; + struct mdinfo *newdev = NULL; + struct active_array *newa; + struct mdinfo *d; + + a->check_degraded = 0; + + /* The array may not be degraded, this is just a good time + * to check. + */ + newdev = a->container->ss->activate_spare(a, &updates); + if (!newdev) + return; + + newa = duplicate_aa(a); + if (!newa) + goto out; + /* Cool, we can add a device or several. */ + + /* Add device to array and set offset/size/slot. 
+ * and open files for each newdev */ + for (d = newdev; d ; d = d->next) { + struct mdinfo *newd; + + newd = malloc(sizeof(*newd)); + if (!newd) + continue; + if (sysfs_add_disk(&newa->info, d, 0) < 0) { + free(newd); + continue; + } + disk_init_and_add(newd, d, newa); + } + queue_metadata_update(updates); + updates = NULL; + replace_array(a->container, a, newa); + sysfs_set_str(&a->info, NULL, "sync_action", "recover"); + out: + while (newdev) { + d = newdev->next; + free(newdev); + newdev = d; + } + free_updates(&updates); + } +} + +static int aa_ready(struct active_array *aa) +{ + struct mdinfo *d; + int level = aa->info.array.level; + + for (d = aa->info.devs; d; d = d->next) + if (d->state_fd < 0) + return 0; + + if (aa->info.state_fd < 0) + return 0; + + if (level > 0 && (aa->action_fd < 0 || aa->resync_start_fd < 0)) + return 0; + + if (!aa->container) + return 0; + + return 1; +} + +static void manage_new(struct mdstat_ent *mdstat, + struct supertype *container, + struct active_array *victim) +{ + /* A new array has appeared in this container. + * Hopefully it is already recorded in the metadata. + * Check, then create the new array to report it to + * the monitor. 
+ */ + + struct active_array *new; + struct mdinfo *mdi, *di; + char *inst; + int i; + int failed = 0; + + /* check if array is ready to be monitored */ + if (!mdstat->active) + return; + + mdi = sysfs_read(-1, mdstat->devnum, + GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT| + GET_DEGRADED|GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE); + + new = malloc(sizeof(*new)); + + if (!new || !mdi) { + if (mdi) + sysfs_free(mdi); + if (new) + free(new); + return; + } + memset(new, 0, sizeof(*new)); + + new->devnum = mdstat->devnum; + strcpy(new->info.sys_name, devnum2devname(new->devnum)); + + new->prev_state = new->curr_state = new->next_state = inactive; + new->prev_action= new->curr_action= new->next_action= idle; + + new->container = container; + + inst = &mdstat->metadata_version[10+strlen(container->devname)+1]; + + new->info.array = mdi->array; + new->info.component_size = mdi->component_size; + + for (i = 0; i < new->info.array.raid_disks; i++) { + struct mdinfo *newd = malloc(sizeof(*newd)); + + for (di = mdi->devs; di; di = di->next) + if (i == di->disk.raid_disk) + break; + + if (disk_init_and_add(newd, di, new) != 0) { + if (newd) + free(newd); + + failed++; + if (failed > new->info.array.failed_disks) { + /* we cannot properly monitor without all working disks */ + new->container = NULL; + break; + } + } + } + + new->action_fd = sysfs_open(new->devnum, NULL, "sync_action"); + new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state"); + new->resync_start_fd = sysfs_open(new->devnum, NULL, "resync_start"); + new->metadata_fd = sysfs_open(new->devnum, NULL, "metadata_version"); + new->sync_completed_fd = sysfs_open(new->devnum, NULL, "sync_completed"); + dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst), + new->action_fd, new->info.state_fd); + + sysfs_free(mdi); + + /* if everything checks out tell the metadata handler we want to + * manage this instance + */ + if (!aa_ready(new) || container->ss->open_new(container, new, inst) < 0) { + 
fprintf(stderr, "mdmon: failed to monitor %s\n", + mdstat->metadata_version); + new->container = NULL; + free_aa(new); + } else { + replace_array(container, victim, new); + if (failed) { + new->check_degraded = 1; + manage_member(mdstat, new); + } + } +} + +void manage(struct mdstat_ent *mdstat, struct supertype *container) +{ + /* We have just read mdstat and need to compare it with + * the known active arrays. + * Arrays with the wrong metadata are ignored. + */ + + for ( ; mdstat ; mdstat = mdstat->next) { + struct active_array *a; + if (mdstat->devnum == container->devnum) { + manage_container(mdstat, container); + continue; + } + if (!is_container_member(mdstat, container->devname)) + /* Not for this array */ + continue; + /* Looks like a member of this container */ + for (a = container->arrays; a; a = a->next) { + if (mdstat->devnum == a->devnum) { + if (a->container) + manage_member(mdstat, a); + break; + } + } + if (a == NULL || !a->container) + manage_new(mdstat, container, a); + } +} + +static void handle_message(struct supertype *container, struct metadata_update *msg) +{ + /* queue this metadata update through to the monitor */ + + struct metadata_update *mu; + + if (msg->len <= 0) + while (update_queue_pending || update_queue) { + check_update_queue(container); + usleep(15*1000); + } + + if (msg->len == 0) { /* ping_monitor */ + int cnt; + + cnt = monitor_loop_cnt; + if (cnt & 1) + cnt += 2; /* wait until next pselect */ + else + cnt += 3; /* wait for 2 pselects */ + wakeup_monitor(); + + while (monitor_loop_cnt - cnt < 0) + usleep(10 * 1000); + } else if (msg->len == -1) { /* ping_manager */ + struct mdstat_ent *mdstat = mdstat_read(1, 0); + + manage(mdstat, container); + free_mdstat(mdstat); + } else if (!sigterm) { + mu = malloc(sizeof(*mu)); + mu->len = msg->len; + mu->buf = msg->buf; + msg->buf = NULL; + mu->space = NULL; + mu->next = NULL; + if (container->ss->prepare_update) + container->ss->prepare_update(container, mu); + 
queue_metadata_update(mu); + } +} + +void read_sock(struct supertype *container) +{ + int fd; + struct metadata_update msg; + int terminate = 0; + long fl; + int tmo = 3; /* 3 second timeout before hanging up the socket */ + + fd = accept(container->sock, NULL, NULL); + if (fd < 0) + return; + + fl = fcntl(fd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(fd, F_SETFL, fl); + + do { + msg.buf = NULL; + + /* read and validate the message */ + if (receive_message(fd, &msg, tmo) == 0) { + handle_message(container, &msg); + if (ack(fd, tmo) < 0) + terminate = 1; + } else + terminate = 1; + + } while (!terminate); + + close(fd); +} + +int exit_now = 0; +int manager_ready = 0; +void do_manager(struct supertype *container) +{ + struct mdstat_ent *mdstat; + sigset_t set; + + sigprocmask(SIG_UNBLOCK, NULL, &set); + sigdelset(&set, SIGUSR1); + sigdelset(&set, SIGTERM); + + do { + + if (exit_now) + exit(0); + + /* Can only 'manage' things if 'monitor' is not making + * structural changes to metadata, so need to check + * update_queue + */ + if (update_queue == NULL) { + mdstat = mdstat_read(1, 0); + + manage(mdstat, container); + + read_sock(container); + + free_mdstat(mdstat); + } + remove_old(); + + check_update_queue(container); + + manager_ready = 1; + + if (sigterm) + wakeup_monitor(); + + if (update_queue == NULL) + mdstat_wait_fd(container->sock, &set); + else + /* If an update is happening, just wait for signal */ + pselect(0, NULL, NULL, NULL, NULL, &set); + } while(1); +} diff -Nru mdadm-2.6.7.1/mapfile.c mdadm-3.1.4/mapfile.c --- mdadm-2.6.7.1/mapfile.c 2008-10-13 04:22:02.000000000 +0300 +++ mdadm-3.1.4/mapfile.c 2010-08-26 05:24:15.000000000 +0300 @@ -1,8 +1,8 @@ /* - * mapfile - manage /var/run/mdadm.map. Part of: + * mapfile - keep track of uuid <-> array mapping. Part of: * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2006 Neil Brown + * Copyright (C) 2006-2010 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -28,75 +28,148 @@ * Australia */ -/* /var/run/mdadm.map is used to track arrays being created in --incremental - * more. It particularly allows lookup from UUID to array device, but +/* The mapfile is used to track arrays being created in --incremental + * mode. It particularly allows lookup from UUID to array device, but * also allows the array device name to be easily found. * * The map file is line based with space separated fields. The fields are: - * Device id - mdX or mdpX where is a number. - * metadata - 0.90 1.0 1.1 1.2 + * Device id - mdX or mdpX where X is a number. + * metadata - 0.90 1.0 1.1 1.2 ddf ... * UUID - uuid of the array * path - path where device created: /dev/md/home * + * The best place for the mapfile wold be /var/run/mdadm/map. However + * it is needed during initramfs early-boot, and /var/run doesn't exist there + * and certainly doesn't persist through to normal boot. + * So we store it in /dev/.mdadm/map but allow this to be changed at + * compile time. via MAP_DIR and MAP_FILE + * */ +#include "mdadm.h" +#include +#include + +#ifndef MAP_DIR +#define MAP_DIR "/dev/.mdadm" +#define MAP_FILE "map" +#endif + +#define MAP_READ 0 +#define MAP_NEW 1 +#define MAP_LOCK 2 +#define MAP_DIRNAME 3 +#define mapnames(dir, base) { \ + +char *mapname[4] = { + MAP_DIR "/" MAP_FILE, + MAP_DIR "/" MAP_FILE ".new", + MAP_DIR "/" MAP_FILE ".lock", + MAP_DIR +}; +int mapmode[3] = { O_RDONLY, O_RDWR|O_CREAT, O_RDWR|O_CREAT|O_TRUNC }; +char *mapsmode[3] = { "r", "w", "w"}; -#include "mdadm.h" - +FILE *open_map(int modenum) +{ + int fd; + if ((mapmode[modenum] & O_CREAT)) + /* Attempt to create directory, don't worry about + * failure. 
+ */ + (void)mkdir(mapname[MAP_DIRNAME], 0755); + fd = open(mapname[modenum], mapmode[modenum], 0600); + if (fd >= 0) + return fdopen(fd, mapsmode[modenum]); + return NULL; +} int map_write(struct map_ent *mel) { FILE *f; int err; - int subdir = 1; - f = fopen("/var/run/mdadm/map.new", "w"); - if (!f) { - f = fopen("/var/run/mdadm.map.new", "w"); - subdir = 1; - } + f = open_map(MAP_NEW); + if (!f) return 0; - while (mel) { + for (; mel; mel = mel->next) { + if (mel->bad) + continue; if (mel->devnum < 0) fprintf(f, "mdp%d ", -1-mel->devnum); else fprintf(f, "md%d ", mel->devnum); - fprintf(f, "%d.%d ", mel->major, mel->minor); + fprintf(f, "%s ", mel->metadata); fprintf(f, "%08x:%08x:%08x:%08x ", mel->uuid[0], mel->uuid[1], mel->uuid[2], mel->uuid[3]); - fprintf(f, "%s\n", mel->path); - mel = mel->next; + fprintf(f, "%s\n", mel->path?:""); } fflush(f); err = ferror(f); fclose(f); if (err) { - if (subdir) - unlink("/var/run/mdadm/map.new"); - else - unlink("/var/run/mdadm.map.new"); + unlink(mapname[1]); return 0; } - if (subdir) - return rename("/var/run/mdadm/map.new", - "/var/run/mdadm/map") == 0; - else - return rename("/var/run/mdadm.map.new", - "/var/run/mdadm.map") == 0; + return rename(mapname[1], + mapname[0]) == 0; +} + + +static FILE *lf = NULL; +int map_lock(struct map_ent **melp) +{ + while (lf == NULL) { + struct stat buf; + lf = open_map(MAP_LOCK); + if (lf == NULL) + return -1; + if (flock(fileno(lf), LOCK_EX) != 0) { + fclose(lf); + lf = NULL; + return -1; + } + if (fstat(fileno(lf), &buf) != 0 || + buf.st_nlink == 0) { + /* The owner of the lock unlinked it, + * so we have a lock on a stale file, + * try again + */ + fclose(lf); + lf = NULL; + } + } + if (*melp) + map_free(*melp); + map_read(melp); + return 0; +} + +void map_unlock(struct map_ent **melp) +{ + if (lf) { + /* must unlink before closing the file, + * as only the owner of the lock may + * unlink the file + */ + unlink(mapname[2]); + fclose(lf); + } + lf = NULL; } void map_add(struct 
map_ent **melp, - int devnum, int major, int minor, int uuid[4], char *path) + int devnum, char *metadata, int uuid[4], char *path) { struct map_ent *me = malloc(sizeof(*me)); me->devnum = devnum; - me->major = major; - me->minor = minor; + strcpy(me->metadata, metadata); memcpy(me->uuid, uuid, 16); - me->path = strdup(path); + me->path = path ? strdup(path) : NULL; me->next = *melp; + me->bad = 0; *melp = me; } @@ -105,24 +178,30 @@ FILE *f; char buf[8192]; char path[200]; - int devnum, major, minor, uuid[4]; + int devnum, uuid[4]; + char metadata[30]; char nam[4]; *melp = NULL; - f = fopen("/var/run/mdadm/map", "r"); - if (!f) - f = fopen("/var/run/mdadm.map", "r"); + f = open_map(MAP_READ); + if (!f) { + RebuildMap(); + f = open_map(MAP_READ); + } if (!f) return; while (fgets(buf, sizeof(buf), f)) { - if (sscanf(buf, " md%1[p]%d %d.%d %x:%x:%x:%x %200s", - nam, &devnum, &major, &minor, uuid, uuid+1, - uuid+2, uuid+3, path) == 9) { - if (nam[0] == 'p') + path[0] = 0; + if (sscanf(buf, " %3[mdp]%d %s %x:%x:%x:%x %200s", + nam, &devnum, metadata, uuid, uuid+1, + uuid+2, uuid+3, path) >= 7) { + if (strncmp(nam, "md", 2) != 0) + continue; + if (nam[2] == 'p') devnum = -1 - devnum; - map_add(melp, devnum, major, minor, uuid, path); + map_add(melp, devnum, metadata, uuid, path); } } fclose(f); @@ -138,7 +217,7 @@ } } -int map_update(struct map_ent **mpp, int devnum, int major, int minor, +int map_update(struct map_ent **mpp, int devnum, char *metadata, int *uuid, char *path) { struct map_ent *map, *mp; @@ -151,16 +230,16 @@ for (mp = map ; mp ; mp=mp->next) if (mp->devnum == devnum) { - mp->major = major; - mp->minor = minor; + strcpy(mp->metadata, metadata); memcpy(mp->uuid, uuid, 16); free(mp->path); - mp->path = strdup(path); + mp->path = path ? 
strdup(path) : NULL; break; } if (!mp) - map_add(&map, devnum, major, minor, uuid, path); - *mpp = NULL; + map_add(&map, devnum, metadata, uuid, path); + if (mpp) + *mpp = NULL; rv = map_write(map); map_free(map); return rv; @@ -183,15 +262,245 @@ } } +void map_remove(struct map_ent **mapp, int devnum) +{ + if (devnum == NoMdDev) + return; + + map_delete(mapp, devnum); + map_write(*mapp); + map_free(*mapp); +} + struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]) { struct map_ent *mp; if (!*map) map_read(map); - for (mp = *map ; mp ; mp = mp->next) - if (memcmp(uuid, mp->uuid, 16) == 0) - return mp; + for (mp = *map ; mp ; mp = mp->next) { + if (memcmp(uuid, mp->uuid, 16) != 0) + continue; + if (!mddev_busy(mp->devnum)) { + mp->bad = 1; + continue; + } + return mp; + } return NULL; +} +struct map_ent *map_by_devnum(struct map_ent **map, int devnum) +{ + struct map_ent *mp; + if (!*map) + map_read(map); + + for (mp = *map ; mp ; mp = mp->next) { + if (mp->devnum != devnum) + continue; + if (!mddev_busy(mp->devnum)) { + mp->bad = 1; + continue; + } + return mp; + } + return NULL; +} + +struct map_ent *map_by_name(struct map_ent **map, char *name) +{ + struct map_ent *mp; + if (!*map) + map_read(map); + + for (mp = *map ; mp ; mp = mp->next) { + if (!mp->path) + continue; + if (strncmp(mp->path, "/dev/md/", 8) != 0) + continue; + if (strcmp(mp->path+8, name) != 0) + continue; + if (!mddev_busy(mp->devnum)) { + mp->bad = 1; + continue; + } + return mp; + } + return NULL; +} + +/* sets the proper subarray and container_dev according to the metadata + * version super_by_fd does this automatically, this routine is meant as + * a supplement for guess_super() + */ +static void set_member_info(struct supertype *st, struct mdstat_ent *ent) +{ + + st->subarray[0] = '\0'; + + if (ent->metadata_version == NULL || + strncmp(ent->metadata_version, "external:", 9) != 0) + return; + + if (is_subarray(&ent->metadata_version[9])) { + char 
version[strlen(ent->metadata_version)+1]; + char *subarray; + char *name = &version[10]; + + strcpy(version, ent->metadata_version); + subarray = strrchr(version, '/'); + name = &version[10]; + + if (!subarray) + return; + *subarray++ = '\0'; + + st->container_dev = devname2devnum(name); + strncpy(st->subarray, subarray, sizeof(st->subarray)); + } +} + +void RebuildMap(void) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *md; + struct map_ent *map = NULL; + int mdp = get_mdp_major(); + int require_homehost; + char sys_hostname[256]; + char *homehost = conf_get_homehost(&require_homehost); + + if (homehost == NULL || strcmp(homehost, "")==0) { + if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) { + sys_hostname[sizeof(sys_hostname)-1] = 0; + homehost = sys_hostname; + } + } + + for (md = mdstat ; md ; md = md->next) { + struct mdinfo *sra = sysfs_read(-1, md->devnum, GET_DEVS); + struct mdinfo *sd; + + if (!sra) + continue; + + for (sd = sra->devs ; sd ; sd = sd->next) { + char namebuf[100]; + char dn[30]; + int dfd; + int ok; + struct supertype *st; + char *path; + struct mdinfo info; + + sprintf(dn, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + st = guess_super(dfd); + if ( st == NULL) + ok = -1; + else { + set_member_info(st, md); + ok = st->ss->load_super(st, dfd, NULL); + } + close(dfd); + if (ok != 0) + continue; + st->ss->getinfo_super(st, &info); + if (md->devnum >= 0) + path = map_dev(MD_MAJOR, md->devnum, 0); + else + path = map_dev(mdp, (-1-md->devnum)<< 6, 0); + if (path == NULL || + strncmp(path, "/dev/md/", 8) != 0) { + /* We would really like a name that provides + * an MD_DEVNAME for udev. + * The name needs to be unique both in /dev/md/ + * and in this mapfile. + * It needs to match watch -I or -As would come + * up with. + * That means: + * Check if array is in mdadm.conf + * - if so use that. 
+ * determine trustworthy from homehost etc + * find a unique name based on metadata name. + * + */ + struct mddev_ident_s *match = conf_match(&info, st); + struct stat stb; + if (match && match->devname && match->devname[0] == '/') { + path = match->devname; + if (path[0] != '/') { + strcpy(namebuf, "/dev/md/"); + strcat(namebuf, path); + path = namebuf; + } + } else { + int unum = 0; + char *sep = "_"; + const char *name; + int conflict = 1; + if ((homehost == NULL || + st->ss->match_home(st, homehost) != 1) && + st->ss->match_home(st, "any") != 1 && + (require_homehost + || ! conf_name_is_free(info.name))) + /* require a numeric suffix */ + unum = 0; + else + /* allow name to be used as-is if no conflict */ + unum = -1; + name = info.name; + if (!*name) { + name = st->ss->name; + if (!isdigit(name[strlen(name)-1]) && + unum == -1) { + unum = 0; + sep = ""; + } + } + if (strchr(name, ':')) + /* probably a uniquifying + * hostname prefix. Allow + * without a suffix + */ + unum = -1; + + while (conflict) { + if (unum >= 0) + sprintf(namebuf, "/dev/md/%s%s%d", + name, sep, unum); + else + sprintf(namebuf, "/dev/md/%s", + name); + unum++; + if (lstat(namebuf, &stb) != 0 && + (map == NULL || + !map_by_name(&map, namebuf+8))) + conflict = 0; + } + path = namebuf; + } + } + map_add(&map, md->devnum, + info.text_version, + info.uuid, path); + st->ss->free_super(st); + break; + } + sysfs_free(sra); + } + /* Only trigger a change if we wrote a new map file */ + if (map_write(map)) + for (md = mdstat ; md ; md = md->next) { + struct mdinfo *sra = sysfs_read(-1, md->devnum, + GET_VERSION); + if (sra) + sysfs_uevent(sra, "change"); + sysfs_free(sra); + } + map_free(map); + free_mdstat(mdstat); } diff -Nru mdadm-2.6.7.1/md.4 mdadm-3.1.4/md.4 --- mdadm-2.6.7.1/md.4 2008-10-15 06:34:28.000000000 +0300 +++ mdadm-3.1.4/md.4 2010-08-26 05:24:15.000000000 +0300 @@ -11,6 +11,8 @@ .BI /dev/md n .br .BI /dev/md/ n +.br +.BR /dev/md/ name .SH DESCRIPTION The .B md @@ -37,15 +39,17 @@ 
MULTIPATH (a set of different interfaces to the same device), and FAULTY (a layer over a single device into which errors can be injected). -.SS MD SUPER BLOCK -Each device in an array may have a -.I superblock -which records information about the structure and state of the array. +.SS MD METADATA +Each device in an array may have some +.I metadata +stored in the device. This metadata is sometimes called a +.BR superblock . +The metadata records information about the structure and state of the array. This allows the array to be reliably re-assembled after a shutdown. From Linux kernel version 2.6.10, .B md -provides support for two different formats of this superblock, and +provides support for two different formats of metadata, and other formats can be added. Prior to this release, only one format is supported. @@ -66,11 +70,11 @@ and 12K from the end of the device, on a 4K boundary, though variations can be stored at the start of the device (version 1.1) or 4K from the start of the device (version 1.2). -This superblock format stores multibyte data in a +This metadata format stores multibyte data in a processor-independent format and supports up to hundreds of component devices (version 0.90 only supports 28). -The superblock contains, among other things: +The metadata contains, among other things: .TP LEVEL The manner in which the devices are arranged into the array @@ -80,6 +84,7 @@ a 128 bit Universally Unique Identifier that identifies the array that contains this device. +.PP When a version 0.90 array is being reshaped (e.g. adding extra devices to a RAID5), the version number is temporarily set to 0.91. This ensures that if the reshape process is stopped in the middle (e.g. by @@ -88,7 +93,7 @@ would cause data corruption) but will be left untouched until a kernel that can complete the reshape processes is used. 
-.SS ARRAYS WITHOUT SUPERBLOCKS +.SS ARRAYS WITHOUT METADATA While it is usually best to create arrays with superblocks so that they can be assembled reliably, there are some circumstances when an array without superblocks is preferred. These include: @@ -118,6 +123,40 @@ the array elsewhere. While not encouraged for general us, it does have special-purpose uses and is supported. +.SS ARRAYS WITH EXTERNAL METADATA + +From release 2.6.28, the +.I md +driver supports arrays with externally managed metadata. That is, +the metadata is not managed by the kernel by rather by a user-space +program which is external to the kernel. This allows support for a +variety of metadata formats without cluttering the kernel with lots of +details. +.PP +.I md +is able to communicate with the user-space program through various +sysfs attributes so that it can make appropriate changes to the +metadata \- for example to make a device as faulty. When necessary, +.I md +will wait for the program to acknowledge the event by writing to a +sysfs attribute. +The manual page for +.IR mdmon (8) +contains more detail about this interaction. + +.SS CONTAINERS +Many metadata formats use a single block of metadata to describe a +number of different arrays which all use the same set of devices. +In this case it is helpful for the kernel to know about the full set +of devices as a whole. This set is known to md as a +.IR container . +A container is an +.I md +array with externally managed metadata and with device offset and size +so that it just covers the metadata part of the devices. The +remainder of each device is available to be incorporated into various +arrays. + .SS LINEAR A linear array simply catenates the available space on each @@ -138,12 +177,12 @@ striped array. A RAID0 array is configured at creation with a .B "Chunk Size" -which must be a power of two, and at least 4 kibibytes. +which must be a power of two (prior to Linux 2.6.31), and at least 4 +kibibytes. 
The RAID0 driver assigns the first chunk of the array to the first device, the second chunk to the second device, and so on until all -drives have been assigned one chunk. This collection of chunks forms -a +drives have been assigned one chunk. This collection of chunks forms a .BR stripe . Further chunks are gathered into stripes in the same way, and are assigned to the remaining space in the drives. @@ -175,6 +214,11 @@ spindle. In theory, having an N-disk RAID1 will allow N sequential threads to read from all disks. +Individual devices in a RAID1 can be marked as "write-mostly". +This drives are excluded from the normal read balancing and will only +be read from when there is no other option. This can be useful for +devices connected over a slow link. + .SS RAID4 A RAID4 array is like a RAID0 array with an extra device for storing @@ -240,7 +284,7 @@ of any given block are on different drives. The 'far' arrangement can give sequential read performance equal to -that of a RAID0 array, but at the cost of degraded write performance. +that of a RAID0 array, but at the cost of reduced write performance. When 'offset' replicas are chosen, the multiple copies of a given chunk are laid out on consecutive drives and at consecutive offsets. @@ -274,7 +318,11 @@ devices, often fibre channel interfaces, that all refer the the same real device. If one of these interfaces fails (e.g. due to cable problems), the multipath driver will attempt to redirect requests to -another interface. +another interface. + +The MULTIPATH drive is not receiving any ongoing development and +should be considered a legacy driver. The device-mapper based +multipath drivers should be preferred for new installations. .SS FAULTY The FAULTY md module is provided for testing purposes. A faulty array @@ -365,6 +413,112 @@ .B speed_limit_max control files mentioned below. 
+.SS SCRUBBING AND MISMATCHES + +As storage devices can develop bad blocks at any time it is valuable +to regularly read all blocks on all devices in an array so as to catch +such bad blocks early. This process is called +.IR scrubbing . + +md arrays can be scrubbed by writing either +.I check +or +.I repair +to the file +.I md/sync_action +in the +.I sysfs +directory for the device. + +Requesting a scrub will cause +.I md +to read every block on every device in the array, and check that the +data is consistent. For RAID1 and RAID10, this means checking that the copies +are identical. For RAID4, RAID5, RAID6 this means checking that the +parity block is (or blocks are) correct. + +If a read error is detected during this process, the normal read-error +handling causes correct data to be found from other devices and to be +written back to the faulty device. In many case this will +effectively +.I fix +the bad block. + +If all blocks read successfully but are found to not be consistent, +then this is regarded as a +.IR mismatch . + +If +.I check +was used, then no action is taken to handle the mismatch, it is simply +recorded. +If +.I repair +was used, then a mismatch will be repaired in the same way that +.I resync +repairs arrays. For RAID5/RAID6 new parity blocks are written. For RAID1/RAID10, +all but one block are overwritten with the content of that one block. + +A count of mismatches is recorded in the +.I sysfs +file +.IR md/mismatch_cnt . +This is set to zero when a +scrub starts and is incremented whenever a sector is +found that is a mismatch. +.I md +normally works in units much larger than a single sector and when it +finds a mismatch, it does not determin exactly how many actual sectors were +affected but simply adds the number of sectors in the IO unit that was +used. So a value of 128 could simply mean that a single 64KB check +found an error (128 x 512bytes = 64KB). 
+ +If an array is created by +.I mdadm +with +.I \-\-assume\-clean +then a subsequent check could be expected to find some mismatches. + +On a truly clean RAID5 or RAID6 array, any mismatches should indicate +a hardware problem at some level - software issues should never cause +such a mismatch. + +However on RAID1 and RAID10 it is possible for software issues to +cause a mismatch to be reported. This does not necessarily mean that +the data on the array is corrupted. It could simply be that the +system does not care what is stored on that part of the array - it is +unused space. + +The most likely cause for an unexpected mismatch on RAID1 or RAID10 +occurs if a swap partition or swap file is stored on the array. + +When the swap subsystem wants to write a page of memory out, it flags +the page as 'clean' in the memory manager and requests the swap device +to write it out. It is quite possible that the memory will be +changed while the write-out is happening. In that case the 'clean' +flag will be found to be clear when the write completes and so the +swap subsystem will simply forget that the swapout had been attempted, +and will possibly choose a different page to write out. + +If the swap device was on RAID1 (or RAID10), then the data is sent +from memory to a device twice (or more depending on the number of +devices in the array). Thus it is possible that the memory gets changed +between the times it is sent, so different data can be written to +the different devices in the array. This will be detected by +.I check +as a mismatch. However it does not reflect any corruption as the +block where this mismatch occurs is being treated by the swap system as +being empty, and the data will never be read from that block. + +It is conceivable for a similar situation to occur on non-swap files, +though it is less likely. + +Thus the +.I mismatch_cnt +value can not be interpreted very reliably on RAID1 or RAID10, +especially when the device is used for swap. 
+ + .SS BITMAP WRITE-INTENT LOGGING From Linux 2.6.13, @@ -526,10 +680,22 @@ .B md/stripe_cache_size This is only available on RAID5 and RAID6. It records the size (in pages per device) of the stripe cache which is used for synchronising -all read and write operations to the array. The default is 128. +all write operations to the array and all read operations if the array +is degraded. The default is 256. Valid values are 17 to 32768. Increasing this number can increase performance in some situations, at -some cost in system memory. +some cost in system memory. Note, setting this value too high can +result in an "out of memory" condition for the system. + +memory_consumed = system_page_size * nr_disks * stripe_cache_size +.TP +.B md/preread_bypass_threshold +This is only available on RAID5 and RAID6. This variable sets the +number of times MD will service a full-stripe-write before servicing a +stripe that requires some "prereading". For fairness this defaults to +1. Valid values are 0 to stripe_cache_size. Setting this to 0 +maximizes sequential-write throughput at the cost of fairness to threads +doing small or random writes. .SS KERNEL PARAMETERS @@ -557,6 +723,8 @@ .TP .B md_mod.start_ro=1 +.TP +.B /sys/module/md_mod/parameters/start_ro This tells md to start all arrays in read-only mode. This is a soft read-only that will automatically switch to read-write on the first write request. However until that write request, nothing is written @@ -565,6 +733,8 @@ .TP .B md_mod.start_dirty_degraded=1 +.TP +.B /sys/module/md_mod/parameters/start_dirty_degraded As mentioned above, md will not normally start a RAID4, RAID5, or RAID6 that is both dirty and degraded as this situation can imply hidden data loss. This can be awkward if the root filesystem is @@ -614,13 +784,13 @@ speed for times when non-rebuild activity is current on an array. 
The speed is in Kibibytes per second, and is a per-device rate, not a per-array rate (which means that an array with more disks will shuffle -more data for a given speed). The default is 100. +more data for a given speed). The default is 1000. .TP .B /proc/sys/dev/raid/speed_limit_max A readable and writable file that reflects the current "goal" rebuild speed for times when no non-rebuild activity is current on an array. -The default is 100,000. +The default is 200,000. .SH SEE ALSO .BR mdadm (8), diff -Nru mdadm-2.6.7.1/md5.h mdadm-3.1.4/md5.h --- mdadm-2.6.7.1/md5.h 2008-06-02 02:12:21.000000000 +0300 +++ mdadm-3.1.4/md5.h 2010-03-22 08:08:42.000000000 +0200 @@ -27,7 +27,7 @@ #if HAVE_INTTYPES_H # include #endif -#if HAVE_STDINT_H || _LIBC +#if HAVE_STDINT_H || _LIBC || defined __UCLIBC__ # include #endif diff -Nru mdadm-2.6.7.1/mdadm.8 mdadm-3.1.4/mdadm.8 --- mdadm-2.6.7.1/mdadm.8 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/mdadm.8 1970-01-01 02:00:00.000000000 +0200 @@ -1,2142 +0,0 @@ -.\" -*- nroff -*- -.\" Copyright Neil Brown and others. -.\" This program is free software; you can redistribute it and/or modify -.\" it under the terms of the GNU General Public License as published by -.\" the Free Software Foundation; either version 2 of the License, or -.\" (at your option) any later version. -.\" See file COPYING in distribution for details. -.TH MDADM 8 "" v2.6.7.1 -.SH NAME -mdadm \- manage MD devices -.I aka -Linux Software RAID - -.SH SYNOPSIS - -.BI mdadm " [mode] [options] " - -.SH DESCRIPTION -RAID devices are virtual devices created from two or more -real block devices. This allows multiple devices (typically disk -drives or partitions thereof) to be combined into a single device to -hold (for example) a single filesystem. -Some RAID levels include redundancy and so can survive some degree of -device failure. - -Linux Software RAID devices are implemented through the md (Multiple -Devices) device driver. 
- -Currently, Linux supports -.B LINEAR -md devices, -.B RAID0 -(striping), -.B RAID1 -(mirroring), -.BR RAID4 , -.BR RAID5 , -.BR RAID6 , -.BR RAID10 , -.BR MULTIPATH , -and -.BR FAULTY . - -.B MULTIPATH -is not a Software RAID mechanism, but does involve -multiple devices: -each device is a path to one common physical storage device. - -.B FAULTY -is also not true RAID, and it only involves one device. It -provides a layer over a true device that can be used to inject faults. - -.\".B mdadm -.\"is a program that can be used to create, manage, and monitor -.\"MD devices. As -.\"such it provides a similar set of functionality to the -.\".B raidtools -.\"packages. -.\"The key differences between -.\".B mdadm -.\"and -.\".B raidtools -.\"are: -.\".IP \(bu 4 -.\".B mdadm -.\"is a single program and not a collection of programs. -.\".IP \(bu 4 -.\".B mdadm -.\"can perform (almost) all of its functions without having a -.\"configuration file and does not use one by default. Also -.\".B mdadm -.\"helps with management of the configuration -.\"file. -.\".IP \(bu 4 -.\".B mdadm -.\"can provide information about your arrays (through Query, Detail, and Examine) -.\"that -.\".B raidtools -.\"cannot. -.\".P -.\".I mdadm -.\"does not use -.\".IR /etc/raidtab , -.\"the -.\".B raidtools -.\"configuration file, at all. It has a different configuration file -.\"with a different format and a different purpose. - -.SH MODES -mdadm has several major modes of operation: -.TP -.B Assemble -Assemble the components of a previously created -array into an active array. Components can be explicitly given -or can be searched for. -.B mdadm -checks that the components -do form a bona fide array, and can, on request, fiddle superblock -information so as to assemble a faulty array. - -.TP -.B Build -Build an array that doesn't have per-device superblocks. For these -sorts of arrays, -.I mdadm -cannot differentiate between initial creation and subsequent assembly -of an array. 
It also cannot perform any checks that appropriate -components have been requested. Because of this, the -.B Build -mode should only be used together with a complete understanding of -what you are doing. - -.TP -.B Create -Create a new array with per-device superblocks. -.\"It can progress -.\"in several step create-add-add-run or it can all happen with one command. - -.TP -.B "Follow or Monitor" -Monitor one or more md devices and act on any state changes. This is -only meaningful for raid1, 4, 5, 6, 10 or multipath arrays, as -only these have interesting state. raid0 or linear never have -missing, spare, or failed drives, so there is nothing to monitor. - -.TP -.B "Grow" -Grow (or shrink) an array, or otherwise reshape it in some way. -Currently supported growth options include changing the active size -of component devices and changing the number of active devices in RAID -levels 1/4/5/6, as well as adding or removing a write-intent bitmap. - -.TP -.B "Incremental Assembly" -Add a single device to an appropriate array. If the addition of the -device makes the array runnable, the array will be started. -This provides a convenient interface to a -.I hot-plug -system. As each device is detected, -.I mdadm -has a chance to include it in some array as appropriate. - -.TP -.B Manage -This is for doing things to specific components of an array such as -adding new spares and removing faulty devices. - -.TP -.B Misc -This is an 'everything else' mode that supports operations on active -arrays, operations on component devices such as erasing old superblocks, and -information gathering operations. -.\"This mode allows operations on independent devices such as examine MD -.\"superblocks, erasing old superblocks and stopping active arrays. - -.TP -.B Auto-detect -This mode does not act on a specific device or array, but rather it -requests the Linux Kernel to activate any auto-detected arrays. 
-.SH OPTIONS - -.SH Options for selecting a mode are: - -.TP -.BR \-A ", " \-\-assemble -Assemble a pre-existing array. - -.TP -.BR \-B ", " \-\-build -Build a legacy array without superblocks. - -.TP -.BR \-C ", " \-\-create -Create a new array. - -.TP -.BR \-F ", " \-\-follow ", " \-\-monitor -Select -.B Monitor -mode. - -.TP -.BR \-G ", " \-\-grow -Change the size or shape of an active array. - -.TP -.BR \-I ", " \-\-incremental -Add a single device into an appropriate array, and possibly start the array. - -.TP -.B \-\-auto-detect -Request that the kernel starts any auto-detected arrays. This can only -work if -.I md -is compiled into the kernel \(em not if it is a module. -Arrays can be auto-detected by the kernel if all the components are in -primary MS-DOS partitions with partition type -.BR FD . -In-kernel autodetect is not recommended for new installations. Using -.I mdadm -to detect and assemble arrays \(em possibly in an -.I initrd -\(em is substantially more flexible and should be preferred. - -.P -If a device is given before any options, or if the first option is -.BR \-\-add , -.BR \-\-fail , -or -.BR \-\-remove , -then the MANAGE mode is assumed. -Anything other than these will cause the -.B Misc -mode to be assumed. - -.SH Options that are not mode-specific are: - -.TP -.BR \-h ", " \-\-help -Display general help message or, after one of the above options, a -mode-specific help message. - -.TP -.B \-\-help\-options -Display more detailed help about command line parsing and some commonly -used options. - -.TP -.BR \-V ", " \-\-version -Print version information for mdadm. - -.TP -.BR \-v ", " \-\-verbose -Be more verbose about what is happening. This can be used twice to be -extra-verbose. -The extra verbosity currently only affects -.B \-\-detail \-\-scan -and -.BR "\-\-examine \-\-scan" . - -.TP -.BR \-q ", " \-\-quiet -Avoid printing purely informative messages. 
With this, -.B mdadm -will be silent unless there is something really important to report. - -.TP -.BR \-b ", " \-\-brief -Be less verbose. This is used with -.B \-\-detail -and -.BR \-\-examine . -Using -.B \-\-brief -with -.B \-\-verbose -gives an intermediate level of verbosity. - -.TP -.BR \-f ", " \-\-force -Be more forceful about certain operations. See the various modes for -the exact meaning of this option in different contexts. - -.TP -.BR \-c ", " \-\-config= -Specify the config file. Default is to use -.BR /etc/mdadm/mdadm.conf , -or if that is missing, then -.BR /etc/mdadm.conf . -If the config file given is -.B "partitions" -then nothing will be read, but -.I mdadm -will act as though the config file contained exactly -.B "DEVICE partitions" -and will read -.B /proc/partitions -to find a list of devices to scan. -If the word -.B "none" -is given for the config file, then -.I mdadm -will act as though the config file were empty. - -.TP -.BR \-s ", " \-\-scan -Scan config file or -.B /proc/mdstat -for missing information. -In general, this option gives -.B mdadm -permission to get any missing information (like component devices, -array devices, array identities, and alert destination) from the -configuration file (see previous option); -one exception is MISC mode when using -.B \-\-detail -or -.B \-\-stop, -in which case -.B \-\-scan -says to get a list of array devices from -.BR /proc/mdstat . - -.TP -.B \-e ", " \-\-metadata= -Declare the style of superblock (raid metadata) to be used. The -default is 0.90 for -.BR \-\-create , -and to guess for other operations. -The default can be overridden by setting the -.B metadata -value for the -.B CREATE -keyword in -.BR mdadm.conf . - -Options are: -.RS -.IP "0, 0.90, default" -Use the original 0.90 format superblock. This format limits arrays to -28 component devices and limits component devices of levels 1 and -greater to 2 terabytes. -.IP "1, 1.0, 1.1, 1.2" -Use the new version-1 format superblock. 
This has few restrictions. -The different sub-versions store the superblock at different locations -on the device, either at the end (for 1.0), at the start (for 1.1) or -4K from the start (for 1.2). -.RE - -.TP -.B \-\-homehost= -This will override any -.B HOMEHOST -setting in the config file and provides the identity of the host which -should be considered the home for any arrays. - -When creating an array, the -.B homehost -will be recorded in the superblock. For version-1 superblocks, it will -be prefixed to the array name. For version-0.90 superblocks, part of -the SHA1 hash of the hostname will be stored in the later half of the -UUID. - -When reporting information about an array, any array which is tagged -for the given homehost will be reported as such. - -When using Auto-Assemble, only arrays tagged for the given homehost -will be assembled. - -.SH For create, build, or grow: - -.TP -.BR \-n ", " \-\-raid\-devices= -Specify the number of active devices in the array. This, plus the -number of spare devices (see below) must equal the number of -.I component-devices -(including "\fBmissing\fP" devices) -that are listed on the command line for -.BR \-\-create . -Setting a value of 1 is probably -a mistake and so requires that -.B \-\-force -be specified first. A value of 1 will then be allowed for linear, -multipath, raid0 and raid1. It is never allowed for raid4 or raid5. -.br -This number can only be changed using -.B \-\-grow -for RAID1, RAID5 and RAID6 arrays, and only on kernels which provide -necessary support. - -.TP -.BR \-x ", " \-\-spare\-devices= -Specify the number of spare (eXtra) devices in the initial array. -Spares can also be added -and removed later. The number of component devices listed -on the command line must equal the number of raid devices plus the -number of spare devices. - - -.TP -.BR \-z ", " \-\-size= -Amount (in Kibibytes) of space to use from each drive in RAID level 1/4/5/6. 
-This must be a multiple of the chunk size, and must leave about 128Kb -of space at the end of the drive for the RAID superblock. -If this is not specified -(as it normally is not) the smallest drive (or partition) sets the -size, though if there is a variance among the drives of greater than 1%, a warning is -issued. - -This value can be set with -.B \-\-grow -for RAID level 1/4/5/6. If the array was created with a size smaller -than the currently active drives, the extra space can be accessed -using -.BR \-\-grow . -The size can be given as -.B max -which means to choose the largest size that fits on all current drives. - -.TP -.BR \-c ", " \-\-chunk= -Specify chunk size in kibibytes. The default is 64. - -.TP -.BR \-\-rounding= -Specify rounding factor for linear array (==chunk size) - -.TP -.BR \-l ", " \-\-level= -Set raid level. When used with -.BR \-\-create , -options are: linear, raid0, 0, stripe, raid1, 1, mirror, raid4, 4, -raid5, 5, raid6, 6, raid10, 10, multipath, mp, faulty. Obviously some of these are synonymous. - -When used with -.BR \-\-build , -only linear, stripe, raid0, 0, raid1, multipath, mp, and faulty are valid. - -Not yet supported with -.BR \-\-grow . - -.TP -.BR \-p ", " \-\-layout= -This option configures the fine details of data layout for raid5, -and raid10 arrays, and controls the failure modes for -.IR faulty . - -The layout of the raid5 parity block can be one of -.BR left\-asymmetric , -.BR left\-symmetric , -.BR right\-asymmetric , -.BR right\-symmetric , -.BR la ", " ra ", " ls ", " rs . -The default is -.BR left\-symmetric . - -When setting the failure mode for level -.I faulty, -the options are: -.BR write\-transient ", " wt , -.BR read\-transient ", " rt , -.BR write\-persistent ", " wp , -.BR read\-persistent ", " rp , -.BR write\-all , -.BR read\-fixable ", " rf , -.BR clear ", " flush ", " none . - -Each failure mode can be followed by a number, which is used as a period -between fault generation. 
Without a number, the fault is generated -once on the first relevant request. With a number, the fault will be -generated after that many requests, and will continue to be generated -every time the period elapses. - -Multiple failure modes can be current simultaneously by using the -.B \-\-grow -option to set subsequent failure modes. - -"clear" or "none" will remove any pending or periodic failure modes, -and "flush" will clear any persistent faults. - -To set the parity with -.BR \-\-grow , -the level of the array ("faulty") -must be specified before the fault mode is specified. - -Finally, the layout options for RAID10 are one of 'n', 'o' or 'f' followed -by a small number. The default is 'n2'. The supported options are: - -.I 'n' -signals 'near' copies. Multiple copies of one data block are at -similar offsets in different devices. - -.I 'o' -signals 'offset' copies. Rather than the chunks being duplicated -within a stripe, whole stripes are duplicated but are rotated by one -device so duplicate blocks are on different devices. Thus subsequent -copies of a block are in the next drive, and are one chunk further -down. - -.I 'f' -signals 'far' copies -(multiple copies have very different offsets). -See md(4) for more detail about 'near' and 'far'. - -The number is the number of copies of each datablock. 2 is normal, 3 -can be useful. This number can be at most equal to the number of -devices in the array. It does not need to divide evenly into that -number (e.g. it is perfectly legal to have an 'n2' layout for an array -with an odd number of devices). - -.TP -.BR \-\-parity= -same as -.B \-\-layout -(thus explaining the p of -.BR \-p ). - -.TP -.BR \-b ", " \-\-bitmap= -Specify a file to store a write-intent bitmap in. The file should not -exist unless -.B \-\-force -is also given. The same file should be provided -when assembling the array. 
If the word -.B "internal" -is given, then the bitmap is stored with the metadata on the array, -and so is replicated on all devices. If the word -.B "none" -is given with -.B \-\-grow -mode, then any bitmap that is present is removed. - -To help catch typing errors, the filename must contain at least one -slash ('/') if it is a real file (not 'internal' or 'none'). - -Note: external bitmaps are only known to work on ext2 and ext3. -Storing bitmap files on other filesystems may result in serious problems. - -.TP -.BR \-\-bitmap\-chunk= -Set the chunksize of the bitmap. Each bit corresponds to that many -Kilobytes of storage. -When using a file based bitmap, the default is to use the smallest -size that is at least 4 and requires no more than 2^21 chunks. -When using an -.B internal -bitmap, the chunksize is automatically determined to make best use of -available space. - - -.TP -.BR \-W ", " \-\-write\-mostly -subsequent devices listed in a -.BR \-\-build , -.BR \-\-create , -or -.B \-\-add -command will be flagged as 'write-mostly'. This is valid for RAID1 -only and means that the 'md' driver will avoid reading from these -devices if at all possible. This can be useful if mirroring over a -slow link. - -.TP -.BR \-\-write\-behind= -Specify that write-behind mode should be enabled (valid for RAID1 -only). If an argument is specified, it will set the maximum number -of outstanding writes allowed. The default value is 256. -A write-intent bitmap is required in order to use write-behind -mode, and write-behind is only attempted on drives marked as -.IR write-mostly . - -.TP -.BR \-\-assume\-clean -Tell -.I mdadm -that the array pre-existed and is known to be clean. It can be useful -when trying to recover from a major failure as you can be sure that no -data will be affected unless you actually write to the array. 
It can -also be used when creating a RAID1 or RAID10 if you want to avoid the -initial resync, however this practice \(em while normally safe \(em is not -recommended. Use this only if you really know what you are doing. - -.TP -.BR \-\-backup\-file= -This is needed when -.B \-\-grow -is used to increase the number of -raid-devices in a RAID5 if there are no spare devices available. -See the section below on RAID_DEVICE CHANGES. The file should be -stored on a separate device, not on the raid array being reshaped. - -.TP -.BR \-N ", " \-\-name= -Set a -.B name -for the array. This is currently only effective when creating an -array with a version-1 superblock. The name is a simple textual -string that can be used to identify array components when assembling. - -.TP -.BR \-R ", " \-\-run -Insist that -.I mdadm -run the array, even if some of the components -appear to be active in another array or filesystem. Normally -.I mdadm -will ask for confirmation before including such components in an -array. This option causes that question to be suppressed. - -.TP -.BR \-f ", " \-\-force -Insist that -.I mdadm -accept the geometry and layout specified without question. Normally -.I mdadm -will not allow creation of an array with only one device, and will try -to create a raid5 array with one missing drive (as this makes the -initial resync work faster). With -.BR \-\-force , -.I mdadm -will not try to be so clever. - -.TP -.BR \-a ", " "\-\-auto{=no,yes,md,mdp,part,p}{NN}" -Instruct mdadm to create the device file if needed, possibly allocating -an unused minor number. "md" causes a non-partitionable array -to be used. "mdp", "part" or "p" causes a partitionable array (2.6 and -later) to be used. "yes" requires the named md device to have -a 'standard' format, and the type and minor number will be determined -from this. See DEVICE NAMES below. - -The argument can also come immediately after -"\-a". e.g. "\-ap". 
- -If -.B \-\-auto -is not given on the command line or in the config file, then -the default will be -.BR \-\-auto=yes . - -If -.B \-\-scan -is also given, then any -.I auto= -entries in the config file will override the -.B \-\-auto -instruction given on the command line. - -For partitionable arrays, -.I mdadm -will create the device file for the whole array and for the first 4 -partitions. A different number of partitions can be specified at the -end of this option (e.g. -.BR \-\-auto=p7 ). -If the device name ends with a digit, the partition names add a 'p', -and a number, e.g. "/dev/home1p3". If there is no -trailing digit, then the partition names just have a number added, -e.g. "/dev/scratch3". - -If the md device name is in a 'standard' format as described in DEVICE -NAMES, then it will be created, if necessary, with the appropriate -number based on that name. If the device name is not in one of these -formats, then an unused minor number will be allocated. The minor -number will be considered unused if there is no active array for that -number, and there is no entry in /dev for that number and with a -non-standard name. - -.TP -.BR \-\-symlink = no -Normally when -.B \-\-auto -causes -.I mdadm -to create devices in -.B /dev/md/ -it will also create symlinks from -.B /dev/ -with names starting with -.B md -or -.BR md_ . -Use -.B \-\-symlink=no -to suppress this, or -.B \-\-symlink=yes -to enforce this even if it is suppressed in -.IR mdadm.conf . - - -.SH For assemble: - -.TP -.BR \-u ", " \-\-uuid= -uuid of array to assemble. Devices which don't have this uuid are -excluded - -.TP -.BR \-m ", " \-\-super\-minor= -Minor number of device that array was created for. Devices which -don't have this minor number are excluded. If you create an array as -/dev/md1, then all superblocks will contain the minor number 1, even if -the array is later assembled as /dev/md2. 
- -Giving the literal word "dev" for -.B \-\-super\-minor -will cause -.I mdadm -to use the minor number of the md device that is being assembled. -e.g. when assembling -.BR /dev/md0 , -.B \-\-super\-minor=dev -will look for super blocks with a minor number of 0. - -.TP -.BR \-N ", " \-\-name= -Specify the name of the array to assemble. This must be the name -that was specified when creating the array. It must either match -the name stored in the superblock exactly, or it must match -with the current -.I homehost -prefixed to the start of the given name. - -.TP -.BR \-f ", " \-\-force -Assemble the array even if some superblocks appear out-of-date - -.TP -.BR \-R ", " \-\-run -Attempt to start the array even if fewer drives were given than were -present last time the array was active. Normally if not all the -expected drives are found and -.B \-\-scan -is not used, then the array will be assembled but not started. -With -.B \-\-run -an attempt will be made to start it anyway. - -.TP -.B \-\-no\-degraded -This is the reverse of -.B \-\-run -in that it inhibits the startup of array unless all expected drives -are present. This is only needed with -.B \-\-scan, -and can be used if the physical connections to devices are -not as reliable as you would like. - -.TP -.BR \-a ", " "\-\-auto{=no,yes,md,mdp,part}" -See this option under Create and Build options. - -.TP -.BR \-b ", " \-\-bitmap= -Specify the bitmap file that was given when the array was created. If -an array has an -.B internal -bitmap, there is no need to specify this when assembling the array. - -.TP -.BR \-\-backup\-file= -If -.B \-\-backup\-file -was used to grow the number of raid-devices in a RAID5, and the system -crashed during the critical section, then the same -.B \-\-backup\-file -must be presented to -.B \-\-assemble -to allow possibly corrupted data to be restored. - -.TP -.BR \-U ", " \-\-update= -Update the superblock on each device while assembling the array. 
The -argument given to this flag can be one of -.BR sparc2.2 , -.BR summaries , -.BR uuid , -.BR name , -.BR homehost , -.BR resync , -.BR byteorder , -.BR devicesize , -or -.BR super\-minor . - -The -.B sparc2.2 -option will adjust the superblock of an array that was created on a Sparc -machine running a patched 2.2 Linux kernel. This kernel got the -alignment of part of the superblock wrong. You can use the -.B "\-\-examine \-\-sparc2.2" -option to -.I mdadm -to see what effect this would have. - -The -.B super\-minor -option will update the -.B "preferred minor" -field on each superblock to match the minor number of the array being -assembled. -This can be useful if -.B \-\-examine -reports a different "Preferred Minor" to -.BR \-\-detail . -In some cases this update will be performed automatically -by the kernel driver. In particular the update happens automatically -at the first write to an array with redundancy (RAID level 1 or -greater) on a 2.6 (or later) kernel. - -The -.B uuid -option will change the uuid of the array. If a UUID is given with the -.B \-\-uuid -option that UUID will be used as a new UUID and will -.B NOT -be used to help identify the devices in the array. -If no -.B \-\-uuid -is given, a random UUID is chosen. - -The -.B name -option will change the -.I name -of the array as stored in the superblock. This is only supported for -version-1 superblocks. - -The -.B homehost -option will change the -.I homehost -as recorded in the superblock. For version-0 superblocks, this is the -same as updating the UUID. -For version-1 superblocks, this involves updating the name. - -The -.B resync -option will cause the array to be marked -.I dirty -meaning that any redundancy in the array (e.g. parity for raid5, -copies for raid1) may be incorrect. This will cause the raid system -to perform a "resync" pass to make sure that all redundant information -is correct. 
- -The -.B byteorder -option allows arrays to be moved between machines with different -byte-order. -When assembling such an array for the first time after a move, giving -.B "\-\-update=byteorder" -will cause -.I mdadm -to expect superblocks to have their byteorder reversed, and will -correct that order before assembling the array. This is only valid -with original (Version 0.90) superblocks. - -The -.B summaries -option will correct the summaries in the superblock. That is the -counts of total, working, active, failed, and spare devices. - -The -.B devicesize -will rarely be of use. It applies to version 1.1 and 1.2 metadata -only (where the metadata is at the start of the device) and is only -useful when the component device has changed size (typically become -larger). The version 1 metadata records the amount of the device that -can be used to store data, so if a device in a version 1.1 or 1.2 -array becomes larger, the metadata will still be visible, but the -extra space will not. In this case it might be useful to assemble the -array with -.BR \-\-update=devicesize . -This will cause -.I mdadm -to determine the maximum usable amount of space on each device and -update the relevant field in the metadata. - -.TP -.B \-\-auto\-update\-homehost -This flag is only meaningful with auto-assembly (see discussion below). -In that situation, if no suitable arrays are found for this homehost, -.I mdadm -will rescan for any arrays at all and will assemble them and update the -homehost to match the current host. - -.SH For Manage mode: - -.TP -.BR \-a ", " \-\-add -hot-add listed devices. - -.TP -.BR \-\-re\-add -re-add a device that was recently removed from an array. - -.TP -.BR \-r ", " \-\-remove -remove listed devices. They must not be active. i.e. they should -be failed or spare devices. As well as the name of a device file -(e.g. -.BR /dev/sda1 ) -the words -.B failed -and -.B detached -can be given to -.BR \-\-remove . 
-The first causes all failed devices to be removed. The second causes -any device which is no longer connected to the system (i.e. an 'open' -returns -.BR ENXIO ) -to be removed. This will only succeed for devices that are spares or -have already been marked as failed. - -.TP -.BR \-f ", " \-\-fail -mark listed devices as faulty. -As well as the name of a device file, the word -.B detached -can be given. This will cause any device that has been detached from -the system to be marked as failed. It can then be removed. - -.TP -.BR \-\-set\-faulty -same as -.BR \-\-fail . - -.P -Each of these options requires that the first device listed is the array -to be acted upon, and the remainder are component devices to be added, -removed, or marked as faulty. Several different operations can be -specified for different devices, e.g. -.in +5 -mdadm /dev/md0 \-\-add /dev/sda1 \-\-fail /dev/sdb1 \-\-remove /dev/sdb1 -.in -5 -Each operation applies to all devices listed until the next -operation. - -If an array is using a write-intent bitmap, then devices which have -been removed can be re-added in a way that avoids a full -reconstruction but instead just updates the blocks that have changed -since the device was removed. For arrays with persistent metadata -(superblocks) this is done automatically. For arrays created with -.B \-\-build -mdadm needs to be told that this device was removed recently with -.BR \-\-re\-add . - -Devices can only be removed from an array if they are not in active -use, i.e. they must be spares or failed devices. To remove an active -device, it must first be marked as -.B faulty. - -.SH For Misc mode: - -.TP -.BR \-Q ", " \-\-query -Examine a device to see -(1) if it is an md device and (2) if it is a component of an md -array. -Information about what is discovered is presented. - -.TP -.BR \-D ", " \-\-detail -Print detail of one or more md devices. 
- -.TP -.BR \-Y ", " \-\-export -When used with -.B \-\-detail -or -.BR \-\-examine , -output will be formatted as -.B key=value -pairs for easy import into the environment. - -.TP -.BR \-E ", " \-\-examine -Print content of md superblock on device(s). -.TP -.B \-\-sparc2.2 -If an array was created on a 2.2 Linux kernel patched with RAID -support, the superblock will have been created incorrectly, or at -least incompatibly with 2.4 and later kernels. Using the -.B \-\-sparc2.2 -flag with -.B \-\-examine -will fix the superblock before displaying it. If this appears to do -the right thing, then the array can be successfully assembled using -.BR "\-\-assemble \-\-update=sparc2.2" . - -.TP -.BR \-X ", " \-\-examine\-bitmap -Report information about a bitmap file. -The argument is either an external bitmap file or an array component -in case of an internal bitmap. - -.TP -.BR \-R ", " \-\-run -start a partially built array. - -.TP -.BR \-S ", " \-\-stop -deactivate array, releasing all resources. - -.TP -.BR \-o ", " \-\-readonly -mark array as readonly. - -.TP -.BR \-w ", " \-\-readwrite -mark array as readwrite. - -.TP -.B \-\-zero\-superblock -If the device contains a valid md superblock, the block is -overwritten with zeros. With -.B \-\-force -the block where the superblock would be is overwritten even if it -doesn't appear to be valid. - -.TP -.BR \-t ", " \-\-test -When used with -.BR \-\-detail , -the exit status of -.I mdadm -is set to reflect the status of the device. - -.TP -.BR \-W ", " \-\-wait -For each md device given, wait for any resync, recovery, or reshape -activity to finish before returning. -.I mdadm -will return with success if it actually waited for every device -listed, otherwise it will return failure. - -.SH For Incremental Assembly mode: -.TP -.BR \-\-rebuild\-map ", " \-r -Rebuild the map file -.RB ( /var/run/mdadm/map ) -that -.I mdadm -uses to help track which arrays are currently being assembled. 
- -.TP -.BR \-\-run ", " \-R -Run any array assembled as soon as a minimal number of devices are -available, rather than waiting until all expected devices are present. - -.TP -.BR \-\-scan ", " \-s -Only meaningful with -.B \-R -this will scan the -.B map -file for arrays that are being incrementally assembled and will try to -start any that are not already started. If any such array is listed -in -.B mdadm.conf -as requiring an external bitmap, that bitmap will be attached first. - -.SH For Monitor mode: -.TP -.BR \-m ", " \-\-mail -Give a mail address to send alerts to. - -.TP -.BR \-p ", " \-\-program ", " \-\-alert -Give a program to be run whenever an event is detected. - -.TP -.BR \-y ", " \-\-syslog -Cause all events to be reported through 'syslog'. The messages have -facility of 'daemon' and varying priorities. - -.TP -.BR \-d ", " \-\-delay -Give a delay in seconds. -.B mdadm -polls the md arrays and then waits this many seconds before polling -again. The default is 60 seconds. - -.TP -.BR \-f ", " \-\-daemonise -Tell -.B mdadm -to run as a background daemon if it decides to monitor anything. This -causes it to fork and run in the child, and to disconnect from the -terminal. The process id of the child is written to stdout. -This is useful with -.B \-\-scan -which will only continue monitoring if a mail address or alert program -is found in the config file. - -.TP -.BR \-i ", " \-\-pid\-file -When -.B mdadm -is running in daemon mode, write the pid of the daemon process to -the specified file, instead of printing it on standard output. - -.TP -.BR \-1 ", " \-\-oneshot -Check arrays only once. This will generate -.B NewArray -events and more significantly -.B DegradedArray -and -.B SparesMissing -events. Running -.in +5 -.B " mdadm \-\-monitor \-\-scan \-1" -.in -5 -from a cron script will ensure regular notification of any degraded arrays. - -.TP -.BR \-t ", " \-\-test -Generate a -.B TestMessage -alert for every array found at startup. 
This alert gets mailed and -passed to the alert program. This can be used for testing that alert -messages do get through successfully. - -.SH ASSEMBLE MODE - -.HP 12 -Usage: -.B mdadm \-\-assemble -.I md-device options-and-component-devices... -.HP 12 -Usage: -.B mdadm \-\-assemble \-\-scan -.I md-devices-and-options... -.HP 12 -Usage: -.B mdadm \-\-assemble \-\-scan -.I options... - -.PP -This usage assembles one or more raid arrays from pre-existing components. -For each array, mdadm needs to know the md device, the identity of the -array, and a number of component-devices. These can be found in a number of ways. - -In the first usage example (without the -.BR \-\-scan ) -the first device given is the md device. -In the second usage example, all devices listed are treated as md -devices and assembly is attempted. -In the third (where no devices are listed) all md devices that are -listed in the configuration file are assembled. - -If precisely one device is listed, but -.B \-\-scan -is not given, then -.I mdadm -acts as though -.B \-\-scan -was given and identity information is extracted from the configuration file. - -The identity can be given with the -.B \-\-uuid -option, with the -.B \-\-super\-minor -option, will be taken from the md-device record in the config file, or -will be taken from the super block of the first component-device -listed on the command line. - -Devices can be given on the -.B \-\-assemble -command line or in the config file. Only devices which have an md -superblock which contains the right identity will be considered for -any array. - -The config file is only used if explicitly named with -.B \-\-config -or requested with (a possibly implicit) -.BR \-\-scan . -In the latter case, -.B /etc/mdadm/mdadm.conf -is used. - -If -.B \-\-scan -is not given, then the config file will only be used to find the -identity of md arrays. - -Normally the array will be started after it is assembled. 
However if -.B \-\-scan -is not given and insufficient drives were listed to start a complete -(non-degraded) array, then the array is not started (to guard against -usage errors). To insist that the array be started in this case (as -may work for RAID1, 4, 5, 6, or 10), give the -.B \-\-run -flag. - -If the md device does not exist, then it will be created providing the -intent is clear. i.e. the name must be in a standard form, or the -.B \-\-auto -option must be given to clarify how and whether the device should be -created. -This can be useful for handling partitioned devices (which don't have -a stable device number \(em it can change after a reboot) and when using -"udev" to manage your -.B /dev -tree (udev cannot handle md devices because of the unusual device -initialisation conventions). - -If the option to "auto" is "mdp" or "part" or (on the command line -only) "p", then mdadm will create a partitionable array, using the -first free one that is not in use and does not already have an entry -in /dev (apart from numeric /dev/md* entries). - -If the option to "auto" is "yes" or "md" or (on the command line) -nothing, then mdadm will create a traditional, non-partitionable md -array. - -It is expected that the "auto" functionality will be used to create -device entries with meaningful names such as "/dev/md/home" or -"/dev/md/root", rather than names based on the numerical array number. - -When using option "auto" to create a partitionable array, the device -files for the first 4 partitions are also created. If a different -number is required it can be simply appended to the auto option. -e.g. "auto=part8". Partition names are created by appending a digit -string to the device name, with an intervening "p" if the device name -ends with a digit. - -The -.B \-\-auto -option is also available in Build and Create modes. As those modes do -not use a config file, the "auto=" config option does not apply to -these modes. 
- -.SS Auto Assembly -When -.B \-\-assemble -is used with -.B \-\-scan -and no devices are listed, -.I mdadm -will first attempt to assemble all the arrays listed in the config -file. - -If a -.B homehost -has been specified (either in the config file or on the command line), -.I mdadm -will look further for possible arrays and will try to assemble -anything that it finds which is tagged as belonging to the given -homehost. This is the only situation where -.I mdadm -will assemble arrays without being given specific device name or -identity information for the array. - -If -.I mdadm -finds a consistent set of devices that look like they should comprise -an array, and if the superblock is tagged as belonging to the given -home host, it will automatically choose a device name and try to -assemble the array. If the array uses version-0.90 metadata, then the -.B minor -number as recorded in the superblock is used to create a name in -.B /dev/md/ -so for example -.BR /dev/md/3 . -If the array uses version-1 metadata, then the -.B name -from the superblock is used to similarly create a name in -.BR /dev/md -(the name will have any 'host' prefix stripped first). - -If -.I mdadm -cannot find any array for the given host at all, and if -.B \-\-auto\-update\-homehost -is given, then -.I mdadm -will search again for any array (not just an array created for this -host) and will assemble each assuming -.BR \-\-update=homehost . -This will change the host tag in the superblock so that on the next run, -these arrays will be found without the second pass. The intention of -this feature is to support transitioning a set of md arrays to using -homehost tagging. - -The reason for requiring arrays to be tagged with the homehost for -auto assembly is to guard against problems that can arise when moving -devices from one host to another. 
- -.SH BUILD MODE - -.HP 12 -Usage: -.B mdadm \-\-build -.I md-device -.BI \-\-chunk= X -.BI \-\-level= Y -.BI \-\-raid\-devices= Z -.I devices - -.PP -This usage is similar to -.BR \-\-create . -The difference is that it creates an array without a superblock. With -these arrays there is no difference between initially creating the array and -subsequently assembling the array, except that hopefully there is useful -data there in the second case. - -The level may raid0, linear, multipath, or faulty, or one of their -synonyms. All devices must be listed and the array will be started -once complete. - -.SH CREATE MODE - -.HP 12 -Usage: -.B mdadm \-\-create -.I md-device -.BI \-\-chunk= X -.BI \-\-level= Y -.br -.BI \-\-raid\-devices= Z -.I devices - -.PP -This usage will initialise a new md array, associate some devices with -it, and activate the array. - -If the -.B \-\-auto -option is given (as described in more detail in the section on -Assemble mode), then the md device will be created with a suitable -device number if necessary. - -As devices are added, they are checked to see if they contain raid -superblocks or filesystems. They are also checked to see if the variance in -device size exceeds 1%. - -If any discrepancy is found, the array will not automatically be run, though -the presence of a -.B \-\-run -can override this caution. - -To create a "degraded" array in which some devices are missing, simply -give the word "\fBmissing\fP" -in place of a device name. This will cause -.B mdadm -to leave the corresponding slot in the array empty. -For a RAID4 or RAID5 array at most one slot can be -"\fBmissing\fP"; for a RAID6 array at most two slots. -For a RAID1 array, only one real device needs to be given. All of the -others can be -"\fBmissing\fP". - -When creating a RAID5 array, -.B mdadm -will automatically create a degraded array with an extra spare drive. 
-This is because building the spare into a degraded array is in general faster than resyncing -the parity on a non-degraded, but not clean, array. This feature can -be overridden with the -.B \-\-force -option. - -When creating an array with version-1 metadata a name for the host is -required. -If this is not given with the -.B \-\-name -option, -.I mdadm -will chose a name based on the last component of the name of the -device being created. So if -.B /dev/md3 -is being created, then the name -.B 3 -will be chosen. -If -.B /dev/md/home -is being created, then the name -.B home -will be used. - -A new array will normally get a randomly assigned 128bit UUID which is -very likely to be unique. If you have a specific need, you can choose -a UUID for the array by giving the -.B \-\-uuid= -option. Be warned that creating two arrays with the same UUID is a -recipe for disaster. Also, using -.B \-\-uuid= -when creating a v0.90 array will silently override any -.B \-\-homehost= -setting. -.\"If the -.\".B \-\-size -.\"option is given, it is not necessary to list any component-devices in this command. -.\"They can be added later, before a -.\".B \-\-run. -.\"If no -.\".B \-\-size -.\"is given, the apparent size of the smallest drive given is used. - -The General Management options that are valid with -.B \-\-create -are: -.TP -.B \-\-run -insist on running the array even if some devices look like they might -be in use. - -.TP -.B \-\-readonly -start the array readonly \(em not supported yet. - - -.SH MANAGE MODE -.HP 12 -Usage: -.B mdadm -.I device -.I options... devices... -.PP - -This usage will allow individual devices in an array to be failed, -removed or added. It is possible to perform multiple operations with -on command. For example: -.br -.B " mdadm /dev/md0 \-f /dev/hda1 \-r /dev/hda1 \-a /dev/hda1" -.br -will firstly mark -.B /dev/hda1 -as faulty in -.B /dev/md0 -and will then remove it from the array and finally add it back -in as a spare. 
However only one md array can be affected by a single -command. - -.SH MISC MODE -.HP 12 -Usage: -.B mdadm -.I options ... -.I devices ... -.PP - -MISC mode includes a number of distinct operations that -operate on distinct devices. The operations are: -.TP -.B \-\-query -The device is examined to see if it is -(1) an active md array, or -(2) a component of an md array. -The information discovered is reported. - -.TP -.B \-\-detail -The device should be an active md device. -.B mdadm -will display a detailed description of the array. -.B \-\-brief -or -.B \-\-scan -will cause the output to be less detailed and the format to be -suitable for inclusion in -.BR /etc/mdadm/mdadm.conf . -The exit status of -.I mdadm -will normally be 0 unless -.I mdadm -failed to get useful information about the device(s); however, if the -.B \-\-test -option is given, then the exit status will be: -.RS -.TP -0 -The array is functioning normally. -.TP -1 -The array has at least one failed device. -.TP -2 -The array has multiple failed devices such that it is unusable. -.TP -4 -There was an error while trying to get information about the device. -.RE - -.TP -.B \-\-examine -The device should be a component of an md array. -.B mdadm -will read the md superblock of the device and display the contents. -If -.B \-\-brief -or -.B \-\-scan -is given, then multiple devices that are components of the one array -are grouped together and reported in a single entry suitable -for inclusion in -.BR /etc/mdadm/mdadm.conf . - -Having -.B \-\-scan -without listing any devices will cause all devices listed in the -config file to be examined. - -.TP -.B \-\-stop -The devices should be active md arrays which will be deactivated, as -long as they are not currently in use. - -.TP -.B \-\-run -This will fully activate a partially assembled md array. - -.TP -.B \-\-readonly -This will mark an active array as read-only, providing that it is -not currently being used. 
- -.TP -.B \-\-readwrite -This will change a -.B readonly -array back to being read/write. - -.TP -.B \-\-scan -For all operations except -.BR \-\-examine , -.B \-\-scan -will cause the operation to be applied to all arrays listed in -.BR /proc/mdstat . -For -.BR \-\-examine, -.B \-\-scan -causes all devices listed in the config file to be examined. - - -.SH MONITOR MODE - -.HP 12 -Usage: -.B mdadm \-\-monitor -.I options... devices... - -.PP -This usage causes -.B mdadm -to periodically poll a number of md arrays and to report on any events -noticed. -.B mdadm -will never exit once it decides that there are arrays to be checked, -so it should normally be run in the background. - -As well as reporting events, -.B mdadm -may move a spare drive from one array to another if they are in the -same -.B spare-group -and if the destination array has a failed drive but no spares. - -If any devices are listed on the command line, -.B mdadm -will only monitor those devices. Otherwise all arrays listed in the -configuration file will be monitored. Further, if -.B \-\-scan -is given, then any other md devices that appear in -.B /proc/mdstat -will also be monitored. - -The result of monitoring the arrays is the generation of events. -These events are passed to a separate program (if specified) and may -be mailed to a given E-mail address. - -When passing events to a program, the program is run once for each event, -and is given 2 or 3 command-line arguments: the first is the -name of the event (see below), the second is the name of the -md device which is affected, and the third is the name of a related -device if relevant (such as a component device that has failed). - -If -.B \-\-scan -is given, then a program or an E-mail address must be specified on the -command line or in the config file. If neither are available, then -.B mdadm -will not monitor anything. -Without -.B \-\-scan, -.B mdadm -will continue monitoring as long as something was found to monitor. 
If -no program or email is given, then each event is reported to -.BR stdout . - -The different events are: - -.RS 4 -.TP -.B DeviceDisappeared -An md array which previously was configured appears to no longer be -configured. (syslog priority: Critical) - -If -.I mdadm -was told to monitor an array which is RAID0 or Linear, then it will -report -.B DeviceDisappeared -with the extra information -.BR Wrong-Level . -This is because RAID0 and Linear do not support the device-failed, -hot-spare and resync operations which are monitored. - -.TP -.B RebuildStarted -An md array started reconstruction. (syslog priority: Warning) - -.TP -.BI Rebuild NN -Where -.I NN -is 20, 40, 60, or 80, this indicates that rebuild has passed that many -percentage of the total. (syslog priority: Warning) - -.TP -.B RebuildFinished -An md array that was rebuilding, isn't any more, either because it -finished normally or was aborted. (syslog priority: Warning) - -.TP -.B Fail -An active component device of an array has been marked as -faulty. (syslog priority: Critical) - -.TP -.B FailSpare -A spare component device which was being rebuilt to replace a faulty -device has failed. (syslog priority: Critical) - -.TP -.B SpareActive -A spare component device which was being rebuilt to replace a faulty -device has been successfully rebuilt and has been made active. -(syslog priority: Info) - -.TP -.B NewArray -A new md array has been detected in the -.B /proc/mdstat -file. (syslog priority: Info) - -.TP -.B DegradedArray -A newly noticed array appears to be degraded. This message is not -generated when -.I mdadm -notices a drive failure which causes degradation, but only when -.I mdadm -notices that an array is degraded when it first sees the array. -(syslog priority: Critical) - -.TP -.B MoveSpare -A spare drive has been moved from one array in a -.B spare-group -to another to allow a failed drive to be replaced. 
-(syslog priority: Info) - -.TP -.B SparesMissing -If -.I mdadm -has been told, via the config file, that an array should have a certain -number of spare devices, and -.I mdadm -detects that it has fewer than this number when it first sees the -array, it will report a -.B SparesMissing -message. -(syslog priority: Warning) - -.TP -.B TestMessage -An array was found at startup, and the -.B \-\-test -flag was given. -(syslog priority: Info) -.RE - -Only -.B Fail, -.B FailSpare, -.B DegradedArray, -.B SparesMissing -and -.B TestMessage -cause Email to be sent. All events cause the program to be run. -The program is run with two or three arguments: the event -name, the array device and possibly a second device. - -Each event has an associated array device (e.g. -.BR /dev/md1 ) -and possibly a second device. For -.BR Fail , -.BR FailSpare , -and -.B SpareActive -the second device is the relevant component device. -For -.B MoveSpare -the second device is the array that the spare was moved from. - -For -.B mdadm -to move spares from one array to another, the different arrays need to -be labeled with the same -.B spare-group -in the configuration file. The -.B spare-group -name can be any string; it is only necessary that different spare -groups use different names. - -When -.B mdadm -detects that an array in a spare group has fewer active -devices than necessary for the complete array, and has no spare -devices, it will look for another array in the same spare group that -has a full complement of working drive and a spare. It will then -attempt to remove the spare from the second drive and add it to the -first. -If the removal succeeds but the adding fails, then it is added back to -the original array. - -.SH GROW MODE -The GROW mode is used for changing the size or shape of an active -array. -For this to work, the kernel must support the necessary change. 
-Various types of growth are being added during 2.6 development, -including restructuring a raid5 array to have more active devices. - -Currently the only support available is to -.IP \(bu 4 -change the "size" attribute -for RAID1, RAID5 and RAID6. -.IP \(bu 4 -increase the "raid-disks" attribute of RAID1, RAID5, and RAID6. -.IP \(bu 4 -add a write-intent bitmap to any array which supports these bitmaps, or -remove a write-intent bitmap from such an array. -.PP - -.SS SIZE CHANGES -Normally when an array is built the "size" it taken from the smallest -of the drives. If all the small drives in an arrays are, one at a -time, removed and replaced with larger drives, then you could have an -array of large drives with only a small amount used. In this -situation, changing the "size" with "GROW" mode will allow the extra -space to start being used. If the size is increased in this way, a -"resync" process will start to make sure the new parts of the array -are synchronised. - -Note that when an array changes size, any filesystem that may be -stored in the array will not automatically grow to use the space. The -filesystem will need to be explicitly told to use the extra space. - -.SS RAID-DEVICES CHANGES - -A RAID1 array can work with any number of devices from 1 upwards -(though 1 is not very useful). There may be times which you want to -increase or decrease the number of active devices. Note that this is -different to hot-add or hot-remove which changes the number of -inactive devices. - -When reducing the number of devices in a RAID1 array, the slots which -are to be removed from the array must already be vacant. That is, the -devices which were in those slots must be failed and removed. - -When the number of devices is increased, any hot spares that are -present will be activated immediately. - -Increasing the number of active devices in a RAID5 is much more -effort. Every block in the array will need to be read and written -back to a new location. 
From 2.6.17, the Linux Kernel is able to do -this safely, including restart and interrupted "reshape". - -When relocating the first few stripes on a raid5, it is not possible -to keep the data on disk completely consistent and crash-proof. To -provide the required safety, mdadm disables writes to the array while -this "critical section" is reshaped, and takes a backup of the data -that is in that section. This backup is normally stored in any spare -devices that the array has, however it can also be stored in a -separate file specified with the -.B \-\-backup\-file -option. If this option is used, and the system does crash during the -critical period, the same file must be passed to -.B \-\-assemble -to restore the backup and reassemble the array. - -.SS BITMAP CHANGES - -A write-intent bitmap can be added to, or removed from, an active -array. Either internal bitmaps, or bitmaps stored in a separate file, -can be added. Note that if you add a bitmap stored in a file which is -in a filesystem that is on the raid array being affected, the system -will deadlock. The bitmap must be on a separate filesystem. - -.SH INCREMENTAL MODE - -.HP 12 -Usage: -.B mdadm \-\-incremental -.RB [ \-\-run ] -.RB [ \-\-quiet ] -.I component-device -.HP 12 -Usage: -.B mdadm \-\-incremental \-\-rebuild -.HP 12 -Usage: -.B mdadm \-\-incremental \-\-run \-\-scan - - -.PP -This mode is designed to be used in conjunction with a device -discovery system. As devices are found in a system, they can be -passed to -.B "mdadm \-\-incremental" -to be conditionally added to an appropriate array. - -.I mdadm -performs a number of tests to determine if the device is part of an -array, and which array it should be part of. If an appropriate array -is found, or can be created, -.I mdadm -adds the device to the array and conditionally starts the array. - -Note that -.I mdadm -will only add devices to an array which were previously working -(active or spare) parts of that array. 
It does not currently support -automatic inclusion of a new drive as a spare in some array. - -.B "mdadm \-\-incremental" -requires a bug-fix in all kernels through 2.6.19. -Hopefully, this will be fixed in 2.6.20; alternately, apply the patch -which is included with the mdadm source distribution. If -.I mdadm -detects that this bug is present, it will abort any attempt to use -.BR \-\-incremental . - -The tests that -.I mdadm -makes are as follow: -.IP + -Is the device permitted by -.BR mdadm.conf ? -That is, is it listed in a -.B DEVICES -line in that file. If -.B DEVICES -is absent then the default it to allow any device. Similar if -.B DEVICES -contains the special word -.B partitions -then any device is allowed. Otherwise the device name given to -.I mdadm -must match one of the names or patterns in a -.B DEVICES -line. - -.IP + -Does the device have a valid md superblock. If a specific metadata -version is request with -.B \-\-metadata -or -.B \-e -then only that style of metadata is accepted, otherwise -.I mdadm -finds any known version of metadata. If no -.I md -metadata is found, the device is rejected. - -.IP + -Does the metadata match an expected array? -The metadata can match in two ways. Either there is an array listed -in -.B mdadm.conf -which identifies the array (either by UUID, by name, by device list, -or by minor-number), or the array was created with a -.B homehost -specified and that -.B homehost -matches the one in -.B mdadm.conf -or on the command line. -If -.I mdadm -is not able to positively identify the array as belonging to the -current host, the device will be rejected. - -.IP + -.I mdadm -keeps a list of arrays that it has partially assembled in -.B /var/run/mdadm/map -(or -.B /var/run/mdadm.map -if the directory doesn't exist). If no array exists which matches -the metadata on the new device, -.I mdadm -must choose a device name and unit number. 
It does this based on any -name given in -.B mdadm.conf -or any name information stored in the metadata. If this name -suggests a unit number, that number will be used, otherwise a free -unit number will be chosen. Normally -.I mdadm -will prefer to create a partitionable array, however if the -.B CREATE -line in -.B mdadm.conf -suggests that a non-partitionable array is preferred, that will be -honoured. - -.IP + -Once an appropriate array is found or created and the device is added, -.I mdadm -must decide if the array is ready to be started. It will -normally compare the number of available (non-spare) devices to the -number of devices that the metadata suggests need to be active. If -there are at least that many, the array will be started. This means -that if any devices are missing the array will not be restarted. - -As an alternative, -.B \-\-run -may be passed to -.B mdadm -in which case the array will be run as soon as there are enough -devices present for the data to be accessible. For a raid1, that -means one device will start the array. For a clean raid5, the array -will be started as soon as all but one drive is present. - -Note that neither of these approaches is really ideal. If it can -be known that all device discovery has completed, then -.br -.B " mdadm \-IRs" -.br -can be run which will try to start all arrays that are being -incrementally assembled. They are started in "read-auto" mode in -which they are read-only until the first write request. This means -that no metadata updates are made and no attempt at resync or recovery -happens. Further devices that are found before the first write can -still be added safely. - -.SH EXAMPLES - -.B " mdadm \-\-query /dev/name-of-device" -.br -This will find out if a given device is a raid array, or is part of -one, and will provide brief information about the device. - -.B " mdadm \-\-assemble \-\-scan" -.br -This will assemble and start all arrays listed in the standard config -file. 
This command will typically go in a system startup file. - -.B " mdadm \-\-stop \-\-scan" -.br -This will shut down all arrays that can be shut down (i.e. are not -currently in use). This will typically go in a system shutdown script. - -.B " mdadm \-\-follow \-\-scan \-\-delay=120" -.br -If (and only if) there is an Email address or program given in the -standard config file, then -monitor the status of all arrays listed in that file by -polling them ever 2 minutes. - -.B " mdadm \-\-create /dev/md0 \-\-level=1 \-\-raid\-devices=2 /dev/hd[ac]1" -.br -Create /dev/md0 as a RAID1 array consisting of /dev/hda1 and /dev/hdc1. - -.br -.B " echo 'DEVICE /dev/hd*[0\-9] /dev/sd*[0\-9]' > mdadm.conf" -.br -.B " mdadm \-\-detail \-\-scan >> mdadm.conf" -.br -This will create a prototype config file that describes currently -active arrays that are known to be made from partitions of IDE or SCSI drives. -This file should be reviewed before being used as it may -contain unwanted detail. - -.B " echo 'DEVICE /dev/hd[a\-z] /dev/sd*[a\-z]' > mdadm.conf" -.br -.B " mdadm \-\-examine \-\-scan \-\-config=mdadm.conf >> mdadm.conf" -.br -This will find arrays which could be assembled from existing IDE and -SCSI whole drives (not partitions), and store the information in the -format of a config file. -This file is very likely to contain unwanted detail, particularly -the -.B devices= -entries. It should be reviewed and edited before being used as an -actual config file. - -.B " mdadm \-\-examine \-\-brief \-\-scan \-\-config=partitions" -.br -.B " mdadm \-Ebsc partitions" -.br -Create a list of devices by reading -.BR /proc/partitions , -scan these for RAID superblocks, and printout a brief listing of all -that were found. - -.B " mdadm \-Ac partitions \-m 0 /dev/md0" -.br -Scan all partitions and devices listed in -.BR /proc/partitions -and assemble -.B /dev/md0 -out of all such devices with a RAID superblock with a minor number of 0. 
- -.B " mdadm \-\-monitor \-\-scan \-\-daemonise > /var/run/mdadm" -.br -If config file contains a mail address or alert program, run mdadm in -the background in monitor mode monitoring all md devices. Also write -pid of mdadm daemon to -.BR /var/run/mdadm . - -.B " mdadm \-Iq /dev/somedevice" -.br -Try to incorporate newly discovered device into some array as -appropriate. - -.B " mdadm \-\-incremental \-\-rebuild \-\-run \-\-scan" -.br -Rebuild the array map from any current arrays, and then start any that -can be started. - -.B " mdadm /dev/md4 --fail detached --remove detached" -.br -Any devices which are components of /dev/md4 will be marked as faulty -and then remove from the array. - -.B " mdadm \-\-create \-\-help" -.br -Provide help about the Create mode. - -.B " mdadm \-\-config \-\-help" -.br -Provide help about the format of the config file. - -.B " mdadm \-\-help" -.br -Provide general help. - - -.SH FILES - -.SS /proc/mdstat - -If you're using the -.B /proc -filesystem, -.B /proc/mdstat -lists all active md devices with information about them. -.B mdadm -uses this to find arrays when -.B \-\-scan -is given in Misc mode, and to monitor array reconstruction -on Monitor mode. - - -.SS /etc/mdadm/mdadm.conf - -The config file lists which devices may be scanned to see if -they contain MD super block, and gives identifying information -(e.g. UUID) about known MD arrays. See -.BR mdadm.conf (5) -for more details. - -.SS /var/run/mdadm/map -When -.B \-\-incremental -mode is used, this file gets a list of arrays currently being created. -If -.B /var/run/mdadm -does not exist as a directory, then -.B /var/run/mdadm.map -is used instead. - -.SH DEVICE NAMES - -While entries in the /dev directory can have any format you like, -.I mdadm -has an understanding of 'standard' formats which it uses to guide its -behaviour when creating device files via the -.B \-\-auto -option. 
- -The standard names for non-partitioned arrays (the only sort of md -array available in 2.4 and earlier) are either of -.IP -/dev/mdNN -.br -/dev/md/NN -.PP -where NN is a number. -The standard names for partitionable arrays (as available from 2.6 -onwards) are either of -.IP -/dev/md/dNN -.br -/dev/md_dNN -.PP -Partition numbers should be indicated by added "pMM" to these, thus "/dev/md/d1p2". - -.SH NOTE -.B mdadm -was previously known as -.BR mdctl . -.P -.B mdadm -is completely separate from the -.B raidtools -package, and does not use the -.I /etc/raidtab -configuration file at all. - -.SH SEE ALSO -For further information on mdadm usage, MD and the various levels of -RAID, see: -.IP -.UR http://linux-raid.osdl.org/ -http://linux\-raid.osdl.org/ -.PP -(based upon Jakob \(/Ostergaard's Software\-RAID.HOWTO) -.\".PP -.\"for new releases of the RAID driver check out: -.\" -.\".IP -.\".UR ftp://ftp.kernel.org/pub/linux/kernel/people/mingo/raid-patches -.\"ftp://ftp.kernel.org/pub/linux/kernel/people/mingo/raid-patches -.\".UE -.\".PP -.\"or -.\".IP -.\".UR http://www.cse.unsw.edu.au/~neilb/patches/linux-stable/ -.\"http://www.cse.unsw.edu.au/~neilb/patches/linux-stable/ -.\".UE -.PP -The latest version of -.I mdadm -should always be available from -.IP -.UR http://www.kernel.org/pub/linux/utils/raid/mdadm/ -http://www.kernel.org/pub/linux/utils/raid/mdadm/ -.PP -.IR mdadm.conf (5), -.IR md (4). -.PP -.IR raidtab (5), -.IR raid0run (8), -.IR raidstop (8), -.IR mkraid (8). diff -Nru mdadm-2.6.7.1/mdadm.8.in mdadm-3.1.4/mdadm.8.in --- mdadm-2.6.7.1/mdadm.8.in 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/mdadm.8.in 2010-08-31 10:21:13.000000000 +0300 @@ -0,0 +1,2673 @@ +.\" -*- nroff -*- +.\" Copyright Neil Brown and others. 
+.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. +.\" See file COPYING in distribution for details. +.TH MDADM 8 "" v3.1.4 +.SH NAME +mdadm \- manage MD devices +.I aka +Linux Software RAID + +.SH SYNOPSIS + +.BI mdadm " [mode] [options] " + +.SH DESCRIPTION +RAID devices are virtual devices created from two or more +real block devices. This allows multiple devices (typically disk +drives or partitions thereof) to be combined into a single device to +hold (for example) a single filesystem. +Some RAID levels include redundancy and so can survive some degree of +device failure. + +Linux Software RAID devices are implemented through the md (Multiple +Devices) device driver. + +Currently, Linux supports +.B LINEAR +md devices, +.B RAID0 +(striping), +.B RAID1 +(mirroring), +.BR RAID4 , +.BR RAID5 , +.BR RAID6 , +.BR RAID10 , +.BR MULTIPATH , +.BR FAULTY , +and +.BR CONTAINER . + +.B MULTIPATH +is not a Software RAID mechanism, but does involve +multiple devices: +each device is a path to one common physical storage device. +New installations should not use md/multipath as it is not well +supported and has no ongoing development. Use the Device Mapper based +multipath-tools instead. + +.B FAULTY +is also not true RAID, and it only involves one device. It +provides a layer over a true device that can be used to inject faults. + +.B CONTAINER +is different again. A +.B CONTAINER +is a collection of devices that are +managed as a set. This is similar to the set of devices connected to +a hardware RAID controller. The set of devices may contain a number +of different RAID arrays each utilising some (or all) of the blocks from a +number of the devices in the set. For example, two devices in a 5-device set +might form a RAID1 using the whole devices. 
The remaining three might +have a RAID5 over the first half of each device, and a RAID0 over the +second half. + +With a +.BR CONTAINER , +there is one set of metadata that describes all of +the arrays in the container. So when +.I mdadm +creates a +.B CONTAINER +device, the device just represents the metadata. Other normal arrays (RAID1 +etc) can be created inside the container. + +.SH MODES +mdadm has several major modes of operation: +.TP +.B Assemble +Assemble the components of a previously created +array into an active array. Components can be explicitly given +or can be searched for. +.I mdadm +checks that the components +do form a bona fide array, and can, on request, fiddle superblock +information so as to assemble a faulty array. + +.TP +.B Build +Build an array that doesn't have per-device metadata (superblocks). For these +sorts of arrays, +.I mdadm +cannot differentiate between initial creation and subsequent assembly +of an array. It also cannot perform any checks that appropriate +components have been requested. Because of this, the +.B Build +mode should only be used together with a complete understanding of +what you are doing. + +.TP +.B Create +Create a new array with per-device metadata (superblocks). +Appropriate metadata is written to each device, and then the array +comprising those devices is activated. A 'resync' process is started +to make sure that the array is consistent (e.g. both sides of a mirror +contain the same data) but the content of the device is left otherwise +untouched. +The array can be used as soon as it has been created. There is no +need to wait for the initial resync to finish. + +.TP +.B "Follow or Monitor" +Monitor one or more md devices and act on any state changes. This is +only meaningful for RAID1, 4, 5, 6, 10 or multipath arrays, as +only these have interesting state. RAID0 or Linear never have +missing, spare, or failed drives, so there is nothing to monitor. 
+ +.TP +.B "Grow" +Grow (or shrink) an array, or otherwise reshape it in some way. +Currently supported growth options include changing the active size +of component devices and changing the number of active devices in RAID +levels 1/4/5/6, changing the RAID level between 1, 5, and 6, changing +the chunk size and layout for RAID5 and RAID6, as well as adding or +removing a write-intent bitmap. + +.TP +.B "Incremental Assembly" +Add a single device to an appropriate array. If the addition of the +device makes the array runnable, the array will be started. +This provides a convenient interface to a +.I hot-plug +system. As each device is detected, +.I mdadm +has a chance to include it in some array as appropriate. +Optionally, when the +.I \-\-fail +flag is passed in we will remove the device from any active array +instead of adding it. + +If a +.B CONTAINER +is passed to +.I mdadm +in this mode, then any arrays within that container will be assembled +and started. + +.TP +.B Manage +This is for doing things to specific components of an array such as +adding new spares and removing faulty devices. + +.TP +.B Misc +This is an 'everything else' mode that supports operations on active +arrays, operations on component devices such as erasing old superblocks, and +information gathering operations. +.\"This mode allows operations on independent devices such as examine MD +.\"superblocks, erasing old superblocks and stopping active arrays. + +.TP +.B Auto-detect +This mode does not act on a specific device or array, but rather it +requests the Linux Kernel to activate any auto-detected arrays. +.SH OPTIONS + +.SH Options for selecting a mode are: + +.TP +.BR \-A ", " \-\-assemble +Assemble a pre-existing array. + +.TP +.BR \-B ", " \-\-build +Build a legacy array without superblocks. + +.TP +.BR \-C ", " \-\-create +Create a new array. + +.TP +.BR \-F ", " \-\-follow ", " \-\-monitor +Select +.B Monitor +mode.
+ +.TP +.BR \-G ", " \-\-grow +Change the size or shape of an active array. + +.TP +.BR \-I ", " \-\-incremental +Add/remove a single device to/from an appropriate array, and possibly start the array. + +.TP +.B \-\-auto-detect +Request that the kernel starts any auto-detected arrays. This can only +work if +.I md +is compiled into the kernel \(em not if it is a module. +Arrays can be auto-detected by the kernel if all the components are in +primary MS-DOS partitions with partition type +.BR FD , +and all use v0.90 metadata. +In-kernel autodetect is not recommended for new installations. Using +.I mdadm +to detect and assemble arrays \(em possibly in an +.I initrd +\(em is substantially more flexible and should be preferred. + +.P +If a device is given before any options, or if the first option is +.BR \-\-add , +.BR \-\-fail , +or +.BR \-\-remove , +then the MANAGE mode is assumed. +Anything other than these will cause the +.B Misc +mode to be assumed. + +.SH Options that are not mode-specific are: + +.TP +.BR \-h ", " \-\-help +Display general help message or, after one of the above options, a +mode-specific help message. + +.TP +.B \-\-help\-options +Display more detailed help about command line parsing and some commonly +used options. + +.TP +.BR \-V ", " \-\-version +Print version information for mdadm. + +.TP +.BR \-v ", " \-\-verbose +Be more verbose about what is happening. This can be used twice to be +extra-verbose. +The extra verbosity currently only affects +.B \-\-detail \-\-scan +and +.BR "\-\-examine \-\-scan" . + +.TP +.BR \-q ", " \-\-quiet +Avoid printing purely informative messages. With this, +.I mdadm +will be silent unless there is something really important to report. + +.TP +.BR \-f ", " \-\-force +Be more forceful about certain operations. See the various modes for +the exact meaning of this option in different contexts. + +.TP +.BR \-c ", " \-\-config= +Specify the config file. 
Default is to use +.BR /etc/mdadm.conf , +or if that is missing then +.BR /etc/mdadm/mdadm.conf . +If the config file given is +.B "partitions" +then nothing will be read, but +.I mdadm +will act as though the config file contained exactly +.B "DEVICE partitions containers" +and will read +.B /proc/partitions +to find a list of devices to scan, and +.B /proc/mdstat +to find a list of containers to examine. +If the word +.B "none" +is given for the config file, then +.I mdadm +will act as though the config file were empty. + +.TP +.BR \-s ", " \-\-scan +Scan config file or +.B /proc/mdstat +for missing information. +In general, this option gives +.I mdadm +permission to get any missing information (like component devices, +array devices, array identities, and alert destination) from the +configuration file (see previous option); +one exception is MISC mode when using +.B \-\-detail +or +.B \-\-stop, +in which case +.B \-\-scan +says to get a list of array devices from +.BR /proc/mdstat . + +.TP +.BR \-e ", " \-\-metadata= +Declare the style of RAID metadata (superblock) to be used. The +default is {DEFAULT_METADATA} for +.BR \-\-create , +and to guess for other operations. +The default can be overridden by setting the +.B metadata +value for the +.B CREATE +keyword in +.BR mdadm.conf . + +Options are: +.RS +.ie '{DEFAULT_METADATA}'0.90' +.IP "0, 0.90, default" +.el +.IP "0, 0.90" +.. +Use the original 0.90 format superblock. This format limits arrays to +28 component devices and limits component devices of levels 1 and +greater to 2 terabytes. +.ie '{DEFAULT_METADATA}'0.90' +.IP "1, 1.0, 1.1, 1.2" +.el +.IP "1, 1.0, 1.1, 1.2 default" +.. +Use the new version-1 format superblock. This has few restrictions. +The different sub-versions store the superblock at different locations +on the device, either at the end (for 1.0), at the start (for 1.1) or +4K from the start (for 1.2). "1" is equivalent to "1.0". +'if '{DEFAULT_METADATA}'1.2' "default" is equivalent to "1.2". 
+.IP ddf +Use the "Industry Standard" DDF (Disk Data Format) format defined by +SNIA. +When creating a DDF array a +.B CONTAINER +will be created, and normal arrays can be created in that container. +.IP imsm +Use the Intel(R) Matrix Storage Manager metadata format. This creates a +.B CONTAINER +which is managed in a similar manner to DDF, and is supported by an +option-rom on some platforms: +.IP +.B http://www.intel.com/design/chipsets/matrixstorage_sb.htm +.PP +.RE + +.TP +.B \-\-homehost= +This will override any +.B HOMEHOST +setting in the config file and provides the identity of the host which +should be considered the home for any arrays. + +When creating an array, the +.B homehost +will be recorded in the metadata. For version-1 superblocks, it will +be prefixed to the array name. For version-0.90 superblocks, part of +the SHA1 hash of the hostname will be stored in the later half of the +UUID. + +When reporting information about an array, any array which is tagged +for the given homehost will be reported as such. + +When using Auto-Assemble, only arrays tagged for the given homehost +will be allowed to use 'local' names (i.e. not ending in '_' followed +by a digit string). See below under +.BR "Auto Assembly" . + +.SH For create, build, or grow: + +.TP +.BR \-n ", " \-\-raid\-devices= +Specify the number of active devices in the array. This, plus the +number of spare devices (see below) must equal the number of +.I component-devices +(including "\fBmissing\fP" devices) +that are listed on the command line for +.BR \-\-create . +Setting a value of 1 is probably +a mistake and so requires that +.B \-\-force +be specified first. A value of 1 will then be allowed for linear, +multipath, RAID0 and RAID1. It is never allowed for RAID4, RAID5 or RAID6. +.br +This number can only be changed using +.B \-\-grow +for RAID1, RAID4, RAID5 and RAID6 arrays, and only on kernels which provide +the necessary support. 
+ +.TP +.BR \-x ", " \-\-spare\-devices= +Specify the number of spare (eXtra) devices in the initial array. +Spares can also be added +and removed later. The number of component devices listed +on the command line must equal the number of RAID devices plus the +number of spare devices. + +.TP +.BR \-z ", " \-\-size= +Amount (in Kibibytes) of space to use from each drive in RAID levels 1/4/5/6. +This must be a multiple of the chunk size, and must leave about 128Kb +of space at the end of the drive for the RAID superblock. +If this is not specified +(as it normally is not) the smallest drive (or partition) sets the +size, though if there is a variance among the drives of greater than 1%, a warning is +issued. + +This value can be set with +.B \-\-grow +for RAID level 1/4/5/6. If the array was created with a size smaller +than the currently active drives, the extra space can be accessed +using +.BR \-\-grow . +The size can be given as +.B max +which means to choose the largest size that fits on all current drives. + +This value can not be used with +.B CONTAINER +metadata such as DDF and IMSM. + +.TP +.BR \-Z ", " \-\-array-size= +This is only meaningful with +.B \-\-grow +and its effect is not persistent: when the array is stopped and +restarted the default array size will be restored. + +Setting the array-size causes the array to appear smaller to programs +that access the data. This is particularly needed before reshaping an +array so that it will be smaller. As the reshape is not reversible, +but setting the size with +.B \-\-array-size +is, it is required that the array size is reduced as appropriate +before the number of devices in the array is reduced. + +.TP +.BR \-c ", " \-\-chunk= +Specify chunk size in kibibytes. The default when creating an +array is 512KB. To ensure compatibility with earlier versions, the +default when Building an array with no persistent metadata is 64KB. +This is only meaningful for RAID0, RAID4, RAID5, RAID6, and RAID10.
+ +.TP +.BR \-\-rounding= +Specify rounding factor for a Linear array. The size of each +component will be rounded down to a multiple of this size. +This is a synonym for +.B \-\-chunk +but highlights the different meaning for Linear as compared to other +RAID levels. The default is 64K if a kernel earlier than 2.6.16 is in +use, and is 0K (i.e. no rounding) in later kernels. + +.TP +.BR \-l ", " \-\-level= +Set RAID level. When used with +.BR \-\-create , +options are: linear, raid0, 0, stripe, raid1, 1, mirror, raid4, 4, +raid5, 5, raid6, 6, raid10, 10, multipath, mp, faulty, container. +Obviously some of these are synonymous. + +When a +.B CONTAINER +metadata type is requested, only the +.B container +level is permitted, and it does not need to be explicitly given. + +When used with +.BR \-\-build , +only linear, stripe, raid0, 0, raid1, multipath, mp, and faulty are valid. + +Can be used with +.B \-\-grow +to change the RAID level in some cases. See LEVEL CHANGES below. + +.TP +.BR \-p ", " \-\-layout= +This option configures the fine details of data layout for RAID5, RAID6, +and RAID10 arrays, and controls the failure modes for +.IR faulty . + +The layout of the RAID5 parity block can be one of +.BR left\-asymmetric , +.BR left\-symmetric , +.BR right\-asymmetric , +.BR right\-symmetric , +.BR la ", " ra ", " ls ", " rs . +The default is +.BR left\-symmetric . + +It is also possible to cause RAID5 to use a RAID4-like layout by +choosing +.BR parity\-first , +or +.BR parity\-last . + +Finally for RAID5 there are DDF\-compatible layouts, +.BR ddf\-zero\-restart , +.BR ddf\-N\-restart , +and +.BR ddf\-N\-continue . + +These same layouts are available for RAID6. There are also 4 layouts +that will provide an intermediate stage for converting between RAID5 +and RAID6. These provide a layout which is identical to the +corresponding RAID5 layout on the first N\-1 devices, and has the 'Q' +syndrome (the second 'parity' block used by RAID6) on the last device.
+These layouts are: +.BR left\-symmetric\-6 , +.BR right\-symmetric\-6 , +.BR left\-asymmetric\-6 , +.BR right\-asymmetric\-6 , +and +.BR parity\-first\-6 . + +When setting the failure mode for level +.I faulty, +the options are: +.BR write\-transient ", " wt , +.BR read\-transient ", " rt , +.BR write\-persistent ", " wp , +.BR read\-persistent ", " rp , +.BR write\-all , +.BR read\-fixable ", " rf , +.BR clear ", " flush ", " none . + +Each failure mode can be followed by a number, which is used as a period +between fault generation. Without a number, the fault is generated +once on the first relevant request. With a number, the fault will be +generated after that many requests, and will continue to be generated +every time the period elapses. + +Multiple failure modes can be current simultaneously by using the +.B \-\-grow +option to set subsequent failure modes. + +"clear" or "none" will remove any pending or periodic failure modes, +and "flush" will clear any persistent faults. + +Finally, the layout options for RAID10 are one of 'n', 'o' or 'f' followed +by a small number. The default is 'n2'. The supported options are: + +.I 'n' +signals 'near' copies. Multiple copies of one data block are at +similar offsets in different devices. + +.I 'o' +signals 'offset' copies. Rather than the chunks being duplicated +within a stripe, whole stripes are duplicated but are rotated by one +device so duplicate blocks are on different devices. Thus subsequent +copies of a block are in the next drive, and are one chunk further +down. + +.I 'f' +signals 'far' copies +(multiple copies have very different offsets). +See md(4) for more detail about 'near', 'offset', and 'far'. + +The number is the number of copies of each datablock. 2 is normal, 3 +can be useful. This number can be at most equal to the number of +devices in the array. It does not need to divide evenly into that +number (e.g. 
it is perfectly legal to have an 'n2' layout for an array +with an odd number of devices). + +When an array is converted between RAID5 and RAID6 an intermediate +RAID6 layout is used in which the second parity block (Q) is always on +the last device. To convert a RAID5 to RAID6 and leave it in this new +layout (which does not require re-striping) use +.BR \-\-layout=preserve . +This will try to avoid any restriping. + +The converse of this is +.B \-\-layout=normalise +which will change a non-standard RAID6 layout into a more standard +arrangement. + +.TP +.BR \-\-parity= +same as +.B \-\-layout +(thus explaining the p of +.BR \-p ). + +.TP +.BR \-b ", " \-\-bitmap= +Specify a file to store a write-intent bitmap in. The file should not +exist unless +.B \-\-force +is also given. The same file should be provided +when assembling the array. If the word +.B "internal" +is given, then the bitmap is stored with the metadata on the array, +and so is replicated on all devices. If the word +.B "none" +is given with +.B \-\-grow +mode, then any bitmap that is present is removed. + +To help catch typing errors, the filename must contain at least one +slash ('/') if it is a real file (not 'internal' or 'none'). + +Note: external bitmaps are only known to work on ext2 and ext3. +Storing bitmap files on other filesystems may result in serious problems. + +.TP +.BR \-\-bitmap\-chunk= +Set the chunksize of the bitmap. Each bit corresponds to that many +Kilobytes of storage. +When using a file based bitmap, the default is to use the smallest +size that is at-least 4 and requires no more than 2^21 chunks. +When using an +.B internal +bitmap, the chunksize defaults to 64Meg, or larger if necessary to +fit the bitmap into the available space. + +.TP +.BR \-W ", " \-\-write\-mostly +subsequent devices listed in a +.BR \-\-build , +.BR \-\-create , +or +.B \-\-add +command will be flagged as 'write-mostly'. 
This is valid for RAID1 +only and means that the 'md' driver will avoid reading from these +devices if at all possible. This can be useful if mirroring over a +slow link. + +.TP +.BR \-\-write\-behind= +Specify that write-behind mode should be enabled (valid for RAID1 +only). If an argument is specified, it will set the maximum number +of outstanding writes allowed. The default value is 256. +A write-intent bitmap is required in order to use write-behind +mode, and write-behind is only attempted on drives marked as +.IR write-mostly . + +.TP +.BR \-\-assume\-clean +Tell +.I mdadm +that the array pre-existed and is known to be clean. It can be useful +when trying to recover from a major failure as you can be sure that no +data will be affected unless you actually write to the array. It can +also be used when creating a RAID1 or RAID10 if you want to avoid the +initial resync, however this practice \(em while normally safe \(em is not +recommended. Use this only if you really know what you are doing. +.IP +When the devices that will be part of a new array were filled +with zeros before creation the operator knows the array is +actually clean. If that is the case, such as after running +badblocks, this argument can be used to tell mdadm the +facts the operator knows. + +.TP +.BR \-\-backup\-file= +This is needed when +.B \-\-grow +is used to increase the number of +raid-devices in a RAID5 if there are no spare devices available. +See the GROW MODE section below on RAID\-DEVICES CHANGES. The file +should be stored on a separate device, not on the RAID array being +reshaped. + +.TP +.BR \-\-array-size= ", " \-Z +Set the size of the array which is seen by users of the device such as +filesystems. This can be less than the real size, but never greater. +The size set this way does not persist across restarts of the array. + +This is most useful when reducing the number of devices in a RAID5 or +RAID6.
Such arrays require the array-size to be reduced before a +reshape can be performed that reduces the real size. + +A value of +.B max +restores the apparent size of the array to be whatever the real +amount of available space is. + +.TP +.BR \-N ", " \-\-name= +Set a +.B name +for the array. This is currently only effective when creating an +array with a version-1 superblock, or an array in a DDF container. +The name is a simple textual string that can be used to identify array +components when assembling. If name is needed but not specified, it +is taken from the basename of the device that is being created. +e.g. when creating +.I /dev/md/home +the +.B name +will default to +.IR home . + +.TP +.BR \-R ", " \-\-run +Insist that +.I mdadm +run the array, even if some of the components +appear to be active in another array or filesystem. Normally +.I mdadm +will ask for confirmation before including such components in an +array. This option causes that question to be suppressed. + +.TP +.BR \-f ", " \-\-force +Insist that +.I mdadm +accept the geometry and layout specified without question. Normally +.I mdadm +will not allow creation of an array with only one device, and will try +to create a RAID5 array with one missing drive (as this makes the +initial resync work faster). With +.BR \-\-force , +.I mdadm +will not try to be so clever. + +.TP +.BR \-a ", " "\-\-auto{=yes,md,mdp,part,p}{NN}" +Instruct mdadm how to create the device file if needed, possibly allocating +an unused minor number. "md" causes a non-partitionable array +to be used (though since Linux 2.6.28, these array devices are in fact +partitionable). "mdp", "part" or "p" causes a partitionable array (2.6 and +later) to be used. "yes" requires the named md device to have +a 'standard' format, and the type and minor number will be determined +from this. With mdadm 3.0, device creation is normally left up to +.I udev +so this option is unlikely to be needed. +See DEVICE NAMES below. 
+ +The argument can also come immediately after +"\-a". e.g. "\-ap". + +If +.B \-\-auto +is not given on the command line or in the config file, then +the default will be +.BR \-\-auto=yes . + +If +.B \-\-scan +is also given, then any +.I auto= +entries in the config file will override the +.B \-\-auto +instruction given on the command line. + +For partitionable arrays, +.I mdadm +will create the device file for the whole array and for the first 4 +partitions. A different number of partitions can be specified at the +end of this option (e.g. +.BR \-\-auto=p7 ). +If the device name ends with a digit, the partition names add a 'p', +and a number, e.g. +.IR /dev/md/home1p3 . +If there is no trailing digit, then the partition names just have a +number added, e.g. +.IR /dev/md/scratch3 . + +If the md device name is in a 'standard' format as described in DEVICE +NAMES, then it will be created, if necessary, with the appropriate +device number based on that name. If the device name is not in one of these +formats, then a unused device number will be allocated. The device +number will be considered unused if there is no active array for that +number, and there is no entry in /dev for that number and with a +non-standard name. Names that are not in 'standard' format are only +allowed in "/dev/md/". + +.ig XX +.\".TP +.\".BR \-\-symlink = no +.\"Normally when +.\".B \-\-auto +.\"causes +.\".I mdadm +.\"to create devices in +.\".B /dev/md/ +.\"it will also create symlinks from +.\".B /dev/ +.\"with names starting with +.\".B md +.\"or +.\".BR md_ . +.\"Use +.\".B \-\-symlink=no +.\"to suppress this, or +.\".B \-\-symlink=yes +.\"to enforce this even if it is suppressing +.\".IR mdadm.conf . +.\" +.XX + +.SH For assemble: + +.TP +.BR \-u ", " \-\-uuid= +uuid of array to assemble. Devices which don't have this uuid are +excluded + +.TP +.BR \-m ", " \-\-super\-minor= +Minor number of device that array was created for. Devices which +don't have this minor number are excluded. 
If you create an array as +/dev/md1, then all superblocks will contain the minor number 1, even if +the array is later assembled as /dev/md2. + +Giving the literal word "dev" for +.B \-\-super\-minor +will cause +.I mdadm +to use the minor number of the md device that is being assembled. +e.g. when assembling +.BR /dev/md0 , +.B \-\-super\-minor=dev +will look for super blocks with a minor number of 0. + +.B \-\-super\-minor +is only relevant for v0.90 metadata, and should not normally be used. +Using +.B \-\-uuid +is much safer. + +.TP +.BR \-N ", " \-\-name= +Specify the name of the array to assemble. This must be the name +that was specified when creating the array. It must either match +the name stored in the superblock exactly, or it must match +with the current +.I homehost +prefixed to the start of the given name. + +.TP +.BR \-f ", " \-\-force +Assemble the array even if the metadata on some devices appears to be +out-of-date. If +.I mdadm +cannot find enough working devices to start the array, but can find +some devices that are recorded as having failed, then it will mark +those devices as working so that the array can be started. +An array which requires +.B \-\-force +to be started may contain data corruption. Use it carefully. + +.TP +.BR \-R ", " \-\-run +Attempt to start the array even if fewer drives were given than were +present last time the array was active. Normally if not all the +expected drives are found and +.B \-\-scan +is not used, then the array will be assembled but not started. +With +.B \-\-run +an attempt will be made to start it anyway. + +.TP +.B \-\-no\-degraded +This is the reverse of +.B \-\-run +in that it inhibits the startup of array unless all expected drives +are present. This is only needed with +.B \-\-scan, +and can be used if the physical connections to devices are +not as reliable as you would like. + +.TP +.BR \-a ", " "\-\-auto{=no,yes,md,mdp,part}" +See this option under Create and Build options. 
+ +.TP +.BR \-b ", " \-\-bitmap= +Specify the bitmap file that was given when the array was created. If +an array has an +.B internal +bitmap, there is no need to specify this when assembling the array. + +.TP +.BR \-\-backup\-file= +If +.B \-\-backup\-file +was used to grow the number of raid-devices in a RAID5, and the system +crashed during the critical section, then the same +.B \-\-backup\-file +must be presented to +.B \-\-assemble +to allow possibly corrupted data to be restored. + +.TP +.BR \-U ", " \-\-update= +Update the superblock on each device while assembling the array. The +argument given to this flag can be one of +.BR sparc2.2 , +.BR summaries , +.BR uuid , +.BR name , +.BR homehost , +.BR resync , +.BR byteorder , +.BR devicesize , +or +.BR super\-minor . + +The +.B sparc2.2 +option will adjust the superblock of an array that was created on a Sparc +machine running a patched 2.2 Linux kernel. This kernel got the +alignment of part of the superblock wrong. You can use the +.B "\-\-examine \-\-sparc2.2" +option to +.I mdadm +to see what effect this would have. + +The +.B super\-minor +option will update the +.B "preferred minor" +field on each superblock to match the minor number of the array being +assembled. +This can be useful if +.B \-\-examine +reports a different "Preferred Minor" to +.BR \-\-detail . +In some cases this update will be performed automatically +by the kernel driver. In particular the update happens automatically +at the first write to an array with redundancy (RAID level 1 or +greater) on a 2.6 (or later) kernel. + +The +.B uuid +option will change the uuid of the array. If a UUID is given with the +.B \-\-uuid +option that UUID will be used as a new UUID and will +.B NOT +be used to help identify the devices in the array. +If no +.B \-\-uuid +is given, a random UUID is chosen. + +The +.B name +option will change the +.I name +of the array as stored in the superblock. This is only supported for +version-1 superblocks.
+ +The +.B homehost +option will change the +.I homehost +as recorded in the superblock. For version-0 superblocks, this is the +same as updating the UUID. +For version-1 superblocks, this involves updating the name. + +The +.B resync +option will cause the array to be marked +.I dirty +meaning that any redundancy in the array (e.g. parity for RAID5, +copies for RAID1) may be incorrect. This will cause the RAID system +to perform a "resync" pass to make sure that all redundant information +is correct. + +The +.B byteorder +option allows arrays to be moved between machines with different +byte-order. +When assembling such an array for the first time after a move, giving +.B "\-\-update=byteorder" +will cause +.I mdadm +to expect superblocks to have their byteorder reversed, and will +correct that order before assembling the array. This is only valid +with original (Version 0.90) superblocks. + +The +.B summaries +option will correct the summaries in the superblock. That is the +counts of total, working, active, failed, and spare devices. + +The +.B devicesize +will rarely be of use. It applies to version 1.1 and 1.2 metadata +only (where the metadata is at the start of the device) and is only +useful when the component device has changed size (typically become +larger). The version 1 metadata records the amount of the device that +can be used to store data, so if a device in a version 1.1 or 1.2 +array becomes larger, the metadata will still be visible, but the +extra space will not. In this case it might be useful to assemble the +array with +.BR \-\-update=devicesize . +This will cause +.I mdadm +to determine the maximum usable amount of space on each device and +update the relevant field in the metadata. + +.ig +.TP +.B \-\-auto\-update\-homehost +This flag is only meaningful with auto-assembly (see discussion below). 
+In that situation, if no suitable arrays are found for this homehost, +.I mdadm +will rescan for any arrays at all and will assemble them and update the +homehost to match the current host. +.. + +.SH For Manage mode: + +.TP +.BR \-t ", " \-\-test +Unless a more serious error occurred, +.I mdadm +will exit with a status of 2 if no changes were made to the array and +0 if at least one change was made. +This can be useful when an indirect specifier such as +.BR missing , +.B detached +or +.B faulty +is used in requesting an operation on the array. +.B \-\-test +will report failure if these specifiers didn't find any match. + +.TP +.BR \-a ", " \-\-add +hot-add listed devices. +If a device appears to have recently been part of the array +(possibly it failed or was removed) the device is re-added as described +in the next point. +If that fails or the device was never part of the array, the device is +added as a hot-spare. +If the array is degraded, it will immediately start to rebuild data +onto that spare. + +Note that this and the following options are only meaningful on arrays +with redundancy. They don't apply to RAID0 or Linear. + +.TP +.BR \-\-re\-add +re\-add a device that was previously removed from an array. +If the metadata on the device reports that it is a member of the +array, and the slot that it used is still vacant, then the device will +be added back to the array in the same position. This will normally +cause the data for that device to be recovered. However based on the +event count on the device, the recovery may only require sections that +are flagged by a write-intent bitmap to be recovered or may not require +any recovery at all. + +When used on an array that has no metadata (i.e. it was built with +.BR \-\-build ) +it will be assumed that bitmap-based recovery is enough to make the +device fully consistent with the array.
+ +If the device name given is +.B missing +then mdadm will try to find any device that looks like it should be +part of the array but isn't and will try to re\-add all such devices. + +.TP +.BR \-r ", " \-\-remove +remove listed devices. They must not be active. i.e. they should +be failed or spare devices. As well as the name of a device file +(e.g. +.BR /dev/sda1 ) +the words +.B failed +and +.B detached +can be given to +.BR \-\-remove . +The first causes all failed devices to be removed. The second causes +any device which is no longer connected to the system (i.e. an 'open' +returns +.BR ENXIO ) +to be removed. This will only succeed for devices that are spares or +have already been marked as failed. + +.TP +.BR \-f ", " \-\-fail +mark listed devices as faulty. +As well as the name of a device file, the word +.B detached +can be given. This will cause any device that has been detached from +the system to be marked as failed. It can then be removed. + +.TP +.BR \-\-set\-faulty +same as +.BR \-\-fail . + +.TP +.BR \-\-write\-mostly +Subsequent devices that are added or re\-added will have the 'write-mostly' +flag set. This is only valid for RAID1 and means that the 'md' driver +will avoid reading from these devices if possible. +.TP +.BR \-\-readwrite +Subsequent devices that are added or re\-added will have the 'write-mostly' +flag cleared. + +.P +Each of these options requires that the first device listed is the array +to be acted upon, and the remainder are component devices to be added, +removed, marked as faulty, etc. Several different operations can be +specified for different devices, e.g. +.in +5 +mdadm /dev/md0 \-\-add /dev/sda1 \-\-fail /dev/sdb1 \-\-remove /dev/sdb1 +.in -5 +Each operation applies to all devices listed until the next +operation.
+ +If an array is using a write-intent bitmap, then devices which have +been removed can be re\-added in a way that avoids a full +reconstruction but instead just updates the blocks that have changed +since the device was removed. For arrays with persistent metadata +(superblocks) this is done automatically. For arrays created with +.B \-\-build +mdadm needs to be told that this device was removed recently with +.BR \-\-re\-add . + +Devices can only be removed from an array if they are not in active +use, i.e. they must be spares or failed devices. To remove an active +device, it must first be marked as +.BR faulty . + +.SH For Misc mode: + +.TP +.BR \-Q ", " \-\-query +Examine a device to see +(1) if it is an md device and (2) if it is a component of an md +array. +Information about what is discovered is presented. + +.TP +.BR \-D ", " \-\-detail +Print details of one or more md devices. + +.TP +.BR \-\-detail\-platform +Print details of the platform's RAID capabilities (firmware / hardware +topology) for a given metadata format. + +.TP +.BR \-Y ", " \-\-export +When used with +.B \-\-detail +or +.BR \-\-examine , +output will be formatted as +.B key=value +pairs for easy import into the environment. + +.TP +.BR \-E ", " \-\-examine +Print contents of the metadata stored on the named device(s). +Note the contrast between +.B \-\-examine +and +.BR \-\-detail . +.B \-\-examine +applies to devices which are components of an array, while +.B \-\-detail +applies to a whole array which is currently active. +.TP +.B \-\-sparc2.2 +If an array was created on a SPARC machine with a 2.2 Linux kernel +patched with RAID support, the superblock will have been created +incorrectly, or at least incompatibly with 2.4 and later kernels. +Using the +.B \-\-sparc2.2 +flag with +.B \-\-examine +will fix the superblock before displaying it. If this appears to do +the right thing, then the array can be successfully assembled using +.BR "\-\-assemble \-\-update=sparc2.2" .
+ +.TP +.BR \-X ", " \-\-examine\-bitmap +Report information about a bitmap file. +The argument is either an external bitmap file or an array component +in case of an internal bitmap. Note that running this on an array +device (e.g. +.BR /dev/md0 ) +does not report the bitmap for that array. + +.TP +.BR \-R ", " \-\-run +start a partially assembled array. If +.B \-\-assemble +did not find enough devices to fully start the array, it might leave +it partially assembled. If you wish, you can then use +.B \-\-run +to start the array in degraded mode. + +.TP +.BR \-S ", " \-\-stop +deactivate array, releasing all resources. + +.TP +.BR \-o ", " \-\-readonly +mark array as readonly. + +.TP +.BR \-w ", " \-\-readwrite +mark array as readwrite. + +.TP +.B \-\-zero\-superblock +If the device contains a valid md superblock, the block is +overwritten with zeros. With +.B \-\-force +the block where the superblock would be is overwritten even if it +doesn't appear to be valid. + +.TP +.B \-\-kill\-subarray= +If the device is a container and the argument to \-\-kill\-subarray +specifies an inactive subarray in the container, then the subarray is +deleted. Deleting all subarrays will leave an 'empty-container' or +spare superblock on the drives. See \-\-zero\-superblock for completely +removing a superblock. Note that some formats depend on the subarray +index for generating a UUID, this command will fail if it would change +the UUID of an active subarray. + +.TP +.B \-\-update\-subarray= +If the device is a container and the argument to \-\-update\-subarray +specifies a subarray in the container, then attempt to update the given +superblock field in the subarray. See below in +.B MISC MODE +for details. + +.TP +.BR \-t ", " \-\-test +When used with +.BR \-\-detail , +the exit status of +.I mdadm +is set to reflect the status of the device. See below in +.B MISC MODE +for details.
+ +.TP +.BR \-W ", " \-\-wait +For each md device given, wait for any resync, recovery, or reshape +activity to finish before returning. +.I mdadm +will return with success if it actually waited for every device +listed, otherwise it will return failure. + +.TP +.BR \-\-wait\-clean +For each md device given, or each device in /proc/mdstat if +.B \-\-scan +is given, arrange for the array to be marked clean as soon as possible. +.I mdadm +will return with success if the array uses external metadata and we +successfully waited. For native arrays this returns immediately as the +kernel handles dirty-clean transitions at shutdown. No action is taken +if safe-mode handling is disabled. + +.SH For Incremental Assembly mode: +.TP +.BR \-\-rebuild\-map ", " \-r +Rebuild the map file +.RB ( /var/run/mdadm/map ) +that +.I mdadm +uses to help track which arrays are currently being assembled. + +.TP +.BR \-\-run ", " \-R +Run any array assembled as soon as a minimal number of devices are +available, rather than waiting until all expected devices are present. + +.TP +.BR \-\-scan ", " \-s +Only meaningful with +.B \-R +this will scan the +.B map +file for arrays that are being incrementally assembled and will try to +start any that are not already started. If any such array is listed +in +.B mdadm.conf +as requiring an external bitmap, that bitmap will be attached first. + +.TP +.BR \-\-fail ", " \-f +This allows the hot-plug system to remove devices that have fully disappeared +from the kernel. It will first fail and then remove the device from any +array it belongs to. +The device name given should be a kernel device name such as "sda", +not a name in +.IR /dev . + +.SH For Monitor mode: +.TP +.BR \-m ", " \-\-mail +Give a mail address to send alerts to. + +.TP +.BR \-p ", " \-\-program ", " \-\-alert +Give a program to be run whenever an event is detected. + +.TP +.BR \-y ", " \-\-syslog +Cause all events to be reported through 'syslog'. 
The messages have +facility of 'daemon' and varying priorities. + +.TP +.BR \-d ", " \-\-delay +Give a delay in seconds. +.I mdadm +polls the md arrays and then waits this many seconds before polling +again. The default is 60 seconds. Since 2.6.16, there is no need to +reduce this as the kernel alerts +.I mdadm +immediately when there is any change. + +.TP +.BR \-r ", " \-\-increment +Give a percentage increment. +.I mdadm +will generate RebuildNN events with the given percentage increment. + +.TP +.BR \-f ", " \-\-daemonise +Tell +.I mdadm +to run as a background daemon if it decides to monitor anything. This +causes it to fork and run in the child, and to disconnect from the +terminal. The process id of the child is written to stdout. +This is useful with +.B \-\-scan +which will only continue monitoring if a mail address or alert program +is found in the config file. + +.TP +.BR \-i ", " \-\-pid\-file +When +.I mdadm +is running in daemon mode, write the pid of the daemon process to +the specified file, instead of printing it on standard output. + +.TP +.BR \-1 ", " \-\-oneshot +Check arrays only once. This will generate +.B NewArray +events and more significantly +.B DegradedArray +and +.B SparesMissing +events. Running +.in +5 +.B " mdadm \-\-monitor \-\-scan \-1" +.in -5 +from a cron script will ensure regular notification of any degraded arrays. + +.TP +.BR \-t ", " \-\-test +Generate a +.B TestMessage +alert for every array found at startup. This alert gets mailed and +passed to the alert program. This can be used for testing that alert +messages do get through successfully. + +.SH ASSEMBLE MODE + +.HP 12 +Usage: +.B mdadm \-\-assemble +.I md-device options-and-component-devices... +.HP 12 +Usage: +.B mdadm \-\-assemble \-\-scan +.I md-devices-and-options... +.HP 12 +Usage: +.B mdadm \-\-assemble \-\-scan +.I options... + +.PP +This usage assembles one or more RAID arrays from pre-existing components.
+For each array, mdadm needs to know the md device, the identity of the +array, and a number of component-devices. These can be found in a number of ways. + +In the first usage example (without the +.BR \-\-scan ) +the first device given is the md device. +In the second usage example, all devices listed are treated as md +devices and assembly is attempted. +In the third (where no devices are listed) all md devices that are +listed in the configuration file are assembled. If no arrays are +described by the configuration file, then any arrays that +can be found on unused devices will be assembled. + +If precisely one device is listed, but +.B \-\-scan +is not given, then +.I mdadm +acts as though +.B \-\-scan +was given and identity information is extracted from the configuration file. + +The identity can be given with the +.B \-\-uuid +option, the +.B \-\-name +option, or the +.B \-\-super\-minor +option, will be taken from the md-device record in the config file, or +will be taken from the super block of the first component-device +listed on the command line. + +Devices can be given on the +.B \-\-assemble +command line or in the config file. Only devices which have an md +superblock which contains the right identity will be considered for +any array. + +The config file is only used if explicitly named with +.B \-\-config +or requested with (a possibly implicit) +.BR \-\-scan . +In the latter case, +.B /etc/mdadm.conf +or +.B /etc/mdadm/mdadm.conf +is used. + +If +.B \-\-scan +is not given, then the config file will only be used to find the +identity of md arrays. + +Normally the array will be started after it is assembled. However if +.B \-\-scan +is not given and not all expected drives were listed, then the array +is not started (to guard against usage errors). To insist that the +array be started in this case (as may work for RAID1, 4, 5, 6, or 10), +give the +.B \-\-run +flag.
+ +If +.I udev +is active, +.I mdadm +does not create any entries in +.B /dev +but leaves that to +.IR udev . +It does record information in +.B /var/run/mdadm/map +which will allow +.I udev +to choose the correct name. + +If +.I mdadm +detects that udev is not configured, it will create the devices in +.B /dev +itself. + +In Linux kernels prior to version 2.6.28 there were two distinctly +different types of md devices that could be created: one that could be +partitioned using standard partitioning tools and one that could not. +Since 2.6.28 that distinction is no longer relevant as both types of +devices can be partitioned. +.I mdadm +will normally create the type that originally could not be partitioned +as it has a well defined major number (9). + +Prior to 2.6.28, it is important that mdadm chooses the correct type +of array device to use. This can be controlled with the +.B \-\-auto +option. In particular, a value of "mdp" or "part" or "p" tells mdadm +to use a partitionable device rather than the default. + +In the no-udev case, the value given to +.B \-\-auto +can be suffixed by a number. This tells +.I mdadm +to create that number of partition devices rather than the default of 4. + +The value given to +.B \-\-auto +can also be given in the configuration file as a word starting +.B auto= +on the ARRAY line for the relevant array. + +.SS Auto Assembly +When +.B \-\-assemble +is used with +.B \-\-scan +and no devices are listed, +.I mdadm +will first attempt to assemble all the arrays listed in the config +file. + +If no array is listed in the config (other than those marked +.BR <ignore> ) +it will look through the available devices for possible arrays and +will try to assemble anything that it finds. Arrays which are tagged +as belonging to the given homehost will be assembled and started +normally.
Arrays which do not obviously belong to this host are given +names that are expected not to conflict with anything local, and are +started "read-auto" so that nothing is written to any device until the +array is written to. i.e. automatic resync etc is delayed. + +If +.I mdadm +finds a consistent set of devices that look like they should comprise +an array, and if the superblock is tagged as belonging to the given +home host, it will automatically choose a device name and try to +assemble the array. If the array uses version-0.90 metadata, then the +.B minor +number as recorded in the superblock is used to create a name in +.B /dev/md/ +so for example +.BR /dev/md/3 . +If the array uses version-1 metadata, then the +.B name +from the superblock is used to similarly create a name in +.B /dev/md/ +(the name will have any 'host' prefix stripped first). + +This behaviour can be modified by the +.I AUTO +line in the +.I mdadm.conf +configuration file. This line can indicate that specific metadata +type should, or should not, be automatically assembled. If an array +is found which is not listed in +.I mdadm.conf +and has a metadata format that is denied by the +.I AUTO +line, then it will not be assembled. +The +.I AUTO +line can also request that all arrays identified as being for this +homehost should be assembled regardless of their metadata type. +See +.IR mdadm.conf (5) +for further details. + +.ig +If +.I mdadm +cannot find any array for the given host at all, and if +.B \-\-auto\-update\-homehost +is given, then +.I mdadm +will search again for any array (not just an array created for this +host) and will assemble each assuming +.BR \-\-update=homehost . +This will change the host tag in the superblock so that on the next run, +these arrays will be found without the second pass. The intention of +this feature is to support transitioning a set of md arrays to using +homehost tagging. 
+ +The reason for requiring arrays to be tagged with the homehost for +auto assembly is to guard against problems that can arise when moving +devices from one host to another. +.. + +.SH BUILD MODE + +.HP 12 +Usage: +.B mdadm \-\-build +.I md-device +.BI \-\-chunk= X +.BI \-\-level= Y +.BI \-\-raid\-devices= Z +.I devices + +.PP +This usage is similar to +.BR \-\-create . +The difference is that it creates an array without a superblock. With +these arrays there is no difference between initially creating the array and +subsequently assembling the array, except that hopefully there is useful +data there in the second case. + +The level may be raid0, linear, raid1, raid10, multipath, or faulty, or +one of their synonyms. All devices must be listed and the array will +be started once complete. It will often be appropriate to use +.B \-\-assume\-clean +with levels raid1 or raid10. + +.SH CREATE MODE + +.HP 12 +Usage: +.B mdadm \-\-create +.I md-device +.BI \-\-chunk= X +.BI \-\-level= Y +.br +.BI \-\-raid\-devices= Z +.I devices + +.PP +This usage will initialise a new md array, associate some devices with +it, and activate the array. + +The named device will normally not exist when +.I "mdadm \-\-create" +is run, but will be created by +.I udev +once the array becomes active. + +As devices are added, they are checked to see if they contain RAID +superblocks or filesystems. They are also checked to see if the variance in +device size exceeds 1%. + +If any discrepancy is found, the array will not automatically be run, though +the presence of a +.B \-\-run +can override this caution. + +To create a "degraded" array in which some devices are missing, simply +give the word "\fBmissing\fP" +in place of a device name. This will cause +.I mdadm +to leave the corresponding slot in the array empty. +For a RAID4 or RAID5 array at most one slot can be +"\fBmissing\fP"; for a RAID6 array at most two slots. +For a RAID1 array, only one real device needs to be given.
All of the +others can be +"\fBmissing\fP". + +When creating a RAID5 array, +.I mdadm +will automatically create a degraded array with an extra spare drive. +This is because building the spare into a degraded array is in general +faster than resyncing the parity on a non-degraded, but not clean, +array. This feature can be overridden with the +.B \-\-force +option. + +When creating an array with version-1 metadata a name for the array is +required. +If this is not given with the +.B \-\-name +option, +.I mdadm +will choose a name based on the last component of the name of the +device being created. So if +.B /dev/md3 +is being created, then the name +.B 3 +will be chosen. +If +.B /dev/md/home +is being created, then the name +.B home +will be used. + +When creating a partition based array, using +.I mdadm +with version-1.x metadata, the partition type should be set to +.B 0xDA +(non fs-data). This type selection allows for greater precision since +using any other [RAID auto-detect (0xFD) or a GNU/Linux partition (0x83)], +might create problems in the event of array recovery through a live cdrom. + +A new array will normally get a randomly assigned 128bit UUID which is +very likely to be unique. If you have a specific need, you can choose +a UUID for the array by giving the +.B \-\-uuid= +option. Be warned that creating two arrays with the same UUID is a +recipe for disaster. Also, using +.B \-\-uuid= +when creating a v0.90 array will silently override any +.B \-\-homehost= +setting. +.\"If the +.\".B \-\-size +.\"option is given, it is not necessary to list any component-devices in this command. +.\"They can be added later, before a +.\".B \-\-run. +.\"If no +.\".B \-\-size +.\"is given, the apparent size of the smallest drive given is used. + +When creating an array within a +.B CONTAINER +.I mdadm +can be given either the list of devices to use, or simply the name of +the container. 
The former case gives control over which devices in +the container will be used for the array. The latter case allows +.I mdadm +to automatically choose which devices to use based on how much spare +space is available. + +The General Management options that are valid with +.B \-\-create +are: +.TP +.B \-\-run +insist on running the array even if some devices look like they might +be in use. + +.TP +.B \-\-readonly +start the array readonly \(em not supported yet. + +.SH MANAGE MODE +.HP 12 +Usage: +.B mdadm +.I device +.I options... devices... +.PP + +This usage will allow individual devices in an array to be failed, +removed or added. It is possible to perform multiple operations with +one command. For example: +.br +.B " mdadm /dev/md0 \-f /dev/hda1 \-r /dev/hda1 \-a /dev/hda1" +.br +will firstly mark +.B /dev/hda1 +as faulty in +.B /dev/md0 +and will then remove it from the array and finally add it back +in as a spare. However only one md array can be affected by a single +command. + +When a device is added to an active array, mdadm checks to see if it +has metadata on it which suggests that it was recently a member of the +array. If it does, it tries to "re\-add" the device. If there have +been no changes since the device was removed, or if the array has a +write-intent bitmap which has recorded whatever changes there were, +then the device will immediately become a full member of the array and +those differences recorded in the bitmap will be resolved. + +.SH MISC MODE +.HP 12 +Usage: +.B mdadm +.I options ... +.I devices ... +.PP + +MISC mode includes a number of distinct operations that +operate on distinct devices. The operations are: +.TP +.B \-\-query +The device is examined to see if it is +(1) an active md array, or +(2) a component of an md array. +The information discovered is reported. + +.TP +.B \-\-detail +The device should be an active md device. +.B mdadm +will display a detailed description of the array.
+.B \-\-brief +or +.B \-\-scan +will cause the output to be less detailed and the format to be +suitable for inclusion in +.BR /etc/mdadm.conf . +The exit status of +.I mdadm +will normally be 0 unless +.I mdadm +failed to get useful information about the device(s); however, if the +.B \-\-test +option is given, then the exit status will be: +.RS +.TP +0 +The array is functioning normally. +.TP +1 +The array has at least one failed device. +.TP +2 +The array has multiple failed devices such that it is unusable. +.TP +4 +There was an error while trying to get information about the device. +.RE + +.TP +.B \-\-detail\-platform +Print detail of the platform's RAID capabilities (firmware / hardware +topology). If the metadata is specified with +.B \-e +or +.B \-\-metadata= +then the return status will be: +.RS +.TP +0 +metadata successfully enumerated its platform components on this system +.TP +1 +metadata is platform independent +.TP +2 +metadata failed to find its platform components on this system +.RE + +.TP +.B \-\-update\-subarray= +If the device is a container and the argument to \-\-update\-subarray +specifies a subarray in the container, then attempt to update the given +superblock field in the subarray. Similar to updating an array in +"assemble" mode, the field to update is selected by +.B \-U +or +.B \-\-update= +option. Currently only +.B name +is supported. + +The +.B name +option updates the subarray name in the metadata, it may not affect the +device node name or the device node symlink until the subarray is +re\-assembled. If updating +.B name +would change the UUID of an active subarray this operation is blocked, +and the command will end in an error. + +.TP +.B \-\-examine +The device should be a component of an md array. +.I mdadm +will read the md superblock of the device and display the contents. 
+If +.B \-\-brief +or +.B \-\-scan +is given, then multiple devices that are components of the one array +are grouped together and reported in a single entry suitable +for inclusion in +.BR /etc/mdadm.conf . + +Having +.B \-\-scan +without listing any devices will cause all devices listed in the +config file to be examined. + +.TP +.B \-\-stop +The devices should be active md arrays which will be deactivated, as +long as they are not currently in use. + +.TP +.B \-\-run +This will fully activate a partially assembled md array. + +.TP +.B \-\-readonly +This will mark an active array as read-only, providing that it is +not currently being used. + +.TP +.B \-\-readwrite +This will change a +.B readonly +array back to being read/write. + +.TP +.B \-\-scan +For all operations except +.BR \-\-examine , +.B \-\-scan +will cause the operation to be applied to all arrays listed in +.BR /proc/mdstat . +For +.BR \-\-examine, +.B \-\-scan +causes all devices listed in the config file to be examined. + +.TP +.BR \-b ", " \-\-brief +Be less verbose. This is used with +.B \-\-detail +and +.BR \-\-examine . +Using +.B \-\-brief +with +.B \-\-verbose +gives an intermediate level of verbosity. + +.SH MONITOR MODE + +.HP 12 +Usage: +.B mdadm \-\-monitor +.I options... devices... + +.PP +This usage causes +.I mdadm +to periodically poll a number of md arrays and to report on any events +noticed. +.I mdadm +will never exit once it decides that there are arrays to be checked, +so it should normally be run in the background. + +As well as reporting events, +.I mdadm +may move a spare drive from one array to another if they are in the +same +.B spare-group +and if the destination array has a failed drive but no spares. + +If any devices are listed on the command line, +.I mdadm +will only monitor those devices. Otherwise all arrays listed in the +configuration file will be monitored. 
Further, if +.B \-\-scan +is given, then any other md devices that appear in +.B /proc/mdstat +will also be monitored. + +The result of monitoring the arrays is the generation of events. +These events are passed to a separate program (if specified) and may +be mailed to a given E-mail address. + +When passing events to a program, the program is run once for each event, +and is given 2 or 3 command-line arguments: the first is the +name of the event (see below), the second is the name of the +md device which is affected, and the third is the name of a related +device if relevant (such as a component device that has failed). + +If +.B \-\-scan +is given, then a program or an E-mail address must be specified on the +command line or in the config file. If neither are available, then +.I mdadm +will not monitor anything. +Without +.B \-\-scan, +.I mdadm +will continue monitoring as long as something was found to monitor. If +no program or email is given, then each event is reported to +.BR stdout . + +The different events are: + +.RS 4 +.TP +.B DeviceDisappeared +An md array which previously was configured appears to no longer be +configured. (syslog priority: Critical) + +If +.I mdadm +was told to monitor an array which is RAID0 or Linear, then it will +report +.B DeviceDisappeared +with the extra information +.BR Wrong-Level . +This is because RAID0 and Linear do not support the device-failed, +hot-spare and resync operations which are monitored. + +.TP +.B RebuildStarted +An md array started reconstruction. (syslog priority: Warning) + +.TP +.BI Rebuild NN +Where +.I NN +is a two-digit number (ie. 05, 48). This indicates that rebuild +has passed that many percent of the total. The events are generated +with fixed increment since 0. Increment size may be specified with +a commandline option (default is 20). (syslog priority: Warning) + +.TP +.B RebuildFinished +An md array that was rebuilding, isn't any more, either because it +finished normally or was aborted. 
(syslog priority: Warning) + +.TP +.B Fail +An active component device of an array has been marked as +faulty. (syslog priority: Critical) + +.TP +.B FailSpare +A spare component device which was being rebuilt to replace a faulty +device has failed. (syslog priority: Critical) + +.TP +.B SpareActive +A spare component device which was being rebuilt to replace a faulty +device has been successfully rebuilt and has been made active. +(syslog priority: Info) + +.TP +.B NewArray +A new md array has been detected in the +.B /proc/mdstat +file. (syslog priority: Info) + +.TP +.B DegradedArray +A newly noticed array appears to be degraded. This message is not +generated when +.I mdadm +notices a drive failure which causes degradation, but only when +.I mdadm +notices that an array is degraded when it first sees the array. +(syslog priority: Critical) + +.TP +.B MoveSpare +A spare drive has been moved from one array in a +.B spare-group +to another to allow a failed drive to be replaced. +(syslog priority: Info) + +.TP +.B SparesMissing +If +.I mdadm +has been told, via the config file, that an array should have a certain +number of spare devices, and +.I mdadm +detects that it has fewer than this number when it first sees the +array, it will report a +.B SparesMissing +message. +(syslog priority: Warning) + +.TP +.B TestMessage +An array was found at startup, and the +.B \-\-test +flag was given. +(syslog priority: Info) +.RE + +Only +.B Fail, +.B FailSpare, +.B DegradedArray, +.B SparesMissing +and +.B TestMessage +cause Email to be sent. All events cause the program to be run. +The program is run with two or three arguments: the event +name, the array device and possibly a second device. + +Each event has an associated array device (e.g. +.BR /dev/md1 ) +and possibly a second device. For +.BR Fail , +.BR FailSpare , +and +.B SpareActive +the second device is the relevant component device. +For +.B MoveSpare +the second device is the array that the spare was moved from. 
+ +For +.I mdadm +to move spares from one array to another, the different arrays need to +be labeled with the same +.B spare-group +in the configuration file. The +.B spare-group +name can be any string; it is only necessary that different spare +groups use different names. + +When +.I mdadm +detects that an array in a spare group has fewer active +devices than necessary for the complete array, and has no spare +devices, it will look for another array in the same spare group that +has a full complement of working drives and a spare. It will then +attempt to remove the spare from the second array and add it to the +first. +If the removal succeeds but the adding fails, then it is added back to +the original array. + +.SH GROW MODE +The GROW mode is used for changing the size or shape of an active +array. +For this to work, the kernel must support the necessary change. +Various types of growth are being added during 2.6 development, +including restructuring a RAID5 array to have more active devices. + +Currently the only support available is to +.IP \(bu 4 +change the "size" attribute +for RAID1, RAID5 and RAID6. +.IP \(bu 4 +increase or decrease the "raid\-devices" attribute of RAID1, RAID5, +and RAID6. +.IP \(bu 4 +change the chunk-size and layout of RAID5 and RAID6. +.IP \(bu 4 +convert between RAID1 and RAID5, and between RAID5 and RAID6. +.IP \(bu 4 +add a write-intent bitmap to any array which supports these bitmaps, or +remove a write-intent bitmap from such an array. +.PP + +GROW mode is not currently supported for +.B CONTAINERS +or arrays inside containers. + +.SS SIZE CHANGES +Normally when an array is built the "size" is taken from the smallest +of the drives. If all the small drives in an array are, one at a +time, removed and replaced with larger drives, then you could have an +array of large drives with only a small amount used. In this +situation, changing the "size" with "GROW" mode will allow the extra +space to start being used.
If the size is increased in this way, a +"resync" process will start to make sure the new parts of the array +are synchronised. + +Note that when an array changes size, any filesystem that may be +stored in the array will not automatically grow to use the space. The +filesystem will need to be explicitly told to use the extra space. + +Also the size of an array cannot be changed while it has an active +bitmap. If an array has a bitmap, it must be removed before the size +can be changed. Once the change is complete a new bitmap can be created. + +.SS RAID\-DEVICES CHANGES + +A RAID1 array can work with any number of devices from 1 upwards +(though 1 is not very useful). There may be times when you want to +increase or decrease the number of active devices. Note that this is +different to hot-add or hot-remove which changes the number of +inactive devices. + +When reducing the number of devices in a RAID1 array, the slots which +are to be removed from the array must already be vacant. That is, the +devices which were in those slots must be failed and removed. + +When the number of devices is increased, any hot spares that are +present will be activated immediately. + +Changing the number of active devices in a RAID5 or RAID6 is much more +effort. Every block in the array will need to be read and written +back to a new location. From 2.6.17, the Linux Kernel is able to +increase the number of devices in a RAID5 safely, including restarting +an interrupted "reshape". From 2.6.31, the Linux Kernel is able to +increase or decrease the number of devices in a RAID5 or RAID6. + +When decreasing the number of devices, the size of the array will also +decrease. If there was data in the array, it could get destroyed and +this is not reversible. To help prevent accidents, +.I mdadm +requires that the size of the array be decreased first with +.BR "mdadm --grow --array-size" . +This is a reversible change which simply makes the end of the array +inaccessible.
The integrity of any data can then be checked before +the non-reversible reduction in the number of devices is requested. + +When relocating the first few stripes on a RAID5, it is not possible +to keep the data on disk completely consistent and crash-proof. To +provide the required safety, mdadm disables writes to the array while +this "critical section" is reshaped, and takes a backup of the data +that is in that section. This backup is normally stored in any spare +devices that the array has, however it can also be stored in a +separate file specified with the +.B \-\-backup\-file +option. If this option is used, and the system does crash during the +critical period, the same file must be passed to +.B \-\-assemble +to restore the backup and reassemble the array. + +.SS LEVEL CHANGES + +Changing the RAID level of any array happens instantaneously. However +in the RAID5 to RAID6 case this requires a non-standard layout of the +RAID6 data, and in the RAID6 to RAID5 case that non-standard layout is +required before the change can be accomplished. So while the level +change is instant, the accompanying layout change can take quite a +long time. + +.SS CHUNK-SIZE AND LAYOUT CHANGES + +Changing the chunk-size or layout without also changing the number of +devices at the same time will involve re-writing all blocks in-place. +To ensure against data loss in the case of a crash, a +.B --backup-file +must be provided for these changes. Small sections of the array will +be copied to the backup file while they are being rearranged. + +If the reshape is interrupted for any reason, this backup file must be +made available to +.B "mdadm --assemble" +so the array can be reassembled. Consequently the file cannot be +stored on the device being reshaped. + + +.SS BITMAP CHANGES + +A write-intent bitmap can be added to, or removed from, an active +array. Either internal bitmaps, or bitmaps stored in a separate file, +can be added.
Note that if you add a bitmap stored in a file which is +in a filesystem that is on the RAID array being affected, the system +will deadlock. The bitmap must be on a separate filesystem. + +.SH INCREMENTAL MODE + +.HP 12 +Usage: +.B mdadm \-\-incremental +.RB [ \-\-run ] +.RB [ \-\-quiet ] +.I component-device +.HP 12 +Usage: +.B mdadm \-\-incremental \-\-fail +.I component-device +.HP 12 +Usage: +.B mdadm \-\-incremental \-\-rebuild\-map +.HP 12 +Usage: +.B mdadm \-\-incremental \-\-run \-\-scan + +.PP +This mode is designed to be used in conjunction with a device +discovery system. As devices are found in a system, they can be +passed to +.B "mdadm \-\-incremental" +to be conditionally added to an appropriate array. + +Conversely, it can also be used with the +.B \-\-fail +flag to do just the opposite and find whatever array a particular device +is part of and remove the device from that array. + +If the device passed is a +.B CONTAINER +device created by a previous call to +.IR mdadm , +then rather than trying to add that device to an array, all the arrays +described by the metadata of the container will be started. + +.I mdadm +performs a number of tests to determine if the device is part of an +array, and which array it should be part of. If an appropriate array +is found, or can be created, +.I mdadm +adds the device to the array and conditionally starts the array. + +Note that +.I mdadm +will only add devices to an array which were previously working +(active or spare) parts of that array. It does not currently support +automatic inclusion of a new drive as a spare in some array. + +The tests that +.I mdadm +makes are as follows: +.IP + +Is the device permitted by +.BR mdadm.conf ? +That is, is it listed in a +.B DEVICES +line in that file. If +.B DEVICES +is absent then the default is to allow any device. Similarly if +.B DEVICES +contains the special word +.B partitions +then any device is allowed.
Otherwise the device name given to +.I mdadm +must match one of the names or patterns in a +.B DEVICES +line. + +.IP + +Does the device have a valid md superblock? If a specific metadata +version is requested with +.B \-\-metadata +or +.B \-e +then only that style of metadata is accepted, otherwise +.I mdadm +finds any known version of metadata. If no +.I md +metadata is found, the device is rejected. + +.ig +.IP + +Does the metadata match an expected array? +The metadata can match in two ways. Either there is an array listed +in +.B mdadm.conf +which identifies the array (either by UUID, by name, by device list, +or by minor-number), or the array was created with a +.B homehost +specified and that +.B homehost +matches the one in +.B mdadm.conf +or on the command line. +If +.I mdadm +is not able to positively identify the array as belonging to the +current host, the device will be rejected. +.. + +.I mdadm +keeps a list of arrays that it has partially assembled in +.B /var/run/mdadm/map +(or +.B /var/run/mdadm.map +if the directory doesn't exist. Or maybe even +.BR /dev/.mdadm.map ). +If no array exists which matches +the metadata on the new device, +.I mdadm +must choose a device name and unit number. It does this based on any +name given in +.B mdadm.conf +or any name information stored in the metadata. If this name +suggests a unit number, that number will be used, otherwise a free +unit number will be chosen. Normally +.I mdadm +will prefer to create a partitionable array, however if the +.B CREATE +line in +.B mdadm.conf +suggests that a non-partitionable array is preferred, that will be +honoured. + +If the array is not found in the config file and its metadata does not +identify it as belonging to the "homehost", then +.I mdadm +will choose a name for the array which is certain not to conflict with +any array which does belong to this host. It does this by adding an +underscore and a small number to the name preferred by the metadata. 
+ +Once an appropriate array is found or created and the device is added, +.I mdadm +must decide if the array is ready to be started. It will +normally compare the number of available (non-spare) devices to the +number of devices that the metadata suggests need to be active. If +there are at least that many, the array will be started. This means +that if any devices are missing the array will not be restarted. + +As an alternative, +.B \-\-run +may be passed to +.I mdadm +in which case the array will be run as soon as there are enough +devices present for the data to be accessible. For a RAID1, that +means one device will start the array. For a clean RAID5, the array +will be started as soon as all but one drive is present. + +Note that neither of these approaches is really ideal. If it can +be known that all device discovery has completed, then +.br +.B " mdadm \-IRs" +.br +can be run which will try to start all arrays that are being +incrementally assembled. They are started in "read-auto" mode in +which they are read-only until the first write request. This means +that no metadata updates are made and no attempt at resync or recovery +happens. Further devices that are found before the first write can +still be added safely. + +.SH ENVIRONMENT +This section describes environment variables that affect how mdadm +operates. + +.TP +.B MDADM_NO_MDMON +Setting this value to 1 will prevent mdadm from automatically launching +mdmon. This variable is intended primarily for debugging mdadm/mdmon. + +.TP +.B MDADM_NO_UDEV +Normally, +.I mdadm +does not create any device nodes in /dev, but leaves that task to +.IR udev . +If +.I udev +appears not to be configured, or if this environment variable is set +to '1', then +.I mdadm +will create any devices that are needed. + +.SH EXAMPLES + +.B " mdadm \-\-query /dev/name-of-device" +.br +This will find out if a given device is a RAID array, or is part of +one, and will provide brief information about the device. 
+ +.B " mdadm \-\-assemble \-\-scan" +.br +This will assemble and start all arrays listed in the standard config +file. This command will typically go in a system startup file. + +.B " mdadm \-\-stop \-\-scan" +.br +This will shut down all arrays that can be shut down (i.e. are not +currently in use). This will typically go in a system shutdown script. + +.B " mdadm \-\-follow \-\-scan \-\-delay=120" +.br +If (and only if) there is an Email address or program given in the +standard config file, then +monitor the status of all arrays listed in that file by +polling them every 2 minutes. + +.B " mdadm \-\-create /dev/md0 \-\-level=1 \-\-raid\-devices=2 /dev/hd[ac]1" +.br +Create /dev/md0 as a RAID1 array consisting of /dev/hda1 and /dev/hdc1. + +.br +.B " echo 'DEVICE /dev/hd*[0\-9] /dev/sd*[0\-9]' > mdadm.conf" +.br +.B " mdadm \-\-detail \-\-scan >> mdadm.conf" +.br +This will create a prototype config file that describes currently +active arrays that are known to be made from partitions of IDE or SCSI drives. +This file should be reviewed before being used as it may +contain unwanted detail. + +.B " echo 'DEVICE /dev/hd[a\-z] /dev/sd*[a\-z]' > mdadm.conf" +.br +.B " mdadm \-\-examine \-\-scan \-\-config=mdadm.conf >> mdadm.conf" +.br +This will find arrays which could be assembled from existing IDE and +SCSI whole drives (not partitions), and store the information in the +format of a config file. +This file is very likely to contain unwanted detail, particularly +the +.B devices= +entries. It should be reviewed and edited before being used as an +actual config file. + +.B " mdadm \-\-examine \-\-brief \-\-scan \-\-config=partitions" +.br +.B " mdadm \-Ebsc partitions" +.br +Create a list of devices by reading +.BR /proc/partitions , +scan these for RAID superblocks, and print out a brief listing of all +that were found. 
+ +.B " mdadm \-Ac partitions \-m 0 /dev/md0" +.br +Scan all partitions and devices listed in +.BR /proc/partitions +and assemble +.B /dev/md0 +out of all such devices with a RAID superblock with a minor number of 0. + +.B " mdadm \-\-monitor \-\-scan \-\-daemonise > /var/run/mdadm" +.br +If the config file contains a mail address or alert program, run mdadm in +the background in monitor mode monitoring all md devices. Also write +the pid of the mdadm daemon to +.BR /var/run/mdadm . + +.B " mdadm \-Iq /dev/somedevice" +.br +Try to incorporate a newly discovered device into some array as +appropriate. + +.B " mdadm \-\-incremental \-\-rebuild\-map \-\-run \-\-scan" +.br +Rebuild the array map from any current arrays, and then start any that +can be started. + +.B " mdadm /dev/md4 --fail detached --remove detached" +.br +Any devices which are components of /dev/md4 and have been detached will be marked as faulty +and then removed from the array. + +.B " mdadm --grow /dev/md4 --level=6 --backup-file=/root/backup-md4" +.br +The array +.B /dev/md4 +which is currently a RAID5 array will be converted to RAID6. There +should normally already be a spare drive attached to the array as a +RAID6 needs one more drive than a matching RAID5. + +.B " mdadm --create /dev/md/ddf --metadata=ddf --raid-disks 6 /dev/sd[a-f]" +.br +Create a DDF array over 6 devices. + +.B " mdadm --create /dev/md/home -n3 -l5 -z 30000000 /dev/md/ddf" +.br +Create a RAID5 array over any 3 devices in the given DDF set. Use +only 30 gigabytes of each device. + +.B " mdadm -A /dev/md/ddf1 /dev/sd[a-f]" +.br +Assemble a pre-existing DDF array. + +.B " mdadm -I /dev/md/ddf1" +.br +Assemble all arrays contained in the ddf array, assigning names as +appropriate. + +.B " mdadm \-\-create \-\-help" +.br +Provide help about the Create mode. + +.B " mdadm \-\-config \-\-help" +.br +Provide help about the format of the config file. + +.B " mdadm \-\-help" +.br +Provide general help. 
+ +.SH FILES + +.SS /proc/mdstat + +If you're using the +.B /proc +filesystem, +.B /proc/mdstat +lists all active md devices with information about them. +.I mdadm +uses this to find arrays when +.B \-\-scan +is given in Misc mode, and to monitor array reconstruction +in Monitor mode. + +.SS /etc/mdadm.conf + +The config file lists which devices may be scanned to see if +they contain an MD super block, and gives identifying information +(e.g. UUID) about known MD arrays. See +.BR mdadm.conf (5) +for more details. + +.SS /var/run/mdadm/map +When +.B \-\-incremental +mode is used, this file gets a list of arrays currently being created. +If +.B /var/run/mdadm +does not exist as a directory, then +.B /var/run/mdadm.map +is used instead. If +.B /var/run +is not available (as may be the case during early boot), +.B /dev/.mdadm.map +is used on the basis that +.B /dev +is usually available very early in boot. + +.SH DEVICE NAMES + +.I mdadm +understands two sorts of names for array devices. + +The first is the so-called 'standard' format name, which matches the +names used by the kernel and which appear in +.IR /proc/mdstat . + +The second sort can be freely chosen, but must reside in +.IR /dev/md/ . +When giving a device name to +.I mdadm +to create or assemble an array, either a full path name such as +.I /dev/md0 +or +.I /dev/md/home +can be given, or just the suffix of the second sort of name, such as +.I home +can be given. + +When +.I mdadm +chooses device names during auto-assembly or incremental assembly, it +will sometimes add a small sequence number to the end of the name to +avoid conflicts between multiple arrays that have the same name. If +.I mdadm +can reasonably determine that the array really is meant for this host, +either by a hostname in the metadata, or by the presence of the array +in /etc/mdadm.conf, then it will leave off the suffix if possible. 
+Also if the homehost is specified as +.B <ignore> +.I mdadm +will only use a suffix if a different array of the same name already +exists or is listed in the config file. + +The standard names for non-partitioned arrays (the only sort of md +array available in 2.4 and earlier) are of the form +.IP +/dev/mdNN +.PP +where NN is a number. +The standard names for partitionable arrays (as available from 2.6 +onwards) are of the form +.IP +/dev/md_dNN +.PP +Partition numbers should be indicated by adding "pMM" to these, thus "/dev/md/d1p2". +.PP +From kernel version 2.6.28, the "non-partitioned array" can actually +be partitioned. So the "md_dNN" names are no longer needed, and +partitions such as "/dev/mdNNpXX" are possible. + +.SH NOTE +.I mdadm +was previously known as +.IR mdctl . +.P +.I mdadm +is completely separate from the +.I raidtools +package, and does not use the +.I /etc/raidtab +configuration file at all. + +.SH SEE ALSO +For further information on mdadm usage, MD and the various levels of +RAID, see: +.IP +.B http://linux\-raid.osdl.org/ +.PP +(based upon Jakob \(/Ostergaard's Software\-RAID.HOWTO) +.\".PP +.\"for new releases of the RAID driver check out: +.\" +.\".IP +.\".UR ftp://ftp.kernel.org/pub/linux/kernel/people/mingo/raid-patches +.\"ftp://ftp.kernel.org/pub/linux/kernel/people/mingo/raid-patches +.\".UE +.\".PP +.\"or +.\".IP +.\".UR http://www.cse.unsw.edu.au/~neilb/patches/linux-stable/ +.\"http://www.cse.unsw.edu.au/~neilb/patches/linux-stable/ +.\".UE +.PP +The latest version of +.I mdadm +should always be available from +.IP +.B http://www.kernel.org/pub/linux/utils/raid/mdadm/ +.PP +Related man pages: +.PP +.IR mdmon (8), +.IR mdadm.conf (5), +.IR md (4). +.PP +.IR raidtab (5), +.IR raid0run (8), +.IR raidstop (8), +.IR mkraid (8). 
diff -Nru mdadm-2.6.7.1/mdadm.c mdadm-3.1.4/mdadm.c --- mdadm-2.6.7.1/mdadm.c 2008-10-15 08:04:09.000000000 +0300 +++ mdadm-3.1.4/mdadm.c 2010-08-31 09:14:47.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,12 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: * * Additions for bitmap and write-behind RAID options, Copyright (C) 2003-2004, * Paul Clements, SteelEye Technology, Inc. @@ -46,8 +41,10 @@ int chunk = 0; long long size = -1; + long long array_size = -1; int level = UnSet; int layout = UnSet; + char *layout_str = NULL; int raiddisks = 0; int max_disks = MD_SB_DISKS; /* just a default */ int sparedisks = 0; @@ -91,8 +88,10 @@ char *homehost = NULL; char sys_hostname[256]; + int require_homehost = 1; char *mailaddr = NULL; char *program = NULL; + int increments = 20; int delay = 0; int daemonise = 0; char *pidfile = NULL; @@ -104,8 +103,8 @@ int dosyslog = 0; int rebuild_map = 0; int auto_update_home = 0; + char *subarray = NULL; - int copies; int print_help = 0; FILE *outf; @@ -124,13 +123,15 @@ ident.bitmap_fd = -1; ident.bitmap_file = NULL; ident.name[0] = 0; + ident.container = NULL; + ident.member = NULL; while ((option_index = -1) , (opt=getopt_long(argc, argv, shortopt, long_options, &option_index)) != -1) { int newmode = mode; - /* firstly, some mode-independant options */ + /* firstly, some mode-independent options */ switch(opt) { case 'h': if (option_index > 0 && @@ -151,20 +152,20 @@ continue; case 'b': - if (mode == ASSEMBLE || mode == BUILD || mode == CREATE || mode == GROW) + if (mode == ASSEMBLE || mode == BUILD || mode == CREATE || 
mode == GROW || + mode == INCREMENTAL || mode == MANAGE) break; /* b means bitmap */ brief = 1; - if (optarg) { - fprintf(stderr, Name ": -b cannot have any extra immediately after it, sorry.\n"); - exit(2); - } continue; case 'Y': export++; continue; case HomeHost: - homehost = optarg; + if (strcasecmp(optarg, "") == 0) + require_homehost = 0; + else + homehost = optarg; continue; case ':': @@ -214,6 +215,17 @@ case 'o': case 'w': case 'W': + case Waitclean: + case DetailPlatform: + case KillSubarray: + case UpdateSubarray: + if (opt == KillSubarray || opt == UpdateSubarray) { + if (subarray) { + fprintf(stderr, Name ": subarray can only be specified once\n"); + exit(2); + } + subarray = optarg; + } case 'K': if (!mode) newmode = MISC; break; } if (mode && newmode == mode) { @@ -253,6 +265,7 @@ dv->writemostly = writemostly; dv->re_add = re_add; dv->used = 0; + dv->content = NULL; dv->next = NULL; *devlistend = dv; devlistend = &dv->next; @@ -261,7 +274,8 @@ continue; } /* No mode yet, and this is the second device ... 
*/ - fprintf(stderr, Name ": An option must be given to set the mode before a second device is listed\n"); + fprintf(stderr, Name ": An option must be given to set the mode before a second device\n" + " (%s) is listed\n", optarg); exit(2); } if (option_index >= 0) @@ -305,6 +319,8 @@ dv->disposition = devmode; dv->writemostly = writemostly; dv->re_add = re_add; + dv->used = 0; + dv->content = NULL; dv->next = NULL; *devlistend = dv; devlistend = &dv->next; @@ -317,6 +333,7 @@ * could depend on the mode */ #define O(a,b) ((a<<8)|b) switch (O(mode,opt)) { + case O(GROW,'c'): case O(CREATE,'c'): case O(BUILD,'c'): /* chunk or rounding */ if (chunk) { @@ -332,9 +349,11 @@ } continue; +#if 0 case O(ASSEMBLE,AutoHomeHost): auto_update_home = 1; continue; +#endif case O(INCREMENTAL, 'e'): case O(CREATE,'e'): case O(ASSEMBLE,'e'): @@ -360,8 +379,15 @@ writemostly = 1; continue; + case O(MANAGE,'w'): + /* clear write-mostly for following devices */ + writemostly = 2; + continue; + + case O(GROW,'z'): - case O(CREATE,'z'): /* size */ + case O(CREATE,'z'): + case O(BUILD,'z'): /* size */ if (size >= 0) { fprintf(stderr, Name ": size may only be specified once. " "Second value is %s.\n", optarg); @@ -370,16 +396,36 @@ if (strcmp(optarg, "max")==0) size = 0; else { - size = strtoll(optarg, &c, 10); - if (!optarg[0] || *c || size < 4) { + size = parse_size(optarg); + if (size < 8) { fprintf(stderr, Name ": invalid size: %s\n", optarg); exit(2); } + /* convert sectors to K */ + size /= 2; } continue; - case O(GROW,'l'): /* hack - needed to understand layout */ + case O(GROW,'Z'): /* array size */ + if (array_size >= 0) { + fprintf(stderr, Name ": array-size may only be specified once. 
" + "Second value is %s.\n", optarg); + exit(2); + } + if (strcmp(optarg, "max") == 0) + array_size = 0; + else { + array_size = parse_size(optarg); + if (array_size <= 0) { + fprintf(stderr, Name ": invalid array size: %s\n", + optarg); + exit(2); + } + } + continue; + + case O(GROW,'l'): case O(CREATE,'l'): case O(BUILD,'l'): /* set raid level*/ if (level != UnSet) { @@ -393,7 +439,10 @@ optarg); exit(2); } - if (level != 0 && level != -1 && level != 1 && level != -4 && level != -5 && mode == BUILD) { + if (level != 0 && level != LEVEL_LINEAR && level != 1 && + level != LEVEL_MULTIPATH && level != LEVEL_FAULTY && + level != 10 && + mode == BUILD) { fprintf(stderr, Name ": Raid level %s not permitted with --build.\n", optarg); exit(2); @@ -406,9 +455,18 @@ ident.level = level; continue; + case O(GROW, 'p'): /* new layout */ + if (layout_str) { + fprintf(stderr,Name ": layout may only be sent once. " + "Second value was %s\n", optarg); + exit(2); + } + layout_str = optarg; + /* 'Grow' will parse the value */ + continue; + case O(CREATE,'p'): /* raid5 layout */ case O(BUILD,'p'): /* faulty layout */ - case O(GROW, 'p'): /* faulty reconfig */ if (layout != UnSet) { fprintf(stderr,Name ": layout may only be sent once. 
" "Second value was %s\n", optarg); @@ -424,7 +482,6 @@ exit(2); case 5: - case 6: layout = map_name(r5layout, optarg); if (layout==UnSet) { fprintf(stderr, Name ": layout %s not understood for raid5.\n", @@ -432,40 +489,33 @@ exit(2); } break; + case 6: + layout = map_name(r6layout, optarg); + if (layout==UnSet) { + fprintf(stderr, Name ": layout %s not understood for raid6.\n", + optarg); + exit(2); + } + break; case 10: - /* 'f', 'o' or 'n' followed by a number <= raid_disks */ - if ((optarg[0] != 'n' && optarg[0] != 'f' && optarg[0] != 'o') || - (copies = strtoul(optarg+1, &cp, 10)) < 1 || - copies > 200 || - *cp) { + layout = parse_layout_10(optarg); + if (layout < 0) { fprintf(stderr, Name ": layout for raid10 must be 'nNN', 'oNN' or 'fNN' where NN is a number, not %s\n", optarg); exit(2); } - if (optarg[0] == 'n') - layout = 256 + copies; - else if (optarg[0] == 'o') - layout = 0x10000 + (copies<<8) + 1; - else - layout = 1 + (copies<<8); break; - case -5: /* Faulty - * modeNNN - */ - - { - int ln = strcspn(optarg, "0123456789"); - char *m = strdup(optarg); - int mode; - m[ln] = 0; - mode = map_name(faultylayout, m); - if (mode == UnSet) { + case LEVEL_FAULTY: + /* Faulty + * modeNNN + */ + layout = parse_layout_faulty(optarg); + if (layout == -1) { fprintf(stderr, Name ": layout %s not understood for faulty.\n", optarg); exit(2); } - layout = mode | (atoi(optarg+ln)<< ModeShift); - } + break; } continue; @@ -549,11 +599,16 @@ case O(CREATE,'N'): case O(ASSEMBLE,'N'): + case O(MISC,'N'): if (ident.name[0]) { fprintf(stderr, Name ": name cannot be set twice. 
" "Second value %s.\n", optarg); exit(2); } + if (mode == MISC && !subarray) { + fprintf(stderr, Name ": -N/--name only valid with --update-subarray in misc mode\n"); + exit(2); + } if (strlen(optarg) > 32) { fprintf(stderr, Name ": name '%s' is too long, 32 chars max.\n", optarg); @@ -580,11 +635,16 @@ continue; case O(ASSEMBLE,'U'): /* update the superblock */ + case O(MISC,'U'): if (update) { fprintf(stderr, Name ": Can only update one aspect of superblock, both %s and %s given.\n", update, optarg); exit(2); } + if (mode == MISC && !subarray) { + fprintf(stderr, Name ": Only subarrays can be updated in misc mode\n"); + exit(2); + } update = optarg; if (strcmp(update, "sparc2.2")==0) continue; @@ -631,12 +691,15 @@ " 'summaries', 'homehost', 'byteorder', 'devicesize'.\n"); exit(outf == stdout ? 0 : 2); + case O(INCREMENTAL,NoDegraded): + fprintf(stderr, Name ": --no-degraded is deprecated in Incremental mode\n"); case O(ASSEMBLE,NoDegraded): /* --no-degraded */ runstop = -1; /* --stop isn't allowed for --assemble, * so we overload slightly */ continue; case O(ASSEMBLE,'c'): /* config file */ + case O(INCREMENTAL, 'c'): case O(MISC, 'c'): case O(MONITOR,'c'): if (configfile) { @@ -671,6 +734,14 @@ program = optarg; continue; + case O(MONITOR,'r'): /* rebuild increments */ + increments = atoi(optarg); + if (increments>99 || increments<1) { + fprintf(stderr, Name ": please specify positive integer between 1 and 99 as rebuild increments.\n"); + exit(2); + } + continue; + case O(MONITOR,'d'): /* delay in seconds */ case O(GROW, 'd'): case O(BUILD,'d'): /* delay for bitmap updates */ @@ -724,6 +795,9 @@ devmode = 'r'; continue; case O(MANAGE,'f'): /* set faulty */ + case O(INCREMENTAL,'f'): /* r for incremental is taken, use f + * even though we will both fail and + * remove the device */ devmode = 'f'; continue; case O(INCREMENTAL,'R'): @@ -744,20 +818,8 @@ } runstop = -1; continue; - - case O(MANAGE,'o'): - if (readonly < 0) { - fprintf(stderr, Name ": Cannot have 
both readonly and readwrite\n"); - exit(2); - } - readonly = 1; - continue; - case O(MANAGE,'w'): - if (readonly > 0) { - fprintf(stderr, Name ": Cannot have both readwrite and readonly.\n"); - exit(2); - } - readonly = -1; + case O(MANAGE,'t'): + test = 1; continue; case O(MISC,'Q'): @@ -770,10 +832,23 @@ case O(MISC,'o'): case O(MISC,'w'): case O(MISC,'W'): + case O(MISC, Waitclean): + case O(MISC, DetailPlatform): + case O(MISC, KillSubarray): + case O(MISC, UpdateSubarray): if (devmode && devmode != opt && (devmode == 'E' || (opt == 'E' && devmode != 'Q'))) { - fprintf(stderr, Name ": --examine/-E cannot be given with -%c\n", - devmode =='E'?opt:devmode); + fprintf(stderr, Name ": --examine/-E cannot be given with "); + if (devmode == 'E') { + if (option_index >= 0) + fprintf(stderr, "--%s\n", + long_options[option_index].name); + else + fprintf(stderr, "-%c\n", opt); + } else if (isalpha(devmode)) + fprintf(stderr, "-%c\n", devmode); + else + fprintf(stderr, "previous option\n"); exit(2); } devmode = opt; @@ -835,7 +910,8 @@ continue; } /* probable typo */ - fprintf(stderr, Name ": bitmap file must contain a '/', or be 'internal', or 'none'\n"); + fprintf(stderr, Name ": bitmap file must contain a '/', or be 'internal', or 'none'\n" + " not '%s'\n", optarg); exit(2); case O(GROW,BitmapChunk): @@ -946,16 +1022,36 @@ fprintf(stderr, Name ": --super-minor=dev is incompatible with --auto\n"); exit(2); } - if (mode == MANAGE || mode == GROW) - autof=1; /* Don't create */ - mdfd = open_mddev(devlist->devname, autof); - if (mdfd < 0) + if (mode == MANAGE || mode == GROW) { + mdfd = open_mddev(devlist->devname, 1); + if (mdfd < 0) + exit(1); + } else + /* non-existent device is OK */ + mdfd = open_mddev(devlist->devname, 0); + if (mdfd == -2) { + fprintf(stderr, Name ": device %s exists but is not an " + "md array.\n", devlist->devname); exit(1); + } if ((int)ident.super_minor == -2) { struct stat stb; + if (mdfd < 0) { + fprintf(stderr, Name ": --super-minor=dev 
given, and " + "listed device %s doesn't exist.\n", + devlist->devname); + exit(1); + } fstat(mdfd, &stb); ident.super_minor = minor(stb.st_rdev); } + if (mdfd >= 0 && mode != MANAGE && mode != GROW) { + /* We don't really want this open yet, we just might + * have wanted to check some things + */ + close(mdfd); + mdfd = -1; + } } if (raiddisks) { @@ -980,14 +1076,22 @@ } if (homehost == NULL) - homehost = conf_get_homehost(); - if (homehost && strcmp(homehost, "")==0) { + homehost = conf_get_homehost(&require_homehost); + if (homehost == NULL || strcmp(homehost, "")==0) { if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) { sys_hostname[sizeof(sys_hostname)-1] = 0; homehost = sys_hostname; } } + if ((mode != MISC || devmode != 'E') && + geteuid() != 0) { + fprintf(stderr, Name ": must be super-user to perform this action\n"); + exit(1); + } + + ident.autof = autof; + rv = 0; switch(mode) { case MANAGE: @@ -996,7 +1100,7 @@ rv = Manage_ro(devlist->devname, mdfd, readonly); if (!rv && devs_found>1) rv = Manage_subdevs(devlist->devname, mdfd, - devlist->next, verbose-quiet); + devlist->next, verbose-quiet, test); if (!rv && readonly < 0) rv = Manage_ro(devlist->devname, mdfd, readonly); if (!rv && runstop) @@ -1011,22 +1115,23 @@ fprintf(stderr, Name ": %s not identified in config file.\n", devlist->devname); rv |= 1; - } else { - mdfd = open_mddev(devlist->devname, - array_ident->autof ? 
array_ident->autof : autof); - if (mdfd < 0) - rv |= 1; - else { - rv |= Assemble(ss, devlist->devname, mdfd, array_ident, - NULL, backup_file, - readonly, runstop, update, homehost, verbose-quiet, force); + if (mdfd >= 0) close(mdfd); - } + } else { + if (array_ident->autof == 0) + array_ident->autof = autof; + rv |= Assemble(ss, devlist->devname, array_ident, + NULL, backup_file, + readonly, runstop, update, + homehost, require_homehost, + verbose-quiet, force); } } else if (!scan) - rv = Assemble(ss, devlist->devname, mdfd, &ident, + rv = Assemble(ss, devlist->devname, &ident, devlist->next, backup_file, - readonly, runstop, update, homehost, verbose-quiet, force); + readonly, runstop, update, + homehost, require_homehost, + verbose-quiet, force); else if (devs_found>0) { if (update && devs_found > 1) { fprintf(stderr, Name ": can only update a single array at a time\n"); @@ -1044,21 +1149,19 @@ rv |= 1; continue; } - mdfd = open_mddev(dv->devname, - array_ident->autof ?array_ident->autof : autof); - if (mdfd < 0) { - rv |= 1; - continue; - } - rv |= Assemble(ss, dv->devname, mdfd, array_ident, + if (array_ident->autof == 0) + array_ident->autof = autof; + rv |= Assemble(ss, dv->devname, array_ident, NULL, backup_file, - readonly, runstop, update, homehost, verbose-quiet, force); - close(mdfd); + readonly, runstop, update, + homehost, require_homehost, + verbose-quiet, force); } } else { - mddev_ident_t array_list = conf_get_ident(NULL); + mddev_ident_t a, array_list = conf_get_ident(NULL); mddev_dev_t devlist = conf_get_devs(); int cnt = 0; + int failures, successes; if (devlist == NULL) { fprintf(stderr, Name ": No devices listed in conf file were found.\n"); exit(1); @@ -1071,29 +1174,41 @@ fprintf(stderr, Name ": --backup_file not meaningful with a --scan assembly.\n"); exit(1); } - for (; array_list; array_list = array_list->next) { - mdu_array_info_t array; - mdfd = open_mddev(array_list->devname, - array_list->autof ? 
array_list->autof : autof); - if (mdfd < 0) { - rv |= 1; - continue; - } - if (ioctl(mdfd, GET_ARRAY_INFO, &array)>=0) - /* already assembled, skip */ + for (a = array_list; a ; a = a->next) { + a->assembled = 0; + if (a->autof == 0) + a->autof = autof; + } + do { + failures = 0; + successes = 0; + rv = 0; + for (a = array_list; a ; a = a->next) { + int r; + if (a->assembled) + continue; + if (a->devname && + strcasecmp(a->devname, "") == 0) + continue; + + r = Assemble(ss, a->devname, + a, + NULL, NULL, + readonly, runstop, NULL, + homehost, require_homehost, + verbose-quiet, force); + if (r == 0) { + a->assembled = 1; + successes++; + } else + failures++; + rv |= r; cnt++; - else { - rv |= Assemble(ss, array_list->devname, mdfd, - array_list, - NULL, NULL, - readonly, runstop, NULL, homehost, verbose-quiet, force); - if (rv == 0) cnt++; } - close(mdfd); - } - if (homehost) { + } while (failures && successes); + if (homehost && cnt == 0) { /* Maybe we can auto-assemble something. - * Repeatedly call Assemble in auto-assmble mode + * Repeatedly call Assemble in auto-assemble mode * until it fails */ int rv2; @@ -1103,10 +1218,12 @@ mddev_dev_t devlist = conf_get_devs(); acnt = 0; do { - rv2 = Assemble(ss, NULL, -1, + rv2 = Assemble(ss, NULL, &ident, devlist, NULL, - readonly, runstop, NULL, homehost, verbose-quiet, force); + readonly, runstop, NULL, + homehost, require_homehost, + verbose-quiet, force); if (rv2==0) { cnt++; acnt++; @@ -1119,15 +1236,18 @@ } while (rv2!=2); /* Incase there are stacked devices, we need to go around again */ } while (acnt); +#if 0 if (cnt == 0 && auto_update_home && homehost) { /* Nothing found, maybe we need to bootstrap homehost info */ do { acnt = 0; do { - rv2 = Assemble(ss, NULL, -1, + rv2 = Assemble(ss, NULL, &ident, NULL, NULL, - readonly, runstop, "homehost", homehost, verbose-quiet, force); + readonly, runstop, "homehost", + homehost, require_homehost, + verbose-quiet, force); if (rv2==0) { cnt++; acnt++; @@ -1136,6 +1256,7 
@@ /* Incase there are stacked devices, we need to go around again */ } while (acnt); } +#endif if (cnt == 0 && rv == 0) { fprintf(stderr, Name ": No arrays found in config file or automatically\n"); rv = 1; @@ -1155,7 +1276,7 @@ break; } if (raiddisks == 0) { - fprintf(stderr, Name ": no raid-disks specified.\n"); + fprintf(stderr, Name ": no raid-devices specified.\n"); rv = 1; break; } @@ -1167,9 +1288,10 @@ break; } } - rv = Build(devlist->devname, mdfd, chunk, level, layout, + rv = Build(devlist->devname, chunk, level, layout, raiddisks, devlist->next, assume_clean, - bitmap_file, bitmap_chunk, write_behind, delay, verbose-quiet); + bitmap_file, bitmap_chunk, write_behind, + delay, verbose-quiet, autof, size); break; case CREATE: if (delay == 0) delay = DEFAULT_BITMAP_DELAY; @@ -1179,16 +1301,16 @@ break; } if (raiddisks == 0) { - fprintf(stderr, Name ": no raid-disks specified.\n"); + fprintf(stderr, Name ": no raid-devices specified.\n"); rv = 1; break; } - rv = Create(ss, devlist->devname, mdfd, chunk, level, layout, size<0 ? 0 : size, + rv = Create(ss, devlist->devname, chunk, level, layout, size<0 ? 0 : size, raiddisks, sparedisks, ident.name, homehost, ident.uuid_set ? ident.uuid : NULL, devs_found-1, devlist->next, runstop, verbose-quiet, force, assume_clean, - bitmap_file, bitmap_chunk, write_behind, delay); + bitmap_file, bitmap_chunk, write_behind, delay, autof); break; case MISC: if (devmode == 'E') { @@ -1207,24 +1329,51 @@ rv = Examine(devlist, scan?(verbose>1?0:verbose+1):brief, export, scan, SparcAdjust, ss, homehost); + } else if (devmode == DetailPlatform) { + rv = Detail_Platform(ss ? ss->ss : NULL, ss ? 
scan : 1, verbose); } else { if (devlist == NULL) { - if (devmode=='D' && scan) { - /* apply --detail to all devices in /proc/mdstat */ + if ((devmode=='D' || devmode == Waitclean) && scan) { + /* apply --detail or --wait-clean to + * all devices in /proc/mdstat + */ struct mdstat_ent *ms = mdstat_read(0, 1); struct mdstat_ent *e; + struct map_ent *map = NULL; + int members; + int v = verbose>1?0:verbose+1; + + for (members = 0; members <= 1; members++) { for (e=ms ; e ; e=e->next) { - char *name = get_md_name(e->devnum); + char *name; + struct map_ent *me; + int member = e->metadata_version && + strncmp(e->metadata_version, + "external:/", 10) == 0; + if (members != member) + continue; + me = map_by_devnum(&map, e->devnum); + if (me && me->path + && strcmp(me->path, "/unknown") != 0) + name = me->path; + else + name = get_md_name(e->devnum); if (!name) { fprintf(stderr, Name ": cannot find device file for %s\n", e->dev); continue; } - rv |= Detail(name, verbose>1?0:verbose+1, - export, test, homehost); + if (devmode == 'D') + rv |= Detail(name, v, + export, test, + homehost); + else + rv |= WaitClean(name, -1, v); put_md_name(name); } + } + free_mdstat(ms); } else if (devmode == 'S' && scan) { /* apply --stop to all devices in /proc/mdstat */ /* Due to possible stacking of devices, repeat until @@ -1257,6 +1406,7 @@ put_md_name(name); } + free_mdstat(ms); } while (!last && err); if (err) rv |= 1; } else { @@ -1272,13 +1422,37 @@ export, test, homehost); continue; case 'K': /* Zero superblock */ - rv |= Kill(dv->devname, force, quiet); continue; + if (ss) + rv |= Kill(dv->devname, ss, force, quiet,0); + else { + int q = quiet; + do { + rv |= Kill(dv->devname, NULL, force, q, 0); + q = 1; + } while (rv == 0); + rv &= ~2; + } + continue; case 'Q': rv |= Query(dv->devname); continue; case 'X': rv |= ExamineBitmap(dv->devname, brief, ss); continue; case 'W': rv |= Wait(dv->devname); continue; + case Waitclean: + rv |= WaitClean(dv->devname, -1, verbose-quiet); 
continue; + case KillSubarray: + rv |= Kill_subarray(dv->devname, subarray, quiet); + continue; + case UpdateSubarray: + if (update == NULL) { + fprintf(stderr, + Name ": -U/--update must be specified with --update-subarray\n"); + rv |= 1; + continue; + } + rv |= Update_subarray(dv->devname, subarray, update, &ident, quiet); + continue; } mdfd = open_mddev(dv->devname, 1); if (mdfd>=0) { @@ -1309,17 +1483,55 @@ rv = 1; break; } + if (delay == 0) { + if (get_linux_version() > 20616) + /* mdstat responds to poll */ + delay = 1000; + else + delay = 60; + } rv= Monitor(devlist, mailaddr, program, delay?delay:60, daemonise, scan, oneshot, - dosyslog, test, pidfile); + dosyslog, test, pidfile, increments); break; case GROW: + if (array_size >= 0) { + /* alway impose array size first, independent of + * anything else + * Do not allow level or raid_disks changes at the + * same time as that can be irreversibly destructive. + */ + struct mdinfo sra; + int err; + if (raiddisks || level != UnSet) { + fprintf(stderr, Name ": cannot change array size in same operation " + "as changing raiddisks or level.\n" + " Change size first, then check that data is still intact.\n"); + rv = 1; + break; + } + sysfs_init(&sra, mdfd, 0); + if (array_size == 0) + err = sysfs_set_str(&sra, NULL, "array_size", "default"); + else + err = sysfs_set_num(&sra, NULL, "array_size", array_size / 2); + if (err < 0) { + if (errno == E2BIG) + fprintf(stderr, Name ": --array-size setting" + " is too large.\n"); + else + fprintf(stderr, Name ": current kernel does" + " not support setting --array-size\n"); + rv = 1; + break; + } + } if (devs_found > 1) { /* must be '-a'. 
*/ - if (size >= 0 || raiddisks) { - fprintf(stderr, Name ": --size, --raiddisks, and --add are exclusing in --grow mode\n"); + if (size >= 0 || raiddisks || chunk || layout_str != NULL || bitmap_file) { + fprintf(stderr, Name ": --add cannot be used with other geometry changes in --grow mode\n"); rv = 1; break; } @@ -1328,20 +1540,21 @@ if (rv) break; } - } else if ((size >= 0) + (raiddisks != 0) + (layout != UnSet) + (bitmap_file != NULL)> 1) { - fprintf(stderr, Name ": can change at most one of size, raiddisks, bitmap, and layout\n"); - rv = 1; - break; - } else if (layout != UnSet) - rv = Manage_reconfig(devlist->devname, mdfd, layout); - else if (size >= 0 || raiddisks) - rv = Grow_reshape(devlist->devname, mdfd, quiet, backup_file, - size, level, layout, chunk, raiddisks); - else if (bitmap_file) { - if (delay == 0) delay = DEFAULT_BITMAP_DELAY; + } else if (bitmap_file) { + if (size >= 0 || raiddisks || chunk || layout_str != NULL) { + fprintf(stderr, Name ": --bitmap changes cannot be used with other geometry changes in --grow mode\n"); + rv = 1; + break; + } + if (delay == 0) + delay = DEFAULT_BITMAP_DELAY; rv = Grow_addbitmap(devlist->devname, mdfd, bitmap_file, bitmap_chunk, delay, write_behind, force); - } else + } else if (size >= 0 || raiddisks != 0 || layout_str != NULL + || chunk != 0 || level != UnSet) { + rv = Grow_reshape(devlist->devname, mdfd, quiet, backup_file, + size, level, layout_str, chunk, raiddisks); + } else if (array_size < 0) fprintf(stderr, Name ": no changes to --grow\n"); break; case INCREMENTAL: @@ -1354,6 +1567,11 @@ ": --incremental --scan meaningless without --run.\n"); break; } + if (devmode == 'f') { + fprintf(stderr, Name + ": --incremental --scan --fail not supported.\n"); + break; + } rv = IncrementalScan(verbose); } if (!devlist) { @@ -1370,8 +1588,12 @@ rv = 1; break; } + if (devmode == 'f') { + rv = IncrementalRemove(devlist->devname, verbose-quiet); + break; + } rv = Incremental(devlist->devname, verbose-quiet, 
runstop, - ss, homehost, autof); + ss, homehost, require_homehost, autof); break; case AUTODETECT: autodetect(); diff -Nru mdadm-2.6.7.1/mdadm.conf.5 mdadm-3.1.4/mdadm.conf.5 --- mdadm-2.6.7.1/mdadm.conf.5 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/mdadm.conf.5 2010-08-26 05:24:15.000000000 +0300 @@ -8,10 +8,10 @@ .SH NAME mdadm.conf \- configuration for management of Software RAID with mdadm .SH SYNOPSIS -/etc/mdadm/mdadm.conf +/etc/mdadm.conf .SH DESCRIPTION .PP -.B mdadm +.I mdadm is a tool for creating, managing, and monitoring RAID devices using the .B md driver in Linux. @@ -40,7 +40,7 @@ line lists the devices (whole devices or partitions) that might contain a component of an MD array. When looking for the components of an array, -.B mdadm +.I mdadm will scan these devices (or any devices listed on the command line). The @@ -53,9 +53,20 @@ Alternatively, a .B device -line can contain the word +line can contain either or both of the words +.B containers +and .BR partitions . -This will cause +The word +.B containers +will cause +.I mdadm +to look for assembled CONTAINER arrays and include them as a source +for assembling further arrays. + +The word +.I partitions +will cause .I mdadm to read .I /proc/partitions @@ -67,7 +78,7 @@ .I /dev to find the name that matches the numbers. -If no DEVICE line is present, then "DEVICE partitions" is assumed. +If no DEVICE line is present, then "DEVICE partitions containers" is assumed. For example: .IP @@ -75,22 +86,35 @@ .br DEV /dev/sd* .br -DEVICE /dev/discs/disc*/disc +DEVICE /dev/disk/by-path/pci* .br DEVICE partitions .TP .B ARRAY The ARRAY lines identify actual arrays. The second word on the line -should be the name of the device where the array is normally +may be the name of the device where the array is normally assembled, such as -.BR /dev/md1 . +.B /dev/md1 +or +.BR /dev/md/backup . +If the name does not start with a slash +.RB (' / '), +it is treated as being in +.BR /dev/md/ .
+Alternately the word +.B <ignore> +(complete with angle brackets) can be given in which case any array which matches the rest of the line will never be automatically assembled. +If no device name is given, +.I mdadm +will use various heuristics to determine an appropriate name. + Subsequent words identify the array, or identify the array as a member of a group. If multiple identities are given, then a component device must match ALL identities to be considered a match. Each identity word has a tag, and equals sign, and some value. The tags are: - .RS 4 .TP .B uuid= @@ -135,6 +159,7 @@ .TP .B spares= The value is a number of spare devices to expect the array to have. +The sole use of this keyword and value is as follows: .B mdadm \-\-monitor will report an array if it is found to have fewer than this number of spares when @@ -150,17 +175,22 @@ .B spare\-group name are considered to be part of the same group. The significance of a group of arrays is that -.B mdadm +.I mdadm will, when monitoring the arrays, move a spare drive from one array in a group to another array in that group if the first array had a failed or missing drive but no spare. .TP .B auto= -This option declares to -.B mdadm -that it should try to create the device file of the array if it -doesn't already exist, or exists but with the wrong device number. +This option is rarely needed with mdadm-3.0, particularly if used with +the Linux kernel v2.6.28 or later. +It tells +.I mdadm +whether to use partitionable arrays or non-partitionable arrays and, +in the absence of +.IR udev , +how many partition devices to create. From 2.6.28 all md array +devices are partitionable, hence this option is not needed. The value of this option can be "yes" or "md" to indicate that a traditional, non-partitionable md array should be created, or "mdp", @@ -189,6 +219,18 @@ recognised for comparability with the output of .BR "mdadm \-Es" . +.TP +.B container= +Specify that this array is a member array of some container.
The +value given can be either a path name in /dev, or a UUID of the +container array. + +.TP +.B member= +Specify that this array is a member array of some container. Each +type of container has some way to enumerate member arrays, often a +simple sequence number. The value identifies which member of a +container the array is. It will usually accompany a "container=" word. .RE .TP @@ -197,7 +239,7 @@ .B mailaddr line gives an E-mail address that alerts should be sent to when -.B mdadm +.I mdadm is running in .B \-\-monitor mode (and was given the @@ -294,26 +336,94 @@ The .B homehost line gives a default value for the -.B --homehost= -option to mdadm. There should be exactly one other word on the line. -It should either exactly +.B \-\-homehost= +option to mdadm. There should normally be only one other word on the line. +It should either be a host name, or one of the special words .B <system> -or a host name. +and +.BR <ignore> . If .B <system> is given, then the .BR gethostname ( 2 ) systemcall is used to get the host name. + +If +.B <ignore> +is given, then a flag is set so that when arrays are being +auto-assembled the checking of the recorded +.I homehost +is disabled. +If +.B <ignore> +is given it is also possible to give an explicit name which will be +used when creating arrays. This is the only case when there can be +more than one other word on the +.B HOMEHOST +line. + When arrays are created, this host name will be stored in the -metadata. When arrays are assembled using auto-assembly, only arrays -with this host name stored in the metadata will be considered. +metadata. When arrays are assembled using auto-assembly, arrays which +do not record the correct homehost name in their metadata will be +assembled using a "foreign" name. A "foreign" name always ends with a +digit string preceded by an underscore to differentiate it +from any possible local name. e.g. +.B /dev/md/1_1 +or +.BR /dev/md/home_0 .
+.TP +.B AUTO +A list of names of metadata format can be given, each preceded by a +plus or minus sign. Also the word +.I homehost +is allowed as is +.I all +preceded by plus or minus sign. +.I all +is usually last. + +When +.I mdadm +is auto-assembling an array, either via +.I \-\-assemble +or +.I \-\-incremental +and it finds metadata of a given type, it checks that metadata type +against those listed in this line. The first match wins, where +.I all +matches anything. +If a match is found that was preceded by a plus sign, the auto +assembly is allowed. If the match was preceded by a minus sign, the +auto assembly is disallowed. If no match is found, the auto assembly +is allowed. + +If the metadata indicates that the array was created for +.I this +host, and the word +.I homehost +appears before any other match, then the array is treated as a valid +candidate for auto-assembly. + +This can be used to disable all auto-assembly (so that only arrays +explicitly listed in mdadm.conf or on the command line are assembled), +or to disable assembly of certain metadata types which might be +handled by other software. It can also be used to disable assembly of +all foreign arrays - normally such arrays are assembled but given a +non-deterministic name in +.BR /dev/md/ . + +The known metadata types are +.BR 0.90 , +.BR 1.x , +.BR ddf , +.BR imsm . .SH EXAMPLE DEVICE /dev/sd[bcdjkl]1 .br DEVICE /dev/hda1 /dev/hdb1 -# /dev/md0 is known by its UID. +# /dev/md0 is known by its UUID. 
.br ARRAY /dev/md0 UUID=3aaa0122:29827cfa:5331ad66:ca767371 .br @@ -354,6 +464,8 @@ CREATE group=system mode=0640 auto=part\-8 .br HOMEHOST <system> +.br +AUTO +1.x homehost -all .SH SEE ALSO .BR mdadm (8), diff -Nru mdadm-2.6.7.1/mdadm.conf-example mdadm-3.1.4/mdadm.conf-example --- mdadm-2.6.7.1/mdadm.conf-example 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/mdadm.conf-example 2010-08-26 05:24:15.000000000 +0300 @@ -27,6 +27,14 @@ #DEVICE /dev/discs/*/* # # +# The AUTO line can control which arrays get assembled by auto-assembly, +# meaning either "mdadm -As" when there are no 'ARRAY' lines in this file, +# or "mdadm --incremental" when the array found is not listed in this file. +# By default, all arrays that are found are assembled. +# If you want to ignore all DDF arrays (maybe they are managed by dmraid), +# and only assemble 1.x arrays which are marked for 'this' homehost, +# but assemble all others, then use +#AUTO -ddf homehost -1.x +all # # ARRAY lines specify an array to assemble and a method of identification. # Arrays can currently be identified by using a UUID, superblock minor number, diff -Nru mdadm-2.6.7.1/mdadm.h mdadm-3.1.4/mdadm.h --- mdadm-2.6.7.1/mdadm.h 2008-10-15 08:29:37.000000000 +0300 +++ mdadm-3.1.4/mdadm.h 2010-08-31 10:18:39.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays.
* - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,12 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #define _GNU_SOURCE @@ -73,9 +68,35 @@ #define DEFAULT_BITMAP_DELAY 5 #define DEFAULT_MAX_WRITE_BEHIND 256 +/* MAP_DIR should be somewhere that persists across the pivotroot + * from early boot to late boot. + * Currently /dev seems to be the only option on most distros. + */ +#ifndef MAP_DIR +#define MAP_DIR "/dev/.mdadm" +#endif /* MAP_DIR */ +/* MAP_FILE is what we name the map file we put in MAP_DIR, in case you + * want something other than the default of "map" + */ +#ifndef MAP_FILE +#define MAP_FILE "map" +#endif /* MAP_FILE */ +/* MDMON_DIR is where pid and socket files used for communicating + * with mdmon normally live. It *should* be /var/run, but when + * mdmon is needed at early boot then it needs to write there prior + * to /var/run being mounted read/write, and it also then needs to + * persist beyond when /var/run is mounter read-only. So, to be + * safe, the default is somewhere that is read/write early in the + * boot process and stays up as long as possible during shutdown. 
+ */ +#ifndef MDMON_DIR +#define MDMON_DIR "/dev/.mdadm/" +#endif /* MDMON_DIR */ + #include "md_u.h" #include "md_p.h" #include "bitmap.h" +#include "msg.h" #include /* Redhat don't like to #include , and @@ -106,6 +127,13 @@ #define __le16_to_cpu(_x) (_x) #define __le32_to_cpu(_x) (_x) #define __le64_to_cpu(_x) (_x) + +#define __cpu_to_be16(_x) bswap_16(_x) +#define __cpu_to_be32(_x) bswap_32(_x) +#define __cpu_to_be64(_x) bswap_64(_x) +#define __be16_to_cpu(_x) bswap_16(_x) +#define __be32_to_cpu(_x) bswap_32(_x) +#define __be64_to_cpu(_x) bswap_64(_x) #elif BYTE_ORDER == BIG_ENDIAN #define __cpu_to_le16(_x) bswap_16(_x) #define __cpu_to_le32(_x) bswap_32(_x) @@ -113,12 +141,35 @@ #define __le16_to_cpu(_x) bswap_16(_x) #define __le32_to_cpu(_x) bswap_32(_x) #define __le64_to_cpu(_x) bswap_64(_x) + +#define __cpu_to_be16(_x) (_x) +#define __cpu_to_be32(_x) (_x) +#define __cpu_to_be64(_x) (_x) +#define __be16_to_cpu(_x) (_x) +#define __be32_to_cpu(_x) (_x) +#define __be64_to_cpu(_x) (_x) #else # error "unknown endianness." #endif #endif /* __KLIBC__ */ +/* + * min()/max()/clamp() macros that also do + * strict type-checking.. See the + * "unnecessary" pointer comparison. + */ +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; }) /* general information that might be extracted from a superblock */ struct mdinfo { @@ -128,18 +179,53 @@ int uuid[4]; char name[33]; unsigned long long data_offset; - unsigned long long component_size; + unsigned long long component_size; /* same as array.size, except in + * sectors and up to 64bits. 
+ */ + unsigned long long custom_array_size; /* size for non-default sized + * arrays (in sectors) + */ int reshape_active; unsigned long long reshape_progress; + union { + unsigned long long resync_start; /* per-array resync position */ + unsigned long long recovery_start; /* per-device rebuild position */ + #define MaxSector (~0ULL) /* resync/recovery complete position */ + }; + unsigned long safe_mode_delay; /* ms delay to mark clean */ int new_level, delta_disks, new_layout, new_chunk; int errors; - int cache_size; /* size of raid456 stripe cache*/ + unsigned long cache_size; /* size of raid456 stripe cache*/ int mismatch_cnt; char text_version[50]; - + void *update_private; /* for passing metadata-format + * specific update data + * between successive calls to + * update_super() + */ + + int container_member; /* for assembling external-metatdata arrays + * This is to be used internally by metadata + * handler only */ + int container_enough; /* flag external handlers can set to + * indicate that subarrays have not enough (-1), + * enough to start (0), or all expected disks (1) */ char sys_name[20]; struct mdinfo *devs; struct mdinfo *next; + + /* Device info for mdmon: */ + int recovery_fd; + int state_fd; + #define DS_FAULTY 1 + #define DS_INSYNC 2 + #define DS_WRITE_MOSTLY 4 + #define DS_SPARE 8 + #define DS_BLOCKED 16 + #define DS_REMOVE 1024 + #define DS_UNBLOCK 2048 + int prev_state, curr_state, next_state; + }; struct createinfo { @@ -176,6 +262,7 @@ /* for option that don't have short equivilents, we assign arbitrary * small numbers. '1' means an undecorated option, so we start at '2'. + * (note we must stop before we get to 65 i.e. 
'A') */ enum special_options { AssumeClean = 2, @@ -184,11 +271,15 @@ ReAdd, NoDegraded, Sparc22, - BackupFile, + BackupFile, /* 8 */ HomeHost, AutoHomeHost, Symlinks, AutoDetect, + Waitclean, + DetailPlatform, + KillSubarray, + UpdateSubarray, /* 16 */ }; /* structures read from config file */ @@ -209,21 +300,32 @@ int uuid[4]; char name[33]; - unsigned int super_minor; + int super_minor; char *devices; /* comma separated list of device * names with wild cards */ int level; - unsigned int raid_disks; - unsigned int spare_disks; + int raid_disks; + int spare_disks; struct supertype *st; int autof; /* 1 for normal, 2 for partitioned */ char *spare_group; char *bitmap_file; int bitmap_fd; + char *container; /* /dev/whatever name of container, or + * uuid of container. You would expect + * this to be the 'devname' or UUID + * of some other entry. + */ + char *member; /* subarray within a container */ + struct mddev_ident_s *next; + union { + /* fields needed by different users of this structure */ + int assembled; /* set when assembly succeeds */ + }; } *mddev_ident_t; /* List of device names - wildcards expanded */ @@ -232,9 +334,11 @@ char disposition; /* 'a' for add, 'r' for remove, 'f' for fail. * Not set for names read from .config */ - char writemostly; + char writemostly; /* 1 for 'set writemostly', 2 for 'clear writemostly' */ char re_add; char used; /* set when used */ + struct mdinfo *content; /* If devname is a container, this might list + * the remaining member arrays. 
*/ struct mddev_dev_s *next; } *mddev_dev_t; @@ -252,63 +356,101 @@ char *pattern; /* U or up, _ for down */ int percent; /* -1 if no resync */ int resync; /* 1 if resync, 0 if recovery */ + int devcnt; + int raid_disks; + int chunk_size; + char * metadata_version; + struct dev_member { + char *name; + struct dev_member *next; + } *members; struct mdstat_ent *next; }; extern struct mdstat_ent *mdstat_read(int hold, int start); extern void free_mdstat(struct mdstat_ent *ms); extern void mdstat_wait(int seconds); +extern void mdstat_wait_fd(int fd, const sigset_t *sigmask); extern int mddev_busy(int devnum); +extern struct mdstat_ent *mdstat_by_component(char *name); struct map_ent { struct map_ent *next; int devnum; - int major,minor; + char metadata[20]; int uuid[4]; + int bad; char *path; }; -extern int map_update(struct map_ent **mpp, int devnum, int major, int minor, +extern int map_update(struct map_ent **mpp, int devnum, char *metadata, int uuid[4], char *path); +extern void map_remove(struct map_ent **map, int devnum); extern struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]); +extern struct map_ent *map_by_devnum(struct map_ent **map, int devnum); +extern struct map_ent *map_by_name(struct map_ent **map, char *name); extern void map_read(struct map_ent **melp); extern int map_write(struct map_ent *mel); extern void map_delete(struct map_ent **mapp, int devnum); extern void map_free(struct map_ent *map); extern void map_add(struct map_ent **melp, - int devnum, int major, int minor, int uuid[4], char *path); + int devnum, char *metadata, int uuid[4], char *path); +extern int map_lock(struct map_ent **melp); +extern void map_unlock(struct map_ent **melp); /* various details can be requested */ -#define GET_LEVEL 1 -#define GET_LAYOUT 2 -#define GET_COMPONENT 4 -#define GET_CHUNK 8 -#define GET_CACHE 16 -#define GET_MISMATCH 32 -#define GET_VERSION 64 - -#define GET_DEVS 1024 /* gets role, major, minor */ -#define GET_OFFSET 2048 -#define GET_SIZE 
4096 -#define GET_STATE 8192 -#define GET_ERROR 16384 +enum sysfs_read_flags { + GET_LEVEL = (1 << 0), + GET_LAYOUT = (1 << 1), + GET_COMPONENT = (1 << 2), + GET_CHUNK = (1 << 3), + GET_CACHE = (1 << 4), + GET_MISMATCH = (1 << 5), + GET_VERSION = (1 << 6), + GET_DISKS = (1 << 7), + GET_DEGRADED = (1 << 8), + GET_SAFEMODE = (1 << 9), + GET_DEVS = (1 << 10), /* gets role, major, minor */ + GET_OFFSET = (1 << 11), + GET_SIZE = (1 << 12), + GET_STATE = (1 << 13), + GET_ERROR = (1 << 14), +}; /* If fd >= 0, get the array it is open on, * else use devnum. >=0 -> major9. <0..... */ +extern int sysfs_open(int devnum, char *devname, char *attr); +extern void sysfs_init(struct mdinfo *mdi, int fd, int devnum); extern void sysfs_free(struct mdinfo *sra); extern struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options); +extern int sysfs_attr_match(const char *attr, const char *str); +extern int sysfs_match_word(const char *word, char **list); extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, char *name, char *val); extern int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev, char *name, unsigned long long val); +extern int sysfs_uevent(struct mdinfo *sra, char *event); +extern int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev, + char *name); +extern int sysfs_fd_get_ll(int fd, unsigned long long *val); extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, char *name, unsigned long long *val); +extern int sysfs_fd_get_str(int fd, char *val, int size); +extern int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val, int size); +extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms); +extern int sysfs_set_array(struct mdinfo *info, int vers); +extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume); +extern int sysfs_disk_to_scsi_id(int fd, __u32 *id); +extern int sysfs_unique_holder(int devnum, long rdev); +extern int load_sys(char *path, char *buf); extern int 
save_stripes(int *source, unsigned long long *offsets, int raid_disks, int chunk_size, int level, int layout, int nwrites, int *dest, - unsigned long long start, unsigned long long length); + unsigned long long start, unsigned long long length, + char *buf); extern int restore_stripes(int *dest, unsigned long long *offsets, int raid_disks, int chunk_size, int level, int layout, int source, unsigned long long read_offset, @@ -322,32 +464,133 @@ extern char *map_num(mapping_t *map, int num); extern int map_name(mapping_t *map, char *name); -extern mapping_t r5layout[], pers[], modes[], faultylayout[]; +extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[]; extern char *map_dev(int major, int minor, int create); +struct active_array; +struct metadata_update; +/* A superswitch provides entry point the a metadata handler. + * + * The super_switch primarily operates on some "metadata" that + * is accessed via the 'supertype'. + * This metadata has one of three possible sources. + * 1/ It is read from a single device. In this case it may not completely + * describe the array or arrays as some information might be on other + * devices. + * 2/ It is read from all devices in a container. In this case all + * information is present. + * 3/ It is created by ->init_super / ->add_to_super. In this case it will + * be complete once enough ->add_to_super calls have completed. + * + * When creating an array inside a container, the metadata will be + * formed by a combination of 2 and 3. The metadata or the array is read, + * then new information is added. + * + * The metadata must sometimes have a concept of a 'current' array + * and a 'current' device. + * The 'current' array is set by init_super to be the newly created array, + * or is set by super_by_fd when it finds it is looking at an array inside + * a container. 
+ * + * The 'current' device is either the device that the metadata was read from + * in case 1, or the last device added by add_to_super in case 3. + * Case 2 does not identify a 'current' device. + */ extern struct superswitch { + + /* Used to report details of metadata read from a component + * device. ->load_super has been called. + */ void (*examine_super)(struct supertype *st, char *homehost); - void (*brief_examine_super)(struct supertype *st); + void (*brief_examine_super)(struct supertype *st, int verbose); + void (*brief_examine_subarrays)(struct supertype *st, int verbose); void (*export_examine_super)(struct supertype *st); + + /* Used to report details of an active array. + * ->load_super was possibly given a 'component' string. + */ void (*detail_super)(struct supertype *st, char *homehost); void (*brief_detail_super)(struct supertype *st); void (*export_detail_super)(struct supertype *st); + + /* Optional: platform hardware / firmware details */ + int (*detail_platform)(int verbose, int enumerate_only); + + /* Used: + * to get uuid to storing in bitmap metadata + * and 'reshape' backup-data metadata + * To see if a device is being re-added to an array it was part of. + */ void (*uuid_from_super)(struct supertype *st, int uuid[4]); + + /* Extract generic details from metadata. This could be details about + * the container, or about an individual array within the container. + * The determination is made either by: + * load_super being given a 'component' string. + * validate_geometry determining what to create. + * The info includes both array information and device information. + * The particular device should be: + * The last device added by add_to_super + * The device the metadata was loaded from by load_super + */ void (*getinfo_super)(struct supertype *st, struct mdinfo *info); + + /* Check if the given metadata is flagged as belonging to "this" + * host. 
0 for 'no', 1 for 'yes', -1 for "Don't record homehost" + */ int (*match_home)(struct supertype *st, char *homehost); + + /* Make one of several generic modifications to metadata + * prior to assembly (or other times). + * sparc2.2 - first bug in early 0.90 metadata + * super-minor - change name of 0.90 metadata + * summaries - 'correct' any redundant data + * resync - mark array as dirty to trigger a resync. + * uuid - set new uuid - only 0.90 or 1.x + * name - change the name of the array (where supported) + * homehost - change which host this array is tied to. + * devicesize - If metadata is at start of device, change recorded + * device size to match actual device size + * byteorder - swap bytes for 0.90 metadata + * + * force-one - mark that device as uptodate, not old or failed. + * force-array - mark array as clean if it would not otherwise + * assemble + * assemble - not sure how this is different from force-one... + * linear-grow-new - add a new device to a linear array, but don't + * change the size: so superblock still matches + * linear-grow-update - now change the size of the array. + */ int (*update_super)(struct supertype *st, struct mdinfo *info, char *update, char *devname, int verbose, int uuid_set, char *homehost); + + /* Create new metadata for new array as described. This could + * be a new container, or an array in a pre-existing container. + * Also used to zero metadata prior to writing it to invalidate old + * metadata. + */ int (*init_super)(struct supertype *st, mdu_array_info_t *info, unsigned long long size, char *name, char *homehost, int *uuid); - void (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo); + + /* update the metadata to include new device, either at create or + * when hot-adding a spare. + */ + int (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo, + int fd, char *devname); + + /* Write metadata to one device when fixing problems or adding + * a new device. 
+ */ int (*store_super)(struct supertype *st, int fd); - int (*write_init_super)(struct supertype *st, mdu_disk_info_t *dinfo, - char *devname); + + /* Write all metadata for this array. + */ + int (*write_init_super)(struct supertype *st); int (*compare_super)(struct supertype *st, struct supertype *tst); int (*load_super)(struct supertype *st, int fd, char *devname); struct supertype * (*match_metadata_desc)(char *arg); @@ -358,15 +601,129 @@ void (*locate_bitmap)(struct supertype *st, int fd); int (*write_bitmap)(struct supertype *st, int fd); void (*free_super)(struct supertype *st); - int major; + + /* validate_geometry is called with an st returned by + * match_metadata_desc. + * It should check that the geometry described is compatible with + * the metadata type. It will be called repeatedly as devices are + * added to validate changing size and new devices. If there are + * inter-device dependencies, it should record sufficient details + * so these can be validated. + * Both 'size' and '*freesize' are in sectors. chunk is bytes. + */ + int (*validate_geometry)(struct supertype *st, int level, int layout, + int raiddisks, + int chunk, unsigned long long size, + char *subdev, unsigned long long *freesize, + int verbose); + + struct mdinfo *(*container_content)(struct supertype *st); + /* Allow a metadata handler to override mdadm's default layouts */ + int (*default_layout)(int level); /* optional */ + /* query the supertype for default chunk size */ + int (*default_chunk)(struct supertype *st); /* optional */ + /* Permit subarrays to be deleted from inactive containers */ + int (*kill_subarray)(struct supertype *st); /* optional */ + /* Permit subarrays to be modified */ + int (*update_subarray)(struct supertype *st, char *update, mddev_ident_t ident); /* optional */ + +/* for mdmon */ + int (*open_new)(struct supertype *c, struct active_array *a, + char *inst); + + /* Tell the metadata handler the current state of the array.
+ * This covers whether it is known to be consistent (no pending writes) + * and how far along a resync is known to have progressed + * (in a->resync_start). + * resync status is really irrelevant if the array is not consistent, + * but some metadata (DDF!) have a place to record the distinction. + * If 'consistent' is '2', then the array can mark it dirty if a + * resync/recovery/whatever is required, or leave it clean if not. + * Return value is 0 if dirty (not consistent) and 1 if clean. + * it is only really important if consistent is passed in as '2'. + */ + int (*set_array_state)(struct active_array *a, int consistent); + + /* When the state of a device might have changed, we call set_disk to + * tell the metadata what the current state is. + * Typically this happens on spare->in_sync and (spare|in_sync)->faulty + * transitions. + * set_disk might be called when the state of the particular disk has + * not in fact changed. + */ + void (*set_disk)(struct active_array *a, int n, int state); + void (*sync_metadata)(struct supertype *st); + void (*process_update)(struct supertype *st, + struct metadata_update *update); + void (*prepare_update)(struct supertype *st, + struct metadata_update *update); + + /* activate_spare will check if the array is degraded and, if it + * is, try to find some spare space in the container. + * On success, it adds appropriate updates (for process_update) + * to the 'updates' list and returns a list of 'mdinfo' identifying + * the device, or devices as there might be multiple missing + * devices and multiple spares available.
+ */ + struct mdinfo *(*activate_spare)(struct active_array *a, + struct metadata_update **updates); + int swapuuid; /* true if uuid is bigending rather than hostendian */ -} super0, super1, *superlist[]; + int external; + const char *name; /* canonical metadata name */ +} super0, super1, super_ddf, *superlist[]; + +extern struct superswitch super_imsm; + +struct metadata_update { + int len; + char *buf; + void *space; /* allocated space that monitor will use */ + struct metadata_update *next; +}; +/* A supertype holds a particular collection of metadata. + * It identifies the metadata type by the superswitch, and the particular + * sub-version of that metadata type. + * metadata read in or created is stored in 'sb' and 'info'. + * There are also fields used by mdmon to track containers. + * + * A supertype may refer to: + * Just an array, possibly in a container + * A container, not identifying any particular array + * Info read from just one device, not yet fully describing the array/container. + * + * + * A supertype is created by: + * super_by_fd + * guess_super + * dup_super + */ struct supertype { struct superswitch *ss; int minor_version; int max_devs; + int container_dev; /* devnum of container */ + char subarray[32]; /* name of array inside container */ void *sb; + void *info; + int loaded_container; /* Set if load_super found a container, + * not just one device */ + + struct metadata_update *updates; + struct metadata_update **update_tail; + + /* extra stuff used by mdmon */ + struct active_array *arrays; + int sock; /* listen to external programs */ + int devnum; + char *devname; /* e.g. md0.
This appears in metadata_version: + * external:/md0/12 + */ + int devcnt; + + struct mdinfo *devs; + }; extern struct supertype *super_by_fd(int fd); @@ -375,6 +732,7 @@ extern int get_dev_size(int fd, char *dname, unsigned long long *sizep); extern void get_one_disk(int mdfd, mdu_array_info_t *ainf, mdu_disk_info_t *disk); +void wait_for(char *dev, int fd); #if __GNUC__ < 3 struct stat64; @@ -383,8 +741,11 @@ #define HAVE_NFTW we assume #define HAVE_FTW -#ifdef UCLIBC +#ifdef __UCLIBC__ # include +# ifndef __UCLIBC_HAS_LFS__ +# define lseek64 lseek +# endif # ifndef __UCLIBC_HAS_FTW__ # undef HAVE_FTW # undef HAVE_NFTW @@ -417,56 +778,64 @@ extern int Manage_ro(char *devname, int fd, int readonly); extern int Manage_runstop(char *devname, int fd, int runstop, int quiet); extern int Manage_resize(char *devname, int fd, long long size, int raid_disks); -extern int Manage_reconfig(char *devname, int fd, int layout); extern int Manage_subdevs(char *devname, int fd, - mddev_dev_t devlist, int verbose); + mddev_dev_t devlist, int verbose, int test); extern int autodetect(void); extern int Grow_Add_device(char *devname, int fd, char *newdev); extern int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int write_behind, int force); extern int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, long long size, - int level, int layout, int chunksize, int raid_disks); + int level, char *layout_str, int chunksize, int raid_disks); extern int Grow_restart(struct supertype *st, struct mdinfo *info, - int *fdlist, int cnt, char *backup_file); + int *fdlist, int cnt, char *backup_file, int verbose); +extern int Grow_continue(int mdfd, struct supertype *st, + struct mdinfo *info, char *backup_file); - -extern int Assemble(struct supertype *st, char *mddev, int mdfd, +extern int Assemble(struct supertype *st, char *mddev, mddev_ident_t ident, mddev_dev_t devlist, char *backup_file, int readonly, int runstop, - char *update, char *homehost, +
char *update, char *homehost, int require_homehost, int verbose, int force); -extern int Build(char *mddev, int mdfd, int chunk, int level, int layout, - int raiddisks, - mddev_dev_t devlist, int assume_clean, - char *bitmap_file, int bitmap_chunk, int write_behind, int delay, int verbose); +extern int Build(char *mddev, int chunk, int level, int layout, + int raiddisks, mddev_dev_t devlist, int assume_clean, + char *bitmap_file, int bitmap_chunk, int write_behind, + int delay, int verbose, int autof, unsigned long long size); -extern int Create(struct supertype *st, char *mddev, int mdfd, +extern int Create(struct supertype *st, char *mddev, int chunk, int level, int layout, unsigned long long size, int raiddisks, int sparedisks, char *name, char *homehost, int *uuid, int subdevs, mddev_dev_t devlist, int runstop, int verbose, int force, int assume_clean, - char *bitmap_file, int bitmap_chunk, int write_behind, int delay); + char *bitmap_file, int bitmap_chunk, int write_behind, int delay, int autof); extern int Detail(char *dev, int brief, int export, int test, char *homehost); +extern int Detail_Platform(struct superswitch *ss, int scan, int verbose); extern int Query(char *dev); extern int Examine(mddev_dev_t devlist, int brief, int export, int scan, int SparcAdjust, struct supertype *forcest, char *homehost); extern int Monitor(mddev_dev_t devlist, char *mailaddr, char *alert_cmd, int period, int daemonise, int scan, int oneshot, - int dosyslog, int test, char *pidfile); + int dosyslog, int test, char *pidfile, int increments); -extern int Kill(char *dev, int force, int quiet); +extern int Kill(char *dev, struct supertype *st, int force, int quiet, int noexcl); +extern int Kill_subarray(char *dev, char *subarray, int quiet); +extern int Update_subarray(char *dev, char *subarray, char *update, mddev_ident_t ident, int quiet); extern int Wait(char *dev); +extern int WaitClean(char *dev, int sock, int verbose); extern int Incremental(char *devname, int verbose, 
int runstop, - struct supertype *st, char *homehost, int autof); + struct supertype *st, char *homehost, int require_homehost, + int autof); +extern int Incremental_container(struct supertype *st, char *devname, + int verbose, int runstop, int autof, + int trustworthy); extern void RebuildMap(void); extern int IncrementalScan(int verbose); - +extern int IncrementalRemove(char *devname, int verbose); extern int CreateBitmap(char *filename, int force, char uuid[16], unsigned long chunksize, unsigned long daemon_sleep, unsigned long write_behind, @@ -478,42 +847,71 @@ extern int md_get_version(int fd); extern int get_linux_version(void); +extern long long parse_size(char *size); extern int parse_uuid(char *str, int uuid[4]); +extern int parse_layout_10(char *layout); +extern int parse_layout_faulty(char *layout); extern int check_ext2(int fd, char *name); extern int check_reiser(int fd, char *name); extern int check_raid(int fd, char *name); +extern int check_partitions(int fd, char *dname, unsigned long long freesize); extern int get_mdp_major(void); extern int dev_open(char *dev, int flags); +extern int open_dev(int devnum); +extern int open_dev_excl(int devnum); extern int is_standard(char *dev, int *nump); +extern int same_dev(char *one, char *two); extern int parse_auto(char *str, char *msg, int config); extern mddev_ident_t conf_get_ident(char *dev); extern mddev_dev_t conf_get_devs(void); extern int conf_test_dev(char *devname); +extern int conf_test_metadata(const char *version, int is_homehost); extern struct createinfo *conf_get_create_info(void); extern void set_conffile(char *file); extern char *conf_get_mailaddr(void); extern char *conf_get_mailfrom(void); extern char *conf_get_program(void); -extern char *conf_get_homehost(void); +extern char *conf_get_homehost(int *require_homehostp); extern char *conf_line(FILE *file); extern char *conf_word(FILE *file, int allow_key); +extern int conf_name_is_free(char *name); +extern int devname_matches(char *name, 
char *match); +extern struct mddev_ident_s *conf_match(struct mdinfo *info, struct supertype *st); + extern void free_line(char *line); extern int match_oneof(char *devices, char *devname); extern void uuid_from_super(int uuid[4], mdp_super_t *super); +extern const int uuid_match_any[4]; extern int same_uuid(int a[4], int b[4], int swapuuid); extern void copy_uuid(void *a, int b[4], int swapuuid); +extern char *__fname_from_uuid(int id[4], int swap, char *buf, char sep); +extern char *fname_from_uuid(struct supertype *st, + struct mdinfo *info, char *buf, char sep); extern unsigned long calc_csum(void *super, int bytes); extern int enough(int level, int raid_disks, int layout, int clean, char *avail, int avail_disks); extern int ask(char *mesg); extern unsigned long long get_component_size(int fd); extern void remove_partitions(int fd); - +extern int test_partition(int fd); +extern unsigned long long calc_array_size(int level, int raid_disks, int layout, + int chunksize, unsigned long long devsize); +extern int flush_metadata_updates(struct supertype *st); +extern void append_metadata_update(struct supertype *st, void *buf, int len); +extern int assemble_container_content(struct supertype *st, int mdfd, + struct mdinfo *content, int runstop, + char *chosen_name, int verbose); + +extern int add_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info); +extern int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info); +unsigned long long min_recovery_start(struct mdinfo *array); extern char *human_size(long long bytes); -char *human_size_brief(long long bytes); +extern char *human_size_brief(long long bytes); +extern void print_r10_layout(int layout); #define NoMdDev (1<<23) extern int find_free_devnum(int use_partitions); @@ -523,15 +921,91 @@ extern char DefaultConfFile[]; -extern int open_mddev(char *dev, int autof); -extern int open_mddev_devnum(char *devname, int devnum, char *name, - char *chosen_name, int parts); - +extern 
int create_mddev(char *dev, char *name, int autof, int trustworthy, + char *chosen); +/* values for 'trustworthy' */ +#define LOCAL 1 +#define LOCAL_ANY 10 +#define FOREIGN 2 +#define METADATA 3 +extern int open_mddev(char *dev, int report_errors); +extern int open_container(int fd); +extern int is_container_member(struct mdstat_ent *ent, char *devname); +extern int is_subarray_active(char *subarray, char *devname); +int is_container_active(char *devname); +extern int open_subarray(char *dev, struct supertype *st, int quiet); +extern struct superswitch *version_to_superswitch(char *vers); + +extern int mdmon_running(int devnum); +extern int mdmon_pid(int devnum); +extern int check_env(char *name); +extern __u32 random32(void); +extern int start_mdmon(int devnum); + +extern char *devnum2devname(int num); +extern int devname2devnum(char *name); +extern int stat2devnum(struct stat *st); +extern int fd2devnum(int fd); + +static inline int dev2major(int d) +{ + if (d >= 0) + return MD_MAJOR; + else + return get_mdp_major(); +} + +static inline int dev2minor(int d) +{ + if (d >= 0) + return d; + return (-1-d) << MdpMinorShift; +} + +static inline int ROUND_UP(int a, int base) +{ + return ((a+base-1)/base)*base; +} + +static inline int is_subarray(char *vers) +{ + /* The version string for a 'subarray' (an array in a container) + * is + * /containername/componentname for normal read-write arrays + * -containername/componentname for read-only arrays. + * containername is e.g. md0, md_d1 + * componentname is dependant on the metadata. e.g. '1' 'S1' ... + */ + return (*vers == '/' || *vers == '-'); +} + +#ifdef DEBUG +#define dprintf(fmt, arg...) \ + fprintf(stderr, fmt, ##arg) +#else +#define dprintf(fmt, arg...) \ + ({ if (0) fprintf(stderr, fmt, ##arg); 0; }) +#endif +#include +#include +static inline int xasprintf(char **strp, const char *fmt, ...) 
{ + va_list ap; + int ret; + va_start(ap, fmt); + ret = vasprintf(strp, fmt, ap); + va_end(ap); + assert(ret >= 0); + return ret; +} #define LEVEL_MULTIPATH (-4) #define LEVEL_LINEAR (-1) #define LEVEL_FAULTY (-5) +/* kernel module doesn't know about these */ +#define LEVEL_CONTAINER (-100) +#define LEVEL_UNSUPPORTED (-200) + /* faulty stuff */ @@ -562,8 +1036,51 @@ #define makedev(M,m) (((M)<<8) | (m)) #endif -/* for raid5 */ +/* for raid4/5/6 */ #define ALGORITHM_LEFT_ASYMMETRIC 0 #define ALGORITHM_RIGHT_ASYMMETRIC 1 #define ALGORITHM_LEFT_SYMMETRIC 2 #define ALGORITHM_RIGHT_SYMMETRIC 3 + +/* Define non-rotating (raid4) algorithms. These allow + * conversion of raid4 to raid5. + */ +#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */ +#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */ + +/* DDF RAID6 layouts differ from md/raid6 layouts in two ways. + * Firstly, the exact positioning of the parity block is slightly + * different between the 'LEFT_*' modes of md and the "_N_*" modes + * of DDF. + * Secondly, the order of datablocks over which the Q syndrome is computed + * is different. + * Consequently we have different layouts for DDF/raid6 than md/raid6. + * These layouts are from the DDFv1.2 spec. + * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but + * leaves RLQ=3 as 'Vendor Specific' + */ + +#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */ +#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ +#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */ + + +/* For every RAID5 algorithm we define a RAID6 algorithm + * with exactly the same layout for data and parity, and + * with the Q block always on the last device (N-1).
+ * This allows trivial conversion from RAID5 to RAID6 + */ +#define ALGORITHM_LEFT_ASYMMETRIC_6 16 +#define ALGORITHM_RIGHT_ASYMMETRIC_6 17 +#define ALGORITHM_LEFT_SYMMETRIC_6 18 +#define ALGORITHM_RIGHT_SYMMETRIC_6 19 +#define ALGORITHM_PARITY_0_6 20 +#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N + +/* Define PATH_MAX in case we don't use glibc or standard library does + * not have PATH_MAX defined. Assume max path length is 4K characters. + */ +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + diff -Nru mdadm-2.6.7.1/mdadm.spec mdadm-3.1.4/mdadm.spec --- mdadm-2.6.7.1/mdadm.spec 2008-10-15 08:29:37.000000000 +0300 +++ mdadm-3.1.4/mdadm.spec 2010-08-31 10:21:13.000000000 +0300 @@ -1,6 +1,6 @@ Summary: mdadm is used for controlling Linux md devices (aka RAID arrays) Name: mdadm -Version: 2.6.7.1 +Version: 3.1.4 Release: 1 Source: http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tgz URL: http://neil.brown.name/blog/mdadm diff -Nru mdadm-2.6.7.1/mdassemble.8 mdadm-3.1.4/mdassemble.8 --- mdadm-2.6.7.1/mdassemble.8 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/mdassemble.8 2010-08-31 10:21:13.000000000 +0300 @@ -1,5 +1,5 @@ .\" -*- nroff -*- -.TH MDASSEMBLE 8 "" v2.6.7.1 +.TH MDASSEMBLE 8 "" v3.1.4 .SH NAME mdassemble \- assemble MD devices .I aka @@ -40,7 +40,7 @@ .SH FILES -.SS /etc/mdadm/mdadm.conf +.SS /etc/mdadm.conf The config file lists which devices may be scanned to see if they contain MD super block, and gives identifying information diff -Nru mdadm-2.6.7.1/mdassemble.c mdadm-3.1.4/mdassemble.c --- mdadm-2.6.7.1/mdassemble.c 2008-10-15 06:34:28.000000000 +0300 +++ mdadm-3.1.4/mdassemble.c 2010-08-26 05:24:15.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdassemble - assemble Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * Copyright (C) 2003 Luca Berra * * @@ -20,12 +20,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" @@ -33,7 +28,7 @@ /* from readme.c */ mapping_t pers[] = { - { "linear", -1}, + { "linear", LEVEL_LINEAR}, { "raid0", 0}, { "0", 0}, { "stripe", 0}, @@ -44,8 +39,8 @@ { "4", 4}, { "raid5", 5}, { "5", 5}, - { "multipath", -4}, - { "mp", -4}, + { "multipath", LEVEL_MULTIPATH}, + { "mp", LEVEL_MULTIPATH}, { "raid6", 6}, { "6", 6}, { "raid10", 10}, @@ -55,9 +50,9 @@ #ifndef MDASSEMBLE_AUTO /* from mdopen.c */ -int open_mddev(char *dev, int autof/*unused */) +int open_mddev(char *dev, int report_errors/*unused*/) { - int mdfd = open(dev, O_RDWR, 0); + int mdfd = open(dev, O_RDWR); if (mdfd < 0) fprintf(stderr, Name ": error opening %s: %s\n", dev, strerror(errno)); @@ -69,7 +64,21 @@ } return mdfd; } +int create_mddev(char *dev, char *name, int autof/*unused*/, int trustworthy, + char *chosen) +{ + return open_mddev(dev, 0); +} #endif +int map_update(struct map_ent **mpp, int devnum, char *metadata, + int *uuid, char *path) +{ + return 0; +} +struct map_ent *map_by_name(struct map_ent **mpp, char *name) +{ + return NULL; +} int rv; int mdfd = -1; @@ -86,19 +95,19 @@ } else for (; array_list; array_list = array_list->next) { mdu_array_info_t array; - mdfd = open_mddev(array_list->devname, array_list->autof); - if (mdfd < 0) { - rv |= 1; + if (strcasecmp(array_list->devname, "") == 0) continue; - } - if (ioctl(mdfd, GET_ARRAY_INFO, &array) < 0) { - rv |= Assemble(array_list->st, array_list->devname, mdfd, - array_list, NULL, NULL, - readonly, runstop, NULL, NULL, verbose, force); - } else { + mdfd = open_mddev(array_list->devname, 0); + if (mdfd >= 0 && ioctl(mdfd, 
GET_ARRAY_INFO, &array) == 0) { rv |= Manage_ro(array_list->devname, mdfd, -1); /* make it readwrite */ + continue; } - close(mdfd); + if (mdfd >= 0) + close(mdfd); + rv |= Assemble(array_list->st, array_list->devname, + array_list, NULL, NULL, + readonly, runstop, NULL, NULL, 0, + verbose, force); } return rv; } diff -Nru mdadm-2.6.7.1/mdmon.8 mdadm-3.1.4/mdmon.8 --- mdadm-2.6.7.1/mdmon.8 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/mdmon.8 2010-08-31 10:21:13.000000000 +0300 @@ -0,0 +1,248 @@ +.\" See file COPYING in distribution for details. +.TH MDMON 8 "" v3.1.4 +.SH NAME +mdmon \- monitor MD external metadata arrays + +.SH SYNOPSIS + +.BI mdmon " [--all] [--takeover] CONTAINER" + +.SH OVERVIEW +The 2.6.27 kernel brings the ability to support external metadata arrays. +External metadata implies that user space handles all updates to the metadata. +The kernel's responsibility is to notify user space when a "metadata event" +occurs, like disk failures and clean-to-dirty transitions. The kernel, in +important cases, waits for user space to take action on these notifications. + +.SH DESCRIPTION +.SS Metadata updates: +To service metadata update requests a daemon, +.IR mdmon , +is introduced. +.I Mdmon +is tasked with polling the sysfs namespace looking for changes in +.BR array_state , +.BR sync_action , +and per disk +.BR state +attributes. When a change is detected it calls a per metadata type +handler to make modifications to the metadata. The following actions +are taken: +.RS +.TP +.B array_state \- inactive +Clear the dirty bit for the volume and let the array be stopped +.TP +.B array_state \- write pending +Set the dirty bit for the array and then set +.B array_state +to +.BR active . +Writes +are blocked until userspace writes +.BR active. 
+.TP +.B array_state \- active-idle +The safe mode timer has expired so set array state to clean to block writes to the array +.TP +.B array_state \- clean +Clear the dirty bit for the volume +.TP +.B array_state \- read-only +This is the initial state that all arrays start at. +.I mdmon +takes one of the three actions: +.RS +.TP +1/ +Transition the array to read-auto keeping the dirty bit clear if the metadata +handler determines that the array does not need resyncing or other modification +.TP +2/ +Transition the array to active if the metadata handler determines a resync or +some other manipulation is necessary +.TP +3/ +Leave the array read\-only if the volume is marked to not be monitored; for +example, the metadata version has been set to "external:\-dev/md127" instead of +"external:/dev/md127" +.RE +.TP +.B sync_action \- resync\-to\-idle +Notify the metadata handler that a resync may have completed. If a resync +process is idled before it completes this event allows the metadata handler to +checkpoint resync. +.TP +.B sync_action \- recover\-to\-idle +A spare may have completed rebuilding so tell the metadata handler about the +state of each disk. This is the metadata handler's opportunity to clear +any "out-of-sync" bits and clear the volume's degraded status. If a recovery +process is idled before it completes this event allows the metadata handler to +checkpoint recovery. +.TP +.B /state \- faulty +A disk failure kicks off a series of events. First, notify the metadata +handler that a disk has failed, and then notify the kernel that it can unblock +writes that were dependent on this disk. After unblocking the kernel this disk +is set to be removed+ from the member array. Finally the disk is marked failed +in all other member arrays in the container. +.IP ++ Note This behavior differs slightly from native MD arrays where +removal is reserved for a +.B mdadm --remove +event. 
In the external metadata case the container holds the final +reference on a block device and a +.B mdadm --remove +call is still required. +.RE + +.SS Containers: +.P +External metadata formats, like DDF, differ from the native MD metadata +formats in that they define a set of disks and a series of sub-arrays +within those disks. MD metadata in comparison defines a 1:1 +relationship between a set of block devices and a raid array. For +example to create 2 arrays at different raid levels on a single +set of disks, MD metadata requires the disks be partitioned and then +each array can be created with a subset of those partitions. The +supported external formats perform this disk carving internally. +.P +Container devices simply hold references to all member disks and allow +tools like +.I mdmon +to determine which active arrays belong to which +container. Some array management commands like disk removal and disk +add are now only valid at the container level. Attempts to perform +these actions on member arrays are blocked with error messages like: +.IP +"mdadm: Cannot remove disks from a \'member\' array, perform this +operation on the parent container" +.P +Containers are identified in /proc/mdstat with a metadata version string +"external:<metadata name>". Member devices are identified by +"external:/<container device>/<member index>", or "external:-<container +device>/<member index>" if the array is to remain readonly. + +.SH OPTIONS +.TP +CONTAINER +The +.B container +device to monitor. It can be a full path like /dev/md/container, or a +simple md device name like md127. +.TP +.B \-\-takeover +This instructs +.I mdmon +to replace any active +.I mdmon +which is currently monitoring the array. This is primarily used late +in the boot process to replace any +.I mdmon +which was started from an +.B initramfs +before the root filesystem was mounted. This avoids holding a +reference on that +.B initramfs +indefinitely and ensures that the +.I pid +and +.I sock +files used to communicate with +.I mdmon +are in a standard place.
+.TP +.B \-\-all +This tells mdmon to find any active containers and start monitoring +each of them if appropriate. This is normally used with +.B \-\-takeover +late in the boot sequence. +A separate +.I mdmon +process is started for each container as the +.B \-\-all +argument is over-written with the name of the container. To allow for +containers with names longer than 5 characters, this argument can be +arbitrarily extended, e.g. to +.BR \-\-all-active-arrays . + +.PP +Note that +.I mdmon +is automatically started by +.I mdadm +when needed and so does not need to be considered when working with +RAID arrays. The only times it is run other than by +.I mdadm +is when the boot scripts need to restart it after mounting the new +root filesystem. + +.SH START UP AND SHUTDOWN + +As +.I mdmon +needs to be running whenever any filesystem on the monitored device is +mounted there are special considerations when the root filesystem is +mounted from an +.I mdmon +monitored device. +Note that in general +.I mdmon +is needed even if the filesystem is mounted read-only as some +filesystems can still write to the device in those circumstances, for +example to replay a journal after an unclean shutdown. + +When the array is assembled by the +.B initramfs +code, mdadm will automatically start +.I mdmon +as required. This means that +.I mdmon +must be installed on the +.B initramfs +and there must be a writable filesystem (typically tmpfs) in which +.B mdmon +can create a +.B .pid +and +.B .sock +file. The particular filesystem to use is given to mdmon at compile +time and defaults to +.BR /dev/.mdadm . + +This filesystem must persist through to shutdown time. + +After the final root filesystem has been instantiated (usually with +.BR pivot_root ) +.I mdmon +should be run with +.I "\-\-all \-\-takeover" +so that the +.I mdmon +running from the +.B initramfs +can be replaced with one running in the main root, and so the +memory used by the initramfs can be released.
+ +At shutdown time, +.I mdmon +should not be killed along with other processes. Also as it holds a +file (socket actually) open in +.B /dev +(by default) it will not be possible to unmount +.B /dev +if it is a separate filesystem. + +.SH EXAMPLES + +.B " mdmon \-\-all-active-arrays \-\-takeover" +.br +Any +.I mdmon +which is currently running is killed and a new instance is started. +This should be run during the boot sequence if an initramfs was +used, so that any mdmon running from the initramfs will not hold +the initramfs active. +.SH SEE ALSO +.IR mdadm (8), +.IR md (4). diff -Nru mdadm-2.6.7.1/mdmon.c mdadm-3.1.4/mdmon.c --- mdadm-2.6.7.1/mdmon.c 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/mdmon.c 2010-08-26 05:24:15.000000000 +0300 @@ -0,0 +1,520 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * md array manager. + * When md arrays have user-space managed metadata, this is the program + * that does the managing. + * + * Given one argument: the name of the array (e.g. /dev/md0) that is + * the container. + * We fork off a helper that runs high priority and mlocked. It responds to + * device failures and other events that might stop writeout, or that are + * trivial to deal with.
+ * The main thread then watches for new arrays being created in the container + * and starts monitoring them too ... along with a few other tasks. + * + * The main thread communicates with the priority thread by writing over + * a pipe. + * Separate programs can communicate with the main thread via Unix-domain + * socket. + * The two threads share address space and open file table. + * + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef USE_PTHREADS +#include +#else +#include +#endif + +#include "mdadm.h" +#include "mdmon.h" + +struct active_array *discard_this; +struct active_array *pending_discard; + +int mon_tid, mgr_tid; + +int sigterm; + +#ifdef USE_PTHREADS +static void *run_child(void *v) +{ + struct supertype *c = v; + + mon_tid = syscall(SYS_gettid); + do_monitor(c); + return 0; +} + +static int clone_monitor(struct supertype *container) +{ + pthread_attr_t attr; + pthread_t thread; + int rc; + + mon_tid = -1; + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 4096); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + rc = pthread_create(&thread, &attr, run_child, container); + if (rc) + return rc; + while (mon_tid == -1) + usleep(10); + pthread_attr_destroy(&attr); + + mgr_tid = syscall(SYS_gettid); + + return mon_tid; +} +#else /* USE_PTHREADS */ +static int run_child(void *v) +{ + struct supertype *c = v; + + do_monitor(c); + return 0; +} + +#ifdef __ia64__ +int __clone2(int (*fn)(void *), + void *child_stack_base, size_t stack_size, + int flags, void *arg, ... 
+ /* pid_t *pid, struct user_desc *tls, pid_t *ctid */ ); +#endif +static int clone_monitor(struct supertype *container) +{ + static char stack[4096]; + +#ifdef __ia64__ + mon_tid = __clone2(run_child, stack, sizeof(stack), + CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD, + container); +#else + mon_tid = clone(run_child, stack+4096-64, + CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD, + container); +#endif + + mgr_tid = syscall(SYS_gettid); + + return mon_tid; +} +#endif /* USE_PTHREADS */ + +static int make_pidfile(char *devname) +{ + char path[100]; + char pid[10]; + int fd; + int n; + + if (mkdir(MDMON_DIR, 0755) < 0 && + errno != EEXIST) + return -errno; + sprintf(path, "%s/%s.pid", MDMON_DIR, devname); + + fd = open(path, O_RDWR|O_CREAT|O_EXCL, 0600); + if (fd < 0) + return -errno; + sprintf(pid, "%d\n", getpid()); + n = write(fd, pid, strlen(pid)); + close(fd); + if (n < 0) + return -errno; + return 0; +} + +static void try_kill_monitor(pid_t pid, char *devname, int sock) +{ + char buf[100]; + int fd; + int n; + long fl; + + /* first rule of survival... 
don't off yourself */ + if (pid == getpid()) + return; + + /* kill this process if it is mdmon */ + sprintf(buf, "/proc/%lu/cmdline", (unsigned long) pid); + fd = open(buf, O_RDONLY); + if (fd < 0) + return; + + n = read(fd, buf, sizeof(buf)-1); + buf[sizeof(buf)-1] = 0; + close(fd); + + if (n < 0 || !strstr(buf, "mdmon")) + return; + + kill(pid, SIGTERM); + + /* Wait for monitor to exit by reading from the socket, after + * clearing the non-blocking flag */ + fl = fcntl(sock, F_GETFL, 0); + fl &= ~O_NONBLOCK; + fcntl(sock, F_SETFL, fl); + n = read(sock, buf, 100); + /* Ignore result, it is just the wait that + * matters + */ +} + +void remove_pidfile(char *devname) +{ + char buf[100]; + + sprintf(buf, "%s/%s.pid", MDMON_DIR, devname); + unlink(buf); + sprintf(buf, "%s/%s.sock", MDMON_DIR, devname); + unlink(buf); +} + +static int make_control_sock(char *devname) +{ + char path[100]; + int sfd; + long fl; + struct sockaddr_un addr; + + if (sigterm) + return -1; + + sprintf(path, "%s/%s.sock", MDMON_DIR, devname); + unlink(path); + sfd = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sfd < 0) + return -1; + + addr.sun_family = PF_LOCAL; + strcpy(addr.sun_path, path); + if (bind(sfd, &addr, sizeof(addr)) < 0) { + close(sfd); + return -1; + } + listen(sfd, 10); + fl = fcntl(sfd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(sfd, F_SETFL, fl); + return sfd; +} + +static void term(int sig) +{ + sigterm = 1; +} + +static void wake_me(int sig) +{ + +} + +/* if we are debugging and starting mdmon by hand then don't fork */ +static int do_fork(void) +{ + #ifdef DEBUG + if (check_env("MDADM_NO_MDMON")) + return 0; + #endif + + return 1; +} + +void usage(void) +{ + fprintf(stderr, "Usage: mdmon [--all] [--takeover] CONTAINER\n"); + exit(2); +} + +static int mdmon(char *devname, int devnum, int must_fork, int takeover); + +int main(int argc, char *argv[]) +{ + char *container_name = NULL; + int devnum; + char *devname; + int status = 0; + int arg; + int all = 0; + int takeover = 0; + + 
for (arg = 1; arg < argc; arg++) { + if (strncmp(argv[arg], "--all",5) == 0 || + strcmp(argv[arg], "/proc/mdstat") == 0) { + container_name = argv[arg]; + all = 1; + } else if (strcmp(argv[arg], "--takeover") == 0) + takeover = 1; + else if (container_name == NULL) + container_name = argv[arg]; + else + usage(); + } + if (container_name == NULL) + usage(); + + if (all) { + struct mdstat_ent *mdstat, *e; + int container_len = strlen(container_name); + + /* launch an mdmon instance for each container found */ + mdstat = mdstat_read(0, 0); + for (e = mdstat; e; e = e->next) { + if (strncmp(e->metadata_version, "external:", 9) == 0 && + !is_subarray(&e->metadata_version[9])) { + devname = devnum2devname(e->devnum); + /* update cmdline so this mdmon instance can be + * distinguished from others in a call to ps(1) + */ + if (strlen(devname) <= (unsigned)container_len) { + memset(container_name, 0, container_len); + sprintf(container_name, "%s", devname); + } + status |= mdmon(devname, e->devnum, 1, + takeover); + } + } + free_mdstat(mdstat); + + return status; + } else if (strncmp(container_name, "md", 2) == 0) { + devnum = devname2devnum(container_name); + devname = devnum2devname(devnum); + if (strcmp(container_name, devname) != 0) + devname = NULL; + } else { + struct stat st; + + devnum = NoMdDev; + if (stat(container_name, &st) == 0) + devnum = stat2devnum(&st); + if (devnum == NoMdDev) + devname = NULL; + else + devname = devnum2devname(devnum); + } + + if (!devname) { + fprintf(stderr, "mdmon: %s is not a valid md device name\n", + container_name); + exit(1); + } + return mdmon(devname, devnum, do_fork(), takeover); +} + +static int mdmon(char *devname, int devnum, int must_fork, int takeover) +{ + int mdfd; + struct mdinfo *mdi, *di; + struct supertype *container; + sigset_t set; + struct sigaction act; + int pfd[2]; + int status; + int ignore; + pid_t victim = -1; + int victim_sock = -1; + + dprintf("starting mdmon for %s\n", devname); + + mdfd = 
open_dev(devnum); + if (mdfd < 0) { + fprintf(stderr, "mdmon: %s: %s\n", devname, + strerror(errno)); + return 1; + } + if (md_get_version(mdfd) < 0) { + fprintf(stderr, "mdmon: %s: Not an md device\n", + devname); + return 1; + } + + /* Fork, and have the child tell us when they are ready */ + if (must_fork) { + if (pipe(pfd) != 0) { + fprintf(stderr, "mdmon: failed to create pipe\n"); + return 1; + } + switch(fork()) { + case -1: + fprintf(stderr, "mdmon: failed to fork: %s\n", + strerror(errno)); + return 1; + case 0: /* child */ + close(pfd[0]); + break; + default: /* parent */ + close(pfd[1]); + if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) { + wait(&status); + status = WEXITSTATUS(status); + } + return status; + } + } else + pfd[0] = pfd[1] = -1; + + container = calloc(1, sizeof(*container)); + container->devnum = devnum; + container->devname = devname; + container->arrays = NULL; + container->subarray[0] = 0; + container->sock = -1; + + if (!container->devname) { + fprintf(stderr, "mdmon: failed to allocate container name string\n"); + exit(3); + } + + mdi = sysfs_read(mdfd, container->devnum, GET_VERSION|GET_LEVEL|GET_DEVS); + + if (!mdi) { + fprintf(stderr, "mdmon: failed to load sysfs info for %s\n", + container->devname); + exit(3); + } + if (mdi->array.level != UnSet) { + fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n", + devname); + exit(3); + } + if (mdi->array.major_version != -1 || + mdi->array.minor_version != -2) { + fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n", + devname); + exit(3); + } + + container->ss = version_to_superswitch(mdi->text_version); + if (container->ss == NULL) { + fprintf(stderr, "mdmon: %s uses unsupported metadata: %s\n", + devname, mdi->text_version); + exit(3); + } + + container->devs = NULL; + for (di = mdi->devs; di; di = di->next) { + struct mdinfo *cd = malloc(sizeof(*cd)); + *cd = *di; + cd->next = container->devs; + container->devs = cd; + } + 
sysfs_free(mdi); + + /* SIGUSR is sent between parent and child. So both block it + * and enable it only with pselect. + */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + sigaddset(&set, SIGTERM); + sigprocmask(SIG_BLOCK, &set, NULL); + act.sa_handler = wake_me; + act.sa_flags = 0; + sigaction(SIGUSR1, &act, NULL); + act.sa_handler = term; + sigaction(SIGTERM, &act, NULL); + act.sa_handler = SIG_IGN; + sigaction(SIGPIPE, &act, NULL); + + victim = mdmon_pid(container->devnum); + if (victim >= 0) + victim_sock = connect_monitor(container->devname); + + ignore = chdir("/"); + if (!takeover && victim > 0 && victim_sock >= 0) { + if (fping_monitor(victim_sock) == 0) { + fprintf(stderr, "mdmon: %s already managed\n", + container->devname); + exit(3); + } + close(victim_sock); + } + if (container->ss->load_super(container, mdfd, devname)) { + fprintf(stderr, "mdmon: Cannot load metadata for %s\n", + devname); + exit(3); + } + close(mdfd); + + /* Ok, this is close enough. We can say goodbye to our parent now. 
+ */ + if (victim > 0) + remove_pidfile(devname); + if (make_pidfile(devname) < 0) { + exit(3); + } + container->sock = make_control_sock(devname); + + status = 0; + if (write(pfd[1], &status, sizeof(status)) < 0) + fprintf(stderr, "mdmon: failed to notify our parent: %d\n", + getppid()); + close(pfd[1]); + + mlockall(MCL_CURRENT | MCL_FUTURE); + + if (clone_monitor(container) < 0) { + fprintf(stderr, "mdmon: failed to start monitor process: %s\n", + strerror(errno)); + exit(2); + } + + if (victim > 0) { + try_kill_monitor(victim, container->devname, victim_sock); + close(victim_sock); + } + + setsid(); + close(0); + open("/dev/null", O_RDWR); + close(1); + ignore = dup(0); +#ifndef DEBUG + close(2); + ignore = dup(0); +#endif + + do_manager(container); + + exit(0); +} diff -Nru mdadm-2.6.7.1/mdmon.h mdadm-3.1.4/mdmon.h --- mdadm-2.6.7.1/mdmon.h 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/mdmon.h 2010-08-26 05:24:15.000000000 +0300 @@ -0,0 +1,97 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +enum array_state { clear, inactive, suspended, readonly, read_auto, + clean, active, write_pending, active_idle, bad_word}; + +enum sync_action { idle, reshape, resync, recover, check, repair, bad_action }; + + +struct active_array { + struct mdinfo info; + struct supertype *container; + struct active_array *next, *replaces; + + int action_fd; + int resync_start_fd; + int metadata_fd; /* for monitoring rw/ro status */ + int sync_completed_fd; /* for checkpoint notification events */ + unsigned long long last_checkpoint; /* sync_completed fires for many + * reasons this field makes sure the + * kernel has made progress before + * moving the checkpoint. It is + * cleared by the metadata handler + * when it determines recovery is + * terminated. + */ + + enum array_state prev_state, curr_state, next_state; + enum sync_action prev_action, curr_action, next_action; + + int check_degraded; /* flag set by mon, read by manage */ + + int devnum; +}; + +/* + * Metadata updates are handled by the monitor thread, + * as it has exclusive access to the metadata. + * When the manager wants to update metadata, either + * for its own reason (e.g. committing a spare) or + * on behalf of mdadm, it creates a metadata_update + * structure and queues it to the monitor. + * Updates are created and processed by code under the + * superswitch. All common code sees them as opaque + * blobs.
+ */ +extern struct metadata_update *update_queue, *update_queue_handled; + +#define MD_MAJOR 9 + +extern struct active_array *container; +extern struct active_array *discard_this; +extern struct active_array *pending_discard; +extern struct md_generic_cmd *active_cmd; + + +void remove_pidfile(char *devname); +void do_monitor(struct supertype *container); +void do_manager(struct supertype *container); +extern int sigterm; + +int read_dev_state(int fd); +int is_container_member(struct mdstat_ent *mdstat, char *container); + +struct mdstat_ent *mdstat_read(int hold, int start); + +extern int exit_now, manager_ready; +extern int mon_tid, mgr_tid; +extern int monitor_loop_cnt; + +/* helper routine to determine resync completion since MaxSector is a + * moving target + */ +static inline int is_resync_complete(struct mdinfo *array) +{ + if (array->resync_start >= array->component_size) + return 1; + return 0; +} + diff -Nru mdadm-2.6.7.1/mdopen.c mdadm-3.1.4/mdopen.c --- mdadm-2.6.7.1/mdopen.c 2008-10-15 06:34:28.000000000 +0300 +++ mdadm-3.1.4/mdopen.c 2010-08-05 09:51:58.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,12 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" @@ -32,45 +27,44 @@ #include -void make_dev_symlink(char *dev) -{ - char *new = strdup(dev); - - if (!new) return; - /* /dev/md/0 -> /dev/md0 - * /dev/md/d0 -> /dev/md_d0 - */ - if (isdigit(new[8])) - strcpy(new+7, new+8); - else - new[7] = '_'; - if (symlink(dev+5, new)) - perror(new); -} - - -void make_parts(char *dev, int cnt, int symlinks) +void make_parts(char *dev, int cnt) { /* make 'cnt' partition devices for 'dev' - * We use the major/minor from dev and add 1..cnt + * If dev is a device name we use the + * major/minor from dev and add 1..cnt + * If it is a symlink, we make similar symlinks. 
* If dev ends with a digit, we add "p%d" else "%d" * If the name exists, we use it's owner/mode, * else that of dev */ struct stat stb; - int major_num, minor_num; + int major_num = major_num; /* quiet gcc -Os unitialized warning */ + int minor_num = minor_num; /* quiet gcc -Os unitialized warning */ + int odig = odig; /* quiet gcc -Os unitialized warning */ int i; int nlen = strlen(dev) + 20; - char *name = malloc(nlen); + char *name; int dig = isdigit(dev[strlen(dev)-1]); + char orig[1024]; + char sym[1024]; + int err; if (cnt==0) cnt=4; - if (stat(dev, &stb)!= 0) + if (lstat(dev, &stb)!= 0) return; - if (!S_ISBLK(stb.st_mode)) - return; - major_num = major(stb.st_rdev); - minor_num = minor(stb.st_rdev); + + if (S_ISLNK(stb.st_mode)) { + int len = readlink(dev, orig, sizeof(orig)); + if (len < 0 || len > 1000) + return; + orig[len] = 0; + odig = isdigit(orig[len-1]); + } else if (S_ISBLK(stb.st_mode)) { + major_num = major(stb.st_rdev); + minor_num = minor(stb.st_rdev); + } else + return; + name = malloc(nlen); for (i=1; i <= cnt ; i++) { struct stat stb2; snprintf(name, nlen, "%s%s%d", dev, dig?"p":"", i); @@ -83,39 +77,76 @@ } else { stb2 = stb; } - if (mknod(name, S_IFBLK | 0600, makedev(major_num, minor_num+i))) - perror("mknod"); - if (chown(name, stb2.st_uid, stb2.st_gid)) - perror("chown"); - if (chmod(name, stb2.st_mode & 07777)) - perror("chmod"); - if (symlinks && strncmp(name, "/dev/md/", 8) == 0) - make_dev_symlink(name); - stat(name, &stb2); - add_dev(name, &stb2, 0, NULL); + if (S_ISBLK(stb.st_mode)) { + if (mknod(name, S_IFBLK | 0600, + makedev(major_num, minor_num+i))) + perror("mknod"); + if (chown(name, stb2.st_uid, stb2.st_gid)) + perror("chown"); + if (chmod(name, stb2.st_mode & 07777)) + perror("chmod"); + err = 0; + } else { + snprintf(sym, sizeof(sym), "%s%s%d", orig, odig?"p":"", i); + err = symlink(sym, name); + } + + if (err == 0 && stat(name, &stb2) == 0) + add_dev(name, &stb2, 0, NULL); } + free(name); } /* - * Open a given md device, 
and check that it really is one. - * If 'autof' is given, then we need to create, or recreate, the md device. - * If the name already exists, and is not a block device, we fail. - * If it exists and is not an md device, is not the right type (partitioned or not), - * or is currently in-use, we remove the device, but remember the owner and mode. - * If it now doesn't exist, we find a new md array and create the device. - * Default ownership/mode comes from config file. + * We need a new md device to assemble/build/create an array. + * 'dev' is a name given us by the user (command line or mdadm.conf) + * It might start with /dev or /dev/md any might end with a digit + * string. + * If it starts with just /dev, it must be /dev/mdX or /dev/md_dX + * If it ends with a digit string, then it must be as above, or + * 'trustworthy' must be 'METADATA' and the 'dev' must be + * /dev/md/'name'NN or 'name'NN + * If it doesn't end with a digit string, it must be /dev/md/'name' + * or 'name' or must be NULL. + * If the digit string is present, it gives the minor number to use + * If not, we choose a high, unused minor number. + * If the 'dev' is a standard name, it devices whether 'md' or 'mdp'. + * else if the name is 'd[0-9]+' then we use mdp + * else if trustworthy is 'METADATA' we use md + * else the choice depends on 'autof'. + * If name is NULL it is assumed to match whatever dev provides. + * If both name and dev are NULL, we choose a name 'mdXX' or 'mdpXX' + * + * If 'name' is given, and 'trustworthy' is 'foreign' and name is not + * supported by 'dev', we add a "_%d" suffix based on the minor number + * use that. + * + * If udev is configured, we create a temporary device, open it, and + * unlink it. + * If not, we create the /dev/mdXX device, and is name is usable, + * /dev/md/name + * In any case we return /dev/md/name or (if that isn't available) + * /dev/mdXX in 'chosen'. + * + * When we create devices, we use uid/gid/umask from config file. 
*/ -int open_mddev(char *dev, int autof) + +int create_mddev(char *dev, char *name, int autof, int trustworthy, + char *chosen) { int mdfd; struct stat stb; - int major_num = MD_MAJOR; - int minor_num = 0; - int must_remove = 0; - int num; + int num = -1; + int use_mdp = -1; struct createinfo *ci = conf_get_create_info(); int parts; + char *cname; + char devname[20]; + char cbuf[400]; + if (chosen == NULL) + chosen = cbuf; + if (autof == 0) autof = ci->autof; @@ -123,226 +154,254 @@ parts = autof >> 3; autof &= 7; - if (autof && autof != 1) { - /* autof is set, so we need to check that the name is ok, - * and possibly create one if not - */ - int std; - stb.st_mode = 0; - if (stat(dev, &stb)==0 && ! S_ISBLK(stb.st_mode)) { - fprintf(stderr, Name ": %s is not a block device.\n", - dev); - return -1; - } - /* check major number is correct */ - num = -1; - std = is_standard(dev, &num); - if (std>0) major_num = get_mdp_major(); - switch(autof) { - case 2: /* only create is_standard names */ - if (!std && !stb.st_mode) { - fprintf(stderr, Name - ": %s does not exist and is not a 'standard' name " - "so it cannot be created\n", dev); - return -1; - } - break; - case 3: /* create md, reject std>0 */ - if (std > 0) { - fprintf(stderr, Name ": that --auto option " - "not compatable with device named %s\n", dev); - return -1; - } - break; - case 4: /* create mdp, reject std<0 */ - if (std < 0) { - fprintf(stderr, Name ": that --auto option " - "not compatable with device named %s\n", dev); - return -1; - } - major_num = get_mdp_major(); - break; - case 5: /* default to md if not standard */ - break; - case 6: /* default to mdp if not standard */ - if (std == 0) major_num = get_mdp_major(); - break; - } - /* major is final. 
num is -1 if not standard */ - if (stb.st_mode && major(stb.st_rdev) != major_num) - must_remove = 1; - if (stb.st_mode && !must_remove) { - /* looks ok, see if it is available */ - mdfd = open(dev, O_RDWR, 0); - if (mdfd < 0) { - fprintf(stderr, Name ": error opening %s: %s\n", - dev, strerror(errno)); - return -1; - } else if (md_get_version(mdfd) <= 0) { - fprintf(stderr, Name ": %s does not appear to be an md device\n", - dev); - close(mdfd); - return -1; - } - if (major_num != MD_MAJOR && parts > 0) - make_parts(dev, parts, ci->symlinks); - return mdfd; - } - /* Ok, need to find a minor that is not in use. - * If the device name is in a 'standard' format, - * intuit the minor from that, else - * easiest to read /proc/mdstat, and hunt through for - * an unused number - */ - if (num < 0) { - /* need to pick an unused number */ - int num = find_free_devnum(major_num != MD_MAJOR); + strcpy(chosen, "/dev/md/"); + cname = chosen + strlen(chosen); - if (major_num == MD_MAJOR) - minor_num = num; - else - minor_num = (-1-num) << MdpMinorShift; - } else if (major_num == MD_MAJOR) - minor_num = num; - else - minor_num = num << MdpMinorShift; - /* major and minor have been chosen */ - /* If it was a 'standard' name and it is in-use, then - * the device could already be correct - */ - if (stb.st_mode && major(stb.st_rdev) == major_num && - minor(stb.st_rdev) == minor_num) - ; - else { - if (major(makedev(major_num,minor_num)) != major_num || - minor(makedev(major_num,minor_num)) != minor_num) { - fprintf(stderr, Name ": Need newer C library to use more than 4 partitionable md devices, sorry\n"); + if (dev) { + if (strncmp(dev, "/dev/md/", 8) == 0) { + strcpy(cname, dev+8); + } else if (strncmp(dev, "/dev/", 5) == 0) { + char *e = dev + strlen(dev); + while (e > dev && isdigit(e[-1])) + e--; + if (e[0]) + num = strtoul(e, NULL, 10); + strcpy(cname, dev+5); + cname[e-(dev+5)] = 0; + /* name *must* be mdXX or md_dXX in this context */ + if (num < 0 || + (strcmp(cname, "md") 
!= 0 && strcmp(cname, "md_d") != 0)) { + fprintf(stderr, Name ": %s is an invalid name " + "for an md device. Try /dev/md/%s\n", + dev, dev+5); return -1; } - if (must_remove) - unlink(dev); + if (strcmp(cname, "md") == 0) + use_mdp = 0; + else + use_mdp = 1; + /* recreate name: /dev/md/0 or /dev/md/d0 */ + sprintf(cname, "%s%d", use_mdp?"d":"", num); + } else + strcpy(cname, dev); - if (strncmp(dev, "/dev/md/", 8) == 0) { - if (mkdir("/dev/md",0700)==0) { - if (chown("/dev/md", ci->uid, ci->gid)) - perror("chown /dev/md"); - if (chmod("/dev/md", ci->mode| ((ci->mode>>2) & 0111))) - perror("chmod /dev/md"); - } - } - if (mknod(dev, S_IFBLK|0600, makedev(major_num, minor_num))!= 0) { - fprintf(stderr, Name ": failed to create %s\n", dev); - return -1; - } - if (must_remove) { - if (chown(dev, stb.st_uid, stb.st_gid)) - perror("chown"); - if (chmod(dev, stb.st_mode & 07777)) - perror("chmod"); - } else { - if (chown(dev, ci->uid, ci->gid)) - perror("chown"); - if (chmod(dev, ci->mode)) - perror("chmod"); - } - stat(dev, &stb); - add_dev(dev, &stb, 0, NULL); - if (ci->symlinks && strncmp(dev, "/dev/md/", 8) == 0) - make_dev_symlink(dev); - if (major_num != MD_MAJOR) - make_parts(dev,parts, ci->symlinks); + /* 'cname' must not contain a slash, and may not be + * empty. + */ + if (strchr(cname, '/') != NULL) { + fprintf(stderr, Name ": %s is an invalid name " + "for an md device.\n", dev); + return -1; + } + if (cname[0] == 0) { + fprintf(stderr, Name ": %s is an invalid name " + "for an md device (empty!).", dev); + return -1; + } + if (num < 0) { + /* If cname is 'N' or 'dN', we get dev number + * from there. 
+ */ + char *sp = cname; + char *ep; + if (cname[0] == 'd') + sp++; + num = strtoul(sp, &ep, 10); + if (ep == sp || *ep || num < 0) + num = -1; + else if (cname[0] == 'd') + use_mdp = 1; + else + use_mdp = 0; } } - mdfd = open(dev, O_RDWR, 0); - if (mdfd < 0) - fprintf(stderr, Name ": error opening %s: %s\n", - dev, strerror(errno)); - else if (md_get_version(mdfd) <= 0) { - fprintf(stderr, Name ": %s does not appear to be an md device\n", - dev); - close(mdfd); - mdfd = -1; - } - return mdfd; -} - -int open_mddev_devnum(char *devname, int devnum, char *name, - char *chosen_name, int parts) -{ - /* Open the md device with number 'devnum', possibly using 'devname', - * possibly constructing a name with 'name', but in any case, copying - * the name into 'chosen_name' - */ - int major_num, minor_num; - struct stat stb; - int i; - struct createinfo *ci = conf_get_create_info(); - - if (devname) - strcpy(chosen_name, devname); - else if (name && strchr(name,'/') == NULL) { - char *n = strchr(name, ':'); - if (n) n++; else n = name; - if (isdigit(*n) && devnum < 0) - sprintf(chosen_name, "/dev/md/d%s", n); - else - sprintf(chosen_name, "/dev/md/%s", n); - } else { - if (devnum >= 0) - sprintf(chosen_name, "/dev/md%d", devnum); + /* Now determine device number */ + /* named 'METADATA' cannot use 'mdp'. */ + if (name && name[0] == 0) + name = NULL; + if (name && trustworthy == METADATA && use_mdp == 1) { + fprintf(stderr, Name ": %s is not allowed for a %s container. 
" + "Consider /dev/md%d.\n", dev, name, num); + return -1; + } + if (name && trustworthy == METADATA) + use_mdp = 0; + if (use_mdp == -1) { + if (autof == 4 || autof == 6) + use_mdp = 1; else - sprintf(chosen_name, "/dev/md/d%d", -1-devnum); + use_mdp = 0; } - if (devnum >= 0) { - major_num = MD_MAJOR; - minor_num = devnum; - } else { - major_num = get_mdp_major(); - minor_num = (-1-devnum) << 6; + if (num < 0 && trustworthy == LOCAL && name) { + /* if name is numeric, possibly prefixed by + * 'md' or '/dev/md', use that for num + * if it is not already in use */ + char *ep; + char *n2 = name; + if (strncmp(n2, "/dev/", 5) == 0) + n2 += 5; + if (strncmp(n2, "md", 2) == 0) + n2 += 2; + if (*n2 == '/') + n2++; + num = strtoul(n2, &ep, 10); + if (ep == n2 || *ep) + num = -1; + else if (mddev_busy(use_mdp ? (-1-num) : num)) + num = -1; } - if (stat(chosen_name, &stb) == 0) { - /* It already exists. Check it is right. */ - if ( ! S_ISBLK(stb.st_mode) || - stb.st_rdev != makedev(major_num, minor_num)) { - errno = EEXIST; + + if (num < 0) { + /* need to choose a free number. */ + num = find_free_devnum(use_mdp); + if (num == NoMdDev) { + fprintf(stderr, Name ": No avail md devices - aborting\n"); return -1; } } else { - /* special case: if --incremental is suggesting a name - * in /dev/md/, we make sure the directory exists. + num = use_mdp ? (-1-num) : num; + if (mddev_busy(num)) { + fprintf(stderr, Name ": %s is already in use.\n", + dev); + return -1; + } + } + + if (num < 0) + sprintf(devname, "/dev/md_d%d", -1-num); + else + sprintf(devname, "/dev/md%d", num); + + if (cname[0] == 0 && name) { + /* Need to find a name if we can + * We don't completely trust 'name'. 
Truncate to + * reasonable length and remove '/' */ - if (strncmp(chosen_name, "/dev/md/", 8) == 0) { + char *cp; + struct map_ent *map = NULL; + int conflict = 1; + int unum = 0; + int cnlen; + strncpy(cname, name, 200); + cname[200] = 0; + while ((cp = strchr(cname, '/')) != NULL) + *cp = '-'; + if (trustworthy == LOCAL || + (trustworthy == FOREIGN && strchr(cname, ':') != NULL)) { + /* Only need suffix if there is a conflict */ + if (map_by_name(&map, cname) == NULL) + conflict = 0; + } + cnlen = strlen(cname); + while (conflict) { + if (trustworthy == METADATA && !isdigit(cname[cnlen-1])) + sprintf(cname+cnlen, "%d", unum); + else + /* add _%d to FOREIGN array that don't + * a 'host:' prefix + */ + sprintf(cname+cnlen, "_%d", unum); + unum++; + if (map_by_name(&map, cname) == NULL) + conflict = 0; + } + } + + if (dev && dev[0] == '/') + strcpy(chosen, dev); + else if (cname[0] == 0) + strcpy(chosen, devname); + + /* We have a device number and name. + * If we cannot detect udev, we need to make + * devices and links ourselves. 
+ */ + if (stat("/dev/.udev", &stb) != 0 || + check_env("MDADM_NO_UDEV")) { + /* Make sure 'devname' exists and 'chosen' is a symlink to it */ + if (lstat(devname, &stb) == 0) { + /* Must be the correct device, else error */ + if ((stb.st_mode&S_IFMT) != S_IFBLK || + stb.st_rdev != makedev(dev2major(num),dev2minor(num))) { + fprintf(stderr, Name ": %s exists but looks wrong, please fix\n", + devname); + return -1; + } + } else { + if (mknod(devname, S_IFBLK|0600, + makedev(dev2major(num),dev2minor(num))) != 0) { + fprintf(stderr, Name ": failed to create %s\n", + devname); + return -1; + } + if (chown(devname, ci->uid, ci->gid)) + perror("chown"); + if (chmod(devname, ci->mode)) + perror("chmod"); + stat(devname, &stb); + add_dev(devname, &stb, 0, NULL); + } + if (use_mdp == 1) + make_parts(devname, parts); + if (strcmp(chosen, devname) != 0) { + if (mkdir("/dev/md",0700)==0) { if (chown("/dev/md", ci->uid, ci->gid)) perror("chown /dev/md"); - if (chmod("/dev/md", ci->mode| - ((ci->mode>>2) & 0111))) + if (chmod("/dev/md", ci->mode| ((ci->mode>>2) & 0111))) perror("chmod /dev/md"); } - } - if (mknod(chosen_name, S_IFBLK | 0600, - makedev(major_num, minor_num)) != 0) { - return -1; + if (dev && strcmp(chosen, dev) == 0) + /* We know we are allowed to use this name */ + unlink(chosen); + + if (lstat(chosen, &stb) == 0) { + char buf[300]; + if ((stb.st_mode & S_IFMT) != S_IFLNK || + readlink(chosen, buf, 300) <0 || + strcmp(buf, devname) != 0) { + fprintf(stderr, Name ": %s exists - ignoring\n", + chosen); + strcpy(chosen, devname); + } + } else if (symlink(devname, chosen) != 0) + fprintf(stderr, Name ": failed to create %s: %s\n", + chosen, strerror(errno)); + if (use_mdp && strcmp(chosen, devname) != 0) + make_parts(chosen, parts); } - /* FIXME chown/chmod ?? 
*/ } + mdfd = open_dev_excl(num); + if (mdfd < 0) + fprintf(stderr, Name ": unexpected failure opening %s\n", + devname); + return mdfd; +} - /* Simple locking to avoid --incr being called for the same - * array multiple times in parallel. - */ - for (i = 0; i < 25 ; i++) { - int fd; - fd = open(chosen_name, O_RDWR|O_EXCL); - if (fd >= 0 || errno != EBUSY) { - if (devnum < 0) - make_parts(chosen_name, parts, ci->symlinks); - return fd; - } - usleep(200000); +/* Open this and check that it is an md device. + * On success, return filedescriptor. + * On failure, return -1 if it doesn't exist, + * or -2 if it exists but is not an md device. + */ +int open_mddev(char *dev, int report_errors) +{ + int mdfd = open(dev, O_RDWR); + if (mdfd < 0) { + if (report_errors) + fprintf(stderr, Name ": error opening %s: %s\n", + dev, strerror(errno)); + return -1; } - return -1; + if (md_get_version(mdfd) <= 0) { + close(mdfd); + if (report_errors) + fprintf(stderr, Name ": %s does not appear to be " + "an md device\n", dev); + return -2; + } + return mdfd; } diff -Nru mdadm-2.6.7.1/md_p.h mdadm-3.1.4/md_p.h --- mdadm-2.6.7.1/md_p.h 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/md_p.h 2010-08-26 05:24:15.000000000 +0300 @@ -75,7 +75,7 @@ * Device "operational" state bits */ #define MD_DISK_FAULTY 0 /* disk is faulty / operational */ -#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ +#define MD_DISK_ACTIVE 1 /* disk is running but may not be in sync */ #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ diff -Nru mdadm-2.6.7.1/mdstat.c mdadm-3.1.4/mdstat.c --- mdadm-2.6.7.1/mdstat.c 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/mdstat.c 2010-08-26 05:24:15.000000000 +0300 @@ -2,7 +2,7 @@ * mdstat - parse /proc/mdstat file. Part of: * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2002-2006 Neil Brown + * Copyright (C) 2002-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -20,12 +20,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ /* @@ -86,14 +81,44 @@ #include "mdadm.h" #include "dlink.h" #include +#include + +static void free_member_devnames(struct dev_member *m) +{ + while(m) { + struct dev_member *t = m; + + m = m->next; + free(t->name); + free(t); + } +} + +static int add_member_devname(struct dev_member **m, char *name) +{ + struct dev_member *new; + char *t; + + if ((t = strchr(name, '[')) == NULL) + /* not a device */ + return 0; + + new = malloc(sizeof(*new)); + new->name = strndup(name, t - name); + new->next = *m; + *m = new; + return 1; +} void free_mdstat(struct mdstat_ent *ms) { while (ms) { struct mdstat_ent *t; - if (ms->dev) free(ms->dev); - if (ms->level) free(ms->level); - if (ms->pattern) free(ms->pattern); + free(ms->dev); + free(ms->level); + free(ms->pattern); + free(ms->metadata_version); + free_member_devnames(ms->members); t = ms; ms = ms->next; free(t); @@ -158,6 +183,11 @@ ent->percent = -1; ent->active = -1; ent->resync = 0; + ent->metadata_version = NULL; + ent->raid_disks = 0; + ent->chunk_size = 0; + ent->devcnt = 0; + ent->members = NULL; ent->dev = strdup(line); ent->devnum = devnum; @@ -167,31 +197,39 @@ char *eq; if (strcmp(w, "active")==0) ent->active = 1; - else if (strcmp(w, "inactive")==0) + else if (strcmp(w, "inactive")==0) { ent->active = 0; - else if (ent->active >=0 && + in_devs = 1; + } else if (ent->active > 0 && ent->level == NULL && w[0] != '(' /*readonly*/) { ent->level = strdup(w); in_devs = 1; } else if (in_devs && strcmp(w, "blocks")==0) in_devs = 0; - else if (in_devs && strncmp(w, "md", 2)==0) { - /* This has 
an md device as a component. - * If that device is already in the list, - * make sure we insert before there. - */ - struct mdstat_ent **ih; - int dn2; - if (strncmp(w, "md_d", 4)==0) - dn2 = -1-strtoul(w+4, &ep, 10); - else - dn2 = strtoul(w+2, &ep, 10); - ih = &all; - while (ih != insert_here && *ih && - (*ih)->devnum != dn2) - ih = & (*ih)->next; - insert_here = ih; + else if (in_devs) { + ent->devcnt += + add_member_devname(&ent->members, w); + if (strncmp(w, "md", 2)==0) { + /* This has an md device as a component. + * If that device is already in the + * list, make sure we insert before + * there. + */ + struct mdstat_ent **ih; + int dn2 = devname2devnum(w); + ih = &all; + while (ih != insert_here && *ih && + (*ih)->devnum != dn2) + ih = & (*ih)->next; + insert_here = ih; + } + } else if (strcmp(w, "super") == 0 && + dl_next(w) != line) { + w = dl_next(w); + ent->metadata_version = strdup(w); + } else if (w[0] == '[' && isdigit(w[1])) { + ent->raid_disks = atoi(w+1); } else if (!ent->pattern && w[0] == '[' && (w[1] == 'U' || w[1] == '_')) { @@ -248,12 +286,48 @@ { fd_set fds; struct timeval tm; + int maxfd = 0; FD_ZERO(&fds); - if (mdstat_fd >= 0) + if (mdstat_fd >= 0) { FD_SET(mdstat_fd, &fds); + maxfd = mdstat_fd; + } tm.tv_sec = seconds; tm.tv_usec = 0; - select(mdstat_fd >2 ? mdstat_fd+1:3, NULL, NULL, &fds, &tm); + select(maxfd + 1, NULL, NULL, &fds, &tm); +} + +void mdstat_wait_fd(int fd, const sigset_t *sigmask) +{ + fd_set fds, rfds; + int maxfd = 0; + + FD_ZERO(&fds); + FD_ZERO(&rfds); + if (mdstat_fd >= 0) + FD_SET(mdstat_fd, &fds); + + if (fd >= 0) { + struct stat stb; + fstat(fd, &stb); + if ((stb.st_mode & S_IFMT) == S_IFREG) + /* Must be a /proc or /sys fd, so expect + * POLLPRI + * i.e. an 'exceptional' event. 
+ */ + FD_SET(fd, &fds); + else + FD_SET(fd, &rfds); + + if (fd > maxfd) + maxfd = fd; + + } + if (mdstat_fd > maxfd) + maxfd = mdstat_fd; + + pselect(maxfd + 1, &rfds, NULL, &fds, + NULL, sigmask); } int mddev_busy(int devnum) @@ -267,3 +341,30 @@ free_mdstat(mdstat); return me != NULL; } + +struct mdstat_ent *mdstat_by_component(char *name) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + + while (mdstat) { + struct dev_member *m; + struct mdstat_ent *ent; + if (mdstat->metadata_version && + strncmp(mdstat->metadata_version, "external:", 9) == 0 && + is_subarray(mdstat->metadata_version+9)) + /* don't return subarrays, only containers */ + ; + else for (m = mdstat->members; m; m = m->next) { + if (strcmp(m->name, name) == 0) { + free_mdstat(mdstat->next); + mdstat->next = NULL; + return mdstat; + } + } + ent = mdstat; + mdstat = mdstat->next; + ent->next = NULL; + free_mdstat(ent); + } + return NULL; +} diff -Nru mdadm-2.6.7.1/md.txt mdadm-3.1.4/md.txt --- mdadm-2.6.7.1/md.txt 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/md.txt 1970-01-01 02:00:00.000000000 +0200 @@ -1,357 +0,0 @@ -Tools that manage md devices can be found at - http://www..kernel.org/pub/linux/utils/raid/.... - - -Boot time assembly of RAID arrays ---------------------------------- - -You can boot with your md device with the following kernel command -lines: - -for old raid arrays without persistent superblocks: - md=,,,,dev0,dev1,...,devn - -for raid arrays with persistent superblocks - md=,dev0,dev1,...,devn -or, to assemble a partitionable array: - md=d,dev0,dev1,...,devn - -md device no. = the number of the md device ... - 0 means md0, - 1 md1, - 2 md2, - 3 md3, - 4 md4 - -raid level = -1 linear mode - 0 striped mode - other modes are only supported with persistent super blocks - -chunk size factor = (raid-0 and raid-1 only) - Set the chunk size as 4k << n. - -fault level = totally ignored - -dev0-devn: e.g. 
/dev/hda1,/dev/hdc1,/dev/sda1,/dev/sdb1 - -A possible loadlin line (Harald Hoyer ) looks like this: - -e:\loadlin\loadlin e:\zimage root=/dev/md0 md=0,0,4,0,/dev/hdb2,/dev/hdc3 ro - - -Boot time autodetection of RAID arrays --------------------------------------- - -When md is compiled into the kernel (not as module), partitions of -type 0xfd are scanned and automatically assembled into RAID arrays. -This autodetection may be suppressed with the kernel parameter -"raid=noautodetect". As of kernel 2.6.9, only drives with a type 0 -superblock can be autodetected and run at boot time. - -The kernel parameter "raid=partitionable" (or "raid=part") means -that all auto-detected arrays are assembled as partitionable. - -Boot time assembly of degraded/dirty arrays -------------------------------------------- - -If a raid5 or raid6 array is both dirty and degraded, it could have -undetectable data corruption. This is because the fact that it is -'dirty' means that the parity cannot be trusted, and the fact that it -is degraded means that some datablocks are missing and cannot reliably -be reconstructed (due to no parity). - -For this reason, md will normally refuse to start such an array. This -requires the sysadmin to take action to explicitly start the array -desipite possible corruption. This is normally done with - mdadm --assemble --force .... - -This option is not really available if the array has the root -filesystem on it. In order to support this booting from such an -array, md supports a module parameter "start_dirty_degraded" which, -when set to 1, bypassed the checks and will allows dirty degraded -arrays to be started. - -So, to boot with a root filesystem of a dirty degraded raid[56], use - - md-mod.start_dirty_degraded=1 - - -Superblock formats ------------------- - -The md driver can support a variety of different superblock formats. -Currently, it supports superblock formats "0.90.0" and the "md-1" format -introduced in the 2.5 development series. 
- -The kernel will autodetect which format superblock is being used. - -Superblock format '0' is treated differently to others for legacy -reasons - it is the original superblock format. - - -General Rules - apply for all superblock formats ------------------------------------------------- - -An array is 'created' by writing appropriate superblocks to all -devices. - -It is 'assembled' by associating each of these devices with an -particular md virtual device. Once it is completely assembled, it can -be accessed. - -An array should be created by a user-space tool. This will write -superblocks to all devices. It will usually mark the array as -'unclean', or with some devices missing so that the kernel md driver -can create appropriate redundancy (copying in raid1, parity -calculation in raid4/5). - -When an array is assembled, it is first initialized with the -SET_ARRAY_INFO ioctl. This contains, in particular, a major and minor -version number. The major version number selects which superblock -format is to be used. The minor number might be used to tune handling -of the format, such as suggesting where on each device to look for the -superblock. - -Then each device is added using the ADD_NEW_DISK ioctl. This -provides, in particular, a major and minor number identifying the -device to add. - -The array is started with the RUN_ARRAY ioctl. - -Once started, new devices can be added. They should have an -appropriate superblock written to them, and then passed be in with -ADD_NEW_DISK. - -Devices that have failed or are not yet active can be detached from an -array using HOT_REMOVE_DISK. - - -Specific Rules that apply to format-0 super block arrays, and - arrays with no superblock (non-persistent). -------------------------------------------------------------- - -An array can be 'created' by describing the array (level, chunksize -etc) in a SET_ARRAY_INFO ioctl. This must has major_version==0 and -raid_disks != 0. 
- -Then uninitialized devices can be added with ADD_NEW_DISK. The -structure passed to ADD_NEW_DISK must specify the state of the device -and it's role in the array. - -Once started with RUN_ARRAY, uninitialized spares can be added with -HOT_ADD_DISK. - - - -MD devices in sysfs -------------------- -md devices appear in sysfs (/sys) as regular block devices, -e.g. - /sys/block/md0 - -Each 'md' device will contain a subdirectory called 'md' which -contains further md-specific information about the device. - -All md devices contain: - level - a text file indicating the 'raid level'. This may be a standard - numerical level prefixed by "RAID-" - e.g. "RAID-5", or some - other name such as "linear" or "multipath". - If no raid level has been set yet (array is still being - assembled), this file will be empty. - - raid_disks - a text file with a simple number indicating the number of devices - in a fully functional array. If this is not yet known, the file - will be empty. If an array is being resized (not currently - possible) this will contain the larger of the old and new sizes. - Some raid level (RAID1) allow this value to be set while the - array is active. This will reconfigure the array. Otherwise - it can only be set while assembling an array. - - chunk_size - This is the size if bytes for 'chunks' and is only relevant to - raid levels that involve striping (1,4,5,6,10). The address space - of the array is conceptually divided into chunks and consecutive - chunks are striped onto neighbouring devices. - The size should be atleast PAGE_SIZE (4k) and should be a power - of 2. This can only be set while assembling an array - - component_size - For arrays with data redundancy (i.e. not raid0, linear, faulty, - multipath), all components must be the same size - or at least - there must a size that they all provide space for. This is a key - part or the geometry of the array. It is measured in sectors - and can be read from here. 
Writing to this value may resize - the array if the personality supports it (raid1, raid5, raid6), - and if the component drives are large enough. - - metadata_version - This indicates the format that is being used to record metadata - about the array. It can be 0.90 (traditional format), 1.0, 1.1, - 1.2 (newer format in varying locations) or "none" indicating that - the kernel isn't managing metadata at all. - - level - The raid 'level' for this array. The name will often (but not - always) be the same as the name of the module that implements the - level. To be auto-loaded the module must have an alias - md-$LEVEL e.g. md-raid5 - This can be written only while the array is being assembled, not - after it is started. - - new_dev - This file can be written but not read. The value written should - be a block device number as major:minor. e.g. 8:0 - This will cause that device to be attached to the array, if it is - available. It will then appear at md/dev-XXX (depending on the - name of the device) and further configuration is then possible. - - sync_speed_min - sync_speed_max - This are similar to /proc/sys/dev/raid/speed_limit_{min,max} - however they only apply to the particular array. - If no value has been written to these, of if the word 'system' - is written, then the system-wide value is used. If a value, - in kibibytes-per-second is written, then it is used. - When the files are read, they show the currently active value - followed by "(local)" or "(system)" depending on whether it is - a locally set or system-wide value. - - sync_completed - This shows the number of sectors that have been completed of - whatever the current sync_action is, followed by the number of - sectors in total that could need to be processed. The two - numbers are separated by a '/' thus effectively showing one - value, a fraction of the process that is complete. - - sync_speed - This shows the current actual speed, in K/sec, of the current - sync_action. 
It is averaged over the last 30 seconds. - - -As component devices are added to an md array, they appear in the 'md' -directory as new directories named - dev-XXX -where XXX is a name that the kernel knows for the device, e.g. hdb1. -Each directory contains: - - block - a symlink to the block device in /sys/block, e.g. - /sys/block/md0/md/dev-hdb1/block -> ../../../../block/hdb/hdb1 - - super - A file containing an image of the superblock read from, or - written to, that device. - - state - A file recording the current state of the device in the array - which can be a comma separated list of - faulty - device has been kicked from active use due to - a detected fault - in_sync - device is a fully in-sync member of the array - spare - device is working, but not a full member. - This includes spares that are in the process - of being recoverred to - This list make grow in future. - - errors - An approximate count of read errors that have been detected on - this device but have not caused the device to be evicted from - the array (either because they were corrected or because they - happened while the array was read-only). When using version-1 - metadata, this value persists across restarts of the array. - - This value can be written while assembling an array thus - providing an ongoing count for arrays with metadata managed by - userspace. - - slot - This gives the role that the device has in the array. It will - either be 'none' if the device is not active in the array - (i.e. is a spare or has failed) or an integer less than the - 'raid_disks' number for the array indicating which possition - it currently fills. This can only be set while assembling an - array. A device for which this is set is assumed to be working. - - offset - This gives the location in the device (in sectors from the - start) where data from the array will be stored. Any part of - the device before this offset us not touched, unless it is - used for storing metadata (Formats 1.1 and 1.2). 
- - size - The amount of the device, after the offset, that can be used - for storage of data. This will normally be the same as the - component_size. This can be written while assembling an - array. If a value less than the current component_size is - written, component_size will be reduced to this value. - - -An active md device will also contain and entry for each active device -in the array. These are named - - rdNN - -where 'NN' is the possition in the array, starting from 0. -So for a 3 drive array there will be rd0, rd1, rd2. -These are symbolic links to the appropriate 'dev-XXX' entry. -Thus, for example, - cat /sys/block/md*/md/rd*/state -will show 'in_sync' on every line. - - - -Active md devices for levels that support data redundancy (1,4,5,6) -also have - - sync_action - a text file that can be used to monitor and control the rebuild - process. It contains one word which can be one of: - resync - redundancy is being recalculated after unclean - shutdown or creation - recover - a hot spare is being built to replace a - failed/missing device - idle - nothing is happening - check - A full check of redundancy was requested and is - happening. This reads all block and checks - them. A repair may also happen for some raid - levels. - repair - A full check and repair is happening. This is - similar to 'resync', but was requested by the - user, and the write-intent bitmap is NOT used to - optimise the process. - - This file is writable, and each of the strings that could be - read are meaningful for writing. - - 'idle' will stop an active resync/recovery etc. There is no - guarantee that another resync/recovery may not be automatically - started again, though some event will be needed to trigger - this. - 'resync' or 'recovery' can be used to restart the - corresponding operation if it was stopped with 'idle'. - 'check' and 'repair' will start the appropriate process - providing the current state is 'idle'. 
- - mismatch_count - When performing 'check' and 'repair', and possibly when - performing 'resync', md will count the number of errors that are - found. The count in 'mismatch_cnt' is the number of sectors - that were re-written, or (for 'check') would have been - re-written. As most raid levels work in units of pages rather - than sectors, this my be larger than the number of actual errors - by a factor of the number of sectors in a page. - -Each active md device may also have attributes specific to the -personality module that manages it. -These are specific to the implementation of the module and could -change substantially if the implementation changes. - -These currently include - - stripe_cache_size (currently raid5 only) - number of entries in the stripe cache. This is writable, but - there are upper and lower limits (32768, 16). Default is 128. - strip_cache_active (currently raid5 only) - number of active entries in the stripe cache diff -Nru mdadm-2.6.7.1/monitor.c mdadm-3.1.4/monitor.c --- mdadm-2.6.7.1/monitor.c 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/monitor.c 2010-08-26 05:24:15.000000000 +0300 @@ -0,0 +1,609 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include "mdadm.h" +#include "mdmon.h" +#include +#include +#include + +static char *array_states[] = { + "clear", "inactive", "suspended", "readonly", "read-auto", + "clean", "active", "write-pending", "active-idle", NULL }; +static char *sync_actions[] = { + "idle", "reshape", "resync", "recover", "check", "repair", NULL +}; + +static int write_attr(char *attr, int fd) +{ + return write(fd, attr, strlen(attr)); +} + +static void add_fd(fd_set *fds, int *maxfd, int fd) +{ + if (fd < 0) + return; + if (fd > *maxfd) + *maxfd = fd; + FD_SET(fd, fds); +} + +static int read_attr(char *buf, int len, int fd) +{ + int n; + + if (fd < 0) { + buf[0] = 0; + return 0; + } + lseek(fd, 0, 0); + n = read(fd, buf, len - 1); + + if (n <= 0) { + buf[0] = 0; + return 0; + } + buf[n] = 0; + if (buf[n-1] == '\n') + buf[n-1] = 0; + return n; +} + +static unsigned long long read_resync_start(int fd) +{ + char buf[30]; + int n; + + n = read_attr(buf, 30, fd); + if (n <= 0) + return 0; + if (strncmp(buf, "none", 4) == 0) + return MaxSector; + else + return strtoull(buf, NULL, 10); +} + +static unsigned long long read_sync_completed(int fd) +{ + unsigned long long val; + char buf[50]; + int n; + char *ep; + + n = read_attr(buf, 50, fd); + + if (n <= 0) + return 0; + buf[n] = 0; + val = strtoull(buf, &ep, 0); + if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) + return 0; + return val; +} + +static enum array_state read_state(int fd) +{ + char buf[20]; + int n = read_attr(buf, 20, fd); + + if (n <= 0) + return bad_word; + return (enum array_state) sysfs_match_word(buf, array_states); +} + +static enum sync_action read_action( int fd) +{ + char buf[20]; + int n = read_attr(buf, 20, fd); + + if (n <= 0) + return bad_action; + return (enum sync_action) sysfs_match_word(buf, sync_actions); +} + +int read_dev_state(int fd) +{ + char buf[60]; + int n = read_attr(buf, 60, fd); + char *cp; + int rv = 0; + + if (n <= 0) + return 0; + + cp = buf; + while (cp) { + if 
(sysfs_attr_match(cp, "faulty")) + rv |= DS_FAULTY; + if (sysfs_attr_match(cp, "in_sync")) + rv |= DS_INSYNC; + if (sysfs_attr_match(cp, "write_mostly")) + rv |= DS_WRITE_MOSTLY; + if (sysfs_attr_match(cp, "spare")) + rv |= DS_SPARE; + if (sysfs_attr_match(cp, "blocked")) + rv |= DS_BLOCKED; + cp = strchr(cp, ','); + if (cp) + cp++; + } + return rv; +} + +static void signal_manager(void) +{ + /* tgkill(getpid(), mon_tid, SIGUSR1); */ + int pid = getpid(); + syscall(SYS_tgkill, pid, mgr_tid, SIGUSR1); +} + +/* Monitor a set of active md arrays - all of which share the + * same metadata - and respond to events that require + * metadata update. + * + * New arrays are detected by another thread which allocates + * required memory and attaches the data structure to our list. + * + * Events: + * Array stops. + * This is detected by array_state going to 'clear' or 'inactive'. + * while we thought it was active. + * Response is to mark metadata as clean and 'clear' the array(??) + * write-pending + * array_state if 'write-pending' + * We mark metadata as 'dirty' then set array to 'active'. + * active_idle + * Either ignore, or mark clean, then mark metadata as clean. + * + * device fails + * detected by rd-N/state reporting "faulty" + * mark device as 'failed' in metadata, let the kernel release the + * device by writing '-blocked' to rd/state, and finally write 'remove' to + * rd/state. Before a disk can be replaced it must be failed and removed + * from all container members, this will be preemptive for the other + * arrays... safe? + * + * sync completes + * sync_action was 'resync' and becomes 'idle' and resync_start becomes + * MaxSector + * Notify metadata that sync is complete. + * + * recovery completes + * sync_action changes from 'recover' to 'idle' + * Check each device state and mark metadata if 'faulty' or 'in_sync'. + * + * deal with resync + * This only happens on finding a new array... mdadm will have set + * 'resync_start' to the correct value. 
If 'resync_start' indicates that an + * resync needs to occur set the array to the 'active' state rather than the + * initial read-auto state. + * + * + * + * We wait for a change (poll/select) on array_state, sync_action, and + * each rd-X/state file. + * When we get any change, we check everything. So read each state file, + * then decide what to do. + * + * The core action is to write new metadata to all devices in the array. + * This is done at most once on any wakeup. + * After that we might: + * - update the array_state + * - set the role of some devices. + * - request a sync_action + * + */ + +static int read_and_act(struct active_array *a) +{ + unsigned long long sync_completed; + int check_degraded = 0; + int deactivate = 0; + struct mdinfo *mdi; + int dirty = 0; + + a->next_state = bad_word; + a->next_action = bad_action; + + a->curr_state = read_state(a->info.state_fd); + a->curr_action = read_action(a->action_fd); + a->info.resync_start = read_resync_start(a->resync_start_fd); + sync_completed = read_sync_completed(a->sync_completed_fd); + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + mdi->next_state = 0; + if (mdi->state_fd >= 0) { + mdi->recovery_start = read_resync_start(mdi->recovery_fd); + mdi->curr_state = read_dev_state(mdi->state_fd); + } + } + + if (a->curr_state <= inactive && + a->prev_state > inactive) { + /* array has been stopped */ + a->container->ss->set_array_state(a, 1); + a->next_state = clear; + deactivate = 1; + } + if (a->curr_state == write_pending) { + a->container->ss->set_array_state(a, 0); + a->next_state = active; + dirty = 1; + } + if (a->curr_state == active_idle) { + /* Set array to 'clean' FIRST, then mark clean + * in the metadata + */ + a->next_state = clean; + dirty = 1; + } + if (a->curr_state == clean) { + a->container->ss->set_array_state(a, 1); + } + if (a->curr_state == active || + a->curr_state == suspended || + a->curr_state == bad_word) + dirty = 1; + if (a->curr_state == readonly) { + /* Well, I'm ready 
to handle things. If readonly + * wasn't requested, transition to read-auto. + */ + char buf[64]; + read_attr(buf, sizeof(buf), a->metadata_fd); + if (strncmp(buf, "external:-", 10) == 0) { + /* explicit request for readonly array. Leave it alone */ + ; + } else { + if (a->container->ss->set_array_state(a, 2)) + a->next_state = read_auto; /* array is clean */ + else { + a->next_state = active; /* Now active for recovery etc */ + dirty = 1; + } + } + } + + if (!deactivate && + a->curr_action == idle && + a->prev_action == resync) { + /* A resync has finished. The endpoint is recorded in + * 'sync_start'. We don't update the metadata + * until the array goes inactive or readonly though. + * Just check if we need to fiddle spares. + */ + a->container->ss->set_array_state(a, a->curr_state <= clean); + check_degraded = 1; + } + + if (!deactivate && + a->curr_action == idle && + a->prev_action == recover) { + /* A recovery has finished. Some disks may be in sync now, + * and the array may no longer be degraded + */ + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + a->container->ss->set_disk(a, mdi->disk.raid_disk, + mdi->curr_state); + if (! (mdi->curr_state & DS_INSYNC)) + check_degraded = 1; + } + } + + /* Check for failures and if found: + * 1/ Record the failure in the metadata and unblock the device. + * FIXME update the kernel to stop notifying on failed drives when + * the array is readonly and we have cleared 'blocked' + * 2/ Try to remove the device if the array is writable, or can be + * made writable. + */ + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + if (mdi->curr_state & DS_FAULTY) { + a->container->ss->set_disk(a, mdi->disk.raid_disk, + mdi->curr_state); + check_degraded = 1; + mdi->next_state |= DS_UNBLOCK; + if (a->curr_state == read_auto) { + a->container->ss->set_array_state(a, 0); + a->next_state = active; + } + if (a->curr_state > readonly) + mdi->next_state |= DS_REMOVE; + } + } + + /* Check for recovery checkpoint notifications. 
We need to be a + * minimum distance away from the last checkpoint to prevent + * over checkpointing. Note reshape checkpointing is not + * handled here. + */ + if (sync_completed > a->last_checkpoint && + sync_completed - a->last_checkpoint > a->info.component_size >> 4 && + a->curr_action > reshape) { + /* A (non-reshape) sync_action has reached a checkpoint. + * Record the updated position in the metadata + */ + a->last_checkpoint = sync_completed; + a->container->ss->set_array_state(a, a->curr_state <= clean); + } else if (sync_completed > a->last_checkpoint) + a->last_checkpoint = sync_completed; + + a->container->ss->sync_metadata(a->container); + dprintf("%s(%d): state:%s action:%s next(", __func__, a->info.container_member, + array_states[a->curr_state], sync_actions[a->curr_action]); + + /* Effect state changes in the array */ + if (a->next_state != bad_word) { + dprintf(" state:%s", array_states[a->next_state]); + write_attr(array_states[a->next_state], a->info.state_fd); + } + if (a->next_action != bad_action) { + write_attr(sync_actions[a->next_action], a->action_fd); + dprintf(" action:%s", sync_actions[a->next_action]); + } + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + if (mdi->next_state & DS_UNBLOCK) { + dprintf(" %d:-blocked", mdi->disk.raid_disk); + write_attr("-blocked", mdi->state_fd); + } + + if ((mdi->next_state & DS_REMOVE) && mdi->state_fd >= 0) { + int remove_result; + + /* the kernel may not be able to immediately remove the + * disk, we can simply wait until the next event to try + * again. 
+ */ + remove_result = write_attr("remove", mdi->state_fd); + if (remove_result > 0) { + dprintf(" %d:removed", mdi->disk.raid_disk); + close(mdi->state_fd); + mdi->state_fd = -1; + } + } + if (mdi->next_state & DS_INSYNC) { + write_attr("+in_sync", mdi->state_fd); + dprintf(" %d:+in_sync", mdi->disk.raid_disk); + } + } + dprintf(" )\n"); + + /* move curr_ to prev_ */ + a->prev_state = a->curr_state; + + a->prev_action = a->curr_action; + + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + mdi->prev_state = mdi->curr_state; + mdi->next_state = 0; + } + + if (check_degraded) { + /* manager will do the actual check */ + a->check_degraded = 1; + signal_manager(); + } + + if (deactivate) + a->container = NULL; + + return dirty; +} + +static struct mdinfo * +find_device(struct active_array *a, int major, int minor) +{ + struct mdinfo *mdi; + + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + if (mdi->disk.major == major && mdi->disk.minor == minor) + return mdi; + + return NULL; +} + +static void reconcile_failed(struct active_array *aa, struct mdinfo *failed) +{ + struct active_array *a; + struct mdinfo *victim; + + for (a = aa; a; a = a->next) { + if (!a->container) + continue; + victim = find_device(a, failed->disk.major, failed->disk.minor); + if (!victim) + continue; + + if (!(victim->curr_state & DS_FAULTY)) + write_attr("faulty", victim->state_fd); + } +} + +#ifdef DEBUG +static void dprint_wake_reasons(fd_set *fds) +{ + int i; + char proc_path[256]; + char link[256]; + char *basename; + int rv; + + fprintf(stderr, "monitor: wake ( "); + for (i = 0; i < FD_SETSIZE; i++) { + if (FD_ISSET(i, fds)) { + sprintf(proc_path, "/proc/%d/fd/%d", + (int) getpid(), i); + + rv = readlink(proc_path, link, sizeof(link) - 1); + if (rv < 0) { + fprintf(stderr, "%d:unknown ", i); + continue; + } + link[rv] = '\0'; + basename = strrchr(link, '/'); + fprintf(stderr, "%d:%s ", + i, basename ? 
++basename : link); + } + } + fprintf(stderr, ")\n"); +} +#endif + +int monitor_loop_cnt; + +static int wait_and_act(struct supertype *container, int nowait) +{ + fd_set rfds; + int maxfd = 0; + struct active_array **aap = &container->arrays; + struct active_array *a, **ap; + int rv; + struct mdinfo *mdi; + static unsigned int dirty_arrays = ~0; /* start at some non-zero value */ + + FD_ZERO(&rfds); + + for (ap = aap ; *ap ;) { + a = *ap; + /* once an array has been deactivated we want to + * ask the manager to discard it. + */ + if (!a->container) { + if (discard_this) { + ap = &(*ap)->next; + continue; + } + *ap = a->next; + a->next = NULL; + discard_this = a; + signal_manager(); + continue; + } + + add_fd(&rfds, &maxfd, a->info.state_fd); + add_fd(&rfds, &maxfd, a->action_fd); + add_fd(&rfds, &maxfd, a->sync_completed_fd); + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + add_fd(&rfds, &maxfd, mdi->state_fd); + + ap = &(*ap)->next; + } + + if (manager_ready && (*aap == NULL || (sigterm && !dirty_arrays))) { + /* No interesting arrays, or we have been told to + * terminate and everything is clean. Lets see about + * exiting. Note that blocking at this point is not a + * problem as there are no active arrays, there is + * nothing that we need to be ready to do. + */ + int fd = open_dev_excl(container->devnum); + if (fd >= 0 || errno != EBUSY) { + /* OK, we are safe to leave */ + if (sigterm && !dirty_arrays) + dprintf("caught sigterm, all clean... exiting\n"); + else + dprintf("no arrays to monitor... 
exiting\n"); + if (!sigterm) + /* On SIGTERM, someone (the take-over mdmon) will + * clean up + */ + remove_pidfile(container->devname); + exit_now = 1; + signal_manager(); + exit(0); + } + } + + if (!nowait) { + sigset_t set; + sigprocmask(SIG_UNBLOCK, NULL, &set); + sigdelset(&set, SIGUSR1); + monitor_loop_cnt |= 1; + rv = pselect(maxfd+1, NULL, NULL, &rfds, NULL, &set); + monitor_loop_cnt += 1; + if (rv == -1 && errno == EINTR) + rv = 0; + #ifdef DEBUG + dprint_wake_reasons(&rfds); + #endif + + } + + if (update_queue) { + struct metadata_update *this; + + for (this = update_queue; this ; this = this->next) + container->ss->process_update(container, this); + + update_queue_handled = update_queue; + update_queue = NULL; + signal_manager(); + container->ss->sync_metadata(container); + } + + rv = 0; + dirty_arrays = 0; + for (a = *aap; a ; a = a->next) { + int is_dirty; + + if (a->replaces && !discard_this) { + struct active_array **ap; + for (ap = &a->next; *ap && *ap != a->replaces; + ap = & (*ap)->next) + ; + if (*ap) + *ap = (*ap)->next; + discard_this = a->replaces; + a->replaces = NULL; + /* FIXME check if device->state_fd need to be cleared?*/ + signal_manager(); + } + if (a->container) { + is_dirty = read_and_act(a); + rv |= 1; + dirty_arrays += is_dirty; + /* when terminating stop manipulating the array after it + * is clean, but make sure read_and_act() is given a + * chance to handle 'active_idle' + */ + if (sigterm && !is_dirty) + a->container = NULL; /* stop touching this array */ + } + } + + /* propagate failures across container members */ + for (a = *aap; a ; a = a->next) { + if (!a->container) + continue; + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + if (mdi->curr_state & DS_FAULTY) + reconcile_failed(*aap, mdi); + } + + return rv; +} + +void do_monitor(struct supertype *container) +{ + int rv; + int first = 1; + do { + rv = wait_and_act(container, first); + first = 0; + } while (rv >= 0); +} diff -Nru mdadm-2.6.7.1/Monitor.c 
mdadm-3.1.4/Monitor.c --- mdadm-2.6.7.1/Monitor.c 2008-10-15 06:34:28.000000000 +0300 +++ mdadm-3.1.4/Monitor.c 2010-08-26 05:24:15.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,12 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" @@ -38,14 +33,6 @@ static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mailfrom, char *cmd, int dosyslog); -static char *percentalerts[] = { - "RebuildStarted", - "Rebuild20", - "Rebuild40", - "Rebuild60", - "Rebuild80", -}; - /* The largest number of disks current arrays can manage is 384 * This really should be dynamically, but that will have to wait * At least it isn't MD_SB_DISKS. 
@@ -54,7 +41,7 @@ int Monitor(mddev_dev_t devlist, char *mailaddr, char *alert_cmd, int period, int daemonise, int scan, int oneshot, - int dosyslog, int test, char* pidfile) + int dosyslog, int test, char* pidfile, int increments) { /* * Every few seconds, scan every md device looking for changes @@ -82,8 +69,8 @@ * An active device had a reverse transition * RebuildStarted * percent went from -1 to +ve - * Rebuild20 Rebuild40 Rebuild60 Rebuild80 - * percent went from below to not-below that number + * RebuildNN + * percent went from below to not-below NN% * DeviceDisappeared * Couldn't access a device which was previously visible * @@ -107,7 +94,7 @@ int active, working, failed, spare, raid; int expected_spares; int devstate[MaxDisks]; - int devid[MaxDisks]; + unsigned devid[MaxDisks]; int percent; struct state *next; } *statelist = NULL; @@ -156,7 +143,7 @@ return 1; } close(0); - open("/dev/null", 3); + open("/dev/null", O_RDWR); dup2(0,1); dup2(0,2); setsid(); @@ -165,10 +152,21 @@ if (devlist == NULL) { mddev_ident_t mdlist = conf_get_ident(NULL); for (; mdlist; mdlist=mdlist->next) { - struct state *st = malloc(sizeof *st); + struct state *st; + if (mdlist->devname == NULL) + continue; + if (strcasecmp(mdlist->devname, "") == 0) + continue; + st = malloc(sizeof *st); if (st == NULL) continue; - st->devname = strdup(mdlist->devname); + if (mdlist->devname[0] == '/') + st->devname = strdup(mdlist->devname); + else { + st->devname = malloc(8+strlen(mdlist->devname)+1); + strcpy(strcpy(st->devname, "/dev/md/"), + mdlist->devname); + } st->utime = 0; st->next = statelist; st->err = 0; @@ -220,7 +218,7 @@ struct mdstat_ent *mse = NULL, *mse2; char *dev = st->devname; int fd; - unsigned int i; + int i; if (test) alert("TestMessage", dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog); @@ -273,6 +271,10 @@ mse = mse2; } + if (array.utime == 0) + /* external arrays don't update utime */ + array.utime = time(0); + if (st->utime == array.utime && st->failed == 
array.failed_disks && st->working == array.working_disks && @@ -301,9 +303,17 @@ if (mse && st->percent >= 0 && mse->percent >= 0 && - (mse->percent / 20) > (st->percent / 20)) - alert(percentalerts[mse->percent/20], + (mse->percent / increments) > (st->percent / increments)) { + char percentalert[15]; // "RebuildNN" (10 chars) or "RebuildStarted" (15 chars) + + if((mse->percent / increments) == 0) + snprintf(percentalert, sizeof(percentalert), "RebuildStarted"); + else + snprintf(percentalert, sizeof(percentalert), "Rebuild%02d", mse->percent); + + alert(percentalert, dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog); + } if (mse && mse->percent == -1 && @@ -342,7 +352,7 @@ close(fd); for (i=0; ipattern && i < strlen(mse->pattern)) { + } else if (mse && mse->pattern && i < (int)strlen(mse->pattern)) { switch(mse->pattern[i]) { case 'U': newstate = 6 /* ACTIVE/SYNC */; break; case '_': newstate = 0; break; @@ -368,19 +378,20 @@ minor(st->devid[i]), 1); change = newstate ^ st->devstate[i]; if (st->utime && change && !st->err) { - if (i < (unsigned)array.raid_disks && + if (i < array.raid_disks && (((newstate&change)&(1<devstate[i]&change)&(1<devstate[i]&change)&(1<= (unsigned)array.raid_disks && + else if (i >= array.raid_disks && (disc.major || disc.minor) && st->devid[i] == makedev(disc.major, disc.minor) && ((newstate&change)&(1<devstate[i]&change)&(1<spare_group = NULL; st->expected_spares = -1; statelist = st; + if (test) + alert("TestMessage", st->devname, NULL, mailaddr, mailfrom, alert_cmd, dosyslog); alert("NewArray", st->devname, NULL, mailaddr, mailfrom, alert_cmd, dosyslog); new_found = 1; } @@ -468,16 +481,25 @@ } } if (dev > 0) { - if (ioctl(fd2, HOT_REMOVE_DISK, - (unsigned long)dev) == 0) { - if (ioctl(fd1, HOT_ADD_DISK, - (unsigned long)dev) == 0) { + struct mddev_dev_s devlist; + char devname[20]; + devlist.next = NULL; + devlist.used = 0; + devlist.re_add = 0; + devlist.writemostly = 0; + devlist.devname = devname; + sprintf(devname, "%d:%d", 
major(dev), minor(dev)); + + devlist.disposition = 'r'; + if (Manage_subdevs(st2->devname, fd2, &devlist, -1, 0) == 0) { + devlist.disposition = 'a'; + if (Manage_subdevs(st->devname, fd1, &devlist, -1, 0) == 0) { alert("MoveSpare", st->devname, st2->devname, mailaddr, mailfrom, alert_cmd, dosyslog); close(fd1); close(fd2); break; } - else ioctl(fd2, HOT_ADD_DISK, (unsigned long) dev); + else Manage_subdevs(st2->devname, fd2, &devlist, -1, 0); } } close(fd1); @@ -560,7 +582,7 @@ n=fwrite(buf, 1, n, mp); /* yes, i don't care about the result */ fclose(mdstat); } - fclose(mp); + pclose(mp); } } @@ -602,10 +624,7 @@ strerror(errno)); return 2; } - if (major(stb.st_rdev) == MD_MAJOR) - devnum = minor(stb.st_rdev); - else - devnum = -1-(minor(stb.st_rdev)/64); + devnum = stat2devnum(&stb); while(1) { struct mdstat_ent *ms = mdstat_read(1, 0); @@ -616,10 +635,17 @@ break; if (!e || e->percent < 0) { + if (e && e->metadata_version && + strncmp(e->metadata_version, "external:", 9) == 0) { + if (is_subarray(&e->metadata_version[9])) + ping_monitor(&e->metadata_version[9]); + else + ping_monitor(devnum2devname(devnum)); + } free_mdstat(ms); return rv; } - free(ms); + free_mdstat(ms); rv = 0; mdstat_wait(5); } diff -Nru mdadm-2.6.7.1/msg.c mdadm-3.1.4/msg.c --- mdadm-2.6.7.1/msg.c 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/msg.c 2010-08-26 05:24:15.000000000 +0300 @@ -0,0 +1,231 @@ +/* + * Copyright (C) 2008 Intel Corporation + * + * mdmon socket / message handling + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mdadm.h" +#include "mdmon.h" + +static const __u32 start_magic = 0x5a5aa5a5; +static const __u32 end_magic = 0xa5a55a5a; + +static int send_buf(int fd, const void* buf, int len, int tmo) +{ + fd_set set; + int rv; + struct timeval timeout = {tmo, 0}; + struct timeval *ptmo = tmo ? &timeout : NULL; + + while (len) { + FD_ZERO(&set); + FD_SET(fd, &set); + rv = select(fd+1, NULL, &set, NULL, ptmo); + if (rv <= 0) + return -1; + rv = write(fd, buf, len); + if (rv <= 0) + return -1; + len -= rv; + buf += rv; + } + return 0; +} + +static int recv_buf(int fd, void* buf, int len, int tmo) +{ + fd_set set; + int rv; + struct timeval timeout = {tmo, 0}; + struct timeval *ptmo = tmo ? 
&timeout : NULL; + + while (len) { + FD_ZERO(&set); + FD_SET(fd, &set); + rv = select(fd+1, &set, NULL, NULL, ptmo); + if (rv <= 0) + return -1; + rv = read(fd, buf, len); + if (rv <= 0) + return -1; + len -= rv; + buf += rv; + } + return 0; +} + + +int send_message(int fd, struct metadata_update *msg, int tmo) +{ + __s32 len = msg->len; + int rv; + + rv = send_buf(fd, &start_magic, 4, tmo); + rv = rv ?: send_buf(fd, &len, 4, tmo); + if (len > 0) + rv = rv ?: send_buf(fd, msg->buf, msg->len, tmo); + rv = send_buf(fd, &end_magic, 4, tmo); + + return rv; +} + +int receive_message(int fd, struct metadata_update *msg, int tmo) +{ + __u32 magic; + __s32 len; + int rv; + + rv = recv_buf(fd, &magic, 4, tmo); + if (rv < 0 || magic != start_magic) + return -1; + rv = recv_buf(fd, &len, 4, tmo); + if (rv < 0 || len > MSG_MAX_LEN) + return -1; + if (len > 0) { + msg->buf = malloc(len); + if (msg->buf == NULL) + return -1; + rv = recv_buf(fd, msg->buf, len, tmo); + if (rv < 0) { + free(msg->buf); + return -1; + } + } else + msg->buf = NULL; + rv = recv_buf(fd, &magic, 4, tmo); + if (rv < 0 || magic != end_magic) { + free(msg->buf); + return -1; + } + msg->len = len; + return 0; +} + +int ack(int fd, int tmo) +{ + struct metadata_update msg = { .len = 0 }; + + return send_message(fd, &msg, tmo); +} + +int wait_reply(int fd, int tmo) +{ + struct metadata_update msg; + return receive_message(fd, &msg, tmo); +} + +int connect_monitor(char *devname) +{ + char path[100]; + int sfd; + long fl; + struct sockaddr_un addr; + int pos; + char *c; + + pos = sprintf(path, "%s/", MDMON_DIR); + if (is_subarray(devname)) { + devname++; + c = strchr(devname, '/'); + if (!c) + return -1; + snprintf(&path[pos], c - devname + 1, "%s", devname); + pos += c - devname; + } else + pos += sprintf(&path[pos], "%s", devname); + sprintf(&path[pos], ".sock"); + + sfd = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sfd < 0) + return -1; + + addr.sun_family = PF_LOCAL; + strcpy(addr.sun_path, path); + if 
(connect(sfd, &addr, sizeof(addr)) < 0) { + close(sfd); + return -1; + } + + fl = fcntl(sfd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(sfd, F_SETFL, fl); + + return sfd; +} + +int fping_monitor(int sfd) +{ + int err = 0; + + if (sfd < 0) + return sfd; + + /* try to ping existing socket */ + if (ack(sfd, 20) != 0) + err = -1; + + /* check the reply */ + if (!err && wait_reply(sfd, 20) != 0) + err = -1; + + return err; +} + + +/* give the monitor a chance to update the metadata */ +int ping_monitor(char *devname) +{ + int sfd = connect_monitor(devname); + int err = fping_monitor(sfd); + + close(sfd); + return err; +} + +/* give the manager a chance to view the updated container state. This + * would naturally happen due to the manager noticing a change in + * /proc/mdstat; however, pinging encourages this detection to happen + * while an exclusive open() on the container is active + */ +int ping_manager(char *devname) +{ + int sfd = connect_monitor(devname); + struct metadata_update msg = { .len = -1 }; + int err = 0; + + if (sfd < 0) + return sfd; + + err = send_message(sfd, &msg, 20); + + /* check the reply */ + if (!err && wait_reply(sfd, 20) != 0) + err = -1; + + close(sfd); + return err; +} diff -Nru mdadm-2.6.7.1/msg.h mdadm-3.1.4/msg.h --- mdadm-2.6.7.1/msg.h 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/msg.h 2010-08-05 09:51:58.000000000 +0300 @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2008 Intel Corporation + * + * mdmon socket / message handling + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + + +struct mdinfo; +struct metadata_update; + +extern int receive_message(int fd, struct metadata_update *msg, int tmo); +extern int send_message(int fd, struct metadata_update *msg, int tmo); +extern int ack(int fd, int tmo); +extern int wait_reply(int fd, int tmo); +extern int connect_monitor(char *devname); +extern int ping_monitor(char *devname); +extern int fping_monitor(int sock); +extern int ping_manager(char *devname); + +#define MSG_MAX_LEN (4*1024*1024) diff -Nru mdadm-2.6.7.1/platform-intel.c mdadm-3.1.4/platform-intel.c --- mdadm-2.6.7.1/platform-intel.c 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/platform-intel.c 2010-08-26 05:24:15.000000000 +0300 @@ -0,0 +1,266 @@ +/* + * Intel(R) Matrix Storage Manager hardware and firmware support routines + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ +#include "mdadm.h" +#include "platform-intel.h" +#include "probe_roms.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void free_sys_dev(struct sys_dev **list) +{ + while (*list) { + struct sys_dev *next = (*list)->next; + + if ((*list)->path) + free((*list)->path); + free(*list); + *list = next; + } +} + +struct sys_dev *find_driver_devices(const char *bus, const char *driver) +{ + /* search sysfs for devices driven by 'driver' */ + char path[292]; + char link[256]; + char *c; + DIR *driver_dir; + struct dirent *de; + struct sys_dev *head = NULL; + struct sys_dev *list = NULL; + + sprintf(path, "/sys/bus/%s/drivers/%s", bus, driver); + driver_dir = opendir(path); + if (!driver_dir) + return NULL; + for (de = readdir(driver_dir); de; de = readdir(driver_dir)) { + int n; + + /* is 'de' a device? check that the 'subsystem' link exists and + * that its target matches 'bus' + */ + sprintf(path, "/sys/bus/%s/drivers/%s/%s/subsystem", + bus, driver, de->d_name); + n = readlink(path, link, sizeof(link)); + if (n < 0 || n >= (int)sizeof(link)) + continue; + link[n] = '\0'; + c = strrchr(link, '/'); + if (!c) + continue; + if (strncmp(bus, c+1, strlen(bus)) != 0) + continue; + + /* start / add list entry */ + if (!head) { + head = malloc(sizeof(*head)); + list = head; + } else { + list->next = malloc(sizeof(*head)); + list = list->next; + } + + if (!list) { + free_sys_dev(&head); + break; + } + + /* generate canonical path name for the device */ + sprintf(path, "/sys/bus/%s/drivers/%s/%s", + bus, driver, de->d_name); + list->path = canonicalize_file_name(path); + list->next = NULL; + } + closedir(driver_dir); + return head; +} + +__u16 devpath_to_vendor(const char *dev_path) +{ + char path[strlen(dev_path) + strlen("/vendor") + 1]; + char vendor[7]; + int fd; + __u16 id = 0xffff; + int n; + + sprintf(path, "%s/vendor", dev_path); + + fd = open(path, O_RDONLY); + if (fd < 0) + return 0xffff; + + n = read(fd, vendor, 
sizeof(vendor)); + if (n == sizeof(vendor)) { + vendor[n - 1] = '\0'; + id = strtoul(vendor, NULL, 16); + } + close(fd); + + return id; +} + +static int platform_has_intel_ahci(void) +{ + struct sys_dev *devices = find_driver_devices("pci", "ahci"); + struct sys_dev *dev; + int ret = 0; + + for (dev = devices; dev; dev = dev->next) + if (devpath_to_vendor(dev->path) == 0x8086) { + ret = 1; + break; + } + + free_sys_dev(&devices); + + return ret; +} + + +static struct imsm_orom imsm_orom; +static int scan(const void *start, const void *end) +{ + int offset; + const struct imsm_orom *imsm_mem; + int len = (end - start); + + for (offset = 0; offset < len; offset += 4) { + imsm_mem = start + offset; + if (memcmp(imsm_mem->signature, "$VER", 4) == 0) { + imsm_orom = *imsm_mem; + return 1; + } + } + + return 0; +} + +const struct imsm_orom *find_imsm_orom(void) +{ + static int populated = 0; + unsigned long align; + + /* it's static data so we only need to read it once */ + if (populated) + return &imsm_orom; + + if (check_env("IMSM_TEST_OROM")) { + memset(&imsm_orom, 0, sizeof(imsm_orom)); + imsm_orom.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10 | IMSM_OROM_RLC_RAID5; + imsm_orom.sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB | + IMSM_OROM_SSS_16kB | IMSM_OROM_SSS_32kB | + IMSM_OROM_SSS_64kB | IMSM_OROM_SSS_128kB | + IMSM_OROM_SSS_256kB | IMSM_OROM_SSS_512kB | + IMSM_OROM_SSS_1MB | IMSM_OROM_SSS_2MB; + imsm_orom.dpa = 6; + imsm_orom.tds = 6; + imsm_orom.vpa = 2; + imsm_orom.vphba = 4; + imsm_orom.attr = imsm_orom.rlc | IMSM_OROM_ATTR_ChecksumVerify; + populated = 1; + return &imsm_orom; + } + + if (!platform_has_intel_ahci()) + return NULL; + + /* scan option-rom memory looking for an imsm signature */ + if (check_env("IMSM_SAFE_OROM_SCAN")) + align = 2048; + else + align = 512; + if (probe_roms_init(align) != 0) + return NULL; + probe_roms(); + populated = scan_adapter_roms(scan); + probe_roms_exit(); + + if (populated) + return &imsm_orom; + 
return NULL; +} + +char *devt_to_devpath(dev_t dev) +{ + char device[46]; + + sprintf(device, "/sys/dev/block/%d:%d/device", major(dev), minor(dev)); + return canonicalize_file_name(device); +} + +static char *diskfd_to_devpath(int fd) +{ + /* return the device path for a disk, return NULL on error or fd + * refers to a partition + */ + struct stat st; + + if (fstat(fd, &st) != 0) + return NULL; + if (!S_ISBLK(st.st_mode)) + return NULL; + + return devt_to_devpath(st.st_rdev); +} + +int path_attached_to_hba(const char *disk_path, const char *hba_path) +{ + int rc; + + if (!disk_path || !hba_path) + return 0; + + if (strncmp(disk_path, hba_path, strlen(hba_path)) == 0) + rc = 1; + else + rc = 0; + + return rc; +} + +int devt_attached_to_hba(dev_t dev, const char *hba_path) +{ + char *disk_path = devt_to_devpath(dev); + int rc = path_attached_to_hba(disk_path, hba_path); + + if (disk_path) + free(disk_path); + + return rc; +} + +int disk_attached_to_hba(int fd, const char *hba_path) +{ + char *disk_path = diskfd_to_devpath(fd); + int rc = path_attached_to_hba(disk_path, hba_path); + + if (disk_path) + free(disk_path); + + return rc; +} + diff -Nru mdadm-2.6.7.1/platform-intel.h mdadm-3.1.4/platform-intel.h --- mdadm-2.6.7.1/platform-intel.h 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/platform-intel.h 2010-08-26 05:24:15.000000000 +0300 @@ -0,0 +1,178 @@ +/* + * Intel(R) Matrix Storage Manager hardware and firmware support routines + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include +#include + +/* The IMSM OROM Version Table definition */ +struct imsm_orom { + __u8 signature[4]; + __u8 table_ver_major; /* Currently 2 (can change with future revs) */ + __u8 table_ver_minor; /* Currently 2 (can change with future revs) */ + __u16 major_ver; /* Example: 8 as in 8.6.0.1020 */ + __u16 minor_ver; /* Example: 6 as in 8.6.0.1020 */ + __u16 hotfix_ver; /* Example: 0 as in 8.6.0.1020 */ + __u16 build; /* Example: 1020 as in 8.6.0.1020 */ + __u8 len; /* number of bytes in this entire table */ + __u8 checksum; /* checksum of all the bytes in this table */ + __u16 rlc; /* RAID Level Capability */ + /* we assume the cpu is x86 as the orom should not be found + * anywhere else + */ + #define IMSM_OROM_RLC_RAID0 (1 << 0) + #define IMSM_OROM_RLC_RAID1 (1 << 1) + #define IMSM_OROM_RLC_RAID10 (1 << 2) + #define IMSM_OROM_RLC_RAID1E (1 << 3) + #define IMSM_OROM_RLC_RAID5 (1 << 4) + #define IMSM_OROM_RLC_RAID_CNG (1 << 5) + __u16 sss; /* Strip Size Supported */ + #define IMSM_OROM_SSS_2kB (1 << 0) + #define IMSM_OROM_SSS_4kB (1 << 1) + #define IMSM_OROM_SSS_8kB (1 << 2) + #define IMSM_OROM_SSS_16kB (1 << 3) + #define IMSM_OROM_SSS_32kB (1 << 4) + #define IMSM_OROM_SSS_64kB (1 << 5) + #define IMSM_OROM_SSS_128kB (1 << 6) + #define IMSM_OROM_SSS_256kB (1 << 7) + #define IMSM_OROM_SSS_512kB (1 << 8) + #define IMSM_OROM_SSS_1MB (1 << 9) + #define IMSM_OROM_SSS_2MB (1 << 10) + #define IMSM_OROM_SSS_4MB (1 << 11) + #define IMSM_OROM_SSS_8MB (1 << 12) + #define IMSM_OROM_SSS_16MB (1 << 13) + #define IMSM_OROM_SSS_32MB (1 << 14) + #define IMSM_OROM_SSS_64MB (1 << 15) + __u16 dpa; /* Disks Per Array supported */ + __u16 tds; /* Total Disks Supported */ + __u8 vpa; /* # Volumes Per Array supported */ + __u8 vphba; /* # Volumes Per Host Bus 
Adapter supported */ + /* Attributes supported. This should map to the + * attributes in the MPB. Also, lower 16 bits + * should match/duplicate RLC bits above. + */ + __u32 attr; + #define IMSM_OROM_ATTR_RAID0 IMSM_OROM_RLC_RAID0 + #define IMSM_OROM_ATTR_RAID1 IMSM_OROM_RLC_RAID1 + #define IMSM_OROM_ATTR_RAID10 IMSM_OROM_RLC_RAID10 + #define IMSM_OROM_ATTR_RAID1E IMSM_OROM_RLC_RAID1E + #define IMSM_OROM_ATTR_RAID5 IMSM_OROM_RLC_RAID5 + #define IMSM_OROM_ATTR_RAID_CNG IMSM_OROM_RLC_RAID_CNG + #define IMSM_OROM_ATTR_2TB (1 << 29) + #define IMSM_OROM_ATTR_PM (1 << 30) + #define IMSM_OROM_ATTR_ChecksumVerify (1 << 31) + __u32 reserved1; + __u32 reserved2; +} __attribute__((packed)); + +static inline int imsm_orom_has_raid0(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID0); +} +static inline int imsm_orom_has_raid1(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID1); +} +static inline int imsm_orom_has_raid1e(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID1E); +} +static inline int imsm_orom_has_raid10(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID10); +} +static inline int imsm_orom_has_raid5(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID5); +} + +/** + * imsm_orom_has_chunk - check if the orom supports the given chunk size + * @orom: orom pointer from find_imsm_orom + * @chunk: chunk size in kibibytes + */ +static inline int imsm_orom_has_chunk(const struct imsm_orom *orom, int chunk) +{ + int fs = ffs(chunk); + + if (!fs) + return 0; + fs--; /* bit num to bit index */ + return !!(orom->sss & (1 << (fs - 1))); +} + +/** + * fls - find last (most-significant) bit set + * @x: the word to search + * The funciton is borrowed from Linux kernel code + * include/asm-generic/bitops/fls.h + */ +static inline int fls(int x) +{ + int r = 32; + + if (!x) + return 0; + if (!(x & 0xffff0000u)) { + x <<= 16; + r -= 16; + } + if (!(x & 
0xff000000u)) { + x <<= 8; + r -= 8; + } + if (!(x & 0xf0000000u)) { + x <<= 4; + r -= 4; + } + if (!(x & 0xc0000000u)) { + x <<= 2; + r -= 2; + } + if (!(x & 0x80000000u)) { + x <<= 1; + r -= 1; + } + return r; +} + +/** + * imsm_orom_default_chunk - return the largest chunk size supported via orom + * @orom: orom pointer from find_imsm_orom + */ +static inline int imsm_orom_default_chunk(const struct imsm_orom *orom) +{ + int fs = fls(orom->sss); + + if (!fs) + return 0; + + return min(512, (1 << fs)); +} + +struct sys_dev { + char *path; + struct sys_dev *next; +}; + +struct sys_dev *find_driver_devices(const char *bus, const char *driver); +__u16 devpath_to_vendor(const char *dev_path); +void free_sys_dev(struct sys_dev **list); +const struct imsm_orom *find_imsm_orom(void); +int disk_attached_to_hba(int fd, const char *hba_path); +char *devt_to_devpath(dev_t dev); +int path_attached_to_hba(const char *disk_path, const char *hba_path); diff -Nru mdadm-2.6.7.1/probe_roms.c mdadm-3.1.4/probe_roms.c --- mdadm-2.6.7.1/probe_roms.c 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/probe_roms.c 2010-08-26 05:24:15.000000000 +0300 @@ -0,0 +1,297 @@ +/* + * probe_roms - scan for Adapter ROMS + * + * (based on linux-2.6:arch/x86/kernel/probe_roms_32.c) + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include "probe_roms.h" +#include +#include +#include +#include +#include +#include +#include + +static void *rom_mem = MAP_FAILED; +static int rom_fd = -1; +static const int rom_len = 0xf0000 - 0xc0000; /* option-rom memory region */ +static int _sigbus; +static unsigned long rom_align; + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +static void sigbus(int sig) +{ + _sigbus = 1; +} + +static int probe_address8(const __u8 *ptr, __u8 *val) +{ + int rc = 0; + + *val = *ptr; + if (_sigbus) + rc = -1; + _sigbus = 0; + + return rc; +} + +static int probe_address16(const __u16 *ptr, __u16 *val) +{ + int rc = 0; + + *val = *ptr; + if (_sigbus) + rc = -1; + _sigbus = 0; + + return rc; +} + +void probe_roms_exit(void) +{ + signal(SIGBUS, SIG_DFL); + if (rom_fd >= 0) { + close(rom_fd); + rom_fd = -1; + } + if (rom_mem != MAP_FAILED) { + munmap(rom_mem, rom_len); + rom_mem = MAP_FAILED; + } +} + +int probe_roms_init(unsigned long align) +{ + int fd = -1; + int rc = 0; + + /* valid values are 2048 and 512. 512 is for PCI-3.0 compliant + * systems, or systems that do not have dangerous/legacy ISA + * devices. 
2048 should always be safe + */ + if (align == 512 || align == 2048) + rom_align = align; + else + return -1; + + if (signal(SIGBUS, sigbus) == SIG_ERR) + rc = -1; + if (rc == 0) { + fd = open("/dev/mem", O_RDONLY); + if (fd < 0) + rc = -1; + } + if (rc == 0) { + rom_mem = mmap(NULL, rom_len, PROT_READ, MAP_PRIVATE, fd, 0xc0000); + if (rom_mem == MAP_FAILED) + rc = -1; + } + + if (rc == 0) + rom_fd = fd; + else { + if (fd >= 0) + close(fd); + probe_roms_exit(); + } + return rc; +} + +/** + * isa_bus_to_virt - convert physical address to mmap'd region + * @addr - address to convert + * + * Only valid between a successful call to probe_roms_init and the + * corresponding probe_roms_exit + */ +static void *isa_bus_to_virt(unsigned long addr) +{ + return rom_mem + (addr - 0xc0000); +} + +struct resource { + unsigned long start; + unsigned long end; + const char *name; +}; + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, +}; + +#define ROMSIGNATURE 0xaa55 + +static int romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig = 0; + + return probe_address16(ptr, &sig) == 0 && sig == ROMSIGNATURE; +} + +static int romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum 
= 0; length && probe_address8(rom++, &c) == 0; length--) + sum += c; + return !length && !sum; +} + +int scan_adapter_roms(scan_fn fn) +{ + /* let scan_fn examing each of the adapter roms found by probe_roms */ + unsigned int i; + int found; + + if (rom_fd < 0) + return 0; + + found = 0; + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { + struct resource *res = &adapter_rom_resources[i]; + + if (res->start) { + found = fn(isa_bus_to_virt(res->start), + isa_bus_to_virt(res->end)); + if (found) + break; + } else + break; + } + + return found; +} + +static unsigned long align(unsigned long addr, unsigned long alignment) +{ + return (addr + alignment - 1) & ~(alignment - 1); +} + +void probe_roms(void) +{ + const void *rom; + unsigned long start, length, upper; + unsigned char c; + unsigned int i; + + if (rom_fd < 0) + return; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += rom_align) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (probe_address8(rom + 2, &c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + break; + } + + start = align(video_rom_resource.end + 1, rom_align); + if (start < upper) + start = upper; + + /* system rom */ + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) 
*/ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) + upper = extension_rom_resource.start; + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += rom_align) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (probe_address8(rom + 2, &c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + + start = adapter_rom_resources[i++].end & ~(rom_align - 1); + } +} + diff -Nru mdadm-2.6.7.1/probe_roms.h mdadm-3.1.4/probe_roms.h --- mdadm-2.6.7.1/probe_roms.h 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/probe_roms.h 2010-08-05 09:51:58.000000000 +0300 @@ -0,0 +1,24 @@ +/* + * probe_roms - scan for Adapter ROMS + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +void probe_roms_exit(void); +int probe_roms_init(unsigned long align); +typedef int (*scan_fn)(const void *start, const void *end); +int scan_adapter_roms(scan_fn fn); +void probe_roms(void); diff -Nru mdadm-2.6.7.1/Query.c mdadm-3.1.4/Query.c --- mdadm-2.6.7.1/Query.c 2008-10-15 06:34:28.000000000 +0300 +++ mdadm-3.1.4/Query.c 2010-08-05 09:51:58.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2002-2006 Neil Brown + * Copyright (C) 2002-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,12 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" @@ -37,7 +32,7 @@ * whether it is an md device and whether it has * a superblock */ - int fd = open(dev, O_RDONLY, 0); + int fd = open(dev, O_RDONLY); int vers; int ioctlerr; int superror, superrno; @@ -96,7 +91,7 @@ if (superror == 0) { /* array might be active... 
*/ st->ss->getinfo_super(st, &info); - if (st->ss->major == 0) { + if (st->ss == &super0) { mddev = get_md_name(info.array.md_minor); disc.number = info.disk.number; activity = "undetected"; @@ -121,7 +116,7 @@ activity, map_num(pers, info.array.level), mddev); - if (st->ss->major == 0) + if (st->ss == &super0) put_md_name(mddev); } return 0; diff -Nru mdadm-2.6.7.1/RAID5_versus_RAID10.txt mdadm-3.1.4/RAID5_versus_RAID10.txt --- mdadm-2.6.7.1/RAID5_versus_RAID10.txt 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/RAID5_versus_RAID10.txt 1970-01-01 02:00:00.000000000 +0200 @@ -1,177 +0,0 @@ -# from http://www.miracleas.com/BAARF/RAID5_versus_RAID10.txt -# also see http://www.miracleas.com/BAARF/BAARF2.html -# -# Note: I, the Debian maintainer, do not agree with some of the arguments, -# especially not with the total condemning of RAID5. Anyone who talks about -# data loss and blames the RAID system should spend time reading up on Backups -# instead of trying to evangelise, but that's only my opinion. RAID5 has its -# merits and its shortcomings, just like any other method. However, the author -# of this argument puts forth a good case and thus I am including the -# document. Remember that you're the only one that can decide which RAID level -# to use. -# - -RAID5 versus RAID10 (or even RAID3 or RAID4) - -First let's get on the same page so we're all talking about apples. - -What is RAID5? - -OK here is the deal, RAID5 uses ONLY ONE parity drive per stripe and many -RAID5 arrays are 5 (if your counts are different adjust the calculations -appropriately) drives (4 data and 1 parity though it is not a single drive -that is holding all of the parity as in RAID 3 & 4 but read on). If you -have 10 drives or say 20GB each for 200GB RAID5 will use 20% for parity -(assuming you set it up as two 5 drive arrays) so you will have 160GB of -storage. 
Now since RAID10, like mirroring (RAID1), uses 1 (or more) mirror -drive for each primary drive you are using 50% for redundancy so to get the -same 160GB of storage you will need 8 pairs or 16 - 20GB drives, which is -why RAID5 is so popular. This intro is just to put things into -perspective. - -RAID5 is physically a stripe set like RAID0 but with data recovery -included. RAID5 reserves one disk block out of each stripe block for -parity data. The parity block contains an error correction code which can -correct any error in the RAID5 block, in effect it is used in combination -with the remaining data blocks to recreate any single missing block, gone -missing because a drive has failed. The innovation of RAID5 over RAID3 & -RAID4 is that the parity is distributed on a round robin basis so that -there can be independent reading of different blocks from the several -drives. This is why RAID5 became more popular than RAID3 & RAID4 which -must sychronously read the same block from all drives together. So, if -Drive2 fails blocks 1,2,4,5,6 & 7 are data blocks on this drive and blocks -3 and 8 are parity blocks on this drive. So that means that the parity on -Drive5 will be used to recreate the data block from Disk2 if block 1 is -requested before a new drive replaces Drive2 or during the rebuilding of -the new Drive2 replacement. Likewise the parity on Drive1 will be used to -repair block 2 and the parity on Drive3 will repair block4, etc. For block -2 all the data is safely on the remaining drives but during the rebuilding -of Drive2's replacement a new parity block will be calculated from the -block 2 data and will be written to Drive 2. - -Now when a disk block is read from the array the RAID software/firmware -calculates which RAID block contains the disk block, which drive the disk -block is on and which drive contains the parity block for that RAID block -and reads ONLY the one data drive. It returns the data block. 
If you -later modify the data block it recalculates the parity by subtracting the -old block and adding in the new version then in two separate operations it -writes the data block followed by the new parity block. To do this it must -first read the parity block from whichever drive contains the parity for -that stripe block and reread the unmodified data for the updated block from -the original drive. This read-read-write-write is known as the RAID5 write -penalty since these two writes are sequential and synchronous the write -system call cannot return until the reread and both writes complete, for -safety, so writing to RAID5 is up to 50% slower than RAID0 for an array of -the same capacity. (Some software RAID5's avoid the re-read by keeping an -unmodified copy of the orginal block in memory.) - -Now what is RAID10: - -RAID10 is one of the combinations of RAID1 (mirroring) and RAID0 -(striping) which are possible. There used to be confusion about what -RAID01 or RAID01 meant and different RAID vendors defined them -differently. About five years or so ago I proposed the following standard -language which seems to have taken hold. When N mirrored pairs are -striped together this is called RAID10 because the mirroring (RAID1) is -applied before striping (RAID0). The other option is to create two stripe -sets and mirror them one to the other, this is known as RAID01 (because -the RAID0 is applied first). In either a RAID01 or RAID10 system each and -every disk block is completely duplicated on its drive's mirror. -Performance-wise both RAID01 and RAID10 are functionally equivalent. The -difference comes in during recovery where RAID01 suffers from some of the -same problems I will describe affecting RAID5 while RAID10 does not. 
- -Now if a drive in the RAID5 array dies, is removed, or is shut off data is -returned by reading the blocks from the remaining drives and calculating -the missing data using the parity, assuming the defunct drive is not the -parity block drive for that RAID block. Note that it takes 4 physical -reads to replace the missing disk block (for a 5 drive array) for four out -of every five disk blocks leading to a 64% performance degradation until -the problem is discovered and a new drive can be mapped in to begin -recovery. Performance is degraded further during recovery because all -drives are being actively accessed in order to rebuild the replacement -drive (see below). - -If a drive in the RAID10 array dies data is returned from its mirror drive -in a single read with only minor (6.25% on average for a 4 pair array as a -whole) performance reduction when two non-contiguous blocks are needed from -the damaged pair (since the two blocks cannot be read in parallel from both -drives) and none otherwise. - -One begins to get an inkling of what is going on and why I dislike RAID5, -but, as they say on late night info-mercials, there's more. - -What's wrong besides a bit of performance I don't know I'm missing? - -OK, so that brings us to the final question of the day which is: What is -the problem with RAID5? It does recover a failed drive right? So writes -are slower, I don't do enough writing to worry about it and the cache -helps a lot also, I've got LOTS of cache! The problem is that despite the -improved reliability of modern drives and the improved error correction -codes on most drives, and even despite the additional 8 bytes of error -correction that EMC puts on every Clariion drive disk block (if you are -lucky enough to use EMC systems), it is more than a little possible that a -drive will become flaky and begin to return garbage. This is known as -partial media failure. 
Now SCSI controllers reserve several hundred disk -blocks to be remapped to replace fading sectors with unused ones, but if -the drive is going these will not last very long and will run out and SCSI -does NOT report correctable errors back to the OS! Therefore you will not -know the drive is becoming unstable until it is too late and there are no -more replacement sectors and the drive begins to return garbage. [Note -that the recently popular IDE/ATA drives do not (TMK) include bad sector -remapping in their hardware so garbage is returned that much sooner.] -When a drive returns garbage, since RAID5 does not EVER check parity on -read (RAID3 & RAID4 do BTW and both perform better for databases than -RAID5 to boot) when you write the garbage sector back garbage parity will -be calculated and your RAID5 integrity is lost! Similarly if a drive -fails and one of the remaining drives is flaky the replacement will be -rebuilt with garbage also propagating the problem to two blocks instead of -just one. - -Need more? During recovery, read performance for a RAID5 array is -degraded by as much as 80%. Some advanced arrays let you configure the -preference more toward recovery or toward performance. However, doing so -will increase recovery time and increase the likelihood of losing a second -drive in the array before recovery completes resulting in catastrophic -data loss. RAID10 on the other hand will only be recovering one drive out -of 4 or more pairs with performance ONLY of reads from the recovering pair -degraded making the performance hit to the array overall only about 20%! -Plus there is no parity calculation time used during recovery - it's a -straight data copy. - -What about that thing about losing a second drive? Well with RAID10 there -is no danger unless the one mirror that is recovering also fails and -that's 80% or more less likely than that any other drive in a RAID5 array -will fail! 
And since most multiple drive failures are caused by -undetected manufacturing defects you can make even this possibility -vanishingly small by making sure to mirror every drive with one from a -different manufacturer's lot number. ("Oh", you say, "this schenario does -not seem likely!" Pooh, we lost 50 drives over two weeks when a batch of -200 IBM drives began to fail. IBM discovered that the single lot of -drives would have their spindle bearings freeze after so many hours of -operation. Fortunately due in part to RAID10 and in part to a herculean -effort by DG techs and our own people over 2 weeks no data was lost. -HOWEVER, one RAID5 filesystem was a total loss after a second drive failed -during recover. Fortunately everything was on tape. - -Conclusion? For safety and performance favor RAID10 first, RAID3 second, -RAID4 third, and RAID5 last! The original reason for the RAID2-5 specs -was that the high cost of disks was making RAID1, mirroring, impractical. -That is no longer the case! Drives are commodity priced, even the biggest -fastest drives are cheaper in absolute dollars than drives were then and -cost per MB is a tiny fraction of what it was. Does RAID5 make ANY sense -anymore? Obviously I think not. - -To put things into perspective: If a drive costs $1000US (and most are far -less expensive than that) then switching from a 4 pair RAID10 array to a 5 -drive RAID5 array will save 3 drives or $3000US. What is the cost of -overtime, wear and tear on the technicians, DBAs, managers, and customers -of even a recovery scare? What is the cost of reduced performance and -possibly reduced customer satisfaction? Finally what is the cost of lost -business if data is unrecoverable? I maintain that the drives are FAR -cheaper! Hence my mantra: - -NO RAID5! NO RAID5! NO RAID5! NO RAID5! NO RAID5! NO RAID5! NO RAID5! - -Art S. 
Kagel - diff -Nru mdadm-2.6.7.1/ReadMe.c mdadm-3.1.4/ReadMe.c --- mdadm-2.6.7.1/ReadMe.c 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/ReadMe.c 2010-08-31 10:21:13.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2007 Neil Brown + * Copyright (C) 2001-2010 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -24,7 +24,7 @@ #include "mdadm.h" -char Version[] = Name " - v2.6.7.1 - 15th October 2008\n"; +char Version[] = Name " - v3.1.4 - 31st August 2010\n"; /* * File: ReadMe.c @@ -86,11 +86,11 @@ * At the time if writing, there is only minimal support. */ -char short_options[]="-ABCDEFGIQhVXWvqbc:i:l:p:m:n:x:u:c:d:z:U:sarfRSow1tye:"; +char short_options[]="-ABCDEFGIQhVXWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; char short_bitmap_options[]= - "-ABCDEFGIQhVXWvqb:c:i:l:p:m:n:x:u:c:d:z:U:sarfRSow1tye:"; + "-ABCDEFGIQhVXWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; char short_bitmap_auto_options[]= - "-ABCDEFGIQhVXWvqb:c:i:l:p:m:n:x:u:c:d:z:U:sa:rfRSow1tye:"; + "-ABCDEFGIQhVXWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:"; struct option long_options[] = { {"manage", 0, 0, '@'}, @@ -107,6 +107,9 @@ {"query", 0, 0, 'Q'}, {"examine-bitmap", 0, 0, 'X'}, {"auto-detect", 0, 0, AutoDetect}, + {"detail-platform", 0, 0, DetailPlatform}, + {"kill-subarray", 1, 0, KillSubarray}, + {"update-subarray", 1, 0, UpdateSubarray}, /* synonyms */ {"monitor", 0, 0, 'F'}, @@ -138,7 +141,9 @@ {"write-mostly",0, 0, 'W'}, {"re-add", 0, 0, ReAdd}, {"homehost", 1, 0, HomeHost}, +#if 0 {"auto-update-homehost", 0, 0, AutoHomeHost}, +#endif {"symlinks", 1, 0, Symlinks}, /* For assemble */ @@ -161,6 +166,7 @@ {"readwrite", 0, 0, 'w'}, {"no-degraded",0,0, NoDegraded }, {"wait", 0, 0, 'W'}, + {"wait-clean", 0, 0, Waitclean }, /* For Detail/Examine */ {"brief", 0, 0, 'b'}, @@ -172,6 +178,7 @@ {"mail", 1, 0, 'm'}, {"program", 1, 0, 'p'}, {"alert", 1, 0, 'p'}, + {"increment", 1, 0, 'r'}, 
{"delay", 1, 0, 'd'}, {"daemonise", 0, 0, 'f'}, {"daemonize", 0, 0, 'f'}, @@ -180,6 +187,7 @@ {"syslog", 0, 0, 'y'}, /* For Grow */ {"backup-file", 1,0, BackupFile}, + {"array-size", 1, 0, 'Z'}, /* For Incremental */ {"rebuild-map", 0, 0, 'r'}, @@ -207,7 +215,7 @@ " mdadm --grow options device\n" " resize/reshape an active array\n" " mdadm --incremental device\n" -" add a device to an array as appropriate\n" +" add/remove a device to/from an array as appropriate\n" " mdadm --monitor options...\n" " Monitor one or more array for significant changes.\n" " mdadm device options...\n" @@ -250,7 +258,7 @@ " --examine-bitmap -X: Display the detail of a bitmap file\n" " --monitor -F : monitor (follow) some arrays\n" " --grow -G : resize/ reshape and array\n" -" --incremental -I : add a single device to an array as appropriate\n" +" --incremental -I : add/remove a single device to/from an array as appropriate\n" " --query -Q : Display general information about how a\n" " device relates to the md driver\n" " --auto-detect : Start arrays auto-detected by the kernel\n" @@ -269,7 +277,6 @@ " --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n" " --force -f : Honour devices as listed on command line. Don't\n" " : insert a missing drive for RAID5.\n" -" --auto(=p) -a : Automatically allocate new (partitioned) md array if needed.\n" " --assume-clean : Assume the array is already in-sync. This is dangerous.\n" " --bitmap-chunk= : chunksize of bitmap in bitmap file (Kilobytes)\n" " --delay= -d : seconds between bitmap updates\n" @@ -287,7 +294,6 @@ " --scan -s : scan config file for missing information\n" " --force -f : Assemble the array even if some superblocks appear out-of-date\n" " --update= -U : Update superblock: try '-A --update=?' 
for list of options.\n" -" --auto(=p) -a : Automatically allocate new (partitioned) md array if needed.\n" " --no-degraded : Do not start any degraded arrays - default unless --scan.\n" "\n" " For detail or examine:\n" @@ -360,7 +366,7 @@ "\n" " This usage is similar to --create. The difference is that it creates\n" " a legacy array without a superblock. With these arrays there is no\n" -" difference between initially creating the array and subsequently\n" +" different between initially creating the array and subsequently\n" " assembling the array, except that hopefully there is useful data\n" " there in the second case.\n" "\n" @@ -465,6 +471,7 @@ " --query -Q : Display general information about how a\n" " device relates to the md driver\n" " --detail -D : Display details of an array\n" +" --detail-platform : Display hardware/firmware details\n" " --examine -E : Examine superblock on an array component\n" " --examine-bitmap -X: Display contents of a bitmap file\n" " --zero-superblock : erase the MD superblock from a device.\n" @@ -492,6 +499,7 @@ " --mail= -m : Address to mail alerts of failure to\n" " --program= -p : Program to run when an event is detected\n" " --alert= : same as --program\n" +" --increment= -r : Report RebuildNN events in the given increment. default=20\n" " --delay= -d : seconds of delay between polling state. default=60\n" " --config= -c : specify a different config file\n" " --scan -s : find mail-address/program in config file\n" @@ -517,31 +525,42 @@ " --layout= -p : For a FAULTY array, set/change the error mode.\n" " --size= -z : Change the active size of devices in an array.\n" " : This is useful if all devices have been replaced\n" -" : with larger devices.\n" -" --raid-disks= -n : Change the number of active devices in an array.\n" -" : array.\n" +" : with larger devices. 
Value is in Kilobytes, or\n" +" : the special word 'max' meaning 'as large as possible'.\n" +" --raid-devices= -n : Change the number of active devices in an array.\n" " --bitmap= -b : Add or remove a write-intent bitmap.\n" +" --backup-file= file : A file on a differt device to store data for a\n" +" : short time while increasing raid-devices on a\n" +" : RAID4/5/6 array. Not needed when a spare is present.\n" +" --array-size= -Z : Change visible size of array. This does not change\n" +" : any data on the device, and is not stable across restarts.\n" ; char Help_incr[] = -"Usage: mdadm --incremental [-Rqrs] device\n" +"Usage: mdadm --incremental [-Rqrsf] device\n" "\n" "This usage allows for incremental assembly of md arrays. Devices can be\n" "added one at a time as they are discovered. Once an array has all expected\n" "devices, it will be started.\n" "\n" -"Options that are valid with incremental assembly (-I --incremental) more are:\n" -" --run -R : run arrays as soon as a minimal number of devices are\n" -" : present rather than waiting for all expected.\n" -" --quiet -q : Don't print any information messages, just errors.\n" -" --rebuild -r : Rebuild the 'map' file that mdadm uses for tracking\n" -" : partial arrays.\n" -" --scan -s : Use with -R to start any arrays that have the minimal\n" -" : required number of devices, but are not yet started.\n" +"Optionally, the process can be reversed by using the fail option.\n" +"When fail mode is invoked, mdadm will see if the device belongs to an array\n" +"and then both fail (if needed) and remove the device from that array.\n" +"\n" +"Options that are valid with incremental assembly (-I --incremental) are:\n" +" --run -R : Run arrays as soon as a minimal number of devices are\n" +" : present rather than waiting for all expected.\n" +" --quiet -q : Don't print any information messages, just errors.\n" +" --rebuild-map -r : Rebuild the 'map' file that mdadm uses for tracking\n" +" : partial arrays.\n" +" --scan -s 
: Use with -R to start any arrays that have the minimal\n" +" : required number of devices, but are not yet started.\n" +" --fail -f : First fail (if needed) and then remove device from\n" +" : any array that it is a member of.\n" ; char Help_config[] = -"The /etc/mdadm/mdadm.conf config file:\n\n" +"The /etc/mdadm.conf config file:\n\n" " The config file contains, apart from blank lines and comment lines that\n" " start with a hash(#), four sorts of configuration lines: array lines, \n" " device lines, mailaddr lines and program lines.\n" @@ -578,16 +597,49 @@ /* name/number mappings */ mapping_t r5layout[] = { - { "left-asymmetric", 0}, - { "right-asymmetric", 1}, - { "left-symmetric", 2}, - { "right-symmetric", 3}, - - { "default", 2}, - { "la", 0}, - { "ra", 1}, - { "ls", 2}, - { "rs", 3}, + { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC}, + { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC}, + { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC}, + { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC}, + + { "default", ALGORITHM_LEFT_SYMMETRIC}, + { "la", ALGORITHM_LEFT_ASYMMETRIC}, + { "ra", ALGORITHM_RIGHT_ASYMMETRIC}, + { "ls", ALGORITHM_LEFT_SYMMETRIC}, + { "rs", ALGORITHM_RIGHT_SYMMETRIC}, + + { "parity-first", ALGORITHM_PARITY_0}, + { "parity-last", ALGORITHM_PARITY_N}, + { "ddf-zero-restart", ALGORITHM_RIGHT_ASYMMETRIC}, + { "ddf-N-restart", ALGORITHM_LEFT_ASYMMETRIC}, + { "ddf-N-continue", ALGORITHM_LEFT_SYMMETRIC}, + + { NULL, 0} +}; +mapping_t r6layout[] = { + { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC}, + { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC}, + { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC}, + { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC}, + + { "default", ALGORITHM_LEFT_SYMMETRIC}, + { "la", ALGORITHM_LEFT_ASYMMETRIC}, + { "ra", ALGORITHM_RIGHT_ASYMMETRIC}, + { "ls", ALGORITHM_LEFT_SYMMETRIC}, + { "rs", ALGORITHM_RIGHT_SYMMETRIC}, + + { "parity-first", ALGORITHM_PARITY_0}, + { "parity-last", ALGORITHM_PARITY_N}, + { "ddf-zero-restart", 
ALGORITHM_ROTATING_ZERO_RESTART}, + { "ddf-N-restart", ALGORITHM_ROTATING_N_RESTART}, + { "ddf-N-continue", ALGORITHM_ROTATING_N_CONTINUE}, + + { "left-asymmetric-6", ALGORITHM_LEFT_ASYMMETRIC_6}, + { "right-asymmetric-6", ALGORITHM_RIGHT_ASYMMETRIC_6}, + { "left-symmetric-6", ALGORITHM_LEFT_SYMMETRIC_6}, + { "right-symmetric-6", ALGORITHM_RIGHT_SYMMETRIC_6}, + { "parity-first-6", ALGORITHM_PARITY_0_6}, + { NULL, 0} }; @@ -610,6 +662,7 @@ { "raid10", 10}, { "10", 10}, { "faulty", LEVEL_FAULTY}, + { "container", LEVEL_CONTAINER}, { NULL, 0} }; diff -Nru mdadm-2.6.7.1/restripe.c mdadm-3.1.4/restripe.c --- mdadm-2.6.7.1/restripe.c 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/restripe.c 2010-08-26 05:24:16.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2006 Neil Brown + * Copyright (C) 2006-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -23,14 +23,18 @@ */ #include "mdadm.h" +#include /* To restripe, we read from old geometry to a buffer, and * read from buffer to new geometry. - * When reading we don't worry about parity. When writing we do. + * When reading, we might have missing devices and so could need + * to reconstruct. + * When writing, we need to create correct parity and Q. * */ -static int geo_map(int block, unsigned long long stripe, int raid_disks, int level, int layout) +static int geo_map(int block, unsigned long long stripe, int raid_disks, + int level, int layout) { /* On the given stripe, find which disk in the array will have * block numbered 'block'. 
@@ -42,6 +46,7 @@ switch(level*100 + layout) { case 000: case 400: + case 500 + ALGORITHM_PARITY_N: /* raid 4 isn't messed around by parity blocks */ if (block == -1) return raid_disks-1; /* parity block */ @@ -70,6 +75,65 @@ if (block == -1) return pd; return (pd + 1 + block) % raid_disks; + case 500 + ALGORITHM_PARITY_0: + return block + 1; + + + case 600 + ALGORITHM_PARITY_N_6: + if (block == -2) + return raid_disks - 1; + if (block == -1) + return raid_disks - 2; /* parity block */ + return block; + case 600 + ALGORITHM_LEFT_ASYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = (raid_disks-1) - stripe % raid_disks; + if (block == -1) return pd; + if (block >= pd) + block++; + return block; + + case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = stripe % raid_disks; + if (block == -1) return pd; + if (block >= pd) + block++; + return block; + + case 600 + ALGORITHM_LEFT_SYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = (raid_disks - 1) - stripe % raid_disks; + if (block == -1) return pd; + return (pd + 1 + block) % raid_disks; + + case 600 + ALGORITHM_RIGHT_SYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = stripe % raid_disks; + if (block == -1) return pd; + return (pd + 1 + block) % raid_disks; + + case 600 + ALGORITHM_PARITY_0_6: + if (block == -2) + return raid_disks - 1; + return block + 1; + + + case 600 + ALGORITHM_PARITY_0: + if (block == -1) + return 0; + if (block == -2) + return 1; + return block + 2; + case 600 + ALGORITHM_LEFT_ASYMMETRIC: pd = raid_disks - 1 - (stripe % raid_disks); if (block == -1) return pd; @@ -80,6 +144,8 @@ return block+2; return block; + case 600 + ALGORITHM_ROTATING_ZERO_RESTART: + /* Different order for calculating Q, otherwize same as ... 
*/ case 600 + ALGORITHM_RIGHT_ASYMMETRIC: pd = stripe % raid_disks; if (block == -1) return pd; @@ -101,9 +167,43 @@ if (block == -1) return pd; if (block == -2) return (pd+1) % raid_disks; return (pd + 2 + block) % raid_disks; + + + case 600 + ALGORITHM_ROTATING_N_RESTART: + /* Same a left_asymmetric, by first stripe is + * D D D P Q rather than + * Q D D D P + */ + pd = raid_disks - 1 - ((stripe + 1) % raid_disks); + if (block == -1) return pd; + if (block == -2) return (pd+1) % raid_disks; + if (pd == raid_disks - 1) + return block+1; + if (block >= pd) + return block+2; + return block; + + case 600 + ALGORITHM_ROTATING_N_CONTINUE: + /* Same as left_symmetric but Q is before P */ + pd = raid_disks - 1 - (stripe % raid_disks); + if (block == -1) return pd; + if (block == -2) return (pd+raid_disks-1) % raid_disks; + return (pd + 1 + block) % raid_disks; } return -1; } +static int is_ddf(int layout) +{ + switch (layout) + { + default: + return 0; + case ALGORITHM_ROTATING_N_CONTINUE: + case ALGORITHM_ROTATING_N_RESTART: + case ALGORITHM_ROTATING_ZERO_RESTART: + return 1; + } +} static void xor_blocks(char *target, char **sources, int disks, int size) @@ -118,10 +218,10 @@ } } -static void qsyndrome(char *p, char *q, char **sources, int disks, int size) +static void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size) { int d, z; - char wq0, wp0, wd0, w10, w20; + uint8_t wq0, wp0, wd0, w10, w20; for ( d = 0; d < size; d++) { wq0 = wp0 = sources[disks-1][d]; for ( z = disks-2 ; z >= 0 ; z-- ) { @@ -138,49 +238,306 @@ } } + +/* + * The following was taken from linux/drivers/md/mktables.c, and modified + * to create in-memory tables rather than C code + */ +static uint8_t gfmul(uint8_t a, uint8_t b) +{ + uint8_t v = 0; + + while (b) { + if (b & 1) + v ^= a; + a = (a << 1) ^ (a & 0x80 ? 
0x1d : 0); + b >>= 1; + } + + return v; +} + +static uint8_t gfpow(uint8_t a, int b) +{ + uint8_t v = 1; + + b %= 255; + if (b < 0) + b += 255; + + while (b) { + if (b & 1) + v = gfmul(v, a); + a = gfmul(a, a); + b >>= 1; + } + + return v; +} + +int tables_ready = 0; +uint8_t raid6_gfmul[256][256]; +uint8_t raid6_gfexp[256]; +uint8_t raid6_gfinv[256]; +uint8_t raid6_gfexi[256]; +void make_tables(void) +{ + int i, j; + uint8_t v; + + /* Compute multiplication table */ + for (i = 0; i < 256; i++) + for (j = 0; j < 256; j++) + raid6_gfmul[i][j] = gfmul(i, j); + + /* Compute power-of-2 table (exponent) */ + v = 1; + for (i = 0; i < 256; i++) { + raid6_gfexp[i] = v; + v = gfmul(v, 2); + if (v == 1) + v = 0; /* For entry 255, not a real entry */ + } + + /* Compute inverse table x^-1 == x^254 */ + for (i = 0; i < 256; i++) + raid6_gfinv[i] = gfpow(i, 254); + + /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ + for (i = 0; i < 256; i ++) + raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1]; + + tables_ready = 1; +} + +uint8_t *zero; +/* Following was taken from linux/drivers/md/raid6recov.c */ + +/* Recover two failed data blocks. */ +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + uint8_t **ptrs) +{ + uint8_t *p, *q, *dp, *dq; + uint8_t px, qx, db; + const uint8_t *pbmul; /* P multiplier table for B data */ + const uint8_t *qmul; /* Q multiplier table (for both) */ + + p = ptrs[disks-2]; + q = ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = ptrs[faila]; + ptrs[faila] = zero; + dq = ptrs[failb]; + ptrs[failb] = zero; + + qsyndrome(dp, dq, ptrs, disks-2, bytes); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... 
*/ + while ( bytes-- ) { + px = *p ^ *dp; + qx = qmul[*q ^ *dq]; + *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ + *dp++ = db ^ px; /* Reconstructed A */ + p++; q++; + } +} + +/* Recover failure of one data block plus the P block */ +void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs) +{ + uint8_t *p, *q, *dq; + const uint8_t *qmul; /* Q multiplier table */ + + p = ptrs[disks-2]; + q = ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = ptrs[faila]; + ptrs[faila] = zero; + + qsyndrome(p, dq, ptrs, disks-2, bytes); + + /* Restore pointer table */ + ptrs[faila] = dq; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while ( bytes-- ) { + *p++ ^= *dq = qmul[*q ^ *dq]; + q++; dq++; + } +} + /* Save data: * We are given: - * A list of 'fds' of the active disks. For now we require all to be present. + * A list of 'fds' of the active disks. Some may be absent. * A geometry: raid_disks, chunk_size, level, layout * A list of 'fds' for mirrored targets. They are already seeked to * right (Write) location - * A start and length + * A start and length which must be stripe-aligned + * 'buf' is large enough to hold one stripe, and is aligned */ int save_stripes(int *source, unsigned long long *offsets, int raid_disks, int chunk_size, int level, int layout, int nwrites, int *dest, - unsigned long long start, unsigned long long length) + unsigned long long start, unsigned long long length, + char *buf) { - char buf[8192]; - int cpos = start % chunk_size; /* where in chunk we are up to */ int len; int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 
1 : 2); int disk; + int i; + if (!tables_ready) + make_tables(); + + if (zero == NULL) { + zero = malloc(chunk_size); + memset(zero, 0, chunk_size); + } + + len = data_disks * chunk_size; while (length > 0) { - unsigned long long offset; - int i; - len = chunk_size - cpos; - if (len > sizeof(buf)) len = sizeof(buf); - if (len > length) len = length; - /* len bytes to be moved from one device */ - - offset = (start/chunk_size/data_disks)*chunk_size + cpos; - disk = start/chunk_size % data_disks; - disk = geo_map(disk, start/chunk_size/data_disks, - raid_disks, level, layout); - if (lseek64(source[disk], offsets[disk]+offset, 0) < 0) - return -1; - if (read(source[disk], buf, len) != len) + int failed = 0; + int fdisk[3], fblock[3]; + for (disk = 0; disk < raid_disks ; disk++) { + unsigned long long offset; + int dnum; + + offset = (start/chunk_size/data_disks)*chunk_size; + dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1, + start/chunk_size/data_disks, + raid_disks, level, layout); + if (dnum < 0) abort(); + if (source[dnum] < 0 || + lseek64(source[dnum], offsets[dnum]+offset, 0) < 0 || + read(source[dnum], buf+disk * chunk_size, chunk_size) + != chunk_size) + if (failed <= 2) { + fdisk[failed] = dnum; + fblock[failed] = disk; + failed++; + } + } + if (failed == 0 || fblock[0] >= data_disks) + /* all data disks are good */ + ; + else if (failed == 1 || fblock[1] >= data_disks+1) { + /* one failed data disk and good parity */ + char *bufs[data_disks]; + for (i=0; i < data_disks; i++) + if (fblock[0] == i) + bufs[i] = buf + data_disks*chunk_size; + else + bufs[i] = buf + i*chunk_size; + + xor_blocks(buf + fblock[0]*chunk_size, + bufs, data_disks, chunk_size); + } else if (failed > 2 || level != 6) + /* too much failure */ return -1; + else { + /* RAID6 computations needed. 
*/ + uint8_t *bufs[data_disks+4]; + int qdisk; + int syndrome_disks; + disk = geo_map(-1, start/chunk_size/data_disks, + raid_disks, level, layout); + qdisk = geo_map(-2, start/chunk_size/data_disks, + raid_disks, level, layout); + if (is_ddf(layout)) { + /* q over 'raid_disks' blocks, in device order. + * 'p' and 'q' get to be all zero + */ + for (i = 0; i < raid_disks; i++) + bufs[i] = zero; + for (i = 0; i < data_disks; i++) { + int dnum = geo_map(i, + start/chunk_size/data_disks, + raid_disks, level, layout); + int snum; + /* i is the logical block number, so is index to 'buf'. + * dnum is physical disk number + * and thus the syndrome number. + */ + snum = dnum; + bufs[snum] = (uint8_t*)buf + chunk_size * i; + } + syndrome_disks = raid_disks; + } else { + /* for md, q is over 'data_disks' blocks, + * starting immediately after 'q' + * Note that for the '_6' variety, the p block + * makes a hole that we need to be careful of. + */ + int j; + int snum = 0; + for (j = 0; j < raid_disks; j++) { + int dnum = (qdisk + 1 + j) % raid_disks; + if (dnum == disk || dnum == qdisk) + continue; + for (i = 0; i < data_disks; i++) + if (geo_map(i, + start/chunk_size/data_disks, + raid_disks, level, layout) == dnum) + break; + /* i is the logical block number, so is index to 'buf'. 
+ * dnum is physical disk number + * snum is syndrome disk for which 0 is immediately after Q + */ + bufs[snum] = (uint8_t*)buf + chunk_size * i; + + if (fblock[0] == i) + fdisk[0] = snum; + if (fblock[1] == i) + fdisk[1] = snum; + snum++; + } + + syndrome_disks = data_disks; + } + + /* Place P and Q blocks at end of bufs */ + bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks; + bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1); + + if (fblock[1] == data_disks) + /* One data failed, and parity failed */ + raid6_datap_recov(syndrome_disks+2, chunk_size, + fdisk[0], bufs); + else { + if (fdisk[0] > fdisk[1]) { + int t = fdisk[0]; + fdisk[0] = fdisk[1]; + fdisk[1] = t; + } + /* Two data blocks failed, P,Q OK */ + raid6_2data_recov(syndrome_disks+2, chunk_size, + fdisk[0], fdisk[1], bufs); + } + } + for (i=0; i= chunk_size) cpos -= chunk_size; } return 0; } @@ -201,34 +558,45 @@ int source, unsigned long long read_offset, unsigned long long start, unsigned long long length) { - char *stripe_buf = malloc(raid_disks * chunk_size); + char *stripe_buf; char **stripes = malloc(raid_disks * sizeof(char*)); char **blocks = malloc(raid_disks * sizeof(char*)); int i; - int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2); + int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 
1 : 2); - if (stripe_buf == NULL || stripes == NULL || blocks == NULL) { + if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size)) + stripe_buf = NULL; + if (zero == NULL) { + zero = malloc(chunk_size); + if (zero) + memset(zero, 0, chunk_size); + } + if (stripe_buf == NULL || stripes == NULL || blocks == NULL + || zero == NULL) { free(stripe_buf); free(stripes); free(blocks); + free(zero); return -2; } for (i=0; i 0) { - int len = data_disks * chunk_size; + unsigned int len = data_disks * chunk_size; unsigned long long offset; int disk, qdisk; + int syndrome_disks; if (length < len) return -3; for (i=0; i < data_disks; i++) { int disk = geo_map(i, start/chunk_size/data_disks, raid_disks, level, layout); - blocks[i] = stripes[disk]; - if (lseek64(source, read_offset, 0) != read_offset) + if ((unsigned long long)lseek64(source, read_offset, 0) + != read_offset) return -1; - if (read(source, stripes[disk], chunk_size) != chunk_size) + if (read(source, stripes[disk], + chunk_size) != chunk_size) return -1; read_offset += chunk_size; } @@ -239,6 +607,8 @@ case 5: disk = geo_map(-1, start/chunk_size/data_disks, raid_disks, level, layout); + for (i = 0; i < data_disks; i++) + blocks[i] = stripes[(disk+1+i) % raid_disks]; xor_blocks(stripes[disk], blocks, data_disks, chunk_size); break; case 6: @@ -246,9 +616,29 @@ raid_disks, level, layout); qdisk = geo_map(-2, start/chunk_size/data_disks, raid_disks, level, layout); + if (is_ddf(layout)) { + /* q over 'raid_disks' blocks, in device order. 
+ * 'p' and 'q' get to be all zero + */ + for (i = 0; i < raid_disks; i++) + if (i == disk || i == qdisk) + blocks[i] = (char*)zero; + else + blocks[i] = stripes[i]; + syndrome_disks = raid_disks; + } else { + /* for md, q is over 'data_disks' blocks, + * starting immediately after 'q' + */ + for (i = 0; i < data_disks; i++) + blocks[i] = stripes[(qdisk+1+i) % raid_disks]; - qsyndrome(stripes[disk], stripes[qdisk], blocks, - data_disks, chunk_size); + syndrome_disks = data_disks; + } + qsyndrome((uint8_t*)stripes[disk], + (uint8_t*)stripes[qdisk], + (uint8_t**)blocks, + syndrome_disks, chunk_size); break; } for (i=0; i < raid_disks ; i++) @@ -297,7 +687,7 @@ } switch(level) { case 6: - qsyndrome(p, q, blocks, data_disks, chunk_size); + qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size); disk = geo_map(-1, start/chunk_size, raid_disks, level, layout); if (memcmp(p, stripes[disk], chunk_size) != 0) { @@ -336,6 +726,7 @@ int save; int *fds; char *file; + char *buf; int storefd; unsigned long long *offsets; int raid_disks, chunk_size, level, layout; @@ -394,11 +785,13 @@ } } + buf = malloc(raid_disks * chunk_size); + if (save == 1) { int rv = save_stripes(fds, offsets, raid_disks, chunk_size, level, layout, 1, &storefd, - start, length); + start, length, buf); if (rv != 0) { fprintf(stderr, "test_stripe: save_stripes returned %d\n", rv); diff -Nru mdadm-2.6.7.1/rootraiddoc.97.html mdadm-3.1.4/rootraiddoc.97.html --- mdadm-2.6.7.1/rootraiddoc.97.html 2010-10-18 14:20:55.000000000 +0300 +++ mdadm-3.1.4/rootraiddoc.97.html 1970-01-01 02:00:00.000000000 +0200 @@ -1,1333 +0,0 @@ - - - - Convert Root System to Bootable Software RAID1 (Debian) - - - - - - - - - - - -
-

-Convert Root System to Bootable Software RAID1 (Debian)

-

-How to convert a Debian system to bootable Software RAID 1 with a second hard drive, 'mdadm' and a few standard UNIX tools

-
- -

-Version 0.97 (2004-06-03) Lucas Albers -- admin At cs DOT montana dot edu and Roger Chrisman
-Home of most recent version: -http://alioth.debian.org/projects/rootraiddoc
-Thanks to: Alvin Olga, Era Eriksson, Yazz D. Atlas, James Bromberger, Timothy F Nagy, and alioth.debian.org
-

-WARNING: No warranty of any kind. Proceed at your own risk. A typo, especially in lilo.conf, can leave your system unbootable. Back-up data and make a boot floppy before starting this procedure.

- - - -
- -

Table of Contents

- -

-Summary

- -

-Procedure

- -
    -
  1. -Install Debian
    -on your Primary Master disk -- hda. Or if you already have Debian installed, go to step 2.
  2. -
  3. -Upgrade to RAID savvy Kernel
    -and install 'mdadm'.
  4. -
  5. -Setup RAID 1
    -declaring disk-one 'missing' and disk-two hdc.
  6. -
  7. -Copy your Debian system
    -from hda to /dev/md0 ('missing' + 'hdc').
  8. -
  9. -Reboot to RAID device.
  10. -
  11. -Reformat hda as 'fd' and declare it as disk-one of your RAID,
    -and watch the booted RAID system automatically mirror itself onto the new drive. Done.
  12. -
- -

Alternate grub/initrd procedure

-
    -
  1. -Part II. RAID using initrd and grub
  2. -
- -

-Appendix

- -
    -
  1. -RAID Introduction
  2. -
  3. -Drive designators (hda, hdb, hdc, hdd), jumpers and cables
  4. -
  5. -Setting up software RAID for multiple partitions
  6. -
  7. -Lilo
  8. -
  9. -Copying Data
  10. -
  11. -Rebooting
  12. -
  13. -Initrd
  14. -
  15. -Verify that system will boot even with one disk off-line
  16. -
  17. -Setting up a RAID 1 Swap device
  18. -
  19. -Performance Optimizations
  20. -
  21. -Disaster Recovery
  22. -
  23. -Quick Reference
  24. - -
  25. -Troubleshooting
  26. -
  27. -Raid Disk Maintenance
  28. -
- -

-References

- -
- - - - -

-Summary

-

-We begin with Debian installed on the Primary Master drive, hda (step 1). We need RAID support in our Kernel (step 2). We add another disk as Secondary Master, hdc, set it up for RAID (step 3), and copy Debian to it (step 4). Now we can reboot to the RAID device (step 5) and declare hda part of the RAID and it automatically syncs with hdc to complete our RAID 1 device (step 6).

-

-If all goes well

-
    -
  • You do not need a rescue disk or to boot off anything except the hard drive.
  • -
  • You can do this operation completely remotely.
  • -
  • And you will not lose any data.
  • -
-

-Use this HowTo at your own risk. We are not responsible for what happens!

-

-First things first

-
    -
  • Backup your data.
  • -
  • Create a boot floppy.
  • -
-

-Whenever you change your partitions, you need to reboot! (If you know what you are -doing, ignore this advice.)

-

-I assume you will mess up a step so wherever possible, we include verification.

-

-I use 'mdadm' because it is easier than 'raidtools' or 'raidtools2'.

-

-
We now have grub and lilo directions; the grub directions are still in beta form. -
Read the grub directions, and comment on them.

- - - - - - - - -

-Procedure

- - -

-1. Install Debian

-

-Do a fresh install the normal way on your first drive, hda (the Primary Master drive in your computer). -Or, if you already have a running Debian system that you want to use on hda, skip ahead to step 2. -If you need Debian installation instructions, see:

-

- -Debian Installation HowTo » http://www.debian.org/releases/stable/installmanual

-

- -Sarge Debian Installation HowTo » http://d-i.alioth.debian.org/manual/

- - -

-2. Upgrade to a RAID savvy Kernel

- - -

-2.1 Compile and install a RAID savvy Kernel.

-

-RAID must be compiled into the Kernel, not added as a module, for you to boot from the RAID device (unless you use a RAID savvy initrd kernel or boot from a non-RAID boot drive; I now cover initrd methods!). You need RAID 1 but I usually include RAID 5, too. For step by step Kernel compile and install instructions, see:

-

-Creating custom Kernels with Debian's kernel-package system » http://newbiedoc.sourceforge.net/system/kernel-pkg.html

- -
- -

-2.2 Verify your RAID savvy Kernel.

-

-cat /proc/mdstat

-

-(You should see the RAID "personalities" your Kernel supports.)

-Something like this: - -

-Personalities : [linear] [raid0] [raid1] [raid5] -read_ahead 1024 sectors -md4 : active raid5 hdh4[3] hdg4[2] hdf4[1] hde4[0] - 356958720 blocks level 5, 64k chunk, algorithm 2 [4/4] [UUUU] - -unused devices: -

- -

-YOU MUST VERIFY you have raid support via /proc/mdstat. -This is the most important item to verify before going any farther. -So the kernel has to support it or you have to load the modules in initrd. -

- - -(This will show you if raid is compiled into kernel, or detected as a module from initrd.) -/etc/modules will not list RAID if Kernel has RAID compiled in instead of loaded as modules. -
- Use lsmod to list currently loaded modules, this will show raid modules loaded. -

- reiserfs
-raid1
-ext2
-ide-disk
-raid5
-ext3
-

-cat /etc/modules

-

-(IF YOU SEE ANY RAID LISTED IN /etc/modules, then you probably have your Kernel loading RAID via modules. That will prevent you from booting from your RAID device, unless you use initrd. To boot from your RAID device, unless you use a RAID savvy initrd, you need RAID compiled into Kernel, not added as a module.)

-

- - -

2.3 Install 'mdadm':

-

-apt-get install mdadm

-2.4 List what IDE devices you have: -

-ls /proc/ide

- - -

-3. Setup RAID 1

-

-Setup RAID 1 and declare disk-one of your RAID to be 'missing' and disk-two of your RAID to be 'hdc'.

- - -

-3.1 Create RAID (fd) partition on hdc

-

-Warning: ALWAYS give the partition when editing with cfdisk. By default cfdisk will select the first disk in the system. I accidentally wiped the wrong partition with cfdisk, once.

-

-Do A or B, either way will work:

-

-A. Create partitions on new disk.

-

-cfdisk /dev/hdc

- -

or

- -

B. copy existing partitions to new disk with sfdisk.

-

-sfdisk -d /dev/hda | sfdisk /dev/hdc

- -
NOTE: On some disks you cannot copy over the partitions correctly using this method -
It will detect the new partition as 0 size or a strange size. -
You will need to manually create the partitions, making them the same size with cfdisk. -
-

-3.2 Create correct partition type signatures on new partition.

-

-cfdisk /dev/hdc

- -
    -
  • Select Type, then hit enter, then type 'fd' (this means RAID type partition).
  • -
  • Select Write
  • -
  • Select Quit.
  • -
- -

-reboot

-

-(To verify that everything is working ok.) - - -

-3.3 Create RAID device

-

-that has two members and one of the members does not exist yet. md0 is the RAID partition we are creating, /dev/hdc1 is the initial partition. We will be adding /dev/hda1 back into the /dev/md0

-

-RAID set after we boot into /dev/md0.

-

-mdadm --create /dev/md0 --level=1 --raid-disks=2 missing /dev/hdc1

-

-If this gives errors then you need to zero the super block, see useful mdadm commands.

- - -

-3.4 Format RAID device

-

-You can use reiserfs or ext3 for this, both work, I use reiserfs for larger devices. Go with what you trust.

-

-mkfs.ext3 /dev/md0

-

-or

-

-mkfs -t reiserfs /dev/md0

- - - -

-4. Copy your Debian system

-

-Copy your Debian system from hda to /dev/md0 ('missing' + 'hdc'). Then, check to -make sure that the new RAID device is still setup right and can be mounted -correctly. We do this with an entry in hda's /etc/fstab and a reboot. Note that -by editing hda's /etc/fstab after the copy, instead of before, we leave the copy -on md0 unaltered and only are editing hda's /etc/fstab.

- - -

-NB: THIS IS A BRANCH IN OUR SYSTEM CONFIGURATION (i.e. temporary!), but it -will be overwritten later by the md0 version of /etc/fstab by the sync in step 6.

- - -

-4.1 Create a mount point.

-

-mkdir /mnt/md0

- - -

-4.2 Mount your RAID device.

-

-mount /dev/md0 /mnt/md0

- - -

-4.3 Copy your Debian system to RAID device.

-

-cp -axu / /mnt/md0

-Please refer to the Copying data section to verify you copied the data correctly. -
See Copying Data -

-You don't need the -u switch; it just tells cp not to copy the files again if they exist. If you are running the command a second time it will run faster with the -u switch.

- - -

-4.4 Edit /etc/fstab so that you mount your new RAID partition on boot up.

-

-This verifies that you have the correct partition signatures on the partition and that your partition is correct. Sample Line in /etc/fstab:

-

-/dev/md0 /mnt/md0 ext3 defaults 0 0

-

-Then

-

-reboot

-

-And see if the RAID partition comes up.

-

-mount

-

-Should show /dev/md0 mounted on /mnt/md0.

- - - -

-5. Reboot to RAID device

-

-For step 5 reboot, we will tell Lilo that -

    -
  • as before, /boot and MBR are still on hda,
    -
  • and now we want root (/) to mount on md0.
    -

-

-We will, as before, be using hda's MBR (Master Boot Record is the first 512 bytes on a disk and is what the BIOS reads first in determining how to boot up a system) and hda's /boot dir (the kernel-image and some other stuff live here), but instead of mounting root (/) from hda, we will mount md0's root (/) (the root of our RAID device, currently running off of only hdc because we declared the first disk 'missing'). - - -

-5.1 Configure Lilo to boot to the RAID device

-

-(Later we will configure Lilo to write the boot sector to the RAID boot device also, so we can still boot even if either disk fails.)

-

-Add a stanza labeled 'RAID' to /etc/lilo.conf on hda1 so that we can boot with /dev/md0, our RAID device, as root (/):

-

-#the same boot drive as before.
-boot=/dev/hda
-image=/vmlinuz
-label=RAID
-read-only
-#our new root partition.
-root=/dev/md0

-

-That makes an entry labeled 'RAID' specific to the RAID device, so you can still boot to /dev/hda if /dev/md0 does not work.

-sample complete lilo.conf file:

-

-#sample working lilo.conf for raid.
-#hda1,hdc1 are boot, hda2,hdc2 are swap
-#hda3,hdc3 are the partition used by array
-#root partition is /dev/md3 on / type reiserfs (rw)
-#I named the raid volumes the same as the partition numbers
-#this is the final lilo.conf file of a system completely finished,
-#and booted into raid.
-
-
-lba32
-boot=/dev/md1
-root=/dev/hda3
-install=/boot/boot-menu.b
-map=/boot/map
- prompt
- delay=50
- timeout=50
- vga=normal
- raid-extra-boot=/dev/hda,/dev/hdd
- default=RAID
- image=/boot/vmlinuz-RAID
- label=RAID
- read-only
- root=/dev/md3
- alias=1
- - image=/vmlinuz
- label=Linux
- read-only
- alias=2
-
- image=/vmlinuz.old
- label=LinuxOLD
- read-only
- optional

- - -

-5.2 Test our new lilo.conf

-

-lilo -t -v

-

-(With a RAID installation, always run lilo -t first just to have Lilo tell you what it is about to do; use the -v flag, too, for verbose output.)

- - -

-5.3 Run Lilo

-

-Configure a one time Lilo boot via the -R flag and with a reboot with Kernel panic

-

-The -R <boot-parameters-here> tells Lilo to only use the specified image for the next boot. So once you reboot it will revert to your old Kernel.

-

-From 'man lilo':
--R command line
-This option sets the default command for the boot loader the next time it executes. The boot loader will then erase this line: this is a once-only command. It is typically used in reboot scripts, just before calling `shutdown -r'. Used without any arguments, it will cancel a lock-ed or fallback command line.

-

Before you can do the 'lilo -v -R RAID' command, you must first do a 'lilo' command to update the Lilo boot record with the contents of your new lilo.conf. Otherwise Lilo does not know what you mean by 'RAID' and you just get a 'Fatal: No image "RAID" is defined' error message when you do 'lilo -v -R RAID'. So,

-

-lilo
-lilo -v -R RAID

- - -

-5.4 Edit /mnt/md0/etc/fstab and reboot

-

-to have /dev/md0 mount as root (/), when Lilo boots from our RAID device, /dev/md0.

-

-Previous root (/) in fstab was:

-

-/dev/hda1 / reiserfs defaults 0 0

-

-Edit it to:

-

-/dev/md0 / ext3 defaults 0 0

-

-Note: edit /mnt/md0/etc/fstab, not /etc/fstab, because at the moment we are booted with hda1 as root (/) but we want to change the /etc/fstab that we currently have mounted on /mnt/md0/etc/fstab, our RAID device.

-

-Reboot to check if system boots our RAID device, /dev/md0, as root (/). If it does not, just reboot again and you will come up with your previous boot partition courtesy of the -R flag in step 5.3 above.

-

-reboot

-

-Verify /dev/md0 is mounted as root (/)

-

-mount

-

-should show:

- -

-/dev/md0 on / type reiserfs (rw)
-proc on /proc type proc (rw)
-devpts on /dev/pts type devpts (rw,gid=5,mode=620)

-

-'type reiserfs' is just my example; you will see whatever your file system type is.

-

-Now we are booted into the new RAID device -- md0 as root (/). Our RAID device only has one disk in it at the moment because we earlier declared the other disk as 'missing'. That was because we needed that other disk, hda, to install Debian on or because it was our pre-existing Debian system.

- - - -

-6. Reformat hda as 'fd' and declare it as disk-one of your RAID

- -

-For step 6 reboots, we tell Lilo that -

    -
  • as in step 5 above, our root (/) is now on md0.
  • -
  • and now, /boot is also on md0,
  • -
  • and MBR is on both hda and hdc.
  • -

-

-Here we not only use md0's root (/) as in step 5, but also md0's /boot (it contains an identical kernel-image to the one on hda because we copied it here from hda in step 4, but we will be overwriting everything on hda in step 6 and can't continue relying on the stuff on hda) and MBR from either hda or hdc, whichever the BIOS can find (they will be identical MBRs and the BIOS will still find hda's MBR but in case the hda disk were to fail down the road we would want the BIOS to look on hdc as a fail over so that it could still boot up the system).

- - -

-6.1 Change the signature on /dev/hda to software RAID

-

-cfdisk /dev/hda

-
    -
  • Select "/dev/hda1"
  • -
  • Then select "[Type]"
  • -
  • Then hit "enter".
  • -
  • Then type "FD".
  • -
  • We are setting partition to "Software RAID"
  • -
  • Should already be set.
  • -
  • Then Select "Boot" if not set, so that you can boot -off the device.
  • -
  • All the boot partitions that are members of your bootable RAID device (hda1 and hdc1) should have the bootable flag set. If one is not set, set it here now
  • -
  • Then select "Write" and enter 'yes'.
  • -
  • Then select "Quit".
  • -
-

-My two hard disks are from different manufacturers and as it happens, while both are roughly 40G, they have different architectures in terms of sectors and precise size. So cfdisk was unable to make the partitions precisely the same size and I had hda1 29,997.60MB and hdc1 30,000MB. This didn't work when I got to the 'mdadm --add /dev/md0 /dev/hda1' step. I got a "failed: no space left on device!" error. So I ran cfdisk again and made hda1 slightly larger than hdc1, since I could not make them both exactly the same size. Now hda1 is 30,005.83MB and the 'mdadm --add /dev/md0 /dev/hda1' step works :-). (The remaining 10,000MB on each disk I am using for other purposes, including a md1 of 1,000MB composed of hda2 and hdc2.)

- - -

-6.2 Add the first-disk to our existing RAID device

-

-And watch the booted RAID system automatically mirror itself onto the new drive. We are currently booted from MBR and /boot device on /dev/hdc1, with /dev/md0 as root (/).

-

-mdadm --add /dev/md0 /dev/hda1

-

-Note: We are adding /dev/hda1 into our existing RAID device. See if it is syncing.

-

-cat /proc/mdstat

-

-should show that it is syncing.

- - -

-6.3 Write new /etc/lilo.conf settings

-

-these are from when we are booted onto RAID.

-

-boot=/dev/md0
-root=/dev/md0
-#this writes the boot signatures to either disk.
-raid-extra-boot=/dev/hda,/dev/hdc
-image=/vmlinuz
-label=RAID
-read-only

-

-YOU NEED THE raid-extra-boot to have it write the boot loader to all the disks.

-

-YOU ARE OVERWRITING THE BOOT LOADER ON BOTH /dev/hda and /dev/hdc.

-

-You can keep your old boot option to boot /dev/hda so you can boot RAID and /dev/hda.

-

-But remember you don't want to boot into a RAID device in non-RAID mode, as it will hurt the synchronization if you make changes on one disk and not the other.

- - -

-6.4 Run Lilo with -R option and reboot

-

-(we are currently booted into RAID)

-

-lilo -t -v

-

-lilo -R RAID

-

-The -R option tells Lilo to use the new Lilo setting only for the next reboot, and then revert back to the previous setting.

-

-

-Note 1: Step 6.4 returned an error, "Fatal: Trying to map files from unnamed device 0x0000 (NFS/RAID mirror down ?)."

-

-So I waited for the synchronization, started in Step 6.2, to finish (checking it with 'cat /proc/mdstat'). Once it was done, did 'lilo -t -v' again. No "Fatal" error; Lilo seems happy now (no "Fatal" message).

- -Note 1a: The synchronization however took two hours! I checked with 'hdparm' and it seems I have DMA turned off. Perhaps the synchronization would go faster with DMA turned on. Some examination of my system revealed that I did not have my computer's PCI chipset support compiled into my custom kernel. I recompiled the kernel (kernel 2.6.4) and selected the correct PCI chipset support for my computer and now DMA works correctly :-) and by default. For DMA to be default is also configurable in the PCI area of 'make menuconfig' during kernel compile configuration, and I chose it.

-

-So I can now do Lilo with '-R ' switch and reboot.

-

-Note 2: another error, "Fatal: No image "RAID" is defined."

-

-As in Step 5.3 above, I need to do 'lilo' first so that Lilo reads my new /etc/lilo.conf, otherwise Lilo does not know about my stanza labeled "RAID" which is new in my lilo.conf. (Yes I told Lilo about it on hda1 in step 5.3, but that was after I had copied the hda1 root (/) system to here, md0, which branched my system into two separate system configurations. So it needs to be done here, too.) Then I can do 'lilo -R RAID'.

-

-Note 2a: However, the '-R' switch is pointless here unless the lilo.conf stanza labeled "RAID" is *not* the first kernel-image stanza in my lilo.conf. Because if it *is* the first stanza, then it is the default stanza anyway, with or without the '-R'.

-

-

-Then

-

-reboot

-

-and check

-

-cat /proc/mdstat -

-and check

-

-mount

-

-to be sure all is as expected.

- - -

-6.5 Now run Lilo normally (without -R) and reboot

-

-See what Lilo will do.

-

-lilo -t -v

-

-If it looks okay, do it:

-

-lilo

-

-reboot

-

-and check

-

-cat /proc/mdstat

-

-and check

-

-mount

-

-as a final system check.

- -

-Done.

- - - - - - - -

Part II. RAID using initrd and grub

- Ferdy Nagy -

I used the following procedure with stock Debian 2.6.5, which has an initrd with all the modules ready to boot -into RAID. The procedure also covers using grub as the boot loader. I built this from a bare install of Sarge -using the new installer with grub as the boot loader, but most of this document is distro independent. My file system -throughout is ext3 and it shouldn't take too much to use reiserfs.

-

These steps reference back to the procedure sections outlined above and indicate where things differ due to initrd or -grub, so you will have to read/do/be familiar with the above steps. Also, make sure you currently use grub -as your boot loader, if you are using LILO, install grub and make sure it works before proceeding!

- - -

Section - 2. Upgrade to a RAID savvy kernel

- -Section 2 -

When using initrd the kernel does not need to have the RAID compiled in, they will be loaded as modules. Make sure -the kernel loads the RAID modules.

- -

Edit /etc/modules and add

-

md
-raid1

- - -

Section - 3. Setup RAID 1

-Follow section 3 to setup the RAID 1. - - -

Section - 4. Copy your Debian system

-Follow section 4 to copy the debian system. - - -

Section - 5. Reboot to RAID device

- -

Instead of section 5 using LILO, grub is used as the boot loader, and initrd used to load the kernel. A new kernel -entry in the grub menu is created that refers to an initrd that is created which will start the md [raid] device. -The original kernel entry will remain and can be reverted to if something goes wrong until RAID is running. This will -still use grub loaded installed on the /dev/hda MBR.

- -

5.1 Build a new RAID initrd

- -

A) Make sure the initrd has the modules it needs, by editing /etc/mkinitrd/modules. Add the -following [you can see what modules are available by mounting the initrd and looking in the lib/modules - see section 8.]:

-

- md
- raid1

- -

B) Update the initrd so that the root device loaded is the raid device, not probed. Edit the /etc/mkinitrd/mkinitrd.conf, and update the ROOT line
-ROOT=/dev/md0

- -

C) Create the new initrd and a link to it.

-

mkinitrd -o /boot/initrd.img-2.6.5-raid

- - -

5.2 Update the grub boot menu

- -

edit /boot/grub/menu.lst

-

1. Add the following entry

-

-

-title           Debian GNU/Linux, kernel 2.6.5-1-686 RAID
-root            (hd0,0)
-kernel          /boot/vmlinuz-2.6.5-1-686 root=/dev/md0 ro
-initrd          /boot/initrd.img-2.6.5-1-686-raid
-savedefault
-boot
-
-

-

2. Update the following kernel root option in the file. Note: see the grub known issues, so -this option will not be used anyway.

-

# kopt=root=/dev/md0 ro
- - -

5.3 Do the above 5.4 Edit /mnt/md0/etc/fstab and reboot

- -

[Copied from Part I 5.4 above]

-

to have /dev/md0 mount as root (/), when grub boots from our RAID device, /dev/md0:

-

Previous root (/) in fstab was:

-

/dev/hda1 / ext3 defaults 0 0

-

Edit it to:

-

/dev/md0 / ext3 defaults 0 0

-

Note: edit /mnt/md0/etc/fstab, not /etc/fstab, because at the moment we are booted with hda1 as root (/) but we -want to change the /etc/fstab that we currently have mounted on /mnt/md0/etc/fstab, our RAID device.

-

Reboot and choose the RAID kernel to check if the system boots our RAID device, /dev/md0, as root (/). If it does not, just reboot again and choose the -original pre-RAID kernel image

-

reboot

-

Verify /dev/md0 is mounted as root (/)

-

mount

-

should show something similar to:

- -

/dev/md0 on / type ext3 (rw)
-proc on /proc type proc (rw)
-devpts on /dev/pts type devpts (rw,gid=5,mode=620)

- -

Now we are booted into the new RAID device -- md0 as root (/). Our RAID device only has one disk in it at the -moment because we earlier declared the other disk as 'missing'. That was because we needed that other disk, hda, to -install Debian on or because it was our pre-existing Debian system.

- -cat /proc/mdstat shows the [degraded] array is up and running, note the [_U] - second disk is up.

- - -

Section - 6. Reformat hda as fd and declare it as disk-one of your raid

- -

6.1/2 Setup hda and add to array

-

Follow steps 6.1, and 6.2. Wait and make sure the drives are fully synced before proceeding. - -

6.3 re-run mkinitrd again, and reboot.

- -

This is needed to make sure that mkinitrd starts the newly built array with all drives. mkinitrd uses mdadm -D to -discover what drives to assemble in the array during startup, this is contained in a script in the initrd image. If this -step is not done the next time you reboot the array will be degraded.

- -

Do the following

-

mkinitrd -o /boot/initrd.img-2.6.5-raid

- -

reboot

- -

and check the array is fully up, look for the [UU]

- -

cat /proc/mdstat

- -

and check /dev/md0 is mounted

- -

mount

- - -

7. Put grub into the MBR of the second disk

- -

grub refers to the boot(ed) device as hd0, so if the primary hard drive (/dev/hda) fails the system will look for -the next bootable device (/dev/hdc) and load its MBR, which grub will still refer to as hd0. So, the grub -configuration can still use hd0 even when the primary device fails.

- -

7.1 Put grub into the MBR

- -

These steps temporarily tell grub the second device is hd0 and then loads the MBR.

- -

start the grub command line, then run the load commands. Note: grub partition references -are offset by 1, so in the following with a partition of /dev/hdc1, the root is (hd0,0) [previous line tells -grub to set hdc as hd0]. If the partition was /dev/hdc2, the root would be (hd0,1)!

-

grub
-grub> device (hd0) /dev/hdc
-grub> root (hd0,0)
-grub> setup (hd0)
-

- -

7.2 Testing

- -

reboot, verify the /proc/mdstat devices always start. Follow section VIII and verify the -system boots with one disk off line.

- - -

8. Known Issues

- -

grub

-

grub will already be installed on hda, and you will manually force grub to be installed on hdc so the MBRs are -ok; however, install-grub and update-grub will fail because -grub does not understand the md0 device. This is not a problem with install-grub as it will not be executed again -after it has been installed, but update-grub is executed after an updated kernel is apt'd, causing an error to be -reported by apt. The update-grub error is ok, the kernel gets installed and the initrd is created with all -the md array information, provided the array was not degraded during the kernel upgrade. But you will have -to manually update the grub menu.lst and add the new kernel information before you reboot, or the new -kernel will not appear in the grub menu.

- -

mkinitrd

-

When using mdadm, mkinitrd will only detect disks in the array that are running at the time of execution. You should -not install a new kernel while the array is degraded, otherwise, even if you do an mdadm --add, the next reboot will -still be degraded! The array is started at boot time by script. You can see what -is in the script of the initrd by mounting it, e.g.

-

mount /boot/initrd.img-X.X.X /mnt -o loop
-cat /mnt/script

-

And look for the array start line similar to

-

mdadm -A /devfs/md/0 -R -u 23d8dd00:bc834589:0dab55b1:7bfcc1ec /dev/hda1 /dev/hdc1

- - - - -

-Appendix

- - -

-I. RAID 1 Introduction

-

-Redundant Array of Inexpensive Disks (RAID) refers to putting more than one hard disk to work together in various advantageous ways. Hardware RAID relies on special hardware controllers to do this and is not covered in this HowTo. Software RAID, the subject of this HowTo, uses software plus the ordinary controllers on your computer's motherboard and works excellently.

-

-RAID 1 is where you use two hard drives as if they were one by mirroring them onto each other. Advantages of RAID 1 are (a) faster data reads because one part of the data can be read from one of the disks while simultaneously another part of the data is read from the other disk, and (b) a measure of fail over stability -- if one of the disks in the RAID 1 fails, the system will usually stay online using the remaining drive while you find time to replace the failed drive.

-

-To achieve the speed gain, the two disks that comprise your RAID 1 device must be on separate controllers (in other words, on separate drive cables). The first part of the data is read from one disk while simultaneously the second part of data is read from the other disk. Writing data to a RAID 1 device takes twice as long apparently. However, under most system use data is more often read from disk than written to disk. So RAID 1 almost doubles the effective speed of your drives. Nice.

-

-RAID is not a substitute for regular data back ups. Many things can happen that destroy both your drives at the same time.

- - - -

-II. Drive designators (hda, hdb, hdc, hdd), jumpers and cables

-

-Drive designators.

-

-Drives on IDE 1 -- Primary Controller

-
    -
  • -hda, Primary Master drive
  • -
  • -hdb, Primary Slave drive
  • -
-

-Drives on IDE 2 -- Secondary Controller

-
    -
  • -hdc, Secondary Master drive
  • -
  • -hdd, Secondary Slave drive
  • -
-

-Jumpers. When moving drives around in your computer, be sure to set the jumpers on your drives correctly. They are the little clips that connect two of various pins on your drive to set it to Cable Select, Master, or Slave. IDE drives usually have a diagram right on their case that shows where to set the clip for what setting. Different brands sometimes use different pin configurations. -

-Cables. Use 80 wire 40 pin IDE drive cables, not 40 wire 40 pin or you will slow down your hard drive access. For best results, cables should be no longer than the standard 18". If your cable has a blue end, that's the end to attach to the mother board (I don't know why). I don't think it matters which of the two drive connectors on the cable you plug your drive into, the middle or end one, unless you use Cable Select in which case I believe the cable's end plug is Master and its middle plug is Slave.

- - - -

-III. Setting up software RAID for multiple partitions.

-

-You can have a multi-partition RAID system if you prefer. You just need to create multiple RAID devices.

-

-I have found it useful when setting software RAID on multiple partitions to set the RAID device to the same name as the disk partition.

-

-Say I have 3 partitions on /dev/hda and I want to add /dev/hdc for software RAID, then boot /dev/hdc and add /dev/hda back into the device -- exactly what I did earlier, but with 3 partitions, which are: hda1=/boot, hda2=/, hda3=/var

-

-sfdisk -d /dev/hda | sfdisk /dev/hdc;
-reboot
-mdadm --zero-superblock /dev/hda1
-mdadm --zero-superblock /dev/hda2
-mdadm --zero-superblock /dev/hda3
-mdadm --create /dev/md1 --level=1 --raid-disks=2 missing /dev/hdc1
-mdadm --create /dev/md2 --level=1 --raid-disks=2 missing /dev/hdc2
-mdadm --create /dev/md3 --level=1 --raid-disks=2 missing /dev/hdc3
-mkfs.reiserfs /dev/md1;mkfs.reiserfs /dev/md2; mkfs /dev/md3;
-mkdir /mnt/md1 /mnt/md2 /mnt/md3;
-cp -ax /boot /mnt/md1;cp -ax / /mnt/md2; cp -ax /var /mnt/md3;

-

-add entry in current fstab for all 3 and REBOOT.

-

-Sync data again, only copying changed stuff. -

-cp -aux /boot /mnt/md1;cp -aux / /mnt/md2; cp -aux /var /mnt/md3;

-

-edit lilo.conf entry in this case: -

-boot=/dev/md1
-root=/dev/md2

-

-Edit /mnt/md2/etc/fstab to have / set to /dev/md2.

-

-REBOOT into RAID.

-

-Add devices in: -

-mdadm --add /dev/md1 /dev/hda1
-mdadm --add /dev/md2 /dev/hda2

-

-Wait for sync, write Lilo permanently, and REBOOT into your setup.

-

-It is not harder to include more devices in a software RAID device.

- - - -

-IV. Lilo

-

-You need special entries to use Lilo as your boot loader, I couldn't get grub to work, but nothing prevents you from using grub. Just standard Lilo/grub entries WILL NOT WORK FOR RAID.

-

-Entries in /etc/lilo.conf: -

-raid-extra-boot=<option>

-

-That option only has meaning for RAID 1 installations. The <option> may be specified as none, auto, mbr-only, or a comma-separated list of devices; e.g., "/dev/hda,/dev/hdc6".

-

-panic='' line in lilo.conf tells Lilo to automatically boot back to the old install if something goes wrong with the new Kernel.

- - - -

-V. Copying data

-

-Use "cp -aux" to just copy updated items. if you are copying a partition that is not root you need to copy the subdirectories and not the mount point, otherwise it will just copy the directory over. To copy boot which is a separately mounted partition to /mnt/md1 which is our new software RAID partition we copy as thus: "cp -aux /boot/* /mnt/md1" NOTE THE DIFFERENCE when copying mount points and not just /. If you just do cp -aux /boot /mnt/md1 it will just copy over boot as a subdirectory of /mnt/md1.

-

-Or, alternatively, you could copy the root system with 'find' piped to 'cpio', like this:

-

-cd /
-find . -xdev -print | cpio -dvpm /mnt/md0

- - - - -

-VI. Rebooting

-

-You should always reboot if you have changed your partitions, otherwise the Kernel will not see the new partitions correctly. I have changed partitions and not rebooted, and it caused problems. I would rather have the simpler longer less potentially troublesome approach. Just because it appears to work, does not mean it does work. You really only need to reboot if you are CHANGING or rebooting a new Lilo configuration. Don't email me if you hose yourself because you did not feel the urge to reboot. Trust me.

- - - -

-VII. initrd

-

-initrd: Use RAID as initrd modules.

-

-The Kernel that is installed when you first build a system does not use an initrd.img. -However the default kernel uses initrd. So you can use a stock kernel with -software raid.

-

-The new Kernel by default won't contain the right modules for creating a RAID savvy initrd, but they can be added.

-

-(Per James Bromberger)

-Now we need to prepare for running a RAID setup. Our packages need an update. -Use apt, because it rocks, and install the following:
-

-DevFSd
-kernel-image-2.4.x (whatever suits you)
-reiserfsprogs
-less
-screen
-vim
-

-...Anything else you need and can't live without for the next 10 minutes
-
-You might already have some of these modules in the kernel, eg ext2. -Edit /etc/modules and add the following modules:
-

-reiserfs
-md
-raid1
-ext2
-ide-disk (might not need this one.)
-raid5
-ext3
-ide-probe-mod (might not need this one.)
-ide-mod (might not need this one.)
-

-
-Edit /etc/mkinitrd/modules, and add the same modules to this list. Your initrd -image needs to be able to read and write to your RAID array, before your -filesystem is mounted. Initrd is the trick here. You probably also want to see -if you need to edit /etc/mkinitrd/mkinitrd.conf and set the variable ROOT=probe -to be ROOT=/dev/md0, or possibly, if using DevFS, ROOT=/dev/md/0.
-
-Regenerate your initrd image for your new kernel with -

- -mkinitrd -o /tmp/initrd-new /lib/modules/2.4.x-... . - -

If all is good, move this to /boot/initrd-2.4.x-... and -edit your /etc/lilo.conf to add initrd=/boot/initrd against the "Linux" kernel -entry. Run lilo, and you should see an asterisk next to the boot image "Linux".

-With those modules you should be able to install the new kernel-image package. The install will add those modules to the initrd.img that it builds. Now you can do, for example (I actually only tested with kernel-image-2.4.24-1-686-smp on a machine using testing and unstable listed in the /etc/apt/sources.list) -

-apt-get install kernel-image-2.4.24-1-686-smp

-

-You will need to modify /etc/lilo.conf to include the right stuff. Otherwise the post install scripts for the package will likely fail. -

-image=/vmlinuz
-label=Linux
-initrd=/initrd.img

-

-(The above is all one line)

-

-Run Lilo and REBOOT.

-

-You should now have the modules loaded. Check with: cat /proc/mdstat

- - - -

-VIII. Verify that system will boot even with one disk off-line

-

-Roger did it this way.

-
    -
  1. Shutdown and power-off your computer.
  2. -
  3. Open up computer and unplug the power to Primary Master disk (/dev/hda).
  4. -
  5. Start up your computer. It should boot up from the other disk.
  6. -
  7. Now look at
    - cat /proc/mdstat
    - you should see that one of the disks in your md0 has "failed".
  8. -
  9. Shutdown and then unplug the power to your computer, again.
  10. -
  11. Reconnect the power to Primary Master disk.
  12. -
  13. Start up your computer, again. It should boot up from the other disk still. It won't try to access the disk that it now has on record as "failed" until you re-add it to your RAID. Look again at
    - cat /proc/mdstat
    - you should still see one of the disks in your md0 listed as "failed". If this were not a simulation it probably would be failed and you would want to replace it with a new one. But for the simulation we just un-plug and later re-plug the power connector to the disk.
  14. -
  15. Now that you have re-connected the power to the disk (or replaced it with a new one if it really was a failed disk) bring it back online with mdadm,
    - mdadm --add /dev/md0 /dev/hda1
    - and check its status with, - cat /proc/mdstat
    - you should see that it is being synchronized with the other disk in your RAID 1.
  16. -
  17. WAIT until the synchronization has completed. Then you can try the above again but unplugging the other disk in your RAID 1. WARNING: if you do not wait for synchronization to fully complete (check with '/proc/mdstat') you will have a real problem because your system is only partially rebuilt on the "new" disk until synchronization has finished.
  18. -
-

-NB: I (Roger) had to disconnect power to my CD-ROM drive (because my CD-ROM was on /dev/hdd -- Secondary Slave) in order to boot with my Secondary Master disconnected. Otherwise my BIOS refused to boot the machine because my CD-ROM was then a Slave on a cable without any Master. Your mileage may vary. :-) So I decided to leave my CD-ROM disconnected, as this is a server and I need it to boot even with a failed drive more than I need the convenience of keeping the CD-ROM connected. I can of course connect the CD-ROM when I need it as long as I have a working Master drive on its cable with it or set it to Master.

- - - -

-IX. Setting up a RAID 1 Swap device

-

-I created a swap RAID device as follows:

-

-(I have a 1000MB hda2 and a 1000MB hdc2, both as type 'fd' created with 'cfdisk', that I will use as md1 for swap.)

-

- (Or you can just create the swap partitions on the actual disk, don't put swap on raid. - Just put a swap partition on each disk in your raid set on an empty partition.) - -

- -

-Add a Swap entry in /etc/fstab, just after root (/) partition line. Example line to add to /etc/fstab: -

-/dev/md1 none swap sw 0 0

-

-Reboot and the boot sequence should start up the Swap when it reads /etc/fstab.

-

-reboot

-

- You can argue whether swap should be on raid. A large colo admin mentions that he does not use swap on raid. Keep it as simple as possible. You decide.

- - - -

-X. Performance Optimizations

- For every ide drive turn on hdparm. -
-

 

hdparm -d1 -c3 /dev/hda /dev/hdc

-
You need to use bonnie++ to measure software raid performance -
You want all your devices to be masters, as you're limited to the total bandwidth on that chain of -
hard drives. -
I just stick as many hard drives in the system as possible, -
I have not encountered problems where having disks on the same master -
slave channel caused a slowdown. - -

-

-XI. Disaster Recovery -

- -

-

-(These directions are untested, I need to adapt them to mdadm instead of raid2 --luke)

- -

So what to do if you can't get your root RAID1 filesystem to boot? Here is a -straightforward way to get to your md0:

-
    -
    Find the 2.4 kernel install media from $DEBIAN/dists/unstable/main/disks-i386, -
    and download the bf2.4 set of disks. -
    You only need the rescue and root images. -
    Find the corresponding kernel-image-2.4.18-bf2.4_2.4.18-4_i386.deb or -
    similar; and unpack this somewhere with -

     

    -

    dpkg-deb -x kernel-image-2.4.yy-bf45.deb temp/

    -
    In the temp directory, find the md.o and raid1.o modules. -
    Copy them to a new floppy in /floppy/boot. -
    Copy /sbin/raid* to the root of the floppy disk (/floppy). You'll notice -
    that all the raid programs are symlinks to the same binary; doesn't matter, -
    since you probably have a vfat disk that doesn't know about symlinks. Just make -
    multiple copies. (Or be smart here and use an ext2 disk). -
    Boot with the rescue, then with the root disk -
    After choosing a language and keyboard from the installer, choose to preload -
    some modules. Grab that third disk you just put those modules and binaries on, -
    and put it in the floppy drive. -
    Load up md.o first, and then raid1.o. -
    Press Alt-F2 to get a text console. -

    -
    mount /floppy -
    cp /etc/raid* /sbin -

    -# (Ie: copy to the ramfs /sbin) -
    mkdir /etc/raid

    -cp /floppy/raidtab /etc/raid -

    -ln -s /etc/raid/raidtab /etc/raidtab -

    -raidstart /dev/md0

    -mount -t reiserfs /dev/md0 /target

-

- - -

-XII. Quick Reference

-

-DON'T JUST LOOK AT THIS QUICK REFERENCE. Understand the rest of the document.

-

-Quick Reference -- setting up bootable system on /dev/md0 using /dev/hda and /dev/hdc as RAID 1 component disks

-

-Verify RAID savvy Kernel. (1) You should see the RAID "personalities" your Kernel supports:

-

-cat /proc/mdstat

-

- dmesg|grep -i RAID -

-(This will show you if raid is compiled into kernel, or detected as a module from initrd.) -/etc/modules will not list RAID if Kernel has RAID compiled in instead of loaded as modules. -Use lsmod to list currently loaded modules this will show raid modules loaded. -

-(2) You should NOT see any RAID modules in /etc/modules (If you do, review step 2 of Procedure):

-

-cat /etc/modules

-

-Copy partitions hda to hdc: -

-sfdisk -d /dev/hda | sfdisk /dev/hdc

-

-Create array: -

-mdadm --create /dev/md0 --level=1 --raid-disks=2 missing /dev/hdc1

-

-Copy data: -

-cp -ax / /mnt/md0

-

-Example /etc/lilo.conf entry for 1 disk RAID device: -

-boot=/dev/hda
-image=/vmlinuz
-label=RAID
-read-only
-#our new root partition.
-root=/dev/md0

-

-Add second disk to array: -

-mdadm --add /dev/md0 /dev/hda1

-

-Example final /etc/lilo.conf entry: -

-boot=/dev/md0
-root=/dev/md0
-#this writes the boot signatures to either disk.
-raid-extra-boot=/dev/hda,/dev/hdc
-image=/vmlinuz
-label=RAID
-read-only

- - -

-Useful 'mdadm' commands

-

-Always zero the superblock of a device before adding it to a RAID device. Why? Because the disks decide what array they are in based on the disk-id information written on them. Zero the superblock first in case the disk was part of a previous RAID device. Also, if a partition was part of a previous RAID device, it appears to store the size of its previous partition in the signature. Zeroing the superblock before adding it to a new RAID device takes care of cleaning up that, too.

-

-Erase the MD superblock from a device: -

-mdadm --zero-superblock /dev/hdx

-

-Remove disk from array:

-

-mdadm --set-faulty /dev/md1 /dev/hda1
-mdadm --remove /dev/md1 /dev/hda1

-

-Replace failed disk or add disk to array: -

-mdadm --add /dev/md1 /dev/hda1

-

-(that will format the disk and copy the data from the existing disk to the new disk.)

-

-Create mdadm config file: -

-echo "DEVICE /dev/hda /dev/hdc" > /etc/mdadm/mdadm.conf
-mdadm --brief --detail --verbose /dev/md0 >> /etc/mdadm/mdadm.conf
-mdadm --brief --detail --verbose /dev/md1 >> /etc/mdadm/mdadm.conf

-

-To stop the array completely: -

-mdadm -S /dev/md0

- - - -

-XIII. Troubleshooting

-
The main problems people encounter are:
-
Kernel must have support for raid compiled in or loaded correctly in initrd.
-
You will actually have 2 configurations of raid. You boot to the failed raid volume,
-
then add in the original disk, then boot the final raid configuration.
- -
Performance is too slow:
-
See Performance Optimizations - - -

-XIV. Raid Disk Maintenance.

-
You need to configure raid to monitor for errors.
-
It will email you when it detects an error
-
Once a failed disk is detected, remove it and then add it back in.
-
Create an mdadm.conf file
-
See mdadm commands -
You can also configure hot spare, that will come online if a disk fails.
-
-

- Finish directions on smart monitoring and mdadm configuration to monitor disks,and hot spares. -

- - - -

-

-References

-

-RAID 1 Root HowTo PA-RISC
-http://www.pa-RISC-linux.org/faq/RAIDboot-howto.html

-

-Lilo RAID Configuration:
-http://lists.debian.org/debian-user/2003/debian-user-200309/msg04821.html

-

-Grub RAID Howto
-http://www.linuxsa.org.au/mailing-list/2003-07/1270.html

-

-Building a Software RAID System in Slackware 8.0
-http://slacksite.com/slackware/RAID.html

-

-Root-on-LVM-on-RAID HowTo
-http://www.midhgard.it/docs/lvm/html/install.disks.html

-

-Software RAID HowTo
-http://unthought.net/Software-RAID.HOWTO/Software-RAID.HOWTO.txt

-

-HowTo - Install Debian Onto a Remote Linux System
-http://trilldev.sourceforge.net/files/remotedeb.html

-

-Kernel Compilation Information and good getting started info for Debian
-http://newbiedoc.sourceforge.net

-

-Initrd information and Raid Disaster Recovery,

-

- -http://www.james.rcpt.to/programs/debian/raid1/

- - - - - - diff -Nru mdadm-2.6.7.1/sg_io.c mdadm-3.1.4/sg_io.c --- mdadm-2.6.7.1/sg_io.c 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/sg_io.c 2010-03-22 08:08:43.000000000 +0200 @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2007-2008 Intel Corporation + * + * Retrieve drive serial numbers for scsi disks + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include +#include +#include +#include + +int scsi_get_serial(int fd, void *buf, size_t buf_len) +{ + unsigned char inq_cmd[] = {INQUIRY, 1, 0x80, 0, buf_len, 0}; + unsigned char sense[32]; + struct sg_io_hdr io_hdr; + + memset(&io_hdr, 0, sizeof(io_hdr)); + io_hdr.interface_id = 'S'; + io_hdr.cmdp = inq_cmd; + io_hdr.cmd_len = sizeof(inq_cmd); + io_hdr.dxferp = buf; + io_hdr.dxfer_len = buf_len; + io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; + io_hdr.sbp = sense; + io_hdr.mx_sb_len = sizeof(sense); + io_hdr.timeout = 5000; + + return ioctl(fd, SG_IO, &io_hdr); +} diff -Nru mdadm-2.6.7.1/super0.c mdadm-3.1.4/super0.c --- mdadm-2.6.7.1/super0.c 2008-10-15 06:34:28.000000000 +0300 +++ mdadm-3.1.4/super0.c 2010-08-31 09:14:47.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,12 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #define HAVE_STDINT_H 1 @@ -53,7 +48,7 @@ } -void super0_swap_endian(struct mdp_superblock_s *sb) +static void super0_swap_endian(struct mdp_superblock_s *sb) { /* as super0 superblocks are host-endian, it is sometimes * useful to be able to swap the endianness @@ -90,10 +85,11 @@ mdp_super_t *sb = st->sb; time_t atime; int d; + int delta_extra = 0; char *c; printf(" Magic : %08x\n", sb->md_magic); - printf(" Version : %02d.%02d.%02d\n", sb->major_version, sb->minor_version, + printf(" Version : %d.%02d.%02d\n", sb->major_version, sb->minor_version, sb->patch_version); if (sb->minor_version >= 90) { printf(" UUID : %08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, @@ -117,7 +113,7 @@ printf(" Creation Time : %.24s\n", ctime(&atime)); c=map_num(pers, sb->level); printf(" Raid Level : %s\n", c?c:"-unknown-"); - if ((int)sb->level >= 0) { + if ((int)sb->level > 0) { int ddsks=0; printf(" Used Dev Size : %d%s\n", sb->size, human_size((long long)sb->size<<10)); @@ -140,10 +136,9 @@ printf(" Reshape pos'n : %llu%s\n", (unsigned long long)sb->reshape_position/2, human_size((long long)sb->reshape_position<<9)); if (sb->delta_disks) { printf(" Delta Devices : %d", sb->delta_disks); - if (sb->delta_disks) - printf(" (%d->%d)\n", sb->raid_disks-sb->delta_disks, sb->raid_disks); - else - printf(" (%d->%d)\n", sb->raid_disks, sb->raid_disks+sb->delta_disks); + printf(" (%d->%d)\n", sb->raid_disks-sb->delta_disks, sb->raid_disks); + if (((int)sb->delta_disks) < 0) + delta_extra = - sb->delta_disks; } if (sb->new_level != sb->level) { c = map_num(pers, 
sb->new_level); @@ -154,6 +149,10 @@ c = map_num(r5layout, sb->new_layout); printf(" New Layout : %s\n", c?c:"-unknown-"); } + if (sb->level == 6) { + c = map_num(r6layout, sb->new_layout); + printf(" New Layout : %s\n", c?c:"-unknown-"); + } if (sb->level == 10) { printf(" New Layout : near=%d, %s=%d\n", sb->new_layout&255, @@ -187,11 +186,14 @@ c = map_num(r5layout, sb->layout); printf(" Layout : %s\n", c?c:"-unknown-"); } + if (sb->level == 6) { + c = map_num(r6layout, sb->layout); + printf(" Layout : %s\n", c?c:"-unknown-"); + } if (sb->level == 10) { - printf(" Layout : near=%d, %s=%d\n", - sb->layout&255, - (sb->layout&0x10000)?"offset":"far", - (sb->layout>>8)&255); + printf(" Layout :"); + print_r10_layout(sb->layout); + printf("\n"); } switch(sb->level) { case 0: @@ -208,7 +210,7 @@ } printf("\n"); printf(" Number Major Minor RaidDevice State\n"); - for (d= -1; d<(signed int)(sb->raid_disks+sb->spare_disks); d++) { + for (d= -1; d<(signed int)(sb->raid_disks+delta_extra + sb->spare_disks); d++) { mdp_disk_t *dp; char *dv; char nb[5]; @@ -233,7 +235,7 @@ } } -static void brief_examine_super0(struct supertype *st) +static void brief_examine_super0(struct supertype *st, int verbose) { mdp_super_t *sb = st->sb; char *c=map_num(pers, sb->level); @@ -241,14 +243,18 @@ sprintf(devname, "/dev/md%d", sb->md_minor); - printf("ARRAY %s level=%s num-devices=%d UUID=", - devname, - c?c:"-unknown-", sb->raid_disks); + if (verbose) { + printf("ARRAY %s level=%s num-devices=%d", + devname, + c?c:"-unknown-", sb->raid_disks); + } else + printf("ARRAY %s", devname); + if (sb->minor_version >= 90) - printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, + printf(" UUID=%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3); else - printf("%08x", sb->set_uuid0); + printf(" UUID=%08x", sb->set_uuid0); printf("\n"); } @@ -301,27 +307,19 @@ else printf("%08x", sb->set_uuid0); } - -static void export_detail_super0(struct supertype *st) -{ - 
mdp_super_t *sb = st->sb; - printf("MD_UUID="); - if (sb->minor_version >= 90) - printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, - sb->set_uuid2, sb->set_uuid3); - else - printf("%08x", sb->set_uuid0); - printf("\n"); -} #endif static int match_home0(struct supertype *st, char *homehost) { mdp_super_t *sb = st->sb; char buf[20]; - char *hash = sha1_buffer(homehost, - strlen(homehost), - buf); + char *hash; + + if (!homehost) + return 0; + hash = sha1_buffer(homehost, + strlen(homehost), + buf); return (memcmp(&sb->set_uuid2, hash, 8)==0); } @@ -369,8 +367,12 @@ info->events = md_event(sb); info->data_offset = 0; + sprintf(info->text_version, "0.%d", sb->minor_version); + info->safe_mode_delay = 200; + uuid_from_super0(st, info->uuid); + info->recovery_start = MaxSector; if (sb->minor_version > 90 && (sb->reshape_position+1) != 0) { info->reshape_active = 1; info->reshape_progress = sb->reshape_position; @@ -378,6 +380,8 @@ info->delta_disks = sb->delta_disks; info->new_layout = sb->new_layout; info->new_chunk = sb->new_chunk; + if (info->delta_disks < 0) + info->array.raid_disks -= info->delta_disks; } else info->reshape_active = 0; @@ -385,7 +389,7 @@ /* work_disks is calculated rather than read directly */ for (i=0; i < MD_SB_DISKS; i++) if ((sb->disks[i].state & (1<disks[i].raid_disk < info->array.raid_disks) && + (sb->disks[i].raid_disk < (unsigned)info->array.raid_disks) && (sb->disks[i].state & (1<disks[i].state & (1<array.md_minor); } if (strcmp(update, "summaries") == 0) { - int i; + unsigned int i; /* set nr_disks, active_disks, working_disks, * failed_disks, spare_disks based on disks[] * array in superblock. 
@@ -475,8 +479,15 @@ if (strcmp(update, "assemble")==0) { int d = info->disk.number; int wonly = sb->disks[d].state & (1<disks[d].state & ~(1<disk.state) { + int mask = (1<minor_version >= 91) + /* During reshape we don't insist on everything + * being marked 'sync' + */ + add = (1<disks[d].state & ~mask) | add) + != (unsigned)info->disk.state) { sb->disks[d].state = info->disk.state | wonly; rv = 1; } @@ -552,12 +563,18 @@ unsigned long long size, char *ignored_name, char *homehost, int *uuid) { - mdp_super_t *sb = malloc(MD_SB_BYTES + sizeof(bitmap_super_t)); + mdp_super_t *sb; int spares; + + if (posix_memalign((void**)&sb, 4096, + MD_SB_BYTES + ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) { + fprintf(stderr, Name ": %s could not allocate superblock\n", __func__); + return 0; + } memset(sb, 0, MD_SB_BYTES + sizeof(bitmap_super_t)); st->sb = sb; - if (info->major_version == -1) { + if (info == NULL) { /* zeroing the superblock */ return 0; } @@ -576,7 +593,7 @@ sb->gvalid_words = 0; /* ignored */ sb->ctime = time(0); sb->level = info->level; - if (size != info->size) + if (size != (unsigned long long)info->size) return 0; sb->size = info->size; sb->nr_disks = info->nr_disks; @@ -623,18 +640,44 @@ return 1; } +struct devinfo { + int fd; + char *devname; + mdu_disk_info_t disk; + struct devinfo *next; +}; + +#ifndef MDASSEMBLE /* Add a device to the superblock being created */ -static void add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo) +static int add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo, + int fd, char *devname) { mdp_super_t *sb = st->sb; mdp_disk_t *dk = &sb->disks[dinfo->number]; + struct devinfo *di, **dip; dk->number = dinfo->number; dk->major = dinfo->major; dk->minor = dinfo->minor; dk->raid_disk = dinfo->raid_disk; dk->state = dinfo->state; + + sb->this_disk = sb->disks[dinfo->number]; + sb->sb_csum = calc_sb0_csum(sb); + + dip = (struct devinfo **)&st->info; + while (*dip) + dip = &(*dip)->next; + di = malloc(sizeof(struct 
devinfo)); + di->fd = fd; + di->devname = devname; + di->disk = *dinfo; + di->next = NULL; + *dip = di; + + return 0; } +#endif static int store_super0(struct supertype *st, int fd) { @@ -661,7 +704,8 @@ if (super->state & (1<magic) == BITMAP_MAGIC) - if (write(fd, bm, sizeof(*bm)) != sizeof(*bm)) + if (write(fd, bm, ROUND_UP(sizeof(*bm),4096)) != + ROUND_UP(sizeof(*bm),4096)) return 5; } @@ -669,32 +713,41 @@ return 0; } -static int write_init_super0(struct supertype *st, - mdu_disk_info_t *dinfo, char *devname) +#ifndef MDASSEMBLE +static int write_init_super0(struct supertype *st) { mdp_super_t *sb = st->sb; - int fd = open(devname, O_RDWR|O_EXCL); - int rv; - - if (fd < 0) { - fprintf(stderr, Name ": Failed to open %s to write superblock\n", devname); - return -1; - } - - sb->disks[dinfo->number].state &= ~(1<this_disk = sb->disks[dinfo->number]; - sb->sb_csum = calc_sb0_csum(sb); - rv = store_super0(st, fd); + int rv = 0; + struct devinfo *di; - if (rv == 0 && (sb->state & (1<ss->write_bitmap(st, fd); + for (di = st->info ; di && ! 
rv ; di = di->next) { - close(fd); - if (rv) - fprintf(stderr, Name ": failed to write superblock to %s\n", devname); + if (di->disk.state == 1) + continue; + if (di->fd == -1) + continue; + while (Kill(di->devname, NULL, 0, 1, 1) == 0) + ; + + sb->disks[di->disk.number].state &= ~(1<this_disk = sb->disks[di->disk.number]; + sb->sb_csum = calc_sb0_csum(sb); + rv = store_super0(st, di->fd); + + if (rv == 0 && (sb->state & (1<ss->write_bitmap(st, di->fd); + + if (rv) + fprintf(stderr, + Name ": failed to write superblock to %s\n", + di->devname); + close(di->fd); + di->fd = -1; + } return rv; } +#endif static int compare_super0(struct supertype *st, struct supertype *tst) { @@ -712,7 +765,13 @@ if (second->md_magic != MD_SB_MAGIC) return 1; if (!first) { - first = malloc(MD_SB_BYTES + sizeof(struct bitmap_super_s)); + if (posix_memalign((void**)&first, 4096, + MD_SB_BYTES + + ROUND_UP(sizeof(struct bitmap_super_s), 4096)) != 0) { + fprintf(stderr, Name + ": %s could not allocate superblock\n", __func__); + return 1; + } memcpy(first, second, MD_SB_BYTES + sizeof(struct bitmap_super_s)); st->sb = first; return 0; @@ -754,6 +813,9 @@ free_super0(st); + if (st->subarray[0]) + return 1; + if (!get_dev_size(fd, devname, &dsize)) return 1; @@ -778,7 +840,13 @@ return 1; } - super = malloc(MD_SB_BYTES + sizeof(bitmap_super_t)); + if (posix_memalign((void**)&super, 4096, + MD_SB_BYTES + + ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) { + fprintf(stderr, Name + ": %s could not allocate superblock\n", __func__); + return 1; + } if (read(fd, super, sizeof(*super)) != MD_SB_BYTES) { if (devname) @@ -812,6 +880,7 @@ st->ss = &super0; st->minor_version = super->minor_version; st->max_devs = MD_SB_DISKS; + st->info = NULL; } /* Now check on the bitmap superblock */ @@ -821,8 +890,8 @@ * valid. If it doesn't clear the bit. An --assemble --force * should get that written out. 
*/ - if (read(fd, super+1, sizeof(struct bitmap_super_s)) - != sizeof(struct bitmap_super_s)) + if (read(fd, super+1, ROUND_UP(sizeof(struct bitmap_super_s),4096)) + != ROUND_UP(sizeof(struct bitmap_super_s),4096)) goto no_bitmap; uuid_from_super0(st, uuid); @@ -843,18 +912,28 @@ struct supertype *st = malloc(sizeof(*st)); if (!st) return st; + memset(st, 0, sizeof(*st)); st->ss = &super0; + st->info = NULL; st->minor_version = 90; st->max_devs = MD_SB_DISKS; st->sb = NULL; + /* we sometimes get 00.90 */ + while (arg[0] == '0' && arg[1] == '0') + arg++; if (strcmp(arg, "0") == 0 || - strcmp(arg, "0.90") == 0 || - strcmp(arg, "0.91") == 0 || +#ifdef DEFAULT_OLD_METADATA /* ifndef in super1.c */ strcmp(arg, "default") == 0 || - strcmp(arg, "") == 0 /* no metadata */ +#endif /* DEFAULT_OLD_METADATA */ + strcmp(arg, "0.90") == 0 || + strcmp(arg, "") == 0 /* no metadata - i.e. non_persistent */ ) return st; + st->minor_version = 91; /* reshape in progress */ + if (strcmp(arg, "0.91") == 0) /* For dup_super support */ + return st; + st->minor_version = 9; /* flag for 'byte-swapped' */ if (strcmp(arg, "0.swap")==0 || strcmp(arg, "0.9") == 0) /* For dup_super support */ @@ -883,7 +962,7 @@ * size is in sectors, chunk is in bytes !!! 
*/ unsigned long long bits; - unsigned long long max_bits = 60*1024*8; + unsigned long long max_bits = (60*1024 - sizeof(bitmap_super_t))*8; unsigned long long min_chunk; int chunk = *chunkp; mdp_super_t *sb = st->sb; @@ -896,9 +975,14 @@ min_chunk *= 2; bits = (bits+1)/2; } - if (chunk == UnSet) + if (chunk == UnSet) { + /* A chunk size less than a few Megabytes gives poor + * performance without increasing resync noticeably + */ chunk = min_chunk; - else if (chunk < min_chunk) + if (chunk < 64*1024*1024) + chunk = 64*1024*1024; + } else if ((unsigned long long)chunk < min_chunk) return 0; /* chunk size too small */ sb->state |= (1< 0) { n = towrite; - if (n > sizeof(buf)) - n = sizeof(buf); + if (n > 4096) + n = 4096; n = write(fd, buf, n); if (n > 0) towrite -= n; else break; + memset(buf, 0xff, 4096); } fsync(fd); if (towrite) @@ -991,6 +1074,58 @@ st->sb = NULL; } +#ifndef MDASSEMBLE +static int validate_geometry0(struct supertype *st, int level, + int layout, int raiddisks, + int chunk, unsigned long long size, + char *subdev, unsigned long long *freesize, + int verbose) +{ + unsigned long long ldsize; + int fd; + + if (level == LEVEL_CONTAINER) { + if (verbose) + fprintf(stderr, Name ": 0.90 metadata does not support containers\n"); + return 0; + } + if (raiddisks > MD_SB_DISKS) { + if (verbose) + fprintf(stderr, Name ": 0.90 metadata supports at most %d devices per array\n", + MD_SB_DISKS); + return 0; + } + if (size > (0x7fffffffULL<<9)) { + if (verbose) + fprintf(stderr, Name ": 0.90 metadata supports at most 2 terrabytes per device\n"); + return 0; + } + if (!subdev) + return 1; + + fd = open(subdev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + fprintf(stderr, Name ": super0.90 cannot open %s: %s\n", + subdev, strerror(errno)); + return 0; + } + + if (!get_dev_size(fd, subdev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + if (ldsize < MD_RESERVED_SECTORS * 512) + return 0; + if (size > (0x7fffffffULL<<9)) + return 0; + *freesize = 
MD_NEW_SIZE_SECTORS(ldsize >> 9); + return 1; +} +#endif /* MDASSEMBLE */ + struct superswitch super0 = { #ifndef MDASSEMBLE .examine_super = examine_super0, @@ -998,16 +1133,16 @@ .export_examine_super = export_examine_super0, .detail_super = detail_super0, .brief_detail_super = brief_detail_super0, - .export_detail_super = export_detail_super0, + .write_init_super = write_init_super0, + .validate_geometry = validate_geometry0, + .add_to_super = add_to_super0, #endif .match_home = match_home0, .uuid_from_super = uuid_from_super0, .getinfo_super = getinfo_super0, .update_super = update_super0, .init_super = init_super0, - .add_to_super = add_to_super0, .store_super = store_super0, - .write_init_super = write_init_super0, .compare_super = compare_super0, .load_super = load_super0, .match_metadata_desc = match_metadata_desc0, @@ -1016,6 +1151,5 @@ .locate_bitmap = locate_bitmap0, .write_bitmap = write_bitmap0, .free_super = free_super0, - .major = 0, - .swapuuid = 0, + .name = "0.90", }; diff -Nru mdadm-2.6.7.1/super1.c mdadm-3.1.4/super1.c --- mdadm-2.6.7.1/super1.c 2008-10-15 08:29:37.000000000 +0300 +++ mdadm-3.1.4/super1.c 2010-08-31 09:14:47.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,12 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" @@ -81,8 +76,8 @@ __u64 utime; /* 40 bits second, 24 btes microseconds */ __u64 events; /* incremented when superblock updated */ __u64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ - __u32 sb_csum; /* checksum upto devs[max_dev] */ - __u32 max_dev; /* size of devs[] array to consider */ + __u32 sb_csum; /* checksum upto dev_roles[max_dev] */ + __u32 max_dev; /* size of dev_roles[] array to consider */ __u8 pad3[64-32]; /* set to 0 when writing */ /* device state information. Indexed by dev_number. @@ -141,13 +136,72 @@ return __cpu_to_le32(csum); } +static char abuf[4096+4096]; +static int aread(int fd, void *buf, int len) +{ + /* aligned read. + * On devices with a 4K sector size, we need to read + * the full sector and copy relevant bits into + * the buffer + */ + int bsize; + char *b; + int n; + if (ioctl(fd, BLKSSZGET, &bsize) != 0 || + bsize <= len) + return read(fd, buf, len); + if (bsize > 4096) + return -1; + b = (char*)(((long)(abuf+4096))&~4095UL); + + n = read(fd, b, bsize); + if (n <= 0) + return n; + lseek(fd, len - n, 1); + if (n > len) + n = len; + memcpy(buf, b, n); + return n; +} + +static int awrite(int fd, void *buf, int len) +{ + /* aligned write. + * On devices with a 4K sector size, we need to write + * the full sector. We pre-read if the sector is larger + * than the write. + * The address must be sector-aligned. 
+ */ + int bsize; + char *b; + int n; + if (ioctl(fd, BLKSSZGET, &bsize) != 0 || + bsize <= len) + return write(fd, buf, len); + if (bsize > 4096) + return -1; + b = (char*)(((long)(abuf+4096))&~4095UL); + + n = read(fd, b, bsize); + if (n <= 0) + return n; + lseek(fd, -n, 1); + memcpy(b, buf, len); + n = write(fd, b, bsize); + if (n <= 0) + return n; + lseek(fd, len - n, 1); + return len; +} + #ifndef MDASSEMBLE static void examine_super1(struct supertype *st, char *homehost) { struct mdp_superblock_1 *sb = st->sb; time_t atime; - int d; - int faulty; + unsigned int d; + int role; + int delta_extra = 0; int i; char *c; int l = homehost ? strlen(homehost) : 0; @@ -185,7 +239,7 @@ printf(" Avail Dev Size : %llu%s\n", (unsigned long long)__le64_to_cpu(sb->data_size), human_size(__le64_to_cpu(sb->data_size)<<9)); - if (__le32_to_cpu(sb->level) >= 0) { + if (__le32_to_cpu(sb->level) > 0) { int ddsks=0; switch(__le32_to_cpu(sb->level)) { case 1: ddsks=1;break; @@ -230,13 +284,11 @@ human_size(__le64_to_cpu(sb->reshape_position)<<9)); if (__le32_to_cpu(sb->delta_disks)) { printf(" Delta Devices : %d", __le32_to_cpu(sb->delta_disks)); - if (__le32_to_cpu(sb->delta_disks)) - printf(" (%d->%d)\n", - __le32_to_cpu(sb->raid_disks)-__le32_to_cpu(sb->delta_disks), - __le32_to_cpu(sb->raid_disks)); - else - printf(" (%d->%d)\n", __le32_to_cpu(sb->raid_disks), - __le32_to_cpu(sb->raid_disks)+__le32_to_cpu(sb->delta_disks)); + printf(" (%d->%d)\n", + __le32_to_cpu(sb->raid_disks)-__le32_to_cpu(sb->delta_disks), + __le32_to_cpu(sb->raid_disks)); + if ((int)__le32_to_cpu(sb->delta_disks) < 0) + delta_extra = -__le32_to_cpu(sb->delta_disks); } if (__le32_to_cpu(sb->new_level) != __le32_to_cpu(sb->level)) { c = map_num(pers, __le32_to_cpu(sb->new_level)); @@ -247,11 +299,14 @@ c = map_num(r5layout, __le32_to_cpu(sb->new_layout)); printf(" New Layout : %s\n", c?c:"-unknown-"); } + if (__le32_to_cpu(sb->level) == 6) { + c = map_num(r6layout, __le32_to_cpu(sb->new_layout)); + printf(" 
New Layout : %s\n", c?c:"-unknown-"); + } if (__le32_to_cpu(sb->level) == 10) { - printf(" New Layout : near=%d, %s=%d\n", - __le32_to_cpu(sb->new_layout)&255, - (__le32_to_cpu(sb->new_layout)&0x10000)?"offset":"far", - (__le32_to_cpu(sb->new_layout)>>8)&255); + printf(" New Layout :"); + print_r10_layout(__le32_to_cpu(sb->new_layout)); + printf("\n"); } } if (__le32_to_cpu(sb->new_chunk) != __le32_to_cpu(sb->chunksize)) @@ -279,12 +334,15 @@ c = map_num(r5layout, __le32_to_cpu(sb->layout)); printf(" Layout : %s\n", c?c:"-unknown-"); } + if (__le32_to_cpu(sb->level) == 6) { + c = map_num(r6layout, __le32_to_cpu(sb->layout)); + printf(" Layout : %s\n", c?c:"-unknown-"); + } if (__le32_to_cpu(sb->level) == 10) { int lo = __le32_to_cpu(sb->layout); - printf(" Layout : near=%d, %s=%d\n", - lo&255, - (lo&0x10000)?"offset":"far", - (lo>>8)&255); + printf(" Layout :"); + print_r10_layout(lo); + printf("\n"); } switch(__le32_to_cpu(sb->level)) { case 0: @@ -300,6 +358,8 @@ default: break; } printf("\n"); +#if 0 + /* This turns out to just be confusing */ printf(" Array Slot : %d (", __le32_to_cpu(sb->dev_number)); for (i= __le32_to_cpu(sb->max_dev); i> 0 ; i--) if (__le16_to_cpu(sb->dev_roles[i-1]) != 0xffff) @@ -312,13 +372,25 @@ else printf("%d", role); } printf(")\n"); +#endif + printf(" Device Role : "); + d = __le32_to_cpu(sb->dev_number); + if (d < __le32_to_cpu(sb->max_dev)) + role = __le16_to_cpu(sb->dev_roles[d]); + else + role = 0xFFFF; + if (role >= 0xFFFE) + printf("spare\n"); + else + printf("Active device %d\n", role); + printf(" Array State : "); - for (d=0; d<__le32_to_cpu(sb->raid_disks); d++) { + for (d=0; d<__le32_to_cpu(sb->raid_disks) + delta_extra; d++) { int cnt = 0; int me = 0; - int i; + unsigned int i; for (i=0; i< __le32_to_cpu(sb->max_dev); i++) { - int role = __le16_to_cpu(sb->dev_roles[i]); + unsigned int role = __le16_to_cpu(sb->dev_roles[i]); if (role == d) { if (i == __le32_to_cpu(sb->dev_number)) me = 1; @@ -326,10 +398,11 @@ } } if (cnt > 
1) printf("?"); - else if (cnt == 1 && me) printf("U"); - else if (cnt == 1) printf("u"); - else printf ("_"); + else if (cnt == 1) printf("A"); + else printf ("."); } +#if 0 + /* This is confusing too */ faulty = 0; for (i=0; i< __le32_to_cpu(sb->max_dev); i++) { int role = __le16_to_cpu(sb->dev_roles[i]); @@ -337,11 +410,13 @@ faulty++; } if (faulty) printf(" %d failed", faulty); +#endif + printf(" ('A' == active, '.' == missing)"); printf("\n"); } -static void brief_examine_super1(struct supertype *st) +static void brief_examine_super1(struct supertype *st, int verbose) { struct mdp_superblock_1 *sb = st->sb; int i; @@ -355,17 +430,21 @@ else if (sb->set_name[0]) nm = sb->set_name; else - nm = "??"; + nm = NULL; - printf("ARRAY /dev/md/%s level=%s ", nm, c?c:"-unknown-"); + printf("ARRAY%s%s", nm ? " /dev/md/":"", nm); + if (verbose && c) + printf(" level=%s", c); sb_offset = __le64_to_cpu(sb->super_offset); if (sb_offset <= 4) - printf("metadata=1.1 "); + printf(" metadata=1.1 "); else if (sb_offset <= 8) - printf("metadata=1.2 "); + printf(" metadata=1.2 "); else - printf("metadata=1.0 "); - printf("num-devices=%d UUID=", __le32_to_cpu(sb->raid_disks)); + printf(" metadata=1.0 "); + if (verbose) + printf("num-devices=%d ", __le32_to_cpu(sb->raid_disks)); + printf("UUID="); for (i=0; i<16; i++) { if ((i&3)==0 && i != 0) printf(":"); printf("%02x", sb->set_uuid[i]); @@ -456,12 +535,6 @@ } if (len) printf("MD_NAME=%.*s\n", len, sb->set_name); - printf("MD_UUID="); - for (i=0; i<16; i++) { - if ((i&3)==0 && i != 0) printf(":"); - printf("%02x", sb->set_uuid[i]); - } - printf("\n"); } #endif @@ -489,11 +562,11 @@ { struct mdp_superblock_1 *sb = st->sb; int working = 0; - int i; + unsigned int i; int role; info->array.major_version = 1; - info->array.minor_version = __le32_to_cpu(sb->feature_map); + info->array.minor_version = st->minor_version; info->array.patch_version = 0; info->array.raid_disks = __le32_to_cpu(sb->raid_disks); info->array.level = 
__le32_to_cpu(sb->level); @@ -521,7 +594,7 @@ info->disk.raid_disk = -1; switch(role) { case 0xFFFF: - info->disk.state = 2; /* spare: ACTIVE, not sync, not faulty */ + info->disk.state = 0; /* spare: not active, not sync, not faulty */ break; case 0xFFFE: info->disk.state = 1; /* faulty */ @@ -531,12 +604,19 @@ info->disk.raid_disk = role; } info->events = __le64_to_cpu(sb->events); + sprintf(info->text_version, "1.%d", st->minor_version); + info->safe_mode_delay = 200; memcpy(info->uuid, sb->set_uuid, 16); strncpy(info->name, sb->set_name, 32); info->name[32] = 0; + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_RECOVERY_OFFSET)) + info->recovery_start = __le32_to_cpu(sb->recovery_offset); + else + info->recovery_start = MaxSector; + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE)) { info->reshape_active = 1; info->reshape_progress = __le64_to_cpu(sb->reshape_position); @@ -544,10 +624,12 @@ info->delta_disks = __le32_to_cpu(sb->delta_disks); info->new_layout = __le32_to_cpu(sb->new_layout); info->new_chunk = __le32_to_cpu(sb->new_chunk)<<9; + if (info->delta_disks < 0) + info->array.raid_disks -= info->delta_disks; } else info->reshape_active = 0; - for (i=0; i< __le32_to_cpu(sb->max_dev); i++) { + for (i = 0; i < __le32_to_cpu(sb->max_dev); i++) { role = __le16_to_cpu(sb->dev_roles[i]); if (/*role == 0xFFFF || */role < info->array.raid_disks) working++; @@ -582,9 +664,9 @@ switch(__le32_to_cpu(sb->level)) { case 5: case 4: case 6: /* need to force clean */ - if (sb->resync_offset != ~0ULL) + if (sb->resync_offset != MaxSector) rv = 1; - sb->resync_offset = ~0ULL; + sb->resync_offset = MaxSector; } } if (strcmp(update, "assemble")==0) { @@ -600,9 +682,9 @@ } } if (strcmp(update, "linear-grow-new") == 0) { - int i; - int rfd; - int max = __le32_to_cpu(sb->max_dev); + unsigned int i; + int rfd, fd; + unsigned int max = __le32_to_cpu(sb->max_dev); for (i=0 ; i < max ; i++) if (__le16_to_cpu(sb->dev_roles[i]) >= 0xfffe) @@ -614,14 +696,33 @@ if ((rfd 
= open("/dev/urandom", O_RDONLY)) < 0 || read(rfd, sb->device_uuid, 16) != 16) { - *(__u32*)(sb->device_uuid) = random(); - *(__u32*)(sb->device_uuid+4) = random(); - *(__u32*)(sb->device_uuid+8) = random(); - *(__u32*)(sb->device_uuid+12) = random(); + __u32 r[4] = {random(), random(), random(), random()}; + memcpy(sb->device_uuid, r, 16); } + if (rfd >= 0) + close(rfd); sb->dev_roles[i] = __cpu_to_le16(info->disk.raid_disk); + + fd = open(devname, O_RDONLY); + if (fd >= 0) { + unsigned long long ds; + get_dev_size(fd, devname, &ds); + close(fd); + ds >>= 9; + if (__le64_to_cpu(sb->super_offset) < + __le64_to_cpu(sb->data_offset)) { + sb->data_size = __cpu_to_le64( + ds - __le64_to_cpu(sb->data_offset)); + } else { + ds -= 8*2; + ds &= ~(unsigned long long)(4*2-1); + sb->super_offset = __cpu_to_le64(ds); + sb->data_size = __cpu_to_le64( + ds - __le64_to_cpu(sb->data_offset)); + } + } } if (strcmp(update, "linear-grow-update") == 0) { sb->raid_disks = __cpu_to_le32(info->array.raid_disks); @@ -670,7 +771,7 @@ __le64_to_cpu(sb->data_offset)) { /* set data_size to device size less data_offset */ struct misc_dev_info *misc = (struct misc_dev_info*) - (st->sb + 1024 + sizeof(struct bitmap_super_s)); + (st->sb + 1024 + 512); printf("Size was %llu\n", (unsigned long long) __le64_to_cpu(sb->data_size)); sb->data_size = __cpu_to_le64( @@ -688,15 +789,21 @@ static int init_super1(struct supertype *st, mdu_array_info_t *info, unsigned long long size, char *name, char *homehost, int *uuid) { - struct mdp_superblock_1 *sb = malloc(1024 + sizeof(bitmap_super_t) + - sizeof(struct misc_dev_info)); + struct mdp_superblock_1 *sb; int spares; int rfd; char defname[10]; + + if (posix_memalign((void**)&sb, 512, (1024 + 512 + + sizeof(struct misc_dev_info))) != 0) { + fprintf(stderr, Name + ": %s could not allocate superblock\n", __func__); + return 0; + } memset(sb, 0, 1024); st->sb = sb; - if (info->major_version == -1) { + if (info == NULL) { /* zeroing superblock */ return 0; } @@ 
-718,10 +825,8 @@ else { if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || read(rfd, sb->set_uuid, 16) != 16) { - *(__u32*)(sb->set_uuid) = random(); - *(__u32*)(sb->set_uuid+4) = random(); - *(__u32*)(sb->set_uuid+8) = random(); - *(__u32*)(sb->set_uuid+12) = random(); + __u32 r[4] = {random(), random(), random(), random()}; + memcpy(sb->set_uuid, r, 16); } if (rfd >= 0) close(rfd); } @@ -755,7 +860,7 @@ sb->utime = sb->ctime; sb->events = __cpu_to_le64(1); if (info->state & (1<resync_offset = ~0ULL; + sb->resync_offset = MaxSector; else sb->resync_offset = 0; sb->max_dev = __cpu_to_le32((1024- sizeof(struct mdp_superblock_1))/ @@ -767,18 +872,48 @@ return 1; } +struct devinfo { + int fd; + char *devname; + mdu_disk_info_t disk; + struct devinfo *next; +}; +#ifndef MDASSEMBLE /* Add a device to the superblock being created */ -static void add_to_super1(struct supertype *st, mdu_disk_info_t *dk) +static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname) { struct mdp_superblock_1 *sb = st->sb; __u16 *rp = sb->dev_roles + dk->number; + struct devinfo *di, **dip; + if ((dk->state & 6) == 6) /* active, sync */ *rp = __cpu_to_le16(dk->raid_disk); else if ((dk->state & ~2) == 0) /* active or idle -> spare */ *rp = 0xffff; else *rp = 0xfffe; + + if (dk->number >= (int)__le32_to_cpu(sb->max_dev) && + __le32_to_cpu(sb->max_dev) < 384) + sb->max_dev = __cpu_to_le32(dk->number+1); + + sb->dev_number = __cpu_to_le32(dk->number); + sb->sb_csum = calc_sb_1_csum(sb); + + dip = (struct devinfo **)&st->info; + while (*dip) + dip = &(*dip)->next; + di = malloc(sizeof(struct devinfo)); + di->fd = fd; + di->devname = devname; + di->disk = *dk; + di->next = NULL; + *dip = di; + + return 0; } +#endif static void locate_bitmap1(struct supertype *st, int fd); @@ -834,8 +969,9 @@ return 3; sbsize = sizeof(*sb) + 2 * __le32_to_cpu(sb->max_dev); + sbsize = (sbsize+511)&(~511UL); - if (write(fd, sb, sbsize) != sbsize) + if (awrite(fd, sb, sbsize) != sbsize) 
return 4; if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { @@ -843,7 +979,8 @@ (((char*)sb)+1024); if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) { locate_bitmap1(st, fd); - if (write(fd, bm, sizeof(*bm)) != sizeof(*bm)) + if (awrite(fd, bm, sizeof(*bm)) != + sizeof(*bm)) return 5; } } @@ -857,6 +994,8 @@ { /* if the device is bigger than 8Gig, save 64k for bitmap usage, * if bigger than 200Gig, save 128k + * NOTE: result must be multiple of 4K else bad things happen + * on 4K-sector devices. */ if (devsize < 64*2) return 0; if (devsize - 64*2 >= 200*1024*1024*2) @@ -866,123 +1005,150 @@ return 4*2; } -static int write_init_super1(struct supertype *st, - mdu_disk_info_t *dinfo, char *devname) +#ifndef MDASSEMBLE +static int write_init_super1(struct supertype *st) { struct mdp_superblock_1 *sb = st->sb; struct supertype refst; - int fd = open(devname, O_RDWR | O_EXCL); int rfd; - int rv; - int bm_space; - + int rv = 0; + unsigned long long bm_space; + unsigned long long reserved; + struct devinfo *di; unsigned long long dsize, array_size; - long long sb_offset; + unsigned long long sb_offset; + for (di = st->info; di && ! 
rv ; di = di->next) { + if (di->disk.state == 1) + continue; + if (di->fd < 0) + continue; + + while (Kill(di->devname, NULL, 0, 1, 1) == 0) + ; + + sb->dev_number = __cpu_to_le32(di->disk.number); + if (di->disk.state & (1<devflags |= __cpu_to_le32(WriteMostly1); - if (fd < 0) { - fprintf(stderr, Name ": Failed to open %s to write superblock\n", - devname); - return -1; - } - - sb->dev_number = __cpu_to_le32(dinfo->number); - if (dinfo->state & (1<devflags |= __cpu_to_le32(WriteMostly1); - - if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || - read(rfd, sb->device_uuid, 16) != 16) { - *(__u32*)(sb->device_uuid) = random(); - *(__u32*)(sb->device_uuid+4) = random(); - *(__u32*)(sb->device_uuid+8) = random(); - *(__u32*)(sb->device_uuid+12) = random(); - } - if (rfd >= 0) close(rfd); - sb->events = 0; - - refst =*st; - refst.sb = NULL; - if (load_super1(&refst, fd, NULL)==0) { - struct mdp_superblock_1 *refsb = refst.sb; - - memcpy(sb->device_uuid, refsb->device_uuid, 16); - if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) { - /* same array, so preserve events and dev_number */ - sb->events = refsb->events; - /* bugs in 2.6.17 and earlier mean the dev_number - * chosen in Manage must be preserved - */ - if (get_linux_version() >= 2006018) - sb->dev_number = refsb->dev_number; + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, sb->device_uuid, 16) != 16) { + __u32 r[4] = {random(), random(), random(), random()}; + memcpy(sb->device_uuid, r, 16); } - free(refsb); - } + if (rfd >= 0) + close(rfd); - if (!get_dev_size(fd, NULL, &dsize)) - return 1; - dsize >>= 9; + sb->events = 0; - if (dsize < 24) { - close(fd); - return 2; - } + refst =*st; + refst.sb = NULL; + if (load_super1(&refst, di->fd, NULL)==0) { + struct mdp_superblock_1 *refsb = refst.sb; + + memcpy(sb->device_uuid, refsb->device_uuid, 16); + if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) { + /* same array, so preserve events and + * dev_number */ + sb->events = refsb->events; + /* bugs in 
2.6.17 and earlier mean the + * dev_number chosen in Manage must be preserved + */ + if (get_linux_version() >= 2006018) + sb->dev_number = refsb->dev_number; + } + free(refsb); + } + if (!get_dev_size(di->fd, NULL, &dsize)) + return 1; + dsize >>= 9; + + if (dsize < 24) { + close(di->fd); + return 2; + } - /* - * Calculate the position of the superblock. - * It is always aligned to a 4K boundary and - * depending on minor_version, it can be: - * 0: At least 8K, but less than 12K, from end of device - * 1: At start of device - * 2: 4K from start of device. - * Depending on the array size, we might leave extra space - * for a bitmap. - */ - array_size = __le64_to_cpu(sb->size); - /* work out how much space we left for a bitmap */ - bm_space = choose_bm_space(array_size); - switch(st->minor_version) { - case 0: - sb_offset = dsize; - sb_offset -= 8*2; - sb_offset &= ~(4*2-1); - sb->super_offset = __cpu_to_le64(sb_offset); - sb->data_offset = __cpu_to_le64(0); - if (sb_offset - bm_space < array_size) - bm_space = sb_offset - array_size; - sb->data_size = __cpu_to_le64(sb_offset - bm_space); - break; - case 1: - sb->super_offset = __cpu_to_le64(0); - if (4*2 + bm_space + __le64_to_cpu(sb->size) > dsize) - bm_space = dsize - __le64_to_cpu(sb->size) - 4*2; - sb->data_offset = __cpu_to_le64(bm_space + 4*2); - sb->data_size = __cpu_to_le64(dsize - bm_space - 4*2); - break; - case 2: - sb_offset = 4*2; - sb->super_offset = __cpu_to_le64(4*2); - if (4*2 + 4*2 + bm_space + __le64_to_cpu(sb->size) > dsize) - bm_space = dsize - __le64_to_cpu(sb->size) - 4*2 - 4*2; - sb->data_offset = __cpu_to_le64(4*2 + 4*2 + bm_space); - sb->data_size = __cpu_to_le64(dsize - 4*2 - 4*2 - bm_space ); - break; - default: - return -EINVAL; - } + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. 
+ * Depending on the array size, we might leave extra space + * for a bitmap. + */ + array_size = __le64_to_cpu(sb->size); + /* work out how much space we left for a bitmap */ + bm_space = choose_bm_space(array_size); + + switch(st->minor_version) { + case 0: + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + sb->super_offset = __cpu_to_le64(sb_offset); + sb->data_offset = __cpu_to_le64(0); + if (sb_offset < array_size + bm_space) + bm_space = sb_offset - array_size; + sb->data_size = __cpu_to_le64(sb_offset - bm_space); + break; + case 1: + sb->super_offset = __cpu_to_le64(0); + reserved = bm_space + 4*2; + /* Try for multiple of 1Meg so it is nicely aligned */ + #define ONE_MEG (2*1024) + reserved = ((reserved + ONE_MEG-1)/ONE_MEG) * ONE_MEG; + if (reserved + __le64_to_cpu(sb->size) > dsize) + reserved = dsize - __le64_to_cpu(sb->size); + /* force 4K alignment */ + reserved &= ~7ULL; + sb->data_offset = __cpu_to_le64(reserved); + sb->data_size = __cpu_to_le64(dsize - reserved); + break; + case 2: + sb_offset = 4*2; + sb->super_offset = __cpu_to_le64(4*2); + if (4*2 + 4*2 + bm_space + __le64_to_cpu(sb->size) + > dsize) + bm_space = dsize - __le64_to_cpu(sb->size) + - 4*2 - 4*2; + + reserved = bm_space + 4*2 + 4*2; + /* Try for multiple of 1Meg so it is nicely aligned */ + #define ONE_MEG (2*1024) + reserved = ((reserved + ONE_MEG-1)/ONE_MEG) * ONE_MEG; + if (reserved + __le64_to_cpu(sb->size) > dsize) + reserved = dsize - __le64_to_cpu(sb->size); + /* force 4K alignment */ + reserved &= ~7ULL; - sb->sb_csum = calc_sb_1_csum(sb); - rv = store_super1(st, fd); - if (rv) - fprintf(stderr, Name ": failed to write superblock to %s\n", devname); + sb->data_offset = __cpu_to_le64(reserved); + sb->data_size = __cpu_to_le64(dsize - reserved); + break; + default: + return -EINVAL; + } - if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1)) - rv = st->ss->write_bitmap(st, fd); - close(fd); + + sb->sb_csum = calc_sb_1_csum(sb); + rv = store_super1(st, di->fd); 
+ if (rv) + fprintf(stderr, + Name ": failed to write superblock to %s\n", + di->devname); + + if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1)) + rv = st->ss->write_bitmap(st, di->fd); + close(di->fd); + di->fd = -1; + } return rv; } +#endif static int compare_super1(struct supertype *st, struct supertype *tst) { @@ -1002,9 +1168,14 @@ return 1; if (!first) { - first = malloc(1024+sizeof(bitmap_super_t) + - sizeof(struct misc_dev_info)); - memcpy(first, second, 1024+sizeof(bitmap_super_t) + + if (posix_memalign((void**)&first, 512, + 1024 + 512 + + sizeof(struct misc_dev_info)) != 0) { + fprintf(stderr, Name + ": %s could not allocate superblock\n", __func__); + return 1; + } + memcpy(first, second, 1024 + 512 + sizeof(struct misc_dev_info)); st->sb = first; return 0; @@ -1035,13 +1206,16 @@ free_super1(st); + if (st->subarray[0]) + return 1; + if (st->ss == NULL || st->minor_version == -1) { int bestvers = -1; struct supertype tst; __u64 bestctime = 0; /* guess... choose latest ctime */ + memset(&tst, 0, sizeof(tst)); tst.ss = &super1; - tst.sb = NULL; for (tst.minor_version = 0; tst.minor_version <= 2 ; tst.minor_version++) { switch(load_super1(&tst, fd, devname)) { case 0: super = tst.sb; @@ -1114,10 +1288,15 @@ return 1; } - super = malloc(1024 + sizeof(bitmap_super_t) + - sizeof(struct misc_dev_info)); + if (posix_memalign((void**)&super, 512, + 1024 + 512 + + sizeof(struct misc_dev_info)) != 0) { + fprintf(stderr, Name ": %s could not allocate superblock\n", + __func__); + return 1; + } - if (read(fd, super, 1024) != 1024) { + if (aread(fd, super, 1024) != 1024) { if (devname) fprintf(stderr, Name ": Cannot read superblock on %s\n", devname); @@ -1151,7 +1330,7 @@ bsb = (struct bitmap_super_s *)(((char*)super)+1024); - misc = (struct misc_dev_info*) (bsb+1); + misc = (struct misc_dev_info*) (((char*)super)+1024+512); misc->device_size = dsize; /* Now check on the bitmap superblock */ @@ -1162,8 +1341,8 @@ * should get that written out. 
*/ locate_bitmap1(st, fd); - if (read(fd, ((char*)super)+1024, sizeof(struct bitmap_super_s)) - != sizeof(struct bitmap_super_s)) + if (aread(fd, ((char*)super)+1024, 512) + != 512) goto no_bitmap; uuid_from_super1(st, uuid); @@ -1183,23 +1362,34 @@ struct supertype *st = malloc(sizeof(*st)); if (!st) return st; + memset(st, 0, sizeof(*st)); st->ss = &super1; st->max_devs = 384; st->sb = NULL; - if (strcmp(arg, "1.0") == 0) { + /* leading zeros can be safely ignored. --detail generates them. */ + while (*arg == '0') + arg++; + if (strcmp(arg, "1.0") == 0 || + strcmp(arg, "1.00") == 0) { st->minor_version = 0; return st; } - if (strcmp(arg, "1.1") == 0) { + if (strcmp(arg, "1.1") == 0 || + strcmp(arg, "1.01") == 0 + ) { st->minor_version = 1; return st; } - if (strcmp(arg, "1.2") == 0) { + if (strcmp(arg, "1.2") == 0 || +#ifndef DEFAULT_OLD_METADATA /* ifdef in super0.c */ + strcmp(arg, "default") == 0 || +#endif /* DEFAULT_OLD_METADATA */ + strcmp(arg, "1.02") == 0) { st->minor_version = 2; return st; } if (strcmp(arg, "1") == 0 || - strcmp(arg, "default/large") == 0) { + strcmp(arg, "default") == 0) { st->minor_version = -1; return st; } @@ -1230,10 +1420,19 @@ } #endif + if (st->minor_version < 0) + /* not specified, so time to set default */ + st->minor_version = 2; + if (super == NULL && st->minor_version > 0) { + /* haven't committed to a size yet, so allow some + * slack for alignment of data_offset. + * We haven't access to device details so allow + * 1 Meg if bigger than 1Gig + */ + if (devsize > 1024*1024*2) + devsize -= 1024*2; + } switch(st->minor_version) { - case -1: /* no specified. 
Now time to set default */ - st->minor_version = 0; - /* FALL THROUGH */ case 0: /* at end */ return ((devsize - 8*2 ) & ~(4*2-1)); @@ -1268,15 +1467,15 @@ unsigned long long max_bits; unsigned long long min_chunk; long offset; - int chunk = *chunkp; + unsigned long long chunk = *chunkp; int room = 0; struct mdp_superblock_1 *sb = st->sb; bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + 1024); switch(st->minor_version) { case 0: - /* either 3K after the superblock, or some amount of space - * before. + /* either 3K after the superblock (when hot-add), + * or some amount of space before. */ if (may_change) { /* We are creating array, so we *know* how much room has @@ -1284,11 +1483,6 @@ */ offset = 0; room = choose_bm_space(__le64_to_cpu(sb->size)); - if (room == 4*2) { - /* make it 3K after the superblock */ - room = 3*2; - offset = 2; - } } else { room = __le64_to_cpu(sb->super_offset) - __le64_to_cpu(sb->data_offset) @@ -1336,16 +1530,25 @@ min_chunk *= 2; bits = (bits+1)/2; } - if (chunk == UnSet) + if (chunk == UnSet) { + /* For practical purpose, 64Meg is a good + * default chunk size for internal bitmaps. 
+ */ chunk = min_chunk; - else if (chunk < min_chunk) + if (chunk < 64*1024*1024) + chunk = 64*1024*1024; + } else if (chunk < min_chunk) return 0; /* chunk size too small */ if (chunk == 0) /* rounding problem */ return 0; if (offset == 0) { + /* start bitmap on a 4K boundary with enough space for + * the bitmap + */ bits = (size*512) / chunk + 1; - room = ((bits+7)/8 + sizeof(bitmap_super_t) +511)/512; + room = ((bits+7)/8 + sizeof(bitmap_super_t) +4095)/4096; + room *= 8; /* convert 4K blocks to sectors */ offset = -room; } @@ -1393,25 +1596,27 @@ int rv = 0; int towrite, n; - char buf[4096]; + char *buf = (char*)(((long)(abuf+4096))&~4095UL); locate_bitmap1(st, fd); - if (write(fd, ((char*)sb)+1024, sizeof(bitmap_super_t)) != - sizeof(bitmap_super_t)) - return -2; + memset(buf, 0xff, 4096); + memcpy(buf, ((char*)sb)+1024, sizeof(bitmap_super_t)); + towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); towrite = (towrite+7) >> 3; /* bits to bytes */ - memset(buf, 0xff, sizeof(buf)); + towrite += sizeof(bitmap_super_t); + towrite = ROUND_UP(towrite, 512); while (towrite > 0) { n = towrite; - if (n > sizeof(buf)) - n = sizeof(buf); + if (n > 4096) + n = 4096; n = write(fd, buf, n); if (n > 0) towrite -= n; else break; + memset(buf, 0xff, 4096); } fsync(fd); if (towrite) @@ -1427,6 +1632,43 @@ st->sb = NULL; } +#ifndef MDASSEMBLE +static int validate_geometry1(struct supertype *st, int level, + int layout, int raiddisks, + int chunk, unsigned long long size, + char *subdev, unsigned long long *freesize, + int verbose) +{ + unsigned long long ldsize; + int fd; + + if (level == LEVEL_CONTAINER) { + if (verbose) + fprintf(stderr, Name ": 1.x metadata does not support containers\n"); + return 0; + } + if (!subdev) + return 1; + + fd = open(subdev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + fprintf(stderr, Name ": super1.x cannot open %s: %s\n", + subdev, strerror(errno)); + return 0; + } + + if (!get_dev_size(fd, subdev, &ldsize)) { + 
close(fd); + return 0; + } + close(fd); + + *freesize = avail_size1(st, ldsize >> 9); + return 1; +} +#endif /* MDASSEMBLE */ + struct superswitch super1 = { #ifndef MDASSEMBLE .examine_super = examine_super1, @@ -1435,15 +1677,16 @@ .detail_super = detail_super1, .brief_detail_super = brief_detail_super1, .export_detail_super = export_detail_super1, + .write_init_super = write_init_super1, + .validate_geometry = validate_geometry1, + .add_to_super = add_to_super1, #endif .match_home = match_home1, .uuid_from_super = uuid_from_super1, .getinfo_super = getinfo_super1, .update_super = update_super1, .init_super = init_super1, - .add_to_super = add_to_super1, .store_super = store_super1, - .write_init_super = write_init_super1, .compare_super = compare_super1, .load_super = load_super1, .match_metadata_desc = match_metadata_desc1, @@ -1452,10 +1695,10 @@ .locate_bitmap = locate_bitmap1, .write_bitmap = write_bitmap1, .free_super = free_super1, - .major = 1, #if __BYTE_ORDER == BIG_ENDIAN .swapuuid = 0, #else .swapuuid = 1, #endif + .name = "1.x", }; diff -Nru mdadm-2.6.7.1/super-ddf.c mdadm-3.1.4/super-ddf.c --- mdadm-2.6.7.1/super-ddf.c 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/super-ddf.c 2010-08-31 09:14:46.000000000 +0300 @@ -0,0 +1,3698 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2009 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * + * Specifications for DDF takes from Common RAID DDF Specification Revision 1.2 + * (July 28 2006). Reused by permission of SNIA. + */ + +#define HAVE_STDINT_H 1 +#include "mdadm.h" +#include "mdmon.h" +#include "sha1.h" +#include + +/* a non-official T10 name for creation GUIDs */ +static char T10[] = "Linux-MD"; + +/* DDF timestamps are 1980 based, so we need to add + * second-in-decade-of-seventies to convert to linux timestamps. + * 10 years with 2 leap years. + */ +#define DECADE (3600*24*(365*10+2)) +unsigned long crc32( + unsigned long crc, + const unsigned char *buf, + unsigned len); + +/* The DDF metadata handling. + * DDF metadata lives at the end of the device. + * The last 512 byte block provides an 'anchor' which is used to locate + * the rest of the metadata which usually lives immediately behind the anchor. + * + * Note: + * - all multibyte numeric fields are bigendian. + * - all strings are space padded. 
+ * + */ + +/* Primary Raid Level (PRL) */ +#define DDF_RAID0 0x00 +#define DDF_RAID1 0x01 +#define DDF_RAID3 0x03 +#define DDF_RAID4 0x04 +#define DDF_RAID5 0x05 +#define DDF_RAID1E 0x11 +#define DDF_JBOD 0x0f +#define DDF_CONCAT 0x1f +#define DDF_RAID5E 0x15 +#define DDF_RAID5EE 0x25 +#define DDF_RAID6 0x06 + +/* Raid Level Qualifier (RLQ) */ +#define DDF_RAID0_SIMPLE 0x00 +#define DDF_RAID1_SIMPLE 0x00 /* just 2 devices in this plex */ +#define DDF_RAID1_MULTI 0x01 /* exactly 3 devices in this plex */ +#define DDF_RAID3_0 0x00 /* parity in first extent */ +#define DDF_RAID3_N 0x01 /* parity in last extent */ +#define DDF_RAID4_0 0x00 /* parity in first extent */ +#define DDF_RAID4_N 0x01 /* parity in last extent */ +/* these apply to raid5e and raid5ee as well */ +#define DDF_RAID5_0_RESTART 0x00 /* same as 'right asymmetric' - layout 1 */ +#define DDF_RAID6_0_RESTART 0x01 /* raid6 different from raid5 here!!! */ +#define DDF_RAID5_N_RESTART 0x02 /* same as 'left asymmetric' - layout 0 */ +#define DDF_RAID5_N_CONTINUE 0x03 /* same as 'left symmetric' - layout 2 */ + +#define DDF_RAID1E_ADJACENT 0x00 /* raid10 nearcopies==2 */ +#define DDF_RAID1E_OFFSET 0x01 /* raid10 offsetcopies==2 */ + +/* Secondary RAID Level (SRL) */ +#define DDF_2STRIPED 0x00 /* This is weirder than RAID0 !! 
*/ +#define DDF_2MIRRORED 0x01 +#define DDF_2CONCAT 0x02 +#define DDF_2SPANNED 0x03 /* This is also weird - be careful */ + +/* Magic numbers */ +#define DDF_HEADER_MAGIC __cpu_to_be32(0xDE11DE11) +#define DDF_CONTROLLER_MAGIC __cpu_to_be32(0xAD111111) +#define DDF_PHYS_RECORDS_MAGIC __cpu_to_be32(0x22222222) +#define DDF_PHYS_DATA_MAGIC __cpu_to_be32(0x33333333) +#define DDF_VIRT_RECORDS_MAGIC __cpu_to_be32(0xDDDDDDDD) +#define DDF_VD_CONF_MAGIC __cpu_to_be32(0xEEEEEEEE) +#define DDF_SPARE_ASSIGN_MAGIC __cpu_to_be32(0x55555555) +#define DDF_VU_CONF_MAGIC __cpu_to_be32(0x88888888) +#define DDF_VENDOR_LOG_MAGIC __cpu_to_be32(0x01dBEEF0) +#define DDF_BBM_LOG_MAGIC __cpu_to_be32(0xABADB10C) + +#define DDF_GUID_LEN 24 +#define DDF_REVISION_0 "01.00.00" +#define DDF_REVISION_2 "01.02.00" + +struct ddf_header { + __u32 magic; /* DDF_HEADER_MAGIC */ + __u32 crc; + char guid[DDF_GUID_LEN]; + char revision[8]; /* 01.02.00 */ + __u32 seq; /* starts at '1' */ + __u32 timestamp; + __u8 openflag; + __u8 foreignflag; + __u8 enforcegroups; + __u8 pad0; /* 0xff */ + __u8 pad1[12]; /* 12 * 0xff */ + /* 64 bytes so far */ + __u8 header_ext[32]; /* reserved: fill with 0xff */ + __u64 primary_lba; + __u64 secondary_lba; + __u8 type; + __u8 pad2[3]; /* 0xff */ + __u32 workspace_len; /* sectors for vendor space - + * at least 32768(sectors) */ + __u64 workspace_lba; + __u16 max_pd_entries; /* one of 15, 63, 255, 1023, 4095 */ + __u16 max_vd_entries; /* 2^(4,6,8,10,12)-1 : i.e. as above */ + __u16 max_partitions; /* i.e. 
max num of configuration + record entries per disk */ + __u16 config_record_len; /* 1 +ROUNDUP(max_primary_element_entries + *12/512) */ + __u16 max_primary_element_entries; /* 16, 64, 256, 1024, or 4096 */ + __u8 pad3[54]; /* 0xff */ + /* 192 bytes so far */ + __u32 controller_section_offset; + __u32 controller_section_length; + __u32 phys_section_offset; + __u32 phys_section_length; + __u32 virt_section_offset; + __u32 virt_section_length; + __u32 config_section_offset; + __u32 config_section_length; + __u32 data_section_offset; + __u32 data_section_length; + __u32 bbm_section_offset; + __u32 bbm_section_length; + __u32 diag_space_offset; + __u32 diag_space_length; + __u32 vendor_offset; + __u32 vendor_length; + /* 256 bytes so far */ + __u8 pad4[256]; /* 0xff */ +}; + +/* type field */ +#define DDF_HEADER_ANCHOR 0x00 +#define DDF_HEADER_PRIMARY 0x01 +#define DDF_HEADER_SECONDARY 0x02 + +/* The content of the 'controller section' - global scope */ +struct ddf_controller_data { + __u32 magic; /* DDF_CONTROLLER_MAGIC */ + __u32 crc; + char guid[DDF_GUID_LEN]; + struct controller_type { + __u16 vendor_id; + __u16 device_id; + __u16 sub_vendor_id; + __u16 sub_device_id; + } type; + char product_id[16]; + __u8 pad[8]; /* 0xff */ + __u8 vendor_data[448]; +}; + +/* The content of phys_section - global scope */ +struct phys_disk { + __u32 magic; /* DDF_PHYS_RECORDS_MAGIC */ + __u32 crc; + __u16 used_pdes; + __u16 max_pdes; + __u8 pad[52]; + struct phys_disk_entry { + char guid[DDF_GUID_LEN]; + __u32 refnum; + __u16 type; + __u16 state; + __u64 config_size; /* DDF structures must be after here */ + char path[18]; /* another horrible structure really */ + __u8 pad[6]; + } entries[0]; +}; + +/* phys_disk_entry.type is a bitmap - bigendian remember */ +#define DDF_Forced_PD_GUID 1 +#define DDF_Active_in_VD 2 +#define DDF_Global_Spare 4 /* VD_CONF records are ignored */ +#define DDF_Spare 8 /* overrides Global_spare */ +#define DDF_Foreign 16 +#define DDF_Legacy 32 /* no DDF 
on this device */ + +#define DDF_Interface_mask 0xf00 +#define DDF_Interface_SCSI 0x100 +#define DDF_Interface_SAS 0x200 +#define DDF_Interface_SATA 0x300 +#define DDF_Interface_FC 0x400 + +/* phys_disk_entry.state is a bigendian bitmap */ +#define DDF_Online 1 +#define DDF_Failed 2 /* overrides 1,4,8 */ +#define DDF_Rebuilding 4 +#define DDF_Transition 8 +#define DDF_SMART 16 +#define DDF_ReadErrors 32 +#define DDF_Missing 64 + +/* The content of the virt_section global scope */ +struct virtual_disk { + __u32 magic; /* DDF_VIRT_RECORDS_MAGIC */ + __u32 crc; + __u16 populated_vdes; + __u16 max_vdes; + __u8 pad[52]; + struct virtual_entry { + char guid[DDF_GUID_LEN]; + __u16 unit; + __u16 pad0; /* 0xffff */ + __u16 guid_crc; + __u16 type; + __u8 state; + __u8 init_state; + __u8 pad1[14]; + char name[16]; + } entries[0]; +}; + +/* virtual_entry.type is a bitmap - bigendian */ +#define DDF_Shared 1 +#define DDF_Enforce_Groups 2 +#define DDF_Unicode 4 +#define DDF_Owner_Valid 8 + +/* virtual_entry.state is a bigendian bitmap */ +#define DDF_state_mask 0x7 +#define DDF_state_optimal 0x0 +#define DDF_state_degraded 0x1 +#define DDF_state_deleted 0x2 +#define DDF_state_missing 0x3 +#define DDF_state_failed 0x4 +#define DDF_state_part_optimal 0x5 + +#define DDF_state_morphing 0x8 +#define DDF_state_inconsistent 0x10 + +/* virtual_entry.init_state is a bigendian bitmap */ +#define DDF_initstate_mask 0x03 +#define DDF_init_not 0x00 +#define DDF_init_quick 0x01 /* initialisation is progress. + * i.e. 
'state_inconsistent' */ +#define DDF_init_full 0x02 + +#define DDF_access_mask 0xc0 +#define DDF_access_rw 0x00 +#define DDF_access_ro 0x80 +#define DDF_access_blocked 0xc0 + +/* The content of the config_section - local scope + * It has multiple records each config_record_len sectors + * They can be vd_config or spare_assign + */ + +struct vd_config { + __u32 magic; /* DDF_VD_CONF_MAGIC */ + __u32 crc; + char guid[DDF_GUID_LEN]; + __u32 timestamp; + __u32 seqnum; + __u8 pad0[24]; + __u16 prim_elmnt_count; + __u8 chunk_shift; /* 0 == 512, 1==1024 etc */ + __u8 prl; + __u8 rlq; + __u8 sec_elmnt_count; + __u8 sec_elmnt_seq; + __u8 srl; + __u64 blocks; /* blocks per component could be different + * on different component devices...(only + * for concat I hope) */ + __u64 array_blocks; /* blocks in array */ + __u8 pad1[8]; + __u32 spare_refs[8]; + __u8 cache_pol[8]; + __u8 bg_rate; + __u8 pad2[3]; + __u8 pad3[52]; + __u8 pad4[192]; + __u8 v0[32]; /* reserved- 0xff */ + __u8 v1[32]; /* reserved- 0xff */ + __u8 v2[16]; /* reserved- 0xff */ + __u8 v3[16]; /* reserved- 0xff */ + __u8 vendor[32]; + __u32 phys_refnum[0]; /* refnum of each disk in sequence */ + /*__u64 lba_offset[0]; LBA offset in each phys. 
Note extents in a + bvd are always the same size */ +}; + +/* vd_config.cache_pol[7] is a bitmap */ +#define DDF_cache_writeback 1 /* else writethrough */ +#define DDF_cache_wadaptive 2 /* only applies if writeback */ +#define DDF_cache_readahead 4 +#define DDF_cache_radaptive 8 /* only if doing read-ahead */ +#define DDF_cache_ifnobatt 16 /* even to write cache if battery is poor */ +#define DDF_cache_wallowed 32 /* enable write caching */ +#define DDF_cache_rallowed 64 /* enable read caching */ + +struct spare_assign { + __u32 magic; /* DDF_SPARE_ASSIGN_MAGIC */ + __u32 crc; + __u32 timestamp; + __u8 reserved[7]; + __u8 type; + __u16 populated; /* SAEs used */ + __u16 max; /* max SAEs */ + __u8 pad[8]; + struct spare_assign_entry { + char guid[DDF_GUID_LEN]; + __u16 secondary_element; + __u8 pad[6]; + } spare_ents[0]; +}; +/* spare_assign.type is a bitmap */ +#define DDF_spare_dedicated 0x1 /* else global */ +#define DDF_spare_revertible 0x2 /* else committable */ +#define DDF_spare_active 0x4 /* else not active */ +#define DDF_spare_affinity 0x8 /* enclosure affinity */ + +/* The data_section contents - local scope */ +struct disk_data { + __u32 magic; /* DDF_PHYS_DATA_MAGIC */ + __u32 crc; + char guid[DDF_GUID_LEN]; + __u32 refnum; /* crc of some magic drive data ... */ + __u8 forced_ref; /* set when above was not result of magic */ + __u8 forced_guid; /* set if guid was forced rather than magic */ + __u8 vendor[32]; + __u8 pad[442]; +}; + +/* bbm_section content */ +struct bad_block_log { + __u32 magic; + __u32 crc; + __u16 entry_count; + __u32 spare_count; + __u8 pad[10]; + __u64 first_spare; + struct mapped_block { + __u64 defective_start; + __u32 replacement_start; + __u16 remap_count; + __u8 pad[2]; + } entries[0]; +}; + +/* Struct for internally holding ddf structures */ +/* The DDF structure stored on each device is potentially + * quite different, as some data is global and some is local. 
+ * The global data is: + * - ddf header + * - controller_data + * - Physical disk records + * - Virtual disk records + * The local data is: + * - Configuration records + * - Physical Disk data section + * ( and Bad block and vendor which I don't care about yet). + * + * The local data is parsed into separate lists as it is read + * and reconstructed for writing. This means that we only need + * to make config changes once and they are automatically + * propagated to all devices. + * Note that the ddf_super has space of the conf and disk data + * for this disk and also for a list of all such data. + * The list is only used for the superblock that is being + * built in Create or Assemble to describe the whole array. + */ +struct ddf_super { + struct ddf_header anchor, primary, secondary; + struct ddf_controller_data controller; + struct ddf_header *active; + struct phys_disk *phys; + struct virtual_disk *virt; + int pdsize, vdsize; + unsigned int max_part, mppe, conf_rec_len; + int currentdev; + int updates_pending; + struct vcl { + union { + char space[512]; + struct { + struct vcl *next; + __u64 *lba_offset; /* location in 'conf' of + * the lba table */ + unsigned int vcnum; /* index into ->virt */ + __u64 *block_sizes; /* NULL if all the same */ + }; + }; + struct vd_config conf; + } *conflist, *currentconf; + struct dl { + union { + char space[512]; + struct { + struct dl *next; + int major, minor; + char *devname; + int fd; + unsigned long long size; /* sectors */ + int pdnum; /* index in ->phys */ + struct spare_assign *spare; + void *mdupdate; /* hold metadata update */ + + /* These fields used by auto-layout */ + int raiddisk; /* slot to fill in autolayout */ + __u64 esize; + }; + }; + struct disk_data disk; + struct vcl *vlist[0]; /* max_part in size */ + } *dlist, *add_list; +}; + +#ifndef offsetof +#define offsetof(t,f) ((size_t)&(((t*)0)->f)) +#endif + + +static unsigned int calc_crc(void *buf, int len) +{ + /* crcs are always at the same place as in the 
ddf_header */ + struct ddf_header *ddf = buf; + __u32 oldcrc = ddf->crc; + __u32 newcrc; + ddf->crc = 0xffffffff; + + newcrc = crc32(0, buf, len); + ddf->crc = oldcrc; + /* The crc is store (like everything) bigendian, so convert + * here for simplicity + */ + return __cpu_to_be32(newcrc); +} + +static int load_ddf_header(int fd, unsigned long long lba, + unsigned long long size, + int type, + struct ddf_header *hdr, struct ddf_header *anchor) +{ + /* read a ddf header (primary or secondary) from fd/lba + * and check that it is consistent with anchor + * Need to check: + * magic, crc, guid, rev, and LBA's header_type, and + * everything after header_type must be the same + */ + if (lba >= size-1) + return 0; + + if (lseek64(fd, lba<<9, 0) < 0) + return 0; + + if (read(fd, hdr, 512) != 512) + return 0; + + if (hdr->magic != DDF_HEADER_MAGIC) + return 0; + if (calc_crc(hdr, 512) != hdr->crc) + return 0; + if (memcmp(anchor->guid, hdr->guid, DDF_GUID_LEN) != 0 || + memcmp(anchor->revision, hdr->revision, 8) != 0 || + anchor->primary_lba != hdr->primary_lba || + anchor->secondary_lba != hdr->secondary_lba || + hdr->type != type || + memcmp(anchor->pad2, hdr->pad2, 512 - + offsetof(struct ddf_header, pad2)) != 0) + return 0; + + /* Looks good enough to me... 
*/ + return 1; +} + +static void *load_section(int fd, struct ddf_super *super, void *buf, + __u32 offset_be, __u32 len_be, int check) +{ + unsigned long long offset = __be32_to_cpu(offset_be); + unsigned long long len = __be32_to_cpu(len_be); + int dofree = (buf == NULL); + + if (check) + if (len != 2 && len != 8 && len != 32 + && len != 128 && len != 512) + return NULL; + + if (len > 1024) + return NULL; + if (buf) { + /* All pre-allocated sections are a single block */ + if (len != 1) + return NULL; + } else if (posix_memalign(&buf, 512, len<<9) != 0) + buf = NULL; + + if (!buf) + return NULL; + + if (super->active->type == 1) + offset += __be64_to_cpu(super->active->primary_lba); + else + offset += __be64_to_cpu(super->active->secondary_lba); + + if ((unsigned long long)lseek64(fd, offset<<9, 0) != (offset<<9)) { + if (dofree) + free(buf); + return NULL; + } + if ((unsigned long long)read(fd, buf, len<<9) != (len<<9)) { + if (dofree) + free(buf); + return NULL; + } + return buf; +} + +static int load_ddf_headers(int fd, struct ddf_super *super, char *devname) +{ + unsigned long long dsize; + + get_dev_size(fd, NULL, &dsize); + + if (lseek64(fd, dsize-512, 0) < 0) { + if (devname) + fprintf(stderr, + Name": Cannot seek to anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + if (read(fd, &super->anchor, 512) != 512) { + if (devname) + fprintf(stderr, + Name ": Cannot read anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + if (super->anchor.magic != DDF_HEADER_MAGIC) { + if (devname) + fprintf(stderr, Name ": no DDF anchor found on %s\n", + devname); + return 2; + } + if (calc_crc(&super->anchor, 512) != super->anchor.crc) { + if (devname) + fprintf(stderr, Name ": bad CRC on anchor on %s\n", + devname); + return 2; + } + if (memcmp(super->anchor.revision, DDF_REVISION_0, 8) != 0 && + memcmp(super->anchor.revision, DDF_REVISION_2, 8) != 0) { + if (devname) + fprintf(stderr, Name ": can only support super revision" + " 
%.8s and earlier, not %.8s on %s\n", + DDF_REVISION_2, super->anchor.revision,devname); + return 2; + } + if (load_ddf_header(fd, __be64_to_cpu(super->anchor.primary_lba), + dsize >> 9, 1, + &super->primary, &super->anchor) == 0) { + if (devname) + fprintf(stderr, + Name ": Failed to load primary DDF header " + "on %s\n", devname); + return 2; + } + super->active = &super->primary; + if (load_ddf_header(fd, __be64_to_cpu(super->anchor.secondary_lba), + dsize >> 9, 2, + &super->secondary, &super->anchor)) { + if ((__be32_to_cpu(super->primary.seq) + < __be32_to_cpu(super->secondary.seq) && + !super->secondary.openflag) + || (__be32_to_cpu(super->primary.seq) + == __be32_to_cpu(super->secondary.seq) && + super->primary.openflag && !super->secondary.openflag) + ) + super->active = &super->secondary; + } + return 0; +} + +static int load_ddf_global(int fd, struct ddf_super *super, char *devname) +{ + void *ok; + ok = load_section(fd, super, &super->controller, + super->active->controller_section_offset, + super->active->controller_section_length, + 0); + super->phys = load_section(fd, super, NULL, + super->active->phys_section_offset, + super->active->phys_section_length, + 1); + super->pdsize = __be32_to_cpu(super->active->phys_section_length) * 512; + + super->virt = load_section(fd, super, NULL, + super->active->virt_section_offset, + super->active->virt_section_length, + 1); + super->vdsize = __be32_to_cpu(super->active->virt_section_length) * 512; + if (!ok || + !super->phys || + !super->virt) { + free(super->phys); + free(super->virt); + super->phys = NULL; + super->virt = NULL; + return 2; + } + super->conflist = NULL; + super->dlist = NULL; + + super->max_part = __be16_to_cpu(super->active->max_partitions); + super->mppe = __be16_to_cpu(super->active->max_primary_element_entries); + super->conf_rec_len = __be16_to_cpu(super->active->config_record_len); + return 0; +} + +static int load_ddf_local(int fd, struct ddf_super *super, + char *devname, int keep) +{ + 
struct dl *dl; + struct stat stb; + char *conf; + unsigned int i; + unsigned int confsec; + int vnum; + unsigned int max_virt_disks = __be16_to_cpu(super->active->max_vd_entries); + unsigned long long dsize; + + /* First the local disk info */ + if (posix_memalign((void**)&dl, 512, + sizeof(*dl) + + (super->max_part) * sizeof(dl->vlist[0])) != 0) { + fprintf(stderr, Name ": %s could not allocate disk info buffer\n", + __func__); + return 1; + } + + load_section(fd, super, &dl->disk, + super->active->data_section_offset, + super->active->data_section_length, + 0); + dl->devname = devname ? strdup(devname) : NULL; + + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->dlist; + dl->fd = keep ? fd : -1; + + dl->size = 0; + if (get_dev_size(fd, devname, &dsize)) + dl->size = dsize >> 9; + dl->spare = NULL; + for (i = 0 ; i < super->max_part ; i++) + dl->vlist[i] = NULL; + super->dlist = dl; + dl->pdnum = -1; + for (i = 0; i < __be16_to_cpu(super->active->max_pd_entries); i++) + if (memcmp(super->phys->entries[i].guid, + dl->disk.guid, DDF_GUID_LEN) == 0) + dl->pdnum = i; + + /* Now the config list. */ + /* 'conf' is an array of config entries, some of which are + * probably invalid. 
Those which are good need to be copied into + * the conflist + */ + + conf = load_section(fd, super, NULL, + super->active->config_section_offset, + super->active->config_section_length, + 0); + + vnum = 0; + for (confsec = 0; + confsec < __be32_to_cpu(super->active->config_section_length); + confsec += super->conf_rec_len) { + struct vd_config *vd = + (struct vd_config *)((char*)conf + confsec*512); + struct vcl *vcl; + + if (vd->magic == DDF_SPARE_ASSIGN_MAGIC) { + if (dl->spare) + continue; + if (posix_memalign((void**)&dl->spare, 512, + super->conf_rec_len*512) != 0) { + fprintf(stderr, Name + ": %s could not allocate spare info buf\n", + __func__); + return 1; + } + + memcpy(dl->spare, vd, super->conf_rec_len*512); + continue; + } + if (vd->magic != DDF_VD_CONF_MAGIC) + continue; + for (vcl = super->conflist; vcl; vcl = vcl->next) { + if (memcmp(vcl->conf.guid, + vd->guid, DDF_GUID_LEN) == 0) + break; + } + + if (vcl) { + dl->vlist[vnum++] = vcl; + if (__be32_to_cpu(vd->seqnum) <= + __be32_to_cpu(vcl->conf.seqnum)) + continue; + } else { + if (posix_memalign((void**)&vcl, 512, + (super->conf_rec_len*512 + + offsetof(struct vcl, conf))) != 0) { + fprintf(stderr, Name + ": %s could not allocate vcl buf\n", + __func__); + return 1; + } + vcl->next = super->conflist; + vcl->block_sizes = NULL; /* FIXME not for CONCAT */ + super->conflist = vcl; + dl->vlist[vnum++] = vcl; + } + memcpy(&vcl->conf, vd, super->conf_rec_len*512); + vcl->lba_offset = (__u64*) + &vcl->conf.phys_refnum[super->mppe]; + + for (i=0; i < max_virt_disks ; i++) + if (memcmp(super->virt->entries[i].guid, + vcl->conf.guid, DDF_GUID_LEN)==0) + break; + if (i < max_virt_disks) + vcl->vcnum = i; + } + free(conf); + + return 0; +} + +#ifndef MDASSEMBLE +static int load_super_ddf_all(struct supertype *st, int fd, + void **sbp, char *devname, int keep_fd); +#endif + +static void free_super_ddf(struct supertype *st); + +static int load_super_ddf(struct supertype *st, int fd, + char *devname) +{ + 
unsigned long long dsize; + struct ddf_super *super; + int rv; + +#ifndef MDASSEMBLE + /* if 'fd' is a container, load metadata from all the devices */ + if (load_super_ddf_all(st, fd, &st->sb, devname, 1) == 0) + return 0; +#endif + if (st->subarray[0]) + return 1; /* FIXME Is this correct */ + + if (get_dev_size(fd, devname, &dsize) == 0) + return 1; + + if (test_partition(fd)) + /* DDF is not allowed on partitions */ + return 1; + + /* 32M is a lower bound */ + if (dsize <= 32*1024*1024) { + if (devname) + fprintf(stderr, + Name ": %s is too small for ddf: " + "size is %llu sectors.\n", + devname, dsize>>9); + return 1; + } + if (dsize & 511) { + if (devname) + fprintf(stderr, + Name ": %s is an odd size for ddf: " + "size is %llu bytes.\n", + devname, dsize); + return 1; + } + + free_super_ddf(st); + + if (posix_memalign((void**)&super, 512, sizeof(*super))!= 0) { + fprintf(stderr, Name ": malloc of %zu failed.\n", + sizeof(*super)); + return 1; + } + memset(super, 0, sizeof(*super)); + + rv = load_ddf_headers(fd, super, devname); + if (rv) { + free(super); + return rv; + } + + /* Have valid headers and have chosen the best. Let's read in the rest*/ + + rv = load_ddf_global(fd, super, devname); + + if (rv) { + if (devname) + fprintf(stderr, + Name ": Failed to load all information " + "sections on %s\n", devname); + free(super); + return rv; + } + + rv = load_ddf_local(fd, super, devname, 0); + + if (rv) { + if (devname) + fprintf(stderr, + Name ": Failed to load all information " + "sections on %s\n", devname); + free(super); + return rv; + } + + if (st->subarray[0]) { + unsigned long val; + struct vcl *v; + char *ep; + + val = strtoul(st->subarray, &ep, 10); + if (*ep != '\0') { + free(super); + return 1; + } + + for (v = super->conflist; v; v = v->next) + if (v->vcnum == val) + super->currentconf = v; + if (!super->currentconf) { + free(super); + return 1; + } + } + + /* Should possibly check the sections .... 
*/ + + st->sb = super; + if (st->ss == NULL) { + st->ss = &super_ddf; + st->minor_version = 0; + st->max_devs = 512; + } + st->loaded_container = 0; + return 0; + +} + +static void free_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + if (ddf == NULL) + return; + free(ddf->phys); + free(ddf->virt); + while (ddf->conflist) { + struct vcl *v = ddf->conflist; + ddf->conflist = v->next; + if (v->block_sizes) + free(v->block_sizes); + free(v); + } + while (ddf->dlist) { + struct dl *d = ddf->dlist; + ddf->dlist = d->next; + if (d->fd >= 0) + close(d->fd); + if (d->spare) + free(d->spare); + free(d); + } + free(ddf); + st->sb = NULL; +} + +static struct supertype *match_metadata_desc_ddf(char *arg) +{ + /* 'ddf' only support containers */ + struct supertype *st; + if (strcmp(arg, "ddf") != 0 && + strcmp(arg, "default") != 0 + ) + return NULL; + + st = malloc(sizeof(*st)); + memset(st, 0, sizeof(*st)); + st->ss = &super_ddf; + st->max_devs = 512; + st->minor_version = 0; + st->sb = NULL; + return st; +} + + +#ifndef MDASSEMBLE + +static mapping_t ddf_state[] = { + { "Optimal", 0}, + { "Degraded", 1}, + { "Deleted", 2}, + { "Missing", 3}, + { "Failed", 4}, + { "Partially Optimal", 5}, + { "-reserved-", 6}, + { "-reserved-", 7}, + { NULL, 0} +}; + +static mapping_t ddf_init_state[] = { + { "Not Initialised", 0}, + { "QuickInit in Progress", 1}, + { "Fully Initialised", 2}, + { "*UNKNOWN*", 3}, + { NULL, 0} +}; +static mapping_t ddf_access[] = { + { "Read/Write", 0}, + { "Reserved", 1}, + { "Read Only", 2}, + { "Blocked (no access)", 3}, + { NULL ,0} +}; + +static mapping_t ddf_level[] = { + { "RAID0", DDF_RAID0}, + { "RAID1", DDF_RAID1}, + { "RAID3", DDF_RAID3}, + { "RAID4", DDF_RAID4}, + { "RAID5", DDF_RAID5}, + { "RAID1E",DDF_RAID1E}, + { "JBOD", DDF_JBOD}, + { "CONCAT",DDF_CONCAT}, + { "RAID5E",DDF_RAID5E}, + { "RAID5EE",DDF_RAID5EE}, + { "RAID6", DDF_RAID6}, + { NULL, 0} +}; +static mapping_t ddf_sec_level[] = { + { "Striped", DDF_2STRIPED}, + { 
"Mirrored", DDF_2MIRRORED}, + { "Concat", DDF_2CONCAT}, + { "Spanned", DDF_2SPANNED}, + { NULL, 0} +}; +#endif + +struct num_mapping { + int num1, num2; +}; +static struct num_mapping ddf_level_num[] = { + { DDF_RAID0, 0 }, + { DDF_RAID1, 1 }, + { DDF_RAID3, LEVEL_UNSUPPORTED }, + { DDF_RAID4, 4 }, + { DDF_RAID5, 5 }, + { DDF_RAID1E, LEVEL_UNSUPPORTED }, + { DDF_JBOD, LEVEL_UNSUPPORTED }, + { DDF_CONCAT, LEVEL_LINEAR }, + { DDF_RAID5E, LEVEL_UNSUPPORTED }, + { DDF_RAID5EE, LEVEL_UNSUPPORTED }, + { DDF_RAID6, 6}, + { MAXINT, MAXINT } +}; + +static int map_num1(struct num_mapping *map, int num) +{ + int i; + for (i=0 ; map[i].num1 != MAXINT; i++) + if (map[i].num1 == num) + break; + return map[i].num2; +} + +static int all_ff(char *guid) +{ + int i; + for (i = 0; i < DDF_GUID_LEN; i++) + if (guid[i] != (char)0xff) + return 0; + return 1; +} + +#ifndef MDASSEMBLE +static void print_guid(char *guid, int tstamp) +{ + /* A GUIDs are part (or all) ASCII and part binary. + * They tend to be space padded. 
+ * We print the GUID in HEX, then in parentheses add + * any initial ASCII sequence, and a possible + * time stamp from bytes 16-19 + */ + int l = DDF_GUID_LEN; + int i; + + for (i=0 ; i= 0x20 && guid[i] < 0x7f) + fputc(guid[i], stdout); + else + break; + } + if (tstamp) { + time_t then = __be32_to_cpu(*(__u32*)(guid+16)) + DECADE; + char tbuf[100]; + struct tm *tm; + tm = localtime(&then); + strftime(tbuf, 100, " %D %T",tm); + fputs(tbuf, stdout); + } + printf(")"); +} + +static void examine_vd(int n, struct ddf_super *sb, char *guid) +{ + int crl = sb->conf_rec_len; + struct vcl *vcl; + + for (vcl = sb->conflist ; vcl ; vcl = vcl->next) { + unsigned int i; + struct vd_config *vc = &vcl->conf; + + if (calc_crc(vc, crl*512) != vc->crc) + continue; + if (memcmp(vc->guid, guid, DDF_GUID_LEN) != 0) + continue; + + /* Ok, we know about this VD, let's give more details */ + printf(" Raid Devices[%d] : %d (", n, + __be16_to_cpu(vc->prim_elmnt_count)); + for (i = 0; i < __be16_to_cpu(vc->prim_elmnt_count); i++) { + int j; + int cnt = __be16_to_cpu(sb->phys->used_pdes); + for (j=0; jphys_refnum[i] == sb->phys->entries[j].refnum) + break; + if (i) printf(" "); + if (j < cnt) + printf("%d", j); + else + printf("--"); + } + printf(")\n"); + if (vc->chunk_shift != 255) + printf(" Chunk Size[%d] : %d sectors\n", n, + 1 << vc->chunk_shift); + printf(" Raid Level[%d] : %s\n", n, + map_num(ddf_level, vc->prl)?:"-unknown-"); + if (vc->sec_elmnt_count != 1) { + printf(" Secondary Position[%d] : %d of %d\n", n, + vc->sec_elmnt_seq, vc->sec_elmnt_count); + printf(" Secondary Level[%d] : %s\n", n, + map_num(ddf_sec_level, vc->srl) ?: "-unknown-"); + } + printf(" Device Size[%d] : %llu\n", n, + (unsigned long long)__be64_to_cpu(vc->blocks)/2); + printf(" Array Size[%d] : %llu\n", n, + (unsigned long long)__be64_to_cpu(vc->array_blocks)/2); + } +} + +static void examine_vds(struct ddf_super *sb) +{ + int cnt = __be16_to_cpu(sb->virt->populated_vdes); + int i; + printf(" Virtual Disks : 
%d\n", cnt); + + for (i=0; ivirt->entries[i]; + printf("\n"); + printf(" VD GUID[%d] : ", i); print_guid(ve->guid, 1); + printf("\n"); + printf(" unit[%d] : %d\n", i, __be16_to_cpu(ve->unit)); + printf(" state[%d] : %s, %s%s\n", i, + map_num(ddf_state, ve->state & 7), + (ve->state & 8) ? "Morphing, ": "", + (ve->state & 16)? "Not Consistent" : "Consistent"); + printf(" init state[%d] : %s\n", i, + map_num(ddf_init_state, ve->init_state&3)); + printf(" access[%d] : %s\n", i, + map_num(ddf_access, (ve->init_state>>6) & 3)); + printf(" Name[%d] : %.16s\n", i, ve->name); + examine_vd(i, sb, ve->guid); + } + if (cnt) printf("\n"); +} + +static void examine_pds(struct ddf_super *sb) +{ + int cnt = __be16_to_cpu(sb->phys->used_pdes); + int i; + struct dl *dl; + printf(" Physical Disks : %d\n", cnt); + printf(" Number RefNo Size Device Type/State\n"); + + for (i=0 ; iphys->entries[i]; + int type = __be16_to_cpu(pd->type); + int state = __be16_to_cpu(pd->state); + + //printf(" PD GUID[%d] : ", i); print_guid(pd->guid, 0); + //printf("\n"); + printf(" %3d %08x ", i, + __be32_to_cpu(pd->refnum)); + printf("%8lluK ", + (unsigned long long)__be64_to_cpu(pd->config_size)>>1); + for (dl = sb->dlist; dl ; dl = dl->next) { + if (dl->disk.refnum == pd->refnum) { + char *dv = map_dev(dl->major, dl->minor, 0); + if (dv) { + printf("%-15s", dv); + break; + } + } + } + if (!dl) + printf("%15s",""); + printf(" %s%s%s%s%s", + (type&2) ? "active":"", + (type&4) ? "Global-Spare":"", + (type&8) ? "spare" : "", + (type&16)? ", foreign" : "", + (type&32)? "pass-through" : ""); + printf("/%s%s%s%s%s%s%s", + (state&1)? "Online": "Offline", + (state&2)? ", Failed": "", + (state&4)? ", Rebuilding": "", + (state&8)? ", in-transition": "", + (state&16)? ", SMART-errors": "", + (state&32)? ", Unrecovered-Read-Errors": "", + (state&64)? 
", Missing" : ""); + printf("\n"); + } +} + +static void examine_super_ddf(struct supertype *st, char *homehost) +{ + struct ddf_super *sb = st->sb; + + printf(" Magic : %08x\n", __be32_to_cpu(sb->anchor.magic)); + printf(" Version : %.8s\n", sb->anchor.revision); + printf("Controller GUID : "); print_guid(sb->controller.guid, 0); + printf("\n"); + printf(" Container GUID : "); print_guid(sb->anchor.guid, 1); + printf("\n"); + printf(" Seq : %08x\n", __be32_to_cpu(sb->active->seq)); + printf(" Redundant hdr : %s\n", sb->secondary.magic == DDF_HEADER_MAGIC + ?"yes" : "no"); + examine_vds(sb); + examine_pds(sb); +} + +static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info); + +static void uuid_from_super_ddf(struct supertype *st, int uuid[4]); + +static void brief_examine_super_ddf(struct supertype *st, int verbose) +{ + /* We just write a generic DDF ARRAY entry + */ + struct mdinfo info; + char nbuf[64]; + getinfo_super_ddf(st, &info); + fname_from_uuid(st, &info, nbuf, ':'); + + printf("ARRAY metadata=ddf UUID=%s\n", nbuf + 5); +} + +static void brief_examine_subarrays_ddf(struct supertype *st, int verbose) +{ + /* We just write a generic DDF ARRAY entry + */ + struct ddf_super *ddf = st->sb; + struct mdinfo info; + unsigned int i; + char nbuf[64]; + getinfo_super_ddf(st, &info); + fname_from_uuid(st, &info, nbuf, ':'); + + for (i = 0; i < __be16_to_cpu(ddf->virt->max_vdes); i++) { + struct virtual_entry *ve = &ddf->virt->entries[i]; + struct vcl vcl; + char nbuf1[64]; + if (all_ff(ve->guid)) + continue; + memcpy(vcl.conf.guid, ve->guid, DDF_GUID_LEN); + ddf->currentconf =&vcl; + uuid_from_super_ddf(st, info.uuid); + fname_from_uuid(st, &info, nbuf1, ':'); + printf("ARRAY container=%s member=%d UUID=%s\n", + nbuf+5, i, nbuf1+5); + } +} + +static void export_examine_super_ddf(struct supertype *st) +{ + struct mdinfo info; + char nbuf[64]; + getinfo_super_ddf(st, &info); + fname_from_uuid(st, &info, nbuf, ':'); + printf("MD_METADATA=ddf\n"); + 
printf("MD_LEVEL=container\n"); + printf("MD_UUID=%s\n", nbuf+5); +} + + +static void detail_super_ddf(struct supertype *st, char *homehost) +{ + /* FIXME later + * Could print DDF GUID + * Need to find which array + * If whole, briefly list all arrays + * If one, give name + */ +} + +static void brief_detail_super_ddf(struct supertype *st) +{ + /* FIXME I really need to know which array we are detailing. + * Can that be stored in ddf_super?? + */ +// struct ddf_super *ddf = st->sb; + struct mdinfo info; + char nbuf[64]; + getinfo_super_ddf(st, &info); + fname_from_uuid(st, &info, nbuf,':'); + printf(" UUID=%s", nbuf + 5); +} +#endif + +static int match_home_ddf(struct supertype *st, char *homehost) +{ + /* It matches 'this' host if the controller is a + * Linux-MD controller with vendor_data matching + * the hostname + */ + struct ddf_super *ddf = st->sb; + unsigned int len; + + if (!homehost) + return 0; + len = strlen(homehost); + + return (memcmp(ddf->controller.guid, T10, 8) == 0 && + len < sizeof(ddf->controller.vendor_data) && + memcmp(ddf->controller.vendor_data, homehost,len) == 0 && + ddf->controller.vendor_data[len] == 0); +} + +#ifndef MDASSEMBLE +static struct vd_config *find_vdcr(struct ddf_super *ddf, unsigned int inst) +{ + struct vcl *v; + + for (v = ddf->conflist; v; v = v->next) + if (inst == v->vcnum) + return &v->conf; + return NULL; +} +#endif + +static int find_phys(struct ddf_super *ddf, __u32 phys_refnum) +{ + /* Find the entry in phys_disk which has the given refnum + * and return it's index + */ + unsigned int i; + for (i = 0; i < __be16_to_cpu(ddf->phys->max_pdes); i++) + if (ddf->phys->entries[i].refnum == phys_refnum) + return i; + return -1; +} + +static void uuid_from_super_ddf(struct supertype *st, int uuid[4]) +{ + /* The uuid returned here is used for: + * uuid to put into bitmap file (Create, Grow) + * uuid for backup header when saving critical section (Grow) + * comparing uuids when re-adding a device into an array + * In these 
cases the uuid required is that of the data-array, + * not the device-set. + * uuid to recognise same set when adding a missing device back + * to an array. This is a uuid for the device-set. + * + * For each of these we can make do with a truncated + * or hashed uuid rather than the original, as long as + * everyone agrees. + * In the case of SVD we assume the BVD is of interest, + * though that might be the case if a bitmap were made for + * a mirrored SVD - worry about that later. + * So we need to find the VD configuration record for the + * relevant BVD and extract the GUID and Secondary_Element_Seq. + * The first 16 bytes of the sha1 of these is used. + */ + struct ddf_super *ddf = st->sb; + struct vcl *vcl = ddf->currentconf; + char *guid; + char buf[20]; + struct sha1_ctx ctx; + + if (vcl) + guid = vcl->conf.guid; + else + guid = ddf->anchor.guid; + + sha1_init_ctx(&ctx); + sha1_process_bytes(guid, DDF_GUID_LEN, &ctx); + sha1_finish_ctx(&ctx, buf); + memcpy(uuid, buf, 4*4); +} + +static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info); + +static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info) +{ + struct ddf_super *ddf = st->sb; + + if (ddf->currentconf) { + getinfo_super_ddf_bvd(st, info); + return; + } + + info->array.raid_disks = __be16_to_cpu(ddf->phys->used_pdes); + info->array.level = LEVEL_CONTAINER; + info->array.layout = 0; + info->array.md_minor = -1; + info->array.ctime = DECADE + __be32_to_cpu(*(__u32*) + (ddf->anchor.guid+16)); + info->array.utime = 0; + info->array.chunk_size = 0; + info->container_enough = 1; + + + info->disk.major = 0; + info->disk.minor = 0; + if (ddf->dlist) { + info->disk.number = __be32_to_cpu(ddf->dlist->disk.refnum); + info->disk.raid_disk = find_phys(ddf, ddf->dlist->disk.refnum); + + info->data_offset = __be64_to_cpu(ddf->phys-> + entries[info->disk.raid_disk]. 
+ config_size); + info->component_size = ddf->dlist->size - info->data_offset; + } else { + info->disk.number = -1; + info->disk.raid_disk = -1; +// info->disk.raid_disk = find refnum in the table and use index; + } + info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE); + + + info->recovery_start = MaxSector; + info->reshape_active = 0; + info->name[0] = 0; + + info->array.major_version = -1; + info->array.minor_version = -2; + strcpy(info->text_version, "ddf"); + info->safe_mode_delay = 0; + + uuid_from_super_ddf(st, info->uuid); + +} + +static int rlq_to_layout(int rlq, int prl, int raiddisks); + +static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info) +{ + struct ddf_super *ddf = st->sb; + struct vcl *vc = ddf->currentconf; + int cd = ddf->currentdev; + int j; + struct dl *dl; + + /* FIXME this returns BVD info - what if we want SVD ?? */ + + info->array.raid_disks = __be16_to_cpu(vc->conf.prim_elmnt_count); + info->array.level = map_num1(ddf_level_num, vc->conf.prl); + info->array.layout = rlq_to_layout(vc->conf.rlq, vc->conf.prl, + info->array.raid_disks); + info->array.md_minor = -1; + info->array.ctime = DECADE + + __be32_to_cpu(*(__u32*)(vc->conf.guid+16)); + info->array.utime = DECADE + __be32_to_cpu(vc->conf.timestamp); + info->array.chunk_size = 512 << vc->conf.chunk_shift; + info->custom_array_size = 0; + + if (cd >= 0 && (unsigned)cd < ddf->mppe) { + info->data_offset = __be64_to_cpu(vc->lba_offset[cd]); + if (vc->block_sizes) + info->component_size = vc->block_sizes[cd]; + else + info->component_size = __be64_to_cpu(vc->conf.blocks); + } + + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->raiddisk == info->disk.raid_disk) + break; + info->disk.major = 0; + info->disk.minor = 0; + if (dl) { + info->disk.major = dl->major; + info->disk.minor = dl->minor; + } +// info->disk.number = __be32_to_cpu(ddf->disk.refnum); +// info->disk.raid_disk = find refnum in the table and use index; +// info->disk.state = ???; + + 
info->container_member = ddf->currentconf->vcnum; + + info->recovery_start = MaxSector; + info->resync_start = 0; + info->reshape_active = 0; + if (!(ddf->virt->entries[info->container_member].state + & DDF_state_inconsistent) && + (ddf->virt->entries[info->container_member].init_state + & DDF_initstate_mask) + == DDF_init_full) + info->resync_start = MaxSector; + + uuid_from_super_ddf(st, info->uuid); + + info->container_member = atoi(st->subarray); + info->array.major_version = -1; + info->array.minor_version = -2; + sprintf(info->text_version, "/%s/%s", + devnum2devname(st->container_dev), + st->subarray); + info->safe_mode_delay = 200; + + memcpy(info->name, ddf->virt->entries[info->container_member].name, 16); + info->name[16]=0; + for(j=0; j<16; j++) + if (info->name[j] == ' ') + info->name[j] = 0; +} + + +static int update_super_ddf(struct supertype *st, struct mdinfo *info, + char *update, + char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* For 'assemble' and 'force' we need to return non-zero if any + * change was made. For others, the return value is ignored. + * Update options are: + * force-one : This device looks a bit old but needs to be included, + * update age info appropriately. + * assemble: clear any 'faulty' flag to allow this device to + * be assembled. + * force-array: Array is degraded but being forced, mark it clean + * if that will be needed to assemble it. + * + * newdev: not used ???? + * grow: Array has gained a new device - this is currently for + * linear only + * resync: mark as dirty so a resync will happen. + * uuid: Change the uuid of the array to match what is given + * homehost: update the recorded homehost + * name: update the name - preserving the homehost + * _reshape_progress: record new reshape_progress position. + * + * Following are not relevant for this version: + * sparc2.2 : update from old dodgey metadata + * super-minor: change the preferred_minor number + * summaries: update redundant counters. 
+ */ + int rv = 0; +// struct ddf_super *ddf = st->sb; +// struct vd_config *vd = find_vdcr(ddf, info->container_member); +// struct virtual_entry *ve = find_ve(ddf); + + /* we don't need to handle "force-*" or "assemble" as + * there is no need to 'trick' the kernel. We the metadata is + * first updated to activate the array, all the implied modifications + * will just happen. + */ + + if (strcmp(update, "grow") == 0) { + /* FIXME */ + } + if (strcmp(update, "resync") == 0) { +// info->resync_checkpoint = 0; + } + /* We ignore UUID updates as they make even less sense + * with DDF + */ + if (strcmp(update, "homehost") == 0) { + /* homehost is stored in controller->vendor_data, + * or it is when we are the vendor + */ +// if (info->vendor_is_local) +// strcpy(ddf->controller.vendor_data, homehost); + } + if (strcmp(update, "name") == 0) { + /* name is stored in virtual_entry->name */ +// memset(ve->name, ' ', 16); +// strncpy(ve->name, info->name, 16); + } + if (strcmp(update, "_reshape_progress") == 0) { + /* We don't support reshape yet */ + } + +// update_all_csum(ddf); + + return rv; +} + +static void make_header_guid(char *guid) +{ + __u32 stamp; + /* Create a DDF Header of Virtual Disk GUID */ + + /* 24 bytes of fiction required. + * first 8 are a 'vendor-id' - "Linux-MD" + * next 8 are controller type.. 
how about 0X DEAD BEEF 0000 0000 + * Remaining 8 random number plus timestamp + */ + memcpy(guid, T10, sizeof(T10)); + stamp = __cpu_to_be32(0xdeadbeef); + memcpy(guid+8, &stamp, 4); + stamp = __cpu_to_be32(0); + memcpy(guid+12, &stamp, 4); + stamp = __cpu_to_be32(time(0) - DECADE); + memcpy(guid+16, &stamp, 4); + stamp = random32(); + memcpy(guid+20, &stamp, 4); +} + +static int init_super_ddf_bvd(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, + char *name, char *homehost, + int *uuid); + +static int init_super_ddf(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, char *name, char *homehost, + int *uuid) +{ + /* This is primarily called by Create when creating a new array. + * We will then get add_to_super called for each component, and then + * write_init_super called to write it out to each device. + * For DDF, Create can create on fresh devices or on a pre-existing + * array. + * To create on a pre-existing array a different method will be called. + * This one is just for fresh drives. + * + * We need to create the entire 'ddf' structure which includes: + * DDF headers - these are easy. + * Controller data - a Sector describing this controller .. not that + * this is a controller exactly. + * Physical Disk Record - one entry per device, so + * leave plenty of space. + * Virtual Disk Records - again, just leave plenty of space. + * This just lists VDs, doesn't give details + * Config records - describes the VDs that use this disk + * DiskData - describes 'this' device. + * BadBlockManagement - empty + * Diag Space - empty + * Vendor Logs - Could we put bitmaps here? 
+ * + */ + struct ddf_super *ddf; + char hostname[17]; + int hostlen; + int max_phys_disks, max_virt_disks; + unsigned long long sector; + int clen; + int i; + int pdsize, vdsize; + struct phys_disk *pd; + struct virtual_disk *vd; + + if (st->sb) + return init_super_ddf_bvd(st, info, size, name, homehost, uuid); + + if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) { + fprintf(stderr, Name ": %s could not allocate superblock\n", __func__); + return 0; + } + memset(ddf, 0, sizeof(*ddf)); + ddf->dlist = NULL; /* no physical disks yet */ + ddf->conflist = NULL; /* No virtual disks yet */ + st->sb = ddf; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } + + /* At least 32MB *must* be reserved for the ddf. So let's just + * start 32MB from the end, and put the primary header there. + * Don't do secondary for now. + * We don't know exactly where that will be yet as it could be + * different on each device. To just set up the lengths. + * + */ + + ddf->anchor.magic = DDF_HEADER_MAGIC; + make_header_guid(ddf->anchor.guid); + + memcpy(ddf->anchor.revision, DDF_REVISION_2, 8); + ddf->anchor.seq = __cpu_to_be32(1); + ddf->anchor.timestamp = __cpu_to_be32(time(0) - DECADE); + ddf->anchor.openflag = 0xFF; + ddf->anchor.foreignflag = 0; + ddf->anchor.enforcegroups = 0; /* Is this best?? */ + ddf->anchor.pad0 = 0xff; + memset(ddf->anchor.pad1, 0xff, 12); + memset(ddf->anchor.header_ext, 0xff, 32); + ddf->anchor.primary_lba = ~(__u64)0; + ddf->anchor.secondary_lba = ~(__u64)0; + ddf->anchor.type = DDF_HEADER_ANCHOR; + memset(ddf->anchor.pad2, 0xff, 3); + ddf->anchor.workspace_len = __cpu_to_be32(32768); /* Must be reserved */ + ddf->anchor.workspace_lba = ~(__u64)0; /* Put this at bottom + of 32M reserved.. */ + max_phys_disks = 1023; /* Should be enough */ + ddf->anchor.max_pd_entries = __cpu_to_be16(max_phys_disks); + max_virt_disks = 255; + ddf->anchor.max_vd_entries = __cpu_to_be16(max_virt_disks); /* ?? 
*/ + ddf->anchor.max_partitions = __cpu_to_be16(64); /* ?? */ + ddf->max_part = 64; + ddf->mppe = 256; + ddf->conf_rec_len = 1 + ROUND_UP(ddf->mppe * (4+8), 512)/512; + ddf->anchor.config_record_len = __cpu_to_be16(ddf->conf_rec_len); + ddf->anchor.max_primary_element_entries = __cpu_to_be16(ddf->mppe); + memset(ddf->anchor.pad3, 0xff, 54); + /* controller sections is one sector long immediately + * after the ddf header */ + sector = 1; + ddf->anchor.controller_section_offset = __cpu_to_be32(sector); + ddf->anchor.controller_section_length = __cpu_to_be32(1); + sector += 1; + + /* phys is 8 sectors after that */ + pdsize = ROUND_UP(sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)*max_phys_disks, + 512); + switch(pdsize/512) { + case 2: case 8: case 32: case 128: case 512: break; + default: abort(); + } + ddf->anchor.phys_section_offset = __cpu_to_be32(sector); + ddf->anchor.phys_section_length = + __cpu_to_be32(pdsize/512); /* max_primary_element_entries/8 */ + sector += pdsize/512; + + /* virt is another 32 sectors */ + vdsize = ROUND_UP(sizeof(struct virtual_disk) + + sizeof(struct virtual_entry) * max_virt_disks, + 512); + switch(vdsize/512) { + case 2: case 8: case 32: case 128: case 512: break; + default: abort(); + } + ddf->anchor.virt_section_offset = __cpu_to_be32(sector); + ddf->anchor.virt_section_length = + __cpu_to_be32(vdsize/512); /* max_vd_entries/8 */ + sector += vdsize/512; + + clen = ddf->conf_rec_len * (ddf->max_part+1); + ddf->anchor.config_section_offset = __cpu_to_be32(sector); + ddf->anchor.config_section_length = __cpu_to_be32(clen); + sector += clen; + + ddf->anchor.data_section_offset = __cpu_to_be32(sector); + ddf->anchor.data_section_length = __cpu_to_be32(1); + sector += 1; + + ddf->anchor.bbm_section_length = __cpu_to_be32(0); + ddf->anchor.bbm_section_offset = __cpu_to_be32(0xFFFFFFFF); + ddf->anchor.diag_space_length = __cpu_to_be32(0); + ddf->anchor.diag_space_offset = __cpu_to_be32(0xFFFFFFFF); + 
ddf->anchor.vendor_length = __cpu_to_be32(0); + ddf->anchor.vendor_offset = __cpu_to_be32(0xFFFFFFFF); + + memset(ddf->anchor.pad4, 0xff, 256); + + memcpy(&ddf->primary, &ddf->anchor, 512); + memcpy(&ddf->secondary, &ddf->anchor, 512); + + ddf->primary.openflag = 1; /* I guess.. */ + ddf->primary.type = DDF_HEADER_PRIMARY; + + ddf->secondary.openflag = 1; /* I guess.. */ + ddf->secondary.type = DDF_HEADER_SECONDARY; + + ddf->active = &ddf->primary; + + ddf->controller.magic = DDF_CONTROLLER_MAGIC; + + /* 24 more bytes of fiction required. + * first 8 are a 'vendor-id' - "Linux-MD" + * Remaining 16 are serial number.... maybe a hostname would do? + */ + memcpy(ddf->controller.guid, T10, sizeof(T10)); + gethostname(hostname, sizeof(hostname)); + hostname[sizeof(hostname) - 1] = 0; + hostlen = strlen(hostname); + memcpy(ddf->controller.guid + 24 - hostlen, hostname, hostlen); + for (i = strlen(T10) ; i+hostlen < 24; i++) + ddf->controller.guid[i] = ' '; + + ddf->controller.type.vendor_id = __cpu_to_be16(0xDEAD); + ddf->controller.type.device_id = __cpu_to_be16(0xBEEF); + ddf->controller.type.sub_vendor_id = 0; + ddf->controller.type.sub_device_id = 0; + memcpy(ddf->controller.product_id, "What Is My PID??", 16); + memset(ddf->controller.pad, 0xff, 8); + memset(ddf->controller.vendor_data, 0xff, 448); + if (homehost && strlen(homehost) < 440) + strcpy((char*)ddf->controller.vendor_data, homehost); + + if (posix_memalign((void**)&pd, 512, pdsize) != 0) { + fprintf(stderr, Name ": %s could not allocate pd\n", __func__); + return 0; + } + ddf->phys = pd; + ddf->pdsize = pdsize; + + memset(pd, 0xff, pdsize); + memset(pd, 0, sizeof(*pd)); + pd->magic = DDF_PHYS_RECORDS_MAGIC; + pd->used_pdes = __cpu_to_be16(0); + pd->max_pdes = __cpu_to_be16(max_phys_disks); + memset(pd->pad, 0xff, 52); + + if (posix_memalign((void**)&vd, 512, vdsize) != 0) { + fprintf(stderr, Name ": %s could not allocate vd\n", __func__); + return 0; + } + ddf->virt = vd; + ddf->vdsize = vdsize; + 
memset(vd, 0, vdsize); + vd->magic = DDF_VIRT_RECORDS_MAGIC; + vd->populated_vdes = __cpu_to_be16(0); + vd->max_vdes = __cpu_to_be16(max_virt_disks); + memset(vd->pad, 0xff, 52); + + for (i=0; ientries[i], 0xff, sizeof(struct virtual_entry)); + + st->sb = ddf; + ddf->updates_pending = 1; + return 1; +} + +static int chunk_to_shift(int chunksize) +{ + return ffs(chunksize/512)-1; +} + +static int level_to_prl(int level) +{ + switch (level) { + case LEVEL_LINEAR: return DDF_CONCAT; + case 0: return DDF_RAID0; + case 1: return DDF_RAID1; + case 4: return DDF_RAID4; + case 5: return DDF_RAID5; + case 6: return DDF_RAID6; + default: return -1; + } +} +static int layout_to_rlq(int level, int layout, int raiddisks) +{ + switch(level) { + case 0: + return DDF_RAID0_SIMPLE; + case 1: + switch(raiddisks) { + case 2: return DDF_RAID1_SIMPLE; + case 3: return DDF_RAID1_MULTI; + default: return -1; + } + case 4: + switch(layout) { + case 0: return DDF_RAID4_N; + } + break; + case 5: + switch(layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + return DDF_RAID5_N_RESTART; + case ALGORITHM_RIGHT_ASYMMETRIC: + return DDF_RAID5_0_RESTART; + case ALGORITHM_LEFT_SYMMETRIC: + return DDF_RAID5_N_CONTINUE; + case ALGORITHM_RIGHT_SYMMETRIC: + return -1; /* not mentioned in standard */ + } + case 6: + switch(layout) { + case ALGORITHM_ROTATING_N_RESTART: + return DDF_RAID5_N_RESTART; + case ALGORITHM_ROTATING_ZERO_RESTART: + return DDF_RAID6_0_RESTART; + case ALGORITHM_ROTATING_N_CONTINUE: + return DDF_RAID5_N_CONTINUE; + } + } + return -1; +} + +static int rlq_to_layout(int rlq, int prl, int raiddisks) +{ + switch(prl) { + case DDF_RAID0: + return 0; /* hopefully rlq == DDF_RAID0_SIMPLE */ + case DDF_RAID1: + return 0; /* hopefully rlq == SIMPLE or MULTI depending + on raiddisks*/ + case DDF_RAID4: + switch(rlq) { + case DDF_RAID4_N: + return 0; + default: + /* not supported */ + return -1; /* FIXME this isn't checked */ + } + case DDF_RAID5: + switch(rlq) { + case DDF_RAID5_N_RESTART: + return 
ALGORITHM_LEFT_ASYMMETRIC; + case DDF_RAID5_0_RESTART: + return ALGORITHM_RIGHT_ASYMMETRIC; + case DDF_RAID5_N_CONTINUE: + return ALGORITHM_LEFT_SYMMETRIC; + default: + return -1; + } + case DDF_RAID6: + switch(rlq) { + case DDF_RAID5_N_RESTART: + return ALGORITHM_ROTATING_N_RESTART; + case DDF_RAID6_0_RESTART: + return ALGORITHM_ROTATING_ZERO_RESTART; + case DDF_RAID5_N_CONTINUE: + return ALGORITHM_ROTATING_N_CONTINUE; + default: + return -1; + } + } + return -1; +} + +#ifndef MDASSEMBLE +struct extent { + unsigned long long start, size; +}; +static int cmp_extent(const void *av, const void *bv) +{ + const struct extent *a = av; + const struct extent *b = bv; + if (a->start < b->start) + return -1; + if (a->start > b->start) + return 1; + return 0; +} + +static struct extent *get_extents(struct ddf_super *ddf, struct dl *dl) +{ + /* find a list of used extents on the give physical device + * (dnum) of the given ddf. + * Return a malloced array of 'struct extent' + +FIXME ignore DDF_Legacy devices? + + */ + struct extent *rv; + int n = 0; + unsigned int i, j; + + rv = malloc(sizeof(struct extent) * (ddf->max_part + 2)); + if (!rv) + return NULL; + + for (i = 0; i < ddf->max_part; i++) { + struct vcl *v = dl->vlist[i]; + if (v == NULL) + continue; + for (j = 0; j < v->conf.prim_elmnt_count; j++) + if (v->conf.phys_refnum[j] == dl->disk.refnum) { + /* This device plays role 'j' in 'v'. */ + rv[n].start = __be64_to_cpu(v->lba_offset[j]); + rv[n].size = __be64_to_cpu(v->conf.blocks); + n++; + break; + } + } + qsort(rv, n, sizeof(*rv), cmp_extent); + + rv[n].start = __be64_to_cpu(ddf->phys->entries[dl->pdnum].config_size); + rv[n].size = 0; + return rv; +} +#endif + +static int init_super_ddf_bvd(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, + char *name, char *homehost, + int *uuid) +{ + /* We are creating a BVD inside a pre-existing container. + * so st->sb is already set. 
+ * We need to create a new vd_config and a new virtual_entry + */ + struct ddf_super *ddf = st->sb; + unsigned int venum; + struct virtual_entry *ve; + struct vcl *vcl; + struct vd_config *vc; + + if (__be16_to_cpu(ddf->virt->populated_vdes) + >= __be16_to_cpu(ddf->virt->max_vdes)) { + fprintf(stderr, Name": This ddf already has the " + "maximum of %d virtual devices\n", + __be16_to_cpu(ddf->virt->max_vdes)); + return 0; + } + + for (venum = 0; venum < __be16_to_cpu(ddf->virt->max_vdes); venum++) + if (all_ff(ddf->virt->entries[venum].guid)) + break; + if (venum == __be16_to_cpu(ddf->virt->max_vdes)) { + fprintf(stderr, Name ": Cannot find spare slot for " + "virtual disk - DDF is corrupt\n"); + return 0; + } + ve = &ddf->virt->entries[venum]; + + /* A Virtual Disk GUID contains the T10 Vendor ID, controller type, + * timestamp, random number + */ + make_header_guid(ve->guid); + ve->unit = __cpu_to_be16(info->md_minor); + ve->pad0 = 0xFFFF; + ve->guid_crc = crc32(0, (unsigned char*)ddf->anchor.guid, DDF_GUID_LEN); + ve->type = 0; + ve->state = DDF_state_degraded; /* Will be modified as devices are added */ + if (info->state & 1) /* clean */ + ve->init_state = DDF_init_full; + else + ve->init_state = DDF_init_not; + + memset(ve->pad1, 0xff, 14); + memset(ve->name, ' ', 16); + if (name) + strncpy(ve->name, name, 16); + ddf->virt->populated_vdes = + __cpu_to_be16(__be16_to_cpu(ddf->virt->populated_vdes)+1); + + /* Now create a new vd_config */ + if (posix_memalign((void**)&vcl, 512, + (offsetof(struct vcl, conf) + ddf->conf_rec_len * 512)) != 0) { + fprintf(stderr, Name ": %s could not allocate vd_config\n", __func__); + return 0; + } + vcl->lba_offset = (__u64*) &vcl->conf.phys_refnum[ddf->mppe]; + vcl->vcnum = venum; + sprintf(st->subarray, "%d", venum); + vcl->block_sizes = NULL; /* FIXME not for CONCAT */ + + vc = &vcl->conf; + + vc->magic = DDF_VD_CONF_MAGIC; + memcpy(vc->guid, ve->guid, DDF_GUID_LEN); + vc->timestamp = __cpu_to_be32(time(0)-DECADE); + 
vc->seqnum = __cpu_to_be32(1); + memset(vc->pad0, 0xff, 24); + vc->prim_elmnt_count = __cpu_to_be16(info->raid_disks); + vc->chunk_shift = chunk_to_shift(info->chunk_size); + vc->prl = level_to_prl(info->level); + vc->rlq = layout_to_rlq(info->level, info->layout, info->raid_disks); + vc->sec_elmnt_count = 1; + vc->sec_elmnt_seq = 0; + vc->srl = 0; + vc->blocks = __cpu_to_be64(info->size * 2); + vc->array_blocks = __cpu_to_be64( + calc_array_size(info->level, info->raid_disks, info->layout, + info->chunk_size, info->size*2)); + memset(vc->pad1, 0xff, 8); + vc->spare_refs[0] = 0xffffffff; + vc->spare_refs[1] = 0xffffffff; + vc->spare_refs[2] = 0xffffffff; + vc->spare_refs[3] = 0xffffffff; + vc->spare_refs[4] = 0xffffffff; + vc->spare_refs[5] = 0xffffffff; + vc->spare_refs[6] = 0xffffffff; + vc->spare_refs[7] = 0xffffffff; + memset(vc->cache_pol, 0, 8); + vc->bg_rate = 0x80; + memset(vc->pad2, 0xff, 3); + memset(vc->pad3, 0xff, 52); + memset(vc->pad4, 0xff, 192); + memset(vc->v0, 0xff, 32); + memset(vc->v1, 0xff, 32); + memset(vc->v2, 0xff, 16); + memset(vc->v3, 0xff, 16); + memset(vc->vendor, 0xff, 32); + + memset(vc->phys_refnum, 0xff, 4*ddf->mppe); + memset(vc->phys_refnum+ddf->mppe, 0x00, 8*ddf->mppe); + + vcl->next = ddf->conflist; + ddf->conflist = vcl; + ddf->currentconf = vcl; + ddf->updates_pending = 1; + return 1; +} + +#ifndef MDASSEMBLE +static void add_to_super_ddf_bvd(struct supertype *st, + mdu_disk_info_t *dk, int fd, char *devname) +{ + /* fd and devname identify a device with-in the ddf container (st). + * dk identifies a location in the new BVD. + * We need to find suitable free space in that device and update + * the phys_refnum and lba_offset for the newly created vd_config. + * We might also want to update the type in the phys_disk + * section. 
+ * + * Alternately: fd == -1 and we have already chosen which device to + * use and recorded in dlist->raid_disk; + */ + struct dl *dl; + struct ddf_super *ddf = st->sb; + struct vd_config *vc; + __u64 *lba_offset; + unsigned int working; + unsigned int i; + unsigned long long blocks, pos, esize; + struct extent *ex; + + if (fd == -1) { + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->raiddisk == dk->raid_disk) + break; + } else { + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + } + if (!dl || ! (dk->state & (1<currentconf->conf; + lba_offset = ddf->currentconf->lba_offset; + + ex = get_extents(ddf, dl); + if (!ex) + return; + + i = 0; pos = 0; + blocks = __be64_to_cpu(vc->blocks); + if (ddf->currentconf->block_sizes) + blocks = ddf->currentconf->block_sizes[dk->raid_disk]; + + do { + esize = ex[i].start - pos; + if (esize >= blocks) + break; + pos = ex[i].start + ex[i].size; + i++; + } while (ex[i-1].size); + + free(ex); + if (esize < blocks) + return; + + ddf->currentdev = dk->raid_disk; + vc->phys_refnum[dk->raid_disk] = dl->disk.refnum; + lba_offset[dk->raid_disk] = __cpu_to_be64(pos); + + for (i = 0; i < ddf->max_part ; i++) + if (dl->vlist[i] == NULL) + break; + if (i == ddf->max_part) + return; + dl->vlist[i] = ddf->currentconf; + + if (fd >= 0) + dl->fd = fd; + if (devname) + dl->devname = devname; + + /* Check how many working raid_disks, and if we can mark + * array as optimal yet + */ + working = 0; + + for (i = 0; i < __be16_to_cpu(vc->prim_elmnt_count); i++) + if (vc->phys_refnum[i] != 0xffffffff) + working++; + + /* Find which virtual_entry */ + i = ddf->currentconf->vcnum; + if (working == __be16_to_cpu(vc->prim_elmnt_count)) + ddf->virt->entries[i].state = + (ddf->virt->entries[i].state & ~DDF_state_mask) + | DDF_state_optimal; + + if (vc->prl == DDF_RAID6 && + working+1 == __be16_to_cpu(vc->prim_elmnt_count)) + ddf->virt->entries[i].state = + (ddf->virt->entries[i].state & 
~DDF_state_mask) + | DDF_state_part_optimal; + + ddf->phys->entries[dl->pdnum].type &= ~__cpu_to_be16(DDF_Global_Spare); + ddf->phys->entries[dl->pdnum].type |= __cpu_to_be16(DDF_Active_in_VD); + ddf->updates_pending = 1; +} + +/* add a device to a container, either while creating it or while + * expanding a pre-existing container + */ +static int add_to_super_ddf(struct supertype *st, + mdu_disk_info_t *dk, int fd, char *devname) +{ + struct ddf_super *ddf = st->sb; + struct dl *dd; + time_t now; + struct tm *tm; + unsigned long long size; + struct phys_disk_entry *pde; + unsigned int n, i; + struct stat stb; + + if (ddf->currentconf) { + add_to_super_ddf_bvd(st, dk, fd, devname); + return 0; + } + + /* This is device numbered dk->number. We need to create + * a phys_disk entry and a more detailed disk_data entry. + */ + fstat(fd, &stb); + if (posix_memalign((void**)&dd, 512, + sizeof(*dd) + sizeof(dd->vlist[0]) * ddf->max_part) != 0) { + fprintf(stderr, Name + ": %s could allocate buffer for new disk, aborting\n", + __func__); + return 1; + } + dd->major = major(stb.st_rdev); + dd->minor = minor(stb.st_rdev); + dd->devname = devname; + dd->fd = fd; + dd->spare = NULL; + + dd->disk.magic = DDF_PHYS_DATA_MAGIC; + now = time(0); + tm = localtime(&now); + sprintf(dd->disk.guid, "%8s%04d%02d%02d", + T10, tm->tm_year+1900, tm->tm_mon+1, tm->tm_mday); + *(__u32*)(dd->disk.guid + 16) = random32(); + *(__u32*)(dd->disk.guid + 20) = random32(); + + do { + /* Cannot be bothered finding a CRC of some irrelevant details*/ + dd->disk.refnum = random32(); + for (i = __be16_to_cpu(ddf->active->max_pd_entries); + i > 0; i--) + if (ddf->phys->entries[i-1].refnum == dd->disk.refnum) + break; + } while (i > 0); + + dd->disk.forced_ref = 1; + dd->disk.forced_guid = 1; + memset(dd->disk.vendor, ' ', 32); + memcpy(dd->disk.vendor, "Linux", 5); + memset(dd->disk.pad, 0xff, 442); + for (i = 0; i < ddf->max_part ; i++) + dd->vlist[i] = NULL; + + n = __be16_to_cpu(ddf->phys->used_pdes); + 
pde = &ddf->phys->entries[n]; + dd->pdnum = n; + + if (st->update_tail) { + int len = (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)); + struct phys_disk *pd; + + pd = malloc(len); + pd->magic = DDF_PHYS_RECORDS_MAGIC; + pd->used_pdes = __cpu_to_be16(n); + pde = &pd->entries[0]; + dd->mdupdate = pd; + } else { + n++; + ddf->phys->used_pdes = __cpu_to_be16(n); + } + + memcpy(pde->guid, dd->disk.guid, DDF_GUID_LEN); + pde->refnum = dd->disk.refnum; + pde->type = __cpu_to_be16(DDF_Forced_PD_GUID | DDF_Global_Spare); + pde->state = __cpu_to_be16(DDF_Online); + get_dev_size(fd, NULL, &size); + /* We are required to reserve 32Meg, and record the size in sectors */ + pde->config_size = __cpu_to_be64( (size - 32*1024*1024) / 512); + sprintf(pde->path, "%17.17s","Information: nil") ; + memset(pde->pad, 0xff, 6); + + dd->size = size >> 9; + if (st->update_tail) { + dd->next = ddf->add_list; + ddf->add_list = dd; + } else { + dd->next = ddf->dlist; + ddf->dlist = dd; + ddf->updates_pending = 1; + } + + return 0; +} + +/* + * This is the write_init_super method for a ddf container. It is + * called when creating a container or adding another device to a + * container. + */ + +static unsigned char null_conf[4096+512]; + +static int __write_init_super_ddf(struct supertype *st, int do_close) +{ + + struct ddf_super *ddf = st->sb; + int i; + struct dl *d; + int n_config; + int conf_size; + int attempts = 0; + int successes = 0; + unsigned long long size, sector; + + /* try to write updated metadata, + * if we catch a failure move on to the next disk + */ + for (d = ddf->dlist; d; d=d->next) { + int fd = d->fd; + + if (fd < 0) + continue; + + attempts++; + /* We need to fill in the primary, (secondary) and workspace + * lba's in the headers, set their checksums, + * Also checksum phys, virt.... + * + * Then write everything out, finally the anchor is written. 
+ */ + get_dev_size(fd, NULL, &size); + size /= 512; + ddf->anchor.workspace_lba = __cpu_to_be64(size - 32*1024*2); + ddf->anchor.primary_lba = __cpu_to_be64(size - 16*1024*2); + ddf->anchor.seq = __cpu_to_be32(1); + memcpy(&ddf->primary, &ddf->anchor, 512); + memcpy(&ddf->secondary, &ddf->anchor, 512); + + ddf->anchor.openflag = 0xFF; /* 'open' means nothing */ + ddf->anchor.seq = 0xFFFFFFFF; /* no sequencing in anchor */ + ddf->anchor.crc = calc_crc(&ddf->anchor, 512); + + ddf->primary.openflag = 0; + ddf->primary.type = DDF_HEADER_PRIMARY; + + ddf->secondary.openflag = 0; + ddf->secondary.type = DDF_HEADER_SECONDARY; + + ddf->primary.crc = calc_crc(&ddf->primary, 512); + ddf->secondary.crc = calc_crc(&ddf->secondary, 512); + + sector = size - 16*1024*2; + lseek64(fd, sector<<9, 0); + if (write(fd, &ddf->primary, 512) < 0) + continue; + + ddf->controller.crc = calc_crc(&ddf->controller, 512); + if (write(fd, &ddf->controller, 512) < 0) + continue; + + ddf->phys->crc = calc_crc(ddf->phys, ddf->pdsize); + + if (write(fd, ddf->phys, ddf->pdsize) < 0) + continue; + + ddf->virt->crc = calc_crc(ddf->virt, ddf->vdsize); + if (write(fd, ddf->virt, ddf->vdsize) < 0) + continue; + + /* Now write lots of config records. 
*/ + n_config = ddf->max_part; + conf_size = ddf->conf_rec_len * 512; + for (i = 0 ; i <= n_config ; i++) { + struct vcl *c = d->vlist[i]; + if (i == n_config) + c = (struct vcl*)d->spare; + + if (c) { + c->conf.crc = calc_crc(&c->conf, conf_size); + if (write(fd, &c->conf, conf_size) < 0) + break; + } else { + char *null_aligned = (char*)((((unsigned long)null_conf)+511)&~511UL); + if (null_conf[0] != 0xff) + memset(null_conf, 0xff, sizeof(null_conf)); + unsigned int togo = conf_size; + while (togo > sizeof(null_conf)-512) { + if (write(fd, null_aligned, sizeof(null_conf)-512) < 0) + break; + togo -= sizeof(null_conf)-512; + } + if (write(fd, null_aligned, togo) < 0) + break; + } + } + if (i <= n_config) + continue; + d->disk.crc = calc_crc(&d->disk, 512); + if (write(fd, &d->disk, 512) < 0) + continue; + + /* Maybe do the same for secondary */ + + lseek64(fd, (size-1)*512, SEEK_SET); + if (write(fd, &ddf->anchor, 512) < 0) + continue; + successes++; + } + + if (do_close) + for (d = ddf->dlist; d; d=d->next) { + close(d->fd); + d->fd = -1; + } + + return attempts != successes; +} + +static int write_init_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + struct vcl *currentconf = ddf->currentconf; + + /* we are done with currentconf reset it to point st at the container */ + ddf->currentconf = NULL; + + if (st->update_tail) { + /* queue the virtual_disk and vd_config as metadata updates */ + struct virtual_disk *vd; + struct vd_config *vc; + int len; + + if (!currentconf) { + int len = (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)); + + /* adding a disk to the container. */ + if (!ddf->add_list) + return 0; + + append_metadata_update(st, ddf->add_list->mdupdate, len); + ddf->add_list->mdupdate = NULL; + return 0; + } + + /* Newly created VD */ + + /* First the virtual disk. 
We have a slightly fake header */ + len = sizeof(struct virtual_disk) + sizeof(struct virtual_entry); + vd = malloc(len); + *vd = *ddf->virt; + vd->entries[0] = ddf->virt->entries[currentconf->vcnum]; + vd->populated_vdes = __cpu_to_be16(currentconf->vcnum); + append_metadata_update(st, vd, len); + + /* Then the vd_config */ + len = ddf->conf_rec_len * 512; + vc = malloc(len); + memcpy(vc, ¤tconf->conf, len); + append_metadata_update(st, vc, len); + + /* FIXME I need to close the fds! */ + return 0; + } else { + struct dl *d; + for (d = ddf->dlist; d; d=d->next) + while (Kill(d->devname, NULL, 0, 1, 1) == 0); + return __write_init_super_ddf(st, 1); + } +} + +#endif + +static __u64 avail_size_ddf(struct supertype *st, __u64 devsize) +{ + /* We must reserve the last 32Meg */ + if (devsize <= 32*1024*2) + return 0; + return devsize - 32*1024*2; +} + +#ifndef MDASSEMBLE + +static int reserve_space(struct supertype *st, int raiddisks, + unsigned long long size, int chunk, + unsigned long long *freesize) +{ + /* Find 'raiddisks' spare extents at least 'size' big (but + * only caring about multiples of 'chunk') and remember + * them. + * If the cannot be found, fail. 
+ */ + struct dl *dl; + struct ddf_super *ddf = st->sb; + int cnt = 0; + + for (dl = ddf->dlist; dl ; dl=dl->next) { + dl->raiddisk = -1; + dl->esize = 0; + } + /* Now find largest extent on each device */ + for (dl = ddf->dlist ; dl ; dl=dl->next) { + struct extent *e = get_extents(ddf, dl); + unsigned long long pos = 0; + int i = 0; + int found = 0; + unsigned long long minsize = size; + + if (size == 0) + minsize = chunk; + + if (!e) + continue; + do { + unsigned long long esize; + esize = e[i].start - pos; + if (esize >= minsize) { + found = 1; + minsize = esize; + } + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + if (found) { + cnt++; + dl->esize = minsize; + } + free(e); + } + if (cnt < raiddisks) { + fprintf(stderr, Name ": not enough devices with space to create array.\n"); + return 0; /* No enough free spaces large enough */ + } + if (size == 0) { + /* choose the largest size of which there are at least 'raiddisk' */ + for (dl = ddf->dlist ; dl ; dl=dl->next) { + struct dl *dl2; + if (dl->esize <= size) + continue; + /* This is bigger than 'size', see if there are enough */ + cnt = 0; + for (dl2 = dl; dl2 ; dl2=dl2->next) + if (dl2->esize >= dl->esize) + cnt++; + if (cnt >= raiddisks) + size = dl->esize; + } + if (chunk) { + size = size / chunk; + size *= chunk; + } + *freesize = size; + if (size < 32) { + fprintf(stderr, Name ": not enough spare devices to create array.\n"); + return 0; + } + } + /* We have a 'size' of which there are enough spaces. 
+ * We simply do a first-fit */ + cnt = 0; + for (dl = ddf->dlist ; dl && cnt < raiddisks ; dl=dl->next) { + if (dl->esize < size) + continue; + + dl->raiddisk = cnt; + cnt++; + } + return 1; +} + + + +static int +validate_geometry_ddf_container(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + char *dev, unsigned long long *freesize, + int verbose); + +static int validate_geometry_ddf_bvd(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + char *dev, unsigned long long *freesize, + int verbose); + +static int validate_geometry_ddf(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd; + struct mdinfo *sra; + int cfd; + + /* ddf potentially supports lots of things, but it depends on + * what devices are offered (and maybe kernel version?) + * If given unused devices, we will make a container. + * If given devices in a container, we will make a BVD. + * If given BVDs, we make an SVD, changing all the GUIDs in the process. + */ + + if (level == LEVEL_CONTAINER) { + /* Must be a fresh device to add to a container */ + return validate_geometry_ddf_container(st, level, layout, + raiddisks, chunk, + size, dev, freesize, + verbose); + } + + if (!dev) { + /* Initial sanity check. Exclude illegal levels. */ + int i; + for (i=0; ddf_level_num[i].num1 != MAXINT; i++) + if (ddf_level_num[i].num2 == level) + break; + if (ddf_level_num[i].num1 == MAXINT) { + if (verbose) + fprintf(stderr, Name ": DDF does not support level %d arrays\n", + level); + return 0; + } + /* Should check layout? etc */ + + if (st->sb && freesize) { + /* --create was given a container to create in. + * So we need to check that there are enough + * free spaces and return the amount of space. 
+ * We may as well remember which drives were + * chosen so that add_to_super/getinfo_super + * can return them. + */ + return reserve_space(st, raiddisks, size, chunk, freesize); + } + return 1; + } + + if (st->sb) { + /* A container has already been opened, so we are + * creating in there. Maybe a BVD, maybe an SVD. + * Should make a distinction one day. + */ + return validate_geometry_ddf_bvd(st, level, layout, raiddisks, + chunk, size, dev, freesize, + verbose); + } + /* This is the first device for the array. + * If it is a container, we read it in and do automagic allocations, + * no other devices should be given. + * Otherwise it must be a member device of a container, and we + * do manual allocation. + * Later we should check for a BVD and make an SVD. + */ + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd >= 0) { + sra = sysfs_read(fd, 0, GET_VERSION); + close(fd); + if (sra && sra->array.major_version == -1 && + strcmp(sra->text_version, "ddf") == 0) { + + /* load super */ + /* find space for 'n' devices. */ + /* remember the devices */ + /* Somehow return the fact that we have enough */ + } + + if (verbose) + fprintf(stderr, + Name ": ddf: Cannot create this array " + "on device %s - a container is required.\n", + dev); + return 0; + } + if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) { + if (verbose) + fprintf(stderr, Name ": ddf: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + /* Well, it is in use by someone, maybe a 'ddf' container. */ + cfd = open_container(fd); + if (cfd < 0) { + close(fd); + if (verbose) + fprintf(stderr, Name ": ddf: Cannot use %s: %s\n", + dev, strerror(EBUSY)); + return 0; + } + sra = sysfs_read(cfd, 0, GET_VERSION); + close(fd); + if (sra && sra->array.major_version == -1 && + strcmp(sra->text_version, "ddf") == 0) { + /* This is a member of a ddf container. 
Load the container + * and try to create a bvd + */ + struct ddf_super *ddf; + if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL, 1) == 0) { + st->sb = ddf; + st->container_dev = fd2devnum(cfd); + close(cfd); + return validate_geometry_ddf_bvd(st, level, layout, + raiddisks, chunk, size, + dev, freesize, + verbose); + } + close(cfd); + } else /* device may belong to a different container */ + return 0; + + return 1; +} + +static int +validate_geometry_ddf_container(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd; + unsigned long long ldsize; + + if (level != LEVEL_CONTAINER) + return 0; + if (!dev) + return 1; + + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + fprintf(stderr, Name ": ddf: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + if (!get_dev_size(fd, dev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + *freesize = avail_size_ddf(st, ldsize >> 9); + if (*freesize == 0) + return 0; + + return 1; +} + +static int validate_geometry_ddf_bvd(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + char *dev, unsigned long long *freesize, + int verbose) +{ + struct stat stb; + struct ddf_super *ddf = st->sb; + struct dl *dl; + unsigned long long pos = 0; + unsigned long long maxsize; + struct extent *e; + int i; + /* ddf/bvd supports lots of things, but not containers */ + if (level == LEVEL_CONTAINER) { + if (verbose) + fprintf(stderr, Name ": DDF cannot create a container within an container\n"); + return 0; + } + /* We must have the container info already read in. */ + if (!ddf) + return 0; + + if (!dev) { + /* General test: make sure there is space for + * 'raiddisks' device extents of size 'size'. 
+ */ + unsigned long long minsize = size; + int dcnt = 0; + if (minsize == 0) + minsize = 8; + for (dl = ddf->dlist; dl ; dl = dl->next) + { + int found = 0; + pos = 0; + + i = 0; + e = get_extents(ddf, dl); + if (!e) continue; + do { + unsigned long long esize; + esize = e[i].start - pos; + if (esize >= minsize) + found = 1; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + if (found) + dcnt++; + free(e); + } + if (dcnt < raiddisks) { + if (verbose) + fprintf(stderr, + Name ": ddf: Not enough devices with " + "space for this array (%d < %d)\n", + dcnt, raiddisks); + return 0; + } + return 1; + } + /* This device must be a member of the set */ + if (stat(dev, &stb) < 0) + return 0; + if ((S_IFMT & stb.st_mode) != S_IFBLK) + return 0; + for (dl = ddf->dlist ; dl ; dl = dl->next) { + if (dl->major == (int)major(stb.st_rdev) && + dl->minor == (int)minor(stb.st_rdev)) + break; + } + if (!dl) { + if (verbose) + fprintf(stderr, Name ": ddf: %s is not in the " + "same DDF set\n", + dev); + return 0; + } + e = get_extents(ddf, dl); + maxsize = 0; + i = 0; + if (e) do { + unsigned long long esize; + esize = e[i].start - pos; + if (esize >= maxsize) + maxsize = esize; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + *freesize = maxsize; + // FIXME here I am + + return 1; +} + +static int load_super_ddf_all(struct supertype *st, int fd, + void **sbp, char *devname, int keep_fd) +{ + struct mdinfo *sra; + struct ddf_super *super; + struct mdinfo *sd, *best = NULL; + int bestseq = 0; + int seq; + char nm[20]; + int dfd; + + sra = sysfs_read(fd, 0, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); + if (!sra) + return 1; + if (sra->array.major_version != -1 || + sra->array.minor_version != -2 || + strcmp(sra->text_version, "ddf") != 0) + return 1; + + if (posix_memalign((void**)&super, 512, sizeof(*super)) != 0) + return 1; + memset(super, 0, sizeof(*super)); + + /* first, try each device, and choose the best ddf */ + for (sd = sra->devs ; sd ; sd = 
sd->next) { + int rv; + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, O_RDONLY); + if (dfd < 0) + return 2; + rv = load_ddf_headers(dfd, super, NULL); + close(dfd); + if (rv == 0) { + seq = __be32_to_cpu(super->active->seq); + if (super->active->openflag) + seq--; + if (!best || seq > bestseq) { + bestseq = seq; + best = sd; + } + } + } + if (!best) + return 1; + /* OK, load this ddf */ + sprintf(nm, "%d:%d", best->disk.major, best->disk.minor); + dfd = dev_open(nm, O_RDONLY); + if (dfd < 0) + return 1; + load_ddf_headers(dfd, super, NULL); + load_ddf_global(dfd, super, NULL); + close(dfd); + /* Now we need the device-local bits */ + for (sd = sra->devs ; sd ; sd = sd->next) { + int rv; + + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY); + if (dfd < 0) + return 2; + rv = load_ddf_headers(dfd, super, NULL); + if (rv == 0) + rv = load_ddf_local(dfd, super, NULL, keep_fd); + if (!keep_fd) close(dfd); + if (rv) + return 1; + } + if (st->subarray[0]) { + unsigned long val; + struct vcl *v; + char *ep; + + val = strtoul(st->subarray, &ep, 10); + if (*ep != '\0') { + free(super); + return 1; + } + + for (v = super->conflist; v; v = v->next) + if (v->vcnum == val) + super->currentconf = v; + if (!super->currentconf) { + free(super); + return 1; + } + } + + *sbp = super; + if (st->ss == NULL) { + st->ss = &super_ddf; + st->minor_version = 0; + st->max_devs = 512; + st->container_dev = fd2devnum(fd); + } + st->loaded_container = 1; + return 0; +} +#endif /* MDASSEMBLE */ + +static struct mdinfo *container_content_ddf(struct supertype *st) +{ + /* Given a container loaded by load_super_ddf_all, + * extract information about all the arrays into + * an mdinfo tree. + * + * For each vcl in conflist: create an mdinfo, fill it in, + * then look for matching devices (phys_refnum) in dlist + * and create appropriate device mdinfo. 
+ */ + struct ddf_super *ddf = st->sb; + struct mdinfo *rest = NULL; + struct vcl *vc; + + for (vc = ddf->conflist ; vc ; vc=vc->next) + { + unsigned int i; + unsigned int j; + struct mdinfo *this; + this = malloc(sizeof(*this)); + memset(this, 0, sizeof(*this)); + this->next = rest; + rest = this; + + this->array.level = map_num1(ddf_level_num, vc->conf.prl); + this->array.raid_disks = + __be16_to_cpu(vc->conf.prim_elmnt_count); + this->array.layout = rlq_to_layout(vc->conf.rlq, vc->conf.prl, + this->array.raid_disks); + this->array.md_minor = -1; + this->array.major_version = -1; + this->array.minor_version = -2; + this->array.ctime = DECADE + + __be32_to_cpu(*(__u32*)(vc->conf.guid+16)); + this->array.utime = DECADE + + __be32_to_cpu(vc->conf.timestamp); + this->array.chunk_size = 512 << vc->conf.chunk_shift; + + i = vc->vcnum; + if ((ddf->virt->entries[i].state & DDF_state_inconsistent) || + (ddf->virt->entries[i].init_state & DDF_initstate_mask) != + DDF_init_full) { + this->array.state = 0; + this->resync_start = 0; + } else { + this->array.state = 1; + this->resync_start = MaxSector; + } + memcpy(this->name, ddf->virt->entries[i].name, 16); + this->name[16]=0; + for(j=0; j<16; j++) + if (this->name[j] == ' ') + this->name[j] = 0; + + memset(this->uuid, 0, sizeof(this->uuid)); + this->component_size = __be64_to_cpu(vc->conf.blocks); + this->array.size = this->component_size / 2; + this->container_member = i; + + ddf->currentconf = vc; + uuid_from_super_ddf(st, this->uuid); + ddf->currentconf = NULL; + + sprintf(this->text_version, "/%s/%d", + devnum2devname(st->container_dev), + this->container_member); + + for (i = 0 ; i < ddf->mppe ; i++) { + struct mdinfo *dev; + struct dl *d; + + if (vc->conf.phys_refnum[i] == 0xFFFFFFFF) + continue; + + this->array.working_disks++; + + for (d = ddf->dlist; d ; d=d->next) + if (d->disk.refnum == vc->conf.phys_refnum[i]) + break; + if (d == NULL) + /* Haven't found that one yet, maybe there are others */ + continue; + + 
dev = malloc(sizeof(*dev)); + memset(dev, 0, sizeof(*dev)); + dev->next = this->devs; + this->devs = dev; + + dev->disk.number = __be32_to_cpu(d->disk.refnum); + dev->disk.major = d->major; + dev->disk.minor = d->minor; + dev->disk.raid_disk = i; + dev->disk.state = (1<recovery_start = MaxSector; + + dev->events = __be32_to_cpu(ddf->primary.seq); + dev->data_offset = __be64_to_cpu(vc->lba_offset[i]); + dev->component_size = __be64_to_cpu(vc->conf.blocks); + if (d->devname) + strcpy(dev->name, d->devname); + } + } + return rest; +} + +static int store_super_ddf(struct supertype *st, int fd) +{ + struct ddf_super *ddf = st->sb; + unsigned long long dsize; + void *buf; + int rc; + + if (!ddf) + return 1; + + /* ->dlist and ->conflist will be set for updates, currently not + * supported + */ + if (ddf->dlist || ddf->conflist) + return 1; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + if (posix_memalign(&buf, 512, 512) != 0) + return 1; + memset(buf, 0, 512); + + lseek64(fd, dsize-512, 0); + rc = write(fd, buf, 512); + free(buf); + if (rc < 0) + return 1; + return 0; +} + +static int compare_super_ddf(struct supertype *st, struct supertype *tst) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong number + * 2 wrong uuid + * 3 wrong other info + */ + struct ddf_super *first = st->sb; + struct ddf_super *second = tst->sb; + + if (!first) { + st->sb = tst->sb; + tst->sb = NULL; + return 0; + } + + if (memcmp(first->anchor.guid, second->anchor.guid, DDF_GUID_LEN) != 0) + return 2; + + /* FIXME should I look at anything else? */ + return 0; +} + +#ifndef MDASSEMBLE +/* + * A new array 'a' has been started which claims to be instance 'inst' + * within container 'c'. + * We need to confirm that the array matches the metadata in 'c' so + * that we don't corrupt any metadata. 
+ */ +static int ddf_open_new(struct supertype *c, struct active_array *a, char *inst) +{ + dprintf("ddf: open_new %s\n", inst); + a->info.container_member = atoi(inst); + return 0; +} + +/* + * The array 'a' is to be marked clean in the metadata. + * If '->resync_start' is not ~(unsigned long long)0, then the array is only + * clean up to the point (in sectors). If that cannot be recorded in the + * metadata, then leave it as dirty. + * + * For DDF, we need to clear the DDF_state_inconsistent bit in the + * !global! virtual_disk.virtual_entry structure. + */ +static int ddf_set_array_state(struct active_array *a, int consistent) +{ + struct ddf_super *ddf = a->container->sb; + int inst = a->info.container_member; + int old = ddf->virt->entries[inst].state; + if (consistent == 2) { + /* Should check if a recovery should be started FIXME */ + consistent = 1; + if (!is_resync_complete(&a->info)) + consistent = 0; + } + if (consistent) + ddf->virt->entries[inst].state &= ~DDF_state_inconsistent; + else + ddf->virt->entries[inst].state |= DDF_state_inconsistent; + if (old != ddf->virt->entries[inst].state) + ddf->updates_pending = 1; + + old = ddf->virt->entries[inst].init_state; + ddf->virt->entries[inst].init_state &= ~DDF_initstate_mask; + if (is_resync_complete(&a->info)) + ddf->virt->entries[inst].init_state |= DDF_init_full; + else if (a->info.resync_start == 0) + ddf->virt->entries[inst].init_state |= DDF_init_not; + else + ddf->virt->entries[inst].init_state |= DDF_init_quick; + if (old != ddf->virt->entries[inst].init_state) + ddf->updates_pending = 1; + + dprintf("ddf mark %d %s %llu\n", inst, consistent?"clean":"dirty", + a->info.resync_start); + return consistent; +} + +/* + * The state of each disk is stored in the global phys_disk structure + * in phys_disk.entries[n].state. + * This makes various combinations awkward. + * - When a device fails in any array, it must be failed in all arrays + * that include a part of this device. 
+ * - When a component is rebuilding, we cannot include it officially in the + * array unless this is the only array that uses the device. + * + * So: when transitioning: + * Online -> failed, just set failed flag. monitor will propagate + * spare -> online, the device might need to be added to the array. + * spare -> failed, just set failed. Don't worry if in array or not. + */ +static void ddf_set_disk(struct active_array *a, int n, int state) +{ + struct ddf_super *ddf = a->container->sb; + unsigned int inst = a->info.container_member; + struct vd_config *vc = find_vdcr(ddf, inst); + int pd = find_phys(ddf, vc->phys_refnum[n]); + int i, st, working; + + if (vc == NULL) { + dprintf("ddf: cannot find instance %d!!\n", inst); + return; + } + if (pd < 0) { + /* disk doesn't currently exist. If it is now in_sync, + * insert it. */ + if ((state & DS_INSYNC) && ! (state & DS_FAULTY)) { + /* Find dev 'n' in a->info->devs, determine the + * ddf refnum, and set vc->phys_refnum and update + * phys->entries[] + */ + /* FIXME */ + } + } else { + int old = ddf->phys->entries[pd].state; + if (state & DS_FAULTY) + ddf->phys->entries[pd].state |= __cpu_to_be16(DDF_Failed); + if (state & DS_INSYNC) { + ddf->phys->entries[pd].state |= __cpu_to_be16(DDF_Online); + ddf->phys->entries[pd].state &= __cpu_to_be16(~DDF_Rebuilding); + } + if (old != ddf->phys->entries[pd].state) + ddf->updates_pending = 1; + } + + dprintf("ddf: set_disk %d to %x\n", n, state); + + /* Now we need to check the state of the array and update + * virtual_disk.entries[n].state. + * It needs to be one of "optimal", "degraded", "failed". + * I don't understand 'deleted' or 'missing'. 
+ */ + working = 0; + for (i=0; i < a->info.array.raid_disks; i++) { + pd = find_phys(ddf, vc->phys_refnum[i]); + if (pd < 0) + continue; + st = __be16_to_cpu(ddf->phys->entries[pd].state); + if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding)) + == DDF_Online) + working++; + } + state = DDF_state_degraded; + if (working == a->info.array.raid_disks) + state = DDF_state_optimal; + else switch(vc->prl) { + case DDF_RAID0: + case DDF_CONCAT: + case DDF_JBOD: + state = DDF_state_failed; + break; + case DDF_RAID1: + if (working == 0) + state = DDF_state_failed; + break; + case DDF_RAID4: + case DDF_RAID5: + if (working < a->info.array.raid_disks-1) + state = DDF_state_failed; + break; + case DDF_RAID6: + if (working < a->info.array.raid_disks-2) + state = DDF_state_failed; + else if (working == a->info.array.raid_disks-1) + state = DDF_state_part_optimal; + break; + } + + if (ddf->virt->entries[inst].state != + ((ddf->virt->entries[inst].state & ~DDF_state_mask) + | state)) { + + ddf->virt->entries[inst].state = + (ddf->virt->entries[inst].state & ~DDF_state_mask) + | state; + ddf->updates_pending = 1; + } + +} + +static void ddf_sync_metadata(struct supertype *st) +{ + + /* + * Write all data to all devices. + * Later, we might be able to track whether only local changes + * have been made, or whether any global data has been changed, + * but ddf is sufficiently weird that it probably always + * changes global data .... + */ + struct ddf_super *ddf = st->sb; + if (!ddf->updates_pending) + return; + ddf->updates_pending = 0; + __write_init_super_ddf(st, 0); + dprintf("ddf: sync_metadata\n"); +} + +static void ddf_process_update(struct supertype *st, + struct metadata_update *update) +{ + /* Apply this update to the metadata. + * The first 4 bytes are a DDF_*_MAGIC which guides + * our actions. + * Possible update are: + * DDF_PHYS_RECORDS_MAGIC + * Add a new physical device. Changes to this record + * only happen implicitly. + * used_pdes is the device number. 
+	 *   DDF_VIRT_RECORDS_MAGIC
+	 *       Add a new VD.  Possibly also change the 'access' bits.
+	 *       populated_vdes is the entry number.
+	 *   DDF_VD_CONF_MAGIC
+	 *       New or updated VD.  the VIRT_RECORD must already
+	 *       exist.  For an update, phys_refnum and lba_offset
+	 *       (at least) are updated, and the VD_CONF must
+	 *       be written to precisely those devices listed with
+	 *       a phys_refnum.
+	 *   DDF_SPARE_ASSIGN_MAGIC
+	 *       replacement Spare Assignment Record... but for which device?
+	 *
+	 * So, e.g.:
+	 *  - to create a new array, we send a VIRT_RECORD and
+	 *    a VD_CONF.  Then assemble and start the array.
+	 *  - to activate a spare we send a VD_CONF to add the phys_refnum
+	 *    and offset.  This will also mark the spare as active with
+	 *    a spare-assignment record.
+	 *
+	 * An update with an unexpected length, or one that targets an
+	 * entry which is out of range or whose guid fails all_ff(), is
+	 * dropped without any change.  SPARE_ASSIGN and unknown magics
+	 * are currently ignored.
+	 */
+	struct ddf_super *ddf = st->sb;
+	__u32 *magic = (__u32*)update->buf;
+	struct phys_disk *pd;
+	struct virtual_disk *vd;
+	struct vd_config *vc;
+	struct vcl *vcl;
+	struct dl *dl;
+	unsigned int mppe;
+	unsigned int ent;
+
+	dprintf("Process update %x\n", *magic);
+
+	switch (*magic) {
+	case DDF_PHYS_RECORDS_MAGIC:
+
+		/* exactly one header plus one entry is expected */
+		if (update->len != (sizeof(struct phys_disk) +
+				    sizeof(struct phys_disk_entry)))
+			return;
+		pd = (struct phys_disk*)update->buf;
+
+		ent = __be16_to_cpu(pd->used_pdes);
+		if (ent >= __be16_to_cpu(ddf->phys->max_pdes))
+			return;
+		if (!all_ff(ddf->phys->entries[ent].guid))
+			return;
+		ddf->phys->entries[ent] = pd->entries[0];
+		ddf->phys->used_pdes = __cpu_to_be16(1 +
+			   __be16_to_cpu(ddf->phys->used_pdes));
+		ddf->updates_pending = 1;
+		if (ddf->add_list) {
+			/* move the head of add_list onto the active dlist */
+			struct active_array *a;
+			struct dl *al = ddf->add_list;
+			ddf->add_list = al->next;
+
+			al->next = ddf->dlist;
+			ddf->dlist = al;
+
+			/* As a device has been added, we should check
+			 * for any degraded devices that might make
+			 * use of this spare */
+			for (a = st->arrays ; a; a=a->next)
+				a->check_degraded = 1;
+		}
+		break;
+
+	case DDF_VIRT_RECORDS_MAGIC:
+
+		/* exactly one header plus one entry is expected */
+		if (update->len != (sizeof(struct virtual_disk) +
+				    sizeof(struct virtual_entry)))
+			return;
+		vd = (struct virtual_disk*)update->buf;
+
+		ent = __be16_to_cpu(vd->populated_vdes);
+		if (ent >= __be16_to_cpu(ddf->virt->max_vdes))
+			return;
+		if (!all_ff(ddf->virt->entries[ent].guid))
+			return;
+		ddf->virt->entries[ent] = vd->entries[0];
+		ddf->virt->populated_vdes = __cpu_to_be16(1 +
+			   __be16_to_cpu(ddf->virt->populated_vdes));
+		ddf->updates_pending = 1;
+		break;
+
+	case DDF_VD_CONF_MAGIC:
+		dprintf("len %d %d\n", update->len, ddf->conf_rec_len);
+
+		mppe = __be16_to_cpu(ddf->anchor.max_primary_element_entries);
+		/* the payload must be one complete configuration record */
+		if ((unsigned)update->len != ddf->conf_rec_len * 512)
+			return;
+		vc = (struct vd_config*)update->buf;
+		for (vcl = ddf->conflist; vcl ; vcl = vcl->next)
+			if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0)
+				break;
+		dprintf("vcl = %p\n", vcl);
+		if (vcl) {
+			/* An update, just copy the phys_refnum and lba_offset
+			 * fields
+			 */
+			memcpy(vcl->conf.phys_refnum, vc->phys_refnum,
+			       mppe * (sizeof(__u32) + sizeof(__u64)));
+		} else {
+			/* A new VD_CONF: it lives in the buffer that
+			 * ddf_prepare_update() allocated in update->space */
+			if (!update->space)
+				return;
+			vcl = update->space;
+			update->space = NULL;
+			vcl->next = ddf->conflist;
+			memcpy(&vcl->conf, vc, update->len);
+			vcl->lba_offset = (__u64*)
+				&vcl->conf.phys_refnum[mppe];
+			ddf->conflist = vcl;
+		}
+		/* Now make sure vlist is correct for each dl. */
+		for (dl = ddf->dlist; dl; dl = dl->next) {
+			unsigned int dn;
+			unsigned int vn = 0;
+			/* vlist[] holds max_part entries (see the NULL fill
+			 * below), so never record more than that */
+			for (vcl = ddf->conflist;
+			     vcl && vn < ddf->max_part;
+			     vcl = vcl->next)
+				for (dn=0; dn < ddf->mppe ; dn++)
+					if (vcl->conf.phys_refnum[dn] ==
+					    dl->disk.refnum) {
+						dprintf("dev %d has %p at %d\n",
+							dl->pdnum, vcl, vn);
+						dl->vlist[vn++] = vcl;
+						break;
+					}
+			while (vn < ddf->max_part)
+				dl->vlist[vn++] = NULL;
+			/* re-derive the disk type bits from its current use */
+			if (dl->vlist[0]) {
+				ddf->phys->entries[dl->pdnum].type &=
+					~__cpu_to_be16(DDF_Global_Spare);
+				ddf->phys->entries[dl->pdnum].type |=
+					__cpu_to_be16(DDF_Active_in_VD);
+			}
+			if (dl->spare) {
+				ddf->phys->entries[dl->pdnum].type &=
+					~__cpu_to_be16(DDF_Global_Spare);
+				ddf->phys->entries[dl->pdnum].type |=
+					__cpu_to_be16(DDF_Spare);
+			}
+			if (!dl->vlist[0] && !dl->spare) {
+				ddf->phys->entries[dl->pdnum].type |=
+					__cpu_to_be16(DDF_Global_Spare);
+				ddf->phys->entries[dl->pdnum].type &=
+					~__cpu_to_be16(DDF_Spare |
+						       DDF_Active_in_VD);
+			}
+		}
+		ddf->updates_pending = 1;
+		break;
+	case DDF_SPARE_ASSIGN_MAGIC:
+	default: break;
+	}
+}
+
+static void ddf_prepare_update(struct supertype *st,
+			       struct metadata_update *update)
+{
+	/* This update arrived at managemon.
+	 * We are about to pass it to monitor.
+	 * If a malloc is needed, do it here.
+	 *
+	 * Only a VD_CONF update needs memory: a 512-byte aligned
+	 * 'struct vcl' large enough for one configuration record.
+	 * On allocation failure update->space is left NULL, which
+	 * ddf_process_update() treats as "drop a new VD_CONF".
+	 */
+	struct ddf_super *ddf = st->sb;
+	__u32 *magic = (__u32*)update->buf;
+	if (*magic == DDF_VD_CONF_MAGIC)
+		if (posix_memalign(&update->space, 512,
+			       offsetof(struct vcl, conf)
+			       + ddf->conf_rec_len * 512) != 0)
+			update->space = NULL;
+}
+
+/*
+ * Check if the array 'a' is degraded but not failed.
+ * If it is, find as many spares as are available and needed and
+ * arrange for their inclusion.
+ * We only choose devices which are not already in the array,
+ * and prefer those with a spare-assignment to this array.
+ * otherwise we choose global spares - assuming always that
+ * there is enough room.
+ * For each spare that we assign, we return an 'mdinfo' which
+ * describes the position for the device in the array.
+ * We also add to 'updates' a DDF_VD_CONF_MAGIC update with
+ * the new phys_refnum and lba_offset values.
+ *
+ * Only worry about BVDs at the moment.
+ *
+ * NULL is returned when a faulty member is still attached, when the
+ * array is not degraded, or when it has lost too many members for its
+ * level.
+ */
+static struct mdinfo *ddf_activate_spare(struct active_array *a,
+					 struct metadata_update **updates)
+{
+	int working = 0;
+	struct mdinfo *d;
+	struct ddf_super *ddf = a->container->sb;
+	int global_ok = 0;
+	struct mdinfo *rv = NULL;
+	struct mdinfo *di;
+	struct metadata_update *mu;
+	struct dl *dl;
+	int i;
+	struct vd_config *vc;
+	__u64 *lba;
+
+	/* count the members whose state_fd is still valid (>= 0) */
+	for (d = a->info.devs ; d ; d = d->next) {
+		if ((d->curr_state & DS_FAULTY) &&
+			d->state_fd >= 0)
+			/* wait for Removal to happen */
+			return NULL;
+		if (d->state_fd >= 0)
+			working ++;
+	}
+
+	dprintf("ddf_activate: working=%d (%d) level=%d\n", working, a->info.array.raid_disks,
+		a->info.array.level);
+	if (working == a->info.array.raid_disks)
+		return NULL; /* array not degraded */
+	switch (a->info.array.level) {
+	case 1:
+		if (working == 0)
+			return NULL; /* failed */
+		break;
+	case 4:
+	case 5:
+		if (working < a->info.array.raid_disks - 1)
+			return NULL; /* failed */
+		break;
+	case 6:
+		if (working < a->info.array.raid_disks - 2)
+			return NULL; /* failed */
+		break;
+	default: /* concat or stripe */
+		return NULL; /* failed */
+	}
+
+	/* For each slot, if it is not working, find a spare */
+	/* 'dl' is set once here, so the scan of the disk list for one
+	 * slot resumes where the scan for the previous slot stopped */
+	dl = ddf->dlist;
+	for (i = 0; i < a->info.array.raid_disks; i++) {
+		for (d = a->info.devs ; d ; d = d->next)
+			if (d->disk.raid_disk == i)
+				break;
+		dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+		if (d && (d->state_fd >= 0))
+			continue;
+
+		/* OK, this device needs recovery.  Find a spare */
+	again:
+		/* while global_ok is 0 only dedicated spares are accepted */
+		for ( ; dl ; dl = dl->next) {
+			unsigned long long esize;
+			unsigned long long pos;
+			struct mdinfo *d2;
+			int is_global = 0;
+			int is_dedicated = 0;
+			struct extent *ex;
+			unsigned int j;
+			/* If in this array, skip */
+			for (d2 = a->info.devs ; d2 ; d2 = d2->next)
+				if (d2->disk.major == dl->major &&
+				    d2->disk.minor == dl->minor) {
+					dprintf("%x:%x already in array\n", dl->major, dl->minor);
+					break;
+				}
+			if (d2)
+				continue;
+			if (ddf->phys->entries[dl->pdnum].type &
+			    __cpu_to_be16(DDF_Spare)) {
+				/* Check spare assign record */
+				if (dl->spare) {
+					if (dl->spare->type & DDF_spare_dedicated) {
+						/* check spare_ents for guid */
+						for (j = 0 ;
+						     j < __be16_to_cpu(dl->spare->populated);
+						     j++) {
+							if (memcmp(dl->spare->spare_ents[j].guid,
+								   ddf->virt->entries[a->info.container_member].guid,
+								   DDF_GUID_LEN) == 0)
+								is_dedicated = 1;
+						}
+					} else
+						is_global = 1;
+				}
+			} else if (ddf->phys->entries[dl->pdnum].type &
+				   __cpu_to_be16(DDF_Global_Spare)) {
+				is_global = 1;
+			}
+			if ( ! (is_dedicated ||
+				(is_global && global_ok))) {
+				dprintf("%x:%x not suitable: %d %d\n", dl->major, dl->minor,
+				       is_dedicated, is_global);
+				continue;
+			}
+
+			/* We are allowed to use this device - is there space?
+			 * We need a->info.component_size sectors */
+			ex = get_extents(ddf, dl);
+			if (!ex) {
+				dprintf("cannot get extents\n");
+				continue;
+			}
+			j = 0; pos = 0;
+			esize = 0;
+
+			/* Walk the extent list with its own index 'j' looking
+			 * for the first gap of at least component_size; a
+			 * zero-sized extent ends the list.  'i' is the raid
+			 * slot being filled and must not be disturbed here.
+			 */
+			do {
+				esize = ex[j].start - pos;
+				if (esize >= a->info.component_size)
+					break;
+				pos = ex[j].start + ex[j].size;
+				j++;
+			} while (ex[j-1].size);
+
+			free(ex);
+			if (esize < a->info.component_size) {
+				dprintf("%x:%x has no room: %llu %llu\n", dl->major, dl->minor,
+					esize, a->info.component_size);
+				/* No room */
+				continue;
+			}
+
+			/* Cool, we have a device with some space at pos */
+			di = malloc(sizeof(*di));
+			if (!di)
+				continue;
+			memset(di, 0, sizeof(*di));
+			di->disk.number = i;
+			di->disk.raid_disk = i;
+			di->disk.major = dl->major;
+			di->disk.minor = dl->minor;
+			di->disk.state = 0;
+			di->recovery_start = 0;
+			di->data_offset = pos;
+			di->component_size = a->info.component_size;
+			/* remember which physical disk entry was chosen */
+			di->container_member = dl->pdnum;
+			di->next = rv;
+			rv = di;
+			dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+				i, pos);
+
+			/* NOTE(review): 'dl' still points at the device just
+			 * chosen, so the scan for the next slot starts at the
+			 * same device - confirm it cannot be picked twice */
+			break;
+		}
+		if (!dl && ! global_ok) {
+			/* not enough dedicated spares, try global */
+			global_ok = 1;
+			dl = ddf->dlist;
+			goto again;
+		}
+	}
+
+	if (!rv)
+		/* No spares found */
+		return rv;
+	/* Now 'rv' has a list of devices to return.
+	 * Create a metadata_update record to update the
+	 * phys_refnum and lba_offset values
+	 */
+	vc = find_vdcr(ddf, a->info.container_member);
+	mu = vc ? malloc(sizeof(*mu)) : NULL;
+	if (mu && posix_memalign(&mu->space, 512, sizeof(struct vcl)) != 0) {
+		free(mu);
+		mu = NULL;
+	}
+	if (mu) {
+		mu->buf = malloc(ddf->conf_rec_len * 512);
+		if (!mu->buf) {
+			free(mu->space);
+			free(mu);
+			mu = NULL;
+		}
+	}
+	if (!mu) {
+		/* no update can be queued, so hand back no spares at all */
+		while (rv) {
+			struct mdinfo *n = rv->next;
+
+			free(rv);
+			rv = n;
+		}
+		return NULL;
+	}
+
+	/* ddf_process_update() only accepts a VD_CONF update whose length
+	 * is the whole record in bytes, not in sectors */
+	mu->len = ddf->conf_rec_len * 512;
+	mu->next = *updates;
+	memcpy(mu->buf, vc, ddf->conf_rec_len * 512);
+
+	vc = (struct vd_config*)mu->buf;
+	lba = (__u64*)&vc->phys_refnum[ddf->mppe];
+	for (di = rv ; di ; di = di->next) {
+		/* container_member holds the pdnum of the spare chosen for
+		 * this slot; 'dl' only tracks the last device examined and
+		 * may be NULL by now.
+		 * NOTE(review): the offset is stored big-endian on the
+		 * assumption that lba_offset is on-disk format like the
+		 * other DDF fields - confirm against its readers.
+		 */
+		vc->phys_refnum[di->disk.raid_disk] =
+			ddf->phys->entries[di->container_member].refnum;
+		lba[di->disk.raid_disk] = __cpu_to_be64(di->data_offset);
+	}
+	*updates = mu;
+	return rv;
+}
+#endif /* MDASSEMBLE */
+
+/* Default layout for a RAID level; UnSet for levels not listed. */
+static int ddf_level_to_layout(int level)
+{
+	switch(level) {
+	case 0:
+	case 1:
+		return 0;
+	case 5:
+		return ALGORITHM_LEFT_SYMMETRIC;
+	case 6:
+		return ALGORITHM_ROTATING_N_CONTINUE;
+	case 10:
+		return 0x102;
+	default:
+		return UnSet;
+	}
+}
+
+/* Method table for the "ddf" metadata format.  Entries inside
+ * #ifndef MDASSEMBLE are left out of the mdassemble build.
+ */
+struct superswitch super_ddf = {
+#ifndef	MDASSEMBLE
+	.examine_super	= examine_super_ddf,
+	.brief_examine_super = brief_examine_super_ddf,
+	.brief_examine_subarrays = brief_examine_subarrays_ddf,
+	.export_examine_super = export_examine_super_ddf,
+	.detail_super	= detail_super_ddf,
+	.brief_detail_super = brief_detail_super_ddf,
+	.validate_geometry = validate_geometry_ddf,
+	.write_init_super = write_init_super_ddf,
+	.add_to_super	= add_to_super_ddf,
+#endif
+	.match_home	= match_home_ddf,
+	.uuid_from_super= uuid_from_super_ddf,
+	.getinfo_super  = getinfo_super_ddf,
+	.update_super	= update_super_ddf,
+
+	.avail_size	= avail_size_ddf,
+
+	.compare_super	= compare_super_ddf,
+
+	.load_super	= load_super_ddf,
+	.init_super	= init_super_ddf,
+	.store_super	= store_super_ddf,
+	.free_super	= free_super_ddf,
+	.match_metadata_desc = match_metadata_desc_ddf,
+	.container_content = container_content_ddf,
+	.default_layout = ddf_level_to_layout,
+
+	.external	= 1,
+
+#ifndef MDASSEMBLE
+/* for mdmon */
+	.open_new       = ddf_open_new,
+	.set_array_state= ddf_set_array_state,
+	.set_disk       = ddf_set_disk,
+	.sync_metadata  = ddf_sync_metadata,
+	.process_update	= ddf_process_update,
+	.prepare_update	= ddf_prepare_update,
+	.activate_spare = ddf_activate_spare,
+#endif
+	.name = "ddf",
+};
diff -Nru mdadm-2.6.7.1/super-intel.c mdadm-3.1.4/super-intel.c
--- mdadm-2.6.7.1/super-intel.c	1970-01-01 02:00:00.000000000 +0200
+++ mdadm-3.1.4/super-intel.c	2010-08-26 05:24:16.000000000 +0300
@@ -0,0 +1,5606 @@
+/*
+ * mdadm - Intel(R) Matrix Storage Manager Support
+ *
+ * Copyright (C) 2002-2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#define HAVE_STDINT_H 1
+#include "mdadm.h"
+#include "mdmon.h"
+#include "sha1.h"
+#include "platform-intel.h"
+#include <values.h>	/* NOTE(review): the four <...> names were lost in extraction; confirm */
+#include <scsi/sg.h>
+#include <ctype.h>
+#include <dirent.h>
+
+/* MPB == Metadata Parameter Block */
+#define MPB_SIGNATURE "Intel Raid ISM Cfg Sig. "
+#define MPB_SIG_LEN (strlen(MPB_SIGNATURE))
+#define MPB_VERSION_RAID0 "1.0.00"
+#define MPB_VERSION_RAID1 "1.1.00"
+#define MPB_VERSION_MANY_VOLUMES_PER_ARRAY "1.2.00"
+#define MPB_VERSION_3OR4_DISK_ARRAY "1.2.01"
+#define MPB_VERSION_RAID5 "1.2.02"
+#define MPB_VERSION_5OR6_DISK_ARRAY "1.2.04"
+#define MPB_VERSION_CNG "1.2.06"
+#define MPB_VERSION_ATTRIBS "1.3.00"
+#define MAX_SIGNATURE_LENGTH  32
+#define MAX_RAID_SERIAL_LEN   16
+
+#define MPB_ATTRIB_CHECKSUM_VERIFY __cpu_to_le32(0x80000000)
+#define MPB_ATTRIB_PM      __cpu_to_le32(0x40000000)
+#define MPB_ATTRIB_2TB     __cpu_to_le32(0x20000000)
+#define MPB_ATTRIB_RAID0   __cpu_to_le32(0x00000001)
+#define MPB_ATTRIB_RAID1   __cpu_to_le32(0x00000002)
+#define MPB_ATTRIB_RAID10  __cpu_to_le32(0x00000004)
+#define MPB_ATTRIB_RAID1E  __cpu_to_le32(0x00000008)
+#define MPB_ATTRIB_RAID5   __cpu_to_le32(0x00000010)
+#define MPB_ATTRIB_RAIDCNG __cpu_to_le32(0x00000020)
+
+#define MPB_SECTOR_CNT 418
+#define IMSM_RESERVED_SECTORS 4096
+#define SECT_PER_MB_SHIFT 11
+
+/* Disk configuration info. */
+#define IMSM_MAX_DEVICES 255
+struct imsm_disk {
+	__u8 serial[MAX_RAID_SERIAL_LEN];/* 0xD8 - 0xE7 ascii serial number */
+	__u32 total_blocks;		 /* 0xE8 - 0xEB total blocks */
+	__u32 scsi_id;			 /* 0xEC - 0xEF scsi ID */
+#define SPARE_DISK      __cpu_to_le32(0x01)  /* Spare */
+#define CONFIGURED_DISK __cpu_to_le32(0x02)  /* Member of some RaidDev */
+#define FAILED_DISK     __cpu_to_le32(0x04)  /* Permanent failure */
+	__u32 status;			 /* 0xF0 - 0xF3 */
+	__u32 owner_cfg_num;		 /* 0xF4 - 0xF7 which config 0,1,2... owns this disk */
+#define	IMSM_DISK_FILLERS	4
+	__u32 filler[IMSM_DISK_FILLERS]; /* 0xF8 - 0x107 MPB_DISK_FILLERS for future expansion */
+};
+
+/* RAID map configuration infos.
+ */
+struct imsm_map {
+	__u32 pba_of_lba0;	/* start address of partition */
+	__u32 blocks_per_member;/* blocks per member */
+	__u32 num_data_stripes;	/* number of data stripes */
+	__u16 blocks_per_strip;
+	__u8  map_state;	/* Normal, Uninitialized, Degraded, Failed */
+#define IMSM_T_STATE_NORMAL 0
+#define IMSM_T_STATE_UNINITIALIZED 1
+#define IMSM_T_STATE_DEGRADED 2
+#define IMSM_T_STATE_FAILED 3
+	__u8  raid_level;
+#define IMSM_T_RAID0 0
+#define IMSM_T_RAID1 1
+#define IMSM_T_RAID5 5		/* since metadata version 1.2.02 ? */
+	__u8  num_members;	/* number of member disks */
+	__u8  num_domains;	/* number of parity domains */
+	__u8  failed_disk_num;  /* valid only when state is degraded */
+	__u8  ddf;
+	__u32 filler[7];	/* expansion area */
+#define IMSM_ORD_REBUILD (1 << 24)
+	__u32 disk_ord_tbl[1];	/* disk_ord_tbl[num_members],
+				 * top byte contains some flags
+				 */
+} __attribute__ ((packed));
+
+struct imsm_vol {
+	__u32 curr_migr_unit;
+	__u32 checkpoint_id;	/* id to access curr_migr_unit */
+	__u8  migr_state;	/* Normal or Migrating */
+#define MIGR_INIT 0
+#define MIGR_REBUILD 1
+#define MIGR_VERIFY 2 /* analogous to echo check > sync_action */
+#define MIGR_GEN_MIGR 3
+#define MIGR_STATE_CHANGE 4
+#define MIGR_REPAIR 5
+	__u8  migr_type;	/* Initializing, Rebuilding, ... */
+	__u8  dirty;
+	__u8  fs_state;		/* fast-sync state for CnG (0xff == disabled) */
+	__u16 verify_errors;	/* number of mismatches */
+	__u16 bad_blocks;	/* number of bad blocks during verify */
+	__u32 filler[4];
+	struct imsm_map map[1];
+	/* here comes another one if migr_state */
+} __attribute__ ((packed));
+
+struct imsm_dev {
+	__u8  volume[MAX_RAID_SERIAL_LEN];
+	__u32 size_low;
+	__u32 size_high;
+#define DEV_BOOTABLE		__cpu_to_le32(0x01)
+#define DEV_BOOT_DEVICE		__cpu_to_le32(0x02)
+#define DEV_READ_COALESCING	__cpu_to_le32(0x04)
+#define DEV_WRITE_COALESCING	__cpu_to_le32(0x08)
+#define DEV_LAST_SHUTDOWN_DIRTY	__cpu_to_le32(0x10)
+#define DEV_HIDDEN_AT_BOOT	__cpu_to_le32(0x20)
+#define DEV_CURRENTLY_HIDDEN	__cpu_to_le32(0x40)
+#define DEV_VERIFY_AND_FIX	__cpu_to_le32(0x80)
+#define DEV_MAP_STATE_UNINIT	__cpu_to_le32(0x100)
+#define DEV_NO_AUTO_RECOVERY	__cpu_to_le32(0x200)
+#define DEV_CLONE_N_GO		__cpu_to_le32(0x400)
+#define DEV_CLONE_MAN_SYNC	__cpu_to_le32(0x800)
+#define DEV_CNG_MASTER_DISK_NUM	__cpu_to_le32(0x1000)
+	__u32 status;	/* Persistent RaidDev status */
+	__u32 reserved_blocks; /* Reserved blocks at beginning of volume */
+	__u8  migr_priority;
+	__u8  num_sub_vols;
+	__u8  tid;
+	__u8  cng_master_disk;
+	__u16 cache_policy;
+	__u8  cng_state;
+	__u8  cng_sub_state;
+#define IMSM_DEV_FILLERS 10
+	__u32 filler[IMSM_DEV_FILLERS];
+	struct imsm_vol vol;
+} __attribute__ ((packed));
+
+struct imsm_super {
+	__u8 sig[MAX_SIGNATURE_LENGTH];	/* 0x00 - 0x1F */
+	__u32 check_sum;		/* 0x20 - 0x23 MPB Checksum */
+	__u32 mpb_size;			/* 0x24 - 0x27 Size of MPB */
+	__u32 family_num;		/* 0x28 - 0x2B Checksum from first time this config was written */
+	__u32 generation_num;		/* 0x2C - 0x2F Incremented each time this array's MPB is written */
+	__u32 error_log_size;		/* 0x30 - 0x33 in bytes */
+	__u32 attributes;		/* 0x34 - 0x37 */
+	__u8 num_disks;			/* 0x38 Number of configured disks */
+	__u8 num_raid_devs;		/* 0x39 Number of configured volumes */
+	__u8 error_log_pos;		/* 0x3A  */
+	__u8 fill[1];			/* 0x3B */
+	__u32 cache_size;		/* 0x3C - 0x3F in mb */
+	__u32 orig_family_num;		/* 0x40 - 0x43 original family num */
+	__u32 pwr_cycle_count;		/* 0x44 - 0x47 simulated power cycle count for array */
+	__u32 bbm_log_size;		/* 0x48 - 0x4B - size of bad Block Mgmt Log in bytes */
+#define IMSM_FILLERS 35
+	__u32 filler[IMSM_FILLERS];	/* 0x4C - 0xD7 RAID_MPB_FILLERS */
+	struct imsm_disk disk[1];	/* 0xD8 diskTbl[numDisks] */
+	/* here comes imsm_dev[num_raid_devs] */
+	/* here comes BBM logs */
+} __attribute__ ((packed));
+
+#define BBM_LOG_MAX_ENTRIES 254
+
+struct bbm_log_entry {
+	__u64 defective_block_start;
+#define UNREADABLE 0xFFFFFFFF
+	__u32 spare_block_offset;
+	__u16 remapped_marked_count;
+	__u16 disk_ordinal;
+} __attribute__ ((__packed__));
+
+struct bbm_log {
+	__u32 signature; /* 0xABADB10C */
+	__u32 entry_count;
+	__u32 reserved_spare_block_count; /* 0 */
+	__u32 reserved; /* 0xFFFF */
+	__u64 first_spare_lba;
+	struct bbm_log_entry mapped_block_entries[BBM_LOG_MAX_ENTRIES];
+} __attribute__ ((__packed__));
+
+
+#ifndef MDASSEMBLE
+/* printable names, indexed by the IMSM_T_STATE_* values 0..3 */
+static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" };
+#endif
+
+/* Migration type of 'dev' as seen by the rest of this file: a stored
+ * MIGR_VERIFY with DEV_VERIFY_AND_FIX set is reported as MIGR_REPAIR.
+ */
+static __u8 migr_type(struct imsm_dev *dev)
+{
+	if (dev->vol.migr_type == MIGR_VERIFY &&
+	    dev->status & DEV_VERIFY_AND_FIX)
+		return MIGR_REPAIR;
+	else
+		return dev->vol.migr_type;
+}
+
+/* Inverse of migr_type(): store 'migr_type' in 'dev'. */
+static void set_migr_type(struct imsm_dev *dev, __u8 migr_type)
+{
+	/* for compatibility with older oroms convert MIGR_REPAIR, into
+	 * MIGR_VERIFY w/ DEV_VERIFY_AND_FIX status
+	 */
+	if (migr_type == MIGR_REPAIR) {
+		dev->vol.migr_type = MIGR_VERIFY;
+		dev->status |= DEV_VERIFY_AND_FIX;
+	} else {
+		dev->vol.migr_type = migr_type;
+		dev->status &= ~DEV_VERIFY_AND_FIX;
+	}
+}
+
+/* Number of 512-byte sectors needed to hold 'bytes', rounded up. */
+static unsigned int sector_count(__u32 bytes)
+{
+	/* avoid forming bytes + 511, which wraps around for values
+	 * within 511 of the __u32 maximum */
+	return bytes / 512 + ((bytes % 512) ? 1 : 0);
+}
+
+/* Size of the MPB in sectors, from its little-endian mpb_size field. */
+static unsigned int mpb_sectors(struct imsm_super *mpb)
+{
+	return sector_count(__le32_to_cpu(mpb->mpb_size));
+}
+
+struct intel_dev {
struct imsm_dev *dev; + struct intel_dev *next; + unsigned index; +}; + +/* internal representation of IMSM metadata */ +struct intel_super { + union { + void *buf; /* O_DIRECT buffer for reading/writing metadata */ + struct imsm_super *anchor; /* immovable parameters */ + }; + size_t len; /* size of the 'buf' allocation */ + void *next_buf; /* for realloc'ing buf from the manager */ + size_t next_len; + int updates_pending; /* count of pending updates for mdmon */ + int current_vol; /* index of raid device undergoing creation */ + __u32 create_offset; /* common start for 'current_vol' */ + __u32 random; /* random data for seeding new family numbers */ + struct intel_dev *devlist; + struct dl { + struct dl *next; + int index; + __u8 serial[MAX_RAID_SERIAL_LEN]; + int major, minor; + char *devname; + struct imsm_disk disk; + int fd; + int extent_cnt; + struct extent *e; /* for determining freespace @ create */ + int raiddisk; /* slot to fill in autolayout */ + } *disks; + struct dl *add; /* list of disks to add while mdmon active */ + struct dl *missing; /* disks removed while we weren't looking */ + struct bbm_log *bbm_log; + const char *hba; /* device path of the raid controller for this metadata */ + const struct imsm_orom *orom; /* platform firmware support */ + struct intel_super *next; /* (temp) list for disambiguating family_num */ +}; + +struct intel_disk { + struct imsm_disk disk; + #define IMSM_UNKNOWN_OWNER (-1) + int owner; + struct intel_disk *next; +}; + +struct extent { + unsigned long long start, size; +}; + +/* definition of messages passed to imsm_process_update */ +enum imsm_update_type { + update_activate_spare, + update_create_array, + update_kill_array, + update_rename_array, + update_add_disk, +}; + +struct imsm_update_activate_spare { + enum imsm_update_type type; + struct dl *dl; + int slot; + int array; + struct imsm_update_activate_spare *next; +}; + +struct disk_info { + __u8 serial[MAX_RAID_SERIAL_LEN]; +}; + +struct 
imsm_update_create_array { + enum imsm_update_type type; + int dev_idx; + struct imsm_dev dev; +}; + +struct imsm_update_kill_array { + enum imsm_update_type type; + int dev_idx; +}; + +struct imsm_update_rename_array { + enum imsm_update_type type; + __u8 name[MAX_RAID_SERIAL_LEN]; + int dev_idx; +}; + +struct imsm_update_add_disk { + enum imsm_update_type type; +}; + +static struct supertype *match_metadata_desc_imsm(char *arg) +{ + struct supertype *st; + + if (strcmp(arg, "imsm") != 0 && + strcmp(arg, "default") != 0 + ) + return NULL; + + st = malloc(sizeof(*st)); + if (!st) + return NULL; + memset(st, 0, sizeof(*st)); + st->ss = &super_imsm; + st->max_devs = IMSM_MAX_DEVICES; + st->minor_version = 0; + st->sb = NULL; + return st; +} + +#ifndef MDASSEMBLE +static __u8 *get_imsm_version(struct imsm_super *mpb) +{ + return &mpb->sig[MPB_SIG_LEN]; +} +#endif + +/* retrieve a disk directly from the anchor when the anchor is known to be + * up-to-date, currently only at load time + */ +static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index) +{ + if (index >= mpb->num_disks) + return NULL; + return &mpb->disk[index]; +} + +/* retrieve a disk from the parsed metadata */ +static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index) +{ + struct dl *d; + + for (d = super->disks; d; d = d->next) + if (d->index == index) + return &d->disk; + + return NULL; +} + +/* generate a checksum directly from the anchor when the anchor is known to be + * up-to-date, currently only at load or write_super after coalescing + */ +static __u32 __gen_imsm_checksum(struct imsm_super *mpb) +{ + __u32 end = mpb->mpb_size / sizeof(end); + __u32 *p = (__u32 *) mpb; + __u32 sum = 0; + + while (end--) { + sum += __le32_to_cpu(*p); + p++; + } + + return sum - __le32_to_cpu(mpb->check_sum); +} + +static size_t sizeof_imsm_map(struct imsm_map *map) +{ + return sizeof(struct imsm_map) + sizeof(__u32) * (map->num_members - 1); +} + +struct imsm_map 
*get_imsm_map(struct imsm_dev *dev, int second_map) +{ + struct imsm_map *map = &dev->vol.map[0]; + + if (second_map && !dev->vol.migr_state) + return NULL; + else if (second_map) { + void *ptr = map; + + return ptr + sizeof_imsm_map(map); + } else + return map; + +} + +/* return the size of the device. + * migr_state increases the returned size if map[0] were to be duplicated + */ +static size_t sizeof_imsm_dev(struct imsm_dev *dev, int migr_state) +{ + size_t size = sizeof(*dev) - sizeof(struct imsm_map) + + sizeof_imsm_map(get_imsm_map(dev, 0)); + + /* migrating means an additional map */ + if (dev->vol.migr_state) + size += sizeof_imsm_map(get_imsm_map(dev, 1)); + else if (migr_state) + size += sizeof_imsm_map(get_imsm_map(dev, 0)); + + return size; +} + +#ifndef MDASSEMBLE +/* retrieve disk serial number list from a metadata update */ +static struct disk_info *get_disk_info(struct imsm_update_create_array *update) +{ + void *u = update; + struct disk_info *inf; + + inf = u + sizeof(*update) - sizeof(struct imsm_dev) + + sizeof_imsm_dev(&update->dev, 0); + + return inf; +} +#endif + +static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index) +{ + int offset; + int i; + void *_mpb = mpb; + + if (index >= mpb->num_raid_devs) + return NULL; + + /* devices start after all disks */ + offset = ((void *) &mpb->disk[mpb->num_disks]) - _mpb; + + for (i = 0; i <= index; i++) + if (i == index) + return _mpb + offset; + else + offset += sizeof_imsm_dev(_mpb + offset, 0); + + return NULL; +} + +static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index) +{ + struct intel_dev *dv; + + if (index >= super->anchor->num_raid_devs) + return NULL; + for (dv = super->devlist; dv; dv = dv->next) + if (dv->index == index) + return dv->dev; + return NULL; +} + +static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev, int slot) +{ + struct imsm_map *map; + + if (dev->vol.migr_state) + map = get_imsm_map(dev, 1); + else + map = get_imsm_map(dev, 0); + + /* 
top byte identifies disk under rebuild */ + return __le32_to_cpu(map->disk_ord_tbl[slot]); +} + +#define ord_to_idx(ord) (((ord) << 8) >> 8) +static __u32 get_imsm_disk_idx(struct imsm_dev *dev, int slot) +{ + __u32 ord = get_imsm_ord_tbl_ent(dev, slot); + + return ord_to_idx(ord); +} + +static void set_imsm_ord_tbl_ent(struct imsm_map *map, int slot, __u32 ord) +{ + map->disk_ord_tbl[slot] = __cpu_to_le32(ord); +} + +static int get_imsm_disk_slot(struct imsm_map *map, unsigned idx) +{ + int slot; + __u32 ord; + + for (slot = 0; slot < map->num_members; slot++) { + ord = __le32_to_cpu(map->disk_ord_tbl[slot]); + if (ord_to_idx(ord) == idx) + return slot; + } + + return -1; +} + +static int get_imsm_raid_level(struct imsm_map *map) +{ + if (map->raid_level == 1) { + if (map->num_members == 2) + return 1; + else + return 10; + } + + return map->raid_level; +} + +static int cmp_extent(const void *av, const void *bv) +{ + const struct extent *a = av; + const struct extent *b = bv; + if (a->start < b->start) + return -1; + if (a->start > b->start) + return 1; + return 0; +} + +static int count_memberships(struct dl *dl, struct intel_super *super) +{ + int memberships = 0; + int i; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, 0); + + if (get_imsm_disk_slot(map, dl->index) >= 0) + memberships++; + } + + return memberships; +} + +static struct extent *get_extents(struct intel_super *super, struct dl *dl) +{ + /* find a list of used extents on the given physical device */ + struct extent *rv, *e; + int i; + int memberships = count_memberships(dl, super); + __u32 reservation = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + + rv = malloc(sizeof(struct extent) * (memberships + 1)); + if (!rv) + return NULL; + e = rv; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, 0); + + if 
(get_imsm_disk_slot(map, dl->index) >= 0) { + e->start = __le32_to_cpu(map->pba_of_lba0); + e->size = __le32_to_cpu(map->blocks_per_member); + e++; + } + } + qsort(rv, memberships, sizeof(*rv), cmp_extent); + + /* determine the start of the metadata + * when no raid devices are defined use the default + * ...otherwise allow the metadata to truncate the value + * as is the case with older versions of imsm + */ + if (memberships) { + struct extent *last = &rv[memberships - 1]; + __u32 remainder; + + remainder = __le32_to_cpu(dl->disk.total_blocks) - + (last->start + last->size); + /* round down to 1k block to satisfy precision of the kernel + * 'size' interface + */ + remainder &= ~1UL; + /* make sure remainder is still sane */ + if (remainder < (unsigned)ROUND_UP(super->len, 512) >> 9) + remainder = ROUND_UP(super->len, 512) >> 9; + if (reservation > remainder) + reservation = remainder; + } + e->start = __le32_to_cpu(dl->disk.total_blocks) - reservation; + e->size = 0; + return rv; +} + +/* try to determine how much space is reserved for metadata from + * the last get_extents() entry, otherwise fallback to the + * default + */ +static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl) +{ + struct extent *e; + int i; + __u32 rv; + + /* for spares just return a minimal reservation which will grow + * once the spare is picked up by an array + */ + if (dl->index == -1) + return MPB_SECTOR_CNT; + + e = get_extents(super, dl); + if (!e) + return MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + + /* scroll to last entry */ + for (i = 0; e[i].size; i++) + continue; + + rv = __le32_to_cpu(dl->disk.total_blocks) - e[i].start; + + free(e); + + return rv; +} + +static int is_spare(struct imsm_disk *disk) +{ + return (disk->status & SPARE_DISK) == SPARE_DISK; +} + +static int is_configured(struct imsm_disk *disk) +{ + return (disk->status & CONFIGURED_DISK) == CONFIGURED_DISK; +} + +static int is_failed(struct imsm_disk *disk) +{ + return (disk->status & 
FAILED_DISK) == FAILED_DISK; +} + +#ifndef MDASSEMBLE +static __u64 blocks_per_migr_unit(struct imsm_dev *dev); + +static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx) +{ + __u64 sz; + int slot, i; + struct imsm_map *map = get_imsm_map(dev, 0); + __u32 ord; + + printf("\n"); + printf("[%.16s]:\n", dev->volume); + printf(" UUID : %s\n", uuid); + printf(" RAID Level : %d\n", get_imsm_raid_level(map)); + printf(" Members : %d\n", map->num_members); + printf(" Slots : ["); + for (i = 0; i < map->num_members; i++) { + ord = get_imsm_ord_tbl_ent(dev, i); + printf("%s", ord & IMSM_ORD_REBUILD ? "_" : "U"); + } + printf("]\n"); + slot = get_imsm_disk_slot(map, disk_idx); + if (slot >= 0) { + ord = get_imsm_ord_tbl_ent(dev, slot); + printf(" This Slot : %d%s\n", slot, + ord & IMSM_ORD_REBUILD ? " (out-of-sync)" : ""); + } else + printf(" This Slot : ?\n"); + sz = __le32_to_cpu(dev->size_high); + sz <<= 32; + sz += __le32_to_cpu(dev->size_low); + printf(" Array Size : %llu%s\n", (unsigned long long)sz, + human_size(sz * 512)); + sz = __le32_to_cpu(map->blocks_per_member); + printf(" Per Dev Size : %llu%s\n", (unsigned long long)sz, + human_size(sz * 512)); + printf(" Sector Offset : %u\n", + __le32_to_cpu(map->pba_of_lba0)); + printf(" Num Stripes : %u\n", + __le32_to_cpu(map->num_data_stripes)); + printf(" Chunk Size : %u KiB\n", + __le16_to_cpu(map->blocks_per_strip) / 2); + printf(" Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks)); + printf(" Migrate State : "); + if (dev->vol.migr_state) { + if (migr_type(dev) == MIGR_INIT) + printf("initialize\n"); + else if (migr_type(dev) == MIGR_REBUILD) + printf("rebuild\n"); + else if (migr_type(dev) == MIGR_VERIFY) + printf("check\n"); + else if (migr_type(dev) == MIGR_GEN_MIGR) + printf("general migration\n"); + else if (migr_type(dev) == MIGR_STATE_CHANGE) + printf("state change\n"); + else if (migr_type(dev) == MIGR_REPAIR) + printf("repair\n"); + else + printf("\n", migr_type(dev)); + } else + 
printf("idle\n"); + printf(" Map State : %s", map_state_str[map->map_state]); + if (dev->vol.migr_state) { + struct imsm_map *map = get_imsm_map(dev, 1); + + printf(" <-- %s", map_state_str[map->map_state]); + printf("\n Checkpoint : %u (%llu)", + __le32_to_cpu(dev->vol.curr_migr_unit), + (unsigned long long)blocks_per_migr_unit(dev)); + } + printf("\n"); + printf(" Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean"); +} + +static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved) +{ + struct imsm_disk *disk = __get_imsm_disk(mpb, index); + char str[MAX_RAID_SERIAL_LEN + 1]; + __u64 sz; + + if (index < 0 || !disk) + return; + + printf("\n"); + snprintf(str, MAX_RAID_SERIAL_LEN + 1, "%s", disk->serial); + printf(" Disk%02d Serial : %s\n", index, str); + printf(" State :%s%s%s\n", is_spare(disk) ? " spare" : "", + is_configured(disk) ? " active" : "", + is_failed(disk) ? " failed" : ""); + printf(" Id : %08x\n", __le32_to_cpu(disk->scsi_id)); + sz = __le32_to_cpu(disk->total_blocks) - reserved; + printf(" Usable Size : %llu%s\n", (unsigned long long)sz, + human_size(sz * 512)); +} + +static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info); + +static void examine_super_imsm(struct supertype *st, char *homehost) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + char str[MAX_SIGNATURE_LENGTH]; + int i; + struct mdinfo info; + char nbuf[64]; + __u32 sum; + __u32 reserved = imsm_reserved_sectors(super, super->disks); + + + snprintf(str, MPB_SIG_LEN, "%s", mpb->sig); + printf(" Magic : %s\n", str); + snprintf(str, strlen(MPB_VERSION_RAID0), "%s", get_imsm_version(mpb)); + printf(" Version : %s\n", get_imsm_version(mpb)); + printf(" Orig Family : %08x\n", __le32_to_cpu(mpb->orig_family_num)); + printf(" Family : %08x\n", __le32_to_cpu(mpb->family_num)); + printf(" Generation : %08x\n", __le32_to_cpu(mpb->generation_num)); + getinfo_super_imsm(st, &info); + fname_from_uuid(st, &info, nbuf, ':'); + 
printf(" UUID : %s\n", nbuf + 5); + sum = __le32_to_cpu(mpb->check_sum); + printf(" Checksum : %08x %s\n", sum, + __gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect"); + printf(" MPB Sectors : %d\n", mpb_sectors(mpb)); + printf(" Disks : %d\n", mpb->num_disks); + printf(" RAID Devices : %d\n", mpb->num_raid_devs); + print_imsm_disk(mpb, super->disks->index, reserved); + if (super->bbm_log) { + struct bbm_log *log = super->bbm_log; + + printf("\n"); + printf("Bad Block Management Log:\n"); + printf(" Log Size : %d\n", __le32_to_cpu(mpb->bbm_log_size)); + printf(" Signature : %x\n", __le32_to_cpu(log->signature)); + printf(" Entry Count : %d\n", __le32_to_cpu(log->entry_count)); + printf(" Spare Blocks : %d\n", __le32_to_cpu(log->reserved_spare_block_count)); + printf(" First Spare : %llx\n", + (unsigned long long) __le64_to_cpu(log->first_spare_lba)); + } + for (i = 0; i < mpb->num_raid_devs; i++) { + struct mdinfo info; + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + + super->current_vol = i; + getinfo_super_imsm(st, &info); + fname_from_uuid(st, &info, nbuf, ':'); + print_imsm_dev(dev, nbuf + 5, super->disks->index); + } + for (i = 0; i < mpb->num_disks; i++) { + if (i == super->disks->index) + continue; + print_imsm_disk(mpb, i, reserved); + } +} + +static void brief_examine_super_imsm(struct supertype *st, int verbose) +{ + /* We just write a generic IMSM ARRAY entry */ + struct mdinfo info; + char nbuf[64]; + struct intel_super *super = st->sb; + + if (!super->anchor->num_raid_devs) { + printf("ARRAY metadata=imsm\n"); + return; + } + + getinfo_super_imsm(st, &info); + fname_from_uuid(st, &info, nbuf, ':'); + printf("ARRAY metadata=imsm UUID=%s\n", nbuf + 5); +} + +static void brief_examine_subarrays_imsm(struct supertype *st, int verbose) +{ + /* We just write a generic IMSM ARRAY entry */ + struct mdinfo info; + char nbuf[64]; + char nbuf1[64]; + struct intel_super *super = st->sb; + int i; + + if (!super->anchor->num_raid_devs) + return; + + 
getinfo_super_imsm(st, &info); + fname_from_uuid(st, &info, nbuf, ':'); + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + + super->current_vol = i; + getinfo_super_imsm(st, &info); + fname_from_uuid(st, &info, nbuf1, ':'); + printf("ARRAY /dev/md/%.16s container=%s member=%d UUID=%s\n", + dev->volume, nbuf + 5, i, nbuf1 + 5); + } +} + +static void export_examine_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct mdinfo info; + char nbuf[64]; + + getinfo_super_imsm(st, &info); + fname_from_uuid(st, &info, nbuf, ':'); + printf("MD_METADATA=imsm\n"); + printf("MD_LEVEL=container\n"); + printf("MD_UUID=%s\n", nbuf+5); + printf("MD_DEVICES=%u\n", mpb->num_disks); +} + +static void detail_super_imsm(struct supertype *st, char *homehost) +{ + struct mdinfo info; + char nbuf[64]; + + getinfo_super_imsm(st, &info); + fname_from_uuid(st, &info, nbuf, ':'); + printf("\n UUID : %s\n", nbuf + 5); +} + +static void brief_detail_super_imsm(struct supertype *st) +{ + struct mdinfo info; + char nbuf[64]; + getinfo_super_imsm(st, &info); + fname_from_uuid(st, &info, nbuf, ':'); + printf(" UUID=%s", nbuf + 5); +} + +static int imsm_read_serial(int fd, char *devname, __u8 *serial); +static void fd2devname(int fd, char *name); + +static int imsm_enumerate_ports(const char *hba_path, int port_count, int host_base, int verbose) +{ + /* dump an unsorted list of devices attached to ahci, as well as + * non-connected ports + */ + int hba_len = strlen(hba_path) + 1; + struct dirent *ent; + DIR *dir; + char *path = NULL; + int err = 0; + unsigned long port_mask = (1 << port_count) - 1; + + if (port_count > (int)sizeof(port_mask) * 8) { + if (verbose) + fprintf(stderr, Name ": port_count %d out of range\n", port_count); + return 2; + } + + /* scroll through /sys/dev/block looking for devices attached to + * this hba + */ + dir = opendir("/sys/dev/block"); + for (ent 
= dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) { + int fd; + char model[64]; + char vendor[64]; + char buf[1024]; + int major, minor; + char *device; + char *c; + int port; + int type; + + if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2) + continue; + path = devt_to_devpath(makedev(major, minor)); + if (!path) + continue; + if (!path_attached_to_hba(path, hba_path)) { + free(path); + path = NULL; + continue; + } + + /* retrieve the scsi device type */ + if (asprintf(&device, "/sys/dev/block/%d:%d/device/xxxxxxx", major, minor) < 0) { + if (verbose) + fprintf(stderr, Name ": failed to allocate 'device'\n"); + err = 2; + break; + } + sprintf(device, "/sys/dev/block/%d:%d/device/type", major, minor); + if (load_sys(device, buf) != 0) { + if (verbose) + fprintf(stderr, Name ": failed to read device type for %s\n", + path); + err = 2; + free(device); + break; + } + type = strtoul(buf, NULL, 10); + + /* if it's not a disk print the vendor and model */ + if (!(type == 0 || type == 7 || type == 14)) { + vendor[0] = '\0'; + model[0] = '\0'; + sprintf(device, "/sys/dev/block/%d:%d/device/vendor", major, minor); + if (load_sys(device, buf) == 0) { + strncpy(vendor, buf, sizeof(vendor)); + vendor[sizeof(vendor) - 1] = '\0'; + c = (char *) &vendor[sizeof(vendor) - 1]; + while (isspace(*c) || *c == '\0') + *c-- = '\0'; + + } + sprintf(device, "/sys/dev/block/%d:%d/device/model", major, minor); + if (load_sys(device, buf) == 0) { + strncpy(model, buf, sizeof(model)); + model[sizeof(model) - 1] = '\0'; + c = (char *) &model[sizeof(model) - 1]; + while (isspace(*c) || *c == '\0') + *c-- = '\0'; + } + + if (vendor[0] && model[0]) + sprintf(buf, "%.64s %.64s", vendor, model); + else + switch (type) { /* numbers from hald/linux/device.c */ + case 1: sprintf(buf, "tape"); break; + case 2: sprintf(buf, "printer"); break; + case 3: sprintf(buf, "processor"); break; + case 4: + case 5: sprintf(buf, "cdrom"); break; + case 6: sprintf(buf, "scanner"); break; + case 8: 
sprintf(buf, "media_changer"); break; + case 9: sprintf(buf, "comm"); break; + case 12: sprintf(buf, "raid"); break; + default: sprintf(buf, "unknown"); + } + } else + buf[0] = '\0'; + free(device); + + /* chop device path to 'host%d' and calculate the port number */ + c = strchr(&path[hba_len], '/'); + if (!c) { + if (verbose) + fprintf(stderr, Name ": %s - invalid path name\n", path + hba_len); + err = 2; + break; + } + *c = '\0'; + if (sscanf(&path[hba_len], "host%d", &port) == 1) + port -= host_base; + else { + if (verbose) { + *c = '/'; /* repair the full string */ + fprintf(stderr, Name ": failed to determine port number for %s\n", + path); + } + err = 2; + break; + } + + /* mark this port as used */ + port_mask &= ~(1 << port); + + /* print out the device information */ + if (buf[0]) { + printf(" Port%d : - non-disk device (%s) -\n", port, buf); + continue; + } + + fd = dev_open(ent->d_name, O_RDONLY); + if (fd < 0) + printf(" Port%d : - disk info unavailable -\n", port); + else { + fd2devname(fd, buf); + printf(" Port%d : %s", port, buf); + if (imsm_read_serial(fd, NULL, (__u8 *) buf) == 0) + printf(" (%s)\n", buf); + else + printf("()\n"); + } + close(fd); + free(path); + path = NULL; + } + if (path) + free(path); + if (dir) + closedir(dir); + if (err == 0) { + int i; + + for (i = 0; i < port_count; i++) + if (port_mask & (1 << i)) + printf(" Port%d : - no device attached -\n", i); + } + + return err; +} + +static int detail_platform_imsm(int verbose, int enumerate_only) +{ + /* There are two components to imsm platform support, the ahci SATA + * controller and the option-rom. To find the SATA controller we + * simply look in /sys/bus/pci/drivers/ahci to see if an ahci + * controller with the Intel vendor id is present. This approach + * allows mdadm to leverage the kernel's ahci detection logic, with the + * caveat that if ahci.ko is not loaded mdadm will not be able to + * detect platform raid capabilities. 
The option-rom resides in a + * platform "Adapter ROM". We scan for its signature to retrieve the + * platform capabilities. If raid support is disabled in the BIOS the + * option-rom capability structure will not be available. + */ + const struct imsm_orom *orom; + struct sys_dev *list, *hba; + DIR *dir; + struct dirent *ent; + const char *hba_path; + int host_base = 0; + int port_count = 0; + + if (enumerate_only) { + if (check_env("IMSM_NO_PLATFORM") || find_imsm_orom()) + return 0; + return 2; + } + + list = find_driver_devices("pci", "ahci"); + for (hba = list; hba; hba = hba->next) + if (devpath_to_vendor(hba->path) == 0x8086) + break; + + if (!hba) { + if (verbose) + fprintf(stderr, Name ": unable to find active ahci controller\n"); + free_sys_dev(&list); + return 2; + } else if (verbose) + fprintf(stderr, Name ": found Intel SATA AHCI Controller\n"); + hba_path = hba->path; + hba->path = NULL; + free_sys_dev(&list); + + orom = find_imsm_orom(); + if (!orom) { + if (verbose) + fprintf(stderr, Name ": imsm option-rom not found\n"); + return 2; + } + + printf(" Platform : Intel(R) Matrix Storage Manager\n"); + printf(" Version : %d.%d.%d.%d\n", orom->major_ver, orom->minor_ver, + orom->hotfix_ver, orom->build); + printf(" RAID Levels :%s%s%s%s%s\n", + imsm_orom_has_raid0(orom) ? " raid0" : "", + imsm_orom_has_raid1(orom) ? " raid1" : "", + imsm_orom_has_raid1e(orom) ? " raid1e" : "", + imsm_orom_has_raid10(orom) ? " raid10" : "", + imsm_orom_has_raid5(orom) ? " raid5" : ""); + printf(" Chunk Sizes :%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + imsm_orom_has_chunk(orom, 2) ? " 2k" : "", + imsm_orom_has_chunk(orom, 4) ? " 4k" : "", + imsm_orom_has_chunk(orom, 8) ? " 8k" : "", + imsm_orom_has_chunk(orom, 16) ? " 16k" : "", + imsm_orom_has_chunk(orom, 32) ? " 32k" : "", + imsm_orom_has_chunk(orom, 64) ? " 64k" : "", + imsm_orom_has_chunk(orom, 128) ? " 128k" : "", + imsm_orom_has_chunk(orom, 256) ? " 256k" : "", + imsm_orom_has_chunk(orom, 512) ? 
" 512k" : "", + imsm_orom_has_chunk(orom, 1024*1) ? " 1M" : "", + imsm_orom_has_chunk(orom, 1024*2) ? " 2M" : "", + imsm_orom_has_chunk(orom, 1024*4) ? " 4M" : "", + imsm_orom_has_chunk(orom, 1024*8) ? " 8M" : "", + imsm_orom_has_chunk(orom, 1024*16) ? " 16M" : "", + imsm_orom_has_chunk(orom, 1024*32) ? " 32M" : "", + imsm_orom_has_chunk(orom, 1024*64) ? " 64M" : ""); + printf(" Max Disks : %d\n", orom->tds); + printf(" Max Volumes : %d\n", orom->vpa); + printf(" I/O Controller : %s\n", hba_path); + + /* find the smallest scsi host number to determine a port number base */ + dir = opendir(hba_path); + for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) { + int host; + + if (sscanf(ent->d_name, "host%d", &host) != 1) + continue; + if (port_count == 0) + host_base = host; + else if (host < host_base) + host_base = host; + + if (host + 1 > port_count + host_base) + port_count = host + 1 - host_base; + + } + if (dir) + closedir(dir); + + if (!port_count || imsm_enumerate_ports(hba_path, port_count, + host_base, verbose) != 0) { + if (verbose) + fprintf(stderr, Name ": failed to enumerate ports\n"); + return 2; + } + + return 0; +} +#endif + +static int match_home_imsm(struct supertype *st, char *homehost) +{ + /* the imsm metadata format does not specify any host + * identification information. We return -1 since we can never + * confirm nor deny whether a given array is "meant" for this + * host. We rely on compare_super and the 'family_num' fields to + * exclude member disks that do not belong, and we rely on + * mdadm.conf to specify the arrays that should be assembled. + * Auto-assembly may still pick up "foreign" arrays. 
+ */ + + return -1; +} + +static void uuid_from_super_imsm(struct supertype *st, int uuid[4]) +{ + /* The uuid returned here is used for: + * uuid to put into bitmap file (Create, Grow) + * uuid for backup header when saving critical section (Grow) + * comparing uuids when re-adding a device into an array + * In these cases the uuid required is that of the data-array, + * not the device-set. + * uuid to recognise same set when adding a missing device back + * to an array. This is a uuid for the device-set. + * + * For each of these we can make do with a truncated + * or hashed uuid rather than the original, as long as + * everyone agrees. + * In each case the uuid required is that of the data-array, + * not the device-set. + */ + /* imsm does not track uuid's so we synthesis one using sha1 on + * - The signature (Which is constant for all imsm array, but no matter) + * - the orig_family_num of the container + * - the index number of the volume + * - the 'serial' number of the volume. + * Hopefully these are all constant. + */ + struct intel_super *super = st->sb; + + char buf[20]; + struct sha1_ctx ctx; + struct imsm_dev *dev = NULL; + __u32 family_num; + + /* some mdadm versions failed to set ->orig_family_num, in which + * case fall back to ->family_num. orig_family_num will be + * fixed up with the first metadata update. 
+ */ + family_num = super->anchor->orig_family_num; + if (family_num == 0) + family_num = super->anchor->family_num; + sha1_init_ctx(&ctx); + sha1_process_bytes(super->anchor->sig, MPB_SIG_LEN, &ctx); + sha1_process_bytes(&family_num, sizeof(__u32), &ctx); + if (super->current_vol >= 0) + dev = get_imsm_dev(super, super->current_vol); + if (dev) { + __u32 vol = super->current_vol; + sha1_process_bytes(&vol, sizeof(vol), &ctx); + sha1_process_bytes(dev->volume, MAX_RAID_SERIAL_LEN, &ctx); + } + sha1_finish_ctx(&ctx, buf); + memcpy(uuid, buf, 4*4); +} + +#if 0 +static void +get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p) +{ + __u8 *v = get_imsm_version(mpb); + __u8 *end = mpb->sig + MAX_SIGNATURE_LENGTH; + char major[] = { 0, 0, 0 }; + char minor[] = { 0 ,0, 0 }; + char patch[] = { 0, 0, 0 }; + char *ver_parse[] = { major, minor, patch }; + int i, j; + + i = j = 0; + while (*v != '\0' && v < end) { + if (*v != '.' && j < 2) + ver_parse[i][j++] = *v; + else { + i++; + j = 0; + } + v++; + } + + *m = strtol(minor, NULL, 0); + *p = strtol(patch, NULL, 0); +} +#endif + +static __u32 migr_strip_blocks_resync(struct imsm_dev *dev) +{ + /* migr_strip_size when repairing or initializing parity */ + struct imsm_map *map = get_imsm_map(dev, 0); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch (get_imsm_raid_level(map)) { + case 5: + case 10: + return chunk; + default: + return 128*1024 >> 9; + } +} + +static __u32 migr_strip_blocks_rebuild(struct imsm_dev *dev) +{ + /* migr_strip_size when rebuilding a degraded disk, no idea why + * this is different than migr_strip_size_resync(), but it's good + * to be compatible + */ + struct imsm_map *map = get_imsm_map(dev, 1); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch (get_imsm_raid_level(map)) { + case 1: + case 10: + if (map->num_members % map->num_domains == 0) + return 128*1024 >> 9; + else + return chunk; + case 5: + return max((__u32) 64*1024 >> 9, chunk); + default: + return 
128*1024 >> 9; + } +} + +static __u32 num_stripes_per_unit_resync(struct imsm_dev *dev) +{ + struct imsm_map *lo = get_imsm_map(dev, 0); + struct imsm_map *hi = get_imsm_map(dev, 1); + __u32 lo_chunk = __le32_to_cpu(lo->blocks_per_strip); + __u32 hi_chunk = __le32_to_cpu(hi->blocks_per_strip); + + return max((__u32) 1, hi_chunk / lo_chunk); +} + +static __u32 num_stripes_per_unit_rebuild(struct imsm_dev *dev) +{ + struct imsm_map *lo = get_imsm_map(dev, 0); + int level = get_imsm_raid_level(lo); + + if (level == 1 || level == 10) { + struct imsm_map *hi = get_imsm_map(dev, 1); + + return hi->num_domains; + } else + return num_stripes_per_unit_resync(dev); +} + +static __u8 imsm_num_data_members(struct imsm_dev *dev) +{ + /* named 'imsm_' because raid0, raid1 and raid10 + * counter-intuitively have the same number of data disks + */ + struct imsm_map *map = get_imsm_map(dev, 0); + + switch (get_imsm_raid_level(map)) { + case 0: + case 1: + case 10: + return map->num_members; + case 5: + return map->num_members - 1; + default: + dprintf("%s: unsupported raid level\n", __func__); + return 0; + } +} + +static __u32 parity_segment_depth(struct imsm_dev *dev) +{ + struct imsm_map *map = get_imsm_map(dev, 0); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch(get_imsm_raid_level(map)) { + case 1: + case 10: + return chunk * map->num_domains; + case 5: + return chunk * map->num_members; + default: + return chunk; + } +} + +static __u32 map_migr_block(struct imsm_dev *dev, __u32 block) +{ + struct imsm_map *map = get_imsm_map(dev, 1); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + __u32 strip = block / chunk; + + switch (get_imsm_raid_level(map)) { + case 1: + case 10: { + __u32 vol_strip = (strip * map->num_domains) + 1; + __u32 vol_stripe = vol_strip / map->num_members; + + return vol_stripe * chunk + block % chunk; + } case 5: { + __u32 stripe = strip / (map->num_members - 1); + + return stripe * chunk + block % chunk; + } + default: + return 0; + 
} +} + +static __u64 blocks_per_migr_unit(struct imsm_dev *dev) +{ + /* calculate the conversion factor between per member 'blocks' + * (md/{resync,rebuild}_start) and imsm migration units, return + * 0 for the 'not migrating' and 'unsupported migration' cases + */ + if (!dev->vol.migr_state) + return 0; + + switch (migr_type(dev)) { + case MIGR_VERIFY: + case MIGR_REPAIR: + case MIGR_INIT: { + struct imsm_map *map = get_imsm_map(dev, 0); + __u32 stripes_per_unit; + __u32 blocks_per_unit; + __u32 parity_depth; + __u32 migr_chunk; + __u32 block_map; + __u32 block_rel; + __u32 segment; + __u32 stripe; + __u8 disks; + + /* yes, this is really the translation of migr_units to + * per-member blocks in the 'resync' case + */ + stripes_per_unit = num_stripes_per_unit_resync(dev); + migr_chunk = migr_strip_blocks_resync(dev); + disks = imsm_num_data_members(dev); + blocks_per_unit = stripes_per_unit * migr_chunk * disks; + stripe = __le32_to_cpu(map->blocks_per_strip) * disks; + segment = blocks_per_unit / stripe; + block_rel = blocks_per_unit - segment * stripe; + parity_depth = parity_segment_depth(dev); + block_map = map_migr_block(dev, block_rel); + return block_map + parity_depth * segment; + } + case MIGR_REBUILD: { + __u32 stripes_per_unit; + __u32 migr_chunk; + + stripes_per_unit = num_stripes_per_unit_rebuild(dev); + migr_chunk = migr_strip_blocks_rebuild(dev); + return migr_chunk * stripes_per_unit; + } + case MIGR_GEN_MIGR: + case MIGR_STATE_CHANGE: + default: + return 0; + } +} + +static int imsm_level_to_layout(int level) +{ + switch (level) { + case 0: + case 1: + return 0; + case 5: + case 6: + return ALGORITHM_LEFT_ASYMMETRIC; + case 10: + return 0x102; + } + return UnSet; +} + +static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info) +{ + struct intel_super *super = st->sb; + struct imsm_dev *dev = get_imsm_dev(super, super->current_vol); + struct imsm_map *map = get_imsm_map(dev, 0); + struct dl *dl; + char *devname; + + for (dl = 
super->disks; dl; dl = dl->next) + if (dl->raiddisk == info->disk.raid_disk) + break; + info->container_member = super->current_vol; + info->array.raid_disks = map->num_members; + info->array.level = get_imsm_raid_level(map); + info->array.layout = imsm_level_to_layout(info->array.level); + info->array.md_minor = -1; + info->array.ctime = 0; + info->array.utime = 0; + info->array.chunk_size = __le16_to_cpu(map->blocks_per_strip) << 9; + info->array.state = !dev->vol.dirty; + info->custom_array_size = __le32_to_cpu(dev->size_high); + info->custom_array_size <<= 32; + info->custom_array_size |= __le32_to_cpu(dev->size_low); + + info->disk.major = 0; + info->disk.minor = 0; + if (dl) { + info->disk.major = dl->major; + info->disk.minor = dl->minor; + } + + info->data_offset = __le32_to_cpu(map->pba_of_lba0); + info->component_size = __le32_to_cpu(map->blocks_per_member); + memset(info->uuid, 0, sizeof(info->uuid)); + info->recovery_start = MaxSector; + info->reshape_active = 0; + + if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty) { + info->resync_start = 0; + } else if (dev->vol.migr_state) { + switch (migr_type(dev)) { + case MIGR_REPAIR: + case MIGR_INIT: { + __u64 blocks_per_unit = blocks_per_migr_unit(dev); + __u64 units = __le32_to_cpu(dev->vol.curr_migr_unit); + + info->resync_start = blocks_per_unit * units; + break; + } + case MIGR_VERIFY: + /* we could emulate the checkpointing of + * 'sync_action=check' migrations, but for now + * we just immediately complete them + */ + case MIGR_REBUILD: + /* this is handled by container_content_imsm() */ + case MIGR_GEN_MIGR: + case MIGR_STATE_CHANGE: + /* FIXME handle other migrations */ + default: + /* we are not dirty, so... 
*/ + info->resync_start = MaxSector; + } + } else + info->resync_start = MaxSector; + + strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN); + info->name[MAX_RAID_SERIAL_LEN] = 0; + + info->array.major_version = -1; + info->array.minor_version = -2; + devname = devnum2devname(st->container_dev); + *info->text_version = '\0'; + if (devname) + sprintf(info->text_version, "/%s/%d", devname, info->container_member); + free(devname); + info->safe_mode_delay = 4000; /* 4 secs like the Matrix driver */ + uuid_from_super_imsm(st, info->uuid); +} + +/* check the config file to see if we can return a real uuid for this spare */ +static void fixup_container_spare_uuid(struct mdinfo *inf) +{ + struct mddev_ident_s *array_list; + + if (inf->array.level != LEVEL_CONTAINER || + memcmp(inf->uuid, uuid_match_any, sizeof(int[4])) != 0) + return; + + array_list = conf_get_ident(NULL); + + for (; array_list; array_list = array_list->next) { + if (array_list->uuid_set) { + struct supertype *_sst; /* spare supertype */ + struct supertype *_cst; /* container supertype */ + + _cst = array_list->st; + if (_cst) + _sst = _cst->ss->match_metadata_desc(inf->text_version); + else + _sst = NULL; + + if (_sst) { + memcpy(inf->uuid, array_list->uuid, sizeof(int[4])); + free(_sst); + break; + } + } + } +} + + +static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, int failed); +static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev); + +static struct imsm_disk *get_imsm_missing(struct intel_super *super, __u8 index) +{ + struct dl *d; + + for (d = super->missing; d; d = d->next) + if (d->index == index) + return &d->disk; + return NULL; +} + +static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) +{ + struct intel_super *super = st->sb; + struct imsm_disk *disk; + + if (super->current_vol >= 0) { + getinfo_super_imsm_volume(st, info); + return; + } + + /* Set raid_disks to zero so that Assemble will always pull in valid 
+ * spares + */ + info->array.raid_disks = 0; + info->array.level = LEVEL_CONTAINER; + info->array.layout = 0; + info->array.md_minor = -1; + info->array.ctime = 0; /* N/A for imsm */ + info->array.utime = 0; + info->array.chunk_size = 0; + + info->disk.major = 0; + info->disk.minor = 0; + info->disk.raid_disk = -1; + info->reshape_active = 0; + info->array.major_version = -1; + info->array.minor_version = -2; + strcpy(info->text_version, "imsm"); + info->safe_mode_delay = 0; + info->disk.number = -1; + info->disk.state = 0; + info->name[0] = 0; + info->recovery_start = MaxSector; + + /* do we have the all the insync disks that we expect? */ + if (st->loaded_container) { + struct imsm_super *mpb = super->anchor; + int max_enough = -1, i; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + int failed, enough, j, missing = 0; + struct imsm_map *map; + __u8 state; + + failed = imsm_count_failed(super, dev); + state = imsm_check_degraded(super, dev, failed); + map = get_imsm_map(dev, dev->vol.migr_state); + + /* any newly missing disks? 
+ * (catches single-degraded vs double-degraded) + */ + for (j = 0; j < map->num_members; j++) { + __u32 ord = get_imsm_ord_tbl_ent(dev, i); + __u32 idx = ord_to_idx(ord); + + if (!(ord & IMSM_ORD_REBUILD) && + get_imsm_missing(super, idx)) { + missing = 1; + break; + } + } + + if (state == IMSM_T_STATE_FAILED) + enough = -1; + else if (state == IMSM_T_STATE_DEGRADED && + (state != map->map_state || missing)) + enough = 0; + else /* we're normal, or already degraded */ + enough = 1; + + /* in the missing/failed disk case check to see + * if at least one array is runnable + */ + max_enough = max(max_enough, enough); + } + dprintf("%s: enough: %d\n", __func__, max_enough); + info->container_enough = max_enough; + } else + info->container_enough = -1; + + if (super->disks) { + __u32 reserved = imsm_reserved_sectors(super, super->disks); + + disk = &super->disks->disk; + info->data_offset = __le32_to_cpu(disk->total_blocks) - reserved; + info->component_size = reserved; + info->disk.state = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0; + /* we don't change info->disk.raid_disk here because + * this state will be finalized in mdmon after we have + * found the 'most fresh' version of the metadata + */ + info->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0; + info->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC); + } + + /* only call uuid_from_super_imsm when this disk is part of a populated container, + * ->compare_super may have updated the 'num_raid_devs' field for spares + */ + if (info->disk.state & (1 << MD_DISK_SYNC) || super->anchor->num_raid_devs) + uuid_from_super_imsm(st, info->uuid); + else { + memcpy(info->uuid, uuid_match_any, sizeof(int[4])); + fixup_container_spare_uuid(info); + } +} + +static int update_super_imsm(struct supertype *st, struct mdinfo *info, + char *update, char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* For 'assemble' and 'force' we need to return non-zero if any + * change was made. 
For others, the return value is ignored. + * Update options are: + * force-one : This device looks a bit old but needs to be included, + * update age info appropriately. + * assemble: clear any 'faulty' flag to allow this device to + * be assembled. + * force-array: Array is degraded but being forced, mark it clean + * if that will be needed to assemble it. + * + * newdev: not used ???? + * grow: Array has gained a new device - this is currently for + * linear only + * resync: mark as dirty so a resync will happen. + * name: update the name - preserving the homehost + * uuid: Change the uuid of the array to match watch is given + * + * Following are not relevant for this imsm: + * sparc2.2 : update from old dodgey metadata + * super-minor: change the preferred_minor number + * summaries: update redundant counters. + * homehost: update the recorded homehost + * _reshape_progress: record new reshape_progress position. + */ + int rv = 1; + struct intel_super *super = st->sb; + struct imsm_super *mpb; + + /* we can only update container info */ + if (!super || super->current_vol >= 0 || !super->anchor) + return 1; + + mpb = super->anchor; + + if (strcmp(update, "uuid") == 0 && uuid_set && !info->update_private) + fprintf(stderr, + Name ": '--uuid' not supported for imsm metadata\n"); + else if (strcmp(update, "uuid") == 0 && uuid_set && info->update_private) { + mpb->orig_family_num = *((__u32 *) info->update_private); + rv = 0; + } else if (strcmp(update, "uuid") == 0) { + __u32 *new_family = malloc(sizeof(*new_family)); + + /* update orig_family_number with the incoming random + * data, report the new effective uuid, and store the + * new orig_family_num for future updates. 
+ */ + if (new_family) { + memcpy(&mpb->orig_family_num, info->uuid, sizeof(__u32)); + uuid_from_super_imsm(st, info->uuid); + *new_family = mpb->orig_family_num; + info->update_private = new_family; + rv = 0; + } + } else if (strcmp(update, "assemble") == 0) + rv = 0; + else + fprintf(stderr, + Name ": '--update=%s' not supported for imsm metadata\n", + update); + + /* successful update? recompute checksum */ + if (rv == 0) + mpb->check_sum = __le32_to_cpu(__gen_imsm_checksum(mpb)); + + return rv; +} + +static size_t disks_to_mpb_size(int disks) +{ + size_t size; + + size = sizeof(struct imsm_super); + size += (disks - 1) * sizeof(struct imsm_disk); + size += 2 * sizeof(struct imsm_dev); + /* up to 2 maps per raid device (-2 for imsm_maps in imsm_dev */ + size += (4 - 2) * sizeof(struct imsm_map); + /* 4 possible disk_ord_tbl's */ + size += 4 * (disks - 1) * sizeof(__u32); + + return size; +} + +static __u64 avail_size_imsm(struct supertype *st, __u64 devsize) +{ + if (devsize < (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS)) + return 0; + + return devsize - (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS); +} + +static void free_devlist(struct intel_super *super) +{ + struct intel_dev *dv; + + while (super->devlist) { + dv = super->devlist->next; + free(super->devlist->dev); + free(super->devlist); + super->devlist = dv; + } +} + +static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src) +{ + memcpy(dest, src, sizeof_imsm_dev(src, 0)); +} + +static int compare_super_imsm(struct supertype *st, struct supertype *tst) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong number + * 2 wrong uuid + * 3 wrong other info + */ + struct intel_super *first = st->sb; + struct intel_super *sec = tst->sb; + + if (!first) { + st->sb = tst->sb; + tst->sb = NULL; + return 0; + } + + /* if an anchor does not have num_raid_devs set then it is a free + * floating spare + */ + if (first->anchor->num_raid_devs > 0 && + 
sec->anchor->num_raid_devs > 0) { + /* Determine if these disks might ever have been + * related. Further disambiguation can only take place + * in load_super_imsm_all + */ + __u32 first_family = first->anchor->orig_family_num; + __u32 sec_family = sec->anchor->orig_family_num; + + if (memcmp(first->anchor->sig, sec->anchor->sig, + MAX_SIGNATURE_LENGTH) != 0) + return 3; + + if (first_family == 0) + first_family = first->anchor->family_num; + if (sec_family == 0) + sec_family = sec->anchor->family_num; + + if (first_family != sec_family) + return 3; + + } + + + /* if 'first' is a spare promote it to a populated mpb with sec's + * family number + */ + if (first->anchor->num_raid_devs == 0 && + sec->anchor->num_raid_devs > 0) { + int i; + struct intel_dev *dv; + struct imsm_dev *dev; + + /* we need to copy raid device info from sec if an allocation + * fails here we don't associate the spare + */ + for (i = 0; i < sec->anchor->num_raid_devs; i++) { + dv = malloc(sizeof(*dv)); + if (!dv) + break; + dev = malloc(sizeof_imsm_dev(get_imsm_dev(sec, i), 1)); + if (!dev) { + free(dv); + break; + } + dv->dev = dev; + dv->index = i; + dv->next = first->devlist; + first->devlist = dv; + } + if (i < sec->anchor->num_raid_devs) { + /* allocation failure */ + free_devlist(first); + fprintf(stderr, "imsm: failed to associate spare\n"); + return 3; + } + first->anchor->num_raid_devs = sec->anchor->num_raid_devs; + first->anchor->orig_family_num = sec->anchor->orig_family_num; + first->anchor->family_num = sec->anchor->family_num; + memcpy(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH); + for (i = 0; i < sec->anchor->num_raid_devs; i++) + imsm_copy_dev(get_imsm_dev(first, i), get_imsm_dev(sec, i)); + } + + return 0; +} + +static void fd2devname(int fd, char *name) +{ + struct stat st; + char path[256]; + char dname[PATH_MAX]; + char *nm; + int rv; + + name[0] = '\0'; + if (fstat(fd, &st) != 0) + return; + sprintf(path, "/sys/dev/block/%d:%d", + major(st.st_rdev), 
minor(st.st_rdev)); + + rv = readlink(path, dname, sizeof(dname)); + if (rv <= 0) + return; + + dname[rv] = '\0'; + nm = strrchr(dname, '/'); + nm++; + snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm); +} + +extern int scsi_get_serial(int fd, void *buf, size_t buf_len); + +static int imsm_read_serial(int fd, char *devname, + __u8 serial[MAX_RAID_SERIAL_LEN]) +{ + unsigned char scsi_serial[255]; + int rv; + int rsp_len; + int len; + char *dest; + char *src; + char *rsp_buf; + int i; + + memset(scsi_serial, 0, sizeof(scsi_serial)); + + rv = scsi_get_serial(fd, scsi_serial, sizeof(scsi_serial)); + + if (rv && check_env("IMSM_DEVNAME_AS_SERIAL")) { + memset(serial, 0, MAX_RAID_SERIAL_LEN); + fd2devname(fd, (char *) serial); + return 0; + } + + if (rv != 0) { + if (devname) + fprintf(stderr, + Name ": Failed to retrieve serial for %s\n", + devname); + return rv; + } + + rsp_len = scsi_serial[3]; + if (!rsp_len) { + if (devname) + fprintf(stderr, + Name ": Failed to retrieve serial for %s\n", + devname); + return 2; + } + rsp_buf = (char *) &scsi_serial[4]; + + /* trim all whitespace and non-printable characters and convert + * ':' to ';' + */ + for (i = 0, dest = rsp_buf; i < rsp_len; i++) { + src = &rsp_buf[i]; + if (*src > 0x20) { + /* ':' is reserved for use in placeholder serial + * numbers for missing disks + */ + if (*src == ':') + *dest++ = ';'; + else + *dest++ = *src; + } + } + len = dest - rsp_buf; + dest = rsp_buf; + + /* truncate leading characters */ + if (len > MAX_RAID_SERIAL_LEN) { + dest += len - MAX_RAID_SERIAL_LEN; + len = MAX_RAID_SERIAL_LEN; + } + + memset(serial, 0, MAX_RAID_SERIAL_LEN); + memcpy(serial, dest, len); + + return 0; +} + +static int serialcmp(__u8 *s1, __u8 *s2) +{ + return strncmp((char *) s1, (char *) s2, MAX_RAID_SERIAL_LEN); +} + +static void serialcpy(__u8 *dest, __u8 *src) +{ + strncpy((char *) dest, (char *) src, MAX_RAID_SERIAL_LEN); +} + +#ifndef MDASSEMBLE +static struct dl *serial_to_dl(__u8 *serial, struct intel_super 
*super) +{ + struct dl *dl; + + for (dl = super->disks; dl; dl = dl->next) + if (serialcmp(dl->serial, serial) == 0) + break; + + return dl; +} +#endif + +static struct imsm_disk * +__serial_to_disk(__u8 *serial, struct imsm_super *mpb, int *idx) +{ + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + + if (serialcmp(disk->serial, serial) == 0) { + if (idx) + *idx = i; + return disk; + } + } + + return NULL; +} + +static int +load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) +{ + struct imsm_disk *disk; + struct dl *dl; + struct stat stb; + int rv; + char name[40]; + __u8 serial[MAX_RAID_SERIAL_LEN]; + + rv = imsm_read_serial(fd, devname, serial); + + if (rv != 0) + return 2; + + dl = calloc(1, sizeof(*dl)); + if (!dl) { + if (devname) + fprintf(stderr, + Name ": failed to allocate disk buffer for %s\n", + devname); + return 2; + } + + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->disks; + dl->fd = keep_fd ? fd : -1; + assert(super->disks == NULL); + super->disks = dl; + serialcpy(dl->serial, serial); + dl->index = -2; + dl->e = NULL; + fd2devname(fd, name); + if (devname) + dl->devname = strdup(devname); + else + dl->devname = strdup(name); + + /* look up this disk's index in the current anchor */ + disk = __serial_to_disk(dl->serial, super->anchor, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of a + * populated contianer, i.e. one with raid_devs + */ + if (is_failed(&dl->disk)) + dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; + } + + return 0; +} + +#ifndef MDASSEMBLE +/* When migrating map0 contains the 'destination' state while map1 + * contains the current state. When not migrating map0 contains the + * current state. This routine assumes that map[0].map_state is set to + * the current array state before being called. 
+ * + * Migration is indicated by one of the following states + * 1/ Idle (migr_state=0 map0state=normal||unitialized||degraded||failed) + * 2/ Initialize (migr_state=1 migr_type=MIGR_INIT map0state=normal + * map1state=unitialized) + * 3/ Repair (Resync) (migr_state=1 migr_type=MIGR_REPAIR map0state=normal + * map1state=normal) + * 4/ Rebuild (migr_state=1 migr_type=MIGR_REBUILD map0state=normal + * map1state=degraded) + */ +static void migrate(struct imsm_dev *dev, __u8 to_state, int migr_type) +{ + struct imsm_map *dest; + struct imsm_map *src = get_imsm_map(dev, 0); + + dev->vol.migr_state = 1; + set_migr_type(dev, migr_type); + dev->vol.curr_migr_unit = 0; + dest = get_imsm_map(dev, 1); + + /* duplicate and then set the target end state in map[0] */ + memcpy(dest, src, sizeof_imsm_map(src)); + if (migr_type == MIGR_REBUILD) { + __u32 ord; + int i; + + for (i = 0; i < src->num_members; i++) { + ord = __le32_to_cpu(src->disk_ord_tbl[i]); + set_imsm_ord_tbl_ent(src, i, ord_to_idx(ord)); + } + } + + src->map_state = to_state; +} + +static void end_migration(struct imsm_dev *dev, __u8 map_state) +{ + struct imsm_map *map = get_imsm_map(dev, 0); + struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state); + int i; + + /* merge any IMSM_ORD_REBUILD bits that were not successfully + * completed in the last migration. 
+ * + * FIXME add support for online capacity expansion and + * raid-level-migration + */ + for (i = 0; i < prev->num_members; i++) + map->disk_ord_tbl[i] |= prev->disk_ord_tbl[i]; + + dev->vol.migr_state = 0; + dev->vol.curr_migr_unit = 0; + map->map_state = map_state; +} +#endif + +static int parse_raid_devices(struct intel_super *super) +{ + int i; + struct imsm_dev *dev_new; + size_t len, len_migr; + size_t space_needed = 0; + struct imsm_super *mpb = super->anchor; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i); + struct intel_dev *dv; + + len = sizeof_imsm_dev(dev_iter, 0); + len_migr = sizeof_imsm_dev(dev_iter, 1); + if (len_migr > len) + space_needed += len_migr - len; + + dv = malloc(sizeof(*dv)); + if (!dv) + return 1; + dev_new = malloc(len_migr); + if (!dev_new) { + free(dv); + return 1; + } + imsm_copy_dev(dev_new, dev_iter); + dv->dev = dev_new; + dv->index = i; + dv->next = super->devlist; + super->devlist = dv; + } + + /* ensure that super->buf is large enough when all raid devices + * are migrating + */ + if (__le32_to_cpu(mpb->mpb_size) + space_needed > super->len) { + void *buf; + + len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + space_needed, 512); + if (posix_memalign(&buf, 512, len) != 0) + return 1; + + memcpy(buf, super->buf, super->len); + memset(buf + super->len, 0, len - super->len); + free(super->buf); + super->buf = buf; + super->len = len; + } + + return 0; +} + +/* retrieve a pointer to the bbm log which starts after all raid devices */ +struct bbm_log *__get_imsm_bbm_log(struct imsm_super *mpb) +{ + void *ptr = NULL; + + if (__le32_to_cpu(mpb->bbm_log_size)) { + ptr = mpb; + ptr += mpb->mpb_size - __le32_to_cpu(mpb->bbm_log_size); + } + + return ptr; +} + +static void __free_imsm(struct intel_super *super, int free_disks); + +/* load_imsm_mpb - read matrix metadata + * allocates super->mpb to be freed by free_super + */ +static int load_imsm_mpb(int fd, struct 
intel_super *super, char *devname) +{ + unsigned long long dsize; + unsigned long long sectors; + struct stat; + struct imsm_super *anchor; + __u32 check_sum; + + get_dev_size(fd, NULL, &dsize); + + if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) { + if (devname) + fprintf(stderr, + Name ": Cannot seek to anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if (posix_memalign((void**)&anchor, 512, 512) != 0) { + if (devname) + fprintf(stderr, + Name ": Failed to allocate imsm anchor buffer" + " on %s\n", devname); + return 1; + } + if (read(fd, anchor, 512) != 512) { + if (devname) + fprintf(stderr, + Name ": Cannot read anchor block on %s: %s\n", + devname, strerror(errno)); + free(anchor); + return 1; + } + + if (strncmp((char *) anchor->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) { + if (devname) + fprintf(stderr, + Name ": no IMSM anchor on %s\n", devname); + free(anchor); + return 2; + } + + __free_imsm(super, 0); + super->len = ROUND_UP(anchor->mpb_size, 512); + if (posix_memalign(&super->buf, 512, super->len) != 0) { + if (devname) + fprintf(stderr, + Name ": unable to allocate %zu byte mpb buffer\n", + super->len); + free(anchor); + return 2; + } + memcpy(super->buf, anchor, 512); + + sectors = mpb_sectors(anchor) - 1; + free(anchor); + if (!sectors) { + check_sum = __gen_imsm_checksum(super->anchor); + if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { + if (devname) + fprintf(stderr, + Name ": IMSM checksum %x != %x on %s\n", + check_sum, + __le32_to_cpu(super->anchor->check_sum), + devname); + return 2; + } + + return 0; + } + + /* read the extended mpb */ + if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) { + if (devname) + fprintf(stderr, + Name ": Cannot seek to extended mpb on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if ((unsigned)read(fd, super->buf + 512, super->len - 512) != super->len - 512) { + if (devname) + fprintf(stderr, + Name ": Cannot read extended mpb on %s: %s\n", + devname, 
strerror(errno)); + return 2; + } + + check_sum = __gen_imsm_checksum(super->anchor); + if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { + if (devname) + fprintf(stderr, + Name ": IMSM checksum %x != %x on %s\n", + check_sum, __le32_to_cpu(super->anchor->check_sum), + devname); + return 3; + } + + /* FIXME the BBM log is disk specific so we cannot use this global + * buffer for all disks. Ok for now since we only look at the global + * bbm_log_size parameter to gate assembly + */ + super->bbm_log = __get_imsm_bbm_log(super->anchor); + + return 0; +} + +static int +load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd) +{ + int err; + + err = load_imsm_mpb(fd, super, devname); + if (err) + return err; + err = load_imsm_disk(fd, super, devname, keep_fd); + if (err) + return err; + err = parse_raid_devices(super); + + return err; +} + +static void __free_imsm_disk(struct dl *d) +{ + if (d->fd >= 0) + close(d->fd); + if (d->devname) + free(d->devname); + if (d->e) + free(d->e); + free(d); + +} +static void free_imsm_disks(struct intel_super *super) +{ + struct dl *d; + + while (super->disks) { + d = super->disks; + super->disks = d->next; + __free_imsm_disk(d); + } + while (super->missing) { + d = super->missing; + super->missing = d->next; + __free_imsm_disk(d); + } + +} + +/* free all the pieces hanging off of a super pointer */ +static void __free_imsm(struct intel_super *super, int free_disks) +{ + if (super->buf) { + free(super->buf); + super->buf = NULL; + } + if (free_disks) + free_imsm_disks(super); + free_devlist(super); + if (super->hba) { + free((void *) super->hba); + super->hba = NULL; + } +} + +static void free_imsm(struct intel_super *super) +{ + __free_imsm(super, 1); + free(super); +} + +static void free_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + + if (!super) + return; + + free_imsm(super); + st->sb = NULL; +} + +static struct intel_super *alloc_super(void) +{ + struct intel_super 
*super = malloc(sizeof(*super)); + + if (super) { + memset(super, 0, sizeof(*super)); + super->current_vol = -1; + super->create_offset = ~((__u32 ) 0); + if (!check_env("IMSM_NO_PLATFORM")) + super->orom = find_imsm_orom(); + if (super->orom && !check_env("IMSM_TEST_OROM")) { + struct sys_dev *list, *ent; + + /* find the first intel ahci controller */ + list = find_driver_devices("pci", "ahci"); + for (ent = list; ent; ent = ent->next) + if (devpath_to_vendor(ent->path) == 0x8086) + break; + if (ent) { + super->hba = ent->path; + ent->path = NULL; + } + free_sys_dev(&list); + } + } + + return super; +} + +#ifndef MDASSEMBLE +/* find_missing - helper routine for load_super_imsm_all that identifies + * disks that have disappeared from the system. This routine relies on + * the mpb being uptodate, which it is at load time. + */ +static int find_missing(struct intel_super *super) +{ + int i; + struct imsm_super *mpb = super->anchor; + struct dl *dl; + struct imsm_disk *disk; + + for (i = 0; i < mpb->num_disks; i++) { + disk = __get_imsm_disk(mpb, i); + dl = serial_to_dl(disk->serial, super); + if (dl) + continue; + + dl = malloc(sizeof(*dl)); + if (!dl) + return 1; + dl->major = 0; + dl->minor = 0; + dl->fd = -1; + dl->devname = strdup("missing"); + dl->index = i; + serialcpy(dl->serial, disk->serial); + dl->disk = *disk; + dl->e = NULL; + dl->next = super->missing; + super->missing = dl; + } + + return 0; +} + +static struct intel_disk *disk_list_get(__u8 *serial, struct intel_disk *disk_list) +{ + struct intel_disk *idisk = disk_list; + + while (idisk) { + if (serialcmp(idisk->disk.serial, serial) == 0) + break; + idisk = idisk->next; + } + + return idisk; +} + +static int __prep_thunderdome(struct intel_super **table, int tbl_size, + struct intel_super *super, + struct intel_disk **disk_list) +{ + struct imsm_disk *d = &super->disks->disk; + struct imsm_super *mpb = super->anchor; + int i, j; + + for (i = 0; i < tbl_size; i++) { + struct imsm_super *tbl_mpb = 
table[i]->anchor; + struct imsm_disk *tbl_d = &table[i]->disks->disk; + + if (tbl_mpb->family_num == mpb->family_num) { + if (tbl_mpb->check_sum == mpb->check_sum) { + dprintf("%s: mpb from %d:%d matches %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + break; + } + + if (((is_configured(d) && !is_configured(tbl_d)) || + is_configured(d) == is_configured(tbl_d)) && + tbl_mpb->generation_num < mpb->generation_num) { + /* current version of the mpb is a + * better candidate than the one in + * super_table, but copy over "cross + * generational" status + */ + struct intel_disk *idisk; + + dprintf("%s: mpb from %d:%d replaces %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + idisk = disk_list_get(tbl_d->serial, *disk_list); + if (idisk && is_failed(&idisk->disk)) + tbl_d->status |= FAILED_DISK; + break; + } else { + struct intel_disk *idisk; + struct imsm_disk *disk; + + /* tbl_mpb is more up to date, but copy + * over cross generational status before + * returning + */ + disk = __serial_to_disk(d->serial, mpb, NULL); + if (disk && is_failed(disk)) + d->status |= FAILED_DISK; + + idisk = disk_list_get(d->serial, *disk_list); + if (idisk) { + idisk->owner = i; + if (disk && is_configured(disk)) + idisk->disk.status |= CONFIGURED_DISK; + } + + dprintf("%s: mpb from %d:%d prefer %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + return tbl_size; + } + } + } + + if (i >= tbl_size) + table[tbl_size++] = super; + else + table[i] = super; + + /* update/extend the merged list of imsm_disk records */ + for (j = 0; j < mpb->num_disks; j++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, j); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, *disk_list); + if (idisk) { + idisk->disk.status |= disk->status; + if (is_configured(&idisk->disk) || + 
is_failed(&idisk->disk)) + idisk->disk.status &= ~(SPARE_DISK); + } else { + idisk = calloc(1, sizeof(*idisk)); + if (!idisk) + return -1; + idisk->owner = IMSM_UNKNOWN_OWNER; + idisk->disk = *disk; + idisk->next = *disk_list; + *disk_list = idisk; + } + + if (serialcmp(idisk->disk.serial, d->serial) == 0) + idisk->owner = i; + } + + return tbl_size; +} + +static struct intel_super * +validate_members(struct intel_super *super, struct intel_disk *disk_list, + const int owner) +{ + struct imsm_super *mpb = super->anchor; + int ok_count = 0; + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, disk_list); + if (idisk) { + if (idisk->owner == owner || + idisk->owner == IMSM_UNKNOWN_OWNER) + ok_count++; + else + dprintf("%s: '%.16s' owner %d != %d\n", + __func__, disk->serial, idisk->owner, + owner); + } else { + dprintf("%s: unknown disk %x [%d]: %.16s\n", + __func__, __le32_to_cpu(mpb->family_num), i, + disk->serial); + break; + } + } + + if (ok_count == mpb->num_disks) + return super; + return NULL; +} + +static void show_conflicts(__u32 family_num, struct intel_super *super_list) +{ + struct intel_super *s; + + for (s = super_list; s; s = s->next) { + if (family_num != s->anchor->family_num) + continue; + fprintf(stderr, "Conflict, offlining family %#x on '%s'\n", + __le32_to_cpu(family_num), s->disks->devname); + } +} + +static struct intel_super * +imsm_thunderdome(struct intel_super **super_list, int len) +{ + struct intel_super *super_table[len]; + struct intel_disk *disk_list = NULL; + struct intel_super *champion, *spare; + struct intel_super *s, **del; + int tbl_size = 0; + int conflict; + int i; + + memset(super_table, 0, sizeof(super_table)); + for (s = *super_list; s; s = s->next) + tbl_size = __prep_thunderdome(super_table, tbl_size, s, &disk_list); + + for (i = 0; i < tbl_size; i++) { + struct imsm_disk *d; + struct intel_disk *idisk; + 
struct imsm_super *mpb = super_table[i]->anchor; + + s = super_table[i]; + d = &s->disks->disk; + + /* 'd' must appear in merged disk list for its + * configuration to be valid + */ + idisk = disk_list_get(d->serial, disk_list); + if (idisk && idisk->owner == i) + s = validate_members(s, disk_list, i); + else + s = NULL; + + if (!s) + dprintf("%s: marking family: %#x from %d:%d offline\n", + __func__, mpb->family_num, + super_table[i]->disks->major, + super_table[i]->disks->minor); + super_table[i] = s; + } + + /* This is where the mdadm implementation differs from the Windows + * driver which has no strict concept of a container. We can only + * assemble one family from a container, so when returning a prodigal + * array member to this system the code will not be able to disambiguate + * the container contents that should be assembled ("foreign" versus + * "local"). It requires user intervention to set the orig_family_num + * to a new value to establish a new container. The Windows driver in + * this situation fixes up the volume name in place and manages the + * foreign array as an independent entity. 
+ */ + s = NULL; + spare = NULL; + conflict = 0; + for (i = 0; i < tbl_size; i++) { + struct intel_super *tbl_ent = super_table[i]; + int is_spare = 0; + + if (!tbl_ent) + continue; + + if (tbl_ent->anchor->num_raid_devs == 0) { + spare = tbl_ent; + is_spare = 1; + } + + if (s && !is_spare) { + show_conflicts(tbl_ent->anchor->family_num, *super_list); + conflict++; + } else if (!s && !is_spare) + s = tbl_ent; + } + + if (!s) + s = spare; + if (!s) { + champion = NULL; + goto out; + } + champion = s; + + if (conflict) + fprintf(stderr, "Chose family %#x on '%s', " + "assemble conflicts to new container with '--update=uuid'\n", + __le32_to_cpu(s->anchor->family_num), s->disks->devname); + + /* collect all dl's onto 'champion', and update them to + * champion's version of the status + */ + for (s = *super_list; s; s = s->next) { + struct imsm_super *mpb = champion->anchor; + struct dl *dl = s->disks; + + if (s == champion) + continue; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk; + + disk = __serial_to_disk(dl->serial, mpb, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of + * a populated contianer, i.e. 
one with + * raid_devs + */ + if (is_failed(&dl->disk)) + dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; + break; + } + } + + if (i >= mpb->num_disks) { + struct intel_disk *idisk; + + idisk = disk_list_get(dl->serial, disk_list); + if (idisk && is_spare(&idisk->disk) && + !is_failed(&idisk->disk) && !is_configured(&idisk->disk)) + dl->index = -1; + else { + dl->index = -2; + continue; + } + } + + dl->next = champion->disks; + champion->disks = dl; + s->disks = NULL; + } + + /* delete 'champion' from super_list */ + for (del = super_list; *del; ) { + if (*del == champion) { + *del = (*del)->next; + break; + } else + del = &(*del)->next; + } + champion->next = NULL; + + out: + while (disk_list) { + struct intel_disk *idisk = disk_list; + + disk_list = disk_list->next; + free(idisk); + } + + return champion; +} + +static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, + char *devname, int keep_fd) +{ + struct mdinfo *sra; + struct intel_super *super_list = NULL; + struct intel_super *super = NULL; + int devnum = fd2devnum(fd); + struct mdinfo *sd; + int retry; + int err = 0; + int i; + + /* check if 'fd' an opened container */ + sra = sysfs_read(fd, 0, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); + if (!sra) + return 1; + + if (sra->array.major_version != -1 || + sra->array.minor_version != -2 || + strcmp(sra->text_version, "imsm") != 0) { + err = 1; + goto error; + } + /* load all mpbs */ + for (sd = sra->devs, i = 0; sd; sd = sd->next, i++) { + struct intel_super *s = alloc_super(); + char nm[32]; + int dfd; + + err = 1; + if (!s) + goto error; + s->next = super_list; + super_list = s; + + err = 2; + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, keep_fd ? 
O_RDWR : O_RDONLY); + if (dfd < 0) + goto error; + + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + + /* retry the load if we might have raced against mdmon */ + if (err == 3 && mdmon_running(devnum)) + for (retry = 0; retry < 3; retry++) { + usleep(3000); + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + if (err != 3) + break; + } + if (!keep_fd) + close(dfd); + if (err) + goto error; + } + + /* all mpbs enter, maybe one leaves */ + super = imsm_thunderdome(&super_list, i); + if (!super) { + err = 1; + goto error; + } + + if (find_missing(super) != 0) { + free_imsm(super); + err = 2; + goto error; + } + + if (st->subarray[0]) { + unsigned long val; + char *ep; + + err = 1; + val = strtoul(st->subarray, &ep, 10); + if (*ep != '\0') { + free_imsm(super); + goto error; + } + + if (val < super->anchor->num_raid_devs) + super->current_vol = val; + else { + free_imsm(super); + goto error; + } + } + err = 0; + + error: + while (super_list) { + struct intel_super *s = super_list; + + super_list = super_list->next; + free_imsm(s); + } + sysfs_free(sra); + + if (err) + return err; + + *sbp = super; + st->container_dev = devnum; + if (err == 0 && st->ss == NULL) { + st->ss = &super_imsm; + st->minor_version = 0; + st->max_devs = IMSM_MAX_DEVICES; + } + st->loaded_container = 1; + + return 0; +} +#endif + +static int load_super_imsm(struct supertype *st, int fd, char *devname) +{ + struct intel_super *super; + int rv; + +#ifndef MDASSEMBLE + if (load_super_imsm_all(st, fd, &st->sb, devname, 1) == 0) + return 0; +#endif + + if (test_partition(fd)) + /* IMSM not allowed on partitions */ + return 1; + + free_super_imsm(st); + + super = alloc_super(); + if (!super) { + fprintf(stderr, + Name ": malloc of %zu failed.\n", + sizeof(*super)); + return 1; + } + + rv = load_and_parse_mpb(fd, super, devname, 0); + + if (rv) { + if (devname) + fprintf(stderr, + Name ": Failed to load all information " + "sections on %s\n", devname); + free_imsm(super); + return rv; + } + + if 
(st->subarray[0]) { + unsigned long val; + char *ep; + + val = strtoul(st->subarray, &ep, 10); + if (*ep != '\0') { + free_imsm(super); + return 1; + } + + if (val < super->anchor->num_raid_devs) + super->current_vol = val; + else { + free_imsm(super); + return 1; + } + } + + st->sb = super; + if (st->ss == NULL) { + st->ss = &super_imsm; + st->minor_version = 0; + st->max_devs = IMSM_MAX_DEVICES; + } + st->loaded_container = 0; + + return 0; +} + +static __u16 info_to_blocks_per_strip(mdu_array_info_t *info) +{ + if (info->level == 1) + return 128; + return info->chunk_size >> 9; +} + +static __u32 info_to_num_data_stripes(mdu_array_info_t *info, int num_domains) +{ + __u32 num_stripes; + + num_stripes = (info->size * 2) / info_to_blocks_per_strip(info); + num_stripes /= num_domains; + + return num_stripes; +} + +static __u32 info_to_blocks_per_member(mdu_array_info_t *info) +{ + if (info->level == 1) + return info->size * 2; + else + return (info->size * 2) & ~(info_to_blocks_per_strip(info) - 1); +} + +static void imsm_update_version_info(struct intel_super *super) +{ + /* update the version and attributes */ + struct imsm_super *mpb = super->anchor; + char *version; + struct imsm_dev *dev; + struct imsm_map *map; + int i; + + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, 0); + if (__le32_to_cpu(dev->size_high) > 0) + mpb->attributes |= MPB_ATTRIB_2TB; + + /* FIXME detect when an array spans a port multiplier */ + #if 0 + mpb->attributes |= MPB_ATTRIB_PM; + #endif + + if (mpb->num_raid_devs > 1 || + mpb->attributes != MPB_ATTRIB_CHECKSUM_VERIFY) { + version = MPB_VERSION_ATTRIBS; + switch (get_imsm_raid_level(map)) { + case 0: mpb->attributes |= MPB_ATTRIB_RAID0; break; + case 1: mpb->attributes |= MPB_ATTRIB_RAID1; break; + case 10: mpb->attributes |= MPB_ATTRIB_RAID10; break; + case 5: mpb->attributes |= MPB_ATTRIB_RAID5; break; + } + } else { + if (map->num_members >= 5) + version = 
MPB_VERSION_5OR6_DISK_ARRAY; + else if (dev->status == DEV_CLONE_N_GO) + version = MPB_VERSION_CNG; + else if (get_imsm_raid_level(map) == 5) + version = MPB_VERSION_RAID5; + else if (map->num_members >= 3) + version = MPB_VERSION_3OR4_DISK_ARRAY; + else if (get_imsm_raid_level(map) == 1) + version = MPB_VERSION_RAID1; + else + version = MPB_VERSION_RAID0; + } + strcpy(((char *) mpb->sig) + strlen(MPB_SIGNATURE), version); + } +} + +static int check_name(struct intel_super *super, char *name, int quiet) +{ + struct imsm_super *mpb = super->anchor; + char *reason = NULL; + int i; + + if (strlen(name) > MAX_RAID_SERIAL_LEN) + reason = "must be 16 characters or less"; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + + if (strncmp((char *) dev->volume, name, MAX_RAID_SERIAL_LEN) == 0) { + reason = "already exists"; + break; + } + } + + if (reason && !quiet) + fprintf(stderr, Name ": imsm volume name %s\n", reason); + + return !reason; +} + +static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *name, + char *homehost, int *uuid) +{ + /* We are creating a volume inside a pre-existing container. + * so st->sb is already set. 
+ */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct intel_dev *dv; + struct imsm_dev *dev; + struct imsm_vol *vol; + struct imsm_map *map; + int idx = mpb->num_raid_devs; + int i; + unsigned long long array_blocks; + size_t size_old, size_new; + __u32 num_data_stripes; + + if (super->orom && mpb->num_raid_devs >= super->orom->vpa) { + fprintf(stderr, Name": This imsm-container already has the " + "maximum of %d volumes\n", super->orom->vpa); + return 0; + } + + /* ensure the mpb is large enough for the new data */ + size_old = __le32_to_cpu(mpb->mpb_size); + size_new = disks_to_mpb_size(info->nr_disks); + if (size_new > size_old) { + void *mpb_new; + size_t size_round = ROUND_UP(size_new, 512); + + if (posix_memalign(&mpb_new, 512, size_round) != 0) { + fprintf(stderr, Name": could not allocate new mpb\n"); + return 0; + } + memcpy(mpb_new, mpb, size_old); + free(mpb); + mpb = mpb_new; + super->anchor = mpb_new; + mpb->mpb_size = __cpu_to_le32(size_new); + memset(mpb_new + size_old, 0, size_round - size_old); + } + super->current_vol = idx; + /* when creating the first raid device in this container set num_disks + * to zero, i.e. 
delete this spare and add raid member devices in + * add_to_super_imsm_volume() + */ + if (super->current_vol == 0) + mpb->num_disks = 0; + + if (!check_name(super, name, 0)) + return 0; + sprintf(st->subarray, "%d", idx); + dv = malloc(sizeof(*dv)); + if (!dv) { + fprintf(stderr, Name ": failed to allocate device list entry\n"); + return 0; + } + dev = malloc(sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1)); + if (!dev) { + free(dv); + fprintf(stderr, Name": could not allocate raid device\n"); + return 0; + } + strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN); + if (info->level == 1) + array_blocks = info_to_blocks_per_member(info); + else + array_blocks = calc_array_size(info->level, info->raid_disks, + info->layout, info->chunk_size, + info->size*2); + /* round array size down to closest MB */ + array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT; + + dev->size_low = __cpu_to_le32((__u32) array_blocks); + dev->size_high = __cpu_to_le32((__u32) (array_blocks >> 32)); + dev->status = __cpu_to_le32(0); + dev->reserved_blocks = __cpu_to_le32(0); + vol = &dev->vol; + vol->migr_state = 0; + set_migr_type(dev, MIGR_INIT); + vol->dirty = 0; + vol->curr_migr_unit = 0; + map = get_imsm_map(dev, 0); + map->pba_of_lba0 = __cpu_to_le32(super->create_offset); + map->blocks_per_member = __cpu_to_le32(info_to_blocks_per_member(info)); + map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info)); + map->failed_disk_num = ~0; + map->map_state = info->level ? 
IMSM_T_STATE_UNINITIALIZED : + IMSM_T_STATE_NORMAL; + map->ddf = 1; + + if (info->level == 1 && info->raid_disks > 2) { + free(dev); + free(dv); + fprintf(stderr, Name": imsm does not support more than 2 disks" + "in a raid1 volume\n"); + return 0; + } + + map->raid_level = info->level; + if (info->level == 10) { + map->raid_level = 1; + map->num_domains = info->raid_disks / 2; + } else if (info->level == 1) + map->num_domains = info->raid_disks; + else + map->num_domains = 1; + + num_data_stripes = info_to_num_data_stripes(info, map->num_domains); + map->num_data_stripes = __cpu_to_le32(num_data_stripes); + + map->num_members = info->raid_disks; + for (i = 0; i < map->num_members; i++) { + /* initialized in add_to_super */ + set_imsm_ord_tbl_ent(map, i, IMSM_ORD_REBUILD); + } + mpb->num_raid_devs++; + + dv->dev = dev; + dv->index = super->current_vol; + dv->next = super->devlist; + super->devlist = dv; + + imsm_update_version_info(super); + + return 1; +} + +static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *name, + char *homehost, int *uuid) +{ + /* This is primarily called by Create when creating a new array. + * We will then get add_to_super called for each component, and then + * write_init_super called to write it out to each device. + * For IMSM, Create can create on fresh devices or on a pre-existing + * array. + * To create on a pre-existing array a different method will be called. + * This one is just for fresh drives. 
+ */ + struct intel_super *super; + struct imsm_super *mpb; + size_t mpb_size; + char *version; + + if (st->sb) + return init_super_imsm_volume(st, info, size, name, homehost, uuid); + + if (info) + mpb_size = disks_to_mpb_size(info->nr_disks); + else + mpb_size = 512; + + super = alloc_super(); + if (super && posix_memalign(&super->buf, 512, mpb_size) != 0) { + free(super); + super = NULL; + } + if (!super) { + fprintf(stderr, Name + ": %s could not allocate superblock\n", __func__); + return 0; + } + memset(super->buf, 0, mpb_size); + mpb = super->buf; + mpb->mpb_size = __cpu_to_le32(mpb_size); + st->sb = super; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } + + mpb->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; + + version = (char *) mpb->sig; + strcpy(version, MPB_SIGNATURE); + version += strlen(MPB_SIGNATURE); + strcpy(version, MPB_VERSION_RAID0); + + return 1; +} + +#ifndef MDASSEMBLE +static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct dl *dl; + struct imsm_dev *dev; + struct imsm_map *map; + int slot; + + dev = get_imsm_dev(super, super->current_vol); + map = get_imsm_map(dev, 0); + + if (! 
(dk->state & (1<disks; dl; dl = dl->next) + if (dl->raiddisk == dk->raid_disk) + break; + } else { + for (dl = super->disks; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + } + + if (!dl) { + fprintf(stderr, Name ": %s is not a member of the same container\n", devname); + return 1; + } + + /* add a pristine spare to the metadata */ + if (dl->index < 0) { + dl->index = super->anchor->num_disks; + super->anchor->num_disks++; + } + /* Check the device has not already been added */ + slot = get_imsm_disk_slot(map, dl->index); + if (slot >= 0 && + (get_imsm_ord_tbl_ent(dev, slot) & IMSM_ORD_REBUILD) == 0) { + fprintf(stderr, Name ": %s has been included in this array twice\n", + devname); + return 1; + } + set_imsm_ord_tbl_ent(map, dk->number, dl->index); + dl->disk.status = CONFIGURED_DISK; + + /* if we are creating the first raid device update the family number */ + if (super->current_vol == 0) { + __u32 sum; + struct imsm_dev *_dev = __get_imsm_dev(mpb, 0); + struct imsm_disk *_disk = __get_imsm_disk(mpb, dl->index); + + if (!_dev || !_disk) { + fprintf(stderr, Name ": BUG mpb setup error\n"); + return 1; + } + *_dev = *dev; + *_disk = dl->disk; + sum = random32(); + sum += __gen_imsm_checksum(mpb); + mpb->family_num = __cpu_to_le32(sum); + mpb->orig_family_num = mpb->family_num; + } + + return 0; +} + +static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname) +{ + struct intel_super *super = st->sb; + struct dl *dd; + unsigned long long size; + __u32 id; + int rv; + struct stat stb; + + /* if we are on an RAID enabled platform check that the disk is + * attached to the raid controller + */ + if (super->hba && !disk_attached_to_hba(fd, super->hba)) { + fprintf(stderr, + Name ": %s is not attached to the raid controller: %s\n", + devname ? 
: "disk", super->hba); + return 1; + } + + if (super->current_vol >= 0) + return add_to_super_imsm_volume(st, dk, fd, devname); + + fstat(fd, &stb); + dd = malloc(sizeof(*dd)); + if (!dd) { + fprintf(stderr, + Name ": malloc failed %s:%d.\n", __func__, __LINE__); + return 1; + } + memset(dd, 0, sizeof(*dd)); + dd->major = major(stb.st_rdev); + dd->minor = minor(stb.st_rdev); + dd->index = -1; + dd->devname = devname ? strdup(devname) : NULL; + dd->fd = fd; + dd->e = NULL; + rv = imsm_read_serial(fd, devname, dd->serial); + if (rv) { + fprintf(stderr, + Name ": failed to retrieve scsi serial, aborting\n"); + free(dd); + abort(); + } + + get_dev_size(fd, NULL, &size); + size /= 512; + serialcpy(dd->disk.serial, dd->serial); + dd->disk.total_blocks = __cpu_to_le32(size); + dd->disk.status = SPARE_DISK; + if (sysfs_disk_to_scsi_id(fd, &id) == 0) + dd->disk.scsi_id = __cpu_to_le32(id); + else + dd->disk.scsi_id = __cpu_to_le32(0); + + if (st->update_tail) { + dd->next = super->add; + super->add = dd; + } else { + dd->next = super->disks; + super->disks = dd; + } + + return 0; +} + +static int store_imsm_mpb(int fd, struct imsm_super *mpb); + +static union { + char buf[512]; + struct imsm_super anchor; +} spare_record __attribute__ ((aligned(512))); + +/* spare records have their own family number and do not have any defined raid + * devices + */ +static int write_super_imsm_spares(struct intel_super *super, int doclose) +{ + struct imsm_super *mpb = super->anchor; + struct imsm_super *spare = &spare_record.anchor; + __u32 sum; + struct dl *d; + + spare->mpb_size = __cpu_to_le32(sizeof(struct imsm_super)), + spare->generation_num = __cpu_to_le32(1UL), + spare->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; + spare->num_disks = 1, + spare->num_raid_devs = 0, + spare->cache_size = mpb->cache_size, + spare->pwr_cycle_count = __cpu_to_le32(1), + + snprintf((char *) spare->sig, MAX_SIGNATURE_LENGTH, + MPB_SIGNATURE MPB_VERSION_RAID0); + + for (d = super->disks; d; d = d->next) { + 
if (d->index != -1) + continue; + + spare->disk[0] = d->disk; + sum = __gen_imsm_checksum(spare); + spare->family_num = __cpu_to_le32(sum); + spare->orig_family_num = 0; + sum = __gen_imsm_checksum(spare); + spare->check_sum = __cpu_to_le32(sum); + + if (store_imsm_mpb(d->fd, spare)) { + fprintf(stderr, "%s: failed for device %d:%d %s\n", + __func__, d->major, d->minor, strerror(errno)); + return 1; + } + if (doclose) { + close(d->fd); + d->fd = -1; + } + } + + return 0; +} + +static int write_super_imsm(struct intel_super *super, int doclose) +{ + struct imsm_super *mpb = super->anchor; + struct dl *d; + __u32 generation; + __u32 sum; + int spares = 0; + int i; + __u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk); + + /* 'generation' is incremented everytime the metadata is written */ + generation = __le32_to_cpu(mpb->generation_num); + generation++; + mpb->generation_num = __cpu_to_le32(generation); + + /* fix up cases where previous mdadm releases failed to set + * orig_family_num + */ + if (mpb->orig_family_num == 0) + mpb->orig_family_num = mpb->family_num; + + mpb_size += sizeof(struct imsm_disk) * mpb->num_disks; + for (d = super->disks; d; d = d->next) { + if (d->index == -1) + spares++; + else + mpb->disk[d->index] = d->disk; + } + for (d = super->missing; d; d = d->next) + mpb->disk[d->index] = d->disk; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + + imsm_copy_dev(dev, get_imsm_dev(super, i)); + mpb_size += sizeof_imsm_dev(dev, 0); + } + mpb_size += __le32_to_cpu(mpb->bbm_log_size); + mpb->mpb_size = __cpu_to_le32(mpb_size); + + /* recalculate checksum */ + sum = __gen_imsm_checksum(mpb); + mpb->check_sum = __cpu_to_le32(sum); + + /* write the mpb for disks that compose raid devices */ + for (d = super->disks; d ; d = d->next) { + if (d->index < 0) + continue; + if (store_imsm_mpb(d->fd, mpb)) + fprintf(stderr, "%s: failed for device %d:%d %s\n", + __func__, d->major, d->minor, 
strerror(errno)); + if (doclose) { + close(d->fd); + d->fd = -1; + } + } + + if (spares) + return write_super_imsm_spares(super, doclose); + + return 0; +} + + +static int create_array(struct supertype *st, int dev_idx) +{ + size_t len; + struct imsm_update_create_array *u; + struct intel_super *super = st->sb; + struct imsm_dev *dev = get_imsm_dev(super, dev_idx); + struct imsm_map *map = get_imsm_map(dev, 0); + struct disk_info *inf; + struct imsm_disk *disk; + int i; + + len = sizeof(*u) - sizeof(*dev) + sizeof_imsm_dev(dev, 0) + + sizeof(*inf) * map->num_members; + u = malloc(len); + if (!u) { + fprintf(stderr, "%s: failed to allocate update buffer\n", + __func__); + return 1; + } + + u->type = update_create_array; + u->dev_idx = dev_idx; + imsm_copy_dev(&u->dev, dev); + inf = get_disk_info(u); + for (i = 0; i < map->num_members; i++) { + int idx = get_imsm_disk_idx(dev, i); + + disk = get_imsm_disk(super, idx); + serialcpy(inf[i].serial, disk->serial); + } + append_metadata_update(st, u, len); + + return 0; +} + +static int _add_disk(struct supertype *st) +{ + struct intel_super *super = st->sb; + size_t len; + struct imsm_update_add_disk *u; + + if (!super->add) + return 0; + + len = sizeof(*u); + u = malloc(len); + if (!u) { + fprintf(stderr, "%s: failed to allocate update buffer\n", + __func__); + return 1; + } + + u->type = update_add_disk; + append_metadata_update(st, u, len); + + return 0; +} + +static int write_init_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + int current_vol = super->current_vol; + + /* we are done with current_vol reset it to point st at the container */ + super->current_vol = -1; + + if (st->update_tail) { + /* queue the recently created array / added disk + * as a metadata update */ + struct dl *d; + int rv; + + /* determine if we are creating a volume or adding a disk */ + if (current_vol < 0) { + /* in the add disk case we are running in mdmon + * context, so don't close fd's + */ + return 
_add_disk(st); + } else + rv = create_array(st, current_vol); + + for (d = super->disks; d ; d = d->next) { + close(d->fd); + d->fd = -1; + } + + return rv; + } else { + struct dl *d; + for (d = super->disks; d; d = d->next) + Kill(d->devname, NULL, 0, 1, 1); + return write_super_imsm(st->sb, 1); + } +} +#endif + +static int store_super_imsm(struct supertype *st, int fd) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super ? super->anchor : NULL; + + if (!mpb) + return 1; + +#ifndef MDASSEMBLE + return store_imsm_mpb(fd, mpb); +#else + return 1; +#endif +} + +static int imsm_bbm_log_size(struct imsm_super *mpb) +{ + return __le32_to_cpu(mpb->bbm_log_size); +} + +#ifndef MDASSEMBLE +static int validate_geometry_imsm_container(struct supertype *st, int level, + int layout, int raiddisks, int chunk, + unsigned long long size, char *dev, + unsigned long long *freesize, + int verbose) +{ + int fd; + unsigned long long ldsize; + const struct imsm_orom *orom; + + if (level != LEVEL_CONTAINER) + return 0; + if (!dev) + return 1; + + if (check_env("IMSM_NO_PLATFORM")) + orom = NULL; + else + orom = find_imsm_orom(); + if (orom && raiddisks > orom->tds) { + if (verbose) + fprintf(stderr, Name ": %d exceeds maximum number of" + " platform supported disks: %d\n", + raiddisks, orom->tds); + return 0; + } + + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + fprintf(stderr, Name ": imsm: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + if (!get_dev_size(fd, dev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + *freesize = avail_size_imsm(st, ldsize >> 9); + + return 1; +} + +static unsigned long long find_size(struct extent *e, int *idx, int num_extents) +{ + const unsigned long long base_start = e[*idx].start; + unsigned long long end = base_start + e[*idx].size; + int i; + + if (base_start == end) + return 0; + + *idx = *idx + 1; + for (i = *idx; i < num_extents; i++) { + /* extend overlapping extents */ + if 
(e[i].start >= base_start && + e[i].start <= end) { + if (e[i].size == 0) + return 0; + if (e[i].start + e[i].size > end) + end = e[i].start + e[i].size; + } else if (e[i].start > end) { + *idx = i; + break; + } + } + + return end - base_start; +} + +static unsigned long long merge_extents(struct intel_super *super, int sum_extents) +{ + /* build a composite disk with all known extents and generate a new + * 'maxsize' given the "all disks in an array must share a common start + * offset" constraint + */ + struct extent *e = calloc(sum_extents, sizeof(*e)); + struct dl *dl; + int i, j; + int start_extent; + unsigned long long pos; + unsigned long long start = 0; + unsigned long long maxsize; + unsigned long reserve; + + if (!e) + return 0; + + /* coalesce and sort all extents. also, check to see if we need to + * reserve space between member arrays + */ + j = 0; + for (dl = super->disks; dl; dl = dl->next) { + if (!dl->e) + continue; + for (i = 0; i < dl->extent_cnt; i++) + e[j++] = dl->e[i]; + } + qsort(e, sum_extents, sizeof(*e), cmp_extent); + + /* merge extents */ + i = 0; + j = 0; + while (i < sum_extents) { + e[j].start = e[i].start; + e[j].size = find_size(e, &i, sum_extents); + j++; + if (e[j-1].size == 0) + break; + } + + pos = 0; + maxsize = 0; + start_extent = 0; + i = 0; + do { + unsigned long long esize; + + esize = e[i].start - pos; + if (esize >= maxsize) { + maxsize = esize; + start = pos; + start_extent = i; + } + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + free(e); + + if (maxsize == 0) + return 0; + + /* FIXME assumes volume at offset 0 is the first volume in a + * container + */ + if (start_extent > 0) + reserve = IMSM_RESERVED_SECTORS; /* gap between raid regions */ + else + reserve = 0; + + if (maxsize < reserve) + return 0; + + super->create_offset = ~((__u32) 0); + if (start + reserve > super->create_offset) + return 0; /* start overflows create_offset */ + super->create_offset = start + reserve; + + return maxsize - 
reserve; +} + +static int is_raid_level_supported(const struct imsm_orom *orom, int level, int raiddisks) +{ + if (level < 0 || level == 6 || level == 4) + return 0; + + /* if we have an orom prevent invalid raid levels */ + if (orom) + switch (level) { + case 0: return imsm_orom_has_raid0(orom); + case 1: + if (raiddisks > 2) + return imsm_orom_has_raid1e(orom); + return imsm_orom_has_raid1(orom) && raiddisks == 2; + case 10: return imsm_orom_has_raid10(orom) && raiddisks == 4; + case 5: return imsm_orom_has_raid5(orom) && raiddisks > 2; + } + else + return 1; /* not on an Intel RAID platform so anything goes */ + + return 0; +} + +#define pr_vrb(fmt, arg...) (void) (verbose && fprintf(stderr, Name fmt, ##arg)) +static int +validate_geometry_imsm_orom(struct intel_super *super, int level, int layout, + int raiddisks, int chunk, int verbose) +{ + if (!is_raid_level_supported(super->orom, level, raiddisks)) { + pr_vrb(": platform does not support raid%d with %d disk%s\n", + level, raiddisks, raiddisks > 1 ? 
"s" : ""); + return 0; + } + if (super->orom && level != 1 && + !imsm_orom_has_chunk(super->orom, chunk)) { + pr_vrb(": platform does not support a chunk size of: %d\n", chunk); + return 0; + } + if (layout != imsm_level_to_layout(level)) { + if (level == 5) + pr_vrb(": imsm raid 5 only supports the left-asymmetric layout\n"); + else if (level == 10) + pr_vrb(": imsm raid 10 only supports the n2 layout\n"); + else + pr_vrb(": imsm unknown layout %#x for this raid level %d\n", + layout, level); + return 0; + } + + return 1; +} + +/* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd + * FIX ME add ahci details + */ +static int validate_geometry_imsm_volume(struct supertype *st, int level, + int layout, int raiddisks, int chunk, + unsigned long long size, char *dev, + unsigned long long *freesize, + int verbose) +{ + struct stat stb; + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct dl *dl; + unsigned long long pos = 0; + unsigned long long maxsize; + struct extent *e; + int i; + + /* We must have the container info already read in. 
*/ + if (!super) + return 0; + + if (!validate_geometry_imsm_orom(super, level, layout, raiddisks, chunk, verbose)) + return 0; + + if (!dev) { + /* General test: make sure there is space for + * 'raiddisks' device extents of size 'size' at a given + * offset + */ + unsigned long long minsize = size; + unsigned long long start_offset = MaxSector; + int dcnt = 0; + if (minsize == 0) + minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + for (dl = super->disks; dl ; dl = dl->next) { + int found = 0; + + pos = 0; + i = 0; + e = get_extents(super, dl); + if (!e) continue; + do { + unsigned long long esize; + esize = e[i].start - pos; + if (esize >= minsize) + found = 1; + if (found && start_offset == MaxSector) { + start_offset = pos; + break; + } else if (found && pos != start_offset) { + found = 0; + break; + } + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + if (found) + dcnt++; + free(e); + } + if (dcnt < raiddisks) { + if (verbose) + fprintf(stderr, Name ": imsm: Not enough " + "devices with space for this array " + "(%d < %d)\n", + dcnt, raiddisks); + return 0; + } + return 1; + } + + /* This device must be a member of the set */ + if (stat(dev, &stb) < 0) + return 0; + if ((S_IFMT & stb.st_mode) != S_IFBLK) + return 0; + for (dl = super->disks ; dl ; dl = dl->next) { + if (dl->major == (int)major(stb.st_rdev) && + dl->minor == (int)minor(stb.st_rdev)) + break; + } + if (!dl) { + if (verbose) + fprintf(stderr, Name ": %s is not in the " + "same imsm set\n", dev); + return 0; + } else if (super->orom && dl->index < 0 && mpb->num_raid_devs) { + /* If a volume is present then the current creation attempt + * cannot incorporate new spares because the orom may not + * understand this configuration (all member disks must be + * members of each array in the container). 
+ */ + fprintf(stderr, Name ": %s is a spare and a volume" + " is already defined for this container\n", dev); + fprintf(stderr, Name ": The option-rom requires all member" + " disks to be a member of all volumes\n"); + return 0; + } + + /* retrieve the largest free space block */ + e = get_extents(super, dl); + maxsize = 0; + i = 0; + if (e) { + do { + unsigned long long esize; + + esize = e[i].start - pos; + if (esize >= maxsize) + maxsize = esize; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + dl->e = e; + dl->extent_cnt = i; + } else { + if (verbose) + fprintf(stderr, Name ": unable to determine free space for: %s\n", + dev); + return 0; + } + if (maxsize < size) { + if (verbose) + fprintf(stderr, Name ": %s not enough space (%llu < %llu)\n", + dev, maxsize, size); + return 0; + } + + /* count total number of extents for merge */ + i = 0; + for (dl = super->disks; dl; dl = dl->next) + if (dl->e) + i += dl->extent_cnt; + + maxsize = merge_extents(super, i); + if (maxsize < size || maxsize == 0) { + if (verbose) + fprintf(stderr, Name ": not enough space after merge (%llu < %llu)\n", + maxsize, size); + return 0; + } + + *freesize = maxsize; + + return 1; +} + +static int reserve_space(struct supertype *st, int raiddisks, + unsigned long long size, int chunk, + unsigned long long *freesize) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct dl *dl; + int i; + int extent_cnt; + struct extent *e; + unsigned long long maxsize; + unsigned long long minsize; + int cnt; + int used; + + /* find the largest common start free region of the possible disks */ + used = 0; + extent_cnt = 0; + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) { + dl->raiddisk = -1; + + if (dl->index >= 0) + used++; + + /* don't activate new spares if we are orom constrained + * and there is already a volume active in the container + */ + if (super->orom && dl->index < 0 && mpb->num_raid_devs) + continue; + + e = get_extents(super, 
dl); + if (!e) + continue; + for (i = 1; e[i-1].size; i++) + ; + dl->e = e; + dl->extent_cnt = i; + extent_cnt += i; + cnt++; + } + + maxsize = merge_extents(super, extent_cnt); + minsize = size; + if (size == 0) + minsize = chunk; + + if (cnt < raiddisks || + (super->orom && used && used != raiddisks) || + maxsize < minsize || + maxsize == 0) { + fprintf(stderr, Name ": not enough devices with space to create array.\n"); + return 0; /* No enough free spaces large enough */ + } + + if (size == 0) { + size = maxsize; + if (chunk) { + size /= chunk; + size *= chunk; + } + } + + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) + if (dl->e) + dl->raiddisk = cnt++; + + *freesize = size; + + return 1; +} + +static int validate_geometry_imsm(struct supertype *st, int level, int layout, + int raiddisks, int chunk, unsigned long long size, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd, cfd; + struct mdinfo *sra; + int is_member = 0; + + /* if given unused devices create a container + * if given given devices in a container create a member volume + */ + if (level == LEVEL_CONTAINER) { + /* Must be a fresh device to add to a container */ + return validate_geometry_imsm_container(st, level, layout, + raiddisks, chunk, size, + dev, freesize, + verbose); + } + + if (!dev) { + if (st->sb && freesize) { + /* we are being asked to automatically layout a + * new volume based on the current contents of + * the container. If the the parameters can be + * satisfied reserve_space will record the disks, + * start offset, and size of the volume to be + * created. add_to_super and getinfo_super + * detect when autolayout is in progress. 
+ */ + if (!validate_geometry_imsm_orom(st->sb, level, layout, + raiddisks, chunk, + verbose)) + return 0; + return reserve_space(st, raiddisks, size, chunk, freesize); + } + return 1; + } + if (st->sb) { + /* creating in a given container */ + return validate_geometry_imsm_volume(st, level, layout, + raiddisks, chunk, size, + dev, freesize, verbose); + } + + /* This device needs to be a device in an 'imsm' container */ + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd >= 0) { + if (verbose) + fprintf(stderr, + Name ": Cannot create this array on device %s\n", + dev); + close(fd); + return 0; + } + if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) { + if (verbose) + fprintf(stderr, Name ": Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + /* Well, it is in use by someone, maybe an 'imsm' container. */ + cfd = open_container(fd); + close(fd); + if (cfd < 0) { + if (verbose) + fprintf(stderr, Name ": Cannot use %s: It is busy\n", + dev); + return 0; + } + sra = sysfs_read(cfd, 0, GET_VERSION); + if (sra && sra->array.major_version == -1 && + strcmp(sra->text_version, "imsm") == 0) + is_member = 1; + sysfs_free(sra); + if (is_member) { + /* This is a member of a imsm container. 
Load the container + * and try to create a volume + */ + struct intel_super *super; + + if (load_super_imsm_all(st, cfd, (void **) &super, NULL, 1) == 0) { + st->sb = super; + st->container_dev = fd2devnum(cfd); + close(cfd); + return validate_geometry_imsm_volume(st, level, layout, + raiddisks, chunk, + size, dev, + freesize, verbose); + } + } + + if (verbose) + fprintf(stderr, Name ": failed container membership check\n"); + + close(cfd); + return 0; +} + +static int default_chunk_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + + if (!super->orom) + return 0; + + return imsm_orom_default_chunk(super->orom); +} + +static void handle_missing(struct intel_super *super, struct imsm_dev *dev); + +static int kill_subarray_imsm(struct supertype *st) +{ + /* remove the subarray currently referenced by ->current_vol */ + __u8 i; + struct intel_dev **dp; + struct intel_super *super = st->sb; + __u8 current_vol = super->current_vol; + struct imsm_super *mpb = super->anchor; + + if (super->current_vol < 0) + return 2; + super->current_vol = -1; /* invalidate subarray cursor */ + + /* block deletions that would change the uuid of active subarrays + * + * FIXME when immutable ids are available, but note that we'll + * also need to fixup the invalidated/active subarray indexes in + * mdstat + */ + for (i = 0; i < mpb->num_raid_devs; i++) { + char subarray[4]; + + if (i < current_vol) + continue; + sprintf(subarray, "%u", i); + if (is_subarray_active(subarray, st->devname)) { + fprintf(stderr, + Name ": deleting subarray-%d would change the UUID of active subarray-%d, aborting\n", + current_vol, i); + + return 2; + } + } + + if (st->update_tail) { + struct imsm_update_kill_array *u = malloc(sizeof(*u)); + + if (!u) + return 2; + u->type = update_kill_array; + u->dev_idx = current_vol; + append_metadata_update(st, u, sizeof(*u)); + + return 0; + } + + for (dp = &super->devlist; *dp;) + if ((*dp)->index == current_vol) { + *dp = (*dp)->next; + } else { + 
handle_missing(super, (*dp)->dev); + if ((*dp)->index > current_vol) + (*dp)->index--; + dp = &(*dp)->next; + } + + /* no more raid devices, all active components are now spares, + * but of course failed are still failed + */ + if (--mpb->num_raid_devs == 0) { + struct dl *d; + + for (d = super->disks; d; d = d->next) + if (d->index > -2) { + d->index = -1; + d->disk.status = SPARE_DISK; + } + } + + super->updates_pending++; + + return 0; +} + +static int update_subarray_imsm(struct supertype *st, char *update, mddev_ident_t ident) +{ + /* update the subarray currently referenced by ->current_vol */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + + if (super->current_vol < 0) + return 2; + + if (strcmp(update, "name") == 0) { + char *name = ident->name; + + if (is_subarray_active(st->subarray, st->devname)) { + fprintf(stderr, + Name ": Unable to update name of active subarray\n"); + return 2; + } + + if (!check_name(super, name, 0)) + return 2; + + if (st->update_tail) { + struct imsm_update_rename_array *u = malloc(sizeof(*u)); + + if (!u) + return 2; + u->type = update_rename_array; + u->dev_idx = super->current_vol; + snprintf((char *) u->name, MAX_RAID_SERIAL_LEN, "%s", name); + append_metadata_update(st, u, sizeof(*u)); + } else { + struct imsm_dev *dev; + int i; + + dev = get_imsm_dev(super, super->current_vol); + snprintf((char *) dev->volume, MAX_RAID_SERIAL_LEN, "%s", name); + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + handle_missing(super, dev); + } + super->updates_pending++; + } + } else + return 2; + + return 0; +} +#endif /* MDASSEMBLE */ + +static int is_rebuilding(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) != MIGR_REBUILD) + return 0; + + migr_map = get_imsm_map(dev, 1); + + if (migr_map->map_state == IMSM_T_STATE_DEGRADED) + return 1; + else + return 0; +} + +static void update_recovery_start(struct 
imsm_dev *dev, struct mdinfo *array) +{ + struct mdinfo *rebuild = NULL; + struct mdinfo *d; + __u32 units; + + if (!is_rebuilding(dev)) + return; + + /* Find the rebuild target, but punt on the dual rebuild case */ + for (d = array->devs; d; d = d->next) + if (d->recovery_start == 0) { + if (rebuild) + return; + rebuild = d; + } + + if (!rebuild) { + /* (?) none of the disks are marked with + * IMSM_ORD_REBUILD, so assume they are missing and the + * disk_ord_tbl was not correctly updated + */ + dprintf("%s: failed to locate out-of-sync disk\n", __func__); + return; + } + + units = __le32_to_cpu(dev->vol.curr_migr_unit); + rebuild->recovery_start = units * blocks_per_migr_unit(dev); +} + + +static struct mdinfo *container_content_imsm(struct supertype *st) +{ + /* Given a container loaded by load_super_imsm_all, + * extract information about all the arrays into + * an mdinfo tree. + * + * For each imsm_dev create an mdinfo, fill it in, + * then look for matching devices in super->disks + * and create appropriate device mdinfo. + */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct mdinfo *rest = NULL; + int i; + + /* do not assemble arrays that might have bad blocks */ + if (imsm_bbm_log_size(super->anchor)) { + fprintf(stderr, Name ": BBM log found in metadata. 
" + "Cannot activate array(s).\n"); + return NULL; + } + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, 0); + struct mdinfo *this; + int slot; + + /* do not publish arrays that are in the middle of an + * unsupported migration + */ + if (dev->vol.migr_state && + (migr_type(dev) == MIGR_GEN_MIGR || + migr_type(dev) == MIGR_STATE_CHANGE)) { + fprintf(stderr, Name ": cannot assemble volume '%.16s':" + " unsupported migration in progress\n", + dev->volume); + continue; + } + + this = malloc(sizeof(*this)); + if (!this) { + fprintf(stderr, Name ": failed to allocate %zu bytes\n", + sizeof(*this)); + break; + } + memset(this, 0, sizeof(*this)); + this->next = rest; + + super->current_vol = i; + getinfo_super_imsm_volume(st, this); + for (slot = 0 ; slot < map->num_members; slot++) { + unsigned long long recovery_start; + struct mdinfo *info_d; + struct dl *d; + int idx; + int skip; + __u32 ord; + + skip = 0; + idx = get_imsm_disk_idx(dev, slot); + ord = get_imsm_ord_tbl_ent(dev, slot); + for (d = super->disks; d ; d = d->next) + if (d->index == idx) + break; + + recovery_start = MaxSector; + if (d == NULL) + skip = 1; + if (d && is_failed(&d->disk)) + skip = 1; + if (ord & IMSM_ORD_REBUILD) + recovery_start = 0; + + /* + * if we skip some disks the array will be assmebled degraded; + * reset resync start to avoid a dirty-degraded + * situation when performing the intial sync + * + * FIXME handle dirty degraded + */ + if ((skip || recovery_start == 0) && !dev->vol.dirty) + this->resync_start = MaxSector; + if (skip) + continue; + + info_d = calloc(1, sizeof(*info_d)); + if (!info_d) { + fprintf(stderr, Name ": failed to allocate disk" + " for volume %.16s\n", dev->volume); + info_d = this->devs; + while (info_d) { + struct mdinfo *d = info_d->next; + + free(info_d); + info_d = d; + } + free(this); + this = rest; + break; + } + info_d->next = this->devs; + this->devs = info_d; + + 
info_d->disk.number = d->index; + info_d->disk.major = d->major; + info_d->disk.minor = d->minor; + info_d->disk.raid_disk = slot; + info_d->recovery_start = recovery_start; + + if (info_d->recovery_start == MaxSector) + this->array.working_disks++; + + info_d->events = __le32_to_cpu(mpb->generation_num); + info_d->data_offset = __le32_to_cpu(map->pba_of_lba0); + info_d->component_size = __le32_to_cpu(map->blocks_per_member); + } + /* now that the disk list is up-to-date fixup recovery_start */ + update_recovery_start(dev, this); + rest = this; + } + + return rest; +} + + +static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, int failed) +{ + struct imsm_map *map = get_imsm_map(dev, 0); + + if (!failed) + return map->map_state == IMSM_T_STATE_UNINITIALIZED ? + IMSM_T_STATE_UNINITIALIZED : IMSM_T_STATE_NORMAL; + + switch (get_imsm_raid_level(map)) { + case 0: + return IMSM_T_STATE_FAILED; + break; + case 1: + if (failed < map->num_members) + return IMSM_T_STATE_DEGRADED; + else + return IMSM_T_STATE_FAILED; + break; + case 10: + { + /** + * check to see if any mirrors have failed, otherwise we + * are degraded. Even numbered slots are mirrored on + * slot+1 + */ + int i; + /* gcc -Os complains that this is unused */ + int insync = insync; + + for (i = 0; i < map->num_members; i++) { + __u32 ord = get_imsm_ord_tbl_ent(dev, i); + int idx = ord_to_idx(ord); + struct imsm_disk *disk; + + /* reset the potential in-sync count on even-numbered + * slots. 
num_copies is always 2 for imsm raid10
+			 */
+			if ((i & 1) == 0)
+				insync = 2;
+
+			disk = get_imsm_disk(super, idx);
+			if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD)
+				insync--;
+
+			/* no in-sync disks left in this mirror the
+			 * array has failed
+			 */
+			if (insync == 0)
+				return IMSM_T_STATE_FAILED;
+		}
+
+		return IMSM_T_STATE_DEGRADED;
+	}
+	case 5:
+		if (failed < 2)
+			return IMSM_T_STATE_DEGRADED;
+		else
+			return IMSM_T_STATE_FAILED;
+		break;
+	default:
+		break;
+	}
+
+	return map->map_state;
+}
+
+static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev)
+{
+	int i;
+	int failed = 0;
+	struct imsm_disk *disk;
+	struct imsm_map *map = get_imsm_map(dev, 0);
+	struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state);
+	__u32 ord;
+	int idx;
+
+	/* at the beginning of migration we set IMSM_ORD_REBUILD on
+	 * disks that are being rebuilt.  New failures are recorded to
+	 * map[0].  So we look through all the disks we started with and
+	 * see if any failures are still present, or if any new ones
+	 * have arrived
+	 *
+	 * FIXME add support for online capacity expansion and
+	 * raid-level-migration
+	 */
+	for (i = 0; i < prev->num_members; i++) {
+		ord = __le32_to_cpu(prev->disk_ord_tbl[i]);
+		ord |= __le32_to_cpu(map->disk_ord_tbl[i]);
+		idx = ord_to_idx(ord);
+
+		disk = get_imsm_disk(super, idx);
+		if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD)
+			failed++;
+	}
+
+	return failed;
+}
+
+#ifndef MDASSEMBLE
+/* Bind a newly opened active array to its volume in the container.
+ *
+ * 'inst' is the subarray index as a decimal string.  On success the
+ * index is stored in a->info.container_member and 0 is returned; an
+ * index outside 0..num_raid_devs-1 is reported on stderr and rejected
+ * with -ENODEV.
+ */
+static int imsm_open_new(struct supertype *c, struct active_array *a,
+			 char *inst)
+{
+	struct intel_super *super = c->sb;
+	struct imsm_super *mpb = super->anchor;
+	/* parse once so the range check, the diagnostic and the stored
+	 * value are guaranteed to agree
+	 */
+	int idx = atoi(inst);
+
+	/* an upper-bound test alone lets a negative index through, which
+	 * would then be used as container_member
+	 */
+	if (idx < 0 || idx >= mpb->num_raid_devs) {
+		fprintf(stderr, "%s: subarry index %d, out of range\n",
+			__func__, idx);
+		return -ENODEV;
+	}
+
+	dprintf("imsm: open_new %s\n", inst);
+	a->info.container_member = idx;
+	return 0;
+}
+
+static int is_resyncing(struct imsm_dev *dev)
+{
+	struct imsm_map *migr_map;
+
+	if (!dev->vol.migr_state)
+		return 0;
+ + if (migr_type(dev) == MIGR_INIT || + migr_type(dev) == MIGR_REPAIR) + return 1; + + migr_map = get_imsm_map(dev, 1); + + if (migr_map->map_state == IMSM_T_STATE_NORMAL) + return 1; + else + return 0; +} + +/* return true if we recorded new information */ +static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx) +{ + __u32 ord; + int slot; + struct imsm_map *map; + + /* new failures are always set in map[0] */ + map = get_imsm_map(dev, 0); + + slot = get_imsm_disk_slot(map, idx); + if (slot < 0) + return 0; + + ord = __le32_to_cpu(map->disk_ord_tbl[slot]); + if (is_failed(disk) && (ord & IMSM_ORD_REBUILD)) + return 0; + + disk->status |= FAILED_DISK; + disk->status &= ~CONFIGURED_DISK; + set_imsm_ord_tbl_ent(map, slot, idx | IMSM_ORD_REBUILD); + if (map->failed_disk_num == 0xff) + map->failed_disk_num = slot; + return 1; +} + +static void mark_missing(struct imsm_dev *dev, struct imsm_disk *disk, int idx) +{ + mark_failure(dev, disk, idx); + + if (disk->scsi_id == __cpu_to_le32(~(__u32)0)) + return; + + disk->scsi_id = __cpu_to_le32(~(__u32)0); + memmove(&disk->serial[0], &disk->serial[1], MAX_RAID_SERIAL_LEN - 1); +} + +static void handle_missing(struct intel_super *super, struct imsm_dev *dev) +{ + __u8 map_state; + struct dl *dl; + int failed; + + if (!super->missing) + return; + failed = imsm_count_failed(super, dev); + map_state = imsm_check_degraded(super, dev, failed); + + dprintf("imsm: mark missing\n"); + end_migration(dev, map_state); + for (dl = super->missing; dl; dl = dl->next) + mark_missing(dev, &dl->disk, dl->index); + super->updates_pending++; +} + +/* Handle dirty -> clean transititions and resync. Degraded and rebuild + * states are handled in imsm_set_disk() with one exception, when a + * resync is stopped due to a new failure this routine will set the + * 'degraded' state for the array. 
+ */
+static int imsm_set_array_state(struct active_array *a, int consistent)
+{
+	int inst = a->info.container_member;
+	struct intel_super *super = a->container->sb;
+	struct imsm_dev *dev = get_imsm_dev(super, inst);
+	struct imsm_map *map = get_imsm_map(dev, 0);
+	int failed = imsm_count_failed(super, dev);
+	__u8 map_state = imsm_check_degraded(super, dev, failed);
+	__u32 blocks_per_unit;
+
+	/* before we activate this array handle any missing disks */
+	if (consistent == 2)
+		handle_missing(super, dev);
+
+	if (consistent == 2 &&
+	    (!is_resync_complete(&a->info) ||
+	     map_state != IMSM_T_STATE_NORMAL ||
+	     dev->vol.migr_state))
+		consistent = 0;
+
+	if (is_resync_complete(&a->info)) {
+		/* complete initialization / resync,
+		 * recovery and interrupted recovery is completed in
+		 * ->set_disk
+		 */
+		if (is_resyncing(dev)) {
+			dprintf("imsm: mark resync done\n");
+			end_migration(dev, map_state);
+			super->updates_pending++;
+			a->last_checkpoint = 0;
+		}
+	} else if (!is_resyncing(dev) && !failed) {
+		/* mark the start of the init process if nothing is failed */
+		dprintf("imsm: mark resync start\n");
+		if (map->map_state == IMSM_T_STATE_UNINITIALIZED)
+			migrate(dev, IMSM_T_STATE_NORMAL, MIGR_INIT);
+		else
+			migrate(dev, IMSM_T_STATE_NORMAL, MIGR_REPAIR);
+		super->updates_pending++;
+	}
+
+	/* check if we can update curr_migr_unit from resync_start, recovery_start */
+	blocks_per_unit = blocks_per_migr_unit(dev);
+	if (blocks_per_unit) {
+		__u32 units32;
+		__u64 units;
+
+		units = a->last_checkpoint / blocks_per_unit;
+		units32 = units;
+
+		/* check that we did not overflow 32-bits, and that
+		 * curr_migr_unit needs updating
+		 */
+		if (units32 == units &&
+		    __le32_to_cpu(dev->vol.curr_migr_unit) != units32) {
+			dprintf("imsm: mark checkpoint (%u)\n", units32);
+			dev->vol.curr_migr_unit = __cpu_to_le32(units32);
+			super->updates_pending++;
+		}
+	}
+
+	/* mark dirty / clean */
+	if (dev->vol.dirty != !consistent) {
+		dprintf("imsm: mark '%s'\n", consistent ? "clean" : "dirty");
+		if (consistent)
+			dev->vol.dirty = 0;
+		else
+			dev->vol.dirty = 1;
+		super->updates_pending++;
+	}
+	return consistent;
+}
+
+/* Record a state change for the disk in slot 'n' of active array 'a'.
+ *
+ * 'state' is a mask of DS_* flags: DS_FAULTY records the disk as failed
+ * in the metadata, DS_INSYNC on a slot that is flagged IMSM_ORD_REBUILD
+ * (while the volume is rebuilding) clears that flag in the migration
+ * map.  The map state is then re-evaluated so that a finished rebuild,
+ * a newly degraded array or a failed array is written back.  A negative
+ * 'n' is a no-op; a slot outside 0..num_members-1 is reported on stderr
+ * and ignored.
+ */
+static void imsm_set_disk(struct active_array *a, int n, int state)
+{
+	int inst = a->info.container_member;
+	struct intel_super *super = a->container->sb;
+	struct imsm_dev *dev = get_imsm_dev(super, inst);
+	struct imsm_map *map = get_imsm_map(dev, 0);
+	struct imsm_disk *disk;
+	int failed;
+	__u32 ord;
+	__u8 map_state;
+
+	if (n < 0)
+		return;
+
+	/* valid slots are 0..num_members-1 (as the message itself says), so
+	 * n == num_members must be rejected too; carrying on would read and
+	 * possibly write disk_ord_tbl past its last entry
+	 */
+	if (n >= map->num_members) {
+		fprintf(stderr, "imsm: set_disk %d out of range 0..%d\n",
+			n, map->num_members - 1);
+		return;
+	}
+
+	dprintf("imsm: set_disk %d:%x\n", n, state);
+
+	ord = get_imsm_ord_tbl_ent(dev, n);
+	disk = get_imsm_disk(super, ord_to_idx(ord));
+
+	/* check for new failures; get_imsm_disk() can come back empty (other
+	 * callers test for it) and mark_failure() dereferences the disk
+	 */
+	if ((state & DS_FAULTY) && disk) {
+		if (mark_failure(dev, disk, ord_to_idx(ord)))
+			super->updates_pending++;
+	}
+
+	/* check if in_sync */
+	if (state & DS_INSYNC && ord & IMSM_ORD_REBUILD && is_rebuilding(dev)) {
+		struct imsm_map *migr_map = get_imsm_map(dev, 1);
+
+		set_imsm_ord_tbl_ent(migr_map, n, ord_to_idx(ord));
+		super->updates_pending++;
+	}
+
+	failed = imsm_count_failed(super, dev);
+	map_state = imsm_check_degraded(super, dev, failed);
+
+	/* check if recovery complete, newly degraded, or failed */
+	if (map_state == IMSM_T_STATE_NORMAL && is_rebuilding(dev)) {
+		end_migration(dev, map_state);
+		map = get_imsm_map(dev, 0);
+		map->failed_disk_num = ~0;
+		super->updates_pending++;
+		a->last_checkpoint = 0;
+	} else if (map_state == IMSM_T_STATE_DEGRADED &&
+		   map->map_state != map_state &&
+		   !dev->vol.migr_state) {
+		dprintf("imsm: mark degraded\n");
+		map->map_state = map_state;
+		super->updates_pending++;
+		a->last_checkpoint = 0;
+	} else if (map_state == IMSM_T_STATE_FAILED &&
+		   map->map_state != map_state) {
+		dprintf("imsm: mark failed\n");
+		end_migration(dev, map_state);
+		super->updates_pending++;
+		a->last_checkpoint = 0;
+	}
+}
+
+static int store_imsm_mpb(int fd, struct
imsm_super *mpb)
+{
+	/* Write the metadata block 'mpb' to the end of the device open on
+	 * 'fd': the 512-byte anchor goes to the second-to-last sector and,
+	 * when the mpb is larger than one sector, the remainder goes to the
+	 * sectors immediately before it.  Returns 0 on success, 1 on any
+	 * seek/write failure or when the device is too small.
+	 */
+	void *buf = mpb;
+	__u32 mpb_size = __le32_to_cpu(mpb->mpb_size);
+	unsigned long long dsize = 0;
+	unsigned long long sectors = 0;
+
+	get_dev_size(fd, NULL, &dsize);
+
+	if (mpb_size > 512)
+		/* -1 to account for anchor */
+		sectors = mpb_sectors(mpb) - 1;
+
+	/* dsize is pre-set to 0 so that a size query which did not fill it
+	 * in, or a device too small for the metadata, is refused here
+	 * instead of wrapping the unsigned seek offsets below
+	 */
+	if (dsize < 512 * (2 + sectors))
+		return 1;
+
+	if (mpb_size > 512) {
+		/* write the extended mpb to the sectors preceding the anchor */
+		if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0)
+			return 1;
+
+		if ((unsigned long long)write(fd, buf + 512, 512 * sectors)
+		    != 512 * sectors)
+			return 1;
+	}
+
+	/* first block is stored on second to last sector of the disk */
+	if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0)
+		return 1;
+
+	if (write(fd, buf, 512) != 512)
+		return 1;
+
+	return 0;
+}
+
+static void imsm_sync_metadata(struct supertype *container)
+{
+	struct intel_super *super = container->sb;
+
+	if (!super->updates_pending)
+		return;
+
+	write_super_imsm(super, 0);
+
+	super->updates_pending = 0;
+}
+
+static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_array *a)
+{
+	struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
+	int i = get_imsm_disk_idx(dev, idx);
+	struct dl *dl;
+
+	for (dl = super->disks; dl; dl = dl->next)
+		if (dl->index == i)
+			break;
+
+	if (dl && is_failed(&dl->disk))
+		dl = NULL;
+
+	if (dl)
+		dprintf("%s: found %x:%x\n", __func__, dl->major, dl->minor);
+
+	return dl;
+}
+
+static struct dl *imsm_add_spare(struct intel_super *super, int slot,
+				 struct active_array *a, int activate_new)
+{
+	struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
+	int idx = get_imsm_disk_idx(dev, slot);
+	struct imsm_super *mpb = super->anchor;
+	struct imsm_map *map;
+	unsigned long long pos;
+	struct mdinfo *d;
+	struct extent *ex;
+	int i, j;
+	int found;
+	__u32 array_start = 0;
+	__u32 array_end = 0;
+	struct dl *dl;
+
+	for (dl = super->disks; dl; dl = dl->next) {
+		/* If in this array, skip */
+		for (d = a->info.devs ; d ; d = d->next)
+			if (d->state_fd >= 0 &&
d->disk.major == dl->major && + d->disk.minor == dl->minor) { + dprintf("%x:%x already in array\n", dl->major, dl->minor); + break; + } + if (d) + continue; + + /* skip in use or failed drives */ + if (is_failed(&dl->disk) || idx == dl->index || + dl->index == -2) { + dprintf("%x:%x status (failed: %d index: %d)\n", + dl->major, dl->minor, is_failed(&dl->disk), idx); + continue; + } + + /* skip pure spares when we are looking for partially + * assimilated drives + */ + if (dl->index == -1 && !activate_new) + continue; + + /* Does this unused device have the requisite free space? + * It needs to be able to cover all member volumes + */ + ex = get_extents(super, dl); + if (!ex) { + dprintf("cannot get extents\n"); + continue; + } + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, 0); + + /* check if this disk is already a member of + * this array + */ + if (get_imsm_disk_slot(map, dl->index) >= 0) + continue; + + found = 0; + j = 0; + pos = 0; + array_start = __le32_to_cpu(map->pba_of_lba0); + array_end = array_start + + __le32_to_cpu(map->blocks_per_member) - 1; + + do { + /* check that we can start at pba_of_lba0 with + * blocks_per_member of space + */ + if (array_start >= pos && array_end < ex[j].start) { + found = 1; + break; + } + pos = ex[j].start + ex[j].size; + j++; + } while (ex[j-1].size); + + if (!found) + break; + } + + free(ex); + if (i < mpb->num_raid_devs) { + dprintf("%x:%x does not have %u to %u available\n", + dl->major, dl->minor, array_start, array_end); + /* No room */ + continue; + } + return dl; + } + + return dl; +} + +static struct mdinfo *imsm_activate_spare(struct active_array *a, + struct metadata_update **updates) +{ + /** + * Find a device with unused free space and use it to replace a + * failed/vacant region in an array. We replace failed regions one a + * array at a time. 
The result is that a new spare disk will be added + * to the first failed array and after the monitor has finished + * propagating failures the remainder will be consumed. + * + * FIXME add a capability for mdmon to request spares from another + * container. + */ + + struct intel_super *super = a->container->sb; + int inst = a->info.container_member; + struct imsm_dev *dev = get_imsm_dev(super, inst); + struct imsm_map *map = get_imsm_map(dev, 0); + int failed = a->info.array.raid_disks; + struct mdinfo *rv = NULL; + struct mdinfo *d; + struct mdinfo *di; + struct metadata_update *mu; + struct dl *dl; + struct imsm_update_activate_spare *u; + int num_spares = 0; + int i; + + for (d = a->info.devs ; d ; d = d->next) { + if ((d->curr_state & DS_FAULTY) && + d->state_fd >= 0) + /* wait for Removal to happen */ + return NULL; + if (d->state_fd >= 0) + failed--; + } + + dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n", + inst, failed, a->info.array.raid_disks, a->info.array.level); + if (imsm_check_degraded(super, dev, failed) != IMSM_T_STATE_DEGRADED) + return NULL; + + /* For each slot, if it is not working, find a spare */ + for (i = 0; i < a->info.array.raid_disks; i++) { + for (d = a->info.devs ; d ; d = d->next) + if (d->disk.raid_disk == i) + break; + dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0); + if (d && (d->state_fd >= 0)) + continue; + + /* + * OK, this device needs recovery. Try to re-add the + * previous occupant of this slot, if this fails see if + * we can continue the assimilation of a spare that was + * partially assimilated, finally try to activate a new + * spare. 
+ */ + dl = imsm_readd(super, i, a); + if (!dl) + dl = imsm_add_spare(super, i, a, 0); + if (!dl) + dl = imsm_add_spare(super, i, a, 1); + if (!dl) + continue; + + /* found a usable disk with enough space */ + di = malloc(sizeof(*di)); + if (!di) + continue; + memset(di, 0, sizeof(*di)); + + /* dl->index will be -1 in the case we are activating a + * pristine spare. imsm_process_update() will create a + * new index in this case. Once a disk is found to be + * failed in all member arrays it is kicked from the + * metadata + */ + di->disk.number = dl->index; + + /* (ab)use di->devs to store a pointer to the device + * we chose + */ + di->devs = (struct mdinfo *) dl; + + di->disk.raid_disk = i; + di->disk.major = dl->major; + di->disk.minor = dl->minor; + di->disk.state = 0; + di->recovery_start = 0; + di->data_offset = __le32_to_cpu(map->pba_of_lba0); + di->component_size = a->info.component_size; + di->container_member = inst; + super->random = random32(); + di->next = rv; + rv = di; + num_spares++; + dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor, + i, di->data_offset); + + break; + } + + if (!rv) + /* No spares found */ + return rv; + /* Now 'rv' has a list of devices to return. 
+ * Create a metadata_update record to update the + * disk_ord_tbl for the array + */ + mu = malloc(sizeof(*mu)); + if (mu) { + mu->buf = malloc(sizeof(struct imsm_update_activate_spare) * num_spares); + if (mu->buf == NULL) { + free(mu); + mu = NULL; + } + } + if (!mu) { + while (rv) { + struct mdinfo *n = rv->next; + + free(rv); + rv = n; + } + return NULL; + } + + mu->space = NULL; + mu->len = sizeof(struct imsm_update_activate_spare) * num_spares; + mu->next = *updates; + u = (struct imsm_update_activate_spare *) mu->buf; + + for (di = rv ; di ; di = di->next) { + u->type = update_activate_spare; + u->dl = (struct dl *) di->devs; + di->devs = NULL; + u->slot = di->disk.raid_disk; + u->array = inst; + u->next = u + 1; + u++; + } + (u-1)->next = NULL; + *updates = mu; + + return rv; +} + +static int disks_overlap(struct intel_super *super, int idx, struct imsm_update_create_array *u) +{ + struct imsm_dev *dev = get_imsm_dev(super, idx); + struct imsm_map *map = get_imsm_map(dev, 0); + struct imsm_map *new_map = get_imsm_map(&u->dev, 0); + struct disk_info *inf = get_disk_info(u); + struct imsm_disk *disk; + int i; + int j; + + for (i = 0; i < map->num_members; i++) { + disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i)); + for (j = 0; j < new_map->num_members; j++) + if (serialcmp(disk->serial, inf[j].serial) == 0) + return 1; + } + + return 0; +} + +static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index); + +static void imsm_process_update(struct supertype *st, + struct metadata_update *update) +{ + /** + * crack open the metadata_update envelope to find the update record + * update can be one of: + * update_activate_spare - a spare device has replaced a failed + * device in an array, update the disk_ord_tbl. 
If this disk is + * present in all member arrays then also clear the SPARE_DISK + * flag + */ + struct intel_super *super = st->sb; + struct imsm_super *mpb; + enum imsm_update_type type = *(enum imsm_update_type *) update->buf; + + /* update requires a larger buf but the allocation failed */ + if (super->next_len && !super->next_buf) { + super->next_len = 0; + return; + } + + if (super->next_buf) { + memcpy(super->next_buf, super->buf, super->len); + free(super->buf); + super->len = super->next_len; + super->buf = super->next_buf; + + super->next_len = 0; + super->next_buf = NULL; + } + + mpb = super->anchor; + + switch (type) { + case update_activate_spare: { + struct imsm_update_activate_spare *u = (void *) update->buf; + struct imsm_dev *dev = get_imsm_dev(super, u->array); + struct imsm_map *map = get_imsm_map(dev, 0); + struct imsm_map *migr_map; + struct active_array *a; + struct imsm_disk *disk; + __u8 to_state; + struct dl *dl; + unsigned int found; + int failed; + int victim = get_imsm_disk_idx(dev, u->slot); + int i; + + for (dl = super->disks; dl; dl = dl->next) + if (dl == u->dl) + break; + + if (!dl) { + fprintf(stderr, "error: imsm_activate_spare passed " + "an unknown disk (index: %d)\n", + u->dl->index); + return; + } + + super->updates_pending++; + + /* count failures (excluding rebuilds and the victim) + * to determine map[0] state + */ + failed = 0; + for (i = 0; i < map->num_members; i++) { + if (i == u->slot) + continue; + disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i)); + if (!disk || is_failed(disk)) + failed++; + } + + /* adding a pristine spare, assign a new index */ + if (dl->index < 0) { + dl->index = super->anchor->num_disks; + super->anchor->num_disks++; + } + disk = &dl->disk; + disk->status |= CONFIGURED_DISK; + disk->status &= ~SPARE_DISK; + + /* mark rebuild */ + to_state = imsm_check_degraded(super, dev, failed); + map->map_state = IMSM_T_STATE_DEGRADED; + migrate(dev, to_state, MIGR_REBUILD); + migr_map = get_imsm_map(dev, 
1); + set_imsm_ord_tbl_ent(map, u->slot, dl->index); + set_imsm_ord_tbl_ent(migr_map, u->slot, dl->index | IMSM_ORD_REBUILD); + + /* update the family_num to mark a new container + * generation, being careful to record the existing + * family_num in orig_family_num to clean up after + * earlier mdadm versions that neglected to set it. + */ + if (mpb->orig_family_num == 0) + mpb->orig_family_num = mpb->family_num; + mpb->family_num += super->random; + + /* count arrays using the victim in the metadata */ + found = 0; + for (a = st->arrays; a ; a = a->next) { + dev = get_imsm_dev(super, a->info.container_member); + map = get_imsm_map(dev, 0); + + if (get_imsm_disk_slot(map, victim) >= 0) + found++; + } + + /* delete the victim if it is no longer being + * utilized anywhere + */ + if (!found) { + struct dl **dlp; + + /* We know that 'manager' isn't touching anything, + * so it is safe to delete + */ + for (dlp = &super->disks; *dlp; dlp = &(*dlp)->next) + if ((*dlp)->index == victim) + break; + + /* victim may be on the missing list */ + if (!*dlp) + for (dlp = &super->missing; *dlp; dlp = &(*dlp)->next) + if ((*dlp)->index == victim) + break; + imsm_delete(super, dlp, victim); + } + break; + } + case update_create_array: { + /* someone wants to create a new array, we need to be aware of + * a few races/collisions: + * 1/ 'Create' called by two separate instances of mdadm + * 2/ 'Create' versus 'activate_spare': mdadm has chosen + * devices that have since been assimilated via + * activate_spare. + * In the event this update can not be carried out mdadm will + * (FIX ME) notice that its update did not take hold. 
+ */ + struct imsm_update_create_array *u = (void *) update->buf; + struct intel_dev *dv; + struct imsm_dev *dev; + struct imsm_map *map, *new_map; + unsigned long long start, end; + unsigned long long new_start, new_end; + int i; + struct disk_info *inf; + struct dl *dl; + + /* handle racing creates: first come first serve */ + if (u->dev_idx < mpb->num_raid_devs) { + dprintf("%s: subarray %d already defined\n", + __func__, u->dev_idx); + goto create_error; + } + + /* check update is next in sequence */ + if (u->dev_idx != mpb->num_raid_devs) { + dprintf("%s: can not create array %d expected index %d\n", + __func__, u->dev_idx, mpb->num_raid_devs); + goto create_error; + } + + new_map = get_imsm_map(&u->dev, 0); + new_start = __le32_to_cpu(new_map->pba_of_lba0); + new_end = new_start + __le32_to_cpu(new_map->blocks_per_member); + inf = get_disk_info(u); + + /* handle activate_spare versus create race: + * check to make sure that overlapping arrays do not include + * overalpping disks + */ + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, 0); + start = __le32_to_cpu(map->pba_of_lba0); + end = start + __le32_to_cpu(map->blocks_per_member); + if ((new_start >= start && new_start <= end) || + (start >= new_start && start <= new_end)) + /* overlap */; + else + continue; + + if (disks_overlap(super, i, u)) { + dprintf("%s: arrays overlap\n", __func__); + goto create_error; + } + } + + /* check that prepare update was successful */ + if (!update->space) { + dprintf("%s: prepare update failed\n", __func__); + goto create_error; + } + + /* check that all disks are still active before committing + * changes. FIXME: could we instead handle this by creating a + * degraded array? That's probably not what the user expects, + * so better to drop this update on the floor. 
+ */ + for (i = 0; i < new_map->num_members; i++) { + dl = serial_to_dl(inf[i].serial, super); + if (!dl) { + dprintf("%s: disk disappeared\n", __func__); + goto create_error; + } + } + + super->updates_pending++; + + /* convert spares to members and fixup ord_tbl */ + for (i = 0; i < new_map->num_members; i++) { + dl = serial_to_dl(inf[i].serial, super); + if (dl->index == -1) { + dl->index = mpb->num_disks; + mpb->num_disks++; + dl->disk.status |= CONFIGURED_DISK; + dl->disk.status &= ~SPARE_DISK; + } + set_imsm_ord_tbl_ent(new_map, i, dl->index); + } + + dv = update->space; + dev = dv->dev; + update->space = NULL; + imsm_copy_dev(dev, &u->dev); + dv->index = u->dev_idx; + dv->next = super->devlist; + super->devlist = dv; + mpb->num_raid_devs++; + + imsm_update_version_info(super); + break; + create_error: + /* mdmon knows how to release update->space, but not + * ((struct intel_dev *) update->space)->dev + */ + if (update->space) { + dv = update->space; + free(dv->dev); + } + break; + } + case update_kill_array: { + struct imsm_update_kill_array *u = (void *) update->buf; + int victim = u->dev_idx; + struct active_array *a; + struct intel_dev **dp; + struct imsm_dev *dev; + + /* sanity check that we are not affecting the uuid of + * active arrays, or deleting an active array + * + * FIXME when immutable ids are available, but note that + * we'll also need to fixup the invalidated/active + * subarray indexes in mdstat + */ + for (a = st->arrays; a; a = a->next) + if (a->info.container_member >= victim) + break; + /* by definition if mdmon is running at least one array + * is active in the container, so checking + * mpb->num_raid_devs is just extra paranoia + */ + dev = get_imsm_dev(super, victim); + if (a || !dev || mpb->num_raid_devs == 1) { + dprintf("failed to delete subarray-%d\n", victim); + break; + } + + for (dp = &super->devlist; *dp;) + if ((*dp)->index == (unsigned)super->current_vol) { + *dp = (*dp)->next; + } else { + if ((*dp)->index > 
(unsigned)victim) + (*dp)->index--; + dp = &(*dp)->next; + } + mpb->num_raid_devs--; + super->updates_pending++; + break; + } + case update_rename_array: { + struct imsm_update_rename_array *u = (void *) update->buf; + char name[MAX_RAID_SERIAL_LEN+1]; + int target = u->dev_idx; + struct active_array *a; + struct imsm_dev *dev; + + /* sanity check that we are not affecting the uuid of + * an active array + */ + snprintf(name, MAX_RAID_SERIAL_LEN, "%s", (char *) u->name); + name[MAX_RAID_SERIAL_LEN] = '\0'; + for (a = st->arrays; a; a = a->next) + if (a->info.container_member == target) + break; + dev = get_imsm_dev(super, u->dev_idx); + if (a || !dev || !check_name(super, name, 1)) { + dprintf("failed to rename subarray-%d\n", target); + break; + } + + snprintf((char *) dev->volume, MAX_RAID_SERIAL_LEN, "%s", name); + super->updates_pending++; + break; + } + case update_add_disk: + + /* we may be able to repair some arrays if disks are + * being added */ + if (super->add) { + struct active_array *a; + + super->updates_pending++; + for (a = st->arrays; a; a = a->next) + a->check_degraded = 1; + } + /* add some spares to the metadata */ + while (super->add) { + struct dl *al; + + al = super->add; + super->add = al->next; + al->next = super->disks; + super->disks = al; + dprintf("%s: added %x:%x\n", + __func__, al->major, al->minor); + } + + break; + } +} + +static void imsm_prepare_update(struct supertype *st, + struct metadata_update *update) +{ + /** + * Allocate space to hold new disk entries, raid-device entries or a new + * mpb if necessary. The manager synchronously waits for updates to + * complete in the monitor, so new mpb buffers allocated here can be + * integrated by the monitor thread without worrying about live pointers + * in the manager thread. 
+ */ + enum imsm_update_type type = *(enum imsm_update_type *) update->buf; + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + size_t buf_len; + size_t len = 0; + + switch (type) { + case update_create_array: { + struct imsm_update_create_array *u = (void *) update->buf; + struct intel_dev *dv; + struct imsm_dev *dev = &u->dev; + struct imsm_map *map = get_imsm_map(dev, 0); + struct dl *dl; + struct disk_info *inf; + int i; + int activate = 0; + + inf = get_disk_info(u); + len = sizeof_imsm_dev(dev, 1); + /* allocate a new super->devlist entry */ + dv = malloc(sizeof(*dv)); + if (dv) { + dv->dev = malloc(len); + if (dv->dev) + update->space = dv; + else { + free(dv); + update->space = NULL; + } + } + + /* count how many spares will be converted to members */ + for (i = 0; i < map->num_members; i++) { + dl = serial_to_dl(inf[i].serial, super); + if (!dl) { + /* hmm maybe it failed?, nothing we can do about + * it here + */ + continue; + } + if (count_memberships(dl, super) == 0) + activate++; + } + len += activate * sizeof(struct imsm_disk); + break; + default: + break; + } + } + + /* check if we need a larger metadata buffer */ + if (super->next_buf) + buf_len = super->next_len; + else + buf_len = super->len; + + if (__le32_to_cpu(mpb->mpb_size) + len > buf_len) { + /* ok we need a larger buf than what is currently allocated + * if this allocation fails process_update will notice that + * ->next_len is set and ->next_buf is NULL + */ + buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + len, 512); + if (super->next_buf) + free(super->next_buf); + + super->next_len = buf_len; + if (posix_memalign(&super->next_buf, 512, buf_len) == 0) + memset(super->next_buf, 0, buf_len); + else + super->next_buf = NULL; + } +} + +/* must be called while manager is quiesced */ +static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index) +{ + struct imsm_super *mpb = super->anchor; + struct dl *iter; + struct imsm_dev *dev; + 
struct imsm_map *map; + int i, j, num_members; + __u32 ord; + + dprintf("%s: deleting device[%d] from imsm_super\n", + __func__, index); + + /* shift all indexes down one */ + for (iter = super->disks; iter; iter = iter->next) + if (iter->index > (int)index) + iter->index--; + for (iter = super->missing; iter; iter = iter->next) + if (iter->index > (int)index) + iter->index--; + + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, 0); + num_members = map->num_members; + for (j = 0; j < num_members; j++) { + /* update ord entries being careful not to propagate + * ord-flags to the first map + */ + ord = get_imsm_ord_tbl_ent(dev, j); + + if (ord_to_idx(ord) <= index) + continue; + + map = get_imsm_map(dev, 0); + set_imsm_ord_tbl_ent(map, j, ord_to_idx(ord - 1)); + map = get_imsm_map(dev, 1); + if (map) + set_imsm_ord_tbl_ent(map, j, ord - 1); + } + } + + mpb->num_disks--; + super->updates_pending++; + if (*dlp) { + struct dl *dl = *dlp; + + *dlp = (*dlp)->next; + __free_imsm_disk(dl); + } +} +#endif /* MDASSEMBLE */ + +struct superswitch super_imsm = { +#ifndef MDASSEMBLE + .examine_super = examine_super_imsm, + .brief_examine_super = brief_examine_super_imsm, + .brief_examine_subarrays = brief_examine_subarrays_imsm, + .export_examine_super = export_examine_super_imsm, + .detail_super = detail_super_imsm, + .brief_detail_super = brief_detail_super_imsm, + .write_init_super = write_init_super_imsm, + .validate_geometry = validate_geometry_imsm, + .default_chunk = default_chunk_imsm, + .add_to_super = add_to_super_imsm, + .detail_platform = detail_platform_imsm, + .kill_subarray = kill_subarray_imsm, + .update_subarray = update_subarray_imsm, +#endif + .match_home = match_home_imsm, + .uuid_from_super= uuid_from_super_imsm, + .getinfo_super = getinfo_super_imsm, + .update_super = update_super_imsm, + + .avail_size = avail_size_imsm, + + .compare_super = compare_super_imsm, + + .load_super = load_super_imsm, + 
.init_super = init_super_imsm, + .store_super = store_super_imsm, + .free_super = free_super_imsm, + .match_metadata_desc = match_metadata_desc_imsm, + .container_content = container_content_imsm, + .default_layout = imsm_level_to_layout, + + .external = 1, + .name = "imsm", + +#ifndef MDASSEMBLE +/* for mdmon */ + .open_new = imsm_open_new, + .set_array_state= imsm_set_array_state, + .set_disk = imsm_set_disk, + .sync_metadata = imsm_sync_metadata, + .activate_spare = imsm_activate_spare, + .process_update = imsm_process_update, + .prepare_update = imsm_prepare_update, +#endif /* MDASSEMBLE */ +}; diff -Nru mdadm-2.6.7.1/sysfs.c mdadm-3.1.4/sysfs.c --- mdadm-2.6.7.1/sysfs.c 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/sysfs.c 2010-08-26 05:24:16.000000000 +0300 @@ -2,7 +2,7 @@ * sysfs - extract md related information from sysfs. Part of: * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2006 Neil Brown + * Copyright (C) 2006-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -25,6 +25,7 @@ #include "mdadm.h" #include +#include int load_sys(char *path, char *buf) { @@ -34,10 +35,10 @@ return -1; n = read(fd, buf, 1024); close(fd); - if (n <=0 || n >= 1024) + if (n <0 || n >= 1024) return -1; buf[n] = 0; - if (buf[n-1] == '\n') + if (n && buf[n-1] == '\n') buf[n-1] = 0; return 0; } @@ -56,45 +57,68 @@ } } -struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) +int sysfs_open(int devnum, char *devname, char *attr) { - /* Longest possible name in sysfs, mounted at /sys, is - * /sys/block/md_dXXX/md/dev-XXXXX/block/dev - * /sys/block/md_dXXX/md/metadata_version - * which is about 41 characters. 
50 should do for now - */ char fname[50]; - char buf[1024]; + int fd; + char *mdname = devnum2devname(devnum); + + if (!mdname) + return -1; + + sprintf(fname, "/sys/block/%s/md/", mdname); + if (devname) { + strcat(fname, devname); + strcat(fname, "/"); + } + strcat(fname, attr); + fd = open(fname, O_RDWR); + if (fd < 0 && errno == EACCES) + fd = open(fname, O_RDONLY); + free(mdname); + return fd; +} + +void sysfs_init(struct mdinfo *mdi, int fd, int devnum) +{ + mdi->sys_name[0] = 0; + if (fd >= 0) { + mdu_version_t vers; + if (ioctl(fd, RAID_VERSION, &vers) != 0) + return; + devnum = fd2devnum(fd); + } + if (devnum == NoMdDev) + return; + if (devnum >= 0) + sprintf(mdi->sys_name, "md%d", devnum); + else + sprintf(mdi->sys_name, "md_d%d", + -1-devnum); +} + + +struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) +{ + char fname[PATH_MAX]; + char buf[PATH_MAX]; char *base; char *dbase; struct mdinfo *sra; struct mdinfo *dev; - DIR *dir; + DIR *dir = NULL; struct dirent *de; sra = malloc(sizeof(*sra)); if (sra == NULL) return sra; - sra->next = NULL; - - if (fd >= 0) { - struct stat stb; - mdu_version_t vers; - if (fstat(fd, &stb)) return NULL; - if (ioctl(fd, RAID_VERSION, &vers) != 0) - return NULL; - if (major(stb.st_rdev)==9) - sprintf(sra->sys_name, "md%d", (int)minor(stb.st_rdev)); - else - sprintf(sra->sys_name, "md_d%d", - (int)minor(stb.st_rdev)>>MdpMinorShift); - } else { - if (devnum >= 0) - sprintf(sra->sys_name, "md%d", devnum); - else - sprintf(sra->sys_name, "md_d%d", - -1-devnum); + memset(sra, 0, sizeof(*sra)); + sysfs_init(sra, fd, devnum); + if (sra->sys_name[0] == 0) { + free(sra); + return NULL; } + sprintf(fname, "/sys/block/%s/md/", sra->sys_name); base = fname + strlen(fname); @@ -111,10 +135,12 @@ sra->array.major_version = -1; sra->array.minor_version = -2; strcpy(sra->text_version, buf+9); - } else + } else { sscanf(buf, "%d.%d", &sra->array.major_version, &sra->array.minor_version); + strcpy(sra->text_version, buf); + } } 
if (options & GET_LEVEL) { strcpy(base, "level"); @@ -128,6 +154,18 @@ goto abort; sra->array.layout = strtoul(buf, NULL, 0); } + if (options & GET_DISKS) { + strcpy(base, "raid_disks"); + if (load_sys(fname, buf)) + goto abort; + sra->array.raid_disks = strtoul(buf, NULL, 0); + } + if (options & GET_DEGRADED) { + strcpy(base, "degraded"); + if (load_sys(fname, buf)) + goto abort; + sra->array.failed_disks = strtoul(buf, NULL, 0); + } if (options & GET_COMPONENT) { strcpy(base, "component_size"); if (load_sys(fname, buf)) @@ -154,6 +192,35 @@ goto abort; sra->mismatch_cnt = strtoul(buf, NULL, 0); } + if (options & GET_SAFEMODE) { + int scale = 1; + int dot = 0; + unsigned i; + unsigned long msec; + size_t len; + + strcpy(base, "safe_mode_delay"); + if (load_sys(fname, buf)) + goto abort; + + /* remove a period, and count digits after it */ + len = strlen(buf); + for (i = 0; i < len; i++) { + if (dot) { + if (isdigit(buf[i])) { + buf[i-1] = buf[i]; + scale *= 10; + } + buf[i] = 0; + } else if (buf[i] == '.') { + dot=1; + buf[i] = 0; + } + } + msec = strtoul(buf, NULL, 10); + msec = (msec * 1000) / scale; + sra->safe_mode_delay = msec; + } if (! (options & GET_DEVS)) return sra; @@ -177,22 +244,55 @@ dev = malloc(sizeof(*dev)); if (!dev) goto abort; - dev->next = sra->devs; - sra->devs = dev; - strcpy(dev->sys_name, de->d_name); /* Always get slot, major, minor */ strcpy(dbase, "slot"); - if (load_sys(fname, buf)) - goto abort; + if (load_sys(fname, buf)) { + /* hmm... unable to read 'slot' maybe the device + * is going away? + */ + strcpy(dbase, "block"); + if (readlink(fname, buf, sizeof(buf)) < 0 && + errno != ENAMETOOLONG) { + /* ...yup device is gone */ + free(dev); + continue; + } else { + /* slot is unreadable but 'block' link + * still intact... 
something bad is happening + * so abort + */ + free(dev); + goto abort; + } + + } + strcpy(dev->sys_name, de->d_name); dev->disk.raid_disk = strtoul(buf, &ep, 10); if (*ep) dev->disk.raid_disk = -1; strcpy(dbase, "block/dev"); - if (load_sys(fname, buf)) - goto abort; + if (load_sys(fname, buf)) { + /* assume this is a stale reference to a hot + * removed device + */ + free(dev); + continue; + } sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor); + /* special case check for block devices that can go 'offline' */ + strcpy(dbase, "block/device/state"); + if (load_sys(fname, buf) == 0 && + strncmp(buf, "offline", 7) == 0) { + free(dev); + continue; + } + + /* finally add this disk to the array */ + dev->next = sra->devs; + sra->devs = dev; + if (options & GET_OFFSET) { strcpy(dbase, "offset"); if (load_sys(fname, buf)) @@ -203,7 +303,7 @@ strcpy(dbase, "size"); if (load_sys(fname, buf)) goto abort; - dev->component_size = strtoull(buf, NULL, 0); + dev->component_size = strtoull(buf, NULL, 0) * 2; } if (options & GET_STATE) { dev->disk.state = 0; @@ -224,13 +324,41 @@ dev->errors = strtoul(buf, NULL, 0); } } + closedir(dir); return sra; abort: + if (dir) + closedir(dir); sysfs_free(sra); return NULL; } +int sysfs_attr_match(const char *attr, const char *str) +{ + /* See if attr, read from a sysfs file, matches + * str. They must either be the same, or attr can + * have a trailing newline or comma + */ + while (*attr && *str && *attr == *str) { + attr++; + str++; + } + + if (*str || (*attr && *attr != ',' && *attr != '\n')) + return 0; + return 1; +} + +int sysfs_match_word(const char *word, char **list) +{ + int n; + for (n=0; list[n]; n++) + if (sysfs_attr_match(word, list[n])) + break; + return n; +} + unsigned long long get_component_size(int fd) { /* Find out the component size of the array. 
@@ -244,7 +372,7 @@ char fname[50]; int n; if (fstat(fd, &stb)) return 0; - if (major(stb.st_rdev) == 9) + if (major(stb.st_rdev) != (unsigned)get_mdp_major()) sprintf(fname, "/sys/block/md%d/md/component_size", (int)minor(stb.st_rdev)); else @@ -265,8 +393,9 @@ char *name, char *val) { char fname[50]; - int n; + unsigned int n; int fd; + sprintf(fname, "/sys/block/%s/md/%s/%s", sra->sys_name, dev?dev->sys_name:"", name); fd = open(fname, O_WRONLY); @@ -274,8 +403,11 @@ return -1; n = write(fd, val, strlen(val)); close(fd); - if (n != strlen(val)) + if (n != strlen(val)) { + dprintf(Name ": failed to write '%s' to '%s' (%s)\n", + val, fname, strerror(errno)); return -1; + } return 0; } @@ -287,21 +419,44 @@ return sysfs_set_str(sra, dev, name, valstr); } -int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, - char *name, unsigned long long *val) +int sysfs_uevent(struct mdinfo *sra, char *event) { char fname[50]; - char buf[50]; int n; int fd; - char *ep; + + sprintf(fname, "/sys/block/%s/uevent", + sra->sys_name); + fd = open(fname, O_WRONLY); + if (fd < 0) + return -1; + n = write(fd, event, strlen(event)); + close(fd); + return 0; +} + +int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev, + char *name) +{ + char fname[50]; + int fd; + sprintf(fname, "/sys/block/%s/md/%s/%s", sra->sys_name, dev?dev->sys_name:"", name); - fd = open(fname, O_RDONLY); + fd = open(fname, O_RDWR); if (fd < 0) - return -1; + fd = open(fname, O_RDONLY); + return fd; +} + +int sysfs_fd_get_ll(int fd, unsigned long long *val) +{ + char buf[50]; + int n; + char *ep; + + lseek(fd, 0, 0); n = read(fd, buf, sizeof(buf)); - close(fd); if (n <= 0) return -1; buf[n] = 0; @@ -310,3 +465,429 @@ return -1; return 0; } + +int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *val) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_ll(fd, val); + close(fd); + return n; +} + +int sysfs_fd_get_str(int 
fd, char *val, int size) +{ + int n; + + lseek(fd, 0, 0); + n = read(fd, val, size); + if (n <= 0) + return -1; + val[n] = 0; + return n; +} + +int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val, int size) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_str(fd, val, size); + close(fd); + return n; +} + +int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms) +{ + unsigned long sec; + unsigned long msec; + char delay[30]; + + sec = ms / 1000; + msec = ms % 1000; + + sprintf(delay, "%ld.%03ld\n", sec, msec); + /* this '\n' ^ needed for kernels older than 2.6.28 */ + return sysfs_set_str(sra, NULL, "safe_mode_delay", delay); +} + +int sysfs_set_array(struct mdinfo *info, int vers) +{ + int rv = 0; + char ver[100]; + + ver[0] = 0; + if (info->array.major_version == -1 && + info->array.minor_version == -2) { + strcat(strcpy(ver, "external:"), info->text_version); + + if ((vers % 100) < 2 || + sysfs_set_str(info, NULL, "metadata_version", + ver) < 0) { + fprintf(stderr, Name ": This kernel does not " + "support external metadata.\n"); + return 1; + } + } + if (info->array.level < 0) + return 0; /* FIXME */ + rv |= sysfs_set_str(info, NULL, "level", + map_num(pers, info->array.level)); + rv |= sysfs_set_num(info, NULL, "raid_disks", info->array.raid_disks); + rv |= sysfs_set_num(info, NULL, "chunk_size", info->array.chunk_size); + rv |= sysfs_set_num(info, NULL, "layout", info->array.layout); + rv |= sysfs_set_num(info, NULL, "component_size", info->component_size/2); + if (info->custom_array_size) { + int rc; + + rc = sysfs_set_num(info, NULL, "array_size", + info->custom_array_size/2); + if (rc && errno == ENOENT) { + fprintf(stderr, Name ": This kernel does not " + "have the md/array_size attribute, " + "the array may be larger than expected\n"); + rc = 0; + } + rv |= rc; + } + + if (info->array.level > 0) + rv |= sysfs_set_num(info, NULL, "resync_start", info->resync_start); 
+ return rv; +} + +int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume) +{ + char dv[PATH_MAX]; + char nm[PATH_MAX]; + char *dname; + int rv; + + sprintf(dv, "%d:%d", sd->disk.major, sd->disk.minor); + rv = sysfs_set_str(sra, NULL, "new_dev", dv); + if (rv) + return rv; + + memset(nm, 0, sizeof(nm)); + sprintf(dv, "/sys/dev/block/%d:%d", sd->disk.major, sd->disk.minor); + rv = readlink(dv, nm, sizeof(nm)); + if (rv <= 0) + return -1; + nm[rv] = '\0'; + dname = strrchr(nm, '/'); + if (dname) dname++; + strcpy(sd->sys_name, "dev-"); + strcpy(sd->sys_name+4, dname); + + /* test write to see if 'recovery_start' is available */ + if (resume && sd->recovery_start < MaxSector && + sysfs_set_num(sra, sd, "recovery_start", 0)) { + sysfs_set_str(sra, sd, "state", "remove"); + return -1; + } + + rv = sysfs_set_num(sra, sd, "offset", sd->data_offset); + rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2); + if (sra->array.level != LEVEL_CONTAINER) { + if (sd->recovery_start == MaxSector) + /* This can correctly fail if array isn't started, + * yet, so just ignore status for now. 
+ */ + sysfs_set_str(sra, sd, "state", "insync"); + rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk); + if (resume) + sysfs_set_num(sra, sd, "recovery_start", sd->recovery_start); + } + return rv; +} + +#if 0 +int sysfs_disk_to_sg(int fd) +{ + /* from an open block device, try find and open its corresponding + * scsi_generic interface + */ + struct stat st; + char path[256]; + char sg_path[256]; + char sg_major_minor[8]; + char *c; + DIR *dir; + struct dirent *de; + int major, minor, rv; + + if (fstat(fd, &st)) + return -1; + + snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device", + major(st.st_rdev), minor(st.st_rdev)); + + dir = opendir(path); + if (!dir) + return -1; + + de = readdir(dir); + while (de) { + if (strncmp("scsi_generic:", de->d_name, + strlen("scsi_generic:")) == 0) + break; + de = readdir(dir); + } + closedir(dir); + + if (!de) + return -1; + + snprintf(sg_path, sizeof(sg_path), "%s/%s/dev", path, de->d_name); + fd = open(sg_path, O_RDONLY); + if (fd < 0) + return fd; + + rv = read(fd, sg_major_minor, sizeof(sg_major_minor)); + close(fd); + if (rv < 0) + return -1; + else + sg_major_minor[rv - 1] = '\0'; + + c = strchr(sg_major_minor, ':'); + *c = '\0'; + c++; + major = strtol(sg_major_minor, NULL, 10); + minor = strtol(c, NULL, 10); + snprintf(path, sizeof(path), "/dev/.tmp.md.%d:%d:%d", + (int) getpid(), major, minor); + if (mknod(path, S_IFCHR|0600, makedev(major, minor))==0) { + fd = open(path, O_RDONLY); + unlink(path); + return fd; + } + + return -1; +} +#endif + +int sysfs_disk_to_scsi_id(int fd, __u32 *id) +{ + /* from an open block device, try to retrieve it scsi_id */ + struct stat st; + char path[256]; + char *c1, *c2; + DIR *dir; + struct dirent *de; + + if (fstat(fd, &st)) + return 1; + + snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device", + major(st.st_rdev), minor(st.st_rdev)); + + dir = opendir(path); + if (!dir) + return 1; + + de = readdir(dir); + while (de) { + if (strncmp("scsi_disk:", de->d_name, + 
strlen("scsi_disk:")) == 0) + break; + de = readdir(dir); + } + closedir(dir); + + if (!de) + return 1; + + c1 = strchr(de->d_name, ':'); + c1++; + c2 = strchr(c1, ':'); + *c2 = '\0'; + *id = strtol(c1, NULL, 10) << 24; /* host */ + c1 = c2 + 1; + c2 = strchr(c1, ':'); + *c2 = '\0'; + *id |= strtol(c1, NULL, 10) << 16; /* channel */ + c1 = c2 + 1; + c2 = strchr(c1, ':'); + *c2 = '\0'; + *id |= strtol(c1, NULL, 10) << 8; /* lun */ + c1 = c2 + 1; + *id |= strtol(c1, NULL, 10); /* id */ + + return 0; +} + + +int sysfs_unique_holder(int devnum, long rdev) +{ + /* Check that devnum is a holder of rdev, + * and is the only holder. + * we should be locked against races by + * an O_EXCL on devnum + */ + DIR *dir; + struct dirent *de; + char dirname[100]; + char l; + int found = 0; + sprintf(dirname, "/sys/dev/block/%d:%d/holders", + major(rdev), minor(rdev)); + dir = opendir(dirname); + errno = ENOENT; + if (!dir) + return 0; + l = strlen(dirname); + while ((de = readdir(dir)) != NULL) { + char buf[10]; + int n; + int mj, mn; + char c; + int fd; + + if (de->d_ino == 0) + continue; + if (de->d_name[0] == '.') + continue; + strcpy(dirname+l, "/"); + strcat(dirname+l, de->d_name); + strcat(dirname+l, "/dev"); + fd = open(dirname, O_RDONLY); + if (fd < 0) { + errno = ENOENT; + break; + } + n = read(fd, buf, sizeof(buf)-1); + close(fd); + buf[n] = 0; + if (sscanf(buf, "%d:%d%c", &mj, &mn, &c) != 3 || + c != '\n') { + errno = ENOENT; + break; + } + if (mj != MD_MAJOR) + mn = -1-(mn>>6); + + if (devnum != mn) { + errno = EEXIST; + break; + } + found = 1; + } + closedir(dir); + if (de) + return 0; + else + return found; +} + +#ifndef MDASSEMBLE + +static char *clean_states[] = { + "clear", "inactive", "readonly", "read-auto", "clean", NULL }; + +int WaitClean(char *dev, int sock, int verbose) +{ + int fd; + struct mdinfo *mdi; + int rv = 1; + int devnum; + + fd = open(dev, O_RDONLY); + if (fd < 0) { + if (verbose) + fprintf(stderr, Name ": Couldn't open %s: %s\n", dev, 
strerror(errno)); + return 1; + } + + devnum = fd2devnum(fd); + mdi = sysfs_read(fd, devnum, GET_VERSION|GET_LEVEL|GET_SAFEMODE); + if (!mdi) { + if (verbose) + fprintf(stderr, Name ": Failed to read sysfs attributes for " + "%s\n", dev); + close(fd); + return 0; + } + + switch(mdi->array.level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + /* safemode delay is irrelevant for these levels */ + rv = 0; + + } + + /* for internal metadata the kernel handles the final clean + * transition, containers can never be dirty + */ + if (!is_subarray(mdi->text_version)) + rv = 0; + + /* safemode disabled ? */ + if (mdi->safe_mode_delay == 0) + rv = 0; + + if (rv) { + int state_fd = sysfs_open(fd2devnum(fd), NULL, "array_state"); + char buf[20]; + fd_set fds; + struct timeval tm; + + /* minimize the safe_mode_delay and prepare to wait up to 5s + * for writes to quiesce + */ + sysfs_set_safemode(mdi, 1); + tm.tv_sec = 5; + tm.tv_usec = 0; + + FD_ZERO(&fds); + + /* wait for array_state to be clean */ + while (1) { + rv = read(state_fd, buf, sizeof(buf)); + if (rv < 0) + break; + if (sysfs_match_word(buf, clean_states) <= 4) + break; + FD_SET(state_fd, &fds); + rv = select(state_fd + 1, NULL, NULL, &fds, &tm); + if (rv < 0 && errno != EINTR) + break; + lseek(state_fd, 0, SEEK_SET); + } + if (rv < 0) + rv = 1; + else if (fping_monitor(sock) == 0 || + ping_monitor(mdi->text_version) == 0) { + /* we need to ping to close the window between array + * state transitioning to clean and the metadata being + * marked clean + */ + rv = 0; + } else + rv = 1; + if (rv && verbose) + fprintf(stderr, Name ": Error waiting for %s to be clean\n", + dev); + + /* restore the original safe_mode_delay */ + sysfs_set_safemode(mdi, mdi->safe_mode_delay); + close(state_fd); + } + + sysfs_free(mdi); + close(fd); + + return rv; +} +#endif /* MDASSEMBLE */ diff -Nru mdadm-2.6.7.1/test mdadm-3.1.4/test --- mdadm-2.6.7.1/test 2008-10-15 06:34:28.000000000 +0300 +++ mdadm-3.1.4/test 2010-08-26 
05:24:16.000000000 +0300 @@ -34,14 +34,24 @@ mdsize00=19840 # super1.0 round down to multiple of 2, subtract 8 mdsize1=19992 +mdsize1a=19988 +mdsize12=19988 +# super1.2 for linear: round to multiple of 2, subtract 4 +mdsize1_l=18976 +mdsize2_l=18976 # subtract another 4 for bitmaps mdsize1b=19988 mdsize11=19992 +mdsize11a=18944 mdsize12=19988 +# ddf needs bigger devices as 32Meg is reserved! +ddfsize=65536 + cleanup() { - $mdadm -Ss - for d in 0 1 2 3 4 5 6 7 + udevadm settle + $mdadm -Ssq + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 do losetup -d /dev/loop$d ; # rm -f $targetdir/mdtest$d done @@ -50,9 +60,11 @@ trap cleanup 0 1 2 3 15 devlist= -for d in 0 1 2 3 4 5 6 7 +for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 do - [ -f $targetdir/mdtest$d ] || dd if=/dev/zero of=$targetdir/mdtest$d count=$size bs=1K > /dev/null 2>&1 + sz=$size + if [ $d -gt 7 ]; then sz=$ddfsize ; fi + [ -f $targetdir/mdtest$d ] || dd if=/dev/zero of=$targetdir/mdtest$d count=$sz bs=1K > /dev/null 2>&1 [ -b /dev/loop$d ] || mknod /dev/loop$d b 7 $d if [ $d -eq 7 ] then @@ -63,10 +75,16 @@ eval dev$d=/dev/loop$d eval file$d=$targetdir/mdtest$d eval devlist=\"\$devlist \$dev$d\" + #" <-- add this quote to un-confuse vim syntax highlighting done path0=$dev6 path1=$dev7 +ulimit -c unlimited +[ -f /proc/mdstat ] || modprobe md_mod +echo 2000 > /proc/sys/dev/raid/speed_limit_max +echo 0 > /sys/module/md_mod/parameters/start_ro + if [ " $1" = " setup" ] then trap 0 ; exit 0 fi @@ -75,10 +93,15 @@ mdadm() { rm -f $targetdir/stderr case $* in + *-S* ) udevadm settle;; + esac + case $* in *-C* ) $mdadm 2> $targetdir/stderr --quiet "$@" --auto=yes;; * ) $mdadm 2> $targetdir/stderr --quiet "$@" esac + rv=$? 
cat >&2 $targetdir/stderr + return $rv } # check various things @@ -95,7 +118,7 @@ grep -s "active $1 " /proc/mdstat > /dev/null || { echo >&2 "ERROR active $1 not found" ; cat /proc/mdstat ; exit 1;} ;; - resync | recovery ) + resync | recovery | reshape) sleep 0.5 grep -s $1 /proc/mdstat > /dev/null || { echo >&2 ERROR no $1 happening; cat /proc/mdstat; exit 1; } @@ -103,14 +126,14 @@ nosync ) sleep 0.5 - if grep -s 're[synccovery]* =' > /dev/null /proc/mdstat ; then + if grep -s -E '(resync|recovery|reshape) =' > /dev/null /proc/mdstat ; then echo >&2 "ERROR resync or recovery is happening!"; cat /proc/mdstat ; exit 1; fi ;; wait ) sleep 0.1 - while grep 're[synccovery]* =' > /dev/null /proc/mdstat + while grep -E '(resync|recovery|reshape|check|repair) =' > /dev/null /proc/mdstat do sleep 2; done ;; @@ -144,14 +167,23 @@ # basic device test testdev() { + udevadm settle dev=$1 cnt=$2 dvsize=$3 chunk=$4 - mkfs -j $dev > /dev/null 2>&1 && fsck -fn $dev >&2 + if [ -z "$5" ]; then + mkfs -j $dev > /dev/null 2>&1 && fsck -fn $dev >&2 + fi dsize=$[dvsize/chunk] dsize=$[dsize*chunk] rasize=$[dsize*2*cnt] + # rasize is in sectors + if [ -n "$DEV_ROUND_K" ]; then + rasize=$[rasize/DEV_ROUND_K/2] + rasize=$[rasize*DEV_ROUND_K*2] + fi + if [ `/sbin/blockdev --getsize $dev` -eq 0 ]; then sleep 2 ; fi if [ $rasize -ne `/sbin/blockdev --getsize $dev` ] then echo "ERROR: size is wrong for $dev: $cnt * $dvsize (chunk=$chunk) = $rasize, not `/sbin/blockdev --getsize $dev`" @@ -164,21 +196,42 @@ fsck -fn $dev >&2 } +setup_environment() { + if [ -f $1 ]; then + . $environment + setup_env + fi +} +reset_environment() { + if [ -f $1 ]; then + reset_env + unset setup_env + unset reset_env + fi +} for script in tests/$prefix tests/$prefix*[^~] do if [ -f "$script" ] then rm -f $targetdir/stderr + # stop all arrays, just incase some script left an array active. 
+ mdadm -Ssq + mdadm --zero $devlist 2> /dev/null + mdadm --zero $devlist 2> /dev/null + environment="tests/env-`basename $script`" + setup_environment $environment # source script in a subshell, so it has access to our # namespace, but cannot change it. if ( set -ex ; . $script ) 2> $targetdir/log then echo "$script succeeded" else cat $targetdir/log ; cat $targetdir/stderr echo "$script failed" + reset_environment $environment exit 1 fi + reset_environment $environment fi done exit 0 diff -Nru mdadm-2.6.7.1/tests/00linear mdadm-3.1.4/tests/00linear --- mdadm-2.6.7.1/tests/00linear 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/00linear 2010-08-26 05:24:16.000000000 +0300 @@ -3,13 +3,19 @@ mdadm -CR $md0 -l linear -n3 $dev0 $dev1 $dev2 check linear -testdev $md0 3 $mdsize0 64 +testdev $md0 3 $mdsize2_l 1 mdadm -S $md0 -# now with version-1 superblock -mdadm -CR $md0 -e1 --level=linear -n4 $dev0 $dev1 $dev2 $dev3 +# now with version-0.90 superblock +mdadm -CR $md0 -e0.90 --level=linear -n4 $dev0 $dev1 $dev2 $dev3 check linear -testdev $md0 4 $mdsize1 64 +testdev $md0 4 $mdsize0 1 +mdadm -S $md0 + +# now with version-1.0 superblock +mdadm -CR $md0 -e1.0 --level=linear -n4 $dev0 $dev1 $dev2 $dev3 +check linear +testdev $md0 4 $mdsize1 1 mdadm -S $md0 # now with no superblock diff -Nru mdadm-2.6.7.1/tests/00multipath mdadm-3.1.4/tests/00multipath --- mdadm-2.6.7.1/tests/00multipath 2007-02-22 05:49:34.000000000 +0200 +++ mdadm-3.1.4/tests/00multipath 2010-08-26 05:24:16.000000000 +0300 @@ -4,21 +4,21 @@ mdadm -CR $md1 -l multipath -n2 $path0 $path1 -testdev $md1 1 $mdsize0 1 +testdev $md1 1 $mdsize12 1 mdadm $md1 -f $path0 rotest $md1 -testdev $md1 1 $mdsize0 1 +testdev $md1 1 $mdsize12 1 mdadm $md1 -r $path0 mdadm $md1 -a $path0 rotest $md1 -testdev $md1 1 $mdsize0 1 +testdev $md1 1 $mdsize12 1 mdadm $md1 -f $path1 mdadm $md1 -r $path1 rotest $md1 -testdev $md1 1 $mdsize0 1 +testdev $md1 1 $mdsize12 1 mdadm -S $md1 diff -Nru mdadm-2.6.7.1/tests/00raid0 
mdadm-3.1.4/tests/00raid0 --- mdadm-2.6.7.1/tests/00raid0 2008-07-11 10:32:13.000000000 +0300 +++ mdadm-3.1.4/tests/00raid0 2010-08-26 05:24:16.000000000 +0300 @@ -3,32 +3,32 @@ mdadm -CR $md0 -l raid0 -n3 $dev0 $dev1 $dev2 check raid0 -testdev $md0 3 $mdsize0 64 +testdev $md0 3 $mdsize2_l 512 mdadm -S $md0 -# now with version-1 superblock -mdadm -CR $md0 -e1 -l0 -n4 $dev0 $dev1 $dev2 $dev3 +# now with version-0.90 superblock +mdadm -CR $md0 -e0.90 -l0 -n4 $dev0 $dev1 $dev2 $dev3 check raid0 -testdev $md0 4 $mdsize1 64 +testdev $md0 4 $mdsize0 512 mdadm -S $md0 # now with no superblock mdadm -B $md0 -l0 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 check raid0 -testdev $md0 5 $size 64 +testdev $md0 5 $size 512 mdadm -S $md0 # now same again with different chunk size for chunk in 4 32 256 do - mdadm -CR $md0 -l raid0 --chunk $chunk -n3 $dev0 $dev1 $dev2 + mdadm -CR $md0 -e0.90 -l raid0 --chunk $chunk -n3 $dev0 $dev1 $dev2 check raid0 testdev $md0 3 $mdsize0 $chunk mdadm -S $md0 # now with version-1 superblock - mdadm -CR $md0 -e1 -l0 -c $chunk -n4 $dev0 $dev1 $dev2 $dev3 + mdadm -CR $md0 -e1.0 -l0 -c $chunk -n4 $dev0 $dev1 $dev2 $dev3 check raid0 testdev $md0 4 $mdsize1 $chunk mdadm -S $md0 diff -Nru mdadm-2.6.7.1/tests/00raid1 mdadm-3.1.4/tests/00raid1 --- mdadm-2.6.7.1/tests/00raid1 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/tests/00raid1 2010-08-26 05:24:16.000000000 +0300 @@ -6,14 +6,14 @@ mdadm -CR $md0 -l 1 -n2 $dev0 $dev1 check resync check raid1 -testdev $md0 1 $mdsize0 1 +testdev $md0 1 $mdsize1a 1 mdadm -S $md0 -# now with version-1 superblock, spare -mdadm -CR $md0 -e1 --level=raid1 -n3 -x2 $dev0 missing missing $dev1 $dev2 +# now with version-0.90 superblock, spare +mdadm -CR $md0 -e0.90 --level=raid1 -n3 -x2 $dev0 missing missing $dev1 $dev2 check recovery check raid1 -testdev $md0 1 $mdsize1b 1 +testdev $md0 1 $mdsize0 1 mdadm -S $md0 # now with no superblock diff -Nru mdadm-2.6.7.1/tests/00raid10 mdadm-3.1.4/tests/00raid10 --- 
mdadm-2.6.7.1/tests/00raid10 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/00raid10 2010-08-26 05:24:16.000000000 +0300 @@ -13,6 +13,6 @@ esac mdadm --create --run --level=raid10 --layout $lo --raid-disks 6 -x 1 $md0 $devs check resync ; check raid10 - testdev $md0 $m $mdsize0 $[64*cm] + testdev $md0 $m $mdsize1 $[512*cm] mdadm -S $md0 done diff -Nru mdadm-2.6.7.1/tests/00raid4 mdadm-3.1.4/tests/00raid4 --- mdadm-2.6.7.1/tests/00raid4 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/tests/00raid4 2010-08-26 05:24:16.000000000 +0300 @@ -3,13 +3,13 @@ mdadm -CfR $md0 -l 4 -n3 $dev0 $dev1 $dev2 check resync ; check raid[45] -testdev $md0 2 $mdsize0 64 +testdev $md0 2 $mdsize1 512 mdadm -S $md0 # now with version-1 superblock mdadm -CR $md0 -e1 --level=raid4 -n4 $dev0 $dev1 $dev2 $dev3 check recovery; check raid[45] -testdev $md0 3 $mdsize1 64 +testdev $md0 3 $mdsize1 512 mdadm -S $md0 diff -Nru mdadm-2.6.7.1/tests/00raid5 mdadm-3.1.4/tests/00raid5 --- mdadm-2.6.7.1/tests/00raid5 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/00raid5 2010-08-26 05:24:16.000000000 +0300 @@ -1,15 +1,15 @@ # create a simple raid5 set -mdadm -CfR $md0 -l 5 -n3 $dev0 $dev1 $dev2 +mdadm -CfR $md0 -e 0.90 -l 5 -n3 $dev0 $dev1 $dev2 check resync -testdev $md0 2 $mdsize0 64 +testdev $md0 2 $mdsize0 512 mdadm -S $md0 # now with version-1 superblock mdadm -CR $md0 -e1 --level=raid5 -n4 $dev0 $dev1 $dev2 $dev3 check recovery -testdev $md0 3 $mdsize1 64 +testdev $md0 3 $mdsize1 512 mdadm -S $md0 # now same again with explicit layout @@ -19,13 +19,13 @@ mdadm -CfR $md0 -l 5 -p $lo -n3 $dev0 $dev1 $dev2 check resync ; check raid5 - testdev $md0 2 $mdsize0 64 + testdev $md0 2 $mdsize1 512 mdadm -S $md0 # now with version-1 superblock mdadm -CR $md0 -e1 --level=raid5 --layout $lo -n4 $dev0 $dev1 $dev2 $dev3 check recovery ; check raid5 - testdev $md0 3 $mdsize1 64 + testdev $md0 3 $mdsize1 512 mdadm -S $md0 done diff -Nru mdadm-2.6.7.1/tests/00raid6 mdadm-3.1.4/tests/00raid6 
--- mdadm-2.6.7.1/tests/00raid6 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/00raid6 2010-08-26 05:24:16.000000000 +0300 @@ -1,15 +1,15 @@ # create a simple raid6 set -mdadm -CfR $md0 -l 6 -n4 $dev0 $dev1 $dev2 $dev3 +mdadm -CfR $md0 -e0.90 -l 6 -n4 $dev0 $dev1 $dev2 $dev3 check resync ; check raid6 -testdev $md0 2 $mdsize0 64 +testdev $md0 2 $mdsize0 512 mdadm -S $md0 # now with version-1 superblock mdadm -CR $md0 -e1 --level=raid6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 check resync ; check raid6 -testdev $md0 3 $mdsize1 64 +testdev $md0 3 $mdsize1 512 mdadm -S $md0 diff -Nru mdadm-2.6.7.1/tests/01r5integ mdadm-3.1.4/tests/01r5integ --- mdadm-2.6.7.1/tests/01r5integ 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/01r5integ 2010-03-22 08:08:43.000000000 +0200 @@ -0,0 +1,29 @@ + +# Check integrity of raid5 in degraded mode +# Create a 4 disk raid5, create a filesystem and +# sh1sum it with each device failed + +for layout in ls rs la ra +do + mdadm -CR $md0 -l5 --layout $layout -n4 $dev0 $dev1 $dev2 $dev3 + check wait + tar cf - /etc > $md0 + sum=`sha1sum $md0` + + for i in $dev0 $dev1 $dev2 $dev3 + do + mdadm $md0 -f $i + mdadm $md0 -r $i + blockdev --flushbufs $md0 + sum1=`sha1sum $md0` + if [ $sum != $sum1 ] + then + echo $sum does not matc $sum1 with $i missing + exit 1 + fi + mdadm $md0 -a $i + check wait + done + mdadm -S $md0 +done + diff -Nru mdadm-2.6.7.1/tests/01raid6integ mdadm-3.1.4/tests/01raid6integ --- mdadm-2.6.7.1/tests/01raid6integ 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/01raid6integ 2010-08-05 09:51:58.000000000 +0300 @@ -0,0 +1,53 @@ + +# Check integrity of raid6 in degraded modes +# Create a 5 disk raid6, dump some data to it, then +# sh1sum it with different pairs of devices failed + +layouts='ls rs la ra' +lv=`uname -r` +if expr $lv '>=' 2.6.30 > /dev/null +then + layouts="$layouts parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 
right-symmetric-6 parity-first-6" +fi + +for layout in $layouts +do + mdadm -CR $md0 -l6 --layout $layout -n5 $dev0 $dev1 $dev2 $dev3 $dev4 + check wait + tar cf - /etc > $md0 + sum=`sha1sum $md0` + + totest= + for second in $dev0 $dev1 $dev2 $dev3 $dev4 + do + mdadm $md0 -f $second + mdadm $md0 -r $second + blockdev --flushbufs $md0 + sum1=`sha1sum $md0` + if [ $sum != $sum1 ] + then + echo $sum does not matc $sum1 with $second missing + exit 1 + fi + for first in $totest + do + mdadm $md0 -f $first + mdadm $md0 -r $first + blockdev --flushbufs $md0 + sum1=`sha1sum $md0` + if [ $sum != $sum1 ] + then + echo $sum does not matc $sum1 with $first and $second missing + exit 1 + fi + mdadm $md0 -a $first + check wait + done + mdadm $md0 -a $second + check wait + totest="$totest $second" + done + mdadm -S $md0 +done + diff -Nru mdadm-2.6.7.1/tests/02lineargrow mdadm-3.1.4/tests/02lineargrow --- mdadm-2.6.7.1/tests/02lineargrow 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/tests/02lineargrow 2010-08-26 05:24:16.000000000 +0300 @@ -5,18 +5,19 @@ do case $e in 0.90 ) sz=$mdsize0 ;; - 1 ) sz=$mdsize1 ;; - 1.1 ) sz=$mdsize11 ;; - 1.2 ) sz=$mdsize12 ;; + 1 ) sz=$mdsize2_l ;; + 1.0 ) sz=$mdsize1 ;; + 1.1 ) sz=$mdsize1_l ;; + 1.2 ) sz=$mdsize2_l ;; esac mdadm -CRf $md0 --level linear -e $e --raid-disks=1 $dev1 - testdev $md0 1 $sz 64 + testdev $md0 1 $sz 1 mdadm --grow $md0 --add $dev2 - testdev $md0 2 $sz 64 + testdev $md0 2 $sz 1 mdadm --grow $md0 --add $dev3 - testdev $md0 3 $sz 64 + testdev $md0 3 $sz 1 mdadm -S $md0 done diff -Nru mdadm-2.6.7.1/tests/02r1grow mdadm-3.1.4/tests/02r1grow --- mdadm-2.6.7.1/tests/02r1grow 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/tests/02r1grow 2010-08-26 05:24:16.000000000 +0300 @@ -2,7 +2,7 @@ # create a small raid1 array, make it larger. 
Then make it smaller -mdadm -CR $md0 --level raid1 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +mdadm -CR $md0 -e 0.90 --level raid1 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 check wait check state UUU testdev $md0 1 $[size/2] 1 @@ -27,7 +27,7 @@ mdadm --grow $md0 --size max check resync check wait -testdev $md0 1 $mdsize11 1 +testdev $md0 1 $mdsize1_l 1 mdadm --grow $md0 --size $[size/2] check nosync diff -Nru mdadm-2.6.7.1/tests/02r5grow mdadm-3.1.4/tests/02r5grow --- mdadm-2.6.7.1/tests/02r5grow 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/02r5grow 2010-08-26 05:24:16.000000000 +0300 @@ -2,7 +2,7 @@ # create a small raid5 array, make it larger. Then make it smaller -mdadm -CR $md0 --level raid5 --chunk=32 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +mdadm -CR $md0 -e0.90 --level raid5 --chunk=32 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 check wait check state UUU testdev $md0 2 $[size/2] 32 @@ -10,11 +10,11 @@ mdadm --grow $md0 --size max check resync check wait -testdev $md0 2 $mdsize0 64 +testdev $md0 2 $mdsize0 32 mdadm --grow $md0 --size $[size/2] check nosync -testdev $md0 2 $[size/2] 64 +testdev $md0 2 $[size/2] 32 mdadm -S $md0 @@ -27,7 +27,7 @@ mdadm --grow $md0 --size max check resync check wait -testdev $md0 3 $[size-4] 128 +testdev $md0 3 $[mdsize1_l] 128 mdadm --grow $md0 --size $[size/2] check nosync diff -Nru mdadm-2.6.7.1/tests/02r6grow mdadm-3.1.4/tests/02r6grow --- mdadm-2.6.7.1/tests/02r6grow 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/02r6grow 2010-08-26 05:24:16.000000000 +0300 @@ -2,7 +2,7 @@ # create a small raid6 array, make it larger. 
Then make it smaller -mdadm -CR $md0 --level raid6 --chunk=32 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 +mdadm -CR $md0 -e 0.90 --level raid6 --chunk=32 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 check wait check state UUUU testdev $md0 2 $[size/2] 32 @@ -10,11 +10,11 @@ mdadm --grow $md0 --size max check resync check wait -testdev $md0 2 $mdsize0 64 +testdev $md0 2 $mdsize0 32 mdadm --grow $md0 --size $[size/2] check nosync -testdev $md0 2 $[size/2] 64 +testdev $md0 2 $[size/2] 32 mdadm -S $md0 @@ -27,7 +27,7 @@ mdadm --grow $md0 --size max check resync check wait -testdev $md0 2 $[size-4] 128 +testdev $md0 2 $[mdsize1_l] 128 mdadm --grow $md0 --size $[size/2] check nosync diff -Nru mdadm-2.6.7.1/tests/03r0assem mdadm-3.1.4/tests/03r0assem --- mdadm-2.6.7.1/tests/03r0assem 2008-10-13 04:22:02.000000000 +0300 +++ mdadm-3.1.4/tests/03r0assem 2010-08-26 05:24:16.000000000 +0300 @@ -6,7 +6,7 @@ mdadm -CR $md2 -l0 -n3 $dev0 $dev1 $dev2 check raid0 -tst="testdev $md2 3 $mdsize0 64" +tst="testdev $md2 3 $mdsize1_l 512" $tst uuid=`mdadm -Db $md2 | sed 's/.*UUID=//'` mdadm -S $md2 @@ -19,7 +19,7 @@ $tst mdadm -S $md2 -mdadm --assemble $md2 --super-minor=2 $devlist +mdadm --assemble $md2 --name=2 $devlist $tst mdadm -S $md2 @@ -35,7 +35,7 @@ { echo DEVICE $devlist - echo array $md2 super-minor=2 + echo array $md2 name=2 } > $conf mdadm -As -c $conf $md2 @@ -65,12 +65,12 @@ mdadm -S $md2 -### Now for version 1... +### Now for version 0... 
mdadm --zero-superblock $dev0 $dev1 $dev2 -mdadm -CR $md2 -l0 --metadata=1.0 -n3 $dev0 $dev1 $dev2 +mdadm -CR $md2 -l0 --metadata=0.90 -n3 $dev0 $dev1 $dev2 check raid0 -tst="testdev $md2 3 $mdsize1 64" +tst="testdev $md2 3 $mdsize0 512" $tst uuid=`mdadm -Db $md2 | sed 's/.*UUID=//'` @@ -84,10 +84,9 @@ $tst mdadm -S $md2 -# version 1 has no super-minor -# mdadm --assemble $md2 --super-minor=2 $devlist # -# $tst -# mdadm -S $md2 +mdadm --assemble $md2 --super-minor=2 $devlist # +$tst +mdadm -S $md2 conf=$targetdir/mdadm.conf { @@ -99,14 +98,14 @@ $tst mdadm -S $md2 -#{ -# echo DEVICE $devlist -# echo array $md2 super-minor=2 -#} > $conf -# -#mdadm -As -c $conf $md2 -#$tst -#mdadm -S $md2 +{ + echo DEVICE $devlist + echo array $md2 super-minor=2 +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 { @@ -129,3 +128,10 @@ mdadm --assemble --scan --config=$conf $md2 $tst mdadm -S $md2 + +# Now use incremental assembly. +mdadm -I --config=$conf $dev0 +mdadm -I --config=$conf $dev1 +mdadm -I --config=$conf $dev2 +$tst +mdadm -S $md2 diff -Nru mdadm-2.6.7.1/tests/03r5assem mdadm-3.1.4/tests/03r5assem --- mdadm-2.6.7.1/tests/03r5assem 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/03r5assem 2010-08-26 05:24:16.000000000 +0300 @@ -2,8 +2,8 @@ # create a raid5 array and assemble it in various ways, # including with missing devices. 
-mdadm -CR $md1 -l5 -n3 $dev0 $dev1 $dev2 -tst="check raid5 ;testdev $md1 2 $mdsize0 64 ; mdadm -S $md1" +mdadm -CR -e 0.90 $md1 -l5 -n3 $dev0 $dev1 $dev2 +tst="check raid5 ;testdev $md1 2 $mdsize0 512 ; mdadm -S $md1" uuid=`mdadm -Db $md1 | sed 's/.*UUID=//'` check wait eval $tst diff -Nru mdadm-2.6.7.1/tests/03r5assemV1 mdadm-3.1.4/tests/03r5assemV1 --- mdadm-2.6.7.1/tests/03r5assemV1 2008-10-13 04:22:02.000000000 +0300 +++ mdadm-3.1.4/tests/03r5assemV1 2010-08-26 05:24:16.000000000 +0300 @@ -2,7 +2,7 @@ # create a v-1 raid5 array and assemble in various ways mdadm -CR -e1 --name one $md1 -l5 -n3 -x2 $dev0 $dev1 $dev2 $dev3 $dev4 -tst="check raid5 ;testdev $md1 2 $mdsize1 64 ; mdadm -S $md1" +tst="check raid5 ;testdev $md1 2 $mdsize1 512 ; mdadm -S $md1" uuid=`mdadm -Db $md1 | sed 's/.*UUID=//'` check wait @@ -113,3 +113,10 @@ mdadm --assemble --scan --config=$conf $md1 check state U_U eval $tst + +# And now assemble with -I +mdadm -Ss +mdadm -I -c $conf $dev0 +mdadm -I -c $conf $dev1 +mdadm -I -c $conf $dev2 +eval $tst diff -Nru mdadm-2.6.7.1/tests/04r0update mdadm-3.1.4/tests/04r0update --- mdadm-2.6.7.1/tests/04r0update 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/04r0update 2010-08-26 05:24:16.000000000 +0300 @@ -1,7 +1,7 @@ # create a raid0, re-assemble with a different super-minor -mdadm -CR $md0 -l0 -n3 $dev0 $dev1 $dev2 -testdev $md0 3 $mdsize0 64 +mdadm -CR -e 0.90 $md0 -l0 -n3 $dev0 $dev1 $dev2 +testdev $md0 3 $mdsize0 512 minor1=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'` mdadm -S /dev/md0 diff -Nru mdadm-2.6.7.1/tests/05r1-bitmapfile mdadm-3.1.4/tests/05r1-bitmapfile --- mdadm-2.6.7.1/tests/05r1-bitmapfile 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/05r1-bitmapfile 2010-08-26 05:24:16.000000000 +0300 @@ -6,11 +6,11 @@ rm -f $bmf mdadm --create --run $md0 --level=1 -n2 --delay=1 --bitmap $bmf $dev1 $dev2 check wait -testdev $md0 1 $mdsize0 1 +testdev $md0 1 $mdsize1a 1 mdadm -S $md0 mdadm --assemble $md0 
--bitmap=$bmf $dev1 $dev2 -testdev $md0 1 $mdsize0 1 +testdev $md0 1 $mdsize1a 1 dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` sleep 4 dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` @@ -20,7 +20,7 @@ exit 1 fi mdadm $md0 -f $dev1 -testdev $md0 1 $mdsize0 1 +testdev $md0 1 $mdsize1a 1 sleep 4 dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` if [ $dirty3 -lt 400 ] diff -Nru mdadm-2.6.7.1/tests/05r1-grow-external mdadm-3.1.4/tests/05r1-grow-external --- mdadm-2.6.7.1/tests/05r1-grow-external 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/05r1-grow-external 2010-08-26 05:24:16.000000000 +0300 @@ -4,7 +4,7 @@ # mdadm --create --run $md0 -l 1 -n 2 $dev1 $dev2 check wait -testdev $md0 1 $mdsize0 1 +testdev $md0 1 $mdsize1a 1 bmf=$targetdir/bm rm -f $bmf @@ -14,7 +14,7 @@ sleep 4 dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` -testdev $md0 1 $mdsize0 1 +testdev $md0 1 $mdsize1a 1 dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` sleep 4 dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` diff -Nru mdadm-2.6.7.1/tests/05r1-grow-internal mdadm-3.1.4/tests/05r1-grow-internal --- mdadm-2.6.7.1/tests/05r1-grow-internal 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/05r1-grow-internal 2010-08-26 05:24:16.000000000 +0300 @@ -4,15 +4,15 @@ # mdadm --create --run $md0 -l 1 -n 2 $dev1 $dev2 check wait -testdev $md0 1 $mdsize0 1 +testdev $md0 1 $mdsize1a 1 #mdadm -E $dev1 -mdadm --grow $md0 --bitmap=internal --delay=1 || { mdadm -X $dev2 ; exit 1; } +mdadm --grow $md0 --bitmap=internal --bitmap-chunk=4 --delay=1 || { mdadm -X $dev2 ; exit 1; } dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` sleep 4 dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` -testdev $md0 1 $mdsize0 1 +testdev $md0 1 $mdsize1a 1 dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) 
dirty.*/\1/p'` sleep 4 dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` diff -Nru mdadm-2.6.7.1/tests/05r1-grow-internal-1 mdadm-3.1.4/tests/05r1-grow-internal-1 --- mdadm-2.6.7.1/tests/05r1-grow-internal-1 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/tests/05r1-grow-internal-1 2010-08-26 05:24:16.000000000 +0300 @@ -7,7 +7,7 @@ testdev $md0 1 $mdsize1b 1 #mdadm -E $dev1 -mdadm --grow $md0 --bitmap=internal --delay=1 +mdadm --grow $md0 --bitmap=internal --bitmap-chunk=4 --delay=1 dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` sleep 4 dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` diff -Nru mdadm-2.6.7.1/tests/05r1-internalbitmap mdadm-3.1.4/tests/05r1-internalbitmap --- mdadm-2.6.7.1/tests/05r1-internalbitmap 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/05r1-internalbitmap 2010-08-26 05:24:16.000000000 +0300 @@ -2,7 +2,7 @@ # # create a raid1 with an internal bitmap # -mdadm --create --run $md0 --level=1 -n2 --delay=1 --bitmap internal $dev1 $dev2 +mdadm --create -e0.90 --run $md0 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 check wait testdev $md0 1 $mdsize0 1 mdadm -S $md0 diff -Nru mdadm-2.6.7.1/tests/05r1-internalbitmap-v1a mdadm-3.1.4/tests/05r1-internalbitmap-v1a --- mdadm-2.6.7.1/tests/05r1-internalbitmap-v1a 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/tests/05r1-internalbitmap-v1a 2010-08-26 05:24:16.000000000 +0300 @@ -2,7 +2,7 @@ # # create a raid1 with an internal bitmap # -mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --delay=1 --bitmap internal $dev1 $dev2 +mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 check wait check bitmap testdev $md0 1 $mdsize1b 1 diff -Nru mdadm-2.6.7.1/tests/05r1-internalbitmap-v1b mdadm-3.1.4/tests/05r1-internalbitmap-v1b --- mdadm-2.6.7.1/tests/05r1-internalbitmap-v1b 2006-05-26 07:44:02.000000000 +0300 +++ 
mdadm-3.1.4/tests/05r1-internalbitmap-v1b 2010-08-26 05:24:16.000000000 +0300 @@ -2,7 +2,7 @@ # # create a raid1 with an internal bitmap # -mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --delay=1 --bitmap internal $dev1 $dev2 +mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 check wait check bitmap testdev $md0 1 $mdsize11 1 diff -Nru mdadm-2.6.7.1/tests/05r1-internalbitmap-v1c mdadm-3.1.4/tests/05r1-internalbitmap-v1c --- mdadm-2.6.7.1/tests/05r1-internalbitmap-v1c 2006-05-26 07:49:14.000000000 +0300 +++ mdadm-3.1.4/tests/05r1-internalbitmap-v1c 2010-08-26 05:24:16.000000000 +0300 @@ -2,7 +2,7 @@ # # create a raid1 with an internal bitmap # -mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --delay=1 --bitmap internal $dev1 $dev2 +mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk 4 $dev1 $dev2 check wait check bitmap testdev $md0 1 $mdsize12 1 diff -Nru mdadm-2.6.7.1/tests/05r1-n3-bitmapfile mdadm-3.1.4/tests/05r1-n3-bitmapfile --- mdadm-2.6.7.1/tests/05r1-n3-bitmapfile 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/05r1-n3-bitmapfile 2010-08-26 05:24:16.000000000 +0300 @@ -6,7 +6,7 @@ # bmf=$targetdir/bitmap rm -f $bmf -mdadm --create --run $md0 --level=1 -n3 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 +mdadm --create -e0.90 --run $md0 --level=1 -n3 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 check wait testdev $md0 1 $mdsize0 1 mdadm -S $md0 diff -Nru mdadm-2.6.7.1/tests/05r1-re-add mdadm-3.1.4/tests/05r1-re-add --- mdadm-2.6.7.1/tests/05r1-re-add 2006-10-23 04:09:23.000000000 +0300 +++ mdadm-3.1.4/tests/05r1-re-add 2010-08-26 05:24:16.000000000 +0300 @@ -5,10 +5,10 @@ # Then do some IO first. 
Resync should still be very fast # -mdadm -CR $md0 -l1 -n2 -binternal -d1 $dev1 $dev2 +mdadm -CR $md0 -l1 -n2 -binternal --bitmap-chunk=4 -d1 $dev1 $dev2 check resync check wait -testdev $md0 1 $mdsize0 1 +testdev $md0 1 $mdsize1a 1 sleep 4 mdadm $md0 -f $dev2 @@ -21,10 +21,10 @@ mdadm $md0 -f $dev2 sleep 1 mdadm $md0 -r $dev2 -testdev $md0 1 $mdsize0 1 +testdev $md0 1 $mdsize1a 1 mdadm $md0 -a $dev2 check wait -cmp --bytes=$[$mdsize0*1024] $dev1 $dev2 +cmp --ignore-initial=$[16*512] --bytes=$[$mdsize0*1024] $dev1 $dev2 mdadm $md0 -f $dev2; sleep 1 mdadm $md0 -r $dev2 @@ -32,5 +32,5 @@ mdadm $md0 -a $dev2 check recovery check wait -cmp --bytes=$[$mdsize0*1024] $dev1 $dev2 +cmp --ignore-initial=$[16*512] --bytes=$[$mdsize0*1024] $dev1 $dev2 mdadm -S $md0 diff -Nru mdadm-2.6.7.1/tests/05r5-bitmapfile mdadm-3.1.4/tests/05r5-bitmapfile --- mdadm-2.6.7.1/tests/05r5-bitmapfile 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/05r5-bitmapfile 2010-08-26 05:24:16.000000000 +0300 @@ -6,11 +6,11 @@ rm -f $bmf mdadm --create --run $md0 --level=5 -n3 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 check wait -testdev $md0 2 $mdsize0 1 +testdev $md0 2 $mdsize1 512 mdadm -S $md0 mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3 -testdev $md0 2 $mdsize0 1 +testdev $md0 2 $mdsize1 512 dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` sleep 4 dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` @@ -20,7 +20,7 @@ exit 1 fi mdadm $md0 -f $dev1 -testdev $md0 2 $mdsize0 1 +testdev $md0 2 $mdsize1 512 sleep 4 dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` if [ $dirty3 -lt 400 ] diff -Nru mdadm-2.6.7.1/tests/05r5-internalbitmap mdadm-3.1.4/tests/05r5-internalbitmap --- mdadm-2.6.7.1/tests/05r5-internalbitmap 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/05r5-internalbitmap 2010-08-26 05:24:16.000000000 +0300 @@ -2,13 +2,13 @@ # # create a raid1 with an internal bitmap # -mdadm --create --run $md0 
--level=5 -n3 --delay=1 --bitmap internal $dev1 $dev2 $dev3 +mdadm --create --run $md0 --level=5 -n3 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 $dev3 check wait -testdev $md0 2 $mdsize0 1 +testdev $md0 2 $mdsize1 512 mdadm -S $md0 mdadm --assemble $md0 $dev1 $dev2 $dev3 -testdev $md0 2 $mdsize0 1 +testdev $md0 2 $mdsize1 512 dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` sleep 4 dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` @@ -18,7 +18,7 @@ exit 1 fi mdadm $md0 -f $dev1 -testdev $md0 2 $mdsize0 1 +testdev $md0 2 $mdsize1 512 sleep 4 dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` if [ $dirty3 -lt 400 ] diff -Nru mdadm-2.6.7.1/tests/05r6-bitmapfile mdadm-3.1.4/tests/05r6-bitmapfile --- mdadm-2.6.7.1/tests/05r6-bitmapfile 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/05r6-bitmapfile 2010-08-26 05:24:16.000000000 +0300 @@ -6,11 +6,11 @@ rm -f $bmf mdadm --create --run $md0 --level=6 -n4 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 $dev4 check wait -testdev $md0 2 $mdsize0 1 +testdev $md0 2 $mdsize1 512 mdadm -S $md0 mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3 $dev4 -testdev $md0 2 $mdsize0 1 +testdev $md0 2 $mdsize1 512 dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` sleep 4 dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` @@ -20,7 +20,7 @@ exit 1 fi mdadm $md0 -f $dev3 -testdev $md0 2 $mdsize0 1 +testdev $md0 2 $mdsize1 512 sleep 4 dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` if [ $dirty3 -lt 400 ] diff -Nru mdadm-2.6.7.1/tests/06name mdadm-3.1.4/tests/06name --- mdadm-2.6.7.1/tests/06name 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/06name 2010-03-22 08:08:43.000000000 +0200 @@ -3,8 +3,8 @@ # create an array with a name mdadm -CR $md0 -l0 -n2 --metadata=1 --name="Fred" $dev0 $dev1 -mdadm -E $dev0 | grep 'Name : Fred$' > /dev/null || exit 1 -mdadm -D $md0 | 
grep 'Name : Fred$' > /dev/null || exit 1 +mdadm -E $dev0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1 +mdadm -D $md0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1 mdadm -S $md0 mdadm -A $md0 --name="Fred" $devlist diff -Nru mdadm-2.6.7.1/tests/06r5swap mdadm-3.1.4/tests/06r5swap --- mdadm-2.6.7.1/tests/06r5swap 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/06r5swap 2010-08-26 05:24:16.000000000 +0300 @@ -1,7 +1,7 @@ # make a raid5 array, byte swap the superblocks, then assemble... -mdadm -CR $md0 -l5 -n4 $dev0 $dev1 $dev2 $dev3 +mdadm -CR $md0 -e 0.90 -l5 -n4 $dev0 $dev1 $dev2 $dev3 sleep 4 mdadm -S $md0 diff -Nru mdadm-2.6.7.1/tests/06update-uuid mdadm-3.1.4/tests/06update-uuid --- mdadm-2.6.7.1/tests/06update-uuid 2006-12-14 08:31:28.000000000 +0200 +++ mdadm-3.1.4/tests/06update-uuid 2010-08-26 05:24:16.000000000 +0300 @@ -57,7 +57,7 @@ mdadm -S /dev/md0 # Internal bitmaps too. -mdadm -CR --assume-clean -b internal $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -CR --assume-clean -b internal --bitmap-chunk 4 $md0 -l5 -n3 $dev0 $dev1 $dev2 mdadm -S /dev/md0 mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 no_errors @@ -69,7 +69,7 @@ } mdadm -S /dev/md0 -mdadm -CR --assume-clean -e1.2 -b internal $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -CR --assume-clean -e1.2 -b internal --bitmap-chunk=4 $md0 -l5 -n3 $dev0 $dev1 $dev2 mdadm -S /dev/md0 mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 no_errors diff -Nru mdadm-2.6.7.1/tests/06wrmostly mdadm-3.1.4/tests/06wrmostly --- mdadm-2.6.7.1/tests/06wrmostly 2006-06-20 03:01:17.000000000 +0300 +++ mdadm-3.1.4/tests/06wrmostly 2010-08-26 05:24:16.000000000 +0300 @@ -2,13 +2,13 @@ # create a raid1 array with a wrmostly device mdadm -CR $md0 -l1 -n3 $dev0 $dev1 --write-mostly $dev2 -testdev $md0 1 $mdsize0 64 +testdev $md0 1 $mdsize1a 1 # unfortunately, we cannot measure if any read requests are going to $dev2 mdadm -S $md0 -mdadm -CR 
$md0 -l1 -n3 --write-behind --bitmap=internal $dev0 $dev1 --write-mostly $dev2 -testdev $md0 1 $mdsize0 64 +mdadm -CR $md0 -l1 -n3 --write-behind --bitmap=internal --bitmap-chunk=4 $dev0 $dev1 --write-mostly $dev2 +testdev $md0 1 $mdsize1a 1 mdadm -S $md0 diff -Nru mdadm-2.6.7.1/tests/07autoassemble mdadm-3.1.4/tests/07autoassemble --- mdadm-2.6.7.1/tests/07autoassemble 2008-10-15 06:34:28.000000000 +0300 +++ mdadm-3.1.4/tests/07autoassemble 2010-08-26 05:24:16.000000000 +0300 @@ -8,9 +8,9 @@ mdadm -Ss mdadm -As -c /dev/null --homehost=testing -vvv -testdev $md1 1 $mdsize0 64 -testdev $md2 1 $mdsize0 64 -testdev $md0 2 $mdsize00 64 +testdev $md1 1 $mdsize1a 1 +testdev $md2 1 $mdsize1a 1 +testdev $md0 2 $mdsize11a 512 mdadm -Ss mdadm --zero-superblock $dev0 $dev1 $dev2 $dev3 @@ -19,5 +19,6 @@ mdadm -CR $md0 -l0 -n2 $md1 $dev2 --homehost=testing mdadm -Ss mdadm -As -c /dev/null --homehost=testing -vvv -testdev $md1 1 $mdsize0 64 -testdev $md0 1 $[mdsize0+mdsize00] 64 +testdev $md1 1 $mdsize1a 1 +testdev $md0 1 $[mdsize1a+mdsize11a-1024] 512 +mdadm -Ss diff -Nru mdadm-2.6.7.1/tests/07autodetect mdadm-3.1.4/tests/07autodetect --- mdadm-2.6.7.1/tests/07autodetect 2008-10-13 05:46:39.000000000 +0300 +++ mdadm-3.1.4/tests/07autodetect 2010-08-26 05:24:16.000000000 +0300 @@ -12,15 +12,17 @@ fi -mdadm -CR $mdp0 -l0 -f -n1 $dev0 -mdadm -CR $mdp1 -l0 -f -n1 $dev1 +mdadm -CR -e 0 $mdp0 -l0 -f -n1 $dev0 +mdadm -CR -e 0 $mdp1 -l0 -f -n1 $dev1 +udevadm settle sfdisk $mdp0 >&2 << END ,,FD END sfdisk $mdp1 >&2 << END ,,FD END -mdadm -CR $md0 -l1 -n2 ${mdp0}p1 ${mdp1}p1 +udevadm settle +mdadm -CR -e 0 $md0 -l1 -n2 ${mdp0}p1 ${mdp1}p1 check resync check raid1 check wait diff -Nru mdadm-2.6.7.1/tests/07changelevelintr mdadm-3.1.4/tests/07changelevelintr --- mdadm-2.6.7.1/tests/07changelevelintr 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/07changelevelintr 2010-08-26 05:24:16.000000000 +0300 @@ -0,0 +1,60 @@ + +# +# test that we can stop and restart a level change. 
+# just test a few in-place changes, and a few +# size-reducing changes. + + +checkgeo() { + # check the geometry of an array + # level raid_disks chunk_size layout + dev=$1 + shift + sleep 0.5 + check wait + for attr in level raid_disks chunk_size layout + do + if [ $# -gt 0 ] ; then + val=$1 + shift + if [ " `cat /sys/block/$dev/md/$attr`" != " $val" ] + then echo "$attr doesn't match for $dev" + exit 1 + fi + fi + done +} + +restart() { + sleep 0.5 + check reshape + mdadm -S $md0 + mdadm -A $md0 $devs --backup-file=$bu + sleep 0.5 + check reshape +} + +bu=/tmp/md-backup +rm -f $bu +devs="$dev0 $dev1 $dev2 $dev3 $dev4" +mdadm -CR $md0 -l5 -n5 -c 256 $devs +checkgeo md0 raid5 5 $[256*1024] 2 + +mdadm -G $md0 -c 128 --backup-file=$bu +restart +checkgeo md0 raid5 5 $[128*1024] 2 + +mdadm -G $md0 --layout rs --backup-file=$bu +restart +checkgeo md0 raid5 5 $[128*1024] 3 + +mdadm -G $md0 --array-size 59136 +mdadm -G $md0 --raid-disks 4 -c 64 --backup-file=$bu +restart +checkgeo md0 raid5 4 $[64*1024] 3 + +devs="$dev0 $dev1 $dev2 $dev3" +mdadm -G $md0 --array-size 19712 +mdadm -G $md0 -n 2 -c 256 --backup-file=$bu +restart +checkgeo md0 raid5 2 $[256*1024] 3 diff -Nru mdadm-2.6.7.1/tests/07changelevels mdadm-3.1.4/tests/07changelevels --- mdadm-2.6.7.1/tests/07changelevels 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/07changelevels 2010-08-26 05:24:16.000000000 +0300 @@ -0,0 +1,107 @@ + +# Test changing of level, chunksize etc. 
+# Create a RAID1, convert to RAID5, add a disk, add another disk +# convert to RAID6, back to RAID5 and ultimately to RAID1 + +testK=$[64*3*6] +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$testK +export MDADM_GROW_VERIFY=1 + +dotest() { + sleep 0.5 + check wait + testdev $md0 $1 19968 64 nd + blockdev --flushbufs $md0 + cmp -s -n $[testK*1024] $md0 /tmp/RandFile || { echo cmp failed; exit 2; } + # write something new - shift chars 4 space + tr ' -~' '$-~ -#' < /tmp/RandFile > /tmp/RandFile2 + mv /tmp/RandFile2 /tmp/RandFile + dd if=/tmp/RandFile of=$md0 +} + +checkgeo() { + # check the geometry of an array + # level raid_disks chunk_size layout + dev=$1 + shift + sleep 0.5 + check wait + for attr in level raid_disks chunk_size layout + do + if [ $# -gt 0 ] ; then + val=$1 + shift + if [ " `cat /sys/block/$dev/md/$attr`" != " $val" ] + then echo "$attr doesn't match for $dev" + exit 1 + fi + fi + done +} + + +bu=/tmp/md-test-backup +rm -f $bu +mdadm -CR $md0 -l1 -n2 -x1 $dev0 $dev1 $dev2 -z 19968 +testdev $md0 1 $mdsize1a 64 +dd if=/tmp/RandFile of=$md0 +dotest 1 + +mdadm --grow $md0 -l5 -n3 --chunk 64 +dotest 2 + +mdadm $md0 --add $dev3 $dev4 +mdadm --grow $md0 -n4 --chunk 32 +dotest 3 + +mdadm -G $md0 -l6 --backup-file $bu +dotest 3 + +mdadm -G /dev/md0 --array-size 39936 +mdadm -G $md0 -n4 --backup-file $bu +dotest 2 + +mdadm -G $md0 -l5 --backup-file $bu +dotest 2 + +mdadm -G /dev/md0 --array-size 19968 +mdadm -G $md0 -n2 --backup-file $bu +dotest 1 + +mdadm -G --level=1 $md0 +dotest 1 + +# now repeat that last few steps only with a degraded array. +mdadm -S $md0 +mdadm -CR $md0 -l6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +dd if=/tmp/RandFile of=$md0 +dotest 3 + +mdadm $md0 --fail $dev0 + +mdadm -G /dev/md0 --array-size 39936 +mdadm -G $md0 -n4 --backup-file $bu +dotest 2 +mdadm $md0 --fail $dev4 + +mdadm $md0 --fail $dev3 +# now double-degraded. +# switch layout to a DDF layout and back to make sure that works.
+ +mdadm -G /dev/md0 --layout=ddf-N-continue --backup-file $bu +checkgeo md0 raid6 4 $[512*1024] 10 +dotest 2 +mdadm -G /dev/md0 --layout=ra --backup-file $bu +checkgeo md0 raid6 4 $[512*1024] 1 +dotest 2 + +mdadm -G $md0 -l5 --backup-file $bu +dotest 2 + +mdadm -G /dev/md0 --array-size 19968 +mdadm -G $md0 -n2 --backup-file $bu +dotest 1 +mdadm $md0 --fail $dev2 + +mdadm -G --level=1 $md0 +dotest 1 diff -Nru mdadm-2.6.7.1/tests/07layouts mdadm-3.1.4/tests/07layouts --- mdadm-2.6.7.1/tests/07layouts 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/07layouts 2010-08-26 05:24:16.000000000 +0300 @@ -0,0 +1,91 @@ + +# check that kernel and restripe interpret all the different layouts +# the same +# This involves changing the layout to each different possibility +# while MDADM_GROW_VERIFY is set. + +testK=$[64*3*6] +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$testK +export MDADM_GROW_VERIFY=1 + + +dotest() { + sleep 0.5 + check wait + testdev $md0 $1 $mdsize1 512 nd + blockdev --flushbufs $md0 + cmp -s -n $[testK*1024] $md0 /tmp/RandFile || { echo cmp failed; exit 2; } + # write something new - shift chars 4 space + tr ' -~' '$-~ -#' < /tmp/RandFile > /tmp/RandFile2 + mv /tmp/RandFile2 /tmp/RandFile + dd if=/tmp/RandFile of=$md0 +} + +checkgeo() { + # check the geometry of an array + # level raid_disks chunk_size layout + dev=$1 + shift + sleep 0.5 + check wait + for attr in level raid_disks chunk_size layout + do + if [ $# -gt 0 ] ; then + val=$1 + shift + if [ " `sed 's/ .*//' /sys/block/$dev/md/$attr`" != " $val" ] + then echo "$attr doesn't match for $dev" + exit 1 + fi + fi + done +} + + +bu=/tmp/md-test-backup +rm -f $bu + +# first a degraded 5 device raid5 +mdadm -CR $md0 -l5 -n5 $dev0 $dev1 missing $dev2 $dev3 +dd if=/tmp/RandFile of=$md0 +dotest 4 + +l5[0]=la +l5[1]=ra +l5[2]=ls +l5[3]=rs +l5[4]=parity-first +l5[5]=parity-last +for layout in 0 1 2 3 4 5 0 +do + mdadm -G $md0 --layout=${l5[$layout]} --backup-file $bu + checkgeo md0 raid5 5
$[512*1024] $layout + dotest 4 +done + +mdadm -S $md0 +# now a doubly degraded raid6 +mdadm -CR $md0 -l6 -n5 $dev0 missing $dev2 missing $dev4 +dd if=/tmp/RandFile of=$md0 +dotest 3 + +l6[0]=la +l6[1]=ra +l6[2]=ls +l6[3]=rs +l6[4]=parity-first +l6[5]=parity-last +l6[8]=ddf-zero-restart +l6[9]=ddf-N-restart +l6[10]=ddf-N-continue +l6[16]=left-asymmetric-6 +l6[17]=right-asymmetric-6 +l6[18]=left-symmetric-6 +l6[19]=right-symmetric-6 +l6[20]=parity-first-6 +for layout in 0 1 2 3 4 5 8 9 10 16 17 18 19 20 0 +do + mdadm -G $md0 --layout=${l6[$layout]} --backup-file $bu + checkgeo md0 raid6 5 $[512*1024] $layout + dotest 3 +done diff -Nru mdadm-2.6.7.1/tests/07reshape5intr mdadm-3.1.4/tests/07reshape5intr --- mdadm-2.6.7.1/tests/07reshape5intr 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/07reshape5intr 2010-08-26 05:24:16.000000000 +0300 @@ -0,0 +1,39 @@ + +# +# test interrupting and restarting raid5 reshape. +set -x +devs="$dev1" +st=UU +for disks in 2 3 4 5 +do + eval devs=\"$devs \$dev$disks\" + st=U$st + for d in $devs + do dd if=/dev/urandom of=$d bs=1024 || true + done + + case $disks in + 2 | 3) chunk=1024;; + 4 ) chunk=512;; + 5 ) chunk=256;; + esac + + mdadm -CR $md0 -amd -l5 -c $chunk -n$disks --assume-clean $devs + mdadm $md0 --add $dev6 + echo 20 > /proc/sys/dev/raid/speed_limit_max + mdadm --grow $md0 -n $[disks+1] + check reshape + check state $st + mdadm --stop $md0 + mdadm --assemble $md0 $devs $dev6 + check reshape + echo 2000 > /proc/sys/dev/raid/speed_limit_max + check wait + echo check > /sys/block/md0/md/sync_action + check wait + mm=`cat /sys/block/md0/md/mismatch_cnt` + if [ $mm -gt 0 ] + then echo >&2 "ERROR mismatch_cnt non-zero : $mm" ; exit 1 + fi + mdadm -S $md0 +done diff -Nru mdadm-2.6.7.1/tests/07testreshape5 mdadm-3.1.4/tests/07testreshape5 --- mdadm-2.6.7.1/tests/07testreshape5 2008-10-15 06:34:28.000000000 +0300 +++ mdadm-3.1.4/tests/07testreshape5 2010-08-26 05:24:16.000000000 +0300 @@ -5,34 +5,42 @@ # shaped md arrays. 
set -x layouts=(la ra ls rs) +for level in 5 6 +do for chunk in 4 8 16 32 64 128 do devs="$dev1" for disks in 2 3 4 5 6 do eval devs=\"$devs \$dev$disks\" + if [ " $level $disks" = " 6 3" -o " $level $disks" = " 6 2" ] + then continue + fi for nlayout in 0 1 2 3 do layout=${layouts[$nlayout]} - size=$[chunk*(disks-1)*disks] + size=$[chunk*(disks-(level-4))*disks] # test restore: make a raid5 from a file, then do a compare dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$size - $dir/test_stripe restore /tmp/RandFile $disks $[chunk*1024] 5 $nlayout 0 $[size*1024] $devs - $mdadm -CR $md0 -amd -l5 -n$disks --assume-clean -c $chunk -p $layout $devs + $dir/test_stripe restore /tmp/RandFile $disks $[chunk*1024] $level $nlayout 0 $[size*1024] $devs + mdadm -CR -e 1.0 $md0 -amd -l$level -n$disks --assume-clean -c $chunk -p $layout $devs cmp -s -n $[size*1024] $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } # FIXME check parity # test save dd if=/dev/urandom of=$md0 bs=1024 count=$size + blockdev --flushbufs $md0 $devs; sync > /tmp/NewRand - $dir/test_stripe save /tmp/NewRand $disks $[chunk*1024] 5 $nlayout 0 $[size*1024] $devs + $dir/test_stripe save /tmp/NewRand $disks $[chunk*1024] $level $nlayout 0 $[size*1024] $devs cmp -s -n $[size*1024] $md0 /tmp/NewRand || { echo cmp failed ; exit 2; } - $mdadm -S $md0 + mdadm -S $md0 + udevadm settle done done done +done exit 0 diff -Nru mdadm-2.6.7.1/tests/08imsm-overlap mdadm-3.1.4/tests/08imsm-overlap --- mdadm-2.6.7.1/tests/08imsm-overlap 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/08imsm-overlap 2010-03-22 08:08:43.000000000 +0200 @@ -0,0 +1,25 @@ +# create raid arrays with varying degrees of overlap +mdadm -CR $container -e imsm -n 6 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5 +imsm_check container 6 + +size=1910 +level=1 +num_disks=2 +mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size +mdadm -CR $member1 $dev1 $dev2 -n $num_disks -l $level -z $size +mdadm -CR $member2 $dev2 $dev3 -n $num_disks -l
$level -z $size +mdadm -CR $member3 $dev3 $dev4 -n $num_disks -l $level -z $size +mdadm -CR $member4 $dev4 $dev5 -n $num_disks -l $level -z $size + +offset=0 +imsm_check member $member0 $num_disks $level $size $offset +offset=$((offset+size+2048)) +imsm_check member $member1 $num_disks $level $size $offset +offset=$((offset+size+2048)) +imsm_check member $member2 $num_disks $level $size $offset +offset=$((offset+size+2048)) +imsm_check member $member3 $num_disks $level $size $offset +# at this point there should be more freespace at the start of the disk +# than the end +offset=0 +imsm_check member $member4 $num_disks $level $size $offset diff -Nru mdadm-2.6.7.1/tests/09imsm-assemble mdadm-3.1.4/tests/09imsm-assemble --- mdadm-2.6.7.1/tests/09imsm-assemble 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/09imsm-assemble 2010-08-26 05:24:16.000000000 +0300 @@ -0,0 +1,46 @@ +# validate the prodigal member disk scenario i.e. a former container +# member is returned after having been rebuilt on another system +num_disks=4 +size=$((10*1024)) +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3 +mdadm -CR $member $dev0 $dev2 -n 2 -l 1 -z $size +mdadm --wait $member +mdadm -Ss + +# make dev0 and dev1 a new rebuild family +mdadm -A $container $dev0 $dev1 +mdadm -IR $container +mdadm --wait ${member}_0 +mdadm -Ss + +# make dev2 and dev3 a new rebuild family +mdadm -A $container $dev2 $dev3 +mdadm -IR $container +mdadm --wait ${member}_0 +mdadm -Ss + +# reassemble and make sure one of the families falls out +mdadm -A $container $dev0 $dev1 $dev2 $dev3 +mdadm -IR $container +testdev ${member}_0 1 $size 1 +if mdadm --remove $container $dev0 ; then + # the dev[23] family won + imsm_check_removal $container $dev1 + imsm_check_hold $container $dev2 + imsm_check_hold $container $dev3 +else + # the dev[01] family won + imsm_check_hold $container $dev1 + imsm_check_removal $container $dev2 + imsm_check_removal $container $dev3 +fi +mdadm -Ss + +# reassemble 
with a new id for the dev[23] family +mdadm -A $container $dev0 $dev1 +mdadm -IR $container +mdadm -A ${container}2 $dev2 $dev3 --update=uuid +mdadm -IR ${container}2 + +testdev ${member}_0 1 $size 1 +testdev ${member}_1 1 $size 1 diff -Nru mdadm-2.6.7.1/tests/09imsm-create-fail-rebuild mdadm-3.1.4/tests/09imsm-create-fail-rebuild --- mdadm-2.6.7.1/tests/09imsm-create-fail-rebuild 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/09imsm-create-fail-rebuild 2010-08-05 09:51:58.000000000 +0300 @@ -0,0 +1,59 @@ +# sanity check array creation + +# IMSM rounds to multiples of one mebibyte - 1024K +DEV_ROUND_K=1024 + +num_disks=2 +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 +imsm_check container $num_disks + +# RAID0 + RAID1 +size=10000 +level=0 +chunk=64 +offset=0 +mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size -c $chunk +imsm_check member $member0 $num_disks $level $size $offset $chunk +testdev $member0 $num_disks $size $chunk + +offset=$(((size & ~(chunk - 1)) + 2048)) +size=5000 +level=1 +chunk=0 +mdadm -CR $member1 $dev0 $dev1 -n $num_disks -l $level -z $size +imsm_check member $member1 $num_disks $level $size $offset $chunk +testdev $member1 1 $size 1 +check wait + +mdadm -Ss + +# RAID10 + RAID5 +num_disks=4 +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3 +imsm_check container $num_disks + +size=10000 +level=10 +chunk=64 +offset=0 +mdadm -CR $member0 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk +imsm_check member $member0 $num_disks $level $size $offset $chunk +testdev $member0 $((num_disks-2)) $size $chunk + +offset=$(((size & ~(chunk - 1)) + 2048)) +size=5000 +level=5 +mdadm -CR $member1 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk +imsm_check member $member1 $num_disks $level $size $offset $chunk +testdev $member1 $((num_disks-1)) $size $chunk +check wait + +# FAIL / REBUILD +imsm_check_hold $container $dev0 +mdadm --fail $member0 $dev0 +mdadm --wait-clean --scan 
+imsm_check_removal $container $dev0 +mdadm --add $container $dev4 +check wait +imsm_check_hold $container $dev4 + diff -Nru mdadm-2.6.7.1/tests/10ddf-create mdadm-3.1.4/tests/10ddf-create --- mdadm-2.6.7.1/tests/10ddf-create 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/10ddf-create 2010-08-26 05:24:16.000000000 +0300 @@ -0,0 +1,76 @@ +# +# Test basic DDF functionality. +# +# Create a container with 5 drives +# create a small raid0 across them all, then a 2disk raid1 +# and a 3disk raid5 using the remaining space +# +# add some data, tear down the array, reassemble +# and make sure it is still there. + +mdadm -CR /dev/md/ddf0 -e ddf -n 5 $dev8 $dev9 $dev10 $dev11 $dev12 +mdadm -CR r0 -l0 -n5 /dev/md/ddf0 -z 5000 +mdadm -CR r1 -l1 -n2 /dev/md/ddf0 +mdadm -CR r5 -l5 -n3 /dev/md/ddf0 +testdev /dev/md/r0 5 5000 512 +# r0 will use 4608 due to chunk size, so that leaves 28160 for the rest +testdev /dev/md/r1 1 28160 1 +testdev /dev/md/r5 2 28160 512 +dd if=/dev/sda of=/dev/md/r0 || true +dd if=/dev/sda of=/dev/md/r1 || true +dd if=/dev/sda of=/dev/md/r5 || true + +s0=`sha1sum /dev/md/r0` +s1=`sha1sum /dev/md/r1` +s5=`sha1sum /dev/md/r5` + + +mdadm -Ss +mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 +mdadm -I /dev/md/ddf0 + +s0a=`sha1sum /dev/md/r0` +s1a=`sha1sum /dev/md/r1` +s5a=`sha1sum /dev/md/r5` + +if [ "$s0" != "$s0a" ]; then + echo r0 did not match ; exit 1; +fi +if [ "$s1" != "$s1a" ]; then + echo r1 did not match ; exit 1; +fi +if [ "$s5" != "$s5a" ]; then + echo r5 did not match ; exit 1; +fi + +# failure status just means it has completed already, so ignore it. +mdadm --wait /dev/md/r1 || true +mdadm --wait /dev/md/r5 || true + +mdadm -Dbs > /var/tmp/mdadm.conf + +mdadm -Ss + +# Now try to assemble using mdadm.conf +mdadm -Asc /var/tmp/mdadm.conf +check nosync # This failed once. The raid5 was resyncing. 
+ +mdadm -Dbs | sort > /tmp/mdadm.conf +sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf - +mdadm -Ss + +# and now assemble fully incrementally. +for i in $dev8 $dev9 $dev10 $dev11 $dev12 +do + #./mdadm -I $i -vv 2>&1 | wc -l > /tmp/cnt + ./mdadm -I $i 2> /tmp/thing + wc -l < /tmp/thing > /tmp/cnt + # should find container and 2 devices, so 3 lines. + [ `cat /tmp/cnt` -eq 3 ] +done +check nosync + +mdadm -Dbs | sort > /tmp/mdadm.conf +sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf - +mdadm -Ss +rm /tmp/mdadm.conf /var/tmp/mdadm.conf diff -Nru mdadm-2.6.7.1/tests/env-08imsm-overlap mdadm-3.1.4/tests/env-08imsm-overlap --- mdadm-2.6.7.1/tests/env-08imsm-overlap 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/env-08imsm-overlap 2010-03-22 08:08:43.000000000 +0200 @@ -0,0 +1,68 @@ +imsm_check() { + case $1 in + container ) + grep -s "$(((418 * $2)/2)) blocks super external:imsm" /proc/mdstat > /dev/null || { + echo >&2 "ERROR correctly formed container not found"; cat /proc/mdstat; exit 1;} + ;; + member ) + member=$2 + num_disks=$3 + level=$4 + size=$5 + offset=$6 + err=0 + + eval `stat -L -c "let major=0x%t; let minor=0x%T;" $member` + sysfs=/sys/dev/block/${major}:${minor} + if [ ! 
-f ${sysfs}/md/array_state ]; then + echo "member array $member not found" >&2 + cat /proc/mdstat >&2 + exit 1 + fi + for i in `seq 0 $((num_disks-1))` + do + _offset=`cat ${sysfs}/md/rd${i}/offset` + if [ $offset -ne $((_offset/2)) ]; then + echo "offset mismatch expected $offset got $_offset" >&2 + err=$((err+1)) + fi + _size=`cat ${sysfs}/md/rd${i}/size` + if [ $size -ne $_size ]; then + echo "size mismatch expected $size got $_size" >&2 + err=$((err+1)) + fi + done + + if [ $err -gt 0 ]; then + echo "$member failed check" >&2 + cat /proc/mdstat >&2 + mdadm -E /dev/loop0 >&2 + exit 1 + fi + ;; + * ) echo >&2 ERROR unknown check $1 ; exit 1; + esac +} + +setup_env() { + export IMSM_DEVNAME_AS_SERIAL=1 + export IMSM_NO_PLATFORM=1 + container=/dev/md/container + member0=/dev/md/vol0 + member1=/dev/md/vol1 + member2=/dev/md/vol2 + member3=/dev/md/vol3 + member4=/dev/md/vol4 +} + +reset_env() { + unset IMSM_DEVNAME_AS_SERIAL + unset IMSM_NO_PLATFORM + unset imsm_check + unset container + unset member0 + unset member1 + unset member2 + unset member3 + unset member4 +} diff -Nru mdadm-2.6.7.1/tests/env-09imsm-assemble mdadm-3.1.4/tests/env-09imsm-assemble --- mdadm-2.6.7.1/tests/env-09imsm-assemble 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/env-09imsm-assemble 2010-08-05 09:51:58.000000000 +0300 @@ -0,0 +1,32 @@ +imsm_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +imsm_check_removal() { + if !
mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +setup_env() { + export IMSM_DEVNAME_AS_SERIAL=1 + export IMSM_TEST_OROM=1 + container=/dev/md/container + member=/dev/md/vol0 +} + +reset_env() { + unset IMSM_DEVNAME_AS_SERIAL + unset IMSM_TEST_OROM + unset imsm_check + unset container + unset member +} diff -Nru mdadm-2.6.7.1/tests/env-09imsm-create-fail-rebuild mdadm-3.1.4/tests/env-09imsm-create-fail-rebuild --- mdadm-2.6.7.1/tests/env-09imsm-create-fail-rebuild 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/tests/env-09imsm-create-fail-rebuild 2010-03-22 08:08:43.000000000 +0200 @@ -0,0 +1,98 @@ +imsm_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +imsm_check_removal() { + if ! mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +imsm_check() { + udevadm settle + case $1 in + container ) + grep -s "$(((418 * $2)/2)) blocks super external:imsm" /proc/mdstat > /dev/null || { + echo >&2 "ERROR correctly formed container not found"; cat /proc/mdstat; exit 1;} + ;; + member ) + member=$2 + num_disks=$3 + level=$4 + size=$5 + offset=$6 + chunk=$7 + err=0 + + if [ $level -ne 1 ]; then + size=$((size & ~(chunk - 1))) + else + chunk=64 + fi + eval `stat -L -c "let major=0x%t; let minor=0x%T;" $member` + sysfs=/sys/dev/block/${major}:${minor} + if [ ! 
-f ${sysfs}/md/array_state ]; then + echo "member array $member not found" >&2 + cat /proc/mdstat >&2 + exit 1 + fi + _chunk=`cat ${sysfs}/md/chunk_size` + if [ $chunk -ne $((_chunk/1024)) ]; then + echo "chunk mismatch expected $chunk got $_chunk" >&2 + err=$((err+1)) + fi + for i in `seq 0 $((num_disks-1))` + do + _offset=`cat ${sysfs}/md/rd${i}/offset` + if [ $offset -ne $((_offset/2)) ]; then + echo "offset mismatch expected $offset got $_offset" >&2 + err=$((err+1)) + fi + _size=`cat ${sysfs}/md/rd${i}/size` + if [ $size -ne $_size ]; then + echo "size mismatch expected $size got $_size" >&2 + err=$((err+1)) + fi + done + + if [ $err -gt 0 ]; then + echo "$member failed check" >&2 + cat /proc/mdstat >&2 + mdadm -E /dev/loop0 >&2 + exit 1 + fi + ;; + * ) echo >&2 ERROR unknown check $1 ; exit 1; + esac +} + +setup_env() { + export IMSM_DEVNAME_AS_SERIAL=1 + export IMSM_TEST_OROM=1 + container=/dev/md/container + member0=/dev/md/vol0 + member1=/dev/md/vol1 + member2=/dev/md/vol2 + member3=/dev/md/vol3 + member4=/dev/md/vol4 +} + +reset_env() { + unset IMSM_DEVNAME_AS_SERIAL + unset IMSM_TEST_OROM + unset imsm_check + unset container + unset member0 + unset member1 + unset member2 + unset member3 + unset member4 +} diff -Nru mdadm-2.6.7.1/TODO mdadm-3.1.4/TODO --- mdadm-2.6.7.1/TODO 2008-10-13 04:22:02.000000000 +0300 +++ mdadm-3.1.4/TODO 2010-03-22 08:08:42.000000000 +0200 @@ -1,3 +1,38 @@ + - add 'name' field to metadata type and use it. + - use validate_geometry more + - metadata should be able to check/reject bitmap stuff. + +DDF: + Three new metadata types: + ddf - used only to create a container. + ddf-bvd - used to create an array in a container + ddf-svd - used to create a secondary array from bvds. + + Usage: + mdadm -C /dev/ddf1 /dev/sd[abcdef] + mdadm -C /dev/md1 -e ddf /dev/sd[a-f] + mdadm -C /dev/md1 -l container /dev/sd[a-f] + + Each of these create a new ddf container using all those + devices. 
The name 'ddf*' signals that ddf metadata should be used. + '-e ddf' only supports one level - 'container'. 'container' is only + supported by ddf. + + mdadm -C /dev/md1 -l0 -n4 /dev/ddf1 # or maybe not ??? + mdadm -C /dev/md1 -l1 -n2 /dev/sda /dev/sdb + If exactly one device is given, and it is a container, we select + devices from that container. + If devices are given that are already in use, they must be in use by + a container, and the array is created in the container. + If devices given are bvds, we slip under the hood to make + the svd arrays. + + mdadm -A /dev/ddf ...... + base drives make a container. Anything in that container is started + auto-read-only. + if /dev/ddf is already assembled, we assemble bvds and svds inside it. + + 2005-dec-20 Want an incremental assembly mode to work nicely with udev. Core usage would be something like diff -Nru mdadm-2.6.7.1/udev-md-raid.rules mdadm-3.1.4/udev-md-raid.rules --- mdadm-2.6.7.1/udev-md-raid.rules 1970-01-01 02:00:00.000000000 +0200 +++ mdadm-3.1.4/udev-md-raid.rules 2010-08-26 05:24:16.000000000 +0300 @@ -0,0 +1,39 @@ +# do not edit this file, it will be overwritten on update + +SUBSYSTEM!="block", GOTO="md_end" + +# handle potential components of arrays +ENV{ID_FS_TYPE}=="linux_raid_member", ACTION=="remove", RUN+="/sbin/mdadm -If $name" +ENV{ID_FS_TYPE}=="linux_raid_member", ACTION=="add", RUN+="/sbin/mdadm --incremental $env{DEVNAME}" + +# handle md arrays +ACTION!="add|change", GOTO="md_end" +KERNEL!="md*", GOTO="md_end" + +# partitions have no md/{array_state,metadata_version}, but should not +# for that reason be ignored. +ENV{DEVTYPE}=="partition", GOTO="md_ignore_state" + +# container devices have a metadata version of e.g. 
'external:ddf' and +# never leave state 'inactive' +ATTR{md/metadata_version}=="external:[A-Za-z]*", ATTR{md/array_state}=="inactive", GOTO="md_ignore_state" +TEST!="md/array_state", GOTO="md_end" +ATTR{md/array_state}=="|clear|inactive", GOTO="md_end" +LABEL="md_ignore_state" + +IMPORT{program}="/sbin/mdadm --detail --export $tempnode" +ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace" +ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}" +ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}" +ENV{DEVTYPE}=="partition", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}-part%n", OPTIONS+="string_escape=replace" +ENV{DEVTYPE}=="partition", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}-part%n" +ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n" +ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n" + +IMPORT{program}="/sbin/blkid -o udev -p $tempnode" +OPTIONS+="link_priority=100" +OPTIONS+="watch" +ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}" +ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}" + +LABEL="md_end" diff -Nru mdadm-2.6.7.1/util.c mdadm-3.1.4/util.c --- mdadm-2.6.7.1/util.c 2008-10-15 06:34:28.000000000 +0300 +++ mdadm-3.1.4/util.c 2010-08-31 10:18:39.000000000 +0300 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2001-2006 Neil Brown + * Copyright (C) 2001-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -19,18 +19,18 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown - * Email: - * Paper: Neil Brown - * School of Computer Science and Engineering - * The University of New South Wales - * Sydney, 2052 - * Australia + * Email: */ #include "mdadm.h" #include "md_p.h" +#include #include +#include +#include #include +#include +#include /* * following taken from linux/blkpg.h because they aren't @@ -65,6 +65,73 @@ char volname[BLKPG_VOLNAMELTH]; /* volume label */ }; +/* partition table structures so we can check metadata position + * against the end of the last partition. + * Only handle MBR ant GPT partition tables. + */ +struct MBR_part_record { + __u8 bootable; + __u8 first_head; + __u8 first_sector; + __u8 first_cyl; + __u8 part_type; + __u8 last_head; + __u8 last_sector; + __u8 last_cyl; + __u32 first_sect_lba; + __u32 blocks_num; +}; + +struct MBR { + __u8 pad[446]; + struct MBR_part_record parts[4]; + __u16 magic; +} __attribute__((packed)); + +struct GPT_part_entry { + unsigned char type_guid[16]; + unsigned char partition_guid[16]; + __u64 starting_lba; + __u64 ending_lba; + unsigned char attr_bits[8]; + unsigned char name[72]; +} __attribute__((packed)); + +struct GPT { + __u64 magic; + __u32 revision; + __u32 header_size; + __u32 crc; + __u32 pad1; + __u64 current_lba; + __u64 backup_lba; + __u64 first_lba; + __u64 last_lba; + __u8 guid[16]; + __u64 part_start; + __u32 part_cnt; + __u32 part_size; + __u32 part_crc; + __u8 pad2[420]; +} __attribute__((packed)); + +/* Force a compilation error if condition is true */ +#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) + +/* Force a compilation error if condition is true, but also produce a + result (of value 0 and type size_t), so the expression can be used + e.g. 
in a structure initializer (or where-ever else comma expressions + aren't permitted). */ +#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) + + +/* MBR/GPT magic numbers */ +#define MBR_SIGNATURE_MAGIC __cpu_to_le16(0xAA55) +#define GPT_SIGNATURE_MAGIC __cpu_to_le64(0x5452415020494645ULL) + +#define MBR_PARTITIONS 4 +#define MBR_GPT_PARTITION_TYPE 0xEE + /* * Parse a 128 bit uuid in 4 integers * format is 32 hexx nibbles with options :. separator @@ -149,6 +216,73 @@ return (a*1000000)+(b*1000)+c; } +#ifndef MDASSEMBLE +long long parse_size(char *size) +{ + /* parse 'size' which should be a number optionally + * followed by 'K', 'M', or 'G'. + * Without a suffix, K is assumed. + * Number returned is in sectors (half-K) + */ + char *c; + long long s = strtoll(size, &c, 10); + if (s > 0) { + switch (*c) { + case 'K': + c++; + default: + s *= 2; + break; + case 'M': + c++; + s *= 1024 * 2; + break; + case 'G': + c++; + s *= 1024 * 1024 * 2; + break; + } + } + if (*c) + s = 0; + return s; +} + +int parse_layout_10(char *layout) +{ + int copies, rv; + char *cp; + /* Parse the layout string for raid10 */ + /* 'f', 'o' or 'n' followed by a number <= raid_disks */ + if ((layout[0] != 'n' && layout[0] != 'f' && layout[0] != 'o') || + (copies = strtoul(layout+1, &cp, 10)) < 1 || + copies > 200 || + *cp) + return -1; + if (layout[0] == 'n') + rv = 256 + copies; + else if (layout[0] == 'o') + rv = 0x10000 + (copies<<8) + 1; + else + rv = 1 + (copies<<8); + return rv; +} + +int parse_layout_faulty(char *layout) +{ + /* Parse the layout string for 'faulty' */ + int ln = strcspn(layout, "0123456789"); + char *m = strdup(layout); + int mode; + m[ln] = 0; + mode = map_name(faultylayout, m); + if (mode == UnSet) + return -1; + + return mode | (atoi(layout+ln)<< ModeShift); +} +#endif + void remove_partitions(int fd) { /* remove partitions from this block devices. @@ -168,6 +302,31 @@ #endif } +int test_partition(int fd) +{ + /* Check if fd is a whole-disk or a partition. 
+ * BLKPG will return EINVAL on a partition, and BLKPG_DEL_PARTITION + * will return ENXIO on an invalid partition number. + */ + struct blkpg_ioctl_arg a; + struct blkpg_partition p; + a.op = BLKPG_DEL_PARTITION; + a.data = (void*)&p; + a.datalen = sizeof(p); + a.flags = 0; + memset(a.data, 0, a.datalen); + p.pno = 1<<30; + if (ioctl(fd, BLKPG, &a) == 0) + /* Very unlikely, but not a partition */ + return 0; + if (errno == ENXIO) + /* not a partition */ + return 0; + + return 1; +} + + int enough(int level, int raid_disks, int layout, int clean, char *avail, int avail_disks) { @@ -194,9 +353,9 @@ } while (first != 0); return 1; - case -4: + case LEVEL_MULTIPATH: return avail_disks>= 1; - case -1: + case LEVEL_LINEAR: case 0: return avail_disks == raid_disks; case 1: @@ -217,8 +376,13 @@ } } +const int uuid_match_any[4] = { ~0, ~0, ~0, ~0 }; int same_uuid(int a[4], int b[4], int swapuuid) { + if (memcmp(a, uuid_match_any, sizeof(int[4])) == 0 || + memcmp(b, uuid_match_any, sizeof(int[4])) == 0) + return 1; + if (swapuuid) { /* parse uuids are hostendian. * uuid's from some superblocks are big-ending @@ -264,6 +428,36 @@ memcpy(a, b, 16); } +char *__fname_from_uuid(int id[4], int swap, char *buf, char sep) +{ + int i, j; + char uuid[16]; + char *c = buf; + strcpy(c, "UUID-"); + c += strlen(c); + copy_uuid(uuid, id, swap); + for (i = 0; i < 4; i++) { + if (i) + *c++ = sep; + for (j = 3; j >= 0; j--) { + sprintf(c,"%02x", (unsigned char) uuid[j+4*i]); + c+= 2; + } + } + return buf; + +} + +char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep) +{ + // dirty hack to work around an issue with super1 superblocks... + // super1 superblocks need swapuuid set in order for assembly to + // work, but can't have it set if we want this printout to match + // all the other uuid printouts in super1.c, so we force swapuuid + // to 1 to make our printout match the rest of super1 + return __fname_from_uuid(info->uuid, (st->ss == &super1) ? 
1 : st->ss->swapuuid, buf, sep); +} + #ifndef MDASSEMBLE int check_ext2(int fd, char *name) { @@ -389,6 +583,9 @@ /* tests if dev is a "standard" md dev name. * i.e if the last component is "/dNN" or "/mdNN", * where NN is a string of digits + * Returns 1 if a partitionable standard, + * -1 if non-partitonable, + * 0 if not a standard name. */ char *d = strrchr(dev, '/'); int type=0; @@ -398,7 +595,7 @@ if (strncmp(d, "/d",2)==0) d += 2, type=1; /* /dev/md/dN{pM} */ else if (strncmp(d, "/md_d", 5)==0) - d += 5, type=1; /* /dev/md_dNpM */ + d += 5, type=1; /* /dev/md_dN{pM} */ else if (strncmp(d, "/md", 3)==0) d += 3, type=-1; /* /dev/mdN */ else if (d-dev > 3 && strncmp(d-2, "md/", 3)==0) @@ -433,8 +630,10 @@ int add_dev(const char *name, const struct stat *stb, int flag, struct FTW *s) { struct stat st; + if (S_ISLNK(stb->st_mode)) { - stat(name, &st); + if (stat(name, &st) != 0) + return 0; stb = &st; } @@ -475,14 +674,13 @@ /* * Find a block device with the right major/minor number. * If we find multiple names, choose the shortest. - * If we find a non-standard name, it is probably there - * deliberately so prefer it over a standard name. + * If we find a name in /dev/md/, we prefer that. * This applies only to names for MD devices. 
*/ char *map_dev(int major, int minor, int create) { struct devmap *p; - char *std = NULL, *nonstd=NULL; + char *regular = NULL, *preferred=NULL; int did_check = 0; if (major == 0 && minor == 0) @@ -509,27 +707,27 @@ for (p=devlist; p; p=p->next) if (p->major == major && p->minor == minor) { - if (is_standard(p->name, NULL)) { - if (std == NULL || - strlen(p->name) < strlen(std)) - std = p->name; + if (strncmp(p->name, "/dev/md/",8) == 0) { + if (preferred == NULL || + strlen(p->name) < strlen(preferred)) + preferred = p->name; } else { - if (nonstd == NULL || - strlen(p->name) < strlen(nonstd)) - nonstd = p->name; + if (regular == NULL || + strlen(p->name) < strlen(regular)) + regular = p->name; } } - if (!std && !nonstd && !did_check) { + if (!regular && !preferred && !did_check) { devlist_ready = 0; goto retry; } - if (create && !std && !nonstd) { + if (create && !regular && !preferred) { static char buf[30]; snprintf(buf, sizeof(buf), "%d:%d", major, minor); - nonstd = buf; + regular = buf; } - return nonstd ? nonstd : std; + return preferred ? 
preferred : regular; } unsigned long calc_csum(void *super, int bytes) @@ -606,9 +804,42 @@ ); return buf; } + +void print_r10_layout(int layout) +{ + int near = layout & 255; + int far = (layout >> 8) & 255; + int offset = (layout&0x10000); + char *sep = ""; + + if (near != 1) { + printf("%s near=%d", sep, near); + sep = ","; + } + if (far != 1) + printf("%s %s=%d", sep, offset?"offset":"far", far); + if (near*far == 1) + printf("NO REDUNDANCY"); +} #endif -#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) +unsigned long long calc_array_size(int level, int raid_disks, int layout, + int chunksize, unsigned long long devsize) +{ + int data_disks = 0; + switch (level) { + case 0: data_disks = raid_disks; break; + case 1: data_disks = 1; break; + case 4: + case 5: data_disks = raid_disks - 1; break; + case 6: data_disks = raid_disks - 2; break; + case 10: data_disks = raid_disks / (layout & 255) / ((layout>>8)&255); + break; + } + devsize &= ~(unsigned long long)((chunksize>>9)-1); + return data_disks * devsize; +} + int get_mdp_major(void) { static int mdp_major = -1; @@ -637,8 +868,7 @@ return mdp_major; } - - +#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) char *get_md_name(int dev) { /* find /dev/md%d or /dev/md/%d or make a device /dev/.tmp.md%d */ @@ -693,26 +923,11 @@ unlink(name); } -static int dev2major(int d) -{ - if (d >= 0) - return MD_MAJOR; - else - return get_mdp_major(); -} - -static int dev2minor(int d) -{ - if (d >= 0) - return d; - return (-1-d) << MdpMinorShift; -} - int find_free_devnum(int use_partitions) { int devnum; for (devnum = 127; devnum != 128; - devnum = devnum ? devnum-1 : (1<<22)-1) { + devnum = devnum ? 
devnum-1 : (1<<20)-1) { char *dn; int _devnum; @@ -744,24 +959,100 @@ int minor; if (!dev) return -1; + flags |= O_DIRECT; major = strtoul(dev, &e, 0); if (e > dev && *e == ':' && e[1] && (minor = strtoul(e+1, &e, 0)) >= 0 && *e == 0) { - snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d", major, minor); - if (mknod(devname, S_IFBLK|0600, makedev(major, minor))==0) { - fd = open(devname, flags); - unlink(devname); + char *path = map_dev(major, minor, 0); + if (path) + fd = open(path, flags); + if (fd < 0) { + snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d", + (int)getpid(), major, minor); + if (mknod(devname, S_IFBLK|0600, makedev(major, minor))==0) { + fd = open(devname, flags); + unlink(devname); + } + } + if (fd < 0) { + snprintf(devname, sizeof(devname), "/tmp/.tmp.md.%d:%d:%d", + (int)getpid(), major, minor); + if (mknod(devname, S_IFBLK|0600, makedev(major, minor))==0) { + fd = open(devname, flags); + unlink(devname); + } } } else fd = open(dev, flags); return fd; } -struct superswitch *superlist[] = { &super0, &super1, NULL }; +int open_dev(int devnum) +{ + char buf[20]; + + sprintf(buf, "%d:%d", dev2major(devnum), dev2minor(devnum)); + return dev_open(buf, O_RDWR); +} + +int open_dev_excl(int devnum) +{ + char buf[20]; + int i; + + sprintf(buf, "%d:%d", dev2major(devnum), dev2minor(devnum)); + for (i=0 ; i<25 ; i++) { + int fd = dev_open(buf, O_RDWR|O_EXCL); + if (fd >= 0) + return fd; + if (errno != EBUSY) + return fd; + usleep(200000); + } + return -1; +} + +int same_dev(char *one, char *two) +{ + struct stat st1, st2; + if (stat(one, &st1) != 0) + return 0; + if (stat(two, &st2) != 0) + return 0; + if ((st1.st_mode & S_IFMT) != S_IFBLK) + return 0; + if ((st2.st_mode & S_IFMT) != S_IFBLK) + return 0; + return st1.st_rdev == st2.st_rdev; +} + +void wait_for(char *dev, int fd) +{ + int i; + struct stat stb_want; + + if (fstat(fd, &stb_want) != 0 || + (stb_want.st_mode & S_IFMT) != S_IFBLK) + return; + + for (i=0 ; i<25 ; i++) { + struct 
stat stb; + if (stat(dev, &stb) == 0 && + (stb.st_mode & S_IFMT) == S_IFBLK && + (stb.st_rdev == stb_want.st_rdev)) + return; + usleep(200000); + } + if (i == 25) + dprintf("%s: timeout waiting for %s\n", __func__, dev); +} + +struct superswitch *superlist[] = { &super0, &super1, &super_ddf, &super_imsm, NULL }; #if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) + struct supertype *super_by_fd(int fd) { mdu_array_info_t array; @@ -772,6 +1063,7 @@ char *verstr; char version[20]; int i; + char *subarray = NULL; sra = sysfs_read(fd, 0, GET_VERSION); @@ -791,40 +1083,59 @@ sprintf(version, "%d.%d", vers, minor); verstr = version; } + if (minor == -2 && is_subarray(verstr)) { + char *dev = verstr+1; + subarray = strchr(dev, '/'); + int devnum; + if (subarray) + *subarray++ = '\0'; + devnum = devname2devnum(dev); + subarray = strdup(subarray); + if (sra) + sysfs_free(sra); + sra = sysfs_read(-1, devnum, GET_VERSION); + if (sra && sra->text_version[0]) + verstr = sra->text_version; + else + verstr = "-no-metadata-"; + } + for (i = 0; st == NULL && superlist[i] ; i++) st = superlist[i]->match_metadata_desc(verstr); if (sra) sysfs_free(sra); - if (st) + if (st) { st->sb = NULL; + if (subarray) { + strncpy(st->subarray, subarray, 32); + st->subarray[31] = 0; + free(subarray); + } else + st->subarray[0] = 0; + } return st; } #endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */ -struct supertype *dup_super(struct supertype *st) +struct supertype *dup_super(struct supertype *orig) { - struct supertype *stnew = NULL; - char *verstr = NULL; - char version[20]; - int i; + struct supertype *st; + if (!orig) + return orig; + st = malloc(sizeof(*st)); if (!st) return st; - - if (st->minor_version == -1) - sprintf(version, "%d", st->ss->major); - else - sprintf(version, "%d.%d", st->ss->major, st->minor_version); - verstr = version; - - for (i = 0; stnew == NULL && superlist[i] ; i++) - stnew = 
superlist[i]->match_metadata_desc(verstr); - - if (stnew) - stnew->sb = NULL; - return stnew; + memset(st, 0, sizeof(*st)); + st->ss = orig->ss; + st->max_devs = orig->max_devs; + st->minor_version = orig->minor_version; + strcpy(st->subarray, orig->subarray); + st->sb = NULL; + st->info = NULL; + return st; } struct supertype *guess_super(int fd) @@ -834,16 +1145,15 @@ */ struct superswitch *ss; struct supertype *st; - unsigned long besttime = 0; + time_t besttime = 0; int bestsuper = -1; int i; st = malloc(sizeof(*st)); - memset(st, 0, sizeof(*st)); for (i=0 ; superlist[i]; i++) { int rv; ss = superlist[i]; - st->ss = NULL; + memset(st, 0, sizeof(*st)); rv = ss->load_super(st, fd, NULL); if (rv == 0) { struct mdinfo info; @@ -858,7 +1168,7 @@ } if (bestsuper != -1) { int rv; - st->ss = NULL; + memset(st, 0, sizeof(*st)); rv = superlist[bestsuper]->load_super(st, fd, NULL); if (rv == 0) { superlist[bestsuper]->free_super(st); @@ -897,6 +1207,146 @@ return 1; } + +/* Sets endofpart parameter to the last block used by the last GPT partition on the device. 
+ * Returns: 1 if successful + * -1 for unknown partition type + * 0 for other errors + */ +static int get_gpt_last_partition_end(int fd, unsigned long long *endofpart) +{ + struct GPT gpt; + unsigned char buf[512]; + unsigned char empty_gpt_entry[16]= {0}; + struct GPT_part_entry *part; + unsigned long long curr_part_end; + unsigned all_partitions, entry_size; + unsigned part_nr; + + *endofpart = 0; + + BUILD_BUG_ON(sizeof(gpt) != 512); + /* read GPT header */ + lseek(fd, 512, SEEK_SET); + if (read(fd, &gpt, 512) != 512) + return 0; + + /* get the number of partition entries and the entry size */ + all_partitions = __le32_to_cpu(gpt.part_cnt); + entry_size = __le32_to_cpu(gpt.part_size); + + /* Check GPT signature*/ + if (gpt.magic != GPT_SIGNATURE_MAGIC) + return -1; + + /* sanity checks */ + if (all_partitions > 1024 || + entry_size > 512) + return -1; + + /* read first GPT partition entries */ + if (read(fd, buf, 512) != 512) + return 0; + + part = (struct GPT_part_entry*)buf; + + for (part_nr=0; part_nr < all_partitions; part_nr++) { + /* is this valid partition? */ + if (memcmp(part->type_guid, empty_gpt_entry, 16) != 0) { + /* check the last lba for the current partition */ + curr_part_end = __le64_to_cpu(part->ending_lba); + if (curr_part_end > *endofpart) + *endofpart = curr_part_end; + } + + part = (struct GPT_part_entry*)((unsigned char*)part + entry_size); + + if ((unsigned char *)part >= buf + 512) { + if (read(fd, buf, 512) != 512) + return 0; + part = (struct GPT_part_entry*)buf; + } + } + return 1; +} + +/* Sets endofpart parameter to the last block used by the last partition on the device. 
+ * Returns: 1 if successful + * -1 for unknown partition type + * 0 for other errors + */ +static int get_last_partition_end(int fd, unsigned long long *endofpart) +{ + struct MBR boot_sect; + struct MBR_part_record *part; + unsigned long long curr_part_end; + unsigned part_nr; + int retval = 0; + + *endofpart = 0; + + BUILD_BUG_ON(sizeof(boot_sect) != 512); + /* read MBR */ + lseek(fd, 0, 0); + if (read(fd, &boot_sect, 512) != 512) + goto abort; + + /* check MBP signature */ + if (boot_sect.magic == MBR_SIGNATURE_MAGIC) { + retval = 1; + /* found the correct signature */ + part = boot_sect.parts; + + for (part_nr=0; part_nr < MBR_PARTITIONS; part_nr++) { + /* check for GPT type */ + if (part->part_type == MBR_GPT_PARTITION_TYPE) { + retval = get_gpt_last_partition_end(fd, endofpart); + break; + } + /* check the last used lba for the current partition */ + curr_part_end = __le32_to_cpu(part->first_sect_lba) + + __le32_to_cpu(part->blocks_num); + if (curr_part_end > *endofpart) + *endofpart = curr_part_end; + + part++; + } + } else { + /* Unknown partition table */ + retval = -1; + } + abort: + return retval; +} + +int check_partitions(int fd, char *dname, unsigned long long freesize) +{ + /* + * Check where the last partition ends + */ + unsigned long long endofpart; + int ret; + + if ((ret = get_last_partition_end(fd, &endofpart)) > 0) { + /* There appears to be a partition table here */ + if (freesize == 0) { + /* partitions will not be visible in new device */ + fprintf(stderr, + Name ": partition table exists on %s but will be lost or\n" + " meaningless after creating array\n", + dname); + return 1; + } else if (endofpart > freesize) { + /* last partition overlaps metadata */ + fprintf(stderr, + Name ": metadata will over-write last partition on %s.\n", + dname); + return 1; + } + } + return 0; +} + void get_one_disk(int mdfd, mdu_array_info_t *ainf, mdu_disk_info_t *disk) { int d; @@ -906,6 +1356,478 @@ return; } +int open_container(int fd) +{ + /* 'fd' is a 
block device. Find out if it is in use + * by a container, and return an open fd on that container. + */ + char path[256]; + char *e; + DIR *dir; + struct dirent *de; + int dfd, n; + char buf[200]; + int major, minor; + struct stat st; + + if (fstat(fd, &st) != 0) + return -1; + sprintf(path, "/sys/dev/block/%d:%d/holders", + (int)major(st.st_rdev), (int)minor(st.st_rdev)); + e = path + strlen(path); + + dir = opendir(path); + if (!dir) + return -1; + while ((de = readdir(dir))) { + if (de->d_ino == 0) + continue; + if (de->d_name[0] == '.') + continue; + sprintf(e, "/%s/dev", de->d_name); + dfd = open(path, O_RDONLY); + if (dfd < 0) + continue; + n = read(dfd, buf, sizeof(buf)); + close(dfd); + if (n <= 0 || (unsigned)n >= sizeof(buf)) + continue; + buf[n] = 0; + if (sscanf(buf, "%d:%d", &major, &minor) != 2) + continue; + sprintf(buf, "%d:%d", major, minor); + dfd = dev_open(buf, O_RDONLY); + if (dfd >= 0) { + closedir(dir); + return dfd; + } + } + closedir(dir); + return -1; +} + +struct superswitch *version_to_superswitch(char *vers) +{ + int i; + + for (i = 0; superlist[i]; i++) { + struct superswitch *ss = superlist[i]; + + if (strcmp(vers, ss->name) == 0) + return ss; + } + + return NULL; +} + +int is_container_member(struct mdstat_ent *mdstat, char *container) +{ + if (mdstat->metadata_version == NULL || + strncmp(mdstat->metadata_version, "external:", 9) != 0 || + !is_subarray(mdstat->metadata_version+9) || + strncmp(mdstat->metadata_version+10, container, strlen(container)) != 0 || + mdstat->metadata_version[10+strlen(container)] != '/') + return 0; + + return 1; +} + +int is_subarray_active(char *subarray, char *container) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; + + for (ent = mdstat; ent; ent = ent->next) { + if (is_container_member(ent, container)) { + char *inst = &ent->metadata_version[10+strlen(container)+1]; + + if (!subarray || strcmp(inst, subarray) == 0) + break; + } + } + + free_mdstat(mdstat); + + return 
ent != NULL; +} + +int is_container_active(char *container) +{ + return is_subarray_active(NULL, container); +} + +/* open_subarray - opens a subarray in a container + * @dev: container device name + * @st: supertype with only ->subarray set + * @quiet: block reporting errors flag + * + * On success returns an fd to a container and fills in *st + */ +int open_subarray(char *dev, struct supertype *st, int quiet) +{ + struct mdinfo *mdi; + int fd, err = 1; + + fd = open(dev, O_RDWR|O_EXCL); + if (fd < 0) { + if (!quiet) + fprintf(stderr, Name ": Couldn't open %s, aborting\n", + dev); + return 2; + } + + st->devnum = fd2devnum(fd); + if (st->devnum == NoMdDev) { + if (!quiet) + fprintf(stderr, + Name ": Failed to determine device number for %s\n", + dev); + goto close_fd; + } + + mdi = sysfs_read(fd, st->devnum, GET_VERSION|GET_LEVEL); + if (!mdi) { + if (!quiet) + fprintf(stderr, Name ": Failed to read sysfs for %s\n", + dev); + goto close_fd; + } + + if (mdi->array.level != UnSet) { + if (!quiet) + fprintf(stderr, Name ": %s is not a container\n", dev); + goto free_sysfs; + } + + st->ss = version_to_superswitch(mdi->text_version); + if (!st->ss) { + if (!quiet) + fprintf(stderr, + Name ": Operation not supported for %s metadata\n", + mdi->text_version); + goto free_sysfs; + } + + st->devname = devnum2devname(st->devnum); + if (!st->devname) { + if (!quiet) + fprintf(stderr, Name ": Failed to allocate device name\n"); + goto free_sysfs; + } + + if (st->ss->load_super(st, fd, NULL)) { + if (!quiet) + fprintf(stderr, Name ": Failed to find subarray-%s in %s\n", + st->subarray, dev); + goto free_name; + } + + if (!st->loaded_container) { + if (!quiet) + fprintf(stderr, Name ": %s is not a container\n", dev); + goto free_super; + } + + err = 0; + + free_super: + if (err) + st->ss->free_super(st); + free_name: + if (err) + free(st->devname); + free_sysfs: + sysfs_free(mdi); + close_fd: + if (err) + close(fd); + + if (err) + return -1; + else + return fd; +} + +int 
add_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info) +{ + /* Add a device to an array, in one of 2 ways. */ + int rv; +#ifndef MDASSEMBLE + if (st->ss->external) { + if (info->disk.state & (1<recovery_start = MaxSector; + else + info->recovery_start = 0; + rv = sysfs_add_disk(sra, info, 0); + if (! rv) { + struct mdinfo *sd2; + for (sd2 = sra->devs; sd2; sd2=sd2->next) + if (sd2 == info) + break; + if (sd2 == NULL) { + sd2 = malloc(sizeof(*sd2)); + *sd2 = *info; + sd2->next = sra->devs; + sra->devs = sd2; + } + } + } else +#endif + rv = ioctl(mdfd, ADD_NEW_DISK, &info->disk); + return rv; +} + +int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info) +{ + /* Initialise kernel's knowledge of array. + * This varies between externally managed arrays + * and older kernels + */ + int vers = md_get_version(mdfd); + int rv; + +#ifndef MDASSEMBLE + if (st->ss->external) + rv = sysfs_set_array(info, vers); + else +#endif + if ((vers % 100) >= 1) { /* can use different versions */ + mdu_array_info_t inf; + memset(&inf, 0, sizeof(inf)); + inf.major_version = info->array.major_version; + inf.minor_version = info->array.minor_version; + rv = ioctl(mdfd, SET_ARRAY_INFO, &inf); + } else + rv = ioctl(mdfd, SET_ARRAY_INFO, NULL); + return rv; +} + +unsigned long long min_recovery_start(struct mdinfo *array) +{ + /* find the minimum recovery_start in an array for metadata + * formats that only record per-array recovery progress instead + * of per-device + */ + unsigned long long recovery_start = MaxSector; + struct mdinfo *d; + + for (d = array->devs; d; d = d->next) + recovery_start = min(recovery_start, d->recovery_start); + + return recovery_start; +} + +char *devnum2devname(int num) +{ + char name[100]; + if (num >= 0) + sprintf(name, "md%d", num); + else + sprintf(name, "md_d%d", -1-num); + return strdup(name); +} + +int devname2devnum(char *name) +{ + char *ep; + int num; + if (strncmp(name, "md_d", 4)==0) + num = 
-1-strtoul(name+4, &ep, 10); + else + num = strtoul(name+2, &ep, 10); + return num; +} + +int stat2devnum(struct stat *st) +{ + char path[30]; + char link[200]; + char *cp; + int n; + + if ((S_IFMT & st->st_mode) == S_IFBLK) { + if (major(st->st_rdev) == MD_MAJOR) + return minor(st->st_rdev); + else if (major(st->st_rdev) == (unsigned)get_mdp_major()) + return -1- (minor(st->st_rdev)>>MdpMinorShift); + + /* must be an extended-minor partition. Look at the + * /sys/dev/block/%d:%d link which must look like + * ../../block/mdXXX/mdXXXpYY + */ + sprintf(path, "/sys/dev/block/%d:%d", major(st->st_rdev), + minor(st->st_rdev)); + n = readlink(path, link, sizeof(link)-1); + if (n <= 0) + return NoMdDev; + link[n] = 0; + cp = strrchr(link, '/'); + if (cp) *cp = 0; + cp = strchr(link, '/'); + if (cp && strncmp(cp, "/md", 3) == 0) + return devname2devnum(cp+1); + } + return NoMdDev; + +} + +int fd2devnum(int fd) +{ + struct stat stb; + if (fstat(fd, &stb) == 0) + return stat2devnum(&stb); + return NoMdDev; +} + +int mdmon_pid(int devnum) +{ + char path[100]; + char pid[10]; + int fd; + int n; + char *devname = devnum2devname(devnum); + + sprintf(path, "%s/%s.pid", MDMON_DIR, devname); + free(devname); + + fd = open(path, O_RDONLY | O_NOATIME, 0); + + if (fd < 0) + return -1; + n = read(fd, pid, 9); + close(fd); + if (n <= 0) + return -1; + return atoi(pid); +} + +int mdmon_running(int devnum) +{ + int pid = mdmon_pid(devnum); + if (pid <= 0) + return 0; + if (kill(pid, 0) == 0) + return 1; + return 0; +} + +int start_mdmon(int devnum) +{ + int i; + int len; + pid_t pid; + int status; + char pathbuf[1024]; + char *paths[4] = { + pathbuf, + "/sbin/mdmon", + "mdmon", + NULL + }; + + if (check_env("MDADM_NO_MDMON")) + return 0; + + len = readlink("/proc/self/exe", pathbuf, sizeof(pathbuf)); + if (len > 0) { + char *sl; + pathbuf[len] = 0; + sl = strrchr(pathbuf, '/'); + if (sl) + sl++; + else + sl = pathbuf; + strcpy(sl, "mdmon"); + } else + pathbuf[0] = '\0'; + + switch(fork()) 
{ + case 0: + /* FIXME yuk. CLOSE_EXEC?? */ + for (i=3; i < 100; i++) + close(i); + for (i=0; paths[i]; i++) + if (paths[i][0]) + execl(paths[i], "mdmon", + devnum2devname(devnum), + NULL); + exit(1); + case -1: fprintf(stderr, Name ": cannot run mdmon. " + "Array remains readonly\n"); + return -1; + default: /* parent - good */ + pid = wait(&status); + if (pid < 0 || status != 0) + return -1; + } + return 0; +} + +int check_env(char *name) +{ + char *val = getenv(name); + + if (val && atoi(val) == 1) + return 1; + + return 0; +} + +__u32 random32(void) +{ + __u32 rv; + int rfd = open("/dev/urandom", O_RDONLY); + if (rfd < 0 || read(rfd, &rv, 4) != 4) + rv = random(); + if (rfd >= 0) + close(rfd); + return rv; +} + +#ifndef MDASSEMBLE +int flush_metadata_updates(struct supertype *st) +{ + int sfd; + if (!st->updates) { + st->update_tail = NULL; + return -1; + } + + sfd = connect_monitor(devnum2devname(st->container_dev)); + if (sfd < 0) + return -1; + + while (st->updates) { + struct metadata_update *mu = st->updates; + st->updates = mu->next; + + send_message(sfd, mu, 0); + wait_reply(sfd, 0); + free(mu->buf); + free(mu); + } + ack(sfd, 0); + wait_reply(sfd, 0); + close(sfd); + st->update_tail = NULL; + return 0; +} + +void append_metadata_update(struct supertype *st, void *buf, int len) +{ + + struct metadata_update *mu = malloc(sizeof(*mu)); + + mu->buf = buf; + mu->len = len; + mu->space = NULL; + mu->next = NULL; + *st->update_tail = mu; + st->update_tail = &mu->next; +} +#endif /* MDASSEMBLE */ + #ifdef __TINYC__ /* tinyc doesn't optimize this check in ioctl.h out ... */ unsigned int __invalid_size_argument_for_IOC = 0;