#!/bin/sh
#
# Support:      linux-ha@lists.linux-ha.org
# License:      GNU General Public License (GPL)
#
# Filesystem
#      Description: Manages a Filesystem on a shared storage medium.
#  Original Author: Eric Z. Ayers (eric.ayers@compgen.com)
# Original Release: 25 Oct 2000
#
# usage: ./Filesystem {start|stop|status|monitor|validate-all|meta-data}
#
#	OCF parameters are as below:
#		OCF_RESKEY_device
#		OCF_RESKEY_directory
#		OCF_RESKEY_fstype
#		OCF_RESKEY_options
#
#OCF_RESKEY_device    : name of block device for the filesystem. e.g. /dev/sda1, /dev/md0
#			Or a -U or -L option for mount, or an NFS mount specification
#OCF_RESKEY_directory : the mount point for the filesystem
#OCF_RESKEY_fstype    : optional name of the filesystem type. e.g. ext2
#OCF_RESKEY_options   : options to be given to the mount command via -o
#
#
# An example usage in /etc/ha.d/haresources:
#       node1  10.0.0.170 Filesystem::/dev/sda1::/data1::ext2
# or
#       node1  10.0.0.170 Filesystem::-Ldata1::/data1::ext2
# or
#       node1  10.0.0.170 Filesystem::server:/data1::/data1::nfs::ro
#
# This assumes you want to manage a filesystem on a shared (scsi) bus.
# Do not put this filesystem in /etc/fstab.  This script manages all of
# that for you.
#
# If you are interested in High Availability, you will probably also want
# some sort of external hardware RAID controller in front of the actual
# disks.  I don't mean a RAID controller embedded in the host controller -
# it has to be an external controller.
#
# It can also be an internal RAID controller if the controller supports
# failover.  IBM's ServeRAID controller does this, and it automatically
# prohibits concurrent access too, so it's pretty cool in this application.
#
# There is a script for software RAID-1 included in this directory.  Right
# now, I wouldn't recommend using software RAID (see notes in the Raid1 script).
#
# NOTE: There is no locking (such as a SCSI reservation) being done here.
#       I would if the SCSI driver could properly maintain the reservation,
#       which it cannot, even with the 'scsi reservation' patch submitted
#       earlier this year by James Bottomley.  The patch minimizes the
#       bus resets caused by a RESERVATION_CONFLICT return, and helps the
#       reservation stay when 2 nodes contend for a reservation,
#       but it does not attempt to recover the reservation in the
#       case of a bus reset.
#
#       What all this means is that if 2 nodes mount the same file system
#       read-write, the filesystem is going to become corrupted.
#
#	As a result, you should use this together with the stonith option
#	and redundant, independent communications paths.
#
#	If you don't do this, don't blame us when you scramble your disk.
#
#	Note: the ServeRAID controller does prohibit concurrent access.
#	In this case, you don't actually need STONITH, but redundant comm is
#	still an excellent idea.
#

#######################################################################
# Initialization:

. ${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs

#######################################################################

HOSTOS=`uname`

usage() {
	cat <<EOT
usage: $0 {start|stop|status|monitor|validate-all|meta-data}
EOT
}

meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="Filesystem">
<version>1.0</version>

<longdesc lang="en">
Resource script for Filesystem. It manages a Filesystem on a
shared storage medium.
</longdesc>
<shortdesc lang="en">Filesystem resource agent</shortdesc>

<parameters>
<parameter name="device" required="1">
<longdesc lang="en">
The name of block device for the filesystem, or -U, -L options for mount,
or NFS mount specification.
</longdesc>
<shortdesc lang="en">block device</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="directory" required="1">
<longdesc lang="en">
The mount point for the filesystem.
</longdesc>
<shortdesc lang="en">mount point</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="fstype">
<longdesc lang="en">
The optional type of filesystem to be mounted.
</longdesc>
<shortdesc lang="en">filesystem type</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="options">
<longdesc lang="en">
Any extra options to be given as -o options to mount.
</longdesc>
<shortdesc lang="en">options</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="ocfs2_cluster">
<longdesc lang="en">
The name (UUID) of the OCFS2 cluster this filesystem is part of,
iff this is an OCFS2 resource and there's more than one cluster. You
should not need to specify this.
</longdesc>
<shortdesc lang="en">OCFS2 cluster name/UUID</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="ocfs2_configfs">
<longdesc lang="en">
Mountpoint of the cluster hierarchy below configfs. You should not
need to specify this.
</longdesc>
<shortdesc lang="en">OCFS2 configfs root</shortdesc>
<content type="string" default="" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="60" />
<action name="stop" timeout="60" />
<action name="notify" timeout="60" />
<action name="monitor" depth="0" timeout="40" interval="20" start-delay="0" />
<action name="validate-all" timeout="5" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
END
}
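
#
# For hand-testing, the agent can be driven the same way the cluster
# manager drives it: export the OCF_RESKEY_* parameters described above,
# then invoke an action.  A minimal sketch (OCF_ROOT, device and mount
# point are example values only; adjust for the local installation):
#
#	OCF_ROOT=/usr/lib/ocf \
#	OCF_RESKEY_device=/dev/sda1 \
#	OCF_RESKEY_directory=/data1 \
#	OCF_RESKEY_fstype=ext2 \
#	./Filesystem monitor
#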
#
# Make sure the kernel does the right thing with the FS buffers.
# This function should be called after unmounting and before mounting.
# It may not be necessary in 2.4 and later kernels, but it shouldn't hurt
# anything either...
#
# It's really a bug that you have to do this at all...
#
flushbufs() {
	if have_binary $BLOCKDEV ; then
		if [ "$blockdevice" = "yes" ] ; then
			$BLOCKDEV --flushbufs $1
			return $?
		fi
	fi
	return 0
}

# Take advantage of /etc/mtab if present, use the portable mount command
# otherwise. Normalize the output format to "dev mountpoint fstype".
list_mounts() {
	if [ -f "/etc/mtab" -a -r "/etc/mtab" ]; then
		cut -d' ' -f1,2,3 </etc/mtab
	else
		$MOUNT | cut -d' ' -f1,3,5 2>/dev/null
	fi
}

ocfs2_cleanup() {
	# We'll never see the post-stop notification. We're gone now,
	# have unmounted, and thus should remove the membership.
	#
	# (Do so regardless of whether we were unmounted already,
	# because the admin might have manually unmounted but not
	# cleared up the membership directory. Bad admin, no cookie.)
	#
	if [ ! -d "$OCFS2_FS_ROOT" ]; then
		ocf_log info "$OCFS2_FS_ROOT: Filesystem membership already gone."
	else
		ocf_log info "$OCFS2_FS_ROOT: Removing membership directory."
		rm -rf $OCFS2_FS_ROOT/
	fi

	ocfs2_del_cache
}

ocfs2_fetch_uuid() {
	mounted.ocfs2 -d $DEVICE|tail -1|awk '{print $3}'|tr -d -- -|tr '[a-z]' '[A-Z]'
}

ocfs2_set_uuid() {
	_OCFS2_uuid_cache="$HA_RSCTMP/Filesystem.ocfs2_uuid.$(echo $DEVICE|tr / .)"

	if [ "$OP" != "start" -a -e "$_OCFS2_uuid_cache" ]; then
		# Trust the cache.
		OCFS2_UUID=$(cat $_OCFS2_uuid_cache 2>/dev/null)
		return 0
	fi

	OCFS2_UUID=$(ocfs2_fetch_uuid)
	if [ -n "$OCFS2_UUID" -a "$OCFS2_UUID" != "UUID" ]; then
		# UUID valid:
		echo $OCFS2_UUID > $_OCFS2_uuid_cache
		return 0
	fi

	# Ok, no UUID still, but that's alright for stop, because it
	# very likely means we never got started -
	if [ "$OP" = "stop" ]; then
		ocf_log warn "$DEVICE: No UUID; assuming never started!"
		OCFS2_UUID="UUID_NOT_SET"
		return 0
	fi

	# Everything else - wrong:
	ocf_log err "$DEVICE: Could not determine ocfs2 UUID for device."
	exit $OCF_ERR_GENERIC
}

ocfs2_init() {
	# Check & initialize the OCFS2 specific variables.
	if [ $OP != "stop" ]; then
		if [ -z "$OCF_RESKEY_CRM_meta_clone" ]; then
			ocf_log err "ocfs2 must be run as a clone."
			exit $OCF_ERR_GENERIC
		fi
	fi

	if [ $blockdevice = "no" ]; then
		ocf_log err "$DEVICE: ocfs2 needs a block device instead."
		exit $OCF_ERR_GENERIC
	fi

	for f in "$OCF_RESKEY_ocfs2_configfs" /sys/kernel/config/cluster /configfs/cluster ; do
		if [ -n "$f" -a -d "$f" ]; then
			OCFS2_CONFIGFS="$f"
			break
		fi
	done
	if [ ! -d "$OCFS2_CONFIGFS" ]; then
		ocf_log err "ocfs2 needs configfs mounted."
		exit $OCF_ERR_GENERIC
	fi

	ocfs2_set_uuid

	if [ -n "$OCF_RESKEY_ocfs2_cluster" ]; then
		OCFS2_CLUSTER=$(echo $OCF_RESKEY_ocfs2_cluster | tr '[a-z]' '[A-Z]')
	else
		OCFS2_CLUSTER=$(find "$OCFS2_CONFIGFS" -maxdepth 1 -mindepth 1 -type d -printf %f 2>/dev/null)
		set -- $OCFS2_CLUSTER
		local n="$#"
		if [ $n -gt 1 ]; then
			ocf_log err "$OCFS2_CLUSTER: several clusters found."
			exit $OCF_ERR_GENERIC
		fi
		if [ $n -eq 0 ]; then
			ocf_log err "$OCFS2_CONFIGFS: no clusters found."
			exit $OCF_ERR_GENERIC
		fi
	fi

	OCFS2_CLUSTER_ROOT="$OCFS2_CONFIGFS/$OCFS2_CLUSTER"
	if [ ! -d "$OCFS2_CLUSTER_ROOT" ]; then
		ocf_log err "$OCFS2_CLUSTER: Cluster doesn't exist. Maybe o2cb hasn't been run?"
		exit $OCF_ERR_GENERIC
	fi

	OCFS2_FS_ROOT=$OCFS2_CLUSTER_ROOT/heartbeat/$OCFS2_UUID
}
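
#
# For illustration only: the configfs hierarchy ocfs2_init discovers above
# typically looks like this (cluster name, node name and UUID are examples):
#
#	/sys/kernel/config/cluster/MYCLUSTER/node/node1
#	/sys/kernel/config/cluster/MYCLUSTER/heartbeat/ABCDEF0123456789ABCDEF0123456789
#
# i.e. $OCFS2_CLUSTER_ROOT/node/ holds the nodes configured by o2cb, and
# $OCFS2_FS_ROOT ( = $OCFS2_CLUSTER_ROOT/heartbeat/$OCFS2_UUID ) holds this
# filesystem's membership, which Filesystem_notify maintains below.
#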
#
# START: Start up the filesystem
#
Filesystem_start()
{
	if [ "$FSTYPE" = "ocfs2" ]; then
		# "start" now has the notification data available; that
		# we're being started means we didn't get the
		# pre-notification, because we weren't running, so
		# process the information now first.
		ocf_log info "$OCFS2_UUID: Faking pre-notification on start."

		OCF_RESKEY_CRM_meta_notify_type="pre"
		OCF_RESKEY_CRM_meta_notify_operation="start"
		Filesystem_notify
	fi

	# See if the device is already mounted.
	if Filesystem_status >/dev/null 2>&1 ; then
		ocf_log info "Filesystem $MOUNTPOINT is already mounted."
		return $OCF_SUCCESS
	fi

	if [ "X${HOSTOS}" != "XOpenBSD" ];then
		# Insert SCSI module
		# TODO: This probably should go away. Why should the filesystem
		# RA magically load a kernel module?
		$MODPROBE scsi_hostadapter >/dev/null 2>&1

		if [ -z "$FSTYPE" ]; then
			: No FSTYPE specified, rely on the system having the right file-system support already
		else
			# Insert Filesystem module
			$MODPROBE $FSTYPE >/dev/null 2>&1
			grep -e "$FSTYPE"'$' /proc/filesystems >/dev/null
			if [ $? -ne 0 ] ; then
				ocf_log err "Couldn't find filesystem $FSTYPE in /proc/filesystems"
				return $OCF_ERR_ARGS
			fi
		fi
	fi

	# Check the filesystem & auto repair.
	# NOTE: Some filesystem types don't need this step... Please modify
	#       accordingly.
	if [ $blockdevice = "yes" ]; then
		if [ "$DEVICE" != "/dev/null" -a ! -b "$DEVICE" ] ; then
			ocf_log err "Couldn't find device [$DEVICE]. Expected /dev/??? to exist"
			exit $OCF_ERR_ARGS
		fi

		if
			case $FSTYPE in
			ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs|cifs|smbfs|ocfs2|lustre) false;;
			*) true;;
			esac
		then
			ocf_log info "Starting filesystem check on $DEVICE"
			if [ -z "$FSTYPE" ]; then
				$FSCK -p $DEVICE
			else
				$FSCK -t $FSTYPE -p $DEVICE
			fi

			# NOTE: if any errors at all are detected, fsck returns non-zero;
			# if the error is >= 4 then there is a big problem.
			if [ $? -ge 4 ]; then
				ocf_log err "Couldn't successfully fsck filesystem for $DEVICE"
				return $OCF_ERR_GENERIC
			fi
		fi
	fi

	if [ ! -d "$MOUNTPOINT" ] ; then
		ocf_log err "Couldn't find directory [$MOUNTPOINT] to use as a mount point"
		exit $OCF_ERR_ARGS
	fi

	flushbufs $DEVICE
	# Mount the filesystem.
	if [ -z "$FSTYPE" ]; then
		$MOUNT $options $DEVICE $MOUNTPOINT
	else
		$MOUNT -t $FSTYPE $options $DEVICE $MOUNTPOINT
	fi

	if [ $? -ne 0 ]; then
		ocf_log err "Couldn't mount filesystem $DEVICE on $MOUNTPOINT"
		if [ "$FSTYPE" = "ocfs2" ]; then
			ocfs2_cleanup
		fi
		return $OCF_ERR_GENERIC
	fi

	return 0
} # end of Filesystem_start

Filesystem_notify() {
	# Process notifications; this is the essential glue level for
	# giving user-space membership events to a cluster-aware
	# filesystem. Right now, only OCFS2 is supported.
	#
	# When we get a pre-start notification, we set up all the nodes
	# which will be active in our membership for the filesystem.
	# (For the resource to be started, this happens at the time of
	# the actual 'start' operation.)
	#
	# At a post-start, actually there's nothing to do for us really,
	# but no harm done in re-syncing either.
	#
	# pre-stop is meaningless; we can't remove any node yet, it
	# first needs to unmount.
	#
	# post-stop: the node is removed from the membership of the
	# other nodes.
	#
	# Note that this expects that the base cluster is already
	# active; ie o2cb has been started and populated
	# $OCFS2_CLUSTER_ROOT/node/ already. This can be achieved by
	# simply having o2cb run on all nodes by the CRM too. This
	# probably ought to be mentioned somewhere in the to be written
	# documentation. ;-)
	#
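	# A sketch of what the CRM hands us here (the variable names are the
	# ones read below; the node names are hypothetical): a pre-start
	# notification adding node2 alongside an already-active node1 would
	# arrive roughly as
	#	OCF_RESKEY_CRM_meta_notify_type=pre
	#	OCF_RESKEY_CRM_meta_notify_operation=start
	#	OCF_RESKEY_CRM_meta_notify_active_uname="node1"
	#	OCF_RESKEY_CRM_meta_notify_start_uname="node2"
	# and the merging below would leave n_active="node1 node2".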
	if [ "$FSTYPE" != "ocfs2" ]; then
		# One of the cases which shouldn't occur; it should have
		# been caught much earlier. Still, you know ...
		ocf_log warn "$DEVICE: Notification received for non-ocfs2 mount."
		return $OCF_ERR_UNIMPLEMENTED
	fi

	local n_type="$OCF_RESKEY_CRM_meta_notify_type"
	local n_op="$OCF_RESKEY_CRM_meta_notify_operation"
	local n_active="$OCF_RESKEY_CRM_meta_notify_active_uname"
	local n_stop="$OCF_RESKEY_CRM_meta_notify_stop_uname"
	local n_start="$OCF_RESKEY_CRM_meta_notify_start_uname"

	ocf_log info "$OCFS2_UUID: notify: $n_type for $n_op"
	ocf_log info "$OCFS2_UUID: notify active: $n_active"
	ocf_log info "$OCFS2_UUID: notify stop: $n_stop"
	ocf_log info "$OCFS2_UUID: notify start: $n_start"

	case "$n_type" in
	pre)
		case "$n_op" in
		stop)
			ocf_log info "$OCFS2_UUID: ignoring pre-notify for stop."
			return $OCF_SUCCESS
			;;
		start)
			# These are about to become active; prepare to
			# communicate with them.
			# Duplicate removal - start can contain nodes
			# already on the active list, confusing the
			# script later on:
			for UNAME in $n_active; do
				n_start=`echo ${n_start} | sed s/$UNAME//`
			done
			# Merge pruned lists again:
			n_active="$n_active $n_start"
			;;
		esac
		;;
	post)
		case "$n_op" in
		stop)
			# remove unames from notify_stop_uname; these have been
			# stopped and can no longer be considered active.
			for UNAME in $n_stop; do
				n_active=`echo ${n_active} | sed s/$UNAME//`
			done
			;;
		start)
			if [ "$n_op" = "start" ]; then
				ocf_log info "$OCFS2_UUID: ignoring post-notify for start."
				return $OCF_SUCCESS
			fi
			;;
		esac
		;;
	esac

	ocf_log info "$OCFS2_UUID: post-processed active: $n_active"

	local n_myself=${HA_CURHOST:-$(uname -n | tr '[A-Z]' '[a-z]')}
	ocf_log info "$OCFS2_UUID: I am node $n_myself."

	case " $n_active " in
	*" $n_myself "*) ;;
	*)	ocf_log err "$OCFS2_UUID: $n_myself (local) not on active list!"
		return $OCF_ERR_GENERIC
		;;
	esac

	if [ -d "$OCFS2_FS_ROOT" ]; then
		entry_prefix=$OCFS2_FS_ROOT/
		for entry in $OCFS2_FS_ROOT/* ; do
			n_fs="${entry##$entry_prefix}"
			# ocf_log info "$OCFS2_UUID: Found current node $n_fs"
			case " $n_active " in
			*" $n_fs "*)
				# Construct a list of nodes which are present
				# already in the membership.
				n_exists="$n_exists $n_fs"
				ocf_log info "$OCFS2_UUID: Keeping node: $n_fs"
				;;
			*)
				# Node is in the membership currently, but not on our
				# active list. Must be removed.
				if [ "$n_op" = "start" ]; then
					ocf_log warn "$OCFS2_UUID: Removing nodes on start"
				fi
				ocf_log info "$OCFS2_UUID: Removing dead node: $n_fs"
				if ! rm -f $entry ; then
					ocf_log err "$OCFS2_UUID: Removal of $n_fs failed!"
				fi
				;;
			esac
		done
	else
		ocf_log info "$OCFS2_UUID: heartbeat directory doesn't exist yet, creating."
		mkdir -p $OCFS2_FS_ROOT
	fi

	ocf_log info "$OCFS2_UUID: Existing node list: $n_exists"

	# (2)
	for entry in $n_active ; do
		# ocf_log info "$OCFS2_UUID: Expected active node: $entry"
		case " $n_exists " in
		*" $entry "*)
			ocf_log info "$OCFS2_UUID: Already active: $entry"
			;;
		*)
			if [ "$n_op" = "stop" ]; then
				ocf_log warn "$OCFS2_UUID: Adding nodes on stop"
			fi
			ocf_log info "$OCFS2_UUID: Activating node: $entry"
			if ! ln -s $OCFS2_CLUSTER_ROOT/node/$entry $OCFS2_FS_ROOT/$entry ; then
				ocf_log err "$OCFS2_CLUSTER_ROOT/node/$entry: failed to link"
			fi
			;;
		esac
	done
}

#
# STOP: Unmount the filesystem
#
Filesystem_stop()
{
	# See if the device is currently mounted
	Filesystem_status >/dev/null 2>&1
	if [ $? -eq $OCF_NOT_RUNNING ]; then
		# Already unmounted, wonderful.
		rc=$OCF_SUCCESS
	else
		# Determine the real blockdevice this is mounted on (if
		# possible) prior to unmounting.
		determine_blockdevice

		# For networked filesystems, there's merit in trying -f:
		case "$FSTYPE" in
		nfs|cifs|smbfs) umount_force="-f" ;;
		esac

		# Umount all sub-filesystems mounted under $MOUNTPOINT/ too.
		for SUB in `list_submounts $MOUNTPOINT` $MOUNTPOINT; do
			ocf_log info "Trying to unmount $SUB"
			for sig in SIGTERM SIGTERM SIGTERM SIGKILL SIGKILL SIGKILL; do
				if $UMOUNT $umount_force $SUB ; then
					rc=$OCF_SUCCESS
					ocf_log info "unmounted $SUB successfully"
					break
				else
					rc=$OCF_ERR_GENERIC
					ocf_log err "Couldn't unmount $SUB; trying cleanup with $sig"
					# fuser returns a non-zero return code if none of the
					# specified files is accessed or in case of a fatal
					# error.
					if [ "X${HOSTOS}" = "XOpenBSD" ];then
						PIDS=`fstat | grep ${SUB} | awk '{print $3}'`
						for PID in ${PIDS};do
							kill -9 ${PID}
							ocf_log info "Sent kill -9 to ${PID}"
						done
					else
						if $FUSER -$sig -m -k $SUB ; then
							ocf_log info "Some processes on $SUB were signalled"
						else
							ocf_log info "No processes on $SUB were signalled"
						fi
					fi
					sleep 1
				fi
			done

			if [ $rc -ne $OCF_SUCCESS ]; then
				ocf_log err "Couldn't unmount $SUB, giving up!"
			fi
		done
	fi

	flushbufs $DEVICE

	if [ "$FSTYPE" = "ocfs2" ]; then
		ocfs2_init
		ocfs2_cleanup
	fi

	return $rc
} # end of Filesystem_stop

#
# STATUS: is the filesystem mounted or not?
#
Filesystem_status()
{
	if list_mounts | grep -q " $MOUNTPOINT " >/dev/null 2>&1; then
		rc=$OCF_SUCCESS
		msg="$MOUNTPOINT is mounted (running)"
	else
		rc=$OCF_NOT_RUNNING
		msg="$MOUNTPOINT is unmounted (stopped)"
	fi

	# TODO: For ocfs2, or other cluster filesystems, should we be
	# checking connectivity to other nodes here, or the IO path to
	# the storage?

	# Special case "monitor" to check whether the UUID cached and
	# on-disk still match?

	case "$OP" in
	status)	ocf_log info "$msg";;
	esac

	return $rc
} # end of Filesystem_status

#
# VALIDATE_ALL: Are the instance parameters valid?
# FIXME!! The only part that's useful is the return code.
# This code always returns $OCF_SUCCESS (!)
#
Filesystem_validate_all()
{
	if [ -n "$MOUNTPOINT" -a ! -d "$MOUNTPOINT" ]; then
		ocf_log warn "Mountpoint $MOUNTPOINT does not exist"
	fi

	# Check if the $FSTYPE is workable
	# NOTE: Without inserting the $FSTYPE module, this step may be imprecise
	# TODO: This is Linux specific crap.
	if [ ! -z "$FSTYPE" ]; then
		cut -f2 /proc/filesystems |grep -q ^$FSTYPE$
		if [ $? -ne 0 ]; then
			modpath=/lib/modules/`uname -r`
			moddep=$modpath/modules.dep
			# Do we have $FSTYPE in modules.dep?
			cut -d' ' -f1 $moddep |grep -q "^$modpath.*$FSTYPE\.k\?o:$"
			if [ $? -ne 0 ]; then
				ocf_log info "It seems we do not have $FSTYPE support"
			fi
		fi
	fi

	#TODO: How to check the $options ?
	return $OCF_SUCCESS
}

# Check the arguments passed to this script
if [ $# -ne 1 ]; then
	usage
	exit $OCF_ERR_ARGS
fi

# Check the OCF_RESKEY_ environment variables...
DEVICE=$OCF_RESKEY_device
FSTYPE=$OCF_RESKEY_fstype
if [ ! -z "$OCF_RESKEY_options" ]; then
	options="-o $OCF_RESKEY_options"
fi

OP=$1

# These operations do not require instance parameters
case $OP in
meta-data)	meta_data
		exit $OCF_SUCCESS
		;;
usage)		usage
		exit $OCF_SUCCESS
		;;
esac

blockdevice=no
case $DEVICE in
"")
	ocf_log err "Please set OCF_RESKEY_device to the device to be managed"
	exit $OCF_ERR_ARGS
	;;
-*)
	# Oh... An option to mount instead... Typically -U or -L
	;;
[^/]*:/*)
	# An NFS filesystem specification...
	;;
//[^/]*/*)
	# An SMB filesystem specification...
	;;
/dev/null)
	# Special case for BSC
	blockdevice=yes
	;;
*)
	if [ ! -b "$DEVICE" -a ! -d "$DEVICE" -a "X$OP" != Xstart ] ; then
		ocf_log warn "Couldn't find device [$DEVICE]. Expected /dev/??? to exist"
	fi
	if [ ! -d "$DEVICE" ];then
		blockdevice=yes
	fi
	;;
esac
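
# For illustration (device strings are examples only), the case above
# classifies the supported specifications roughly as:
#	/dev/sda1        -> blockdevice=yes   (real block device)
#	-Ldata1          -> blockdevice=no    (mount -L/-U style option)
#	server:/data1    -> blockdevice=no    (NFS specification)
#	//server/share   -> blockdevice=no    (SMB/CIFS specification)
#	/dev/null        -> blockdevice=yes   (special case for BSC)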

# Normalize instance parameters:
# It is possible that OCF_RESKEY_directory has one or even multiple trailing "/",
# but the output of `mount` and /proc/mounts does not.
if [ -z "$OCF_RESKEY_directory" ]; then
	if [ X$OP = "Xstart" -o $blockdevice = "no" ]; then
		ocf_log err "Please specify the directory"
		exit $OCF_ERR_ARGS
	fi
else
	MOUNTPOINT=$(echo $OCF_RESKEY_directory | sed 's/\/*$//')
	: ${MOUNTPOINT:=/}
	# At this stage, $MOUNTPOINT does not contain a trailing "/" unless it is "/".
	# TODO: / mounted via Filesystem sounds dangerous. On stop, we'll
	# kill the whole system. Is that a good idea?
fi

# Check to make sure the utilities are found
if [ "X${HOSTOS}" != "XOpenBSD" ];then
	check_binary $MODPROBE
	check_binary $FUSER
fi
check_binary $FSCK
check_binary $MOUNT
check_binary $UMOUNT

if [ "$OP" != "monitor" ]; then
	ocf_log info "Running $OP for $DEVICE on $MOUNTPOINT"
fi

# These operations do not require the clone checking + OCFS2
# initialization.
case $OP in
status|monitor)	Filesystem_status
		exit $?
		;;
validate-all)	Filesystem_validate_all
		exit $?
		;;
stop)		Filesystem_stop
		exit $?
		;;
esac

case $FSTYPE in
ocfs2)	ocfs2_init
	;;
nfs)	: # this is kind of safe too
	;;
*)	if [ -n "$OCF_RESKEY_CRM_meta_clone" ]; then
		ocf_log err "DANGER! $FSTYPE on $DEVICE is NOT cluster-aware!"
		ocf_log err "DO NOT RUN IT AS A CLONE!"
		ocf_log err "Politely refusing to proceed to avoid data corruption."
		exit $OCF_ERR_GENERIC
	fi
	;;
esac

case $OP in
start)	Filesystem_start
	;;
notify)	Filesystem_notify
	;;
*)	usage
	exit $OCF_ERR_UNIMPLEMENTED
	;;
esac
exit $?