Support ephemeral storage on EKS

Bug #2046142 reported by DingGGu
Affects: cloud-images
Status: Confirmed
Importance: Undecided
Assigned to: Unassigned

Bug Description

In EKS, using instance storage (local NVMe disks) can be faster than EBS.

The Amazon EKS AMI already ships a helper script that makes ephemeral storage easy to control:

https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#ephemeral-storage

If the Ubuntu EKS AMI (or the Ubuntu AMI in general) included this shell script, it would be very helpful for operating ephemeral storage on AWS EC2.

# content of /bin/setup-local-disks

```
#!/usr/bin/env bash

set -o errexit
set -o pipefail
set -o nounset

err_report() {
  echo "Exited with error on line $1"
}
trap 'err_report $LINENO' ERR

print_help() {
  echo "usage: $0 <raid0 | mount>"
  echo "Sets up Amazon EC2 Instance Store NVMe disks"
  echo ""
  echo "-d, --dir directory to mount the filesystem(s) (default: /mnt/k8s-disks/)"
  echo "-h, --help print this help"
}

# Sets up a RAID-0 of NVMe instance storage disks, moves
# the contents of /var/lib/kubelet and /var/lib/containerd
# to the new mounted RAID, and bind mounts the kubelet and
# containerd state directories.
maybe_raid0() {
  local md_name="kubernetes"
  local md_device="/dev/md/${md_name}"
  local md_config="/.aws/mdadm.conf"
  local array_mount_point="${MNT_DIR}/0"
  mkdir -p "$(dirname "${md_config}")"

  if [[ ! -s "${md_config}" ]]; then
    mdadm --create --force --verbose \
      "${md_device}" \
      --level=0 \
      --name="${md_name}" \
      --raid-devices="${#EPHEMERAL_DISKS[@]}" \
      "${EPHEMERAL_DISKS[@]}"
    while [ -n "$(mdadm --detail "${md_device}" | grep -ioE 'State :.*resyncing')" ]; do
      echo "Raid is resyncing..."
      sleep 1
    done
    mdadm --detail --scan > "${md_config}"
  fi

  ## Check if the device symlink has changed on reboot to include a homehost identifier
  local current_md_device=$(find /dev/md/ -type l -regex ".*/${md_name}_?[0-9a-z]*$" | tail -n1)
  if [[ ! -z ${current_md_device} ]]; then
    md_device="${current_md_device}"
  fi

  # Format the array if not already formatted.
  if [[ -z "$(lsblk "${md_device}" -o fstype --noheadings)" ]]; then
    ## By default, mkfs tries to use the stripe unit of the array (512k),
    ## for the log stripe unit, but the max log stripe unit is 256k.
    ## So instead, we use 32k (8 blocks) to avoid a warning of breaching the max.
    ## mkfs.xfs defaults to 32k after logging the warning since the default log buffer size is 32k.
    mkfs.xfs -l su=8b "${md_device}"
  fi

  ## Create the mount directory
  mkdir -p "${array_mount_point}"

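  ## Mount the array through a systemd mount unit addressed by filesystem UUID,
  ## so the mount is enabled now and persists across reboots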
  local dev_uuid=$(blkid -s UUID -o value "${md_device}")
  local mount_unit_name="$(systemd-escape --path --suffix=mount "${array_mount_point}")"
  cat > "/etc/systemd/system/${mount_unit_name}" << EOF
  [Unit]
  Description=Mount EC2 Instance Store NVMe disk RAID0
  [Mount]
  What=UUID=${dev_uuid}
  Where=${array_mount_point}
  Type=xfs
  Options=defaults,noatime
  [Install]
  WantedBy=multi-user.target
EOF
  systemd-analyze verify "${mount_unit_name}"
  systemctl enable "${mount_unit_name}" --now

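  ## Track which services must be stopped/restarted and which state
  ## directories still need to be migrated onto the array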
  prev_running=""
  needs_linked=""
  for unit in "kubelet" "containerd"; do
    ## Check if the bind mount from the RAID already exists
    if [[ "$(systemctl is-active var-lib-${unit}.mount)" != "active" ]]; then
      # Check if components that depend on the RAID are running and, if so, stop them
      if systemctl is-active "${unit}" > /dev/null 2>&1; then
        prev_running+=" ${unit}"
      fi
      needs_linked+=" /var/lib/${unit}"
    fi
  done

  ## Check if /var/log/pods has been bind mounted and make sure kubelet is stopped
  if [[ "$(systemctl is-active var-log-pods.mount)" != "active" ]]; then
    if systemctl is-active "kubelet" > /dev/null 2>&1; then
      prev_running+=" kubelet"
    fi
    needs_linked+=" /var/log/pods"
  fi

  if [[ ! -z "${prev_running}" ]]; then
    systemctl stop ${prev_running}
  fi

  # Transfer state directories to the array, if they exist.
  for mount_point in ${needs_linked}; do
    local unit="$(basename "${mount_point}")"
    local array_mount_point_unit="${array_mount_point}/${unit}"
    mkdir -p "${mount_point}"
    echo "Copying ${mount_point}/ to ${array_mount_point_unit}/"
    cp -a "${mount_point}/" "${array_mount_point_unit}/"
    local mount_unit_name="$(systemd-escape --path --suffix=mount "${mount_point}")"
    cat > "/etc/systemd/system/${mount_unit_name}" << EOF
      [Unit]
      Description=Mount ${unit} on EC2 Instance Store NVMe RAID0
      [Mount]
      What=${array_mount_point_unit}
      Where=${mount_point}
      Type=none
      Options=bind
      [Install]
      WantedBy=multi-user.target
EOF
    systemd-analyze verify "${mount_unit_name}"
    systemctl enable "${mount_unit_name}" --now
  done

  if [[ ! -z "${prev_running}" ]]; then
    systemctl start ${prev_running}
  fi
}

# Mounts and creates xfs file systems on all EC2 instance store NVMe disks
# without existing file systems. Mounts in /mnt/k8s-disks/{1..} by default
maybe_mount() {
  idx=1
  for dev in "${EPHEMERAL_DISKS[@]}"; do
    if [[ -z "$(lsblk "${dev}" -o fstype --noheadings)" ]]; then
      mkfs.xfs -l su=8b "${dev}"
    fi
    if [[ ! -z "$(lsblk "${dev}" -o MOUNTPOINT --noheadings)" ]]; then
      echo "${dev} is already mounted."
      continue
    fi
    local mount_point="${MNT_DIR}/${idx}"
    local mount_unit_name="$(systemd-escape --path --suffix=mount "${mount_point}")"
    mkdir -p "${mount_point}"
    cat > "/etc/systemd/system/${mount_unit_name}" << EOF
    [Unit]
    Description=Mount EC2 Instance Store NVMe disk ${idx}
    [Mount]
    What=${dev}
    Where=${mount_point}
    Type=xfs
    Options=defaults,noatime
    [Install]
    WantedBy=multi-user.target
EOF
    systemd-analyze verify "${mount_unit_name}"
    systemctl enable "${mount_unit_name}" --now
    idx=$((idx + 1))
  done
}

## Main logic
MNT_DIR="/mnt/k8s-disks"

while [[ $# -gt 0 ]]; do
  key="$1"
  case $key in
    -h | --help)
      print_help
      exit 0
      ;;
    -d | --dir)
      MNT_DIR="$2"
      shift
      shift
      ;;
    *) # unknown option
      POSITIONAL+=("$1") # save it in an array for later
      shift # past argument
      ;;
  esac
done

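## POSITIONAL may be empty if no positional arguments were given;
## relax nounset while restoring the saved positional parameters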
set +u
set -- "${POSITIONAL[@]}" # restore positional parameters
DISK_SETUP="$1"
set -u

if [[ "${DISK_SETUP}" != "raid0" && "${DISK_SETUP}" != "mount" ]]; then
  echo "Valid disk setup options are: raid0 or mount"
  exit 1
fi

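## Locate instance store NVMe disks via their persistent /dev/disk/by-id symlinks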
disks=($(find -L /dev/disk/by-id/ -xtype l -name '*NVMe_Instance_Storage_*'))
## Bail early if there are no ephemeral disks to setup
if [[ "${#disks[@]}" -eq 0 ]]; then
  echo "no ephemeral disks found, skipping disk setup"
  exit 0
fi

if [ "$(id --user)" -ne 0 ]; then
  echo "Must be run as root"
  exit 1
fi

## Get devices of NVMe instance storage ephemeral disks
EPHEMERAL_DISKS=($(realpath "${disks[@]}" | sort -u))

case "${DISK_SETUP}" in
  "raid0")
    maybe_raid0
    echo "Successfully setup RAID-0 consisting of ${EPHEMERAL_DISKS[@]}"
    ;;
  "mount")
    maybe_mount
    echo "Successfully setup disk mounts consisting of ${EPHEMERAL_DISKS[@]}"
    ;;
esac
```
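
For reference, the script takes a single positional argument, per its help text above. A minimal sketch of how it could be invoked (calling it from EC2 user data or an early boot step is an assumption about deployment, not something the script itself requires):

```
# Must run as root (the script exits otherwise). It would typically be called
# from EC2 user data or an early boot step, before kubelet/containerd start.

# Stripe all instance store NVMe disks into one RAID-0 and bind-mount
# /var/lib/kubelet, /var/lib/containerd, and /var/log/pods onto it:
/bin/setup-local-disks raid0

# Or format and mount each disk individually under /mnt/k8s-disks/<n>:
/bin/setup-local-disks mount

# The mount directory can be overridden with -d/--dir:
/bin/setup-local-disks mount --dir /data/local-disks
```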

Tags: cpc-3580
Changed in cloud-images:
status: New → Confirmed
tags: added: cpc-3580