diff '--exclude=.git' -Npaur ubuntu/zfs-linux/debian/changelog zfs-linux/debian/changelog
--- ubuntu/zfs-linux/debian/changelog	2020-05-16 17:46:19.015566898 -0400
+++ zfs-linux/debian/changelog	2020-05-02 16:21:21.000000000 -0400
@@ -1,13 +1,8 @@
-zfs-linux (0.8.3-1ubuntu13) groovy; urgency=medium
+zfs-linux (0.8.4-1ubuntu1) UNRELEASED; urgency=medium
 
-  * Backport AES-GCM performance accelleration
-    - backport of upstream zfs commit 31b160f0a6c673c8f926233af2ed6d5354808393
-      ("ICP: Improve AES-GCM performance").
-      tests on a memory backed pool show performance improvements of ~15-22%
-      for AES-CCM writes, ~17-20% AES-CCM reads, 34-36% AES-GCM writes and
-      ~79-80% AES-GCM reads.
+  * Initial 0.8.4 test
 
- -- Colin Ian King  Tue, 5 May 2020 15:53:12 +0100
+ -- Satadru Pramanik  Sat, 02 May 2020 16:21:01 -0400
 
 zfs-linux (0.8.3-1ubuntu12) focal; urgency=medium
 
diff '--exclude=.git' -Npaur ubuntu/zfs-linux/debian/patches/2100-zfs-load-module.patch zfs-linux/debian/patches/2100-zfs-load-module.patch
--- ubuntu/zfs-linux/debian/patches/2100-zfs-load-module.patch	2020-05-16 17:46:48.143546632 -0400
+++ zfs-linux/debian/patches/2100-zfs-load-module.patch	1969-12-31 19:00:00.000000000 -0500
@@ -1,79 +0,0 @@
-Index: zfs-linux-0.8.2/etc/systemd/system/Makefile.am
-===================================================================
---- zfs-linux-0.8.2.orig/etc/systemd/system/Makefile.am
-+++ zfs-linux-0.8.2/etc/systemd/system/Makefile.am
-@@ -2,6 +2,7 @@ systemdpreset_DATA = \
- 	50-zfs.preset
-
- systemdunit_DATA = \
-+	zfs-load-module.service \
- 	zfs-zed.service \
- 	zfs-import-cache.service \
- 	zfs-import-scan.service \
-@@ -13,6 +14,7 @@ systemdunit_DATA = \
- 	zfs.target
-
- EXTRA_DIST = \
-+	$(top_srcdir)/etc/systemd/system/zfs-load-module.service.in \
- 	$(top_srcdir)/etc/systemd/system/zfs-zed.service.in \
- 	$(top_srcdir)/etc/systemd/system/zfs-import-cache.service.in \
- 	$(top_srcdir)/etc/systemd/system/zfs-import-scan.service.in \
-Index: zfs-linux-0.8.2/etc/systemd/system/zfs-import-cache.service.in
-===================================================================
---- zfs-linux-0.8.2.orig/etc/systemd/system/zfs-import-cache.service.in
-+++ zfs-linux-0.8.2/etc/systemd/system/zfs-import-cache.service.in
-@@ -3,7 +3,9 @@ Description=Import ZFS pools by cache fi
- Documentation=man:zpool(8)
- DefaultDependencies=no
- Requires=systemd-udev-settle.service
-+Requires=zfs-load-module.service
- After=systemd-udev-settle.service
-+After=zfs-load-module.service
- After=cryptsetup.target
- After=systemd-remount-fs.service
- Before=zfs-import.target
-Index: zfs-linux-0.8.2/etc/systemd/system/zfs-import-scan.service.in
-===================================================================
---- zfs-linux-0.8.2.orig/etc/systemd/system/zfs-import-scan.service.in
-+++ zfs-linux-0.8.2/etc/systemd/system/zfs-import-scan.service.in
-@@ -3,7 +3,9 @@ Description=Import ZFS pools by device s
- Documentation=man:zpool(8)
- DefaultDependencies=no
- Requires=systemd-udev-settle.service
-+Requires=zfs-load-module.service
- After=systemd-udev-settle.service
-+Requires=zfs-load-module.service
- After=cryptsetup.target
- Before=zfs-import.target
- ConditionPathExists=!@sysconfdir@/zfs/zpool.cache
-Index: zfs-linux-0.8.2/etc/systemd/system/zfs-load-module.service.in
-===================================================================
---- /dev/null
-+++ zfs-linux-0.8.2/etc/systemd/system/zfs-load-module.service.in
-@@ -0,0 +1,17 @@
-+[Unit]
-+Description=Install ZFS kernel module
-+DefaultDependencies=no
-+Requires=systemd-udev-settle.service
-+After=systemd-udev-settle.service
-+After=cryptsetup.target
-+Before=dracut-mount.service
-+After=systemd-remount-fs.service
-+
-+[Service]
-+Type=oneshot
-+RemainAfterExit=yes
-+ExecStart=/sbin/modprobe zfs
-+
-+[Install]
-+WantedBy=zfs-mount.service
-+WantedBy=zfs.target
-Index: zfs-linux-0.8.2/etc/systemd/system/50-zfs.preset.in
-===================================================================
---- zfs-linux-0.8.2.orig/etc/systemd/system/50-zfs.preset.in
-+++ zfs-linux-0.8.2/etc/systemd/system/50-zfs.preset.in
-@@ -7,3 +7,4 @@ enable zfs-share.service
- enable zfs-zed.service
- enable zfs-volume-wait.service
- enable zfs.target
-+enable zfs-load-module.service
diff '--exclude=.git' -Npaur ubuntu/zfs-linux/debian/patches/2101-zfs-load-module.patch zfs-linux/debian/patches/2101-zfs-load-module.patch
--- ubuntu/zfs-linux/debian/patches/2101-zfs-load-module.patch	1969-12-31 19:00:00.000000000 -0500
+++ zfs-linux/debian/patches/2101-zfs-load-module.patch	2020-05-13 13:20:56.000000000 -0400
@@ -0,0 +1,79 @@
+Index: zfs-linux-0.8.2/etc/systemd/system/Makefile.am
+===================================================================
+--- zfs-linux-0.8.2.orig/etc/systemd/system/Makefile.am
++++ zfs-linux-0.8.2/etc/systemd/system/Makefile.am
+@@ -2,6 +2,7 @@ systemdpreset_DATA = \
+ 	50-zfs.preset
+
+ systemdunit_DATA = \
++	zfs-load-module.service \
+ 	zfs-zed.service \
+ 	zfs-import-cache.service \
+ 	zfs-import-scan.service \
+@@ -13,6 +14,7 @@ systemdunit_DATA = \
+ 	zfs.target
+
+ EXTRA_DIST = \
++	$(top_srcdir)/etc/systemd/system/zfs-load-module.service.in \
+ 	$(top_srcdir)/etc/systemd/system/zfs-zed.service.in \
+ 	$(top_srcdir)/etc/systemd/system/zfs-import-cache.service.in \
+ 	$(top_srcdir)/etc/systemd/system/zfs-import-scan.service.in \
+Index: zfs-linux-0.8.2/etc/systemd/system/zfs-import-cache.service.in
+===================================================================
+--- zfs-linux-0.8.2.orig/etc/systemd/system/zfs-import-cache.service.in
++++ zfs-linux-0.8.2/etc/systemd/system/zfs-import-cache.service.in
+@@ -3,7 +3,9 @@ Description=Import ZFS pools by cache fi
+ Documentation=man:zpool(8)
+ DefaultDependencies=no
+ Requires=systemd-udev-settle.service
++Requires=zfs-load-module.service
+ After=systemd-udev-settle.service
++After=zfs-load-module.service
+ After=cryptsetup.target
+ After=multipathd.target
+ After=systemd-remount-fs.service
+Index: zfs-linux-0.8.2/etc/systemd/system/zfs-import-scan.service.in
+===================================================================
+--- zfs-linux-0.8.2.orig/etc/systemd/system/zfs-import-scan.service.in
++++ zfs-linux-0.8.2/etc/systemd/system/zfs-import-scan.service.in
+@@ -3,7 +3,9 @@ Description=Import ZFS pools by device s
+ Documentation=man:zpool(8)
+ DefaultDependencies=no
+ Requires=systemd-udev-settle.service
++Requires=zfs-load-module.service
+ After=systemd-udev-settle.service
++After=zfs-load-module.service
+ After=cryptsetup.target
+ After=multipathd.target
+ Before=zfs-import.target
+Index: zfs-linux-0.8.2/etc/systemd/system/zfs-load-module.service.in
+===================================================================
+--- /dev/null
++++ zfs-linux-0.8.2/etc/systemd/system/zfs-load-module.service.in
+@@ -0,0 +1,17 @@
++[Unit]
++Description=Install ZFS kernel module
++DefaultDependencies=no
++Requires=systemd-udev-settle.service
++After=systemd-udev-settle.service
++After=cryptsetup.target
++Before=dracut-mount.service
++After=systemd-remount-fs.service
++
++[Service]
++Type=oneshot
++RemainAfterExit=yes
++ExecStart=/sbin/modprobe zfs
++
++[Install]
++WantedBy=zfs-mount.service
++WantedBy=zfs.target
+Index: zfs-linux-0.8.2/etc/systemd/system/50-zfs.preset.in
+===================================================================
+--- zfs-linux-0.8.2.orig/etc/systemd/system/50-zfs.preset.in
++++ zfs-linux-0.8.2/etc/systemd/system/50-zfs.preset.in
+@@ -7,3 +7,4 @@ enable zfs-share.service
+ enable zfs-zed.service
+ enable zfs-volume-wait.service
+ enable zfs.target
++enable zfs-load-module.service
diff '--exclude=.git' -Npaur ubuntu/zfs-linux/debian/patches/4000-zsys-support.patch zfs-linux/debian/patches/4000-zsys-support.patch
--- ubuntu/zfs-linux/debian/patches/4000-zsys-support.patch	2020-05-16 17:46:48.147546638 -0400
+++ zfs-linux/debian/patches/4000-zsys-support.patch	1969-12-31 19:00:00.000000000 -0500
@@ -1,257 +0,0 @@
-Description: Support zsys systems
- Zsys is an enhanced and structured dataset layout for ZFS.
- .
- It enables advanced use cases by differentiating system,
- user data and persistent partitions to allow only partial
- permanent or temporary rollback without destroying intermediate
- snapshots.
-Author: Jean-Baptiste Lallement
- Didier Roche
-Last-Update: 2019-06-06
-Index: zfs-linux-0.8.3/contrib/initramfs/scripts/zfs.in
-===================================================================
---- zfs-linux-0.8.3.orig/contrib/initramfs/scripts/zfs.in
-+++ zfs-linux-0.8.3/contrib/initramfs/scripts/zfs.in
-@@ -71,6 +71,20 @@ get_fs_value()
- 	"${ZFS}" get -H -ovalue $value "$fs" 2> /dev/null
- }
-
-+# Get a ZFS filesystem property value with the source stripped from the value
-+get_fs_value_without_source()
-+{
-+	value="$(get_fs_value $@)"
-+	echo "${value%%:*}"
-+}
-+
-+# Get a ZFS filesystem property source for a given key
-+get_fs_source()
-+{
-+	value="$(get_fs_value $@)"
-+	echo "${value#*:}"
-+}
-+
- # Find the 'bootfs' property on pool $1.
- # If the property does not contain '/', then ignore this
- # pool by exporting it again.
-@@ -495,16 +509,17 @@ clone_snap()
- 	local snap="$1"
- 	local destfs="$2"
- 	local mountpoint="$3"
-+	local additional_parameters="$4"
-
- 	[ "$quiet" != "y" ] && zfs_log_begin_msg "Cloning '$snap' to '$destfs'"
-
-+	if [ -n "${mountpoint}" ]; then
-+		additional_parameters="${additional_parameters} -o mountpoint=${mountpoint}"
-+	fi
-+
- 	# Clone the snapshot into a dataset we can boot from
--	# + We don't want this filesystem to be automatically mounted, we
--	#   want control over this here and nowhere else.
--	# + We don't need any mountpoint set for the same reason.
--	#   We use the 'org.zol:mountpoint' property to remember the mountpoint.
--	ZFS_CMD="${ZFS} clone -o canmount=noauto -o mountpoint=none"
--	ZFS_CMD="${ZFS_CMD} -o org.zol:mountpoint=${mountpoint}"
-+	ZFS_CMD="${ZFS} clone"
-+	ZFS_CMD="${ZFS_CMD} -o canmount=noauto ${additional_parameters}"
-+	ZFS_CMD="${ZFS_CMD} $snap $destfs"
- 	ZFS_STDERR="$(${ZFS_CMD} 2>&1)"
- 	ZFS_ERROR="$?"
-@@ -616,6 +631,15 @@ setup_snapshot_booting()
- 	snapname="${snap##*@}"
- 	ZFS_BOOTFS="${rootfs}_${snapname}"
-
-+	# Detect if we are on a zsys system, which will generates an unique UUID
-+	# and override ZFS_BOOTFS
-+	use_zsys=$(get_fs_value_without_source "${rootfs}" com.ubuntu.zsys:bootfs)
-+	if [ "$use_zsys" = "yes" ]; then
-+		zsys_uid=`uid`
-+		ZFS_BOOTFS="${rootfs%_*}_${zsys_uid}"	# we strip old uid and add new one
-+	fi
-+
-+	# Rollback won't have effect on zsys system
- 	if ! grep -qiE '(^|[^\\](\\\\)* )(rollback)=(on|yes|1)( |$)' /proc/cmdline
- 	then
- 		# If the destination dataset for the clone
-@@ -645,10 +669,18 @@ setup_snapshot_booting()
- 			#   rpool/ROOT/debian/boot@snap2 => rpool/ROOT/debian_snap2/boot
- 			#   rpool/ROOT/debian/usr@snap2  => rpool/ROOT/debian_snap2/usr
- 			#   rpool/ROOT/debian/var@snap2  => rpool/ROOT/debian_snap2/var
-+			#
-+			# For zsys, we have stable root dataset names with uid, so:
-+			#   rpool/ROOT/debian_uid1@snap2      => rpool/ROOT/debian_uid2
-+			#   rpool/ROOT/debian_uid1/boot@snap2 => rpool/ROOT/debian_uid2/boot
-+
- 			subfs="${s##$rootfs}"
- 			subfs="${subfs%%@$snapname}"
-
- 			destfs="${rootfs}_${snapname}" # base fs.
-+			if [ "${use_zsys}" = "yes" ]; then
-+				destfs="${rootfs%_*}_${zsys_uid}" # we strip old uid and add new one
-+			fi
- 			[ -n "$subfs" ] && destfs="${destfs}$subfs" # + sub fs.
-
- 			# Get the mountpoint of the filesystem, to be used
-@@ -665,9 +697,38 @@ setup_snapshot_booting()
- 			fi
- 		fi
-
-+		# On non zsys:
-+		# + We don't want this filesystem to be automatically mounted, we
-+		#   want control over this here and nowhere else.
-+		# + We don't need any mountpoint set for the same reason.
-+		# + We use the 'org.zol:mountpoint' property to remember the mountpoint.
-+		# On zsys:
-+		# + We don't want this filesystem to be automatically mounted, when cloned
-+		#   so, we set canmount=noauto. Zsys early boot will set the current datasets
-+		#   to on, alongside other system datasets switch. This enables
-+		#   zpool import -a -R /altroot to mount the whole system.
-+		#   The initrd script is doing zpool import -N, so we are not impacted by setting
-+		#   canmount=on on secondary boot.
-+		# + We thus need the real mountpoint set for this reason (as we can't set it
-+		#   once the system booted, even if the mountpoint didn't change)
-+		# + We set additional parameters to zsys to mark datasets we want mount manually
-+		#   at boot.
-+		if [ "${use_zsys}" != "yes" ]; then
-+			clone_additional_parameters="-o org.zol:mountpoint=${mountpoint}"
-+			mountpoint=none
-+		else
-+			[ "$(get_fs_value_without_source "$s" com.ubuntu.zsys:bootfs)" != "yes" ] && continue
-+			clone_additional_parameters="-o com.ubuntu.zsys:bootfs=yes"
-+			# Only set mountpoint explicitely if it was locally set
-+			# Keep the possibility to have mountpoint inherited for manual zfs snapshots without zsys involved, which
-+			# will have an empty user propertie
-+			local mountpoint_source="$(get_fs_source "$s" com.ubuntu.zsys:mountpoint)"
-+			[ -n "${mountpoint_source}" -a "${mountpoint_source}" != "local" ] && mountpoint=""
-+		fi
-+
- 			# Clone the snapshot into its own
- 			# filesystem
--			clone_snap "$s" "${destfs}" "${mountpoint}" || \
-+			clone_snap "$s" "${destfs}" "${mountpoint}" "${clone_additional_parameters}" || \
- 			    retval=$((retval + 1))
- 		fi
- 	done
-@@ -930,6 +991,8 @@ mountroot()
- 		# Booting from a snapshot?
- 		# Will overwrite the ZFS_BOOTFS variable like so:
- 		#   rpool/ROOT/debian@snap2 => rpool/ROOT/debian_snap2
-+		# or
-+		#   rpool/ROOT/debian@snap2 => rpool/ROOT/debian_ if selected system is a zsys one
- 		echo "${ZFS_BOOTFS}" | grep -q '@' && \
- 		    setup_snapshot_booting "${ZFS_BOOTFS}"
- 	fi
-@@ -967,8 +1030,16 @@ mountroot()
- 		# Go through the complete list (recursively) of all filesystems below
- 		# the real root dataset
- 		filesystems=$("${ZFS}" list -oname -tfilesystem -H -r "${ZFS_BOOTFS}")
-+
-+		# If the root filesystem is a zsys one, we select the datasets to mount
-+		# at boot.
-+		# Some datasets under ROOT/ can be mounted on top of persistent datasets
-+		# that are hosted elsewhere in the pool. Those are thus only mounted at
-+		# early boot.
-+		use_zsys=$(get_fs_value_without_source "${ZFS_BOOTFS}" com.ubuntu.zsys:bootfs)
- 		for fs in $filesystems $ZFS_INITRD_ADDITIONAL_DATASETS
- 		do
-+			[ "$use_zsys" = "yes" -a "$(get_fs_value_without_source "$fs" com.ubuntu.zsys:bootfs)" != "yes" ] && continue
- 			mount_fs "$fs"
- 		done
-
-@@ -1007,3 +1078,8 @@ mountroot()
- 		[ "$quiet" != "y" ] && zfs_log_end_msg
- 	fi
- }
-+
-+uid()
-+{
-+	dd if=/dev/urandom of=/dev/stdout bs=1 count=100 2>/dev/null | tr -dc 'a-z0-9' | cut -c-6
-+}
-Index: zfs-linux-0.8.3/etc/systemd/system-generators/zfs-mount-generator.in
-===================================================================
---- zfs-linux-0.8.3.orig/etc/systemd/system-generators/zfs-mount-generator.in
-+++ zfs-linux-0.8.3/etc/systemd/system-generators/zfs-mount-generator.in
-@@ -256,6 +256,83 @@ EOF
- 	ln -s "../${mountfile}" "${req_dir}"
- }
-
-+ZPOOL_CACHE="@sysconfdir@/zfs/zpool.cache"
-+PROPS="name,mountpoint,canmount,atime,relatime,devices,exec,readonly"
-+PROPS="${PROPS},setuid,nbmand"
-+zsys_revert_failed=0
-+errfile="/tmp/zsys-revert-out.log"
-+
-+drop_emergency_on_failure() {
-+	if [ ${zsys_revert_failed} -eq 0 ]; then
-+		return
-+	fi
-+
-+	# Drop to emergency target in case of failure after cleanup fstab mountpoints.
-+	# This avoids booting and having a mix of old and new datasets, and creating directory in the wrong
-+	# datasets, like /boot/grub in / which will prevent zfs to mount /boot dataset later on.
-+	rm -f "${dest_norm}"/*.mount
-+	ln -s /lib/systemd/system/emergency.target "${dest_norm}"/default.target
-+
-+	printf 'ERROR: zfs-mount-generator failed and you requested a revert:\n' > /dev/kmsg
-+	cat "${errfile}" > /dev/kmsg
-+	printf 'You can reboot on current master dataset to fix the issue\n' > /dev/kmsg
-+}
-+
-+# Handle revert so that zsys prepares all datasets as expected.
-+initzsys() {
-+	if [ ! -x @sbindir@/zsysd ]; then
-+		return
-+	fi
-+
-+	# Non ZFS system
-+	if ! grep -q "root=ZFS=" /proc/cmdline; then
-+		return
-+	fi
-+
-+	# If we boot on the same dataset than last time, assume we don’t need to do anything as the cache file will only
-+	# import desired pools.
-+	bootds="$(sed -e 's/.*root=ZFS=\([^ ]\+\).*/\1/' /proc/cmdline)"
-+	if grep -Eq "${bootds}\s+/\s+on" "${FSLIST}/"*; then
-+		return
-+	fi
-+
-+	# If we get here: we are reverting. Let zsys handle it
-+	trap drop_emergency_on_failure EXIT INT QUIT ABRT PIPE TERM
-+
-+	exec 3>&1 1>"${errfile}"
-+	exec 4>&2 2>&1
-+
-+	zsys_revert_failed=1
-+	# Import and list previously imported pools for zsys
-+	if [ -f "${ZPOOL_CACHE}" ]; then
-+		@sbindir@/zpool import -c "${ZPOOL_CACHE}" -aN
-+	# As a best effort, import all available pools, hoping there is no conflict.
-+	else
-+		echo "We had to search for all available pools because ${ZPOOL_CACHE} doesn't exist. To avoid this, create a zpool cache file."
-+		@sbindir@/zpool import -aN
-+	fi
-+
-+	@sbindir@/zsysd boot-prepare >"${errfile}"
-+
-+	# If FSLIST is empty, populate with all imported pools
-+	if [ -z "$(ls -A ${FSLIST})" ]; then
-+		@sbindir@/zpool list -H | cut -f1 | xargs -I{} touch ${FSLIST}/{}
-+	fi
-+
-+	# Refresh zfs list cache
-+	for cachefile in "${FSLIST}/"* ; do
-+		pool=`basename ${cachefile}`
-+		@sbindir@/zfs list -H -t filesystem -o "${PROPS}" -r "${pool}" >"${cachefile}"
-+	done
-+
-+	exec 1>&3 3>&-
-+	exec 2>&4 4>&-
-+	zsys_revert_failed=0
-+	rm "${errfile}"
-+}
-+
-+initzsys
-+
- # Feed each line into process_line
- for cachefile in "${FSLIST}/"* ; do
- 	while read -r fs ; do
diff '--exclude=.git' -Npaur ubuntu/zfs-linux/debian/patches/4001-zsys-support.patch zfs-linux/debian/patches/4001-zsys-support.patch
--- ubuntu/zfs-linux/debian/patches/4001-zsys-support.patch	1969-12-31 19:00:00.000000000 -0500
+++ zfs-linux/debian/patches/4001-zsys-support.patch	2020-05-13 23:46:33.980086518 -0400
@@ -0,0 +1,260 @@
+Description: Support zsys systems
+ Zsys is an enhanced and structured dataset layout for ZFS.
+ .
+ It enables advanced use cases by differentiating system,
+ user data and persistent partitions to allow only partial
+ permanent or temporary rollback without destroying intermediate
+ snapshots.
+Author: Jean-Baptiste Lallement
+ Didier Roche
+Last-Update: 2019-06-06
+Index: zfs-linux-0.8.3/contrib/initramfs/scripts/zfs.in
+===================================================================
+--- zfs-linux-0.8.3.orig/contrib/initramfs/scripts/zfs.in
++++ zfs-linux-0.8.3/contrib/initramfs/scripts/zfs.in
+@@ -62,6 +62,19 @@ get_fs_value()
+
+ 	"${ZFS}" get -H -ovalue $value "$fs" 2> /dev/null
+ }
++# Get a ZFS filesystem property value with the source stripped from the value
++get_fs_value_without_source()
++{
++	value="$(get_fs_value $@)"
++	echo "${value%%:*}"
++}
++
++# Get a ZFS filesystem property source for a given key
++get_fs_source()
++{
++	value="$(get_fs_value $@)"
++	echo "${value#*:}"
++}
+
+ # Find the 'bootfs' property on pool $1.
+ # If the property does not contain '/', then ignore this
+@@ -487,17 +500,18 @@ clone_snap()
+ 	local snap="$1"
+ 	local destfs="$2"
+ 	local mountpoint="$3"
++	local additional_parameters="$4"
+
+ 	[ "$quiet" != "y" ] && zfs_log_begin_msg "Cloning '$snap' to '$destfs'"
+
++	if [ -n "${mountpoint}" ]; then
++		additional_parameters="${additional_parameters} -o mountpoint=${mountpoint}"
++	fi
++
+ 	# Clone the snapshot into a dataset we can boot from
+-	# + We don't want this filesystem to be automatically mounted, we
+-	#   want control over this here and nowhere else.
+-	# + We don't need any mountpoint set for the same reason.
+-	#   We use the 'org.zol:mountpoint' property to remember the mountpoint.
+-	ZFS_CMD="${ZFS} clone -o canmount=noauto -o mountpoint=none"
+-	ZFS_CMD="${ZFS_CMD} -o org.zol:mountpoint=${mountpoint}"
+-	ZFS_CMD="${ZFS_CMD} $snap $destfs"
++	ZFS_CMD="${ZFS} clone"
++	ZFS_CMD="${ZFS_CMD} -o canmount=noauto ${additional_parameters}"
++	ZFS_CMD="${ZFS_CMD} $snap $destfs"
+ 	ZFS_STDERR="$(${ZFS_CMD} 2>&1)"
+ 	ZFS_ERROR="$?"
+ 	if [ "${ZFS_ERROR}" != 0 ]
+@@ -608,6 +622,15 @@ setup_snapshot_booting()
+ 	snapname="${snap##*@}"
+ 	ZFS_BOOTFS="${rootfs}_${snapname}"
+
++	# Detect if we are on a zsys system, which will generate a unique UUID
++	# and override ZFS_BOOTFS
++	use_zsys=$(get_fs_value_without_source "${rootfs}" com.ubuntu.zsys:bootfs)
++	if [ "$use_zsys" = "yes" ]; then
++		zsys_uid=`uid`
++		ZFS_BOOTFS="${rootfs%_*}_${zsys_uid}"	# we strip old uid and add new one
++	fi
++
++	# Rollback won't have any effect on a zsys system
+ 	if ! grep -qiE '(^|[^\\](\\\\)* )(rollback)=(on|yes|1)( |$)' /proc/cmdline
+ 	then
+ 		# If the destination dataset for the clone
+@@ -637,10 +660,19 @@ setup_snapshot_booting()
+ 			#   rpool/ROOT/debian/boot@snap2 => rpool/ROOT/debian_snap2/boot
+ 			#   rpool/ROOT/debian/usr@snap2  => rpool/ROOT/debian_snap2/usr
+ 			#   rpool/ROOT/debian/var@snap2  => rpool/ROOT/debian_snap2/var
++			#
++			# For zsys, we have stable root dataset names with uid, so:
++			#   rpool/ROOT/debian_uid1@snap2      => rpool/ROOT/debian_uid2
++			#   rpool/ROOT/debian_uid1/boot@snap2 => rpool/ROOT/debian_uid2/boot
++
+ 			subfs="${s##$rootfs}"
+ 			subfs="${subfs%%@$snapname}"
+
+ 			destfs="${rootfs}_${snapname}" # base fs.
++			if [ "${use_zsys}" = "yes" ]; then
++				destfs="${rootfs%_*}_${zsys_uid}" # we strip old uid and add new one
++			fi
++
+ 			[ -n "$subfs" ] && destfs="${destfs}$subfs" # + sub fs.
+
+ 			# Get the mountpoint of the filesystem, to be used
+@@ -656,10 +688,39 @@ setup_snapshot_booting()
+ 				mountpoint="/"
+ 			fi
+ 		fi
++		# On non zsys:
++		# + We don't want this filesystem to be automatically mounted, we
++		#   want control over this here and nowhere else.
++		# + We don't need any mountpoint set for the same reason.
++		# + We use the 'org.zol:mountpoint' property to remember the mountpoint.
++		# On zsys:
++		# + We don't want this filesystem to be automatically mounted when cloned,
++		#   so we set canmount=noauto. Zsys early boot will set the current datasets
++		#   to on, alongside other system datasets switch. This enables
++		#   zpool import -a -R /altroot to mount the whole system.
++		#   The initrd script is doing zpool import -N, so we are not impacted by setting
++		#   canmount=on on secondary boot.
++		# + We thus need the real mountpoint set for this reason (as we can't set it
++		#   once the system booted, even if the mountpoint didn't change)
++		# + We set additional parameters to zsys to mark datasets we want to mount manually
++		#   at boot.
++		if [ "${use_zsys}" != "yes" ]; then
++			clone_additional_parameters="-o org.zol:mountpoint=${mountpoint}"
++			mountpoint=none
++		else
++			[ "$(get_fs_value_without_source "$s" com.ubuntu.zsys:bootfs)" != "yes" ] && continue
++			clone_additional_parameters="-o com.ubuntu.zsys:bootfs=yes"
++			# Only set mountpoint explicitly if it was locally set
++			# Keep the possibility to have mountpoint inherited for manual zfs snapshots without zsys involved, which
++			# will have an empty user property
++			local mountpoint_source="$(get_fs_source "$s" com.ubuntu.zsys:mountpoint)"
++			[ -n "${mountpoint_source}" -a "${mountpoint_source}" != "local" ] && mountpoint=""
++		fi
++
+
+ 			# Clone the snapshot into its own
+ 			# filesystem
+-			clone_snap "$s" "${destfs}" "${mountpoint}" || \
++			clone_snap "$s" "${destfs}" "${mountpoint}" "${clone_additional_parameters}" || \
+ 			    retval=$((retval + 1))
+ 		fi
+ 	done
+@@ -922,6 +983,9 @@ mountroot()
+ 		# Booting from a snapshot?
+ 		# Will overwrite the ZFS_BOOTFS variable like so:
+ 		#   rpool/ROOT/debian@snap2 => rpool/ROOT/debian_snap2
++		# or
++		#   rpool/ROOT/debian@snap2 => rpool/ROOT/debian_ if selected system is a zsys one
++
+ 		echo "${ZFS_BOOTFS}" | grep -q '@' && \
+ 		    setup_snapshot_booting "${ZFS_BOOTFS}"
+ 	fi
+@@ -959,8 +1023,16 @@ mountroot()
+ 		# Go through the complete list (recursively) of all filesystems below
+ 		# the real root dataset
+ 		filesystems=$("${ZFS}" list -oname -tfilesystem -H -r "${ZFS_BOOTFS}")
++
++		# If the root filesystem is a zsys one, we select the datasets to mount
++		# at boot.
++		# Some datasets under ROOT/ can be mounted on top of persistent datasets
++		# that are hosted elsewhere in the pool. Those are thus only mounted at
++		# early boot.
++		use_zsys=$(get_fs_value_without_source "${ZFS_BOOTFS}" com.ubuntu.zsys:bootfs)
+ 		for fs in $filesystems $ZFS_INITRD_ADDITIONAL_DATASETS
+ 		do
++			[ "$use_zsys" = "yes" -a "$(get_fs_value_without_source "$fs" com.ubuntu.zsys:bootfs)" != "yes" ] && continue
+ 			mount_fs "$fs"
+ 		done
+
+@@ -999,3 +1071,8 @@ mountroot()
+ 		[ "$quiet" != "y" ] && zfs_log_end_msg
+ 	fi
+ }
++
++uid()
++{
++	dd if=/dev/urandom of=/dev/stdout bs=1 count=100 2>/dev/null | tr -dc 'a-z0-9' | cut -c-6
++}
+Index: zfs-linux-0.8.3/etc/systemd/system-generators/zfs-mount-generator.in
+===================================================================
+--- zfs-linux-0.8.3.orig/etc/systemd/system-generators/zfs-mount-generator.in
++++ zfs-linux-0.8.3/etc/systemd/system-generators/zfs-mount-generator.in
+@@ -428,6 +428,82 @@ Options=defaults${opts},zfsutil" > "${de
+ 	create_dependencies "${mountfile}" "requires" "$requiredby"
+
+ }
++ZPOOL_CACHE="@sysconfdir@/zfs/zpool.cache"
++PROPS="name,mountpoint,canmount,atime,relatime,devices,exec,readonly"
++PROPS="${PROPS},setuid,nbmand"
++zsys_revert_failed=0
++errfile="/tmp/zsys-revert-out.log"
++
++drop_emergency_on_failure() {
++	if [ ${zsys_revert_failed} -eq 0 ]; then
++		return
++	fi
++
++	# Drop to emergency target in case of failure after cleanup fstab mountpoints.
++	# This avoids booting and having a mix of old and new datasets, and creating directory in the wrong
++	# datasets, like /boot/grub in / which will prevent zfs to mount /boot dataset later on.
++	rm -f "${dest_norm}"/*.mount
++	ln -s /lib/systemd/system/emergency.target "${dest_norm}"/default.target
++
++	printf 'ERROR: zfs-mount-generator failed and you requested a revert:\n' > /dev/kmsg
++	cat "${errfile}" > /dev/kmsg
++	printf 'You can reboot on current master dataset to fix the issue\n' > /dev/kmsg
++}
++
++# Handle revert so that zsys prepares all datasets as expected.
++initzsys() {
++	if [ ! -x @sbindir@/zsysd ]; then
++		return
++	fi
++
++	# Non ZFS system
++	if ! grep -q "root=ZFS=" /proc/cmdline; then
++		return
++	fi
++
++	# If we boot on the same dataset as last time, assume we don’t need to do anything as the cache file will only
++	# import desired pools.
++	bootds="$(sed -e 's/.*root=ZFS=\([^ ]\+\).*/\1/' /proc/cmdline)"
++	if grep -Eq "${bootds}\s+/\s+on" "${FSLIST}/"*; then
++		return
++	fi
++
++	# If we get here: we are reverting. Let zsys handle it
++	trap drop_emergency_on_failure EXIT INT QUIT ABRT PIPE TERM
++
++	exec 3>&1 1>"${errfile}"
++	exec 4>&2 2>&1
++
++	zsys_revert_failed=1
++	# Import and list previously imported pools for zsys
++	if [ -f "${ZPOOL_CACHE}" ]; then
++		@sbindir@/zpool import -c "${ZPOOL_CACHE}" -aN
++	# As a best effort, import all available pools, hoping there is no conflict.
++	else
++		echo "We had to search for all available pools because ${ZPOOL_CACHE} doesn't exist. To avoid this, create a zpool cache file."
++		@sbindir@/zpool import -aN
++	fi
++
++	@sbindir@/zsysd boot-prepare >"${errfile}"
++
++	# If FSLIST is empty, populate with all imported pools
++	if [ -z "$(ls -A ${FSLIST})" ]; then
++		@sbindir@/zpool list -H | cut -f1 | xargs -I{} touch ${FSLIST}/{}
++	fi
++
++	# Refresh zfs list cache
++	for cachefile in "${FSLIST}/"* ; do
++		pool=`basename ${cachefile}`
++		@sbindir@/zfs list -H -t filesystem -o "${PROPS}" -r "${pool}" >"${cachefile}"
++	done
++
++	exec 1>&3 3>&-
++	exec 2>&4 4>&-
++	zsys_revert_failed=0
++	rm "${errfile}"
++}
++
++initzsys
+
+ for cachefile in "${FSLIST}/"* ; do
+ 	# Sort cachefile's lines by canmount, "on" before "noauto"
diff '--exclude=.git' -Npaur ubuntu/zfs-linux/debian/patches/4610-ICP-Improve-AES-GCM-performance.patch zfs-linux/debian/patches/4610-ICP-Improve-AES-GCM-performance.patch
--- ubuntu/zfs-linux/debian/patches/4610-ICP-Improve-AES-GCM-performance.patch	2020-05-16 17:46:19.015566898 -0400
+++ zfs-linux/debian/patches/4610-ICP-Improve-AES-GCM-performance.patch	1969-12-31 19:00:00.000000000 -0500
@@ -1,3115 +0,0 @@
-From 31b160f0a6c673c8f926233af2ed6d5354808393 Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Attila=20F=C3=BCl=C3=B6p?=
-Date: Mon, 10 Feb 2020 21:59:50 +0100
-Subject: [PATCH] ICP: Improve AES-GCM performance
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-Content-Type: text/plain; charset="utf-8"
-Content-Transfer-Encoding: 8bit
-
-Currently SIMD accelerated AES-GCM performance is limited by two
-factors:
-
-a. The need to disable preemption and interrupts and save the FPU
-state before using it and to do the reverse when done. Due to the
-way the code is organized (see (b) below) we have to pay this price
-twice for each 16 byte GCM block processed.
-
-b. Most processing is done in C, operating on single GCM blocks.
-The use of SIMD instructions is limited to the AES encryption of the
-counter block (AES-NI) and the Galois multiplication (PCLMULQDQ).
-This leads to the FPU not being fully utilized for crypto
-operations.
-
-To solve (a) we do crypto processing in larger chunks while owning
-the FPU. An `icp_gcm_avx_chunk_size` module parameter was introduced
-to make this chunk size tweakable. It defaults to 32 KiB. This step
-alone roughly doubles performance. (b) is tackled by porting and
-using the highly optimized openssl AES-GCM assembler routines, which
-do all the processing (CTR, AES, GMULT) in a single routine. Both
-steps together result in up to 32x reduction of the time spend in
-the en/decryption routines, leading up to approximately 12x
-throughput increase for large (128 KiB) blocks.
-
-Lastly, this commit changes the default encryption algorithm from
-AES-CCM to AES-GCM when setting the `encryption=on` property.
- -Reviewed-By: Brian Behlendorf -Reviewed-By: Jason King -Reviewed-By: Tom Caputi -Reviewed-By: Richard Laager -Signed-off-by: Attila Fülöp -Closes #9749 -Signed-off-by: Colin Ian King ---- - COPYRIGHT | 4 + - config/toolchain-simd.m4 | 21 + - include/linux/simd_x86.h | 13 + - include/sys/zio.h | 2 +- - lib/libicp/Makefile.am | 2 + - include/linux/simd.h | 15 +- - man/man8/zfsprops.8 | 2 +- - module/icp/Makefile.in | 9 + - module/icp/algs/modes/gcm.c | 746 ++++++++++++++- - .../modes/THIRDPARTYLICENSE.cryptogams | 36 + - .../THIRDPARTYLICENSE.cryptogams.descrip | 1 + - .../modes/THIRDPARTYLICENSE.openssl | 177 ++++ - .../modes/THIRDPARTYLICENSE.openssl.descrip | 1 + - .../icp/asm-x86_64/modes/aesni-gcm-x86_64.S | 892 ++++++++++++++++++ - module/icp/asm-x86_64/modes/ghash-x86_64.S | 714 ++++++++++++++ - module/icp/include/aes/aes_impl.h | 5 + - module/icp/include/modes/modes.h | 29 +- - .../zfs_create/zfs_create_crypt_combos.ksh | 2 +- - .../zpool_create_crypt_combos.ksh | 2 +- - .../functional/rsend/send_encrypted_props.ksh | 12 +- - 20 files changed, 2654 insertions(+), 31 deletions(-) - create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams - create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip - create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl - create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip - create mode 100644 module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S - create mode 100644 module/icp/asm-x86_64/modes/ghash-x86_64.S - -Index: zfs-linux-0.8.3/COPYRIGHT -=================================================================== ---- zfs-linux-0.8.3.orig/COPYRIGHT -+++ zfs-linux-0.8.3/COPYRIGHT -@@ -20,6 +20,10 @@ notable exceptions and their respective - * AES Implementation: module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl - * PBKDF2 Implementation: lib/libzfs/THIRDPARTYLICENSE.openssl - * SPL Implementation: module/spl/THIRDPARTYLICENSE.gplv2 -+ * GCM Implementaion: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams -+ * GCM Implementaion: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl -+ * GHASH Implementaion: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams -+ * GHASH Implementaion: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl - - This product includes software developed by the OpenSSL Project for use - in the OpenSSL Toolkit (http://www.openssl.org/) -Index: zfs-linux-0.8.3/config/toolchain-simd.m4 -=================================================================== ---- zfs-linux-0.8.3.orig/config/toolchain-simd.m4 -+++ zfs-linux-0.8.3/config/toolchain-simd.m4 -@@ -23,6 +23,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN - ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL - ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES - ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ -+ ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE - ;; - esac - ]) -@@ -400,4 +401,24 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BU - ], [ - AC_MSG_RESULT([no]) - ]) -+]) -+ -+dnl # -+dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE -+dnl # -+AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE], [ -+ AC_MSG_CHECKING([whether host toolchain supports MOVBE]) -+ -+ AC_LINK_IFELSE([AC_LANG_SOURCE([ -+ [ -+ void main() -+ { -+ __asm__ __volatile__("movbe 0(%eax), %eax"); -+ } -+ ]])], [ -+ AC_MSG_RESULT([yes]) -+ AC_DEFINE([HAVE_MOVBE], 1, [Define if host toolchain supports MOVBE]) -+ ], [ -+ AC_MSG_RESULT([no]) -+ ]) - ]) -Index: zfs-linux-0.8.3/include/linux/simd_x86.h 
-=================================================================== ---- zfs-linux-0.8.3.orig/include/linux/simd_x86.h -+++ zfs-linux-0.8.3/include/linux/simd_x86.h -@@ -382,7 +382,8 @@ typedef enum cpuid_inst_sets { - AVX512ER, - AVX512VL, - AES, -- PCLMULQDQ -+ PCLMULQDQ, -+ MOVBE - } cpuid_inst_sets_t; - - /* -@@ -406,6 +407,7 @@ typedef struct cpuid_feature_desc { - #define _AVX512VL_BIT (1U << 31) /* if used also check other levels */ - #define _AES_BIT (1U << 25) - #define _PCLMULQDQ_BIT (1U << 1) -+#define _MOVBE_BIT (1U << 22) - - /* - * Descriptions of supported instruction sets -@@ -433,6 +435,7 @@ static const cpuid_feature_desc_t cpuid_ - [AVX512VL] = {7U, 0U, _AVX512ER_BIT, EBX }, - [AES] = {1U, 0U, _AES_BIT, ECX }, - [PCLMULQDQ] = {1U, 0U, _PCLMULQDQ_BIT, ECX }, -+ [MOVBE] = {1U, 0U, _MOVBE_BIT, ECX }, - }; - - /* -@@ -505,6 +508,7 @@ CPUID_FEATURE_CHECK(avx512er, AVX512ER); - CPUID_FEATURE_CHECK(avx512vl, AVX512VL); - CPUID_FEATURE_CHECK(aes, AES); - CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ); -+CPUID_FEATURE_CHECK(movbe, MOVBE); - - #endif /* !defined(_KERNEL) */ - -@@ -719,6 +723,19 @@ zfs_pclmulqdq_available(void) - #endif - } - -+/* -+ * Check if MOVBE instruction is available -+ */ -+static inline boolean_t -+zfs_movbe_available(void) -+{ -+#if defined(X86_FEATURE_MOVBE) -+ return (!!boot_cpu_has(X86_FEATURE_MOVBE)); -+#else -+ return (B_FALSE); -+#endif -+} -+ - /* - * AVX-512 family of instruction sets: - * -Index: zfs-linux-0.8.3/include/sys/zio.h -=================================================================== ---- zfs-linux-0.8.3.orig/include/sys/zio.h -+++ zfs-linux-0.8.3/include/sys/zio.h -@@ -118,7 +118,7 @@ enum zio_encrypt { - ZIO_CRYPT_FUNCTIONS - }; - --#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_CCM -+#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM - #define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF - - /* macros defining encryption lengths */ -Index: zfs-linux-0.8.3/lib/libicp/Makefile.am -=================================================================== ---- zfs-linux-0.8.3.orig/lib/libicp/Makefile.am -+++ zfs-linux-0.8.3/lib/libicp/Makefile.am -@@ -20,6 +20,8 @@ ASM_SOURCES_AS = \ - asm-x86_64/aes/aes_amd64.S \ - asm-x86_64/aes/aes_aesni.S \ - asm-x86_64/modes/gcm_pclmulqdq.S \ -+ asm-x86_64/modes/aesni-gcm-x86_64.S \ -+ asm-x86_64/modes/ghash-x86_64.S \ - asm-x86_64/sha1/sha1-x86_64.S \ - asm-x86_64/sha2/sha256_impl.S \ - asm-x86_64/sha2/sha512_impl.S -Index: zfs-linux-0.8.3/module/icp/Makefile.in -=================================================================== ---- zfs-linux-0.8.3.orig/module/icp/Makefile.in -+++ zfs-linux-0.8.3/module/icp/Makefile.in -@@ -69,9 +69,18 @@ $(MODULE)-objs += algs/skein/skein_iv.o - $(MODULE)-objs += $(ASM_SOURCES) - - $(MODULE)-$(CONFIG_X86) += algs/modes/gcm_pclmulqdq.o -+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/aesni-gcm-x86_64.o -+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/ghash-x86_64.o - $(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_aesni.o - $(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_x86-64.o - -+# Suppress objtool "can't find jump dest instruction at" warnings. They -+# are caused by the constants which are defined in the text section of the -+# assembly file using .byte instructions (e.g. bswap_mask). The objtool -+# utility tries to interpret them as opcodes and obviously fails doing so. 
-+OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y -+OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y -+ - ICP_DIRS = \ - api \ - core \ -Index: zfs-linux-0.8.3/module/icp/algs/modes/gcm.c -=================================================================== ---- zfs-linux-0.8.3.orig/module/icp/algs/modes/gcm.c -+++ zfs-linux-0.8.3/module/icp/algs/modes/gcm.c -@@ -30,12 +30,46 @@ - #include - #include - #include -+#ifdef CAN_USE_GCM_ASM -+#include -+#endif - - #define GHASH(c, d, t, o) \ - xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ - (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \ - (uint64_t *)(void *)(t)); - -+/* Select GCM implementation */ -+#define IMPL_FASTEST (UINT32_MAX) -+#define IMPL_CYCLE (UINT32_MAX-1) -+#ifdef CAN_USE_GCM_ASM -+#define IMPL_AVX (UINT32_MAX-2) -+#endif -+#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) -+static uint32_t icp_gcm_impl = IMPL_FASTEST; -+static uint32_t user_sel_impl = IMPL_FASTEST; -+ -+#ifdef CAN_USE_GCM_ASM -+/* -+ * Whether to use the optimized openssl gcm and ghash implementations. -+ * Set to true if module parameter icp_gcm_impl == "avx". -+ */ -+static boolean_t gcm_use_avx = B_FALSE; -+#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) -+ -+static inline boolean_t gcm_avx_will_work(void); -+static inline void gcm_set_avx(boolean_t); -+static inline boolean_t gcm_toggle_avx(void); -+ -+static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, -+ crypto_data_t *, size_t); -+ -+static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); -+static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); -+static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *, -+ size_t, size_t); -+#endif /* ifdef CAN_USE_GCM_ASM */ -+ - /* - * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode - * is done in another function. -@@ -47,6 +81,12 @@ gcm_mode_encrypt_contiguous_blocks(gcm_c - void (*copy_block)(uint8_t *, uint8_t *), - void (*xor_block)(uint8_t *, uint8_t *)) - { -+#ifdef CAN_USE_GCM_ASM -+ if (ctx->gcm_use_avx == B_TRUE) -+ return (gcm_mode_encrypt_contiguous_blocks_avx( -+ ctx, data, length, out, block_size)); -+#endif -+ - const gcm_impl_ops_t *gops; - size_t remainder = length; - size_t need = 0; -@@ -109,6 +149,14 @@ gcm_mode_encrypt_contiguous_blocks(gcm_c - - ctx->gcm_processed_data_len += block_size; - -+ /* -+ * The following copies a complete GCM block back to where it -+ * came from if there was a remainder in the last call and out -+ * is NULL. That doesn't seem to make sense. So we assert this -+ * can't happen and leave the code in for reference. 
-+ * See https://github.com/zfsonlinux/zfs/issues/9661 -+ */ -+ ASSERT(out != NULL); - if (out == NULL) { - if (ctx->gcm_remainder_len > 0) { - bcopy(blockp, ctx->gcm_copy_to, -@@ -169,6 +217,11 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto - void (*copy_block)(uint8_t *, uint8_t *), - void (*xor_block)(uint8_t *, uint8_t *)) - { -+#ifdef CAN_USE_GCM_ASM -+ if (ctx->gcm_use_avx == B_TRUE) -+ return (gcm_encrypt_final_avx(ctx, out, block_size)); -+#endif -+ - const gcm_impl_ops_t *gops; - uint64_t counter_mask = ntohll(0x00000000ffffffffULL); - uint8_t *ghash, *macp = NULL; -@@ -321,6 +374,11 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto - int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), - void (*xor_block)(uint8_t *, uint8_t *)) - { -+#ifdef CAN_USE_GCM_ASM -+ if (ctx->gcm_use_avx == B_TRUE) -+ return (gcm_decrypt_final_avx(ctx, out, block_size)); -+#endif -+ - const gcm_impl_ops_t *gops; - size_t pt_len; - size_t remainder; -@@ -526,6 +584,9 @@ gcm_init(gcm_ctx_t *ctx, unsigned char * - return (CRYPTO_SUCCESS); - } - -+/* -+ * Init the GCM context struct. Handle the cycle and avx implementations here. -+ */ - int - gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, - int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), -@@ -556,11 +617,37 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *p - return (CRYPTO_MECHANISM_PARAM_INVALID); - } - -- if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, -- gcm_param->pAAD, gcm_param->ulAADLen, block_size, -- encrypt_block, copy_block, xor_block) != 0) { -- rv = CRYPTO_MECHANISM_PARAM_INVALID; -+#ifdef CAN_USE_GCM_ASM -+ /* -+ * Handle the "cycle" implementation by creating avx and non avx -+ * contexts alternately. -+ */ -+ if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { -+ gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; -+ } else { -+ gcm_ctx->gcm_use_avx = gcm_toggle_avx(); - } -+ /* We don't handle byte swapped key schedules in the avx code path. */ -+ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; -+ if (ks->ops->needs_byteswap == B_TRUE) { -+ gcm_ctx->gcm_use_avx = B_FALSE; -+ } -+ /* Avx and non avx context initialization differs from here on. */ -+ if (gcm_ctx->gcm_use_avx == B_FALSE) { -+#endif /* ifdef CAN_USE_GCM_ASM */ -+ if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, -+ gcm_param->pAAD, gcm_param->ulAADLen, block_size, -+ encrypt_block, copy_block, xor_block) != 0) { -+ rv = CRYPTO_MECHANISM_PARAM_INVALID; -+ } -+#ifdef CAN_USE_GCM_ASM -+ } else { -+ if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, -+ gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) { -+ rv = CRYPTO_MECHANISM_PARAM_INVALID; -+ } -+ } -+#endif /* ifdef CAN_USE_GCM_ASM */ - - return (rv); - } -@@ -590,11 +677,37 @@ gmac_init_ctx(gcm_ctx_t *gcm_ctx, char * - return (CRYPTO_MECHANISM_PARAM_INVALID); - } - -- if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, -- gmac_param->pAAD, gmac_param->ulAADLen, block_size, -- encrypt_block, copy_block, xor_block) != 0) { -- rv = CRYPTO_MECHANISM_PARAM_INVALID; -+#ifdef CAN_USE_GCM_ASM -+ /* -+ * Handle the "cycle" implementation by creating avx and non avx -+ * contexts alternately. -+ */ -+ if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { -+ gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; -+ } else { -+ gcm_ctx->gcm_use_avx = gcm_toggle_avx(); -+ } -+ /* We don't handle byte swapped key schedules in the avx code path. 
*/ -+ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; -+ if (ks->ops->needs_byteswap == B_TRUE) { -+ gcm_ctx->gcm_use_avx = B_FALSE; -+ } -+ /* Avx and non avx context initialization differs from here on. */ -+ if (gcm_ctx->gcm_use_avx == B_FALSE) { -+#endif /* ifdef CAN_USE_GCM_ASM */ -+ if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, -+ gmac_param->pAAD, gmac_param->ulAADLen, block_size, -+ encrypt_block, copy_block, xor_block) != 0) { -+ rv = CRYPTO_MECHANISM_PARAM_INVALID; -+ } -+#ifdef CAN_USE_GCM_ASM -+ } else { -+ if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, -+ gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) { -+ rv = CRYPTO_MECHANISM_PARAM_INVALID; -+ } - } -+#endif /* ifdef CAN_USE_GCM_ASM */ - - return (rv); - } -@@ -645,15 +758,6 @@ const gcm_impl_ops_t *gcm_all_impl[] = { - /* Indicate that benchmark has been completed */ - static boolean_t gcm_impl_initialized = B_FALSE; - --/* Select GCM implementation */ --#define IMPL_FASTEST (UINT32_MAX) --#define IMPL_CYCLE (UINT32_MAX-1) -- --#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) -- --static uint32_t icp_gcm_impl = IMPL_FASTEST; --static uint32_t user_sel_impl = IMPL_FASTEST; -- - /* Hold all supported implementations */ - static size_t gcm_supp_impl_cnt = 0; - static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)]; -@@ -685,6 +789,16 @@ gcm_impl_get_ops() - size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt; - ops = gcm_supp_impl[idx]; - break; -+#ifdef CAN_USE_GCM_ASM -+ case IMPL_AVX: -+ /* -+ * Make sure that we return a valid implementation while -+ * switching to the avx implementation since there still -+ * may be unfinished non-avx contexts around. -+ */ -+ ops = &gcm_generic_impl; -+ break; -+#endif - default: - ASSERT3U(impl, <, gcm_supp_impl_cnt); - ASSERT3U(gcm_supp_impl_cnt, >, 0); -@@ -733,6 +847,16 @@ gcm_impl_init(void) - - strcpy(gcm_fastest_impl.name, "fastest"); - -+#ifdef CAN_USE_GCM_ASM -+ /* -+ * Use the avx implementation if it's available and the implementation -+ * hasn't changed from its default value of fastest on module load. -+ */ -+ if (gcm_avx_will_work() && -+ GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { -+ gcm_set_avx(B_TRUE); -+ } -+#endif - /* Finish initialization */ - atomic_swap_32(&icp_gcm_impl, user_sel_impl); - gcm_impl_initialized = B_TRUE; -@@ -744,6 +868,9 @@ static const struct { - } gcm_impl_opts[] = { - { "cycle", IMPL_CYCLE }, - { "fastest", IMPL_FASTEST }, -+#ifdef CAN_USE_GCM_ASM -+ { "avx", IMPL_AVX }, -+#endif - }; - - /* -@@ -777,6 +904,12 @@ gcm_impl_set(const char *val) - - /* Check mandatory options */ - for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { -+#ifdef CAN_USE_GCM_ASM -+ /* Ignore avx implementation if it won't work. */ -+ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { -+ continue; -+ } -+#endif - if (strcmp(req_name, gcm_impl_opts[i].name) == 0) { - impl = gcm_impl_opts[i].sel; - err = 0; -@@ -795,6 +928,18 @@ gcm_impl_set(const char *val) - } - } - } -+#ifdef CAN_USE_GCM_ASM -+ /* -+ * Use the avx implementation if available and the requested one is -+ * avx or fastest. 
-+ */ -+ if (gcm_avx_will_work() == B_TRUE && -+ (impl == IMPL_AVX || impl == IMPL_FASTEST)) { -+ gcm_set_avx(B_TRUE); -+ } else { -+ gcm_set_avx(B_FALSE); -+ } -+#endif - - if (err == 0) { - if (gcm_impl_initialized) -@@ -826,6 +971,12 @@ icp_gcm_impl_get(char *buffer, zfs_kerne - - /* list mandatory options */ - for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { -+#ifdef CAN_USE_GCM_ASM -+ /* Ignore avx implementation if it won't work. */ -+ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { -+ continue; -+ } -+#endif - fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s "; - cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name); - } -@@ -842,4 +993,563 @@ icp_gcm_impl_get(char *buffer, zfs_kerne - module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get, - NULL, 0644); - MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); --#endif -+#endif /* defined(__KERNEL) */ -+ -+#ifdef CAN_USE_GCM_ASM -+#define GCM_BLOCK_LEN 16 -+/* -+ * The openssl asm routines are 6x aggregated and need that many bytes -+ * at minimum. -+ */ -+#define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6) -+#define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3) -+/* -+ * Ensure the chunk size is reasonable since we are allocating a -+ * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts. -+ */ -+#define GCM_AVX_MAX_CHUNK_SIZE \ -+ (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES) -+ -+/* Get the chunk size module parameter. */ -+#define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size -+ -+/* Clear the FPU registers since they hold sensitive internal state. */ -+#define clear_fpu_regs() clear_fpu_regs_avx() -+#define GHASH_AVX(ctx, in, len) \ -+ gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t (*)[2])(ctx)->gcm_Htable, \ -+ in, len) -+ -+#define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) -+ -+/* -+ * Module parameter: number of bytes to process at once while owning the FPU. -+ * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is -+ * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES. -+ */ -+static uint32_t gcm_avx_chunk_size = -+ ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; -+ -+extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); -+extern void clear_fpu_regs_avx(void); -+extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst); -+extern void aes_encrypt_intel(const uint32_t rk[], int nr, -+ const uint32_t pt[4], uint32_t ct[4]); -+ -+extern void gcm_init_htab_avx(uint64_t Htable[16][2], const uint64_t H[2]); -+extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t Htable[16][2], -+ const uint8_t *in, size_t len); -+ -+extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, -+ const void *, uint64_t *, uint64_t *); -+ -+extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, -+ const void *, uint64_t *, uint64_t *); -+ -+static inline boolean_t -+gcm_avx_will_work(void) -+{ -+ /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. 
*/ -+ return (kfpu_allowed() && -+ zfs_avx_available() && zfs_movbe_available() && -+ zfs_aes_available() && zfs_pclmulqdq_available()); -+} -+ -+static inline void -+gcm_set_avx(boolean_t val) -+{ -+ if (gcm_avx_will_work() == B_TRUE) { -+ atomic_swap_32(&gcm_use_avx, val); -+ } -+} -+ -+static inline boolean_t -+gcm_toggle_avx(void) -+{ -+ if (gcm_avx_will_work() == B_TRUE) { -+ return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX)); -+ } else { -+ return (B_FALSE); -+ } -+} -+ -+/* -+ * Clear senssitve data in the context. -+ * -+ * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and -+ * ctx->gcm_Htable contain the hash sub key which protects authentication. -+ * -+ * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for -+ * a known plaintext attack, they consists of the IV and the first and last -+ * counter respectively. If they should be cleared is debatable. -+ */ -+static inline void -+gcm_clear_ctx(gcm_ctx_t *ctx) -+{ -+ bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder)); -+ bzero(ctx->gcm_H, sizeof (ctx->gcm_H)); -+ bzero(ctx->gcm_Htable, sizeof (ctx->gcm_Htable)); -+ bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0)); -+ bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp)); -+} -+ -+/* Increment the GCM counter block by n. */ -+static inline void -+gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) -+{ -+ uint64_t counter_mask = ntohll(0x00000000ffffffffULL); -+ uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask); -+ -+ counter = htonll(counter + n); -+ counter &= counter_mask; -+ ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; -+} -+ -+/* -+ * Encrypt multiple blocks of data in GCM mode. -+ * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines -+ * if possible. While processing a chunk the FPU is "locked". -+ */ -+static int -+gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, -+ size_t length, crypto_data_t *out, size_t block_size) -+{ -+ size_t bleft = length; -+ size_t need = 0; -+ size_t done = 0; -+ uint8_t *datap = (uint8_t *)data; -+ size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; -+ const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); -+ uint64_t *ghash = ctx->gcm_ghash; -+ uint64_t *cb = ctx->gcm_cb; -+ uint8_t *ct_buf = NULL; -+ uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; -+ int rv = CRYPTO_SUCCESS; -+ -+ ASSERT(block_size == GCM_BLOCK_LEN); -+ /* -+ * If the last call left an incomplete block, try to fill -+ * it first. -+ */ -+ if (ctx->gcm_remainder_len > 0) { -+ need = block_size - ctx->gcm_remainder_len; -+ if (length < need) { -+ /* Accumulate bytes here and return. */ -+ bcopy(datap, (uint8_t *)ctx->gcm_remainder + -+ ctx->gcm_remainder_len, length); -+ -+ ctx->gcm_remainder_len += length; -+ if (ctx->gcm_copy_to == NULL) { -+ ctx->gcm_copy_to = datap; -+ } -+ return (CRYPTO_SUCCESS); -+ } else { -+ /* Complete incomplete block. */ -+ bcopy(datap, (uint8_t *)ctx->gcm_remainder + -+ ctx->gcm_remainder_len, need); -+ -+ ctx->gcm_copy_to = NULL; -+ } -+ } -+ -+ /* Allocate a buffer to encrypt to if there is enough input. */ -+ if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { -+ ct_buf = vmem_alloc(chunk_size, ctx->gcm_kmflag); -+ if (ct_buf == NULL) { -+ return (CRYPTO_HOST_MEMORY); -+ } -+ } -+ -+ /* If we completed an incomplete block, encrypt and write it out. 
*/ -+ if (ctx->gcm_remainder_len > 0) { -+ kfpu_begin(); -+ aes_encrypt_intel(key->encr_ks.ks32, key->nr, -+ (const uint32_t *)cb, (uint32_t *)tmp); -+ -+ gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp); -+ GHASH_AVX(ctx, tmp, block_size); -+ clear_fpu_regs(); -+ kfpu_end(); -+ /* -+ * We don't follow gcm_mode_encrypt_contiguous_blocks() here -+ * but assert that out is not null. -+ * See gcm_mode_encrypt_contiguous_blocks() above and -+ * https://github.com/zfsonlinux/zfs/issues/9661 -+ */ -+ ASSERT(out != NULL); -+ rv = crypto_put_output_data(tmp, out, block_size); -+ out->cd_offset += block_size; -+ gcm_incr_counter_block(ctx); -+ ctx->gcm_processed_data_len += block_size; -+ bleft -= need; -+ datap += need; -+ ctx->gcm_remainder_len = 0; -+ } -+ -+ /* Do the bulk encryption in chunk_size blocks. */ -+ for (; bleft >= chunk_size; bleft -= chunk_size) { -+ kfpu_begin(); -+ done = aesni_gcm_encrypt( -+ datap, ct_buf, chunk_size, key, cb, ghash); -+ -+ clear_fpu_regs(); -+ kfpu_end(); -+ if (done != chunk_size) { -+ rv = CRYPTO_FAILED; -+ goto out_nofpu; -+ } -+ if (out != NULL) { -+ rv = crypto_put_output_data(ct_buf, out, chunk_size); -+ if (rv != CRYPTO_SUCCESS) { -+ goto out_nofpu; -+ } -+ out->cd_offset += chunk_size; -+ } -+ datap += chunk_size; -+ ctx->gcm_processed_data_len += chunk_size; -+ } -+ /* Check if we are already done. */ -+ if (bleft == 0) { -+ goto out_nofpu; -+ } -+ /* Bulk encrypt the remaining data. */ -+ kfpu_begin(); -+ if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { -+ done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash); -+ if (done == 0) { -+ rv = CRYPTO_FAILED; -+ goto out; -+ } -+ if (out != NULL) { -+ rv = crypto_put_output_data(ct_buf, out, done); -+ if (rv != CRYPTO_SUCCESS) { -+ goto out; -+ } -+ out->cd_offset += done; -+ } -+ ctx->gcm_processed_data_len += done; -+ datap += done; -+ bleft -= done; -+ -+ } -+ /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */ -+ while (bleft > 0) { -+ if (bleft < block_size) { -+ bcopy(datap, ctx->gcm_remainder, bleft); -+ ctx->gcm_remainder_len = bleft; -+ ctx->gcm_copy_to = datap; -+ goto out; -+ } -+ /* Encrypt, hash and write out. */ -+ aes_encrypt_intel(key->encr_ks.ks32, key->nr, -+ (const uint32_t *)cb, (uint32_t *)tmp); -+ -+ gcm_xor_avx(datap, tmp); -+ GHASH_AVX(ctx, tmp, block_size); -+ if (out != NULL) { -+ rv = crypto_put_output_data(tmp, out, block_size); -+ if (rv != CRYPTO_SUCCESS) { -+ goto out; -+ } -+ out->cd_offset += block_size; -+ } -+ gcm_incr_counter_block(ctx); -+ ctx->gcm_processed_data_len += block_size; -+ datap += block_size; -+ bleft -= block_size; -+ } -+out: -+ clear_fpu_regs(); -+ kfpu_end(); -+out_nofpu: -+ if (ct_buf != NULL) { -+ vmem_free(ct_buf, chunk_size); -+ } -+ return (rv); -+} -+ -+/* -+ * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual -+ * incomplete last block. Encrypt the ICB. Calculate the tag and write it out. 
-+ */
-+static int
-+gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
-+{
-+	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
-+	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
-+	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
-+	size_t rem_len = ctx->gcm_remainder_len;
-+	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
-+	int aes_rounds = ((aes_key_t *)keysched)->nr;
-+	int rv;
-+
-+	ASSERT(block_size == GCM_BLOCK_LEN);
-+
-+	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
-+		return (CRYPTO_DATA_LEN_RANGE);
-+	}
-+
-+	kfpu_begin();
-+	/* Pad last incomplete block with zeros, encrypt and hash. */
-+	if (rem_len > 0) {
-+		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
-+		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
-+
-+		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
-+		bzero(remainder + rem_len, block_size - rem_len);
-+		for (int i = 0; i < rem_len; i++) {
-+			remainder[i] ^= tmp[i];
-+		}
-+		GHASH_AVX(ctx, remainder, block_size);
-+		ctx->gcm_processed_data_len += rem_len;
-+		/* No need to increment counter_block, it's the last block. */
-+	}
-+	/* Finish tag. */
-+	ctx->gcm_len_a_len_c[1] =
-+	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
-+	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
-+	aes_encrypt_intel(keysched, aes_rounds, J0, J0);
-+
-+	gcm_xor_avx((uint8_t *)J0, ghash);
-+	clear_fpu_regs();
-+	kfpu_end();
-+
-+	/* Output remainder. */
-+	if (rem_len > 0) {
-+		rv = crypto_put_output_data(remainder, out, rem_len);
-+		if (rv != CRYPTO_SUCCESS)
-+			return (rv);
-+	}
-+	out->cd_offset += rem_len;
-+	ctx->gcm_remainder_len = 0;
-+	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
-+	if (rv != CRYPTO_SUCCESS)
-+		return (rv);
-+
-+	out->cd_offset += ctx->gcm_tag_len;
-+	/* Clear sensitive data in the context before returning. */
-+	gcm_clear_ctx(ctx);
-+	return (CRYPTO_SUCCESS);
-+}
-+
-+/*
-+ * Finalize decryption: so far we have only accumulated ciphertext;
-+ * now decrypt it in place.
-+ */
-+static int
-+gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
-+{
-+	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
-+	ASSERT3U(block_size, ==, 16);
-+
-+	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
-+	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
-+	uint8_t *datap = ctx->gcm_pt_buf;
-+	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
-+	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
-+	uint64_t *ghash = ctx->gcm_ghash;
-+	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
-+	int rv = CRYPTO_SUCCESS;
-+	size_t bleft, done;
-+
-+	/*
-+	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
-+	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES and a multiple
-+	 * of GCM_AVX_MIN_DECRYPT_BYTES.
-+	 */
-+	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
-+		kfpu_begin();
-+		done = aesni_gcm_decrypt(datap, datap, chunk_size,
-+		    (const void *)key, ctx->gcm_cb, ghash);
-+		clear_fpu_regs();
-+		kfpu_end();
-+		if (done != chunk_size) {
-+			return (CRYPTO_FAILED);
-+		}
-+		datap += done;
-+	}
-+	/* Decrypt the remainder, which is less than the chunk size, in one go. */
-+	kfpu_begin();
-+	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
-+		done = aesni_gcm_decrypt(datap, datap, bleft,
-+		    (const void *)key, ctx->gcm_cb, ghash);
-+		if (done == 0) {
-+			clear_fpu_regs();
-+			kfpu_end();
-+			return (CRYPTO_FAILED);
-+		}
-+		datap += done;
-+		bleft -= done;
-+	}
-+	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
-+
-+	/*
-+	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain;
-+	 * decrypt them block by block.
-+	 */
-+	while (bleft > 0) {
-+		/* Incomplete last block. */
-+		if (bleft < block_size) {
-+			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
-+
-+			bzero(lastb, block_size);
-+			bcopy(datap, lastb, bleft);
-+			/* The GCM processing. */
-+			GHASH_AVX(ctx, lastb, block_size);
-+			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
-+			for (size_t i = 0; i < bleft; i++) {
-+				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
-+			}
-+			break;
-+		}
-+		/* The GCM processing. */
-+		GHASH_AVX(ctx, datap, block_size);
-+		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
-+		gcm_xor_avx((uint8_t *)tmp, datap);
-+		gcm_incr_counter_block(ctx);
-+
-+		datap += block_size;
-+		bleft -= block_size;
-+	}
-+	if (rv != CRYPTO_SUCCESS) {
-+		clear_fpu_regs();
-+		kfpu_end();
-+		return (rv);
-+	}
-+	/* Decryption done, finish the tag. */
-+	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
-+	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
-+	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
-+	    (uint32_t *)ctx->gcm_J0);
-+
-+	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
-+
-+	/* We are done with the FPU, restore its state. */
-+	clear_fpu_regs();
-+	kfpu_end();
-+
-+	/* Compare the input authentication tag with what we calculated. */
-+	if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
-+		/* They don't match. */
-+		return (CRYPTO_INVALID_MAC);
-+	}
-+	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
-+	if (rv != CRYPTO_SUCCESS) {
-+		return (rv);
-+	}
-+	out->cd_offset += pt_len;
-+	gcm_clear_ctx(ctx);
-+	return (CRYPTO_SUCCESS);
-+}
-+
-+/*
-+ * Initialize the GCM params H, Htable and the counter block. Save the
-+ * initial counter block.
-+ */
-+static int
-+gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
-+    unsigned char *auth_data, size_t auth_data_len, size_t block_size)
-+{
-+	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
-+	uint64_t *H = ctx->gcm_H;
-+	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
-+	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
-+	uint8_t *datap = auth_data;
-+	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
-+	size_t bleft;
-+
-+	ASSERT(block_size == GCM_BLOCK_LEN);
-+
-+	/* Init H (encrypt zero block) and create the initial counter block. */
-+	bzero(ctx->gcm_ghash, sizeof (ctx->gcm_ghash));
-+	bzero(H, sizeof (ctx->gcm_H));
-+	kfpu_begin();
-+	aes_encrypt_intel(keysched, aes_rounds,
-+	    (const uint32_t *)H, (uint32_t *)H);
-+
-+	gcm_init_htab_avx(ctx->gcm_Htable, H);
-+
-+	if (iv_len == 12) {
-+		bcopy(iv, cb, 12);
-+		cb[12] = 0;
-+		cb[13] = 0;
-+		cb[14] = 0;
-+		cb[15] = 1;
-+		/* We need the ICB later. */
-+		bcopy(cb, ctx->gcm_J0, sizeof (ctx->gcm_J0));
-+	} else {
-+		/*
-+		 * Most consumers use 12-byte IVs, so it's OK to use the
-+		 * original routines for other IV sizes; just avoid nesting
-+		 * kfpu_begin calls.
-+		 */
-+		clear_fpu_regs();
-+		kfpu_end();
-+		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
-+		    aes_copy_block, aes_xor_block);
-+		kfpu_begin();
-+	}
-+
-+	/* OpenSSL post-increments the counter; adjust for that.
*/ -+ gcm_incr_counter_block(ctx); -+ -+ /* Ghash AAD in chunk_size blocks. */ -+ for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) { -+ GHASH_AVX(ctx, datap, chunk_size); -+ datap += chunk_size; -+ clear_fpu_regs(); -+ kfpu_end(); -+ kfpu_begin(); -+ } -+ /* Ghash the remainder and handle possible incomplete GCM block. */ -+ if (bleft > 0) { -+ size_t incomp = bleft % block_size; -+ -+ bleft -= incomp; -+ if (bleft > 0) { -+ GHASH_AVX(ctx, datap, bleft); -+ datap += bleft; -+ } -+ if (incomp > 0) { -+ /* Zero pad and hash incomplete last block. */ -+ uint8_t *authp = (uint8_t *)ctx->gcm_tmp; -+ -+ bzero(authp, block_size); -+ bcopy(datap, authp, incomp); -+ GHASH_AVX(ctx, authp, block_size); -+ } -+ } -+ clear_fpu_regs(); -+ kfpu_end(); -+ return (CRYPTO_SUCCESS); -+} -+ -+#if defined(_KERNEL) -+static int -+icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) -+{ -+ unsigned long val; -+ char val_rounded[16]; -+ int error = 0; -+ -+ error = kstrtoul(buf, 0, &val); -+ if (error) -+ return (error); -+ -+ val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; -+ -+ if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE) -+ return (-EINVAL); -+ -+ snprintf(val_rounded, 16, "%u", (uint32_t)val); -+ error = param_set_uint(val_rounded, kp); -+ return (error); -+} -+ -+module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size, -+ param_get_uint, &gcm_avx_chunk_size, 0644); -+ -+MODULE_PARM_DESC(icp_gcm_avx_chunk_size, -+ "How many bytes to process while owning the FPU"); -+ -+#endif /* defined(__KERNEL) */ -+#endif /* ifdef CAN_USE_GCM_ASM */ -Index: zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams -=================================================================== ---- /dev/null -+++ zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams -@@ -0,0 +1,36 @@ -+Copyright (c) 2006-2017, CRYPTOGAMS by -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions -+are met: -+ -+ * Redistributions of source code must retain copyright notices, -+ this list of conditions and the following disclaimer. -+ -+ * Redistributions in binary form must reproduce the above -+ copyright notice, this list of conditions and the following -+ disclaimer in the documentation and/or other materials -+ provided with the distribution. -+ -+ * Neither the name of the CRYPTOGAMS nor the names of its -+ copyright holder and contributors may be used to endorse or -+ promote products derived from this software without specific -+ prior written permission. -+ -+ALTERNATIVELY, provided that this notice is retained in full, this -+product may be distributed under the terms of the GNU General Public -+License (GPL), in which case the provisions of the GPL apply INSTEAD OF -+those given above. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS -+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -+A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -Index: zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip -=================================================================== ---- /dev/null -+++ zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip -@@ -0,0 +1 @@ -+PORTIONS OF GCM and GHASH FUNCTIONALITY -Index: zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl -=================================================================== ---- /dev/null -+++ zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl -@@ -0,0 +1,177 @@ -+ -+ Apache License -+ Version 2.0, January 2004 -+ https://www.apache.org/licenses/ -+ -+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION -+ -+ 1. Definitions. -+ -+ "License" shall mean the terms and conditions for use, reproduction, -+ and distribution as defined by Sections 1 through 9 of this document. -+ -+ "Licensor" shall mean the copyright owner or entity authorized by -+ the copyright owner that is granting the License. -+ -+ "Legal Entity" shall mean the union of the acting entity and all -+ other entities that control, are controlled by, or are under common -+ control with that entity. For the purposes of this definition, -+ "control" means (i) the power, direct or indirect, to cause the -+ direction or management of such entity, whether by contract or -+ otherwise, or (ii) ownership of fifty percent (50%) or more of the -+ outstanding shares, or (iii) beneficial ownership of such entity. -+ -+ "You" (or "Your") shall mean an individual or Legal Entity -+ exercising permissions granted by this License. -+ -+ "Source" form shall mean the preferred form for making modifications, -+ including but not limited to software source code, documentation -+ source, and configuration files. -+ -+ "Object" form shall mean any form resulting from mechanical -+ transformation or translation of a Source form, including but -+ not limited to compiled object code, generated documentation, -+ and conversions to other media types. -+ -+ "Work" shall mean the work of authorship, whether in Source or -+ Object form, made available under the License, as indicated by a -+ copyright notice that is included in or attached to the work -+ (an example is provided in the Appendix below). -+ -+ "Derivative Works" shall mean any work, whether in Source or Object -+ form, that is based on (or derived from) the Work and for which the -+ editorial revisions, annotations, elaborations, or other modifications -+ represent, as a whole, an original work of authorship. For the purposes -+ of this License, Derivative Works shall not include works that remain -+ separable from, or merely link (or bind by name) to the interfaces of, -+ the Work and Derivative Works thereof. 
-+ -+ "Contribution" shall mean any work of authorship, including -+ the original version of the Work and any modifications or additions -+ to that Work or Derivative Works thereof, that is intentionally -+ submitted to Licensor for inclusion in the Work by the copyright owner -+ or by an individual or Legal Entity authorized to submit on behalf of -+ the copyright owner. For the purposes of this definition, "submitted" -+ means any form of electronic, verbal, or written communication sent -+ to the Licensor or its representatives, including but not limited to -+ communication on electronic mailing lists, source code control systems, -+ and issue tracking systems that are managed by, or on behalf of, the -+ Licensor for the purpose of discussing and improving the Work, but -+ excluding communication that is conspicuously marked or otherwise -+ designated in writing by the copyright owner as "Not a Contribution." -+ -+ "Contributor" shall mean Licensor and any individual or Legal Entity -+ on behalf of whom a Contribution has been received by Licensor and -+ subsequently incorporated within the Work. -+ -+ 2. Grant of Copyright License. Subject to the terms and conditions of -+ this License, each Contributor hereby grants to You a perpetual, -+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable -+ copyright license to reproduce, prepare Derivative Works of, -+ publicly display, publicly perform, sublicense, and distribute the -+ Work and such Derivative Works in Source or Object form. -+ -+ 3. Grant of Patent License. Subject to the terms and conditions of -+ this License, each Contributor hereby grants to You a perpetual, -+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable -+ (except as stated in this section) patent license to make, have made, -+ use, offer to sell, sell, import, and otherwise transfer the Work, -+ where such license applies only to those patent claims licensable -+ by such Contributor that are necessarily infringed by their -+ Contribution(s) alone or by combination of their Contribution(s) -+ with the Work to which such Contribution(s) was submitted. If You -+ institute patent litigation against any entity (including a -+ cross-claim or counterclaim in a lawsuit) alleging that the Work -+ or a Contribution incorporated within the Work constitutes direct -+ or contributory patent infringement, then any patent licenses -+ granted to You under this License for that Work shall terminate -+ as of the date such litigation is filed. -+ -+ 4. Redistribution. 
You may reproduce and distribute copies of the -+ Work or Derivative Works thereof in any medium, with or without -+ modifications, and in Source or Object form, provided that You -+ meet the following conditions: -+ -+ (a) You must give any other recipients of the Work or -+ Derivative Works a copy of this License; and -+ -+ (b) You must cause any modified files to carry prominent notices -+ stating that You changed the files; and -+ -+ (c) You must retain, in the Source form of any Derivative Works -+ that You distribute, all copyright, patent, trademark, and -+ attribution notices from the Source form of the Work, -+ excluding those notices that do not pertain to any part of -+ the Derivative Works; and -+ -+ (d) If the Work includes a "NOTICE" text file as part of its -+ distribution, then any Derivative Works that You distribute must -+ include a readable copy of the attribution notices contained -+ within such NOTICE file, excluding those notices that do not -+ pertain to any part of the Derivative Works, in at least one -+ of the following places: within a NOTICE text file distributed -+ as part of the Derivative Works; within the Source form or -+ documentation, if provided along with the Derivative Works; or, -+ within a display generated by the Derivative Works, if and -+ wherever such third-party notices normally appear. The contents -+ of the NOTICE file are for informational purposes only and -+ do not modify the License. You may add Your own attribution -+ notices within Derivative Works that You distribute, alongside -+ or as an addendum to the NOTICE text from the Work, provided -+ that such additional attribution notices cannot be construed -+ as modifying the License. -+ -+ You may add Your own copyright statement to Your modifications and -+ may provide additional or different license terms and conditions -+ for use, reproduction, or distribution of Your modifications, or -+ for any such Derivative Works as a whole, provided Your use, -+ reproduction, and distribution of the Work otherwise complies with -+ the conditions stated in this License. -+ -+ 5. Submission of Contributions. Unless You explicitly state otherwise, -+ any Contribution intentionally submitted for inclusion in the Work -+ by You to the Licensor shall be under the terms and conditions of -+ this License, without any additional terms or conditions. -+ Notwithstanding the above, nothing herein shall supersede or modify -+ the terms of any separate license agreement you may have executed -+ with Licensor regarding such Contributions. -+ -+ 6. Trademarks. This License does not grant permission to use the trade -+ names, trademarks, service marks, or product names of the Licensor, -+ except as required for reasonable and customary use in describing the -+ origin of the Work and reproducing the content of the NOTICE file. -+ -+ 7. Disclaimer of Warranty. Unless required by applicable law or -+ agreed to in writing, Licensor provides the Work (and each -+ Contributor provides its Contributions) on an "AS IS" BASIS, -+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -+ implied, including, without limitation, any warranties or conditions -+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A -+ PARTICULAR PURPOSE. You are solely responsible for determining the -+ appropriateness of using or redistributing the Work and assume any -+ risks associated with Your exercise of permissions under this License. -+ -+ 8. Limitation of Liability. 
In no event and under no legal theory, -+ whether in tort (including negligence), contract, or otherwise, -+ unless required by applicable law (such as deliberate and grossly -+ negligent acts) or agreed to in writing, shall any Contributor be -+ liable to You for damages, including any direct, indirect, special, -+ incidental, or consequential damages of any character arising as a -+ result of this License or out of the use or inability to use the -+ Work (including but not limited to damages for loss of goodwill, -+ work stoppage, computer failure or malfunction, or any and all -+ other commercial damages or losses), even if such Contributor -+ has been advised of the possibility of such damages. -+ -+ 9. Accepting Warranty or Additional Liability. While redistributing -+ the Work or Derivative Works thereof, You may choose to offer, -+ and charge a fee for, acceptance of support, warranty, indemnity, -+ or other liability obligations and/or rights consistent with this -+ License. However, in accepting such obligations, You may act only -+ on Your own behalf and on Your sole responsibility, not on behalf -+ of any other Contributor, and only if You agree to indemnify, -+ defend, and hold each Contributor harmless for any liability -+ incurred by, or claims asserted against, such Contributor by reason -+ of your accepting any such warranty or additional liability. -+ -+ END OF TERMS AND CONDITIONS -Index: zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip -=================================================================== ---- /dev/null -+++ zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip -@@ -0,0 +1 @@ -+PORTIONS OF GCM and GHASH FUNCTIONALITY -Index: zfs-linux-0.8.3/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S -=================================================================== ---- /dev/null -+++ zfs-linux-0.8.3/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S -@@ -0,0 +1,892 @@ -+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. -+# -+# Licensed under the Apache License 2.0 (the "License"). You may not use -+# this file except in compliance with the License. You can obtain a copy -+# in the file LICENSE in the source distribution or at -+# https://www.openssl.org/source/license.html -+ -+# -+# ==================================================================== -+# Written by Andy Polyakov for the OpenSSL -+# project. The module is, however, dual licensed under OpenSSL and -+# CRYPTOGAMS licenses depending on where you obtain it. For further -+# details see http://www.openssl.org/~appro/cryptogams/. -+# ==================================================================== -+# -+# -+# AES-NI-CTR+GHASH stitch. -+# -+# February 2013 -+# -+# OpenSSL GCM implementation is organized in such way that its -+# performance is rather close to the sum of its streamed components, -+# in the context parallelized AES-NI CTR and modulo-scheduled -+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation -+# was observed to perform significantly better than the sum of the -+# components on contemporary CPUs, the effort was deemed impossible to -+# justify. This module is based on combination of Intel submissions, -+# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max -+# Locktyukhin of Intel Corp. 
who verified that it reduces shuffles -+# pressure with notable relative improvement, achieving 1.0 cycle per -+# byte processed with 128-bit key on Haswell processor, 0.74 - on -+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled -+# measurements for favourable packet size, one divisible by 96. -+# Applications using the EVP interface will observe a few percent -+# worse performance.] -+# -+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). -+# -+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest -+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf -+ -+# Generated once from -+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl -+# and modified for ICP. Modification are kept at a bare minimum to ease later -+# upstream merges. -+ -+#if defined(__x86_64__) && defined(HAVE_AVX) && \ -+ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) && defined(HAVE_MOVBE) -+ -+.text -+ -+.type _aesni_ctr32_ghash_6x,@function -+.align 32 -+_aesni_ctr32_ghash_6x: -+ vmovdqu 32(%r11),%xmm2 -+ subq $6,%rdx -+ vpxor %xmm4,%xmm4,%xmm4 -+ vmovdqu 0-128(%rcx),%xmm15 -+ vpaddb %xmm2,%xmm1,%xmm10 -+ vpaddb %xmm2,%xmm10,%xmm11 -+ vpaddb %xmm2,%xmm11,%xmm12 -+ vpaddb %xmm2,%xmm12,%xmm13 -+ vpaddb %xmm2,%xmm13,%xmm14 -+ vpxor %xmm15,%xmm1,%xmm9 -+ vmovdqu %xmm4,16+8(%rsp) -+ jmp .Loop6x -+ -+.align 32 -+.Loop6x: -+ addl $100663296,%ebx -+ jc .Lhandle_ctr32 -+ vmovdqu 0-32(%r9),%xmm3 -+ vpaddb %xmm2,%xmm14,%xmm1 -+ vpxor %xmm15,%xmm10,%xmm10 -+ vpxor %xmm15,%xmm11,%xmm11 -+ -+.Lresume_ctr32: -+ vmovdqu %xmm1,(%r8) -+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 -+ vpxor %xmm15,%xmm12,%xmm12 -+ vmovups 16-128(%rcx),%xmm2 -+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 -+ xorq %r12,%r12 -+ cmpq %r14,%r15 -+ -+ vaesenc %xmm2,%xmm9,%xmm9 -+ vmovdqu 48+8(%rsp),%xmm0 -+ vpxor %xmm15,%xmm13,%xmm13 -+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 -+ vaesenc %xmm2,%xmm10,%xmm10 -+ vpxor %xmm15,%xmm14,%xmm14 -+ setnc %r12b -+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 -+ vaesenc %xmm2,%xmm11,%xmm11 -+ vmovdqu 16-32(%r9),%xmm3 -+ negq %r12 -+ vaesenc %xmm2,%xmm12,%xmm12 -+ vpxor %xmm5,%xmm6,%xmm6 -+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 -+ vpxor %xmm4,%xmm8,%xmm8 -+ vaesenc %xmm2,%xmm13,%xmm13 -+ vpxor %xmm5,%xmm1,%xmm4 -+ andq $0x60,%r12 -+ vmovups 32-128(%rcx),%xmm15 -+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 -+ vaesenc %xmm2,%xmm14,%xmm14 -+ -+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 -+ leaq (%r14,%r12,1),%r14 -+ vaesenc %xmm15,%xmm9,%xmm9 -+ vpxor 16+8(%rsp),%xmm8,%xmm8 -+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 -+ vmovdqu 64+8(%rsp),%xmm0 -+ vaesenc %xmm15,%xmm10,%xmm10 -+ movbeq 88(%r14),%r13 -+ vaesenc %xmm15,%xmm11,%xmm11 -+ movbeq 80(%r14),%r12 -+ vaesenc %xmm15,%xmm12,%xmm12 -+ movq %r13,32+8(%rsp) -+ vaesenc %xmm15,%xmm13,%xmm13 -+ movq %r12,40+8(%rsp) -+ vmovdqu 48-32(%r9),%xmm5 -+ vaesenc %xmm15,%xmm14,%xmm14 -+ -+ vmovups 48-128(%rcx),%xmm15 -+ vpxor %xmm1,%xmm6,%xmm6 -+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 -+ vaesenc %xmm15,%xmm9,%xmm9 -+ vpxor %xmm2,%xmm6,%xmm6 -+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 -+ vaesenc %xmm15,%xmm10,%xmm10 -+ vpxor %xmm3,%xmm7,%xmm7 -+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 -+ vaesenc %xmm15,%xmm11,%xmm11 -+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 -+ vmovdqu 80+8(%rsp),%xmm0 -+ vaesenc %xmm15,%xmm12,%xmm12 -+ vaesenc %xmm15,%xmm13,%xmm13 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vmovdqu 64-32(%r9),%xmm1 -+ vaesenc %xmm15,%xmm14,%xmm14 -+ -+ vmovups 64-128(%rcx),%xmm15 -+ vpxor %xmm2,%xmm6,%xmm6 -+ vpclmulqdq 
$0x00,%xmm1,%xmm0,%xmm2 -+ vaesenc %xmm15,%xmm9,%xmm9 -+ vpxor %xmm3,%xmm6,%xmm6 -+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 -+ vaesenc %xmm15,%xmm10,%xmm10 -+ movbeq 72(%r14),%r13 -+ vpxor %xmm5,%xmm7,%xmm7 -+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 -+ vaesenc %xmm15,%xmm11,%xmm11 -+ movbeq 64(%r14),%r12 -+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 -+ vmovdqu 96+8(%rsp),%xmm0 -+ vaesenc %xmm15,%xmm12,%xmm12 -+ movq %r13,48+8(%rsp) -+ vaesenc %xmm15,%xmm13,%xmm13 -+ movq %r12,56+8(%rsp) -+ vpxor %xmm2,%xmm4,%xmm4 -+ vmovdqu 96-32(%r9),%xmm2 -+ vaesenc %xmm15,%xmm14,%xmm14 -+ -+ vmovups 80-128(%rcx),%xmm15 -+ vpxor %xmm3,%xmm6,%xmm6 -+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 -+ vaesenc %xmm15,%xmm9,%xmm9 -+ vpxor %xmm5,%xmm6,%xmm6 -+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 -+ vaesenc %xmm15,%xmm10,%xmm10 -+ movbeq 56(%r14),%r13 -+ vpxor %xmm1,%xmm7,%xmm7 -+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 -+ vpxor 112+8(%rsp),%xmm8,%xmm8 -+ vaesenc %xmm15,%xmm11,%xmm11 -+ movbeq 48(%r14),%r12 -+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 -+ vaesenc %xmm15,%xmm12,%xmm12 -+ movq %r13,64+8(%rsp) -+ vaesenc %xmm15,%xmm13,%xmm13 -+ movq %r12,72+8(%rsp) -+ vpxor %xmm3,%xmm4,%xmm4 -+ vmovdqu 112-32(%r9),%xmm3 -+ vaesenc %xmm15,%xmm14,%xmm14 -+ -+ vmovups 96-128(%rcx),%xmm15 -+ vpxor %xmm5,%xmm6,%xmm6 -+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 -+ vaesenc %xmm15,%xmm9,%xmm9 -+ vpxor %xmm1,%xmm6,%xmm6 -+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 -+ vaesenc %xmm15,%xmm10,%xmm10 -+ movbeq 40(%r14),%r13 -+ vpxor %xmm2,%xmm7,%xmm7 -+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 -+ vaesenc %xmm15,%xmm11,%xmm11 -+ movbeq 32(%r14),%r12 -+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 -+ vaesenc %xmm15,%xmm12,%xmm12 -+ movq %r13,80+8(%rsp) -+ vaesenc %xmm15,%xmm13,%xmm13 -+ movq %r12,88+8(%rsp) -+ vpxor %xmm5,%xmm6,%xmm6 -+ vaesenc %xmm15,%xmm14,%xmm14 -+ vpxor %xmm1,%xmm6,%xmm6 -+ -+ vmovups 112-128(%rcx),%xmm15 -+ vpslldq $8,%xmm6,%xmm5 -+ vpxor %xmm2,%xmm4,%xmm4 -+ vmovdqu 16(%r11),%xmm3 -+ -+ vaesenc %xmm15,%xmm9,%xmm9 -+ vpxor %xmm8,%xmm7,%xmm7 -+ vaesenc %xmm15,%xmm10,%xmm10 -+ vpxor %xmm5,%xmm4,%xmm4 -+ movbeq 24(%r14),%r13 -+ vaesenc %xmm15,%xmm11,%xmm11 -+ movbeq 16(%r14),%r12 -+ vpalignr $8,%xmm4,%xmm4,%xmm0 -+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 -+ movq %r13,96+8(%rsp) -+ vaesenc %xmm15,%xmm12,%xmm12 -+ movq %r12,104+8(%rsp) -+ vaesenc %xmm15,%xmm13,%xmm13 -+ vmovups 128-128(%rcx),%xmm1 -+ vaesenc %xmm15,%xmm14,%xmm14 -+ -+ vaesenc %xmm1,%xmm9,%xmm9 -+ vmovups 144-128(%rcx),%xmm15 -+ vaesenc %xmm1,%xmm10,%xmm10 -+ vpsrldq $8,%xmm6,%xmm6 -+ vaesenc %xmm1,%xmm11,%xmm11 -+ vpxor %xmm6,%xmm7,%xmm7 -+ vaesenc %xmm1,%xmm12,%xmm12 -+ vpxor %xmm0,%xmm4,%xmm4 -+ movbeq 8(%r14),%r13 -+ vaesenc %xmm1,%xmm13,%xmm13 -+ movbeq 0(%r14),%r12 -+ vaesenc %xmm1,%xmm14,%xmm14 -+ vmovups 160-128(%rcx),%xmm1 -+ cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. -+ jb .Lenc_tail -+ -+ vaesenc %xmm15,%xmm9,%xmm9 -+ vaesenc %xmm15,%xmm10,%xmm10 -+ vaesenc %xmm15,%xmm11,%xmm11 -+ vaesenc %xmm15,%xmm12,%xmm12 -+ vaesenc %xmm15,%xmm13,%xmm13 -+ vaesenc %xmm15,%xmm14,%xmm14 -+ -+ vaesenc %xmm1,%xmm9,%xmm9 -+ vaesenc %xmm1,%xmm10,%xmm10 -+ vaesenc %xmm1,%xmm11,%xmm11 -+ vaesenc %xmm1,%xmm12,%xmm12 -+ vaesenc %xmm1,%xmm13,%xmm13 -+ vmovups 176-128(%rcx),%xmm15 -+ vaesenc %xmm1,%xmm14,%xmm14 -+ vmovups 192-128(%rcx),%xmm1 -+ cmpl $14,%ebp // ICP does not zero key schedule. 
-+ jb .Lenc_tail -+ -+ vaesenc %xmm15,%xmm9,%xmm9 -+ vaesenc %xmm15,%xmm10,%xmm10 -+ vaesenc %xmm15,%xmm11,%xmm11 -+ vaesenc %xmm15,%xmm12,%xmm12 -+ vaesenc %xmm15,%xmm13,%xmm13 -+ vaesenc %xmm15,%xmm14,%xmm14 -+ -+ vaesenc %xmm1,%xmm9,%xmm9 -+ vaesenc %xmm1,%xmm10,%xmm10 -+ vaesenc %xmm1,%xmm11,%xmm11 -+ vaesenc %xmm1,%xmm12,%xmm12 -+ vaesenc %xmm1,%xmm13,%xmm13 -+ vmovups 208-128(%rcx),%xmm15 -+ vaesenc %xmm1,%xmm14,%xmm14 -+ vmovups 224-128(%rcx),%xmm1 -+ jmp .Lenc_tail -+ -+.align 32 -+.Lhandle_ctr32: -+ vmovdqu (%r11),%xmm0 -+ vpshufb %xmm0,%xmm1,%xmm6 -+ vmovdqu 48(%r11),%xmm5 -+ vpaddd 64(%r11),%xmm6,%xmm10 -+ vpaddd %xmm5,%xmm6,%xmm11 -+ vmovdqu 0-32(%r9),%xmm3 -+ vpaddd %xmm5,%xmm10,%xmm12 -+ vpshufb %xmm0,%xmm10,%xmm10 -+ vpaddd %xmm5,%xmm11,%xmm13 -+ vpshufb %xmm0,%xmm11,%xmm11 -+ vpxor %xmm15,%xmm10,%xmm10 -+ vpaddd %xmm5,%xmm12,%xmm14 -+ vpshufb %xmm0,%xmm12,%xmm12 -+ vpxor %xmm15,%xmm11,%xmm11 -+ vpaddd %xmm5,%xmm13,%xmm1 -+ vpshufb %xmm0,%xmm13,%xmm13 -+ vpshufb %xmm0,%xmm14,%xmm14 -+ vpshufb %xmm0,%xmm1,%xmm1 -+ jmp .Lresume_ctr32 -+ -+.align 32 -+.Lenc_tail: -+ vaesenc %xmm15,%xmm9,%xmm9 -+ vmovdqu %xmm7,16+8(%rsp) -+ vpalignr $8,%xmm4,%xmm4,%xmm8 -+ vaesenc %xmm15,%xmm10,%xmm10 -+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 -+ vpxor 0(%rdi),%xmm1,%xmm2 -+ vaesenc %xmm15,%xmm11,%xmm11 -+ vpxor 16(%rdi),%xmm1,%xmm0 -+ vaesenc %xmm15,%xmm12,%xmm12 -+ vpxor 32(%rdi),%xmm1,%xmm5 -+ vaesenc %xmm15,%xmm13,%xmm13 -+ vpxor 48(%rdi),%xmm1,%xmm6 -+ vaesenc %xmm15,%xmm14,%xmm14 -+ vpxor 64(%rdi),%xmm1,%xmm7 -+ vpxor 80(%rdi),%xmm1,%xmm3 -+ vmovdqu (%r8),%xmm1 -+ -+ vaesenclast %xmm2,%xmm9,%xmm9 -+ vmovdqu 32(%r11),%xmm2 -+ vaesenclast %xmm0,%xmm10,%xmm10 -+ vpaddb %xmm2,%xmm1,%xmm0 -+ movq %r13,112+8(%rsp) -+ leaq 96(%rdi),%rdi -+ vaesenclast %xmm5,%xmm11,%xmm11 -+ vpaddb %xmm2,%xmm0,%xmm5 -+ movq %r12,120+8(%rsp) -+ leaq 96(%rsi),%rsi -+ vmovdqu 0-128(%rcx),%xmm15 -+ vaesenclast %xmm6,%xmm12,%xmm12 -+ vpaddb %xmm2,%xmm5,%xmm6 -+ vaesenclast %xmm7,%xmm13,%xmm13 -+ vpaddb %xmm2,%xmm6,%xmm7 -+ vaesenclast %xmm3,%xmm14,%xmm14 -+ vpaddb %xmm2,%xmm7,%xmm3 -+ -+ addq $0x60,%r10 -+ subq $0x6,%rdx -+ jc .L6x_done -+ -+ vmovups %xmm9,-96(%rsi) -+ vpxor %xmm15,%xmm1,%xmm9 -+ vmovups %xmm10,-80(%rsi) -+ vmovdqa %xmm0,%xmm10 -+ vmovups %xmm11,-64(%rsi) -+ vmovdqa %xmm5,%xmm11 -+ vmovups %xmm12,-48(%rsi) -+ vmovdqa %xmm6,%xmm12 -+ vmovups %xmm13,-32(%rsi) -+ vmovdqa %xmm7,%xmm13 -+ vmovups %xmm14,-16(%rsi) -+ vmovdqa %xmm3,%xmm14 -+ vmovdqu 32+8(%rsp),%xmm7 -+ jmp .Loop6x -+ -+.L6x_done: -+ vpxor 16+8(%rsp),%xmm8,%xmm8 -+ vpxor %xmm4,%xmm8,%xmm8 -+ -+ .byte 0xf3,0xc3 -+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x -+.globl aesni_gcm_decrypt -+.type aesni_gcm_decrypt,@function -+.align 32 -+aesni_gcm_decrypt: -+.cfi_startproc -+ xorq %r10,%r10 -+ cmpq $0x60,%rdx -+ jb .Lgcm_dec_abort -+ -+ leaq (%rsp),%rax -+.cfi_def_cfa_register %rax -+ pushq %rbx -+.cfi_offset %rbx,-16 -+ pushq %rbp -+.cfi_offset %rbp,-24 -+ pushq %r12 -+.cfi_offset %r12,-32 -+ pushq %r13 -+.cfi_offset %r13,-40 -+ pushq %r14 -+.cfi_offset %r14,-48 -+ pushq %r15 -+.cfi_offset %r15,-56 -+ vzeroupper -+ -+ vmovdqu (%r8),%xmm1 -+ addq $-128,%rsp -+ movl 12(%r8),%ebx -+ leaq .Lbswap_mask(%rip),%r11 -+ leaq -128(%rcx),%r14 -+ movq $0xf80,%r15 -+ vmovdqu (%r9),%xmm8 -+ andq $-128,%rsp -+ vmovdqu (%r11),%xmm0 -+ leaq 128(%rcx),%rcx -+ leaq 32+32(%r9),%r9 -+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. 
-+ vpshufb %xmm0,%xmm8,%xmm8 -+ -+ andq %r15,%r14 -+ andq %rsp,%r15 -+ subq %r14,%r15 -+ jc .Ldec_no_key_aliasing -+ cmpq $768,%r15 -+ jnc .Ldec_no_key_aliasing -+ subq %r15,%rsp -+.Ldec_no_key_aliasing: -+ -+ vmovdqu 80(%rdi),%xmm7 -+ leaq (%rdi),%r14 -+ vmovdqu 64(%rdi),%xmm4 -+ leaq -192(%rdi,%rdx,1),%r15 -+ vmovdqu 48(%rdi),%xmm5 -+ shrq $4,%rdx -+ xorq %r10,%r10 -+ vmovdqu 32(%rdi),%xmm6 -+ vpshufb %xmm0,%xmm7,%xmm7 -+ vmovdqu 16(%rdi),%xmm2 -+ vpshufb %xmm0,%xmm4,%xmm4 -+ vmovdqu (%rdi),%xmm3 -+ vpshufb %xmm0,%xmm5,%xmm5 -+ vmovdqu %xmm4,48(%rsp) -+ vpshufb %xmm0,%xmm6,%xmm6 -+ vmovdqu %xmm5,64(%rsp) -+ vpshufb %xmm0,%xmm2,%xmm2 -+ vmovdqu %xmm6,80(%rsp) -+ vpshufb %xmm0,%xmm3,%xmm3 -+ vmovdqu %xmm2,96(%rsp) -+ vmovdqu %xmm3,112(%rsp) -+ -+ call _aesni_ctr32_ghash_6x -+ -+ vmovups %xmm9,-96(%rsi) -+ vmovups %xmm10,-80(%rsi) -+ vmovups %xmm11,-64(%rsi) -+ vmovups %xmm12,-48(%rsi) -+ vmovups %xmm13,-32(%rsi) -+ vmovups %xmm14,-16(%rsi) -+ -+ vpshufb (%r11),%xmm8,%xmm8 -+ vmovdqu %xmm8,-64(%r9) -+ -+ vzeroupper -+ movq -48(%rax),%r15 -+.cfi_restore %r15 -+ movq -40(%rax),%r14 -+.cfi_restore %r14 -+ movq -32(%rax),%r13 -+.cfi_restore %r13 -+ movq -24(%rax),%r12 -+.cfi_restore %r12 -+ movq -16(%rax),%rbp -+.cfi_restore %rbp -+ movq -8(%rax),%rbx -+.cfi_restore %rbx -+ leaq (%rax),%rsp -+.cfi_def_cfa_register %rsp -+.Lgcm_dec_abort: -+ movq %r10,%rax -+ .byte 0xf3,0xc3 -+.cfi_endproc -+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt -+.type _aesni_ctr32_6x,@function -+.align 32 -+_aesni_ctr32_6x: -+ vmovdqu 0-128(%rcx),%xmm4 -+ vmovdqu 32(%r11),%xmm2 -+ leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. -+ vmovups 16-128(%rcx),%xmm15 -+ leaq 32-128(%rcx),%r12 -+ vpxor %xmm4,%xmm1,%xmm9 -+ addl $100663296,%ebx -+ jc .Lhandle_ctr32_2 -+ vpaddb %xmm2,%xmm1,%xmm10 -+ vpaddb %xmm2,%xmm10,%xmm11 -+ vpxor %xmm4,%xmm10,%xmm10 -+ vpaddb %xmm2,%xmm11,%xmm12 -+ vpxor %xmm4,%xmm11,%xmm11 -+ vpaddb %xmm2,%xmm12,%xmm13 -+ vpxor %xmm4,%xmm12,%xmm12 -+ vpaddb %xmm2,%xmm13,%xmm14 -+ vpxor %xmm4,%xmm13,%xmm13 -+ vpaddb %xmm2,%xmm14,%xmm1 -+ vpxor %xmm4,%xmm14,%xmm14 -+ jmp .Loop_ctr32 -+ -+.align 16 -+.Loop_ctr32: -+ vaesenc %xmm15,%xmm9,%xmm9 -+ vaesenc %xmm15,%xmm10,%xmm10 -+ vaesenc %xmm15,%xmm11,%xmm11 -+ vaesenc %xmm15,%xmm12,%xmm12 -+ vaesenc %xmm15,%xmm13,%xmm13 -+ vaesenc %xmm15,%xmm14,%xmm14 -+ vmovups (%r12),%xmm15 -+ leaq 16(%r12),%r12 -+ decl %r13d -+ jnz .Loop_ctr32 -+ -+ vmovdqu (%r12),%xmm3 -+ vaesenc %xmm15,%xmm9,%xmm9 -+ vpxor 0(%rdi),%xmm3,%xmm4 -+ vaesenc %xmm15,%xmm10,%xmm10 -+ vpxor 16(%rdi),%xmm3,%xmm5 -+ vaesenc %xmm15,%xmm11,%xmm11 -+ vpxor 32(%rdi),%xmm3,%xmm6 -+ vaesenc %xmm15,%xmm12,%xmm12 -+ vpxor 48(%rdi),%xmm3,%xmm8 -+ vaesenc %xmm15,%xmm13,%xmm13 -+ vpxor 64(%rdi),%xmm3,%xmm2 -+ vaesenc %xmm15,%xmm14,%xmm14 -+ vpxor 80(%rdi),%xmm3,%xmm3 -+ leaq 96(%rdi),%rdi -+ -+ vaesenclast %xmm4,%xmm9,%xmm9 -+ vaesenclast %xmm5,%xmm10,%xmm10 -+ vaesenclast %xmm6,%xmm11,%xmm11 -+ vaesenclast %xmm8,%xmm12,%xmm12 -+ vaesenclast %xmm2,%xmm13,%xmm13 -+ vaesenclast %xmm3,%xmm14,%xmm14 -+ vmovups %xmm9,0(%rsi) -+ vmovups %xmm10,16(%rsi) -+ vmovups %xmm11,32(%rsi) -+ vmovups %xmm12,48(%rsi) -+ vmovups %xmm13,64(%rsi) -+ vmovups %xmm14,80(%rsi) -+ leaq 96(%rsi),%rsi -+ -+ .byte 0xf3,0xc3 -+.align 32 -+.Lhandle_ctr32_2: -+ vpshufb %xmm0,%xmm1,%xmm6 -+ vmovdqu 48(%r11),%xmm5 -+ vpaddd 64(%r11),%xmm6,%xmm10 -+ vpaddd %xmm5,%xmm6,%xmm11 -+ vpaddd %xmm5,%xmm10,%xmm12 -+ vpshufb %xmm0,%xmm10,%xmm10 -+ vpaddd %xmm5,%xmm11,%xmm13 -+ vpshufb %xmm0,%xmm11,%xmm11 -+ vpxor %xmm4,%xmm10,%xmm10 -+ vpaddd 
%xmm5,%xmm12,%xmm14 -+ vpshufb %xmm0,%xmm12,%xmm12 -+ vpxor %xmm4,%xmm11,%xmm11 -+ vpaddd %xmm5,%xmm13,%xmm1 -+ vpshufb %xmm0,%xmm13,%xmm13 -+ vpxor %xmm4,%xmm12,%xmm12 -+ vpshufb %xmm0,%xmm14,%xmm14 -+ vpxor %xmm4,%xmm13,%xmm13 -+ vpshufb %xmm0,%xmm1,%xmm1 -+ vpxor %xmm4,%xmm14,%xmm14 -+ jmp .Loop_ctr32 -+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x -+ -+.globl aesni_gcm_encrypt -+.type aesni_gcm_encrypt,@function -+.align 32 -+aesni_gcm_encrypt: -+.cfi_startproc -+ xorq %r10,%r10 -+ cmpq $288,%rdx -+ jb .Lgcm_enc_abort -+ -+ leaq (%rsp),%rax -+.cfi_def_cfa_register %rax -+ pushq %rbx -+.cfi_offset %rbx,-16 -+ pushq %rbp -+.cfi_offset %rbp,-24 -+ pushq %r12 -+.cfi_offset %r12,-32 -+ pushq %r13 -+.cfi_offset %r13,-40 -+ pushq %r14 -+.cfi_offset %r14,-48 -+ pushq %r15 -+.cfi_offset %r15,-56 -+ vzeroupper -+ -+ vmovdqu (%r8),%xmm1 -+ addq $-128,%rsp -+ movl 12(%r8),%ebx -+ leaq .Lbswap_mask(%rip),%r11 -+ leaq -128(%rcx),%r14 -+ movq $0xf80,%r15 -+ leaq 128(%rcx),%rcx -+ vmovdqu (%r11),%xmm0 -+ andq $-128,%rsp -+ movl 504-128(%rcx),%ebp // ICP has an larger offset for rounds. -+ -+ andq %r15,%r14 -+ andq %rsp,%r15 -+ subq %r14,%r15 -+ jc .Lenc_no_key_aliasing -+ cmpq $768,%r15 -+ jnc .Lenc_no_key_aliasing -+ subq %r15,%rsp -+.Lenc_no_key_aliasing: -+ -+ leaq (%rsi),%r14 -+ leaq -192(%rsi,%rdx,1),%r15 -+ shrq $4,%rdx -+ -+ call _aesni_ctr32_6x -+ vpshufb %xmm0,%xmm9,%xmm8 -+ vpshufb %xmm0,%xmm10,%xmm2 -+ vmovdqu %xmm8,112(%rsp) -+ vpshufb %xmm0,%xmm11,%xmm4 -+ vmovdqu %xmm2,96(%rsp) -+ vpshufb %xmm0,%xmm12,%xmm5 -+ vmovdqu %xmm4,80(%rsp) -+ vpshufb %xmm0,%xmm13,%xmm6 -+ vmovdqu %xmm5,64(%rsp) -+ vpshufb %xmm0,%xmm14,%xmm7 -+ vmovdqu %xmm6,48(%rsp) -+ -+ call _aesni_ctr32_6x -+ -+ vmovdqu (%r9),%xmm8 -+ leaq 32+32(%r9),%r9 -+ subq $12,%rdx -+ movq $192,%r10 -+ vpshufb %xmm0,%xmm8,%xmm8 -+ -+ call _aesni_ctr32_ghash_6x -+ vmovdqu 32(%rsp),%xmm7 -+ vmovdqu (%r11),%xmm0 -+ vmovdqu 0-32(%r9),%xmm3 -+ vpunpckhqdq %xmm7,%xmm7,%xmm1 -+ vmovdqu 32-32(%r9),%xmm15 -+ vmovups %xmm9,-96(%rsi) -+ vpshufb %xmm0,%xmm9,%xmm9 -+ vpxor %xmm7,%xmm1,%xmm1 -+ vmovups %xmm10,-80(%rsi) -+ vpshufb %xmm0,%xmm10,%xmm10 -+ vmovups %xmm11,-64(%rsi) -+ vpshufb %xmm0,%xmm11,%xmm11 -+ vmovups %xmm12,-48(%rsi) -+ vpshufb %xmm0,%xmm12,%xmm12 -+ vmovups %xmm13,-32(%rsi) -+ vpshufb %xmm0,%xmm13,%xmm13 -+ vmovups %xmm14,-16(%rsi) -+ vpshufb %xmm0,%xmm14,%xmm14 -+ vmovdqu %xmm9,16(%rsp) -+ vmovdqu 48(%rsp),%xmm6 -+ vmovdqu 16-32(%r9),%xmm0 -+ vpunpckhqdq %xmm6,%xmm6,%xmm2 -+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 -+ vpxor %xmm6,%xmm2,%xmm2 -+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 -+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 -+ -+ vmovdqu 64(%rsp),%xmm9 -+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 -+ vmovdqu 48-32(%r9),%xmm3 -+ vpxor %xmm5,%xmm4,%xmm4 -+ vpunpckhqdq %xmm9,%xmm9,%xmm5 -+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 -+ vpxor %xmm9,%xmm5,%xmm5 -+ vpxor %xmm7,%xmm6,%xmm6 -+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 -+ vmovdqu 80-32(%r9),%xmm15 -+ vpxor %xmm1,%xmm2,%xmm2 -+ -+ vmovdqu 80(%rsp),%xmm1 -+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 -+ vmovdqu 64-32(%r9),%xmm0 -+ vpxor %xmm4,%xmm7,%xmm7 -+ vpunpckhqdq %xmm1,%xmm1,%xmm4 -+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpxor %xmm6,%xmm9,%xmm9 -+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 -+ vpxor %xmm2,%xmm5,%xmm5 -+ -+ vmovdqu 96(%rsp),%xmm2 -+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 -+ vmovdqu 96-32(%r9),%xmm3 -+ vpxor %xmm7,%xmm6,%xmm6 -+ vpunpckhqdq %xmm2,%xmm2,%xmm7 -+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 -+ vpxor %xmm2,%xmm7,%xmm7 -+ vpxor %xmm9,%xmm1,%xmm1 -+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 
-+ vmovdqu 128-32(%r9),%xmm15 -+ vpxor %xmm5,%xmm4,%xmm4 -+ -+ vpxor 112(%rsp),%xmm8,%xmm8 -+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 -+ vmovdqu 112-32(%r9),%xmm0 -+ vpunpckhqdq %xmm8,%xmm8,%xmm9 -+ vpxor %xmm6,%xmm5,%xmm5 -+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 -+ vpxor %xmm8,%xmm9,%xmm9 -+ vpxor %xmm1,%xmm2,%xmm2 -+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 -+ vpxor %xmm4,%xmm7,%xmm4 -+ -+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 -+ vmovdqu 0-32(%r9),%xmm3 -+ vpunpckhqdq %xmm14,%xmm14,%xmm1 -+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 -+ vpxor %xmm14,%xmm1,%xmm1 -+ vpxor %xmm5,%xmm6,%xmm5 -+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 -+ vmovdqu 32-32(%r9),%xmm15 -+ vpxor %xmm2,%xmm8,%xmm7 -+ vpxor %xmm4,%xmm9,%xmm6 -+ -+ vmovdqu 16-32(%r9),%xmm0 -+ vpxor %xmm5,%xmm7,%xmm9 -+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 -+ vpxor %xmm9,%xmm6,%xmm6 -+ vpunpckhqdq %xmm13,%xmm13,%xmm2 -+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 -+ vpxor %xmm13,%xmm2,%xmm2 -+ vpslldq $8,%xmm6,%xmm9 -+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 -+ vpxor %xmm9,%xmm5,%xmm8 -+ vpsrldq $8,%xmm6,%xmm6 -+ vpxor %xmm6,%xmm7,%xmm7 -+ -+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 -+ vmovdqu 48-32(%r9),%xmm3 -+ vpxor %xmm4,%xmm5,%xmm5 -+ vpunpckhqdq %xmm12,%xmm12,%xmm9 -+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 -+ vpxor %xmm12,%xmm9,%xmm9 -+ vpxor %xmm14,%xmm13,%xmm13 -+ vpalignr $8,%xmm8,%xmm8,%xmm14 -+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 -+ vmovdqu 80-32(%r9),%xmm15 -+ vpxor %xmm1,%xmm2,%xmm2 -+ -+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 -+ vmovdqu 64-32(%r9),%xmm0 -+ vpxor %xmm5,%xmm4,%xmm4 -+ vpunpckhqdq %xmm11,%xmm11,%xmm1 -+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 -+ vpxor %xmm11,%xmm1,%xmm1 -+ vpxor %xmm13,%xmm12,%xmm12 -+ vxorps 16(%rsp),%xmm7,%xmm7 -+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 -+ vpxor %xmm2,%xmm9,%xmm9 -+ -+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 -+ vxorps %xmm14,%xmm8,%xmm8 -+ -+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 -+ vmovdqu 96-32(%r9),%xmm3 -+ vpxor %xmm4,%xmm5,%xmm5 -+ vpunpckhqdq %xmm10,%xmm10,%xmm2 -+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 -+ vpxor %xmm10,%xmm2,%xmm2 -+ vpalignr $8,%xmm8,%xmm8,%xmm14 -+ vpxor %xmm12,%xmm11,%xmm11 -+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 -+ vmovdqu 128-32(%r9),%xmm15 -+ vpxor %xmm9,%xmm1,%xmm1 -+ -+ vxorps %xmm7,%xmm14,%xmm14 -+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 -+ vxorps %xmm14,%xmm8,%xmm8 -+ -+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 -+ vmovdqu 112-32(%r9),%xmm0 -+ vpxor %xmm5,%xmm4,%xmm4 -+ vpunpckhqdq %xmm8,%xmm8,%xmm9 -+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 -+ vpxor %xmm8,%xmm9,%xmm9 -+ vpxor %xmm11,%xmm10,%xmm10 -+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 -+ vpxor %xmm1,%xmm2,%xmm2 -+ -+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 -+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 -+ vpxor %xmm4,%xmm5,%xmm5 -+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 -+ vpxor %xmm10,%xmm7,%xmm7 -+ vpxor %xmm2,%xmm6,%xmm6 -+ -+ vpxor %xmm5,%xmm7,%xmm4 -+ vpxor %xmm4,%xmm6,%xmm6 -+ vpslldq $8,%xmm6,%xmm1 -+ vmovdqu 16(%r11),%xmm3 -+ vpsrldq $8,%xmm6,%xmm6 -+ vpxor %xmm1,%xmm5,%xmm8 -+ vpxor %xmm6,%xmm7,%xmm7 -+ -+ vpalignr $8,%xmm8,%xmm8,%xmm2 -+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 -+ vpxor %xmm2,%xmm8,%xmm8 -+ -+ vpalignr $8,%xmm8,%xmm8,%xmm2 -+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 -+ vpxor %xmm7,%xmm2,%xmm2 -+ vpxor %xmm2,%xmm8,%xmm8 -+ vpshufb (%r11),%xmm8,%xmm8 -+ vmovdqu %xmm8,-64(%r9) -+ -+ vzeroupper -+ movq -48(%rax),%r15 -+.cfi_restore %r15 -+ movq -40(%rax),%r14 -+.cfi_restore %r14 -+ movq -32(%rax),%r13 -+.cfi_restore %r13 -+ movq -24(%rax),%r12 -+.cfi_restore %r12 -+ movq -16(%rax),%rbp -+.cfi_restore %rbp -+ movq -8(%rax),%rbx -+.cfi_restore %rbx -+ leaq (%rax),%rsp 
-+.cfi_def_cfa_register %rsp -+.Lgcm_enc_abort: -+ movq %r10,%rax -+ .byte 0xf3,0xc3 -+.cfi_endproc -+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt -+ -+/* Some utility routines */ -+ -+/* -+ * clear all fpu registers -+ * void clear_fpu_regs_avx(void); -+ */ -+.globl clear_fpu_regs_avx -+.type clear_fpu_regs_avx,@function -+.align 32 -+clear_fpu_regs_avx: -+ vzeroall -+ ret -+.size clear_fpu_regs_avx,.-clear_fpu_regs_avx -+ -+/* -+ * void gcm_xor_avx(const uint8_t *src, uint8_t *dst); -+ * -+ * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and -+ * stores the result at `dst'. The XOR is performed using FPU registers, -+ * so make sure FPU state is saved when running this in the kernel. -+ */ -+.globl gcm_xor_avx -+.type gcm_xor_avx,@function -+.align 32 -+gcm_xor_avx: -+ movdqu (%rdi), %xmm0 -+ movdqu (%rsi), %xmm1 -+ pxor %xmm1, %xmm0 -+ movdqu %xmm0, (%rsi) -+ ret -+.size gcm_xor_avx,.-gcm_xor_avx -+ -+/* -+ * Toggle a boolean_t value atomically and return the new value. -+ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); -+ */ -+.globl atomic_toggle_boolean_nv -+.type atomic_toggle_boolean_nv,@function -+.align 32 -+atomic_toggle_boolean_nv: -+ xorl %eax, %eax -+ lock -+ xorl $1, (%rdi) -+ jz 1f -+ movl $1, %eax -+1: -+ ret -+.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv -+ -+.align 64 -+.Lbswap_mask: -+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -+.Lpoly: -+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 -+.Lone_msb: -+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 -+.Ltwo_lsb: -+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -+.Lone_lsb: -+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -+.align 64 -+ -+/* Mark the stack non-executable. */ -+#if defined(__linux__) && defined(__ELF__) -+.section .note.GNU-stack,"",%progbits -+#endif -+ -+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ -Index: zfs-linux-0.8.3/module/icp/asm-x86_64/modes/ghash-x86_64.S -=================================================================== ---- /dev/null -+++ zfs-linux-0.8.3/module/icp/asm-x86_64/modes/ghash-x86_64.S -@@ -0,0 +1,714 @@ -+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. -+# -+# Licensed under the Apache License 2.0 (the "License"). You may not use -+# this file except in compliance with the License. You can obtain a copy -+# in the file LICENSE in the source distribution or at -+# https://www.openssl.org/source/license.html -+ -+# -+# ==================================================================== -+# Written by Andy Polyakov for the OpenSSL -+# project. The module is, however, dual licensed under OpenSSL and -+# CRYPTOGAMS licenses depending on where you obtain it. For further -+# details see http://www.openssl.org/~appro/cryptogams/. -+# ==================================================================== -+# -+# March, June 2010 -+# -+# The module implements "4-bit" GCM GHASH function and underlying -+# single multiplication operation in GF(2^128). "4-bit" means that -+# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH -+# function features so called "528B" variant utilizing additional -+# 256+16 bytes of per-key storage [+512 bytes shared table]. 
-+# Performance results are for this streamed GHASH subroutine and are
-+# expressed in cycles per processed byte, less is better:
-+#
-+#		gcc 3.4.x(*)	assembler
-+#
-+# P4		28.6		14.0		+100%
-+# Opteron	19.3		7.7		+150%
-+# Core2		17.8		8.1(**)		+120%
-+# Atom		31.6		16.8		+88%
-+# VIA Nano	21.8		10.1		+115%
-+#
-+# (*)	comparison is not completely fair, because C results are
-+#	for vanilla "256B" implementation, while assembler results
-+#	are for "528B";-)
-+# (**)	it's a mystery [to me] why the Core2 result is not the same as for
-+#	Opteron;
-+
-+# May 2010
-+#
-+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
-+# See ghash-x86.pl for background information and details about coding
-+# techniques.
-+#
-+# Special thanks to David Woodhouse for providing access to a
-+# Westmere-based system on behalf of Intel Open Source Technology Centre.
-+
-+# December 2012
-+#
-+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
-+# reduction_alg9, increase reduction aggregate factor to 4x. As for
-+# the latter, ghash-x86.pl discusses that it makes lesser sense to
-+# increase aggregate factor. Then why increase here? Critical path
-+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
-+# processing and reduction. "On top" of this we lay down aggregated
-+# multiplication operations, triplets of independent pclmulqdq's. As
-+# issue rate for pclmulqdq is limited, it makes lesser sense to
-+# aggregate more multiplications than it takes to perform remaining
-+# non-multiplication operations. 2x is near-optimal coefficient for
-+# contemporary Intel CPUs (therefore modest improvement coefficient),
-+# but not for Bulldozer. Latter is because logical SIMD operations
-+# are twice as slow in comparison to Intel, so that critical path is
-+# longer. A CPU with higher pclmulqdq issue rate would also benefit
-+# from higher aggregate factor...
-+#
-+# Westmere	1.78(+13%)
-+# Sandy Bridge	1.80(+8%)
-+# Ivy Bridge	1.80(+7%)
-+# Haswell	0.55(+93%) (if system doesn't support AVX)
-+# Broadwell	0.45(+110%)(if system doesn't support AVX)
-+# Skylake	0.44(+110%)(if system doesn't support AVX)
-+# Bulldozer	1.49(+27%)
-+# Silvermont	2.88(+13%)
-+# Knights L	2.12(-)    (if system doesn't support AVX)
-+# Goldmont	1.08(+24%)
-+
-+# March 2013
-+#
-+# ... 8x aggregate factor AVX code path is using reduction algorithm
-+# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
-+# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
-+# sub-optimally in comparison to above mentioned version. But thanks
-+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
-+# it performs in 0.41 cycles per byte on Haswell processor, in
-+# 0.29 on Broadwell, and in 0.36 on Skylake.
-+#
-+# Knights Landing achieves 1.09 cpb.
-+#
-+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
-+
-+# Generated once from
-+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl
-+# and modified for ICP. Modifications are kept to a bare minimum to ease later
-+# upstream merges.
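
[For readers following the patch: the routines below accelerate the GHASH multiply in GF(2^128). As a point of reference, here is a minimal, unoptimized C sketch of that multiply, using the bit-at-a-time shift-and-xor algorithm from NIST SP 800-38D; the gf128_t type and the function names are illustrative only and are not part of the patch or of ICP.]

    #include <stdint.h>
    #include <stdio.h>

    /* One 128-bit GHASH element, stored as two big-endian 64-bit halves. */
    typedef struct { uint64_t hi, lo; } gf128_t;

    /*
     * z = x * y in GF(2^128) with the GCM polynomial
     * x^128 + x^7 + x^2 + x + 1 (R = 0xe1 followed by 15 zero bytes),
     * processing the bits of x most significant first. One shift and one
     * conditional xor per bit is exactly the work the 4-bit tables and
     * the pclmulqdq paths in this file eliminate.
     */
    static gf128_t
    gf128_mul(gf128_t x, gf128_t y)
    {
        gf128_t z = { 0, 0 };

        for (int i = 0; i < 128; i++) {
            /* Test bit i of x, MSB first. */
            uint64_t bit = (i < 64) ? (x.hi >> (63 - i)) & 1 :
                (x.lo >> (127 - i)) & 1;
            if (bit) {
                z.hi ^= y.hi;
                z.lo ^= y.lo;
            }
            /* y = y >> 1, reducing by R on carry-out. */
            uint64_t carry = y.lo & 1;
            y.lo = (y.lo >> 1) | (y.hi << 63);
            y.hi >>= 1;
            if (carry)
                y.hi ^= 0xe100000000000000ULL;
        }
        return (z);
    }

    /* GHASH chaining step: Y = (Y ^ X) * H, applied once per 16-byte block. */
    static gf128_t
    ghash_update(gf128_t Y, gf128_t X, gf128_t H)
    {
        Y.hi ^= X.hi;
        Y.lo ^= X.lo;
        return (gf128_mul(Y, H));
    }

    int
    main(void)
    {
        /* H for the all-zero AES-128 key (NIST GCM test case 1). */
        gf128_t H = { 0x66e94bd4ef8a2c3bULL, 0x884cfa59ca342b2eULL };
        gf128_t Y = { 0, 0 }, X = { 0x0123456789abcdefULL, 0 };

        Y = ghash_update(Y, X, H);
        printf("%016llx%016llx\n", (unsigned long long)Y.hi,
            (unsigned long long)Y.lo);
        return (0);
    }

This bit loop is also why gcm_init_htab_avx() exists: it precomputes multiples of H into the Htable once per key, so that gcm_ghash_avx() below can fold several blocks per reduction, matching the aggregate-factor discussion in the notes above.
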
-+ -+#if defined(__x86_64__) && defined(HAVE_AVX) && \ -+ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) -+ -+.text -+ -+.globl gcm_gmult_clmul -+.type gcm_gmult_clmul,@function -+.align 16 -+gcm_gmult_clmul: -+.cfi_startproc -+.L_gmult_clmul: -+ movdqu (%rdi),%xmm0 -+ movdqa .Lbswap_mask(%rip),%xmm5 -+ movdqu (%rsi),%xmm2 -+ movdqu 32(%rsi),%xmm4 -+.byte 102,15,56,0,197 -+ movdqa %xmm0,%xmm1 -+ pshufd $78,%xmm0,%xmm3 -+ pxor %xmm0,%xmm3 -+.byte 102,15,58,68,194,0 -+.byte 102,15,58,68,202,17 -+.byte 102,15,58,68,220,0 -+ pxor %xmm0,%xmm3 -+ pxor %xmm1,%xmm3 -+ -+ movdqa %xmm3,%xmm4 -+ psrldq $8,%xmm3 -+ pslldq $8,%xmm4 -+ pxor %xmm3,%xmm1 -+ pxor %xmm4,%xmm0 -+ -+ movdqa %xmm0,%xmm4 -+ movdqa %xmm0,%xmm3 -+ psllq $5,%xmm0 -+ pxor %xmm0,%xmm3 -+ psllq $1,%xmm0 -+ pxor %xmm3,%xmm0 -+ psllq $57,%xmm0 -+ movdqa %xmm0,%xmm3 -+ pslldq $8,%xmm0 -+ psrldq $8,%xmm3 -+ pxor %xmm4,%xmm0 -+ pxor %xmm3,%xmm1 -+ -+ -+ movdqa %xmm0,%xmm4 -+ psrlq $1,%xmm0 -+ pxor %xmm4,%xmm1 -+ pxor %xmm0,%xmm4 -+ psrlq $5,%xmm0 -+ pxor %xmm4,%xmm0 -+ psrlq $1,%xmm0 -+ pxor %xmm1,%xmm0 -+.byte 102,15,56,0,197 -+ movdqu %xmm0,(%rdi) -+ .byte 0xf3,0xc3 -+.cfi_endproc -+.size gcm_gmult_clmul,.-gcm_gmult_clmul -+ -+.globl gcm_init_htab_avx -+.type gcm_init_htab_avx,@function -+.align 32 -+gcm_init_htab_avx: -+.cfi_startproc -+ vzeroupper -+ -+ vmovdqu (%rsi),%xmm2 -+ // KCF/ICP stores H in network byte order with the hi qword first -+ // so we need to swap all bytes, not the 2 qwords. -+ vmovdqu .Lbswap_mask(%rip),%xmm4 -+ vpshufb %xmm4,%xmm2,%xmm2 -+ -+ -+ vpshufd $255,%xmm2,%xmm4 -+ vpsrlq $63,%xmm2,%xmm3 -+ vpsllq $1,%xmm2,%xmm2 -+ vpxor %xmm5,%xmm5,%xmm5 -+ vpcmpgtd %xmm4,%xmm5,%xmm5 -+ vpslldq $8,%xmm3,%xmm3 -+ vpor %xmm3,%xmm2,%xmm2 -+ -+ -+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 -+ vpxor %xmm5,%xmm2,%xmm2 -+ -+ vpunpckhqdq %xmm2,%xmm2,%xmm6 -+ vmovdqa %xmm2,%xmm0 -+ vpxor %xmm2,%xmm6,%xmm6 -+ movq $4,%r10 -+ jmp .Linit_start_avx -+.align 32 -+.Linit_loop_avx: -+ vpalignr $8,%xmm3,%xmm4,%xmm5 -+ vmovdqu %xmm5,-16(%rdi) -+ vpunpckhqdq %xmm0,%xmm0,%xmm3 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 -+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 -+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 -+ vpxor %xmm0,%xmm1,%xmm4 -+ vpxor %xmm4,%xmm3,%xmm3 -+ -+ vpslldq $8,%xmm3,%xmm4 -+ vpsrldq $8,%xmm3,%xmm3 -+ vpxor %xmm4,%xmm0,%xmm0 -+ vpxor %xmm3,%xmm1,%xmm1 -+ vpsllq $57,%xmm0,%xmm3 -+ vpsllq $62,%xmm0,%xmm4 -+ vpxor %xmm3,%xmm4,%xmm4 -+ vpsllq $63,%xmm0,%xmm3 -+ vpxor %xmm3,%xmm4,%xmm4 -+ vpslldq $8,%xmm4,%xmm3 -+ vpsrldq $8,%xmm4,%xmm4 -+ vpxor %xmm3,%xmm0,%xmm0 -+ vpxor %xmm4,%xmm1,%xmm1 -+ -+ vpsrlq $1,%xmm0,%xmm4 -+ vpxor %xmm0,%xmm1,%xmm1 -+ vpxor %xmm4,%xmm0,%xmm0 -+ vpsrlq $5,%xmm4,%xmm4 -+ vpxor %xmm4,%xmm0,%xmm0 -+ vpsrlq $1,%xmm0,%xmm0 -+ vpxor %xmm1,%xmm0,%xmm0 -+.Linit_start_avx: -+ vmovdqa %xmm0,%xmm5 -+ vpunpckhqdq %xmm0,%xmm0,%xmm3 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 -+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 -+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 -+ vpxor %xmm0,%xmm1,%xmm4 -+ vpxor %xmm4,%xmm3,%xmm3 -+ -+ vpslldq $8,%xmm3,%xmm4 -+ vpsrldq $8,%xmm3,%xmm3 -+ vpxor %xmm4,%xmm0,%xmm0 -+ vpxor %xmm3,%xmm1,%xmm1 -+ vpsllq $57,%xmm0,%xmm3 -+ vpsllq $62,%xmm0,%xmm4 -+ vpxor %xmm3,%xmm4,%xmm4 -+ vpsllq $63,%xmm0,%xmm3 -+ vpxor %xmm3,%xmm4,%xmm4 -+ vpslldq $8,%xmm4,%xmm3 -+ vpsrldq $8,%xmm4,%xmm4 -+ vpxor %xmm3,%xmm0,%xmm0 -+ vpxor %xmm4,%xmm1,%xmm1 -+ -+ vpsrlq $1,%xmm0,%xmm4 -+ vpxor %xmm0,%xmm1,%xmm1 -+ vpxor %xmm4,%xmm0,%xmm0 -+ vpsrlq $5,%xmm4,%xmm4 -+ vpxor %xmm4,%xmm0,%xmm0 -+ vpsrlq 
$1,%xmm0,%xmm0 -+ vpxor %xmm1,%xmm0,%xmm0 -+ vpshufd $78,%xmm5,%xmm3 -+ vpshufd $78,%xmm0,%xmm4 -+ vpxor %xmm5,%xmm3,%xmm3 -+ vmovdqu %xmm5,0(%rdi) -+ vpxor %xmm0,%xmm4,%xmm4 -+ vmovdqu %xmm0,16(%rdi) -+ leaq 48(%rdi),%rdi -+ subq $1,%r10 -+ jnz .Linit_loop_avx -+ -+ vpalignr $8,%xmm4,%xmm3,%xmm5 -+ vmovdqu %xmm5,-16(%rdi) -+ -+ vzeroupper -+ .byte 0xf3,0xc3 -+.cfi_endproc -+.size gcm_init_htab_avx,.-gcm_init_htab_avx -+ -+.globl gcm_gmult_avx -+.type gcm_gmult_avx,@function -+.align 32 -+gcm_gmult_avx: -+.cfi_startproc -+ jmp .L_gmult_clmul -+.cfi_endproc -+.size gcm_gmult_avx,.-gcm_gmult_avx -+.globl gcm_ghash_avx -+.type gcm_ghash_avx,@function -+.align 32 -+gcm_ghash_avx: -+.cfi_startproc -+ vzeroupper -+ -+ vmovdqu (%rdi),%xmm10 -+ leaq .L0x1c2_polynomial(%rip),%r10 -+ leaq 64(%rsi),%rsi -+ vmovdqu .Lbswap_mask(%rip),%xmm13 -+ vpshufb %xmm13,%xmm10,%xmm10 -+ cmpq $0x80,%rcx -+ jb .Lshort_avx -+ subq $0x80,%rcx -+ -+ vmovdqu 112(%rdx),%xmm14 -+ vmovdqu 0-64(%rsi),%xmm6 -+ vpshufb %xmm13,%xmm14,%xmm14 -+ vmovdqu 32-64(%rsi),%xmm7 -+ -+ vpunpckhqdq %xmm14,%xmm14,%xmm9 -+ vmovdqu 96(%rdx),%xmm15 -+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 -+ vpxor %xmm14,%xmm9,%xmm9 -+ vpshufb %xmm13,%xmm15,%xmm15 -+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 -+ vmovdqu 16-64(%rsi),%xmm6 -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vmovdqu 80(%rdx),%xmm14 -+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 -+ vpxor %xmm15,%xmm8,%xmm8 -+ -+ vpshufb %xmm13,%xmm14,%xmm14 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 -+ vpunpckhqdq %xmm14,%xmm14,%xmm9 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 -+ vmovdqu 48-64(%rsi),%xmm6 -+ vpxor %xmm14,%xmm9,%xmm9 -+ vmovdqu 64(%rdx),%xmm15 -+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 -+ vmovdqu 80-64(%rsi),%xmm7 -+ -+ vpshufb %xmm13,%xmm15,%xmm15 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 -+ vmovdqu 64-64(%rsi),%xmm6 -+ vpxor %xmm2,%xmm5,%xmm5 -+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 -+ vpxor %xmm15,%xmm8,%xmm8 -+ -+ vmovdqu 48(%rdx),%xmm14 -+ vpxor %xmm3,%xmm0,%xmm0 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 -+ vpxor %xmm4,%xmm1,%xmm1 -+ vpshufb %xmm13,%xmm14,%xmm14 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 -+ vmovdqu 96-64(%rsi),%xmm6 -+ vpxor %xmm5,%xmm2,%xmm2 -+ vpunpckhqdq %xmm14,%xmm14,%xmm9 -+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 -+ vmovdqu 128-64(%rsi),%xmm7 -+ vpxor %xmm14,%xmm9,%xmm9 -+ -+ vmovdqu 32(%rdx),%xmm15 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpshufb %xmm13,%xmm15,%xmm15 -+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 -+ vmovdqu 112-64(%rsi),%xmm6 -+ vpxor %xmm2,%xmm5,%xmm5 -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 -+ vpxor %xmm15,%xmm8,%xmm8 -+ -+ vmovdqu 16(%rdx),%xmm14 -+ vpxor %xmm3,%xmm0,%xmm0 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 -+ vpxor %xmm4,%xmm1,%xmm1 -+ vpshufb %xmm13,%xmm14,%xmm14 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 -+ vmovdqu 144-64(%rsi),%xmm6 -+ vpxor %xmm5,%xmm2,%xmm2 -+ vpunpckhqdq %xmm14,%xmm14,%xmm9 -+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 -+ vmovdqu 176-64(%rsi),%xmm7 -+ vpxor %xmm14,%xmm9,%xmm9 -+ -+ vmovdqu (%rdx),%xmm15 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpshufb %xmm13,%xmm15,%xmm15 -+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 -+ vmovdqu 160-64(%rsi),%xmm6 -+ vpxor %xmm2,%xmm5,%xmm5 -+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 -+ -+ leaq 128(%rdx),%rdx -+ cmpq $0x80,%rcx -+ jb .Ltail_avx -+ -+ vpxor %xmm10,%xmm15,%xmm15 -+ subq $0x80,%rcx -+ jmp 
.Loop8x_avx -+ -+.align 32 -+.Loop8x_avx: -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vmovdqu 112(%rdx),%xmm14 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpxor %xmm15,%xmm8,%xmm8 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 -+ vpshufb %xmm13,%xmm14,%xmm14 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 -+ vmovdqu 0-64(%rsi),%xmm6 -+ vpunpckhqdq %xmm14,%xmm14,%xmm9 -+ vpxor %xmm2,%xmm5,%xmm5 -+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 -+ vmovdqu 32-64(%rsi),%xmm7 -+ vpxor %xmm14,%xmm9,%xmm9 -+ -+ vmovdqu 96(%rdx),%xmm15 -+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 -+ vpxor %xmm3,%xmm10,%xmm10 -+ vpshufb %xmm13,%xmm15,%xmm15 -+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 -+ vxorps %xmm4,%xmm11,%xmm11 -+ vmovdqu 16-64(%rsi),%xmm6 -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 -+ vpxor %xmm5,%xmm12,%xmm12 -+ vxorps %xmm15,%xmm8,%xmm8 -+ -+ vmovdqu 80(%rdx),%xmm14 -+ vpxor %xmm10,%xmm12,%xmm12 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 -+ vpxor %xmm11,%xmm12,%xmm12 -+ vpslldq $8,%xmm12,%xmm9 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 -+ vpsrldq $8,%xmm12,%xmm12 -+ vpxor %xmm9,%xmm10,%xmm10 -+ vmovdqu 48-64(%rsi),%xmm6 -+ vpshufb %xmm13,%xmm14,%xmm14 -+ vxorps %xmm12,%xmm11,%xmm11 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpunpckhqdq %xmm14,%xmm14,%xmm9 -+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 -+ vmovdqu 80-64(%rsi),%xmm7 -+ vpxor %xmm14,%xmm9,%xmm9 -+ vpxor %xmm2,%xmm5,%xmm5 -+ -+ vmovdqu 64(%rdx),%xmm15 -+ vpalignr $8,%xmm10,%xmm10,%xmm12 -+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 -+ vpshufb %xmm13,%xmm15,%xmm15 -+ vpxor %xmm3,%xmm0,%xmm0 -+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 -+ vmovdqu 64-64(%rsi),%xmm6 -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vpxor %xmm4,%xmm1,%xmm1 -+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 -+ vxorps %xmm15,%xmm8,%xmm8 -+ vpxor %xmm5,%xmm2,%xmm2 -+ -+ vmovdqu 48(%rdx),%xmm14 -+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 -+ vpshufb %xmm13,%xmm14,%xmm14 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 -+ vmovdqu 96-64(%rsi),%xmm6 -+ vpunpckhqdq %xmm14,%xmm14,%xmm9 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 -+ vmovdqu 128-64(%rsi),%xmm7 -+ vpxor %xmm14,%xmm9,%xmm9 -+ vpxor %xmm2,%xmm5,%xmm5 -+ -+ vmovdqu 32(%rdx),%xmm15 -+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 -+ vpshufb %xmm13,%xmm15,%xmm15 -+ vpxor %xmm3,%xmm0,%xmm0 -+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 -+ vmovdqu 112-64(%rsi),%xmm6 -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vpxor %xmm4,%xmm1,%xmm1 -+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 -+ vpxor %xmm15,%xmm8,%xmm8 -+ vpxor %xmm5,%xmm2,%xmm2 -+ vxorps %xmm12,%xmm10,%xmm10 -+ -+ vmovdqu 16(%rdx),%xmm14 -+ vpalignr $8,%xmm10,%xmm10,%xmm12 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 -+ vpshufb %xmm13,%xmm14,%xmm14 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 -+ vmovdqu 144-64(%rsi),%xmm6 -+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 -+ vxorps %xmm11,%xmm12,%xmm12 -+ vpunpckhqdq %xmm14,%xmm14,%xmm9 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 -+ vmovdqu 176-64(%rsi),%xmm7 -+ vpxor %xmm14,%xmm9,%xmm9 -+ vpxor %xmm2,%xmm5,%xmm5 -+ -+ vmovdqu (%rdx),%xmm15 -+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 -+ vpshufb %xmm13,%xmm15,%xmm15 -+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 -+ vmovdqu 160-64(%rsi),%xmm6 -+ vpxor %xmm12,%xmm15,%xmm15 -+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 -+ vpxor %xmm10,%xmm15,%xmm15 -+ -+ leaq 128(%rdx),%rdx -+ subq $0x80,%rcx -+ jnc .Loop8x_avx -+ -+ addq $0x80,%rcx -+ jmp .Ltail_no_xor_avx -+ -+.align 32 -+.Lshort_avx: -+ vmovdqu -16(%rdx,%rcx,1),%xmm14 -+ leaq (%rdx,%rcx,1),%rdx -+ 
vmovdqu 0-64(%rsi),%xmm6 -+ vmovdqu 32-64(%rsi),%xmm7 -+ vpshufb %xmm13,%xmm14,%xmm15 -+ -+ vmovdqa %xmm0,%xmm3 -+ vmovdqa %xmm1,%xmm4 -+ vmovdqa %xmm2,%xmm5 -+ subq $0x10,%rcx -+ jz .Ltail_avx -+ -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 -+ vpxor %xmm15,%xmm8,%xmm8 -+ vmovdqu -32(%rdx),%xmm14 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 -+ vmovdqu 16-64(%rsi),%xmm6 -+ vpshufb %xmm13,%xmm14,%xmm15 -+ vpxor %xmm2,%xmm5,%xmm5 -+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 -+ vpsrldq $8,%xmm7,%xmm7 -+ subq $0x10,%rcx -+ jz .Ltail_avx -+ -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 -+ vpxor %xmm15,%xmm8,%xmm8 -+ vmovdqu -48(%rdx),%xmm14 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 -+ vmovdqu 48-64(%rsi),%xmm6 -+ vpshufb %xmm13,%xmm14,%xmm15 -+ vpxor %xmm2,%xmm5,%xmm5 -+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 -+ vmovdqu 80-64(%rsi),%xmm7 -+ subq $0x10,%rcx -+ jz .Ltail_avx -+ -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 -+ vpxor %xmm15,%xmm8,%xmm8 -+ vmovdqu -64(%rdx),%xmm14 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 -+ vmovdqu 64-64(%rsi),%xmm6 -+ vpshufb %xmm13,%xmm14,%xmm15 -+ vpxor %xmm2,%xmm5,%xmm5 -+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 -+ vpsrldq $8,%xmm7,%xmm7 -+ subq $0x10,%rcx -+ jz .Ltail_avx -+ -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 -+ vpxor %xmm15,%xmm8,%xmm8 -+ vmovdqu -80(%rdx),%xmm14 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 -+ vmovdqu 96-64(%rsi),%xmm6 -+ vpshufb %xmm13,%xmm14,%xmm15 -+ vpxor %xmm2,%xmm5,%xmm5 -+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 -+ vmovdqu 128-64(%rsi),%xmm7 -+ subq $0x10,%rcx -+ jz .Ltail_avx -+ -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 -+ vpxor %xmm15,%xmm8,%xmm8 -+ vmovdqu -96(%rdx),%xmm14 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 -+ vmovdqu 112-64(%rsi),%xmm6 -+ vpshufb %xmm13,%xmm14,%xmm15 -+ vpxor %xmm2,%xmm5,%xmm5 -+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 -+ vpsrldq $8,%xmm7,%xmm7 -+ subq $0x10,%rcx -+ jz .Ltail_avx -+ -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 -+ vpxor %xmm15,%xmm8,%xmm8 -+ vmovdqu -112(%rdx),%xmm14 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 -+ vmovdqu 144-64(%rsi),%xmm6 -+ vpshufb %xmm13,%xmm14,%xmm15 -+ vpxor %xmm2,%xmm5,%xmm5 -+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 -+ vmovq 184-64(%rsi),%xmm7 -+ subq $0x10,%rcx -+ jmp .Ltail_avx -+ -+.align 32 -+.Ltail_avx: -+ vpxor %xmm10,%xmm15,%xmm15 -+.Ltail_no_xor_avx: -+ vpunpckhqdq %xmm15,%xmm15,%xmm8 -+ vpxor %xmm0,%xmm3,%xmm3 -+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 -+ vpxor %xmm15,%xmm8,%xmm8 -+ vpxor %xmm1,%xmm4,%xmm4 -+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 -+ vpxor %xmm2,%xmm5,%xmm5 -+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 -+ -+ vmovdqu (%r10),%xmm12 -+ -+ vpxor %xmm0,%xmm3,%xmm10 -+ vpxor %xmm1,%xmm4,%xmm11 -+ vpxor %xmm2,%xmm5,%xmm5 -+ -+ vpxor %xmm10,%xmm5,%xmm5 -+ vpxor %xmm11,%xmm5,%xmm5 -+ vpslldq $8,%xmm5,%xmm9 -+ vpsrldq $8,%xmm5,%xmm5 -+ vpxor %xmm9,%xmm10,%xmm10 -+ vpxor %xmm5,%xmm11,%xmm11 -+ -+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 -+ vpalignr $8,%xmm10,%xmm10,%xmm10 -+ vpxor %xmm9,%xmm10,%xmm10 -+ -+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 -+ vpalignr $8,%xmm10,%xmm10,%xmm10 -+ vpxor %xmm11,%xmm10,%xmm10 -+ vpxor 
%xmm9,%xmm10,%xmm10 -+ -+ cmpq $0,%rcx -+ jne .Lshort_avx -+ -+ vpshufb %xmm13,%xmm10,%xmm10 -+ vmovdqu %xmm10,(%rdi) -+ vzeroupper -+ .byte 0xf3,0xc3 -+.cfi_endproc -+.size gcm_ghash_avx,.-gcm_ghash_avx -+.align 64 -+.Lbswap_mask: -+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -+.L0x1c2_polynomial: -+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 -+.L7_mask: -+.long 7,0,7,0 -+.L7_mask_poly: -+.long 7,0,450,0 -+.align 64 -+.type .Lrem_4bit,@object -+.Lrem_4bit: -+.long 0,0,0,471859200,0,943718400,0,610271232 -+.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 -+.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 -+.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 -+.type .Lrem_8bit,@object -+.Lrem_8bit: -+.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E -+.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E -+.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E -+.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E -+.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E -+.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E -+.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E -+.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E -+.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE -+.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE -+.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE -+.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE -+.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E -+.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E -+.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE -+.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE -+.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E -+.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E -+.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E -+.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E -+.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E -+.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E -+.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E -+.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E -+.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE -+.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE -+.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE -+.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE -+.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E -+.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E -+.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE -+.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE -+ -+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -+.align 64 -+ -+/* Mark the stack non-executable. */ -+#if defined(__linux__) && defined(__ELF__) -+.section .note.GNU-stack,"",%progbits -+#endif -+ -+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... 
*/ -Index: zfs-linux-0.8.3/module/icp/include/aes/aes_impl.h -=================================================================== ---- zfs-linux-0.8.3.orig/module/icp/include/aes/aes_impl.h -+++ zfs-linux-0.8.3/module/icp/include/aes/aes_impl.h -@@ -107,6 +107,11 @@ typedef union { - } aes_ks_t; - - typedef struct aes_impl_ops aes_impl_ops_t; -+ -+/* -+ * The absolute offset of the encr_ks (0) and the nr (504) fields are hard -+ * coded in aesni-gcm-x86_64, so please don't change (or adjust accordingly). -+ */ - typedef struct aes_key aes_key_t; - struct aes_key { - aes_ks_t encr_ks; /* encryption key schedule */ -Index: zfs-linux-0.8.3/module/icp/include/modes/modes.h -=================================================================== ---- zfs-linux-0.8.3.orig/module/icp/include/modes/modes.h -+++ zfs-linux-0.8.3/module/icp/include/modes/modes.h -@@ -34,6 +34,16 @@ extern "C" { - #include - #include - -+/* -+ * Does the build chain support all instructions needed for the GCM assembler -+ * routines. AVX support should imply AES-NI and PCLMULQDQ, but make sure -+ * anyhow. -+ */ -+#if defined(__x86_64__) && defined(HAVE_AVX) && \ -+ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) && defined(HAVE_MOVBE) -+#define CAN_USE_GCM_ASM -+#endif -+ - #define ECB_MODE 0x00000002 - #define CBC_MODE 0x00000004 - #define CTR_MODE 0x00000008 -@@ -189,13 +199,17 @@ typedef struct ccm_ctx { - * - * gcm_H: Subkey. - * -+ * gcm_Htable: Pre-computed and pre-shifted H, H^2, ... H^6 for the -+ * Karatsuba Algorithm in host byte order. -+ * - * gcm_J0: Pre-counter block generated from the IV. - * - * gcm_len_a_len_c: 64-bit representations of the bit lengths of - * AAD and ciphertext. - * -- * gcm_kmflag: Current value of kmflag. Used only for allocating -- * the plaintext buffer during decryption. -+ * gcm_kmflag: Current value of kmflag. Used for allocating -+ * the plaintext buffer during decryption and a -+ * gcm_avx_chunk_size'd buffer for avx enabled encryption. - */ - typedef struct gcm_ctx { - struct common_ctx gcm_common; -@@ -203,12 +217,23 @@ typedef struct gcm_ctx { - size_t gcm_processed_data_len; - size_t gcm_pt_buf_len; - uint32_t gcm_tmp[4]; -+ /* -+ * The relative positions of gcm_ghash, gcm_H and pre-computed -+ * gcm_Htable are hard coded in aesni-gcm-x86_64.S and ghash-x86_64.S, -+ * so please don't change (or adjust accordingly). 
-+ */ - uint64_t gcm_ghash[2]; - uint64_t gcm_H[2]; -+#ifdef CAN_USE_GCM_ASM -+ uint64_t gcm_Htable[12][2]; -+#endif - uint64_t gcm_J0[2]; - uint64_t gcm_len_a_len_c[2]; - uint8_t *gcm_pt_buf; - int gcm_kmflag; -+#ifdef CAN_USE_GCM_ASM -+ boolean_t gcm_use_avx; -+#endif - } gcm_ctx_t; - - #define gcm_keysched gcm_common.cc_keysched -Index: zfs-linux-0.8.3/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh -=================================================================== ---- zfs-linux-0.8.3.orig/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh -+++ zfs-linux-0.8.3/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh -@@ -53,7 +53,7 @@ set -A ENCRYPTION_ALGS \ - "encryption=aes-256-gcm" - - set -A ENCRYPTION_PROPS \ -- "encryption=aes-256-ccm" \ -+ "encryption=aes-256-gcm" \ - "encryption=aes-128-ccm" \ - "encryption=aes-192-ccm" \ - "encryption=aes-256-ccm" \ -Index: zfs-linux-0.8.3/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh -=================================================================== ---- zfs-linux-0.8.3.orig/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh -+++ zfs-linux-0.8.3/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh -@@ -48,7 +48,7 @@ set -A ENCRYPTION_ALGS "encryption=on" \ - "encryption=aes-192-gcm" \ - "encryption=aes-256-gcm" - --set -A ENCRYPTION_PROPS "encryption=aes-256-ccm" \ -+set -A ENCRYPTION_PROPS "encryption=aes-256-gcm" \ - "encryption=aes-128-ccm" \ - "encryption=aes-192-ccm" \ - "encryption=aes-256-ccm" \ -Index: zfs-linux-0.8.3/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh -=================================================================== ---- zfs-linux-0.8.3.orig/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh -+++ zfs-linux-0.8.3/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh -@@ -124,7 +124,7 @@ ds=$TESTPOOL/recv - log_must eval "zfs send $snap > $sendfile" - log_must eval "zfs recv -o encryption=on -o keyformat=passphrase" \ - "-o keylocation=file://$keyfile $ds < $sendfile" --log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" -+log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" - log_must test "$(get_prop 'encryptionroot' $ds)" == "$ds" - log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" - log_must test "$(get_prop 'keylocation' $ds)" == "file://$keyfile" -@@ -140,7 +140,7 @@ ds=$TESTPOOL/recv - log_must eval "zfs send -p $snap > $sendfile" - log_must eval "zfs recv -o encryption=on -o keyformat=passphrase" \ - "-o keylocation=file://$keyfile $ds < $sendfile" --log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" -+log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" - log_must test "$(get_prop 'encryptionroot' $ds)" == "$ds" - log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" - log_must test "$(get_prop 'keylocation' $ds)" == "file://$keyfile" -@@ -158,7 +158,7 @@ ds=$TESTPOOL/recv - log_must eval "zfs send -R $snap > $sendfile" - log_must eval "zfs recv -o encryption=on -o keyformat=passphrase" \ - "-o keylocation=file://$keyfile $ds < $sendfile" --log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" -+log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" - log_must test "$(get_prop 'encryptionroot' $ds)" == "$ds" - log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" - log_must test "$(get_prop 'keylocation' 
$ds)" == "file://$keyfile" -@@ -174,7 +174,7 @@ ds=$TESTPOOL/crypt/recv - log_must eval "zfs send -p $snap > $sendfile" - log_must eval "zfs recv -x encryption $ds < $sendfile" - log_must test "$(get_prop 'encryptionroot' $ds)" == "$TESTPOOL/crypt" --log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" -+log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" - log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" - log_must test "$(get_prop 'mounted' $ds)" == "yes" - recv_cksum=$(md5digest /$ds/$TESTFILE0) -@@ -188,7 +188,7 @@ ds=$TESTPOOL/crypt/recv - log_must eval "zfs send -R $snap > $sendfile" - log_must eval "zfs recv -x encryption $ds < $sendfile" - log_must test "$(get_prop 'encryptionroot' $ds)" == "$TESTPOOL/crypt" --log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" -+log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" - log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" - log_must test "$(get_prop 'mounted' $ds)" == "yes" - recv_cksum=$(md5digest /$ds/$TESTFILE0) -@@ -202,7 +202,7 @@ ds=$TESTPOOL/crypt/recv - log_must eval "zfs send -R $snap2 > $sendfile" - log_must eval "zfs recv -x encryption $ds < $sendfile" - log_must test "$(get_prop 'encryptionroot' $ds)" == "$TESTPOOL/crypt" --log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" -+log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" - log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" - log_must test "$(get_prop 'mounted' $ds)" == "yes" - recv_cksum=$(md5digest /$ds/$TESTFILE0) diff '--exclude=.git' -Npaur ubuntu/zfs-linux/debian/patches/overlay.patch zfs-linux/debian/patches/overlay.patch --- ubuntu/zfs-linux/debian/patches/overlay.patch 1969-12-31 19:00:00.000000000 -0500 +++ zfs-linux/debian/patches/overlay.patch 2020-05-13 22:44:28.617584274 -0400 @@ -0,0 +1,14 @@ + +diff --git a/etc/default/zfs.in b/etc/default/zfs.in +index ce719734c0c..9439954b8ac 100644 +--- a/etc/default/zfs.in ++++ b/etc/default/zfs.in +@@ -66,7 +66,7 @@ VERBOSE_MOUNT='no' + # Should we allow overlay mounts? + # This is standard in Linux, but not ZFS which comes from Solaris where this + # is not allowed). +-DO_OVERLAY_MOUNTS='no' ++DO_OVERLAY_MOUNTS='yes' + + # Any additional option to the 'zfs import' commandline? + # Include '-o' for each option wanted. diff '--exclude=.git' -Npaur ubuntu/zfs-linux/debian/patches/pr10163.patch zfs-linux/debian/patches/pr10163.patch --- ubuntu/zfs-linux/debian/patches/pr10163.patch 1969-12-31 19:00:00.000000000 -0500 +++ zfs-linux/debian/patches/pr10163.patch 2020-05-13 14:42:37.000000000 -0400 @@ -0,0 +1,324 @@ +From 1335fc698150e42a21df5681a8b8bbbf6102b8fc Mon Sep 17 00:00:00 2001 +From: Matthew Ahrens +Date: Fri, 27 Mar 2020 10:45:25 -0700 +Subject: [PATCH] Improve ZVOL sync write performance by using a taskq + +== Summary == + +Prior to this change, sync writes to a zvol are processed serially. +This commit makes zvols process concurrently outstanding sync writes in +parallel, similar to how reads and async writes are already handled. +The result is that the throughput of sync writes is tripled. + +== Background == + +When a write comes in for a zvol (e.g. over iscsi), it is processed by +calling `zvol_request()` to initiate the operation. ZFS is expected to +later call `BIO_END_IO()` when the operation completes (possibly from a +different thread). There are a limited number of threads that are +available to call `zvol_request()` - one one per iscsi client (unless +using MC/S). 
Therefore, to ensure good performance, the latency of +`zvol_request()` is important, so that many i/o operations to the zvol +can be processed concurrently. In other words, if the client has +multiple outstanding requests to the zvol, the zvol should have multiple +outstanding requests to the storage hardware (i.e. issue multiple +concurrent `zio_t`'s). + +For reads and async writes (i.e. writes which can be acknowledged +before the data reaches stable storage), `zvol_request()` achieves low +latency by dispatching the bulk of the work (including waiting for i/o +to disk) to a taskq. The taskq callback (`zvol_read()` or +`zvol_write()`) blocks while waiting for the i/o to disk to complete. +The `zvol_taskq` has 32 threads (by default), so we can have up to 32 +concurrent i/os to disk in service of requests to zvols. + +However, for sync writes (i.e. writes which must be persisted to stable +storage before they can be acknowledged, by calling `zil_commit()`), +`zvol_request()` does not use `zvol_taskq`. Instead it blocks while +waiting for the ZIL write to disk to complete. This has the effect of +serializing sync writes to each zvol. In other words, each zvol will +only process one sync write at a time, waiting for it to be written to +the ZIL before accepting the next request. + +The same issue applies to FLUSH operations, for which `zvol_request()` +calls `zil_commit()` directly. + +== Description of change == + +This commit changes `zvol_request()` to use +`taskq_dispatch_ent(zvol_taskq)` for sync writes and FLUSH operations. +Therefore we can have up to 32 threads (the taskq threads) +simultaneously calling `zil_commit()`, for a theoretical performance +improvement of up to 32x. + +To avoid the locking issue described in the comment (which this commit +removes), we acquire the rangelock from the taskq callback (e.g. +`zvol_write()`) rather than from `zvol_request()`. This applies to all +writes (sync and async), reads, and discard operations. This means that +multiple simultaneously-outstanding i/o's which access the same block +can complete in any order. This was previously thought to be incorrect, +but a review of the block device interface requirements revealed that +this is fine - the order is inherently not defined. The shorter hold +time of the rangelock should also have a slight performance improvement. + +For an additional slight performance improvement, we use +`taskq_dispatch_ent()` instead of `taskq_dispatch()`, which avoids a +`kmem_alloc()` and eliminates a failure mode. This applies to all +writes (sync and async), reads, and discard operations. + +== Performance results == + +We used a zvol as an iSCSI target (server) for a Windows initiator +(client), with a single connection (the default - i.e. not MC/S). + +We used `diskspd` to generate a workload with 4 threads, doing 1MB +writes to random offsets in the zvol. Without this change we get +231MB/s, and with the change we get 728MB/s, which is 3.15x the original +performance. + +We ran a real-world workload, restoring an MSSQL database, and saw +throughput 2.5x the original. + +We saw more modest performance wins (typically 1.5x-2x) when using MC/S +with 4 connections, and with different numbers of client threads (1, 8, +32).
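The dispatch design this message describes is easy to model outside the kernel. The following is a minimal userspace analogue in C, with pthreads standing in for the SPL taskq API and every name illustrative rather than taken from the ZFS sources: the request path only enqueues the blocking zil_commit()-style work, and a small pool of workers performs it, so many sync writes can be in flight at once.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* One queued request; fn is the blocking work (cf. zvol_write()). */
typedef struct work {
    void (*fn)(void *);
    void *arg;
    struct work *next;
} work_t;

static work_t *head;
static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t qcv = PTHREAD_COND_INITIALIZER;

/* Cheap, non-blocking enqueue: this is the zvol_request() role. */
static void
dispatch(void (*fn)(void *), void *arg)
{
    work_t *w = malloc(sizeof (*w));
    w->fn = fn;
    w->arg = arg;
    pthread_mutex_lock(&qlock);
    w->next = head;             /* LIFO order is fine for a sketch */
    head = w;
    pthread_cond_signal(&qcv);
    pthread_mutex_unlock(&qlock);
}

/* Pool thread: performs the blocking part, then acknowledges. */
static void *
worker(void *unused)
{
    (void) unused;
    for (;;) {
        pthread_mutex_lock(&qlock);
        while (head == NULL)
            pthread_cond_wait(&qcv, &qlock);
        work_t *w = head;
        head = w->next;
        pthread_mutex_unlock(&qlock);
        w->fn(w->arg);          /* blocks here, not in the request path */
        free(w);
    }
    return (NULL);
}

/* Stands in for zvol_write() + zil_commit(): blocks, then acks. */
static void
sync_write(void *arg)
{
    usleep(1000);
    printf("acked request %ld\n", (long)arg);
}

int
main(void)
{
    pthread_t tids[4];          /* cf. the 32 zvol_taskq threads */
    for (int i = 0; i < 4; i++)
        pthread_create(&tids[i], NULL, worker, NULL);
    for (long r = 0; r < 8; r++)
        dispatch(sync_write, (void *)r);    /* returns immediately */
    sleep(1);                   /* crude: let the workers drain */
    return (0);
}

The mapping to the commit message is direct: before the change, the request thread effectively ran sync_write() itself, one request at a time per zvol; after it, up to pool-width requests can sit in zil_commit() concurrently.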
+ +Signed-off-by: Matthew Ahrens +--- + module/zfs/zvol.c | 122 ++++++++++++++++++++++------------ + 1 file changed, 78 insertions(+), 44 deletions(-) + +diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c +index ce719734c0c..9439954b8ac 100644 +--- a/module/zfs/zvol.c ++++ b/module/zfs/zvol.c +@@ -38,9 +38,6 @@ + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + */ +-/* +- * Copyright (c) 2012, 2020 by Delphix. All rights reserved. +- */ + + /* + * Note on locking of zvol state structures. +@@ -723,7 +720,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_ + typedef struct zv_request { + zvol_state_t *zv; + struct bio *bio; +- taskq_ent_t ent; ++ zfs_locked_range_t *lr; + } zv_request_t; + + static void +@@ -752,18 +749,6 @@ zvol_write(void *arg) + ASSERT(zv && zv->zv_open_count > 0); + ASSERT(zv->zv_zilog != NULL); + +- /* bio marked as FLUSH need to flush before write */ +- if (bio_is_flush(bio)) +- zil_commit(zv->zv_zilog, ZVOL_OBJ); +- +- /* Some requests are just for flush and nothing else. */ +- if (uio.uio_resid == 0) { +- rw_exit(&zv->zv_suspend_lock); +- BIO_END_IO(bio, 0); +- kmem_free(zvr, sizeof (zv_request_t)); +- return; +- } +- + ssize_t start_resid = uio.uio_resid; + unsigned long start_jif = jiffies; + blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio), +@@ -772,9 +757,6 @@ zvol_write(void *arg) + boolean_t sync = + bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + +- zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, +- uio.uio_loffset, uio.uio_resid, RL_WRITER); +- + uint64_t volsize = zv->zv_volsize; + while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { + uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); +@@ -801,7 +783,7 @@ zvol_write(void *arg) + if (error) + break; + } +- zfs_rangelock_exit(lr); ++ zfs_rangelock_exit(zvr->lr); + + int64_t nwritten = start_resid - uio.uio_resid; + dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); +@@ -884,9 +866,6 @@ zvol_discard(void *arg) + if (start >= end) + goto unlock; + +- zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, +- start, size, RL_WRITER); +- + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); +@@ -898,12 +877,12 @@ zvol_discard(void *arg) + error = dmu_free_long_range(zv->zv_objset, + ZVOL_OBJ, start, size); + } +- zfs_rangelock_exit(lr); ++unlock: ++ zfs_rangelock_exit(zvr->lr); + + if (error == 0 && sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + +-unlock: + rw_exit(&zv->zv_suspend_lock); + blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0, + start_jif); +@@ -929,9 +908,6 @@ zvol_read(void *arg) + blk_generic_start_io_acct(zv->zv_queue, READ, bio_sectors(bio), + &zv->zv_disk->part0); + +- zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, +- uio.uio_loffset, uio.uio_resid, RL_READER); +- + uint64_t volsize = zv->zv_volsize; + while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { + uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); +@@ -948,7 +924,7 @@ zvol_read(void *arg) + break; + } + } +- zfs_rangelock_exit(lr); ++ zfs_rangelock_exit(zvr->lr); + + int64_t nread = start_resid - uio.uio_resid; + dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); +@@ -1063,15 +1039,16 @@ zvol_request(struct request_queue *q, st + } + + if (rw == WRITE) { ++ boolean_t need_sync = B_FALSE; ++ + if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { + BIO_END_IO(bio, -SET_ERROR(EROFS)); + goto out; + } + + /* +- * 
Prevents the zvol from being suspended, or the ZIL being +- * concurrently opened. Will be released after the i/o +- * completes. ++ * To be released in the I/O function. See the comment on ++ * rangelock_enter() below. + */ + rw_enter(&zv->zv_suspend_lock, RW_READER); + +@@ -1092,55 +1069,47 @@ zvol_request(struct request_queue *q, st + rw_downgrade(&zv->zv_suspend_lock); + } + ++ /* bio marked as FLUSH need to flush before write */ ++ if (bio_is_flush(bio)) ++ zil_commit(zv->zv_zilog, ZVOL_OBJ); ++ ++ /* Some requests are just for flush and nothing else. */ ++ if (size == 0) { ++ rw_exit(&zv->zv_suspend_lock); ++ BIO_END_IO(bio, 0); ++ goto out; ++ } ++ + zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); + zvr->zv = zv; + zvr->bio = bio; +- taskq_init_ent(&zvr->ent); + + /* +- * We don't want this thread to be blocked waiting for i/o to +- * complete, so we instead wait from a taskq callback. The +- * i/o may be a ZIL write (via zil_commit()), or a read of an +- * indirect block, or a read of a data block (if this is a +- * partial-block write). We will indicate that the i/o is +- * complete by calling BIO_END_IO() from the taskq callback. +- * +- * This design allows the calling thread to continue and +- * initiate more concurrent operations by calling +- * zvol_request() again. There are typically only a small +- * number of threads available to call zvol_request() (e.g. +- * one per iSCSI target), so keeping the latency of +- * zvol_request() low is important for performance. +- * +- * The zvol_request_sync module parameter allows this +- * behavior to be altered, for performance evaluation +- * purposes. If the callback blocks, setting +- * zvol_request_sync=1 will result in much worse performance. +- * +- * We can have up to zvol_threads concurrent i/o's being +- * processed for all zvols on the system. This is typically +- * a vast improvement over the zvol_request_sync=1 behavior +- * of one i/o at a time per zvol. However, an even better +- * design would be for zvol_request() to initiate the zio +- * directly, and then be notified by the zio_done callback, +- * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL +- * interfaces lack this functionality (they block waiting for +- * the i/o to complete). ++ * To be released in the I/O function. Since the I/O functions ++ * are asynchronous, we take it here synchronously to make ++ * sure overlapped I/Os are properly ordered. + */ ++ zvr->lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, ++ RL_WRITER); ++ /* ++ * Sync writes and discards execute zil_commit() which may need ++ * to take a RL_READER lock on the whole block being modified ++ * via its zilog->zl_get_data(): to avoid circular dependency ++ * issues with taskq threads, execute these requests ++ * synchronously here in zvol_request().
++ */ ++ need_sync = bio_is_fua(bio) || ++ zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { +- if (zvol_request_sync) { ++ if (zvol_request_sync || need_sync || ++ taskq_dispatch(zvol_taskq, zvol_discard, zvr, ++ TQ_SLEEP) == TASKQID_INVALID) + zvol_discard(zvr); +- } else { +- taskq_dispatch_ent(zvol_taskq, +- zvol_discard, zvr, 0, &zvr->ent); +- } + } else { +- if (zvol_request_sync) { ++ if (zvol_request_sync || need_sync || ++ taskq_dispatch(zvol_taskq, zvol_write, zvr, ++ TQ_SLEEP) == TASKQID_INVALID) + zvol_write(zvr); +- } else { +- taskq_dispatch_ent(zvol_taskq, +- zvol_write, zvr, 0, &zvr->ent); +- } + } + } else { + /* +@@ -1156,17 +1125,14 @@ zvol_request(struct request_queue *q, st + zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); + zvr->zv = zv; + zvr->bio = bio; +- taskq_init_ent(&zvr->ent); + + rw_enter(&zv->zv_suspend_lock, RW_READER); + +- /* See comment in WRITE case above. */ +- if (zvol_request_sync) { ++ zvr->lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, ++ RL_READER); ++ if (zvol_request_sync || taskq_dispatch(zvol_taskq, ++ zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID) + zvol_read(zvr); +- } else { +- taskq_dispatch_ent(zvol_taskq, +- zvol_read, zvr, 0, &zvr->ent); +- } + } + + out: diff '--exclude=.git' -Npaur ubuntu/zfs-linux/debian/patches/pr10184.patch zfs-linux/debian/patches/pr10184.patch --- ubuntu/zfs-linux/debian/patches/pr10184.patch 1969-12-31 19:00:00.000000000 -0500 +++ zfs-linux/debian/patches/pr10184.patch 2020-05-13 19:22:23.600999520 -0400 @@ -0,0 +1,29 @@ +From 1eb1f33c6914e6419d4f4699bf45dfe8e288eeb7 Mon Sep 17 00:00:00 2001 +From: Matthew Ahrens +Date: Mon, 6 Apr 2020 08:01:45 -0700 +Subject: [PATCH] zvol_write() can use dmu_tx_hold_write_by_dnode() + +We can improve the performance of writes to zvols by using +dmu_tx_hold_write_by_dnode() instead of dmu_tx_hold_write(). This +reduces lock contention on the first block of the dnode object, and also +reduces the amount of CPU needed. The benefit will be highest with +multi-threaded async writes (i.e. writes that don't call zil_commit()). 
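The win is easiest to see side by side. The sketch below is an illustrative userspace model in C, not the DMU API: hold_by_id() stands in for dmu_tx_hold_write(), which resolves the object number through shared state on every call, while hold_by_ptr() stands in for dmu_tx_hold_write_by_dnode(), which reuses the handle (cf. zv->zv_dn) the caller already holds, so the lookup and the contended lock drop out of the write path.

#include <pthread.h>

typedef struct obj { long id; } obj_t;

static obj_t table[128];
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Resolve by number: every call takes the lock all writers share. */
obj_t *
hold_by_id(long id)
{
    pthread_mutex_lock(&table_lock);    /* contended on hot write paths */
    obj_t *o = &table[id % 128];
    pthread_mutex_unlock(&table_lock);
    return (o);
}

/* Reuse a pointer the caller already owns: no lookup, no shared lock. */
obj_t *
hold_by_ptr(obj_t *o)
{
    return (o);
}

int
main(void)
{
    obj_t *o = hold_by_id(7);           /* resolve once, up front */
    for (int i = 0; i < 1000000; i++)
        (void) hold_by_ptr(o);          /* per-write holds skip the lock */
    return (0);
}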
+ +Signed-off-by: Matthew Ahrens +--- + module/zfs/zvol.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c +index ce719734c0c..9439954b8ac 100644 +--- a/module/zfs/zvol.c ++++ b/module/zfs/zvol.c +@@ -125,7 +125,7 @@ zvol_write(void *arg) + if (bytes > volsize - off) /* don't write past the end */ + bytes = volsize - off; + +- dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); ++ dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); + + /* This will only fail for ENOSPC */ + error = dmu_tx_assign(tx, TXG_WAIT); diff '--exclude=.git' -Npaur ubuntu/zfs-linux/debian/patches/series zfs-linux/debian/patches/series --- ubuntu/zfs-linux/debian/patches/series 2020-05-16 17:46:19.019566887 -0400 +++ zfs-linux/debian/patches/series 2020-05-16 17:52:12.944742162 -0400 @@ -1,21 +1,23 @@ -0001-Prevent-manual-builds-in-the-DKMS-source.patch -0002-Check-for-META-and-DCH-consistency-in-autoconf.patch +#0001-Prevent-manual-builds-in-the-DKMS-source.patch +#0002-Check-for-META-and-DCH-consistency-in-autoconf.patch 0003-relocate-zvol_wait.patch enable-zed.patch -1004-zed-service-bindir.patch -2100-zfs-load-module.patch -2200-add-zfs-0.6.x-ioctl-compat-shim.patch +#1004-zed-service-bindir.patch +2101-zfs-load-module.patch +#2200-add-zfs-0.6.x-ioctl-compat-shim.patch 3100-remove-libzfs-module-timeout.patch 3302-Use-obj-m-instead-of-subdir-m.patch -4000-zsys-support.patch +4001-zsys-support.patch 4100-disable-bpool-upgrade.patch force-verbose-rules.patch #unapplied/init-debian-openrc-workaround.patch # OpenRC users can apply this locally -4550-Linux-5.5-compat-blkg_tryget.patch -4600-Linux-5.6-compat-struct-proc_ops.patch -4601-Linux-5.6-compat-timestamp_truncate.patch -4602-Linux-5.6-compat-ktime_get_raw_ts64.patch -4603-Linux-5.6-compat-time_t.patch +#4550-Linux-5.5-compat-blkg_tryget.patch +#4600-Linux-5.6-compat-struct-proc_ops.patch +#4601-Linux-5.6-compat-timestamp_truncate.patch +#4602-Linux-5.6-compat-ktime_get_raw_ts64.patch +#4603-Linux-5.6-compat-time_t.patch zfs-mount-container-start.patch -4610-ICP-Improve-AES-GCM-performance.patch +pr10163.patch +pr10184.patch +overlay.patch diff '--exclude=.git' -Npaur ubuntu/zfs-linux/debian/rules zfs-linux/debian/rules --- ubuntu/zfs-linux/debian/rules 2020-05-16 17:46:48.147546638 -0400 +++ zfs-linux/debian/rules 2020-05-13 16:27:11.358747557 -0400 @@ -56,6 +56,9 @@ endif --with-systemdunitdir=/lib/systemd/system \ --with-systemdpresetdir=/lib/systemd/system-preset \ --with-systemdgeneratordir=/lib/systemd/system-generators \ + --with-dracutdir=/usr/lib/dracut \ + --sysconfdir=/etc \ + --exec-prefix=/usr \ --with-config=user override_dh_gencontrol: @@ -165,7 +168,7 @@ endif override_dh_install: find . -name lib*.la -delete - dh_install --fail-missing + dh_install --list-missing override_dh_installinit: dh_installinit -r --no-restart-after-upgrade --name zfs-import
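One loose end from the aes_impl.h and modes.h hunks above: both warn that field offsets are hard coded in the assembly (aesni-gcm-x86_64.S, ghash-x86_64.S), e.g. that gcm_Htable must sit directly after gcm_H. Constraints like that can be pinned down at build time rather than left to a comment. A minimal C11 sketch, using an illustrative copy of the struct rather than the real header, so the field set and sizes here are assumptions:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative stand-in for the gcm_ctx fields named in the hunk above. */
typedef struct gcm_ctx_demo {
    uint64_t gcm_ghash[2];
    uint64_t gcm_H[2];
    uint64_t gcm_Htable[12][2];
} gcm_ctx_demo_t;

/* Fail the build if gcm_Htable does not start right where gcm_H ends. */
static_assert(offsetof(gcm_ctx_demo_t, gcm_Htable) ==
    offsetof(gcm_ctx_demo_t, gcm_H) + 2 * sizeof (uint64_t),
    "gcm_Htable must immediately follow gcm_H for the asm routines");

int
main(void)
{
    return (0);
}

Applied against the real gcm_ctx_t with the offsets the assembler actually uses, an assertion like this would turn a silent layout break into a compile error.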