diff -Nru xen-4.1.3/Config.mk xen-4.1.5/Config.mk
--- xen-4.1.3/Config.mk 2012-08-09 22:08:04.000000000 +0200
+++ xen-4.1.5/Config.mk 2013-05-02 22:08:10.000000000 +0200
@@ -144,6 +144,7 @@
 $(call cc-option-add,HOSTCFLAGS,HOSTCC,-Wdeclaration-after-statement)
 $(call cc-option-add,CFLAGS,CC,-Wdeclaration-after-statement)
 $(call cc-option-add,CFLAGS,CC,-Wno-unused-but-set-variable)
+$(call cc-option-add,CFLAGS,CC,-Wno-unused-local-typedefs)
 
 LDFLAGS += $(foreach i, $(EXTRA_LIB), -L$(i))
 CFLAGS += $(foreach i, $(EXTRA_INCLUDES), -I$(i))
@@ -178,10 +179,11 @@
 # Mercurial in-tree version, or a local directory, or a git URL.
 # CONFIG_QEMU ?= `pwd`/$(XEN_ROOT)/../qemu-xen.git
 CONFIG_QEMU ?= $(QEMU_REMOTE)
+# CONFIG_QEMU ?= $(XEN_ROOT)/tools/ioemu-qemu-xen
 
-QEMU_TAG ?= xen-4.1.3
-# Mon Apr 2 17:55:05 2012 +0100
-# qemu-xen-traditional: QDISK fixes
+QEMU_TAG ?= xen-4.1.5
+# Thu Jan 17 15:52:16 2013 +0000
+# e1000: fix compile warning introduced by security fix, and debugging
 
 # Optional components
 XENSTAT_XENTOP ?= y
diff -Nru xen-4.1.3/debian/changelog xen-4.1.5/debian/changelog
--- xen-4.1.3/debian/changelog 2013-04-11 16:06:46.000000000 +0200
+++ xen-4.1.5/debian/changelog 2013-05-15 16:01:55.000000000 +0200
@@ -1,3 +1,185 @@
+xen (4.1.5-0ubuntu0.12.10.1) quantal-proposed; urgency=low
+
+  * Updating to latest upstream stable release (LP: #1180396).
+    Replacing:
+    - CVE-2012-3494, CVE-2012-3495, CVE-2012-3496, CVE-2012-3498,
+      CVE-2012-3515, CVE-2012-4411
+    - xsa20, xsa21, xsa22, xsa23-4.0-4.1, xsa24, xsa25-4.1, xsa26-4.1,
+      xsa27-4.1, xsa28-4.1, xsa29-4.1, xsa30-4.1, xsa31-4.1, xsa33-4.1,
+      xsa36-4.1, xsa38, xsa44-4.1, xsa46-4.1, xsa47-4.1
+    - qemu-cve-2012-6075-1, qemu-cve-2012-6075-2
+    - 0008-vmx-Simplify-cr0-update-handling-by-deferring-cr4-ch.patch
+    - 0009-VMX-disable-SMEP-feature-when-guest-is-in-non-paging.patch
+    - 0010-VMX-Always-disable-SMEP-when-guest-is-in-non-paging-.patch
+  * Upstream changes:
+    - x86-64/MMCFG: correct base address computation for regions not
+      starting at bus 0
+    - x86-64: Fix off-by-one error in __addr_ok() macro
+    - X86: Disable PCID/INVPCID for dom0
+    - X86: Disable PCID/INVPCID for pv
+    - Fix save/restore of guest PAT table in HAP paging mode.
+    - x86: don't hold off NMI delivery when MCE is masked
+    - amd iommu: Add workaround for erratum 732 & 733
+    - x86/EDD: check MBR for BIOS magic before considering signature valid
+    - xen: fix page_list_splice()
+    - x86-64: don't allow non-canonical addresses to be set for any callback
+    - make all (native) hypercalls consistently have "long" return type
+    - xen: prevent a 64 bit guest setting reserved bits in DR7
+    - xen: handle out-of-pirq condition correctly in PHYSDEVOP_get_free_pirq
+    - xen: Don't BUG_ON() PoD operations on a non-translated guest.
+    - x86/pvhvm: properly range-check PHYSDEVOP_map_pirq/MAP_PIRQ_TYPE_GSI
+    - x86/passthrough: Fix corruption caused by race conditions between
+      device allocation and deallocation to a domain.
+    - acpi: Make sure valid CPU is passed to do_pm_op()
+    - x86/PoD: prevent guest from being destroyed upon early access to
+      its memory
+    - x86/PoD: clean up types
+    - x86/mm: update max_mapped_pfn on MMIO mappings too.
+    - EPT/PoD: fix interaction with 1Gb pages
+    - x86: don't expose SYSENTER on unknown CPUs
+    - x86-64: refine the XSA-9 fix
+    - Fix shared entry status for grant copy operation on paged-out gfn
+    - x86,cmdline: Fix setting skip_realmode boolean on no-real-mode and
+      tboot options ...effect should be cumulative.
+    - x86: Prefer multiboot-provided e820 over bios-provided e801 memory info.
+    - tmem: only allow tmem control operations from privileged domains
+    - tmem: consistently make pool_id a uint32_t
+    - tmem: check the pool_id is valid when destroying a tmem pool
+    - tmem: check for a valid client ("domain") in the save subops
+    - tmem: don't access guest memory without using the accessors intended
+      for this
+    - tmem: detect arithmetic overflow in tmh_copy_{from,to}_client()
+    - tmem: properly drop lock on error path in do_tmem_get()
+    - tmem: properly drop lock on error path in do_tmem_op()
+    - tmem: reduce severity of log messages
+    - tmem: fixup 2010 cleanup patch that breaks tmem save/restore
+    - tmem: cleanup
+    - tmem: bump pool version to 1 to fix restore issue when tmem enabled
+    - make domain_create() return a proper error code
+    - x86/MSI: fix 2nd S3 resume with interrupt remapping enabled
+    - adjust a few RCU domain locking calls
+    - VT-d: split .ack and .disable DMA-MSI actors
+    - x86: properly check XEN_DOMCTL_ioport_mapping arguments for invalid range
+    - x86: tighten checks in XEN_DOMCTL_memory_mapping handler
+    - x86: check remote MMIO remap permissions
+    - libxc: builder: limit maximum size of kernel/ramdisk.
+    - hvmloader: Do not zero the wallclock fields in shared-info.
+    - x86/amd: Fix xen_apic_write warnings in Dom0
+    - x86/xenoprof: fix kernel/user mode detection for HVM
+    - More efficient TLB-flush filtering in alloc_heap_pages().
+    - x86/oprof: adjust off-by-one counter range checks
+    - x86/HPET: obtain proper lock for changing IRQ affinity
+    - hvm: handle PoD and grant pages in HVMOP_get_mem_type
+    - x86: don't special case first IO-APIC
+    - docs: correct formatting errors in xmdomain.cfg
+    - pygrub: always append --args
+    - hotplug/Linux: Remove tracing (bash -x) from network-nat script
+    - xenballoond.init: remove 4 from default runlevel
+    - pygrub: correct typo in --args assignment
+    - tools: xend: fix wrong condition check for xml file
+    - xend/pvscsi: fix passing of SCSI control LUNs
+    - xend/pvscsi: fix usage of persistant device names for SCSI devices
+    - xend/pvscsi: update sysfs parser for Linux 3.0
+    - tmem: Prevent NULL dereference on error case
+    - x86/mm x86 shadow: Fix typo in sh_invlpg sl3 page presence check
+    - VCPU/timers: Prevent overflow in calculations, leading to DoS
+      vulnerability
+    - x86/physdev: Range check pirq parameter from guests
+    - x86/physmap: Prevent incorrect updates of m2p mappings
+    - xen/mm/shadow: check toplevel pagetables are present before
+      unhooking them.
+    - compat/gnttab: Prevent infinite loop in compat code
+    - fix backport oversight in 23383:addf106cc90f
+    - x86/time: fix scale_delta() inline assembly
+    - gnttab: fix releasing of memory upon switches between versions
+    - hvm: Limit the size of large HVM op batches
+    - x86/HVM: range check xen_hvm_set_mem_access.hvmmem_access before use
+    - xen: add missing guest address range checks to XENMEM_exchange handlers
+    - xen: fix error handling of guest_physmap_mark_populate_on_demand()
+    - memop: limit guest specified extent order
+    - MAINTAINERS: Reference stable maintenance policy
+    - x86/hap: Fix memory leak of domain->arch.hvm_domain.dirty_vram
+    - libxl: avoid blktap2 deadlock on cleanup
+    - libxl: revert 23428:93e17b0cd035 "avoid blktap2 deadlock"
+    - Add Dom0 xz kernel decompression
+    - XZ decompressor: Fix decoding of empty LZMA2 streams
+    - XZ: Fix incorrect XZ_BUF_ERROR
+    - passthrough/PCI: replace improper uses of pci_find_next_cap()
+    - x86/HPET: fix FSB interrupt masking
+    - IOMMU/ATS: fix maximum queue depth calculation
+    - x86, amd: Disable way access filter on Piledriver CPUs
+    - VT-d: fix interrupt remapping source validation for devices
+      behind legacy bridges
+    - Config.mk: delete accidentally introduced drivel
+    - x86/mm: Fix loop increment in paging_log_dirty_range()
+    - x86: compat_show_guest_stack() should not truncate MFN
+    - ACPI: acpi_table_parse() should return handler's error code
+    - AMD,IOMMU: Clean up old entries in remapping tables when creating new one
+    - AMD,IOMMU: Disable IOMMU if SATA Combined mode is on
+    - AMD,IOMMU: Make per-device interrupt remapping table default
+    - tools/ocaml: oxenstored: Be more paranoid about ring reading
+    - oxenstored: Enforce a maximum message size of 4096 bytes
+    - x86/AMD: Enable WC+ memory type on family 10 processors
+    - x86: restore (optional) forwarding of PCI SERR induced NMI to Dom0
+    - unmodified_drivers: __devinit was removed in linux-3.8
+    - tools/ocaml: oxenstored: correctly handle a full ring.
+    - AMD IOMMU: also spot missing IO-APIC entries in IVRS table
+    - xen: sched_credit: improve picking up the idle CPU for a VCPU
+    - gcc4.8 build fix: Add -Wno-unused-local-typedefs to CFLAGS.
+    - xenoprof: avoid division by 0
+    - AMD IOMMU: don't BUG() when we don't have to
+    - libxl: Fix uninitialized variable in libxl_create_stubdom
+    - tools: Fix memset(&p,0,sizeof(p)) idiom in several places.
+    - x86/setup: don't relocate the VGA hole.
+    - x86: fix null pointer dereference in intel_get_extended_msrs()
+    - IOMMU, AMD Family15h Model10-1Fh erratum 746 Workaround
+    - x86: fix CMCI injection
+    - vmx: fix handling of NMI VMEXIT.
+    - Avoid stale pointer when moving domain to another cpupool
+    - fix compat memory exchange op splitting
+    - x86: make certain memory sub-ops return valid values
+    - SEDF: avoid gathering vCPU-s on pCPU0
+    - x86: defer processing events on the NMI exit path
+    - credit1: Use atomic bit operations for the flags structure
+    - x86/MSI: add mechanism to fully protect MSI-X table from PV
+      guest accesses
+    - Add DomU xz kernel decompression
+    - powernow: add fixups for AMD P-state figures
+    - x86/MCA: suppress bank clearing for certain injected events
+    - AMD/IOMMU: Process softirqs while building dom0 iommu mappings
+    - VT-d: Enumerate IOMMUs when listing capabilities
+    - IOMMU: properly check whether interrupt remapping is enabled
+    - VT-d: deal with 5500/5520/X58 errata
+    - AMD IOMMU: allow disabling only interrupt remapping when certain
+      IVRS consistency checks fail
+    - x86: reserve pages when SandyBridge integrated graphics
+    - hvm: Clean up vlapic_reg_write() error propagation.
+    - ACPI/APEI: fix ERST MOVE_DATA instruction implementation
+    - ACPI/ERST: Name table in otherwise opaque error messages
+    - ACPI/APEI: Unlock apei_iomaps_lock on error path
+    - ACPI, APEI: Add apei_exec_run_optional
+    - ACPI: fix APEI related table size checking
+    - defer event channel bucket pointer store until after XSM checks
+    - x86: irq_move_cleanup_interrupt() must ignore legacy vectors
+    - x86/S3: Restore broken vcpu affinity on resume
+    - x86/mm/shadow: spurious warning when unmapping xenheap pages.
+    - vmx: Simplify cr0 update handling by deferring cr4 changes to the
+      cr4 handler.
+    - VMX: disable SMEP feature when guest is in non-paging mode
+    - VMX: Always disable SMEP when guest is in non-paging mode
+    - x86: don't pass negative time to gtime_to_gtsc()
+    - iommu/crash: Interrupt remapping is also disabled on crash
+    - x86: clear EFLAGS.NT in SYSENTER entry path
+    - x86: fix various issues with handling guest IRQs
+    - Fix rcu domain locking for transitive grants
+    - libxl: Fix SEGV in network-attach
+    - libxl: fix build error after 21c31a81
+  * Fix translation import problem caused by duplicate message ID
+    (LP: #1176209).
+    - tools-xm-fix-duplicate-msgid.patch
+
+ -- Stefan Bader Thu, 02 May 2013 13:14:43 -0700
+
 xen (4.1.3-3ubuntu1.5) quantal-security; urgency=low
 
   * Applying Xen Security Advisories:
diff -Nru xen-4.1.3/debian/patches/0008-vmx-Simplify-cr0-update-handling-by-deferring-cr4-ch.patch xen-4.1.5/debian/patches/0008-vmx-Simplify-cr0-update-handling-by-deferring-cr4-ch.patch
--- xen-4.1.3/debian/patches/0008-vmx-Simplify-cr0-update-handling-by-deferring-cr4-ch.patch 2013-04-08 14:37:06.000000000 +0200
+++ xen-4.1.5/debian/patches/0008-vmx-Simplify-cr0-update-handling-by-deferring-cr4-ch.patch 1970-01-01 01:00:00.000000000 +0100
@@ -1,69 +0,0 @@
-From 1e6275a95d3e35a72939b588f422bb761ba82f6b Mon Sep 17 00:00:00 2001
-From: Keir Fraser
-Date: Tue, 12 Feb 2013 13:43:16 +0100
-Subject: [PATCH] vmx: Simplify cr0 update handling by deferring cr4 changes to the cr4 handler.
- -Signed-off-by: Keir Fraser -xen-unstable changeset: 26501:8201b6ec3564 -xen-unstable date: Wed Jan 30 17:15:39 UTC 2013 ---- - xen/arch/x86/hvm/vmx/vmx.c | 15 +++++---------- - 1 files changed, 5 insertions(+), 10 deletions(-) - -Index: xen-4.1.3/xen/arch/x86/hvm/vmx/vmx.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/hvm/vmx/vmx.c 2012-08-09 22:08:08.000000000 +0200 -+++ xen-4.1.3/xen/arch/x86/hvm/vmx/vmx.c 2013-04-08 14:37:00.977790278 +0200 -@@ -1064,20 +1064,18 @@ static void vmx_update_guest_cr(struct v - - if ( paging_mode_hap(v->domain) ) - { -- /* We manage GUEST_CR3 when guest CR0.PE is zero or when cr3 memevents are on */ -+ /* Manage GUEST_CR3 when CR0.PE=0. */ - uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - v->arch.hvm_vmx.exec_control &= ~cr3_ctls; - if ( !hvm_paging_enabled(v) ) - v->arch.hvm_vmx.exec_control |= cr3_ctls; - -+ /* Trap CR3 updates if CR3 memory events are enabled. */ - if ( v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_CR3] ) - v->arch.hvm_vmx.exec_control |= CPU_BASED_CR3_LOAD_EXITING; - - vmx_update_cpu_exec_control(v); -- -- /* Changing CR0.PE can change some bits in real CR4. */ -- vmx_update_guest_cr(v, 4); - } - - if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) -@@ -1107,8 +1105,6 @@ static void vmx_update_guest_cr(struct v - { - for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ ) - vmx_set_segment_register(v, s, ®[s]); -- v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME; -- __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]); - v->arch.hvm_vmx.exception_bitmap = 0xffffffff; - vmx_update_exception_bitmap(v); - } -@@ -1118,10 +1114,6 @@ static void vmx_update_guest_cr(struct v - if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<arch.hvm_vmx.vm86_saved_seg[s]); -- v->arch.hvm_vcpu.hw_cr[4] = -- ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME) -- |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME)); -- __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]); - v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK - | (paging_mode_hap(v->domain) ? - 0 : (1U << TRAP_page_fault)) -@@ -1135,6 +1127,9 @@ static void vmx_update_guest_cr(struct v - v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask; - __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]); - __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]); -+ -+ /* Changing CR0 can change some bits in real CR4. */ -+ vmx_update_guest_cr(v, 4); - break; - } - case 2: diff -Nru xen-4.1.3/debian/patches/0009-VMX-disable-SMEP-feature-when-guest-is-in-non-paging.patch xen-4.1.5/debian/patches/0009-VMX-disable-SMEP-feature-when-guest-is-in-non-paging.patch --- xen-4.1.3/debian/patches/0009-VMX-disable-SMEP-feature-when-guest-is-in-non-paging.patch 2013-04-08 14:37:13.000000000 +0200 +++ xen-4.1.5/debian/patches/0009-VMX-disable-SMEP-feature-when-guest-is-in-non-paging.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,43 +0,0 @@ -From 485f374230d39e153d7b9786e3d0336bd52ee661 Mon Sep 17 00:00:00 2001 -From: Dongxiao Xu -Date: Tue, 12 Feb 2013 13:44:02 +0100 -Subject: [PATCH] VMX: disable SMEP feature when guest is in non-paging mode - -SMEP is disabled if CPU is in non-paging mode in hardware. -However Xen always uses paging mode to emulate guest non-paging -mode with HAP. To emulate this behavior, SMEP needs to be manually -disabled when guest switches to non-paging mode. - -We met an issue that, SMP Linux guest with recent kernel (enable -SMEP support, for example, 3.5.3) would crash with triple fault if -setting unrestricted_guest=0 in grub. 
This is because Xen uses an -identity mapping page table to emulate the non-paging mode, where -the page table is set with USER flag. If SMEP is still enabled in -this case, guest will meet unhandlable page fault and then crash. - -Signed-off-by: Dongxiao Xu -Signed-off-by: Xiantao Zhang -xen-unstable changeset: 26502:d1bf3b21f783 -xen-unstable date: Wed Jan 30 17:17:30 UTC 2013 ---- - xen/arch/x86/hvm/vmx/vmx.c | 7 +++++++ - 1 files changed, 7 insertions(+), 0 deletions(-) - -Index: xen-4.1.3/xen/arch/x86/hvm/vmx/vmx.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/hvm/vmx/vmx.c 2013-04-08 14:37:00.977790278 +0200 -+++ xen-4.1.3/xen/arch/x86/hvm/vmx/vmx.c 2013-04-08 14:37:09.189830426 +0200 -@@ -1158,6 +1158,13 @@ static void vmx_update_guest_cr(struct v - { - v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE; - v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE; -+ /* -+ * SMEP is disabled if CPU is in non-paging mode in hardware. -+ * However Xen always uses paging mode to emulate guest non-paging -+ * mode with HAP. To emulate this behavior, SMEP needs to be -+ * manually disabled when guest switches to non-paging mode. -+ */ -+ v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_SMEP; - } - __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]); - __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]); diff -Nru xen-4.1.3/debian/patches/0010-VMX-Always-disable-SMEP-when-guest-is-in-non-paging-.patch xen-4.1.5/debian/patches/0010-VMX-Always-disable-SMEP-when-guest-is-in-non-paging-.patch --- xen-4.1.3/debian/patches/0010-VMX-Always-disable-SMEP-when-guest-is-in-non-paging-.patch 2013-04-08 14:37:21.000000000 +0200 +++ xen-4.1.5/debian/patches/0010-VMX-Always-disable-SMEP-when-guest-is-in-non-paging-.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,52 +0,0 @@ -From 0d2e673a763bc7c2ddf97fed074eb691d325ecc5 Mon Sep 17 00:00:00 2001 -From: Stefan Bader -Date: Thu, 4 Apr 2013 10:37:19 +0200 -Subject: [PATCH] VMX: Always disable SMEP when guest is in non-paging mode - -commit e7dda8ec9fc9020e4f53345cdbb18a2e82e54a65 - VMX: disable SMEP feature when guest is in non-paging mode - -disabled the SMEP bit if a guest VCPU was using HAP and was not -in paging mode. However I could observe VCPUs getting stuck in -the trampoline after the following patch in the Linux kernel -changed the way CR4 gets set up: - x86, realmode: read cr4 and EFER from kernel for 64-bit trampoline - -The change will set CR4 from already set flags which includes the -SMEP bit. On bare metal this does not matter as the CPU is in non- -paging mode at that time. But Xen seems to use the emulated non- -paging mode regardless of HAP (I verified that on the guests I was -seeing the issue, HAP was not used). - -Therefor it seems right to unset the SMEP bit for a VCPU that is -not in paging-mode, regardless of its HAP usage. 
- -Signed-off-by: Stefan Bader -Acked-by: Keir Fraser -Acked-by: Dongxiao Xu -Backported-by: Stefan Bader ---- - xen/arch/x86/hvm/vmx/vmx.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -Index: xen-4.1.3/xen/arch/x86/hvm/vmx/vmx.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/hvm/vmx/vmx.c 2013-04-08 14:37:09.189830426 +0200 -+++ xen-4.1.3/xen/arch/x86/hvm/vmx/vmx.c 2013-04-08 14:37:16.997868606 +0200 -@@ -1158,11 +1158,14 @@ static void vmx_update_guest_cr(struct v - { - v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE; - v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE; -+ } -+ if ( !hvm_paging_enabled(v) ) -+ { - /* - * SMEP is disabled if CPU is in non-paging mode in hardware. - * However Xen always uses paging mode to emulate guest non-paging -- * mode with HAP. To emulate this behavior, SMEP needs to be -- * manually disabled when guest switches to non-paging mode. -+ * mode. To emulate this behavior, SMEP needs to be manually -+ * disabled when guest VCPU is in non-paging mode. - */ - v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_SMEP; - } diff -Nru xen-4.1.3/debian/patches/CVE-2012-3494 xen-4.1.5/debian/patches/CVE-2012-3494 --- xen-4.1.3/debian/patches/CVE-2012-3494 2012-09-07 18:05:34.000000000 +0200 +++ xen-4.1.5/debian/patches/CVE-2012-3494 1970-01-01 01:00:00.000000000 +0100 @@ -1,27 +0,0 @@ -# HG changeset patch -# User Ian Jackson -# Date 1346844474 -3600 -# Node ID bcc3402927311c64cc04e59d3680680b09459da6 -# Parent d28a9ba889c02f835df05bc007c2b4828d86cff2 -xen: prevent a 64 bit guest setting reserved bits in DR7 - -The upper 32 bits of this register are reserved and should be written as -zero. - -This is XSA-12 / CVE-2012-3494 - -Signed-off-by: Jan Beulich -Reviewed-by: Ian Campbell - -diff -r d28a9ba889c0 -r bcc340292731 xen/include/asm-x86/debugreg.h ---- a/xen/include/asm-x86/debugreg.h Tue Sep 04 14:56:48 2012 +0200 -+++ b/xen/include/asm-x86/debugreg.h Wed Sep 05 12:27:54 2012 +0100 -@@ -58,7 +58,7 @@ - We can slow the instruction pipeline for instructions coming via the - gdt or the ldt if we want to. 
I am not sure why this is an advantage */ - --#define DR_CONTROL_RESERVED_ZERO (0x0000d800ul) /* Reserved, read as zero */ -+#define DR_CONTROL_RESERVED_ZERO (~0xffff27fful) /* Reserved, read as zero */ - #define DR_CONTROL_RESERVED_ONE (0x00000400ul) /* Reserved, read as one */ - #define DR_LOCAL_EXACT_ENABLE (0x00000100ul) /* Local exact enable */ - #define DR_GLOBAL_EXACT_ENABLE (0x00000200ul) /* Global exact enable */ diff -Nru xen-4.1.3/debian/patches/CVE-2012-3495 xen-4.1.5/debian/patches/CVE-2012-3495 --- xen-4.1.3/debian/patches/CVE-2012-3495 2012-09-07 18:05:34.000000000 +0200 +++ xen-4.1.5/debian/patches/CVE-2012-3495 1970-01-01 01:00:00.000000000 +0100 @@ -1,35 +0,0 @@ -# HG changeset patch -# User Ian Jackson -# Date 1346844497 -3600 -# Node ID 6779ddca8593b766ccabcfec294ba10f17e68484 -# Parent bcc3402927311c64cc04e59d3680680b09459da6 -xen: handle out-of-pirq condition correctly in PHYSDEVOP_get_free_pirq - -This is XSA-13 / CVE-2012-3495 - -Signed-off-by: Ian Campbell -Signed-off-by: Jan Beulich - -diff -r bcc340292731 -r 6779ddca8593 xen/arch/x86/physdev.c ---- a/xen/arch/x86/physdev.c Wed Sep 05 12:27:54 2012 +0100 -+++ b/xen/arch/x86/physdev.c Wed Sep 05 12:28:17 2012 +0100 -@@ -587,11 +587,16 @@ - break; - - spin_lock(&d->event_lock); -- out.pirq = get_free_pirq(d, out.type, 0); -- d->arch.pirq_irq[out.pirq] = PIRQ_ALLOCATED; -+ ret = get_free_pirq(d, out.type, 0); -+ if ( ret >= 0 ) -+ d->arch.pirq_irq[ret] = PIRQ_ALLOCATED; - spin_unlock(&d->event_lock); - -- ret = copy_to_guest(arg, &out, 1) ? -EFAULT : 0; -+ if ( ret >= 0 ) -+ { -+ out.pirq = ret; -+ ret = copy_to_guest(arg, &out, 1) ? -EFAULT : 0; -+ } - - rcu_unlock_domain(d); - break; diff -Nru xen-4.1.3/debian/patches/CVE-2012-3496 xen-4.1.5/debian/patches/CVE-2012-3496 --- xen-4.1.3/debian/patches/CVE-2012-3496 2012-09-07 18:05:34.000000000 +0200 +++ xen-4.1.5/debian/patches/CVE-2012-3496 1970-01-01 01:00:00.000000000 +0100 @@ -1,26 +0,0 @@ -# HG changeset patch -# User Ian Jackson -# Date 1346844545 -3600 -# Node ID 8ebda5388e4e83a69c73bdd7621e76e1de4fc995 -# Parent 6779ddca8593b766ccabcfec294ba10f17e68484 -xen: Don't BUG_ON() PoD operations on a non-translated guest. - -This is XSA-14 / CVE-2012-3496 - -Signed-off-by: Tim Deegan -Reviewed-by: Ian Campbell -Tested-by: Ian Campbell - -diff -r 6779ddca8593 -r 8ebda5388e4e xen/arch/x86/mm/p2m.c ---- a/xen/arch/x86/mm/p2m.c Wed Sep 05 12:28:17 2012 +0100 -+++ b/xen/arch/x86/mm/p2m.c Wed Sep 05 12:29:05 2012 +0100 -@@ -2414,7 +2414,8 @@ - int pod_count = 0; - int rc = 0; - -- BUG_ON(!paging_mode_translate(d)); -+ if ( !paging_mode_translate(d) ) -+ return -EINVAL; - - rc = gfn_check_limit(d, gfn, order); - if ( rc != 0 ) diff -Nru xen-4.1.3/debian/patches/CVE-2012-3498 xen-4.1.5/debian/patches/CVE-2012-3498 --- xen-4.1.3/debian/patches/CVE-2012-3498 2012-09-07 18:05:34.000000000 +0200 +++ xen-4.1.5/debian/patches/CVE-2012-3498 1970-01-01 01:00:00.000000000 +0100 @@ -1,36 +0,0 @@ -# HG changeset patch -# User Ian Jackson -# Date 1346844596 -3600 -# Node ID 936f63ee4dadb832222c029e958ae7c7564ec0e8 -# Parent 8ebda5388e4e83a69c73bdd7621e76e1de4fc995 -x86/pvhvm: properly range-check PHYSDEVOP_map_pirq/MAP_PIRQ_TYPE_GSI - -This is being used as a array index, and hence must be validated before -use. - -This is XSA-16 / CVE-2012-3498. 
- -Signed-off-by: Jan Beulich - -diff -r 8ebda5388e4e -r 936f63ee4dad xen/arch/x86/physdev.c ---- a/xen/arch/x86/physdev.c Wed Sep 05 12:29:05 2012 +0100 -+++ b/xen/arch/x86/physdev.c Wed Sep 05 12:29:56 2012 +0100 -@@ -40,11 +40,18 @@ - struct hvm_girq_dpci_mapping *girq; - uint32_t machine_gsi = 0; - -+ if ( map->index < 0 || map->index >= NR_HVM_IRQS ) -+ { -+ ret = -EINVAL; -+ break; -+ } -+ - /* find the machine gsi corresponding to the - * emulated gsi */ - hvm_irq_dpci = domain_get_irq_dpci(d); - if ( hvm_irq_dpci ) - { -+ BUILD_BUG_ON(ARRAY_SIZE(hvm_irq_dpci->girq) < NR_HVM_IRQS); - list_for_each_entry ( girq, - &hvm_irq_dpci->girq[map->index], - list ) diff -Nru xen-4.1.3/debian/patches/CVE-2012-3515 xen-4.1.5/debian/patches/CVE-2012-3515 --- xen-4.1.3/debian/patches/CVE-2012-3515 2012-09-07 18:05:34.000000000 +0200 +++ xen-4.1.5/debian/patches/CVE-2012-3515 1970-01-01 01:00:00.000000000 +0100 @@ -1,118 +0,0 @@ -commit 3220480734832a148d26f7a81f90af61c2ecfdd9 -Author: Ian Campbell -Date: Wed Sep 5 12:31:40 2012 +0100 - - console: bounds check whenever changing the cursor due to an escape code - - This is XSA-17 / CVE-2012-3515 - - Signed-off-by: Ian Campbell - (cherry picked from commit a56ae4b5069c7b23ee657b15f08443a9b14a8e7b) - -diff --git a/console.c b/console.c -index 5e6e3d0..9984d6f 100644 ---- a/qemu/console.c -+++ b/qemu/console.c -@@ -794,6 +794,26 @@ static void console_clear_xy(TextConsole *s, int x, int y) - update_xy(s, x, y); - } - -+/* set cursor, checking bounds */ -+static void set_cursor(TextConsole *s, int x, int y) -+{ -+ if (x < 0) { -+ x = 0; -+ } -+ if (y < 0) { -+ y = 0; -+ } -+ if (y >= s->height) { -+ y = s->height - 1; -+ } -+ if (x >= s->width) { -+ x = s->width - 1; -+ } -+ -+ s->x = x; -+ s->y = y; -+} -+ - static void console_putchar(TextConsole *s, int ch) - { - TextCell *c; -@@ -869,7 +889,8 @@ static void console_putchar(TextConsole *s, int ch) - s->esc_params[s->nb_esc_params] * 10 + ch - '0'; - } - } else { -- s->nb_esc_params++; -+ if (s->nb_esc_params < MAX_ESC_PARAMS) -+ s->nb_esc_params++; - if (ch == ';') - break; - #ifdef DEBUG_CONSOLE -@@ -883,59 +904,37 @@ static void console_putchar(TextConsole *s, int ch) - if (s->esc_params[0] == 0) { - s->esc_params[0] = 1; - } -- s->y -= s->esc_params[0]; -- if (s->y < 0) { -- s->y = 0; -- } -+ set_cursor(s, s->x, s->y - s->esc_params[0]); - break; - case 'B': - /* move cursor down */ - if (s->esc_params[0] == 0) { - s->esc_params[0] = 1; - } -- s->y += s->esc_params[0]; -- if (s->y >= s->height) { -- s->y = s->height - 1; -- } -+ set_cursor(s, s->x, s->y + s->esc_params[0]); - break; - case 'C': - /* move cursor right */ - if (s->esc_params[0] == 0) { - s->esc_params[0] = 1; - } -- s->x += s->esc_params[0]; -- if (s->x >= s->width) { -- s->x = s->width - 1; -- } -+ set_cursor(s, s->x + s->esc_params[0], s->y); - break; - case 'D': - /* move cursor left */ - if (s->esc_params[0] == 0) { - s->esc_params[0] = 1; - } -- s->x -= s->esc_params[0]; -- if (s->x < 0) { -- s->x = 0; -- } -+ set_cursor(s, s->x - s->esc_params[0], s->y); - break; - case 'G': - /* move cursor to column */ -- s->x = s->esc_params[0] - 1; -- if (s->x < 0) { -- s->x = 0; -- } -+ set_cursor(s, s->esc_params[0] - 1, s->y); - break; - case 'f': - case 'H': - /* move cursor to row, column */ -- s->x = s->esc_params[1] - 1; -- if (s->x < 0) { -- s->x = 0; -- } -- s->y = s->esc_params[0] - 1; -- if (s->y < 0) { -- s->y = 0; -- } -+ set_cursor(s, s->esc_params[1] - 1, s->esc_params[0] - 1); - break; - case 'J': - switch 
(s->esc_params[0]) { diff -Nru xen-4.1.3/debian/patches/CVE-2012-4411 xen-4.1.5/debian/patches/CVE-2012-4411 --- xen-4.1.3/debian/patches/CVE-2012-4411 2012-09-07 19:38:58.000000000 +0200 +++ xen-4.1.5/debian/patches/CVE-2012-4411 1970-01-01 01:00:00.000000000 +0100 @@ -1,31 +0,0 @@ -commit d7d453f51459b591faa96d1c123b5bfff7c5b6b6 -Author: Ian Jackson -Date: Thu Sep 6 17:05:30 2012 +0100 - - Disable qemu monitor by default. The qemu monitor is an overly - powerful feature which must be protected from untrusted (guest) - administrators. - - Neither xl nor xend expect qemu to produce this monitor unless it is - explicitly requested. - - This is a security problem, XSA-19. Previously it was CVE-2007-0998 - in Red Hat but we haven't dealt with it in upstream. We hope to have - a new CVE for it here but we don't have one yet. - - Signed-off-by: Ian Jackson - (cherry picked from commit bacc0d302445c75f18f4c826750fb5853b60e7ca) - -diff --git a/vl.c b/vl.c -index f07a659..686a9bd 100644 ---- a/qemu/vl.c -+++ b/qemu/vl.c -@@ -4910,7 +4910,7 @@ int main(int argc, char **argv, char **envp) - kernel_cmdline = ""; - cyls = heads = secs = 0; - translation = BIOS_ATA_TRANSLATION_AUTO; -- monitor_device = "vc:80Cx24C"; -+ monitor_device = "null"; - - serial_devices[0] = "vc:80Cx24C"; - for(i = 1; i < MAX_SERIAL_PORTS; i++) diff -Nru xen-4.1.3/debian/patches/qemu-cve-2012-6075-1.patch xen-4.1.5/debian/patches/qemu-cve-2012-6075-1.patch --- xen-4.1.3/debian/patches/qemu-cve-2012-6075-1.patch 2013-01-08 14:47:33.000000000 +0100 +++ xen-4.1.5/debian/patches/qemu-cve-2012-6075-1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,44 +0,0 @@ -From: Michael Contreras -Date: Mon, 3 Dec 2012 04:11:22 +0000 (-0800) -Subject: e1000: Discard packets that are too long if !SBP and !LPE -X-Git-Tag: v1.3.0~1 -X-Git-Url: http://git.qemu.org/?p=qemu.git;a=commitdiff_plain;h=b0d9ffcd0251161c7c92f94804dcf599dfa3edeb - -e1000: Discard packets that are too long if !SBP and !LPE - -The e1000_receive function for the e1000 needs to discard packets longer than -1522 bytes if the SBP and LPE flags are disabled. The linux driver assumes -this behavior and allocates memory based on this assumption. - -Signed-off-by: Michael Contreras -Signed-off-by: Anthony Liguori ---- - -Index: xen-4.2.0/qemu/hw/e1000.c -=================================================================== ---- xen-4.2.0.orig/qemu/hw/e1000.c 2012-11-20 10:54:05.000000000 +0100 -+++ xen-4.2.0/qemu/hw/e1000.c 2013-01-07 15:43:38.765802328 +0100 -@@ -55,6 +55,9 @@ static int debugflags = DBGBIT(TXERR) | - #define REG_IOADDR 0x0 - #define REG_IODATA 0x4 - -+/* this is the size past which hardware will drop packets when setting LPE=0 */ -+#define MAXIMUM_ETHERNET_VLAN_SIZE 1522 -+ - /* - * HW models: - * E1000_DEV_ID_82540EM works with Windows and Linux -@@ -628,6 +631,13 @@ e1000_receive(void *opaque, const uint8_ - return; - } - -+ /* Discard oversized packets if !LPE and !SBP. 
*/ -+ if (size > MAXIMUM_ETHERNET_VLAN_SIZE -+ && !(s->mac_reg[RCTL] & E1000_RCTL_LPE) -+ && !(s->mac_reg[RCTL] & E1000_RCTL_SBP)) { -+ return; -+ } -+ - if (!receive_filter(s, buf, size)) - return; - diff -Nru xen-4.1.3/debian/patches/qemu-cve-2012-6075-2.patch xen-4.1.5/debian/patches/qemu-cve-2012-6075-2.patch --- xen-4.1.3/debian/patches/qemu-cve-2012-6075-2.patch 2013-01-08 14:47:33.000000000 +0100 +++ xen-4.1.5/debian/patches/qemu-cve-2012-6075-2.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,33 +0,0 @@ -From: Michael Contreras -Date: Wed, 5 Dec 2012 13:31:30 -0500 -Subject: Discard packets longer than 16384 when !SBP to match the hardware - behavior. -Signed-off-by: Michael Contreras - -Origin: https://lists.nongnu.org/archive/html/qemu-devel/2012-12/msg00533.html -Bug: CVE-2012-6075 (pt2) -Index: xen-4.2.0/qemu/hw/e1000.c -=================================================================== ---- xen-4.2.0.orig/qemu/hw/e1000.c 2013-01-07 15:43:38.765802328 +0100 -+++ xen-4.2.0/qemu/hw/e1000.c 2013-01-07 15:53:39.668739989 +0100 -@@ -57,6 +57,8 @@ static int debugflags = DBGBIT(TXERR) | - - /* this is the size past which hardware will drop packets when setting LPE=0 */ - #define MAXIMUM_ETHERNET_VLAN_SIZE 1522 -+/* this is the size past which hardware will drop packets when setting LPE=1 */ -+#define MAXIMUM_ETHERNET_LPE_SIZE 16384 - - /* - * HW models: -@@ -632,8 +634,9 @@ e1000_receive(void *opaque, const uint8_ - } - - /* Discard oversized packets if !LPE and !SBP. */ -- if (size > MAXIMUM_ETHERNET_VLAN_SIZE -- && !(s->mac_reg[RCTL] & E1000_RCTL_LPE) -+ if ((size > MAXIMUM_ETHERNET_LPE_SIZE || -+ (size > MAXIMUM_ETHERNET_VLAN_SIZE -+ && !(s->mac_reg[RCTL] & E1000_RCTL_LPE))) - && !(s->mac_reg[RCTL] & E1000_RCTL_SBP)) { - return; - } diff -Nru xen-4.1.3/debian/patches/series xen-4.1.5/debian/patches/series --- xen-4.1.3/debian/patches/series 2013-04-11 16:05:08.000000000 +0200 +++ xen-4.1.5/debian/patches/series 2013-05-03 23:21:58.000000000 +0200 @@ -4,13 +4,6 @@ upstream-23939:51288f69523f-rework upstream-25290:7a6dcecb1781-rework -CVE-2012-3494 -CVE-2012-3495 -CVE-2012-3496 -CVE-2012-3498 -CVE-2012-3515 -CVE-2012-4411 - xen-x86-interrupt-pointer-missmatch.diff version.patch @@ -79,26 +72,5 @@ xen-amd03-50a70b652b43.patch xen-amd04-eae25241d571.patch xen-amd10-23e33ea79cac.patch -xsa20.patch -xsa21.patch -xsa22-4.1.patch -xsa23-4.0-4.1.patch -xsa24.patch -xsa25-4.1.patch -xsa26-4.1.patch -xsa27-4.1.patch -xsa28-4.1.patch -xsa29-4.1.patch -xsa30-4.1.patch -xsa31-4.1.patch -xsa33-4.1.patch -qemu-cve-2012-6075-1.patch -qemu-cve-2012-6075-2.patch -xsa36-4.1.patch -xsa38.patch -0008-vmx-Simplify-cr0-update-handling-by-deferring-cr4-ch.patch -0009-VMX-disable-SMEP-feature-when-guest-is-in-non-paging.patch -0010-VMX-Always-disable-SMEP-when-guest-is-in-non-paging-.patch -xsa44-4.1.patch -xsa46-4.1.patch -xsa47-4.1.patch + +tools-xm-fix-duplicate-msgid.patch diff -Nru xen-4.1.3/debian/patches/tools-xm-fix-duplicate-msgid.patch xen-4.1.5/debian/patches/tools-xm-fix-duplicate-msgid.patch --- xen-4.1.3/debian/patches/tools-xm-fix-duplicate-msgid.patch 1970-01-01 01:00:00.000000000 +0100 +++ xen-4.1.5/debian/patches/tools-xm-fix-duplicate-msgid.patch 2013-05-03 23:38:45.000000000 +0200 @@ -0,0 +1,23 @@ +Description: Fix duplicate message ID reported by Launchpad + This hopefully fixes the error mail produced on each upload of the + package. Seems with the 4.2.2 version of Xen this has been fixed + already. 
+Forwarded: not-needed +Origin: vendor, http://bugs.launchpad.net/bugs/1176209 +Author: Stefan Bader + +Index: xen-4.1.5/tools/python/xen/xm/messages/en/xen-xm.po +=================================================================== +--- xen-4.1.5.orig/tools/python/xen/xm/messages/en/xen-xm.po 2013-04-23 09:44:20.000000000 -0700 ++++ xen-4.1.5/tools/python/xen/xm/messages/en/xen-xm.po 2013-05-03 14:22:29.394836897 -0700 +@@ -86,8 +86,5 @@ msgstr "Device already exists %s" + msgid "IMPLEMENTATION_ERROR" + msgstr "Class %s does not implement %s" + +-msgid "VLAN_TAG_INVALID" +-msgstr "VLAN tag invalid %s" +- + msgid "NETWORK_ERROR" +-msgstr "Network Error: %s - %s" +\ No newline at end of file ++msgstr "Network Error: %s - %s" diff -Nru xen-4.1.3/debian/patches/xsa20.patch xen-4.1.5/debian/patches/xsa20.patch --- xen-4.1.3/debian/patches/xsa20.patch 2012-12-05 16:40:23.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa20.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,42 +0,0 @@ -VCPU/timers: Prevent overflow in calculations, leading to DoS vulnerability - -The timer action for a vcpu periodic timer is to calculate the next -expiry time, and to reinsert itself into the timer queue. If the -deadline ends up in the past, Xen never leaves __do_softirq(). The -affected PCPU will stay in an infinite loop until Xen is killed by the -watchdog (if enabled). - -This is a security problem, XSA-20 / CVE-2012-4535. - -Signed-off-by: Andrew Cooper -Acked-by: Ian Campbell -Origin: http://lists.xen.org/archives/html/xen-announce/2012-11/msg00001.html -Signed-off-by: Stefan Bader - -Index: xen-4.1.3/xen/common/domain.c -=================================================================== ---- xen-4.1.3.orig/xen/common/domain.c 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/common/domain.c 2012-12-05 16:40:19.504428539 +0100 -@@ -871,6 +871,9 @@ long do_vcpu_op(int cmd, int vcpuid, XEN - if ( set.period_ns < MILLISECS(1) ) - return -EINVAL; - -+ if ( set.period_ns > STIME_DELTA_MAX ) -+ return -EINVAL; -+ - v->periodic_period = set.period_ns; - vcpu_force_reschedule(v); - -Index: xen-4.1.3/xen/include/xen/time.h -=================================================================== ---- xen-4.1.3.orig/xen/include/xen/time.h 2012-08-09 22:08:10.000000000 +0200 -+++ xen-4.1.3/xen/include/xen/time.h 2012-12-05 16:40:19.504428539 +0100 -@@ -53,6 +53,8 @@ struct tm gmtime(unsigned long t); - #define MILLISECS(_ms) ((s_time_t)((_ms) * 1000000ULL)) - #define MICROSECS(_us) ((s_time_t)((_us) * 1000ULL)) - #define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1)) -+/* Chosen so (NOW() + delta) wont overflow without an uptime of 200 years */ -+#define STIME_DELTA_MAX ((s_time_t)((uint64_t)~0ull>>2)) - - extern void update_vcpu_system_time(struct vcpu *v); - extern void update_domain_wallclock_time(struct domain *d); diff -Nru xen-4.1.3/debian/patches/xsa21.patch xen-4.1.5/debian/patches/xsa21.patch --- xen-4.1.3/debian/patches/xsa21.patch 2012-12-05 16:40:26.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa21.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,34 +0,0 @@ -x86/physdev: Range check pirq parameter from guests - -Otherwise Xen will read beyond either end of the struct -domain.arch.pirq_emuirq array, usually resulting in a fatal page fault. 
- -This vulnerability was introduced by c/s 23241:d21100f1d00e, which adds -a call to domain_pirq_to_emuirq() which uses the guest provided pirq -value before range checking it, and was fixed by c/s 23573:584c2e5e03d9 -which changed the behaviour of the domain_pirq_to_emuirq() macro to use -radix trees instead of a flat array. - -This is XSA-21 / CVE-2012-4536. - -Signed-off-by: Andrew Cooper -Acked-by: Jan Beulich -Acked-by: Ian Campbell -Origin: http://lists.xen.org/archives/html/xen-announce/2012-11/msg00003.html -Signed-off-by: Stefan Bader - -Index: xen-4.1.3/xen/arch/x86/physdev.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/physdev.c 2012-12-05 16:39:37.000000000 +0100 -+++ xen-4.1.3/xen/arch/x86/physdev.c 2012-12-05 16:40:24.668453786 +0100 -@@ -237,6 +237,10 @@ static int physdev_unmap_pirq(struct phy - if ( ret ) - return ret; - -+ ret = -EINVAL; -+ if ( unmap->pirq < 0 || unmap->pirq >= d->nr_pirqs ) -+ goto free_domain; -+ - if ( is_hvm_domain(d) ) - { - spin_lock(&d->event_lock); diff -Nru xen-4.1.3/debian/patches/xsa22-4.1.patch xen-4.1.5/debian/patches/xsa22-4.1.patch --- xen-4.1.3/debian/patches/xsa22-4.1.patch 2012-12-05 16:40:29.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa22-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,43 +0,0 @@ -x86/physmap: Prevent incorrect updates of m2p mappings - -In certain conditions, such as low memory, set_p2m_entry() can fail. -Currently, the p2m and m2p tables will get out of sync because we still -update the m2p table after the p2m update has failed. - -If that happens, subsequent guest-invoked memory operations can cause -BUG()s and ASSERT()s to kill Xen. - -This is fixed by only updating the m2p table iff the p2m was -successfully updated. - -This is a security problem, XSA-22 / CVE-2012-4537. - -Signed-off-by: Andrew Cooper -Acked-by: Ian Campbell -Acked-by: Ian Jackson -Origin: http://lists.xen.org/archives/html/xen-announce/2012-11/msg00005.html -Signed-off-by: Stefan Bader - -Index: xen-4.1.3/xen/arch/x86/mm/p2m.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/mm/p2m.c 2012-12-05 16:39:37.000000000 +0100 -+++ xen-4.1.3/xen/arch/x86/mm/p2m.c 2012-12-05 16:40:28.324471658 +0100 -@@ -2560,7 +2560,10 @@ guest_physmap_add_entry(struct p2m_domai - if ( mfn_valid(_mfn(mfn)) ) - { - if ( !set_p2m_entry(p2m, gfn, _mfn(mfn), page_order, t, p2m->default_access) ) -+ { - rc = -EINVAL; -+ goto out; /* Failed to update p2m, bail without updating m2p. */ -+ } - if ( !p2m_is_grant(t) ) - { - for ( i = 0; i < (1UL << page_order); i++ ) -@@ -2581,6 +2584,7 @@ guest_physmap_add_entry(struct p2m_domai - } - } - -+out: - audit_p2m(p2m, 1); - p2m_unlock(p2m); - diff -Nru xen-4.1.3/debian/patches/xsa23-4.0-4.1.patch xen-4.1.5/debian/patches/xsa23-4.0-4.1.patch --- xen-4.1.3/debian/patches/xsa23-4.0-4.1.patch 2012-12-05 16:40:32.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa23-4.0-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,35 +0,0 @@ -xen/mm/shadow: check toplevel pagetables are present before unhooking them. - -If the guest has not fully populated its top-level PAE entries when it calls -HVMOP_pagetable_dying, the shadow code could try to unhook entries from -MFN 0. Add a check to avoid that case. - -This issue was introduced by c/s 21239:b9d2db109cf5. - -This is a security problem, XSA-23 / CVE-2012-4538. 
- -Signed-off-by: Tim Deegan -Tested-by: Andrew Cooper -Acked-by: Ian Campbell -Origin: http://lists.xen.org/archives/html/xen-announce/2012-11/msg00004.html -Signed-off-by: Stefan Bader - -Index: xen-4.1.3/xen/arch/x86/mm/shadow/multi.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/mm/shadow/multi.c 2012-08-09 22:08:08.000000000 +0200 -+++ xen-4.1.3/xen/arch/x86/mm/shadow/multi.c 2012-12-05 16:40:31.056485017 +0100 -@@ -4737,8 +4737,12 @@ static void sh_pagetable_dying(struct vc - } - for ( i = 0; i < 4; i++ ) - { -- if ( fast_path ) -- smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[i])); -+ if ( fast_path ) { -+ if ( pagetable_is_null(v->arch.shadow_table[i]) ) -+ smfn = _mfn(INVALID_MFN); -+ else -+ smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[i])); -+ } - else - { - /* retrieving the l2s */ diff -Nru xen-4.1.3/debian/patches/xsa24.patch xen-4.1.5/debian/patches/xsa24.patch --- xen-4.1.3/debian/patches/xsa24.patch 2012-12-05 16:40:35.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa24.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,29 +0,0 @@ -compat/gnttab: Prevent infinite loop in compat code - -c/s 20281:95ea2052b41b, which introduces Grant Table version 2 -hypercalls introduces a vulnerability whereby the compat hypercall -handler can fall into an infinite loop. - -If the watchdog is enabled, Xen will die after the timeout. - -This is a security problem, XSA-24 / CVE-2012-4539. - -Signed-off-by: Andrew Cooper -Acked-by: Jan Beulich -Acked-by: Ian Jackson -Origin: http://lists.xen.org/archives/html/xen-announce/2012-11/msg00002.html -Signed-off-by: Stefan Bader - -Index: xen-4.1.3/xen/common/compat/grant_table.c -=================================================================== ---- xen-4.1.3.orig/xen/common/compat/grant_table.c 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/common/compat/grant_table.c 2012-12-05 16:40:33.988499342 +0100 -@@ -310,6 +310,8 @@ int compat_grant_table_op(unsigned int c - #undef XLAT_gnttab_get_status_frames_HNDL_frame_list - if ( unlikely(__copy_to_guest(cmp_uop, &cmp.get_status, 1)) ) - rc = -EFAULT; -+ else -+ i = 1; - } - break; - } diff -Nru xen-4.1.3/debian/patches/xsa25-4.1.patch xen-4.1.5/debian/patches/xsa25-4.1.patch --- xen-4.1.3/debian/patches/xsa25-4.1.patch 2012-12-05 16:40:38.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa25-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,463 +0,0 @@ -libxc: builder: limit maximum size of kernel/ramdisk. - -Allowing user supplied kernels of arbitrary sizes, especially during -decompression, can swallow up dom0 memory leading to either virtual -address space exhaustion in the builder process or allocation -failures/OOM killing of both toolstack and unrelated processes. - -We disable these checks when building in a stub domain for pvgrub -since this uses the guest's own memory and is isolated. - -Decompression of gzip compressed kernels and ramdisks has been safe -since 14954:58205257517d (Xen 3.1.0 onwards). - -This is XSA-25 / CVE-2012-4544. - -Also make explicit checks for buffer overflows in various -decompression routines. These were already ruled out due to other -properties of the code but check them as a belt-and-braces measure. 
- -Signed-off-by: Ian Campbell -Acked-by: Ian Jackson -[ Includes 25589:60f09d1ab1fe for CVE-2012-2625 ] -Origin: http://lists.xen.org/archives/html/xen-announce/2012-11/msg00006.html -Signed-off-by: Stefan Bader - -Index: xen-4.1.3/stubdom/grub/kexec.c -=================================================================== ---- xen-4.1.3.orig/stubdom/grub/kexec.c 2012-08-09 22:08:05.000000000 +0200 -+++ xen-4.1.3/stubdom/grub/kexec.c 2012-12-05 16:40:37.076514449 +0100 -@@ -137,6 +137,10 @@ void kexec(void *kernel, long kernel_siz - dom = xc_dom_allocate(xc_handle, cmdline, features); - dom->allocate = kexec_allocate; - -+ /* We are using guest owned memory, therefore no limits. */ -+ xc_dom_kernel_max_size(dom, 0); -+ xc_dom_ramdisk_max_size(dom, 0); -+ - dom->kernel_blob = kernel; - dom->kernel_size = kernel_size; - -Index: xen-4.1.3/tools/libxc/xc_dom.h -=================================================================== ---- xen-4.1.3.orig/tools/libxc/xc_dom.h 2012-08-09 22:08:06.000000000 +0200 -+++ xen-4.1.3/tools/libxc/xc_dom.h 2012-12-05 16:40:37.076514449 +0100 -@@ -52,6 +52,9 @@ struct xc_dom_image { - void *ramdisk_blob; - size_t ramdisk_size; - -+ size_t max_kernel_size; -+ size_t max_ramdisk_size; -+ - /* arguments and parameters */ - char *cmdline; - uint32_t f_requested[XENFEAT_NR_SUBMAPS]; -@@ -175,6 +178,23 @@ void xc_dom_release_phys(struct xc_dom_i - void xc_dom_release(struct xc_dom_image *dom); - int xc_dom_mem_init(struct xc_dom_image *dom, unsigned int mem_mb); - -+/* Set this larger if you have enormous ramdisks/kernels. Note that -+ * you should trust all kernels not to be maliciously large (e.g. to -+ * exhaust all dom0 memory) if you do this (see CVE-2012-4544 / -+ * XSA-25). You can also set the default independently for -+ * ramdisks/kernels in xc_dom_allocate() or call -+ * xc_dom_{kernel,ramdisk}_max_size. 
-+ */ -+#ifndef XC_DOM_DECOMPRESS_MAX -+#define XC_DOM_DECOMPRESS_MAX (1024*1024*1024) /* 1GB */ -+#endif -+ -+int xc_dom_kernel_check_size(struct xc_dom_image *dom, size_t sz); -+int xc_dom_kernel_max_size(struct xc_dom_image *dom, size_t sz); -+ -+int xc_dom_ramdisk_check_size(struct xc_dom_image *dom, size_t sz); -+int xc_dom_ramdisk_max_size(struct xc_dom_image *dom, size_t sz); -+ - size_t xc_dom_check_gzip(xc_interface *xch, - void *blob, size_t ziplen); - int xc_dom_do_gunzip(xc_interface *xch, -@@ -224,7 +244,8 @@ void xc_dom_log_memory_footprint(struct - void *xc_dom_malloc(struct xc_dom_image *dom, size_t size); - void *xc_dom_malloc_page_aligned(struct xc_dom_image *dom, size_t size); - void *xc_dom_malloc_filemap(struct xc_dom_image *dom, -- const char *filename, size_t * size); -+ const char *filename, size_t * size, -+ const size_t max_size); - char *xc_dom_strdup(struct xc_dom_image *dom, const char *str); - - /* --- alloc memory pool ------------------------------------------- */ -Index: xen-4.1.3/tools/libxc/xc_dom_bzimageloader.c -=================================================================== ---- xen-4.1.3.orig/tools/libxc/xc_dom_bzimageloader.c 2012-08-09 22:08:06.000000000 +0200 -+++ xen-4.1.3/tools/libxc/xc_dom_bzimageloader.c 2012-12-05 16:40:37.076514449 +0100 -@@ -47,13 +47,19 @@ static int xc_try_bzip2_decode( - char *out_buf; - char *tmp_buf; - int retval = -1; -- int outsize; -+ unsigned int outsize; - uint64_t total; - - stream.bzalloc = NULL; - stream.bzfree = NULL; - stream.opaque = NULL; - -+ if ( dom->kernel_size == 0) -+ { -+ DOMPRINTF("BZIP2: Input is 0 size"); -+ return -1; -+ } -+ - ret = BZ2_bzDecompressInit(&stream, 0, 0); - if ( ret != BZ_OK ) - { -@@ -66,6 +72,17 @@ static int xc_try_bzip2_decode( - * the input buffer to start, and we'll realloc as needed. - */ - outsize = dom->kernel_size; -+ -+ /* -+ * stream.avail_in and outsize are unsigned int, while kernel_size -+ * is a size_t. Check we aren't overflowing. 
-+ */ -+ if ( outsize != dom->kernel_size ) -+ { -+ DOMPRINTF("BZIP2: Input too large"); -+ goto bzip2_cleanup; -+ } -+ - out_buf = malloc(outsize); - if ( out_buf == NULL ) - { -@@ -98,13 +115,20 @@ static int xc_try_bzip2_decode( - if ( stream.avail_out == 0 ) - { - /* Protect against output buffer overflow */ -- if ( outsize > INT_MAX / 2 ) -+ if ( outsize > UINT_MAX / 2 ) - { - DOMPRINTF("BZIP2: output buffer overflow"); - free(out_buf); - goto bzip2_cleanup; - } - -+ if ( xc_dom_kernel_check_size(dom, outsize * 2) ) -+ { -+ DOMPRINTF("BZIP2: output too large"); -+ free(out_buf); -+ goto bzip2_cleanup; -+ } -+ - tmp_buf = realloc(out_buf, outsize * 2); - if ( tmp_buf == NULL ) - { -@@ -172,9 +196,15 @@ static int xc_try_lzma_decode( - unsigned char *out_buf; - unsigned char *tmp_buf; - int retval = -1; -- int outsize; -+ size_t outsize; - const char *msg; - -+ if ( dom->kernel_size == 0) -+ { -+ DOMPRINTF("LZMA: Input is 0 size"); -+ return -1; -+ } -+ - ret = lzma_alone_decoder(&stream, 128*1024*1024); - if ( ret != LZMA_OK ) - { -@@ -251,13 +281,20 @@ static int xc_try_lzma_decode( - if ( stream.avail_out == 0 ) - { - /* Protect against output buffer overflow */ -- if ( outsize > INT_MAX / 2 ) -+ if ( outsize > SIZE_MAX / 2 ) - { - DOMPRINTF("LZMA: output buffer overflow"); - free(out_buf); - goto lzma_cleanup; - } - -+ if ( xc_dom_kernel_check_size(dom, outsize * 2) ) -+ { -+ DOMPRINTF("LZMA: output too large"); -+ free(out_buf); -+ goto lzma_cleanup; -+ } -+ - tmp_buf = realloc(out_buf, outsize * 2); - if ( tmp_buf == NULL ) - { -@@ -327,6 +364,12 @@ static int xc_try_lzo1x_decode( - 0x89, 0x4c, 0x5a, 0x4f, 0x00, 0x0d, 0x0a, 0x1a, 0x0a - }; - -+ /* -+ * lzo_uint should match size_t. Check that this is the case to be -+ * sure we won't overflow various lzo_uint fields. 
-+ */ -+ XC_BUILD_BUG_ON(sizeof(lzo_uint) != sizeof(size_t)); -+ - ret = lzo_init(); - if ( ret != LZO_E_OK ) - { -@@ -406,6 +449,14 @@ static int xc_try_lzo1x_decode( - if ( src_len <= 0 || src_len > dst_len || src_len > left ) - break; - -+ msg = "Output buffer overflow"; -+ if ( *size > SIZE_MAX - dst_len ) -+ break; -+ -+ msg = "Decompressed image too large"; -+ if ( xc_dom_kernel_check_size(dom, *size + dst_len) ) -+ break; -+ - msg = "Failed to (re)alloc memory"; - tmp_buf = realloc(out_buf, *size + dst_len); - if ( tmp_buf == NULL ) -Index: xen-4.1.3/tools/libxc/xc_dom_core.c -=================================================================== ---- xen-4.1.3.orig/tools/libxc/xc_dom_core.c 2012-08-09 22:08:06.000000000 +0200 -+++ xen-4.1.3/tools/libxc/xc_dom_core.c 2012-12-05 16:40:37.076514449 +0100 -@@ -159,7 +159,8 @@ void *xc_dom_malloc_page_aligned(struct - } - - void *xc_dom_malloc_filemap(struct xc_dom_image *dom, -- const char *filename, size_t * size) -+ const char *filename, size_t * size, -+ const size_t max_size) - { - struct xc_dom_mem *block = NULL; - int fd = -1; -@@ -171,6 +172,13 @@ void *xc_dom_malloc_filemap(struct xc_do - lseek(fd, 0, SEEK_SET); - *size = lseek(fd, 0, SEEK_END); - -+ if ( max_size && *size > max_size ) -+ { -+ xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, -+ "tried to map file which is too large"); -+ goto err; -+ } -+ - block = malloc(sizeof(*block)); - if ( block == NULL ) - goto err; -@@ -222,6 +230,40 @@ char *xc_dom_strdup(struct xc_dom_image - } - - /* ------------------------------------------------------------------------ */ -+/* decompression buffer sizing */ -+int xc_dom_kernel_check_size(struct xc_dom_image *dom, size_t sz) -+{ -+ /* No limit */ -+ if ( !dom->max_kernel_size ) -+ return 0; -+ -+ if ( sz > dom->max_kernel_size ) -+ { -+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, -+ "kernel image too large"); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+int xc_dom_ramdisk_check_size(struct xc_dom_image *dom, size_t sz) -+{ -+ /* No limit */ -+ if ( !dom->max_ramdisk_size ) -+ return 0; -+ -+ if ( sz > dom->max_ramdisk_size ) -+ { -+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, -+ "ramdisk image too large"); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+/* ------------------------------------------------------------------------ */ - /* read files, copy memory blocks, with transparent gunzip */ - - size_t xc_dom_check_gzip(xc_interface *xch, void *blob, size_t ziplen) -@@ -235,7 +277,7 @@ size_t xc_dom_check_gzip(xc_interface *x - - gzlen = blob + ziplen - 4; - unziplen = gzlen[3] << 24 | gzlen[2] << 16 | gzlen[1] << 8 | gzlen[0]; -- if ( (unziplen < 0) || (unziplen > (1024*1024*1024)) ) /* 1GB limit */ -+ if ( (unziplen < 0) || (unziplen > XC_DOM_DECOMPRESS_MAX) ) - { - xc_dom_printf - (xch, -@@ -288,6 +330,9 @@ int xc_dom_try_gunzip(struct xc_dom_imag - if ( unziplen == 0 ) - return 0; - -+ if ( xc_dom_kernel_check_size(dom, unziplen) ) -+ return 0; -+ - unzip = xc_dom_malloc(dom, unziplen); - if ( unzip == NULL ) - return -1; -@@ -588,6 +633,9 @@ struct xc_dom_image *xc_dom_allocate(xc_ - memset(dom, 0, sizeof(*dom)); - dom->xch = xch; - -+ dom->max_kernel_size = XC_DOM_DECOMPRESS_MAX; -+ dom->max_ramdisk_size = XC_DOM_DECOMPRESS_MAX; -+ - if ( cmdline ) - dom->cmdline = xc_dom_strdup(dom, cmdline); - if ( features ) -@@ -608,10 +656,25 @@ struct xc_dom_image *xc_dom_allocate(xc_ - return NULL; - } - -+int xc_dom_kernel_max_size(struct xc_dom_image *dom, size_t sz) -+{ -+ DOMPRINTF("%s: kernel_max_size=%zx", __FUNCTION__, sz); -+ dom->max_kernel_size 
= sz; -+ return 0; -+} -+ -+int xc_dom_ramdisk_max_size(struct xc_dom_image *dom, size_t sz) -+{ -+ DOMPRINTF("%s: ramdisk_max_size=%zx", __FUNCTION__, sz); -+ dom->max_ramdisk_size = sz; -+ return 0; -+} -+ - int xc_dom_kernel_file(struct xc_dom_image *dom, const char *filename) - { - DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename); -- dom->kernel_blob = xc_dom_malloc_filemap(dom, filename, &dom->kernel_size); -+ dom->kernel_blob = xc_dom_malloc_filemap(dom, filename, &dom->kernel_size, -+ dom->max_kernel_size); - if ( dom->kernel_blob == NULL ) - return -1; - return xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size); -@@ -621,7 +684,9 @@ int xc_dom_ramdisk_file(struct xc_dom_im - { - DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename); - dom->ramdisk_blob = -- xc_dom_malloc_filemap(dom, filename, &dom->ramdisk_size); -+ xc_dom_malloc_filemap(dom, filename, &dom->ramdisk_size, -+ dom->max_ramdisk_size); -+ - if ( dom->ramdisk_blob == NULL ) - return -1; - // return xc_dom_try_gunzip(dom, &dom->ramdisk_blob, &dom->ramdisk_size); -@@ -781,7 +846,11 @@ int xc_dom_build_image(struct xc_dom_ima - void *ramdiskmap; - - unziplen = xc_dom_check_gzip(dom->xch, dom->ramdisk_blob, dom->ramdisk_size); -+ if ( xc_dom_ramdisk_check_size(dom, unziplen) != 0 ) -+ unziplen = 0; -+ - ramdisklen = unziplen ? unziplen : dom->ramdisk_size; -+ - if ( xc_dom_alloc_segment(dom, &dom->ramdisk_seg, "ramdisk", 0, - ramdisklen) != 0 ) - goto err; -Index: xen-4.1.3/tools/pygrub/src/pygrub -=================================================================== ---- xen-4.1.3.orig/tools/pygrub/src/pygrub 2012-12-05 16:39:37.000000000 +0100 -+++ xen-4.1.3/tools/pygrub/src/pygrub 2012-12-05 16:40:37.080514476 +0100 -@@ -29,6 +29,7 @@ import grub.LiloConf - import grub.ExtLinuxConf - - PYGRUB_VER = 0.6 -+FS_READ_MAX = 1024 * 1024 - - def enable_cursor(ison): - if ison: -@@ -422,7 +423,8 @@ class Grub: - if self.__dict__.get('cf', None) is None: - raise RuntimeError, "couldn't find bootloader config file in the image provided." 
- f = fs.open_file(self.cf.filename) -- buf = f.read() -+ # limit read size to avoid pathological cases -+ buf = f.read(FS_READ_MAX) - del f - self.cf.parse(buf) - -@@ -626,6 +628,37 @@ if __name__ == "__main__": - def usage(): - print >> sys.stderr, "Usage: %s [-q|--quiet] [-i|--interactive] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] [--output-format=sxp|simple|simple0] " %(sys.argv[0],) - -+ def copy_from_image(fs, file_to_read, file_type, output_directory, -+ not_really): -+ if not_really: -+ if fs.file_exists(file_to_read): -+ return "<%s:%s>" % (file_type, file_to_read) -+ else: -+ sys.exit("The requested %s file does not exist" % file_type) -+ try: -+ datafile = fs.open_file(file_to_read) -+ except Exception, e: -+ print >>sys.stderr, e -+ sys.exit("Error opening %s in guest" % file_to_read) -+ (tfd, ret) = tempfile.mkstemp(prefix="boot_"+file_type+".", -+ dir=output_directory) -+ dataoff = 0 -+ while True: -+ data = datafile.read(FS_READ_MAX, dataoff) -+ if len(data) == 0: -+ os.close(tfd) -+ del datafile -+ return ret -+ try: -+ os.write(tfd, data) -+ except Exception, e: -+ print >>sys.stderr, e -+ os.close(tfd) -+ os.unlink(ret) -+ del datafile -+ sys.exit("Error writing temporary copy of "+file_type) -+ dataoff += len(data) -+ - try: - opts, args = getopt.gnu_getopt(sys.argv[1:], 'qinh::', - ["quiet", "interactive", "not-really", "help", -@@ -739,24 +772,18 @@ if __name__ == "__main__": - if not fs: - raise RuntimeError, "Unable to find partition containing kernel" - -- if not_really: -- bootcfg["kernel"] = "" % chosencfg["kernel"] -- else: -- data = fs.open_file(chosencfg["kernel"]).read() -- (tfd, bootcfg["kernel"]) = tempfile.mkstemp(prefix="boot_kernel.", -- dir=output_directory) -- os.write(tfd, data) -- os.close(tfd) -+ bootcfg["kernel"] = copy_from_image(fs, chosencfg["kernel"], "kernel", -+ output_directory, not_really) - - if chosencfg["ramdisk"]: -- if not_really: -- bootcfg["ramdisk"] = "" % chosencfg["ramdisk"] -- else: -- data = fs.open_file(chosencfg["ramdisk"],).read() -- (tfd, bootcfg["ramdisk"]) = tempfile.mkstemp( -- prefix="boot_ramdisk.", dir=output_directory) -- os.write(tfd, data) -- os.close(tfd) -+ try: -+ bootcfg["ramdisk"] = copy_from_image(fs, chosencfg["ramdisk"], -+ "ramdisk", output_directory, -+ not_really) -+ except: -+ if not not_really: -+ os.unlink(bootcfg["kernel"]) -+ raise - else: - initrd = None - diff -Nru xen-4.1.3/debian/patches/xsa26-4.1.patch xen-4.1.5/debian/patches/xsa26-4.1.patch --- xen-4.1.3/debian/patches/xsa26-4.1.patch 2012-12-05 17:47:38.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa26-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,109 +0,0 @@ -gnttab: fix releasing of memory upon switches between versions - -gnttab_unpopulate_status_frames() incompletely freed the pages -previously used as status frame in that they did not get removed from -the domain's xenpage_list, thus causing subsequent list corruption -when those pages did get allocated again for the same or another purpose. - -Similarly, grant_table_create() and gnttab_grow_table() both improperly -clean up in the event of an error - pages already shared with the guest -can't be freed by just passing them to free_xenheap_page(). Fix this by -sharing the pages only after all allocations succeeded. - -This is CVE-2012-5510 / XSA-26. 
- -Signed-off-by: Jan Beulich -Acked-by: Ian Campbell -Origin: http://lists.xen.org/archives/html/xen-devel/2012-12/msg00077.html -Signed-off-by: Stefan Bader - -Index: xen-4.1.3/xen/common/grant_table.c -=================================================================== ---- xen-4.1.3.orig/xen/common/grant_table.c 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/common/grant_table.c 2012-12-05 17:47:33.996152169 +0100 -@@ -1102,12 +1102,13 @@ fault: - } - - static int --gnttab_populate_status_frames(struct domain *d, struct grant_table *gt) -+gnttab_populate_status_frames(struct domain *d, struct grant_table *gt, -+ unsigned int req_nr_frames) - { - unsigned i; - unsigned req_status_frames; - -- req_status_frames = grant_to_status_frames(gt->nr_grant_frames); -+ req_status_frames = grant_to_status_frames(req_nr_frames); - for ( i = nr_status_frames(gt); i < req_status_frames; i++ ) - { - if ( (gt->status[i] = alloc_xenheap_page()) == NULL ) -@@ -1138,7 +1139,12 @@ gnttab_unpopulate_status_frames(struct d - - for ( i = 0; i < nr_status_frames(gt); i++ ) - { -- page_set_owner(virt_to_page(gt->status[i]), dom_xen); -+ struct page_info *pg = virt_to_page(gt->status[i]); -+ -+ BUG_ON(page_get_owner(pg) != d); -+ if ( test_and_clear_bit(_PGC_allocated, &pg->count_info) ) -+ put_page(pg); -+ BUG_ON(pg->count_info & ~PGC_xen_heap); - free_xenheap_page(gt->status[i]); - gt->status[i] = NULL; - } -@@ -1176,19 +1182,18 @@ gnttab_grow_table(struct domain *d, unsi - clear_page(gt->shared_raw[i]); - } - -- /* Share the new shared frames with the recipient domain */ -- for ( i = nr_grant_frames(gt); i < req_nr_frames; i++ ) -- gnttab_create_shared_page(d, gt, i); -- -- gt->nr_grant_frames = req_nr_frames; -- - /* Status pages - version 2 */ - if (gt->gt_version > 1) - { -- if ( gnttab_populate_status_frames(d, gt) ) -+ if ( gnttab_populate_status_frames(d, gt, req_nr_frames) ) - goto shared_alloc_failed; - } - -+ /* Share the new shared frames with the recipient domain */ -+ for ( i = nr_grant_frames(gt); i < req_nr_frames; i++ ) -+ gnttab_create_shared_page(d, gt, i); -+ gt->nr_grant_frames = req_nr_frames; -+ - return 1; - - shared_alloc_failed: -@@ -2129,7 +2134,7 @@ gnttab_set_version(XEN_GUEST_HANDLE(gntt - - if ( op.version == 2 && gt->gt_version < 2 ) - { -- res = gnttab_populate_status_frames(d, gt); -+ res = gnttab_populate_status_frames(d, gt, nr_grant_frames(gt)); - if ( res < 0) - goto out_unlock; - } -@@ -2450,9 +2455,6 @@ grant_table_create( - clear_page(t->shared_raw[i]); - } - -- for ( i = 0; i < INITIAL_NR_GRANT_FRAMES; i++ ) -- gnttab_create_shared_page(d, t, i); -- - /* Status pages for grant table - for version 2 */ - t->status = xmalloc_array(grant_status_t *, - grant_to_status_frames(max_nr_grant_frames)); -@@ -2460,6 +2462,10 @@ grant_table_create( - goto no_mem_4; - memset(t->status, 0, - grant_to_status_frames(max_nr_grant_frames) * sizeof(t->status[0])); -+ -+ for ( i = 0; i < INITIAL_NR_GRANT_FRAMES; i++ ) -+ gnttab_create_shared_page(d, t, i); -+ - t->nr_status_frames = 0; - - /* Okay, install the structure. */ diff -Nru xen-4.1.3/debian/patches/xsa27-4.1.patch xen-4.1.5/debian/patches/xsa27-4.1.patch --- xen-4.1.3/debian/patches/xsa27-4.1.patch 2012-12-05 17:47:43.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa27-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,173 +0,0 @@ -hvm: Limit the size of large HVM op batches - -Doing large p2m updates for HVMOP_track_dirty_vram without preemption -ties up the physical processor. 
Integrating preemption into the p2m -updates is hard so simply limit to 1GB which is sufficient for a 15000 -* 15000 * 32bpp framebuffer. - -For HVMOP_modified_memory and HVMOP_set_mem_type preemptible add the -necessary machinery to handle preemption. - -This is CVE-2012-5511 / XSA-27. - -Signed-off-by: Tim Deegan -Signed-off-by: Ian Campbell -Acked-by: Ian Jackson - -x86/paging: Don't allocate user-controlled amounts of stack memory. - -This is XSA-27 / CVE-2012-5511. - -Signed-off-by: Tim Deegan -Acked-by: Jan Beulich -v2: Provide definition of GB to fix x86-32 compile. - -Signed-off-by: Jan Beulich -Acked-by: Ian Jackson -Origin: http://lists.xen.org/archives/html/xen-devel/2012-12/msg00093.html -Signed-off-by: Stefan Bader - - -Index: xen-4.1.3/xen/arch/x86/hvm/hvm.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/hvm/hvm.c 2012-08-09 22:08:08.000000000 +0200 -+++ xen-4.1.3/xen/arch/x86/hvm/hvm.c 2012-12-05 17:47:40.776185311 +0100 -@@ -3446,6 +3446,9 @@ long do_hvm_op(unsigned long op, XEN_GUE - if ( !is_hvm_domain(d) ) - goto param_fail2; - -+ if ( a.nr > GB(1) >> PAGE_SHIFT ) -+ goto param_fail2; -+ - rc = xsm_hvm_param(d, op); - if ( rc ) - goto param_fail2; -@@ -3473,7 +3476,6 @@ long do_hvm_op(unsigned long op, XEN_GUE - struct xen_hvm_modified_memory a; - struct domain *d; - struct p2m_domain *p2m; -- unsigned long pfn; - - if ( copy_from_guest(&a, arg, 1) ) - return -EFAULT; -@@ -3501,8 +3503,9 @@ long do_hvm_op(unsigned long op, XEN_GUE - goto param_fail3; - - p2m = p2m_get_hostp2m(d); -- for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ ) -+ while ( a.nr > 0 ) - { -+ unsigned long pfn = a.first_pfn; - p2m_type_t t; - mfn_t mfn = gfn_to_mfn(p2m, pfn, &t); - if ( p2m_is_paging(t) ) -@@ -3523,6 +3526,19 @@ long do_hvm_op(unsigned long op, XEN_GUE - /* don't take a long time and don't die either */ - sh_remove_shadows(d->vcpu[0], mfn, 1, 0); - } -+ -+ a.first_pfn++; -+ a.nr--; -+ -+ /* Check for continuation if it's not the last interation */ -+ if ( a.nr > 0 && hypercall_preempt_check() ) -+ { -+ if ( copy_to_guest(arg, &a, 1) ) -+ rc = -EFAULT; -+ else -+ rc = -EAGAIN; -+ break; -+ } - } - - param_fail3: -@@ -3566,7 +3582,6 @@ long do_hvm_op(unsigned long op, XEN_GUE - struct xen_hvm_set_mem_type a; - struct domain *d; - struct p2m_domain *p2m; -- unsigned long pfn; - - /* Interface types to internal p2m types */ - p2m_type_t memtype[] = { -@@ -3596,8 +3611,9 @@ long do_hvm_op(unsigned long op, XEN_GUE - goto param_fail4; - - p2m = p2m_get_hostp2m(d); -- for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ ) -+ while ( a.nr > 0 ) - { -+ unsigned long pfn = a.first_pfn; - p2m_type_t t; - p2m_type_t nt; - mfn_t mfn; -@@ -3633,6 +3649,19 @@ long do_hvm_op(unsigned long op, XEN_GUE - goto param_fail4; - } - } -+ -+ a.first_pfn++; -+ a.nr--; -+ -+ /* Check for continuation if it's not the last interation */ -+ if ( a.nr > 0 && hypercall_preempt_check() ) -+ { -+ if ( copy_to_guest(arg, &a, 1) ) -+ rc = -EFAULT; -+ else -+ rc = -EAGAIN; -+ goto param_fail4; -+ } - } - - rc = 0; -Index: xen-4.1.3/xen/arch/x86/mm/paging.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/mm/paging.c 2012-08-09 22:08:08.000000000 +0200 -+++ xen-4.1.3/xen/arch/x86/mm/paging.c 2012-12-05 17:47:40.776185311 +0100 -@@ -529,13 +529,18 @@ int paging_log_dirty_range(struct domain - - if ( !d->arch.paging.log_dirty.fault_count && - !d->arch.paging.log_dirty.dirty_count ) { -- int size = (nr + 
BITS_PER_LONG - 1) / BITS_PER_LONG; -- unsigned long zeroes[size]; -- memset(zeroes, 0x00, size * BYTES_PER_LONG); -+ static uint8_t zeroes[PAGE_SIZE]; -+ int off, size; -+ -+ size = ((nr + BITS_PER_LONG - 1) / BITS_PER_LONG) * sizeof (long); - rv = 0; -- if ( copy_to_guest_offset(dirty_bitmap, 0, (uint8_t *) zeroes, -- size * BYTES_PER_LONG) != 0 ) -- rv = -EFAULT; -+ for ( off = 0; !rv && off < size; off += sizeof zeroes ) -+ { -+ int todo = min(size - off, (int) PAGE_SIZE); -+ if ( copy_to_guest_offset(dirty_bitmap, off, zeroes, todo) ) -+ rv = -EFAULT; -+ off += todo; -+ } - goto out; - } - d->arch.paging.log_dirty.fault_count = 0; -Index: xen-4.1.3/xen/include/asm-x86/config.h -=================================================================== ---- xen-4.1.3.orig/xen/include/asm-x86/config.h 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/include/asm-x86/config.h 2012-12-05 17:47:40.780185332 +0100 -@@ -108,6 +108,9 @@ extern unsigned int trampoline_xen_phys_ - extern unsigned char trampoline_cpu_started; - extern char wakeup_start[]; - extern unsigned int video_mode, video_flags; -+ -+#define GB(_gb) (_gb ## UL << 30) -+ - #endif - - #define asmlinkage -@@ -123,7 +126,6 @@ extern unsigned int video_mode, video_fl - #define PML4_ADDR(_slot) \ - ((((_slot ## UL) >> 8) * 0xffff000000000000UL) | \ - (_slot ## UL << PML4_ENTRY_BITS)) --#define GB(_gb) (_gb ## UL << 30) - #else - #define PML4_ENTRY_BYTES (1 << PML4_ENTRY_BITS) - #define PML4_ADDR(_slot) \ diff -Nru xen-4.1.3/debian/patches/xsa28-4.1.patch xen-4.1.5/debian/patches/xsa28-4.1.patch --- xen-4.1.3/debian/patches/xsa28-4.1.patch 2012-12-05 17:47:47.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa28-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,38 +0,0 @@ -x86/HVM: range check xen_hvm_set_mem_access.hvmmem_access before use - -Otherwise an out of bounds array access can happen if changing the -default access is being requested, which - if it doesn't crash Xen - -would subsequently allow reading arbitrary memory through -HVMOP_get_mem_access (again, unless that operation crashes Xen). - -This is XSA-28 / CVE-2012-5512. 
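
The fix pattern described here is simply to validate a guest-supplied selector against the table size before any use of it as an index, rather than after other checks have run. A small self-contained C sketch of that ordering (the table contents are placeholders, not Xen's memaccess[] values):

    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    /* Toy translation table standing in for the p2m access-type
     * array; the values are placeholders. */
    static const int memaccess[] = { 10, 11, 12, 13 };

    static int lookup(unsigned long idx)
    {
        /* Reject the guest-chosen index before any array access. */
        if (idx >= ARRAY_SIZE(memaccess))
            return -1;
        return memaccess[idx];
    }

    int main(void)
    {
        printf("%d\n", lookup(2));    /* in range: prints 12 */
        printf("%d\n", lookup(999));  /* out of range: prints -1 */
        return 0;
    }
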
- -Signed-off-by: Jan Beulich -Acked-by: Tim Deegan -Acked-by: Ian Campbell -Origin: http://lists.xen.org/archives/html/xen-devel/2012-12/msg00080.html -Signed-off-by: Stefan Bader - -Index: xen-4.1.3/xen/arch/x86/hvm/hvm.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/hvm/hvm.c 2012-12-05 17:47:40.776185311 +0100 -+++ xen-4.1.3/xen/arch/x86/hvm/hvm.c 2012-12-05 17:47:45.492208368 +0100 -@@ -3699,7 +3699,7 @@ long do_hvm_op(unsigned long op, XEN_GUE - return rc; - - rc = -EINVAL; -- if ( !is_hvm_domain(d) ) -+ if ( !is_hvm_domain(d) || a.hvmmem_access >= ARRAY_SIZE(memaccess) ) - goto param_fail5; - - p2m = p2m_get_hostp2m(d); -@@ -3719,9 +3719,6 @@ long do_hvm_op(unsigned long op, XEN_GUE - ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) ) - goto param_fail5; - -- if ( a.hvmmem_access >= ARRAY_SIZE(memaccess) ) -- goto param_fail5; -- - for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ ) - { - p2m_type_t t; diff -Nru xen-4.1.3/debian/patches/xsa29-4.1.patch xen-4.1.5/debian/patches/xsa29-4.1.patch --- xen-4.1.3/debian/patches/xsa29-4.1.patch 2012-12-05 17:47:50.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa29-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,51 +0,0 @@ -xen: add missing guest address range checks to XENMEM_exchange handlers - -Ever since its existence (3.0.3 iirc) the handler for this has been -using non address range checking guest memory accessors (i.e. -the ones prefixed with two underscores) without first range -checking the accessed space (via guest_handle_okay()), allowing -a guest to access and overwrite hypervisor memory. - -This is XSA-29 / CVE-2012-5513. - -Signed-off-by: Jan Beulich -Acked-by: Ian Campbell -Acked-by: Ian Jackson -Origin: http://lists.xen.org/archives/html/xen-devel/2012-12/msg00082.html -Signed-off-by: Stefan Bader - -Index: xen-4.1.3/xen/common/compat/memory.c -=================================================================== ---- xen-4.1.3.orig/xen/common/compat/memory.c 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/common/compat/memory.c 2012-12-05 17:47:48.908225074 +0100 -@@ -114,6 +114,12 @@ int compat_memory_op(unsigned int cmd, X - (cmp.xchg.out.nr_extents << cmp.xchg.out.extent_order)) ) - return -EINVAL; - -+ if ( !compat_handle_okay(cmp.xchg.in.extent_start, -+ cmp.xchg.in.nr_extents) || -+ !compat_handle_okay(cmp.xchg.out.extent_start, -+ cmp.xchg.out.nr_extents) ) -+ return -EFAULT; -+ - start_extent = cmp.xchg.nr_exchanged; - end_extent = (COMPAT_ARG_XLAT_SIZE - sizeof(*nat.xchg)) / - (((1U << ABS(order_delta)) + 1) * -Index: xen-4.1.3/xen/common/memory.c -=================================================================== ---- xen-4.1.3.orig/xen/common/memory.c 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/common/memory.c 2012-12-05 17:47:48.908225074 +0100 -@@ -289,6 +289,13 @@ static long memory_exchange(XEN_GUEST_HA - goto fail_early; - } - -+ if ( !guest_handle_okay(exch.in.extent_start, exch.in.nr_extents) || -+ !guest_handle_okay(exch.out.extent_start, exch.out.nr_extents) ) -+ { -+ rc = -EFAULT; -+ goto fail_early; -+ } -+ - /* Only privileged guests can allocate multi-page contiguous extents. 
*/ - if ( !multipage_allocation_permitted(current->domain, - exch.in.extent_order) || diff -Nru xen-4.1.3/debian/patches/xsa30-4.1.patch xen-4.1.5/debian/patches/xsa30-4.1.patch --- xen-4.1.3/debian/patches/xsa30-4.1.patch 2012-12-05 17:47:53.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa30-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,60 +0,0 @@ -xen: fix error handling of guest_physmap_mark_populate_on_demand() - -The only user of the "out" label bypasses a necessary unlock, thus -enabling the caller to lock up Xen. - -Also, the function was never meant to be called by a guest for itself, -so rather than inspecting the code paths in depth for potential other -problems this might cause, and adjusting e.g. the non-guest printk() -in the above error path, just disallow the guest access to it. - -Finally, the printk() (considering its potential of spamming the log, -the more that it's not using XENLOG_GUEST), is being converted to -P2M_DEBUG(), as debugging is what it apparently was added for in the -first place. - -This is XSA-30 / CVE-2012-5514. - -Signed-off-by: Jan Beulich -Acked-by: Ian Campbell -Acked-by: George Dunlap -Acked-by: Ian Jackson -Origin: http://lists.xen.org/archives/html/xen-devel/2012-12/msg00078.html -Signed-off-by: Stefan Bader - -Index: xen-4.1.3/xen/arch/x86/mm/p2m.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/mm/p2m.c 2012-12-05 16:40:28.324471658 +0100 -+++ xen-4.1.3/xen/arch/x86/mm/p2m.c 2012-12-05 17:47:52.052240442 +0100 -@@ -2414,6 +2414,9 @@ guest_physmap_mark_populate_on_demand(st - int pod_count = 0; - int rc = 0; - -+ if ( !IS_PRIV_FOR(current->domain, d) ) -+ return -EPERM; -+ - if ( !paging_mode_translate(d) ) - return -EINVAL; - -@@ -2432,8 +2435,7 @@ guest_physmap_mark_populate_on_demand(st - omfn = gfn_to_mfn_query(p2m, gfn + i, &ot); - if ( p2m_is_ram(ot) ) - { -- printk("%s: gfn_to_mfn returned type %d!\n", -- __func__, ot); -+ P2M_DEBUG("gfn_to_mfn returned type %d!\n", ot); - rc = -EBUSY; - goto out; - } -@@ -2455,10 +2457,10 @@ guest_physmap_mark_populate_on_demand(st - BUG_ON(p2m->pod.entry_count < 0); - } - -+out: - audit_p2m(p2m, 1); - p2m_unlock(p2m); - --out: - return rc; - } - diff -Nru xen-4.1.3/debian/patches/xsa31-4.1.patch xen-4.1.5/debian/patches/xsa31-4.1.patch --- xen-4.1.3/debian/patches/xsa31-4.1.patch 2012-12-05 17:47:28.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa31-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,52 +0,0 @@ -memop: limit guest specified extent order - -Allowing unbounded order values here causes almost unbounded loops -and/or partially incomplete requests, particularly in PoD code. - -The added range checks in populate_physmap(), decrease_reservation(), -and the "in" one in memory_exchange() architecturally all could use -PADDR_BITS - PAGE_SHIFT, and are being artificially constrained to -MAX_ORDER. - -This is XSA-31 / CVE-2012-5515. 
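
The range checks being described bound a guest-controlled shift count before it can drive a near-unbounded loop or overflow a size computation. A sketch of the same idea in plain C — MAX_ORDER here is an assumed placeholder, not Xen's architecture-specific value:

    #include <stdio.h>

    /* Placeholder bound; Xen's MAX_ORDER is configuration-dependent. */
    #define MAX_ORDER 20U

    /* Returns the page count that nr_extents extents of the given
     * order describe, or -1 if the guest-supplied order is oversized
     * or the shift would wrap. */
    static long extent_pages(unsigned long nr_extents,
                             unsigned int extent_order)
    {
        if (extent_order > MAX_ORDER)
            return -1;                      /* bounds the shift below */
        if (nr_extents > (~0UL >> extent_order))
            return -1;                      /* << order would wrap */
        return (long)(nr_extents << extent_order);
    }

    int main(void)
    {
        printf("%ld\n", extent_pages(4, 9));    /* 2048 */
        printf("%ld\n", extent_pages(4, 63));   /* -1: order rejected */
        return 0;
    }
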
- -Signed-off-by: Jan Beulich -Acked-by: Tim Deegan -Acked-by: Ian Jackson -Origin: http://lists.xen.org/archives/html/xen-devel/2012-12/msg00079.html -Signed-off-by: Stefan Bader - -diff --git a/xen/common/memory.c b/xen/common/memory.c -index 4e7c234..9b9fb18 100644 ---- a/xen/common/memory.c -+++ b/xen/common/memory.c -@@ -117,7 +117,8 @@ static void populate_physmap(struct memop_args *a) - - if ( a->memflags & MEMF_populate_on_demand ) - { -- if ( guest_physmap_mark_populate_on_demand(d, gpfn, -+ if ( a->extent_order > MAX_ORDER || -+ guest_physmap_mark_populate_on_demand(d, gpfn, - a->extent_order) < 0 ) - goto out; - } -@@ -216,7 +217,8 @@ static void decrease_reservation(struct memop_args *a) - xen_pfn_t gmfn; - - if ( !guest_handle_subrange_okay(a->extent_list, a->nr_done, -- a->nr_extents-1) ) -+ a->nr_extents-1) || -+ a->extent_order > MAX_ORDER ) - return; - - for ( i = a->nr_done; i < a->nr_extents; i++ ) -@@ -278,6 +280,9 @@ static long memory_exchange(XEN_GUEST_HANDLE(xen_memory_exchange_t) arg) - if ( (exch.nr_exchanged > exch.in.nr_extents) || - /* Input and output domain identifiers match? */ - (exch.in.domid != exch.out.domid) || -+ /* Extent orders are sensible? */ -+ (exch.in.extent_order > MAX_ORDER) || -+ (exch.out.extent_order > MAX_ORDER) || - /* Sizes of input and output lists do not overflow a long? */ - ((~0UL >> exch.in.extent_order) < exch.in.nr_extents) || - ((~0UL >> exch.out.extent_order) < exch.out.nr_extents) || diff -Nru xen-4.1.3/debian/patches/xsa33-4.1.patch xen-4.1.5/debian/patches/xsa33-4.1.patch --- xen-4.1.3/debian/patches/xsa33-4.1.patch 2013-01-08 14:47:33.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa33-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,24 +0,0 @@ -VT-d: fix interrupt remapping source validation for devices behind -legacy bridges - -Using SVT_VERIFY_BUS here doesn't make sense; native Linux also -uses SVT_VERIFY_SID_SQ here instead. - -Signed-off-by: Jan Beulich - -Conflicts: - xen/drivers/passthrough/vtd/intremap.c - -Index: xen-4.1.3/xen/drivers/passthrough/vtd/intremap.c -=================================================================== ---- xen-4.1.3.orig/xen/drivers/passthrough/vtd/intremap.c 2013-01-07 18:39:03.329254309 +0100 -+++ xen-4.1.3/xen/drivers/passthrough/vtd/intremap.c 2013-01-07 18:40:08.581573309 +0100 -@@ -499,7 +499,7 @@ static void set_msi_source_id(struct pci - set_ire_sid(ire, SVT_VERIFY_BUS, SQ_ALL_16, - (bus << 8) | pdev->bus); - else if ( pdev_type(bus, devfn) == DEV_TYPE_LEGACY_PCI_BRIDGE ) -- set_ire_sid(ire, SVT_VERIFY_BUS, SQ_ALL_16, -+ set_ire_sid(ire, SVT_VERIFY_SID_SQ, SQ_ALL_16, - PCI_BDF2(bus, devfn)); - } - break; diff -Nru xen-4.1.3/debian/patches/xsa36-4.1.patch xen-4.1.5/debian/patches/xsa36-4.1.patch --- xen-4.1.3/debian/patches/xsa36-4.1.patch 2013-01-30 12:02:33.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa36-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,385 +0,0 @@ -ACPI: acpi_table_parse() should return handler's error code - -Currently, the error code returned by acpi_table_parse()'s handler -is ignored. This patch will propagate handler's return value to -acpi_table_parse()'s caller. - -AMD,IOMMU: Clean up old entries in remapping tables when creating new -interrupt mapping. - -When changing the affinity of an IRQ associated with a passed -through PCI device, clear previous mapping. - -In addition, because some BIOSes may incorrectly program IVRS -entries for IOAPIC try to check for entry's consistency. 
Specifically, -if conflicting entries are found disable IOMMU if per-device -remapping table is used. If entries refer to bogus IOAPIC IDs -disable IOMMU unconditionally - -AMD,IOMMU: Disable IOMMU if SATA Combined mode is on - -AMD's SP5100 chipset can be placed into SATA Combined mode -that may cause prevent dom0 from booting when IOMMU is -enabled and per-device interrupt remapping table is used. -While SP5100 erratum 28 requires BIOSes to disable this mode, -some may still use it. - -This patch checks whether this mode is on and, if per-device -table is in use, disables IOMMU. - -AMD,IOMMU: Make per-device interrupt remapping table default - -Using global interrupt remapping table may be insecure, as -described by XSA-36. This patch makes per-device mode default. - -This is XSA-36 / CVE-2013-0153. - -Signed-off-by: Jan Beulich -Signed-off-by: Boris Ostrovsky - -Index: xen-4.1.3/xen/arch/x86/irq.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/irq.c 2012-08-09 22:08:08.000000000 +0200 -+++ xen-4.1.3/xen/arch/x86/irq.c 2013-01-30 12:02:28.482349033 +0100 -@@ -1677,9 +1677,6 @@ int map_domain_pirq( - d->arch.pirq_irq[pirq] = irq; - d->arch.irq_pirq[irq] = pirq; - spin_unlock_irqrestore(&desc->lock, flags); -- -- if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV ) -- printk(XENLOG_INFO "Per-device vector maps for GSIs not implemented yet.\n"); - } - - done: -Index: xen-4.1.3/xen/drivers/acpi/tables.c -=================================================================== ---- xen-4.1.3.orig/xen/drivers/acpi/tables.c 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/drivers/acpi/tables.c 2013-01-30 12:02:28.482349033 +0100 -@@ -267,7 +267,7 @@ acpi_table_parse_madt(enum acpi_madt_typ - * @handler: handler to run - * - * Scan the ACPI System Descriptor Table (STD) for a table matching @id, -- * run @handler on it. Return 0 if table found, return on if not. -+ * run @handler on it. 
- */ - int acpi_table_parse(char *id, acpi_table_handler handler) - { -@@ -282,8 +282,7 @@ int acpi_table_parse(char *id, acpi_tabl - acpi_get_table(id, 0, &table); - - if (table) { -- handler(table); -- return 0; -+ return handler(table); - } else - return 1; - } -Index: xen-4.1.3/xen/drivers/passthrough/amd/iommu_acpi.c -=================================================================== ---- xen-4.1.3.orig/xen/drivers/passthrough/amd/iommu_acpi.c 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/drivers/passthrough/amd/iommu_acpi.c 2013-01-30 12:02:28.486349054 +0100 -@@ -21,6 +21,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -29,7 +30,6 @@ extern unsigned long amd_iommu_page_entr - extern unsigned short ivrs_bdf_entries; - extern struct ivrs_mappings *ivrs_mappings; - extern unsigned short last_bdf; --extern int ioapic_bdf[MAX_IO_APICS]; - extern void *shared_intremap_table; - - static void add_ivrs_mapping_entry( -@@ -636,6 +636,7 @@ static u16 __init parse_ivhd_device_spec - u16 header_length, u16 block_length, struct amd_iommu *iommu) - { - u16 dev_length, bdf; -+ int apic; - - dev_length = sizeof(struct acpi_ivhd_device_special); - if ( header_length < (block_length + dev_length) ) -@@ -652,9 +653,58 @@ static u16 __init parse_ivhd_device_spec - } - - add_ivrs_mapping_entry(bdf, bdf, ivhd_device->header.flags, iommu); -- /* set device id of ioapic */ -- ioapic_bdf[ivhd_device->special.handle] = bdf; -- return dev_length; -+ -+ if ( ivhd_device->special.variety != 1 /* ACPI_IVHD_IOAPIC */ ) -+ { -+ if ( ivhd_device->special.variety != 2 /* ACPI_IVHD_HPET */ ) -+ printk(XENLOG_ERR "Unrecognized IVHD special variety %#x\n", -+ ivhd_device->special.variety); -+ return dev_length; -+ } -+ -+ /* -+ * Some BIOSes have IOAPIC broken entries so we check for IVRS -+ * consistency here --- whether entry's IOAPIC ID is valid and -+ * whether there are conflicting/duplicated entries. 
-+ */ -+ for ( apic = 0; apic < nr_ioapics; apic++ ) -+ { -+ if ( IO_APIC_ID(apic) != ivhd_device->special.handle ) -+ continue; -+ -+ if ( ioapic_bdf[ivhd_device->special.handle].pin_setup ) -+ { -+ if ( ioapic_bdf[ivhd_device->special.handle].bdf == bdf ) -+ AMD_IOMMU_DEBUG("IVHD Warning: Duplicate IO-APIC %#x entries\n", -+ ivhd_device->special.handle); -+ else -+ { -+ printk(XENLOG_ERR "IVHD Error: Conflicting IO-APIC %#x entries\n", -+ ivhd_device->special.handle); -+ if ( amd_iommu_perdev_intremap ) -+ return 0; -+ } -+ } -+ else -+ { -+ /* set device id of ioapic */ -+ ioapic_bdf[ivhd_device->special.handle].bdf = bdf; -+ -+ ioapic_bdf[ivhd_device->special.handle].pin_setup = xzalloc_array( -+ unsigned long, BITS_TO_LONGS(nr_ioapic_registers[apic])); -+ if ( nr_ioapic_registers[apic] && -+ !ioapic_bdf[IO_APIC_ID(apic)].pin_setup ) -+ { -+ printk(XENLOG_ERR "IVHD Error: Out of memory\n"); -+ return 0; -+ } -+ } -+ return dev_length; -+ } -+ -+ printk(XENLOG_ERR "IVHD Error: Invalid IO-APIC %#x\n", -+ ivhd_device->special.handle); -+ return 0; - } - - static int __init parse_ivhd_block(struct acpi_ivhd_block_header *ivhd_block) -Index: xen-4.1.3/xen/drivers/passthrough/amd/iommu_init.c -=================================================================== ---- xen-4.1.3.orig/xen/drivers/passthrough/amd/iommu_init.c 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/drivers/passthrough/amd/iommu_init.c 2013-01-30 12:02:28.486349054 +0100 -@@ -874,12 +874,45 @@ static int __init amd_iommu_setup_device - return 0; - } - -+/* Check whether SP5100 SATA Combined mode is on */ -+static bool_t __init amd_sp5100_erratum28(void) -+{ -+ u32 bus, id; -+ u16 vendor_id, dev_id; -+ u8 byte; -+ -+ for (bus = 0; bus < 256; bus++) -+ { -+ id = pci_conf_read32(bus, 0x14, 0, PCI_VENDOR_ID); -+ -+ vendor_id = id & 0xffff; -+ dev_id = (id >> 16) & 0xffff; -+ -+ /* SP5100 SMBus module sets Combined mode on */ -+ if (vendor_id != 0x1002 || dev_id != 0x4385) -+ continue; -+ -+ byte = pci_conf_read8(bus, 0x14, 0, 0xad); -+ if ( (byte >> 3) & 1 ) -+ { -+ printk(XENLOG_WARNING "AMD-Vi: SP5100 erratum 28 detected, disabling IOMMU.\n" -+ "If possible, disable SATA Combined mode in BIOS or contact your vendor for BIOS update.\n"); -+ return 1; -+ } -+ } -+ -+ return 0; -+} -+ - int __init amd_iommu_init(void) - { - struct amd_iommu *iommu; - - BUG_ON( !iommu_found() ); - -+ if ( amd_iommu_perdev_intremap && amd_sp5100_erratum28() ) -+ goto error_out; -+ - irq_to_iommu = xmalloc_array(struct amd_iommu *, nr_irqs); - if ( irq_to_iommu == NULL ) - goto error_out; -Index: xen-4.1.3/xen/drivers/passthrough/amd/iommu_intr.c -=================================================================== ---- xen-4.1.3.orig/xen/drivers/passthrough/amd/iommu_intr.c 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/drivers/passthrough/amd/iommu_intr.c 2013-01-30 12:02:28.490349078 +0100 -@@ -27,7 +27,7 @@ - #define INTREMAP_LENGTH 0xB - #define INTREMAP_ENTRIES (1 << INTREMAP_LENGTH) - --int ioapic_bdf[MAX_IO_APICS]; -+struct ioapic_bdf ioapic_bdf[MAX_IO_APICS]; - extern struct ivrs_mappings *ivrs_mappings; - extern unsigned short ivrs_bdf_entries; - void *shared_intremap_table; -@@ -117,12 +117,12 @@ void invalidate_interrupt_table(struct a - static void update_intremap_entry_from_ioapic( - int bdf, - struct amd_iommu *iommu, -- struct IO_APIC_route_entry *ioapic_rte) -+ const struct IO_APIC_route_entry *rte, -+ const struct IO_APIC_route_entry *old_rte) - { - unsigned long flags; - u32* entry; - u8 delivery_mode, dest, vector, 
dest_mode; -- struct IO_APIC_route_entry *rte = ioapic_rte; - int req_id; - spinlock_t *lock; - int offset; -@@ -138,6 +138,14 @@ static void update_intremap_entry_from_i - spin_lock_irqsave(lock, flags); - - offset = get_intremap_offset(vector, delivery_mode); -+ if ( old_rte ) -+ { -+ int old_offset = get_intremap_offset(old_rte->vector, -+ old_rte->delivery_mode); -+ -+ if ( offset != old_offset ) -+ free_intremap_entry(bdf, old_offset); -+ } - entry = (u32*)get_intremap_entry(req_id, offset); - update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); - -@@ -176,7 +184,7 @@ int __init amd_iommu_setup_ioapic_remapp - continue; - - /* get device id of ioapic devices */ -- bdf = ioapic_bdf[IO_APIC_ID(apic)]; -+ bdf = ioapic_bdf[IO_APIC_ID(apic)].bdf; - iommu = find_iommu_for_device(bdf); - if ( !iommu ) - { -@@ -207,6 +215,7 @@ int __init amd_iommu_setup_ioapic_remapp - flush_command_buffer(iommu); - spin_unlock_irqrestore(&iommu->lock, flags); - } -+ set_bit(pin, ioapic_bdf[IO_APIC_ID(apic)].pin_setup); - } - } - return 0; -@@ -218,6 +227,7 @@ void amd_iommu_ioapic_update_ire( - struct IO_APIC_route_entry old_rte = { 0 }; - struct IO_APIC_route_entry new_rte = { 0 }; - unsigned int rte_lo = (reg & 1) ? reg - 1 : reg; -+ unsigned int pin = (reg - 0x10) / 2; - int saved_mask, bdf; - struct amd_iommu *iommu; - -@@ -228,7 +238,7 @@ void amd_iommu_ioapic_update_ire( - } - - /* get device id of ioapic devices */ -- bdf = ioapic_bdf[IO_APIC_ID(apic)]; -+ bdf = ioapic_bdf[IO_APIC_ID(apic)].bdf; - iommu = find_iommu_for_device(bdf); - if ( !iommu ) - { -@@ -254,6 +264,14 @@ void amd_iommu_ioapic_update_ire( - *(((u32 *)&new_rte) + 1) = value; - } - -+ if ( new_rte.mask && -+ !test_bit(pin, ioapic_bdf[IO_APIC_ID(apic)].pin_setup) ) -+ { -+ ASSERT(saved_mask); -+ __io_apic_write(apic, reg, value); -+ return; -+ } -+ - /* mask the interrupt while we change the intremap table */ - if ( !saved_mask ) - { -@@ -262,7 +280,11 @@ void amd_iommu_ioapic_update_ire( - } - - /* Update interrupt remapping entry */ -- update_intremap_entry_from_ioapic(bdf, iommu, &new_rte); -+ update_intremap_entry_from_ioapic( -+ bdf, iommu, &new_rte, -+ test_and_set_bit(pin, -+ ioapic_bdf[IO_APIC_ID(apic)].pin_setup) ? 
&old_rte -+ : NULL); - - /* Forward write access to IO-APIC RTE */ - __io_apic_write(apic, reg, value); -@@ -373,6 +395,12 @@ void amd_iommu_msi_msg_update_ire( - return; - } - -+ if ( msi_desc->remap_index >= 0 ) -+ update_intremap_entry_from_msi_msg(iommu, pdev, msi_desc, NULL); -+ -+ if ( !msg ) -+ return; -+ - update_intremap_entry_from_msi_msg(iommu, pdev, msi_desc, msg); - } - -Index: xen-4.1.3/xen/drivers/passthrough/amd/pci_amd_iommu.c -=================================================================== ---- xen-4.1.3.orig/xen/drivers/passthrough/amd/pci_amd_iommu.c 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/drivers/passthrough/amd/pci_amd_iommu.c 2013-01-30 12:02:28.490349078 +0100 -@@ -195,6 +195,8 @@ int __init amd_iov_detect(void) - { - printk("AMD-Vi: Not overriding irq_vector_map setting\n"); - } -+ if ( !amd_iommu_perdev_intremap ) -+ printk(XENLOG_WARNING "AMD-Vi: Using global interrupt remap table is not recommended (see XSA-36)!\n"); - return scan_pci_devices(); - } - -Index: xen-4.1.3/xen/drivers/passthrough/iommu.c -=================================================================== ---- xen-4.1.3.orig/xen/drivers/passthrough/iommu.c 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/drivers/passthrough/iommu.c 2013-01-30 12:02:28.490349078 +0100 -@@ -49,7 +49,7 @@ bool_t __read_mostly iommu_qinval = 1; - bool_t __read_mostly iommu_intremap = 1; - bool_t __read_mostly iommu_hap_pt_share; - bool_t __read_mostly amd_iommu_debug; --bool_t __read_mostly amd_iommu_perdev_intremap; -+bool_t __read_mostly amd_iommu_perdev_intremap = 1; - - static void __init parse_iommu_param(char *s) - { -@@ -78,6 +78,8 @@ static void __init parse_iommu_param(cha - amd_iommu_debug = 1; - else if ( !strcmp(s, "amd-iommu-perdev-intremap") ) - amd_iommu_perdev_intremap = 1; -+ else if ( !strcmp(s, "amd-iommu-global-intremap") ) -+ amd_iommu_perdev_intremap = 0; - else if ( !strcmp(s, "dom0-passthrough") ) - iommu_passthrough = 1; - else if ( !strcmp(s, "dom0-strict") ) -Index: xen-4.1.3/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h -=================================================================== ---- xen-4.1.3.orig/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h 2012-08-09 22:08:10.000000000 +0200 -+++ xen-4.1.3/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h 2013-01-30 12:02:28.490349078 +0100 -@@ -88,6 +88,11 @@ void amd_iommu_read_msi_from_ire( - unsigned int amd_iommu_read_ioapic_from_ire( - unsigned int apic, unsigned int reg); - -+extern struct ioapic_bdf { -+ u16 bdf; -+ unsigned long *pin_setup; -+} ioapic_bdf[]; -+ - /* power management support */ - void amd_iommu_resume(void); - void amd_iommu_suspend(void); diff -Nru xen-4.1.3/debian/patches/xsa38.patch xen-4.1.5/debian/patches/xsa38.patch --- xen-4.1.3/debian/patches/xsa38.patch 2013-01-30 12:02:52.000000000 +0100 +++ xen-4.1.5/debian/patches/xsa38.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,73 +0,0 @@ -Index: xen-4.1.3/tools/ocaml/libs/xb/partial.ml -=================================================================== ---- xen-4.1.3.orig/tools/ocaml/libs/xb/partial.ml 2012-08-09 22:08:06.000000000 +0200 -+++ xen-4.1.3/tools/ocaml/libs/xb/partial.ml 2013-01-30 12:02:47.970444307 +0100 -@@ -27,8 +27,15 @@ external header_size: unit -> int = "stu - external header_of_string_internal: string -> int * int * int * int - = "stub_header_of_string" - -+let xenstore_payload_max = 4096 (* xen/include/public/io/xs_wire.h *) -+ - let of_string s = - let tid, rid, opint, dlen = header_of_string_internal s in -+ (* A packet 
which is bigger than xenstore_payload_max is illegal. -+ This will leave the guest connection is a bad state and will -+ be hard to recover from without restarting the connection -+ (ie rebooting the guest) *) -+ let dlen = min xenstore_payload_max dlen in - { - tid = tid; - rid = rid; -@@ -38,6 +45,7 @@ let of_string s = - } - - let append pkt s sz = -+ if pkt.len > 4096 then failwith "Buffer.add: cannot grow buffer"; - Buffer.add_string pkt.buf (String.sub s 0 sz) - - let to_complete pkt = -Index: xen-4.1.3/tools/ocaml/libs/xb/xs_ring_stubs.c -=================================================================== ---- xen-4.1.3.orig/tools/ocaml/libs/xb/xs_ring_stubs.c 2012-08-09 22:08:06.000000000 +0200 -+++ xen-4.1.3/tools/ocaml/libs/xb/xs_ring_stubs.c 2013-01-30 12:02:47.974444330 +0100 -@@ -43,21 +43,23 @@ static int xs_ring_read(struct mmap_inte - char *buffer, int len) - { - struct xenstore_domain_interface *intf = interface->addr; -- XENSTORE_RING_IDX cons, prod; -+ XENSTORE_RING_IDX cons, prod; /* offsets only */ - int to_read; - -- cons = intf->req_cons; -- prod = intf->req_prod; -+ cons = *(volatile uint32*)&intf->req_cons; -+ prod = *(volatile uint32*)&intf->req_prod; - xen_mb(); -+ cons = MASK_XENSTORE_IDX(cons); -+ prod = MASK_XENSTORE_IDX(prod); - if (prod == cons) - return 0; -- if (MASK_XENSTORE_IDX(prod) > MASK_XENSTORE_IDX(cons)) -+ if (prod > cons) - to_read = prod - cons; - else -- to_read = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons); -+ to_read = XENSTORE_RING_SIZE - cons; - if (to_read < len) - len = to_read; -- memcpy(buffer, intf->req + MASK_XENSTORE_IDX(cons), len); -+ memcpy(buffer, intf->req + cons, len); - xen_mb(); - intf->req_cons += len; - return len; -@@ -70,8 +72,8 @@ static int xs_ring_write(struct mmap_int - XENSTORE_RING_IDX cons, prod; - int can_write; - -- cons = intf->rsp_cons; -- prod = intf->rsp_prod; -+ cons = *(volatile uint32*)&intf->rsp_cons; -+ prod = *(volatile uint32*)&intf->rsp_prod; - xen_mb(); - if ( (prod - cons) >= XENSTORE_RING_SIZE ) - return 0; diff -Nru xen-4.1.3/debian/patches/xsa44-4.1.patch xen-4.1.5/debian/patches/xsa44-4.1.patch --- xen-4.1.3/debian/patches/xsa44-4.1.patch 2013-04-11 16:05:18.000000000 +0200 +++ xen-4.1.5/debian/patches/xsa44-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,83 +0,0 @@ -x86: clear EFLAGS.NT in SYSENTER entry path - -... as it causes problems if we happen to exit back via IRET: In the -course of trying to handle the fault, the hypervisor creates a stack -frame by hand, and uses PUSHFQ to set the respective EFLAGS field, but -expects to be able to IRET through that stack frame to the second -portion of the fixup code (which causes a #GP due to the stored EFLAGS -having NT set). - -And even if this worked (e.g if we cleared NT in that path), it would -then (through the fail safe callback) cause a #GP in the guest with the -SYSENTER handler's first instruction as the source, which in turn would -allow guest user mode code to crash the guest kernel. - -Inject a #GP on the fake (NULL) address of the SYSENTER instruction -instead, just like in the case where the guest kernel didn't register -a corresponding entry point. - -On 32-bit we also need to make sure we clear SYSENTER_CS for all CPUs -(neither #RESET nor #INIT guarantee this). - -This is CVE-2013-1917 / XSA-44. 
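
The real change sits in the SYSENTER entry assembly; as a userspace model only, the flag-sanitizing part of the fix amounts to stripping NT (bit 14 of EFLAGS) from a saved flags image before the frame can be IRETed through:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define X86_EFLAGS_NT (UINT32_C(1) << 14)   /* Nested Task flag */

    /* Clear NT from a saved flags image; a model of the entry-path
     * fix, not the hypervisor assembly itself. */
    static uint32_t sanitize_eflags(uint32_t eflags)
    {
        return eflags & ~X86_EFLAGS_NT;
    }

    int main(void)
    {
        uint32_t guest_eflags = 0x4202;   /* IF set, NT (0x4000) set */
        printf("%#" PRIx32 "\n", sanitize_eflags(guest_eflags)); /* 0x202 */
        return 0;
    }
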
- -Reported-by: Andrew Cooper -Signed-off-by: Jan Beulich -Tested-by: Andrew Cooper -Acked-by: Andrew Cooper - -Index: xen-4.1.3/xen/arch/x86/acpi/suspend.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/acpi/suspend.c 2012-08-09 22:08:08.000000000 +0200 -+++ xen-4.1.3/xen/arch/x86/acpi/suspend.c 2013-04-11 16:05:12.390581755 +0200 -@@ -81,8 +81,12 @@ void restore_rest_processor_state(void) - } - - #else /* !defined(CONFIG_X86_64) */ -- if ( supervisor_mode_kernel && cpu_has_sep ) -- wrmsr(MSR_IA32_SYSENTER_ESP, &this_cpu(init_tss).esp1, 0); -+ if ( cpu_has_sep ) -+ { -+ wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); -+ if ( supervisor_mode_kernel ) -+ wrmsr(MSR_IA32_SYSENTER_ESP, &this_cpu(init_tss).esp1, 0); -+ } - #endif - - /* Maybe load the debug registers. */ -Index: xen-4.1.3/xen/arch/x86/cpu/common.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/cpu/common.c 2012-08-09 22:08:08.000000000 +0200 -+++ xen-4.1.3/xen/arch/x86/cpu/common.c 2013-04-11 16:05:12.402581806 +0200 -@@ -710,8 +710,11 @@ void __cpuinit cpu_init(void) - #if defined(CONFIG_X86_32) - t->ss0 = __HYPERVISOR_DS; - t->esp0 = get_stack_bottom(); -- if ( supervisor_mode_kernel && cpu_has_sep ) -+ if ( cpu_has_sep ) { -+ wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); -+ if ( supervisor_mode_kernel ) - wrmsr(MSR_IA32_SYSENTER_ESP, &t->esp1, 0); -+ } - #elif defined(CONFIG_X86_64) - /* Bottom-of-stack must be 16-byte aligned! */ - BUG_ON((get_stack_bottom() & 15) != 0); -Index: xen-4.1.3/xen/arch/x86/x86_64/entry.S -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/x86_64/entry.S 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/arch/x86/x86_64/entry.S 2013-04-11 16:05:12.410581852 +0200 -@@ -286,7 +286,14 @@ sysenter_eflags_saved: - movl $3,UREGS_cs(%rsp) /* ring 3 null cs */ - movq VCPU_sysenter_addr(%rbx),%rax - setne %cl -+ testl $X86_EFLAGS_NT,UREGS_eflags(%rsp) - leaq VCPU_trap_bounce(%rbx),%rdx -+UNLIKELY_START(nz, sysenter_nt_set) -+ pushfq -+ andl $~X86_EFLAGS_NT,(%rsp) -+ popfq -+ xorl %eax,%eax -+UNLIKELY_END(sysenter_nt_set) - testq %rax,%rax - leal (,%rcx,TBF_INTERRUPT),%ecx - UNLIKELY_START(z, sysenter_gpf) diff -Nru xen-4.1.3/debian/patches/xsa46-4.1.patch xen-4.1.5/debian/patches/xsa46-4.1.patch --- xen-4.1.3/debian/patches/xsa46-4.1.patch 2013-04-11 16:05:21.000000000 +0200 +++ xen-4.1.5/debian/patches/xsa46-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,267 +0,0 @@ -x86: fix various issues with handling guest IRQs - -- properly revoke IRQ access in map_domain_pirq() error path -- don't permit replacing an in use IRQ -- don't accept inputs in the GSI range for MAP_PIRQ_TYPE_MSI -- track IRQ access permission in host IRQ terms, not guest IRQ ones - (and with that, also disallow Dom0 access to IRQ0) - -This is CVE-2013-1919 / XSA-46. 
- -Signed-off-by: Jan Beulich -Acked-by: Stefano Stabellini - -Index: xen-4.1.3/tools/python/xen/xend/server/irqif.py -=================================================================== ---- xen-4.1.3.orig/tools/python/xen/xend/server/irqif.py 2012-08-09 22:08:07.000000000 +0200 -+++ xen-4.1.3/tools/python/xen/xend/server/irqif.py 2013-04-11 16:05:19.578616889 +0200 -@@ -73,6 +73,12 @@ class IRQController(DevController): - - pirq = get_param('irq') - -+ rc = xc.physdev_map_pirq(domid = self.getDomid(), -+ index = pirq, -+ pirq = pirq) -+ if rc < 0: -+ raise VmError('irq: Failed to map irq %x' % (pirq)) -+ - rc = xc.domain_irq_permission(domid = self.getDomid(), - pirq = pirq, - allow_access = True) -@@ -81,12 +87,6 @@ class IRQController(DevController): - #todo non-fatal - raise VmError( - 'irq: Failed to configure irq: %d' % (pirq)) -- rc = xc.physdev_map_pirq(domid = self.getDomid(), -- index = pirq, -- pirq = pirq) -- if rc < 0: -- raise VmError( -- 'irq: Failed to map irq %x' % (pirq)) - back = dict([(k, config[k]) for k in self.valid_cfg if k in config]) - return (self.allocateDeviceID(), back, {}) - -Index: xen-4.1.3/xen/arch/x86/domain_build.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/domain_build.c 2012-08-09 22:08:08.000000000 +0200 -+++ xen-4.1.3/xen/arch/x86/domain_build.c 2013-04-11 16:05:19.590616952 +0200 -@@ -1201,7 +1201,7 @@ int __init construct_dom0( - /* DOM0 is permitted full I/O capabilities. */ - rc |= ioports_permit_access(dom0, 0, 0xFFFF); - rc |= iomem_permit_access(dom0, 0UL, ~0UL); -- rc |= irqs_permit_access(dom0, 0, d->nr_pirqs - 1); -+ rc |= irqs_permit_access(dom0, 1, nr_irqs_gsi - 1); - - /* - * Modify I/O port access permissions. -Index: xen-4.1.3/xen/arch/x86/domctl.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/domctl.c 2012-08-09 22:08:08.000000000 +0200 -+++ xen-4.1.3/xen/arch/x86/domctl.c 2013-04-11 16:05:19.598616991 +0200 -@@ -908,9 +908,13 @@ long arch_do_domctl( - goto bind_out; - - ret = -EPERM; -- if ( !IS_PRIV(current->domain) && -- !irq_access_permitted(current->domain, bind->machine_irq) ) -- goto bind_out; -+ if ( !IS_PRIV(current->domain) ) -+ { -+ int irq = domain_pirq_to_irq(d, bind->machine_irq); -+ -+ if ( irq <= 0 || !irq_access_permitted(current->domain, irq) ) -+ goto bind_out; -+ } - - ret = -ESRCH; - if ( iommu_enabled ) -@@ -938,9 +942,13 @@ long arch_do_domctl( - bind = &(domctl->u.bind_pt_irq); - - ret = -EPERM; -- if ( !IS_PRIV(current->domain) && -- !irq_access_permitted(current->domain, bind->machine_irq) ) -- goto unbind_out; -+ if ( !IS_PRIV(current->domain) ) -+ { -+ int irq = domain_pirq_to_irq(d, bind->machine_irq); -+ -+ if ( irq <= 0 || !irq_access_permitted(current->domain, irq) ) -+ goto unbind_out; -+ } - - if ( iommu_enabled ) - { -Index: xen-4.1.3/xen/arch/x86/irq.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/irq.c 2013-04-08 11:45:26.000000000 +0200 -+++ xen-4.1.3/xen/arch/x86/irq.c 2013-04-11 16:05:19.606617030 +0200 -@@ -174,6 +174,15 @@ int create_irq(void) - out: - spin_unlock_irqrestore(&vector_lock, flags); - -+ if ( irq > 0 && dom0 ) -+ { -+ ret = irq_permit_access(dom0, irq); -+ if ( ret ) -+ printk(XENLOG_G_ERR -+ "Could not grant Dom0 access to IRQ%d (error %d)\n", -+ irq, ret); -+ } -+ - return irq; - } - -@@ -258,6 +267,17 @@ void clear_irq_vector(int irq) - void destroy_irq(unsigned int irq) - { - BUG_ON(!MSI_IRQ(irq)); -+ -+ if 
( dom0 ) -+ { -+ int err = irq_deny_access(dom0, irq); -+ -+ if ( err ) -+ printk(XENLOG_G_ERR -+ "Could not revoke Dom0 access to IRQ%u (error %d)\n", -+ irq, err); -+ } -+ - dynamic_irq_cleanup(irq); - clear_irq_vector(irq); - } -@@ -1604,7 +1624,7 @@ int map_domain_pirq( - - if ( !IS_PRIV(current->domain) && - !(IS_PRIV_FOR(current->domain, d) && -- irq_access_permitted(current->domain, pirq))) -+ irq_access_permitted(current->domain, irq))) - return -EPERM; - - if ( pirq < 0 || pirq >= d->nr_pirqs || irq < 0 || irq >= nr_irqs ) -@@ -1625,11 +1645,12 @@ int map_domain_pirq( - return 0; - } - -- ret = irq_permit_access(d, pirq); -+ ret = irq_permit_access(d, irq); - if ( ret ) - { -- dprintk(XENLOG_G_ERR, "dom%d: could not permit access to irq %d\n", -- d->domain_id, pirq); -+ printk(XENLOG_G_ERR -+ "dom%d: could not permit access to IRQ%d (pirq %d)\n", -+ d->domain_id, irq, pirq); - return ret; - } - -@@ -1651,8 +1672,14 @@ int map_domain_pirq( - spin_lock_irqsave(&desc->lock, flags); - - if ( desc->handler != &no_irq_type ) -+ { -+ spin_unlock_irqrestore(&desc->lock, flags); - dprintk(XENLOG_G_ERR, "dom%d: irq %d in use\n", - d->domain_id, irq); -+ pci_disable_msi(msi_desc); -+ ret = -EBUSY; -+ goto done; -+ } - desc->handler = &pci_msi_type; - if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV - && !desc->chip_data->used_vectors ) -@@ -1680,6 +1707,10 @@ int map_domain_pirq( - } - - done: -+ if ( ret && irq_deny_access(d, irq) ) -+ printk(XENLOG_G_ERR -+ "dom%d: could not revoke access to IRQ%d (pirq %d)\n", -+ d->domain_id, irq, pirq); - return ret; - } - -@@ -1736,10 +1767,11 @@ int unmap_domain_pirq(struct domain *d, - if (msi_desc) - msi_free_irq(msi_desc); - -- ret = irq_deny_access(d, pirq); -+ ret = irq_deny_access(d, irq); - if ( ret ) -- dprintk(XENLOG_G_ERR, "dom%d: could not deny access to irq %d\n", -- d->domain_id, pirq); -+ printk(XENLOG_G_ERR -+ "dom%d: could not deny access to IRQ%d (pirq %d)\n", -+ d->domain_id, irq, pirq); - - if ( desc->handler == &pci_msi_type ) - desc->handler = &no_irq_type; -Index: xen-4.1.3/xen/arch/x86/physdev.c -=================================================================== ---- xen-4.1.3.orig/xen/arch/x86/physdev.c 2013-04-08 11:45:26.000000000 +0200 -+++ xen-4.1.3/xen/arch/x86/physdev.c 2013-04-11 16:05:19.610617045 +0200 -@@ -150,7 +150,7 @@ static int physdev_map_pirq(struct physd - if ( irq == -1 ) - irq = create_irq(); - -- if ( irq < 0 || irq >= nr_irqs ) -+ if ( irq < nr_irqs_gsi || irq >= nr_irqs ) - { - dprintk(XENLOG_G_ERR, "dom%d: can't create irq for msi!\n", - d->domain_id); -Index: xen-4.1.3/xen/common/domctl.c -=================================================================== ---- xen-4.1.3.orig/xen/common/domctl.c 2013-04-08 11:45:26.000000000 +0200 -+++ xen-4.1.3/xen/common/domctl.c 2013-04-11 16:05:19.618617084 +0200 -@@ -869,9 +869,9 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc - if ( pirq >= d->nr_pirqs ) - ret = -EINVAL; - else if ( op->u.irq_permission.allow_access ) -- ret = irq_permit_access(d, pirq); -+ ret = pirq_permit_access(d, pirq); - else -- ret = irq_deny_access(d, pirq); -+ ret = pirq_deny_access(d, pirq); - - rcu_unlock_domain(d); - } -Index: xen-4.1.3/xen/common/event_channel.c -=================================================================== ---- xen-4.1.3.orig/xen/common/event_channel.c 2012-08-09 22:08:09.000000000 +0200 -+++ xen-4.1.3/xen/common/event_channel.c 2013-04-11 16:05:19.626617123 +0200 -@@ -331,7 +331,7 @@ static long evtchn_bind_pirq(evtchn_bind - if ( (pirq < 0) || (pirq >= 
d->nr_pirqs) ) - return -EINVAL; - -- if ( !is_hvm_domain(d) && !irq_access_permitted(d, pirq) ) -+ if ( !is_hvm_domain(d) && !pirq_access_permitted(d, pirq) ) - return -EPERM; - - spin_lock(&d->event_lock); -Index: xen-4.1.3/xen/include/xen/iocap.h -=================================================================== ---- xen-4.1.3.orig/xen/include/xen/iocap.h 2012-08-09 22:08:10.000000000 +0200 -+++ xen-4.1.3/xen/include/xen/iocap.h 2013-04-11 16:05:19.634617163 +0200 -@@ -28,4 +28,22 @@ - #define irq_access_permitted(d, i) \ - rangeset_contains_singleton((d)->irq_caps, i) - -+#define pirq_permit_access(d, i) ({ \ -+ struct domain *d__ = (d); \ -+ int i__ = domain_pirq_to_irq(d__, i); \ -+ i__ > 0 ? rangeset_add_singleton(d__->irq_caps, i__)\ -+ : -EINVAL; \ -+}) -+#define pirq_deny_access(d, i) ({ \ -+ struct domain *d__ = (d); \ -+ int i__ = domain_pirq_to_irq(d__, i); \ -+ i__ > 0 ? rangeset_remove_singleton(d__->irq_caps, i__)\ -+ : -EINVAL; \ -+}) -+#define pirq_access_permitted(d, i) ({ \ -+ struct domain *d__ = (d); \ -+ rangeset_contains_singleton(d__->irq_caps, \ -+ domain_pirq_to_irq(d__, i));\ -+}) -+ - #endif /* __XEN_IOCAP_H__ */ diff -Nru xen-4.1.3/debian/patches/xsa47-4.1.patch xen-4.1.5/debian/patches/xsa47-4.1.patch --- xen-4.1.3/debian/patches/xsa47-4.1.patch 2013-04-11 16:05:24.000000000 +0200 +++ xen-4.1.5/debian/patches/xsa47-4.1.patch 1970-01-01 01:00:00.000000000 +0100 @@ -1,33 +0,0 @@ -defer event channel bucket pointer store until after XSM checks - -Otherwise a dangling pointer can be left, which would cause subsequent -memory corruption as soon as the space got re-allocated for some other -purpose. - -This is CVE-2013-1920 / XSA-47. - -Reported-by: Wei Liu -Signed-off-by: Jan Beulich -Reviewed-by: Tim Deegan - -Index: xen-4.1.3/xen/common/event_channel.c -=================================================================== ---- xen-4.1.3.orig/xen/common/event_channel.c 2013-04-11 16:05:19.626617123 +0200 -+++ xen-4.1.3/xen/common/event_channel.c 2013-04-11 16:05:23.106634141 +0200 -@@ -104,7 +104,6 @@ static int get_free_port(struct domain * - if ( unlikely(chn == NULL) ) - return -ENOMEM; - memset(chn, 0, EVTCHNS_PER_BUCKET * sizeof(*chn)); -- bucket_from_port(d, port) = chn; - - for ( i = 0; i < EVTCHNS_PER_BUCKET; i++ ) - { -@@ -117,6 +116,8 @@ static int get_free_port(struct domain * - } - } - -+ bucket_from_port(d, port) = chn; -+ - return port; - } - diff -Nru xen-4.1.3/docs/man/xmdomain.cfg.pod.5 xen-4.1.5/docs/man/xmdomain.cfg.pod.5 --- xen-4.1.3/docs/man/xmdomain.cfg.pod.5 2012-08-09 22:08:04.000000000 +0200 +++ xen-4.1.5/docs/man/xmdomain.cfg.pod.5 2013-04-23 18:44:20.000000000 +0200 @@ -298,16 +298,14 @@ =back -=over 4 - Additionally, the "on_crash" event can also take: +=over 4 + =item B Dump the crashed domain's core and then destroy it. -=back - =item B Dump the crashed domain's core and then restart it. 
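
Returning to the XSA-47 change above: the pattern is to store a pointer into a shared structure only after every step that can fail has passed, so an error path never leaves a dangling entry behind. A minimal C sketch under that assumption (xsm_check() is a hypothetical stand-in for the real XSM hook):

    #include <stdlib.h>

    struct evtchn { int port; };

    struct domain {
        struct evtchn *bucket[8];
    };

    /* Hypothetical stand-in for the XSM check that may fail. */
    static int xsm_check(struct evtchn *chn) { (void)chn; return 0; }

    /* Publish the bucket pointer last, so a failure never leaves
     * d->bucket[] pointing at freed memory. */
    static int get_free_port(struct domain *d, int b)
    {
        struct evtchn *chn = calloc(4, sizeof(*chn));

        if (chn == NULL)
            return -1;
        if (xsm_check(chn) != 0) {
            free(chn);          /* d->bucket[b] never saw this pointer */
            return -1;
        }
        d->bucket[b] = chn;     /* publish only after success */
        return 0;
    }

    int main(void)
    {
        struct domain d = { { 0 } };
        return get_free_port(&d, 0) ? 1 : 0;
    }
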
diff -Nru xen-4.1.3/.gitignore xen-4.1.5/.gitignore --- xen-4.1.3/.gitignore 1970-01-01 01:00:00.000000000 +0100 +++ xen-4.1.5/.gitignore 2013-04-23 18:44:20.000000000 +0200 @@ -0,0 +1,394 @@ +.hg +*.orig +*.rej +*~ +*.o +*.d +*.opic +*.a +*.so +*.so.[0-9]* +*.bin +*.bak +*.tmp +*.spot +*.spit +TAGS +cscope.files +cscope.in.out +cscope.out +cscope.po.out +.config + +dist +stubdom/*.tar.gz + +build-* +dist/* +docs/*.aux +docs/*.dvi +docs/*.log +docs/*.pdf +docs/*.ps +docs/*.toc +docs/api/* +docs/figs/xenserver.eps +docs/html/* +docs/interface/WARNINGS +docs/interface/images.pl +docs/interface/images.tex +docs/interface/img1.png +docs/interface/index.html +docs/interface/interface.css +docs/interface/interface.html +docs/interface/labels.pl +docs/man1/ +docs/man5/ +docs/pdf/* +docs/ps/* +docs/user/WARNINGS +docs/user/images.pl +docs/user/images.tex +docs/user/img1.png +docs/user/img2.png +docs/user/img3.png +docs/user/index.html +docs/user/internals.pl +docs/user/labels.pl +docs/user/user.css +docs/user/user.html +docs/xen-api/vm_lifecycle.eps +docs/xen-api/xenapi-datamodel-graph.eps +docs/xen-api/xenapi.out +docs/xen-api/xenapi.dvi +docs/xen-api/xenapi.pdf +docs/xen-api/xenapi.ps +docs/xen-api/xenapi.toc +extras/mini-os/arch/ia64/gen_off.s +extras/mini-os/include/mini-os +extras/mini-os/include/ia64/mini-os +extras/mini-os/include/ia64/offsets.h +extras/mini-os/include/x86/mini-os +extras/mini-os/include/xen +extras/mini-os/include/list.h +extras/mini-os/mini-os* +install/* +linux-[^/]*-paravirt/* +linux-2.6[^/]*/* +linux-[^/]*-rc/* +linux-[^/]*-tip/* +linux-[^/]*-git/* +linux-[^/]*.patch +mkddbxen +netbsd-[^/]*-tools/* +netbsd-[^/]*-xen0/* +netbsd-[^/]*-xenU/* +netbsd-[^/]*.patch +patches/*/.makedep +patches/ebtables-brnf-5_vs_2.4.25.diff +patches/ebtables.diff +patches/tmp/* +pristine-* +ref-* +tmp-* +stubdom/binutils-* +stubdom/cross-root-* +stubdom/gcc-* +stubdom/include +stubdom/ioemu +stubdom/xenstore +stubdom/libxc-* +stubdom/lwip-* +stubdom/mini-os-* +stubdom/mk-headers-* +stubdom/newlib-1.* +stubdom/newlib-x86* +stubdom/pciutils-* +stubdom/zlib-* +stubdom/grub-* +stubdom/ocaml-* +stubdom/lwip/ +stubdom/ioemu/ +stubdom/stubdompath.sh +tools/*/build/lib*/*.py +tools/autom4te.cache +tools/config.h +tools/config.log +tools/config.status +tools/config.cache +config/Tools.mk +tools/blktap2/daemon/blktapctrl +tools/blktap2/drivers/img2qcow +tools/blktap2/drivers/lock-util +tools/blktap2/drivers/qcow-create +tools/blktap2/drivers/qcow2raw +tools/blktap2/drivers/tapdisk +tools/blktap2/drivers/tapdisk-client +tools/blktap2/drivers/tapdisk-diff +tools/blktap2/drivers/tapdisk-stream +tools/blktap2/drivers/tapdisk2 +tools/blktap2/drivers/td-util +tools/blktap2/vhd/vhd-update +tools/blktap2/vhd/vhd-util +tools/blktap/drivers/blktapctrl +tools/blktap/drivers/img2qcow +tools/blktap/drivers/qcow-create +tools/blktap/drivers/qcow2raw +tools/blktap/drivers/tapdisk +tools/check/.* +tools/console/xenconsole +tools/console/xenconsoled +tools/debugger/gdb/gdb-6.2.1-linux-i386-xen/* +tools/debugger/gdb/gdb-6.2.1/* +tools/debugger/gdb/gdb-6.2.1.tar.bz2 +tools/debugger/gdbsx/gdbsx +tools/debugger/xenitp/xenitp +tools/firmware/*/biossums +tools/firmware/*.bin +tools/firmware/*.sym +tools/firmware/*bios/*bios*.txt +tools/firmware/etherboot/gpxe/* +tools/firmware/extboot/extboot.img +tools/firmware/extboot/signrom +tools/firmware/hvmloader/acpi/mk_dsdt +tools/firmware/hvmloader/acpi/dsdt*.c +tools/firmware/hvmloader/acpi/dsdt*.asl +tools/firmware/hvmloader/acpi/ssdt_*.h +tools/firmware/hvmloader/hvmloader 
+tools/firmware/hvmloader/roms.h +tools/firmware/hvmloader/roms.inc +tools/firmware/rombios/BIOS-bochs-[^/]* +tools/firmware/rombios/_rombios[^/]*_.c +tools/firmware/rombios/rombios[^/]*.s +tools/firmware/rombios/32bit/32bitbios_flat.h +tools/firmware/vgabios/vbetables-gen +tools/firmware/vgabios/vbetables.h +tools/flask/utils/flask-getenforce +tools/flask/utils/flask-get-bool +tools/flask/utils/flask-loadpolicy +tools/flask/utils/flask-setenforce +tools/flask/utils/flask-set-bool +tools/flask/utils/flask-label-pci +tools/fs-back/fs-backend +tools/hotplug/common/hotplugpath.sh +tools/include/xen/* +tools/include/xen-foreign/*.(c|h|size) +tools/include/xen-foreign/checker +tools/libxc/ia64/asm/*.h +tools/libxc/ia64/acpi/*.h +tools/libxc/ia64/acpi/platform/*.h +tools/libxc/ia64/dom_fw_asm.S +tools/libxc/ia64/dom_fw_common.c +tools/libxc/ia64/dom_fw_domu.c +tools/libxc/ia64/xen/*.h +tools/libxen/libxenapi- +tools/libxen/test/test_bindings +tools/libxen/test/test_event_handling +tools/libxl/libxlu_cfg_y.output +tools/libxl/xl +tools/libxl/testenum +tools/libxl/testenum.c +tools/libxl/tmp.* +tools/libxl/_libxl.api-for-check +tools/libxl/*.api-ok +tools/libaio/src/*.ol +tools/libaio/src/*.os +tools/misc/cpuperf/cpuperf-perfcntr +tools/misc/cpuperf/cpuperf-xen +tools/misc/lomount/lomount +tools/misc/mbootpack/bin2c +tools/misc/mbootpack/bootsect +tools/misc/mbootpack/bzimage_header.c +tools/misc/mbootpack/mbootpack +tools/misc/mbootpack/setup +tools/misc/miniterm/miniterm +tools/misc/xc_shadow +tools/misc/xen_cpuperf +tools/misc/xen-detect +tools/misc/xen-tmem-list-parse +tools/misc/xenperf +tools/misc/xenpm +tools/misc/xen-hvmctx +tools/misc/gtraceview +tools/misc/gtracestat +tools/misc/xenlockprof +tools/misc/lowmemd +tools/pygrub/build/* +tools/python/build/* +tools/python/xen/util/path.py +tools/remus/imqebt/imqebt +tools/remus/kmod/*(.cmd|.mod|.ko|.mod.c|.symvers|.xen) +tools/security/secpol_tool +tools/security/xen/* +tools/security/xensec_tool +tools/tests/blowfish.bin +tools/tests/blowfish.h +tools/tests/test_x86_emulator +tools/tests/x86_emulate +tools/tests/regression/installed/* +tools/tests/regression/build/* +tools/tests/regression/downloads/* +tools/tests/mem-sharing/memshrtool +tools/tests/mce-test/tools/xen-mceinj +tools/vnet/Make.local +tools/vnet/build/* +tools/vnet/gc +tools/vnet/gc*/* +tools/vnet/vnet-module/*.ko +tools/vnet/vnet-module/.*.cmd +tools/vnet/vnet-module/.tmp_versions/* +tools/vnet/vnet-module/vnet_module.mod.* +tools/vnet/vnetd/vnetd +tools/vtpm/tpm_emulator-*.tar.gz +tools/vtpm/tpm_emulator/* +tools/vtpm/vtpm/* +tools/vtpm_manager/manager/vtpm_managerd +tools/xcutils/lsevtchn +tools/xcutils/xc_restore +tools/xcutils/xc_save +tools/xcutils/readnotes +tools/xenfb/sdlfb +tools/xenfb/vncfb +tools/xenmon/xentrace_setmask +tools/xenmon/xenbaked +tools/xenpaging/xenpaging +tools/xenpmd/xenpmd +tools/xenstat/xentop/xentop +tools/xenstore/testsuite/tmp/* +tools/xenstore/init-xenstore-domain +tools/xenstore/xen +tools/xenstore/xenstore +tools/xenstore/xenstore-chmod +tools/xenstore/xenstore-exists +tools/xenstore/xenstore-list +tools/xenstore/xenstore-read +tools/xenstore/xenstore-rm +tools/xenstore/xenstore-write +tools/xenstore/xenstore-control +tools/xenstore/xenstore-ls +tools/xenstore/xenstored +tools/xenstore/xenstored_test +tools/xenstore/xs_crashme +tools/xenstore/xs_random +tools/xenstore/xs_stress +tools/xenstore/xs_tdb_dump +tools/xenstore/xs_test +tools/xenstore/xs_watch_stress +tools/xentrace/xentrace_setsize +tools/xentrace/tbctl +tools/xentrace/xenctx 
+tools/xentrace/xentrace +tools/xm-test/ramdisk/buildroot +tools/xm-test/aclocal.m4 +tools/xm-test/autom4te +tools/xm-test/install-sh +tools/xm-test/mkinstalldirs +tools/xm-test/missing +tools/xm-test/config(ure|.log|.status|.guess|.sub) +tools/xm-test/Makefile(.in)* +tools/xm-test/*/Makefile(.in)* +tools/xm-test/lib/XmTestLib/config.py +tools/xm-test/lib/XmTestReport/xmtest.py +tools/xm-test/tests/*.test +tools/ocaml-xenstored* +xen/.banner* +xen/BLOG +xen/System.map +xen/arch/arm/asm-offsets.s +xen/arch/arm/xen.lds +xen/arch/x86/asm-offsets.s +xen/arch/x86/boot/mkelf32 +xen/arch/x86/xen.lds +xen/arch/x86/boot/reloc.S +xen/arch/x86/boot/reloc.bin +xen/arch/x86/boot/reloc.lnk +xen/arch/x86/efi.lds +xen/arch/x86/efi/disabled +xen/arch/x86/efi/mkreloc +xen/ddb/* +xen/include/headers.chk +xen/include/asm +xen/include/asm-*/asm-offsets.h +xen/include/asm-ia64/asm-xsi-offsets.h +xen/include/asm-ia64/.offsets.h.stamp +xen/include/asm-ia64/xen +xen/include/compat/* +xen/include/hypervisor-ifs/arch +xen/include/linux +xen/include/public/public +xen/include/xen/*.new +xen/include/xen/acm_policy.h +xen/include/xen/banner.h +xen/include/xen/compile.h +xen/tools/figlet/figlet +xen/tools/symbols +xen/xen +xen/xen-syms +xen/xen.* +xen/arch/ia64/asm-offsets.s +xen/arch/ia64/asm-xsi-offsets.s +xen/arch/ia64/map.out +xen/arch/ia64/xen.lds.s +unmodified_drivers/linux-2.6/.tmp_versions +unmodified_drivers/linux-2.6/*.cmd +unmodified_drivers/linux-2.6/*.ko +unmodified_drivers/linux-2.6/*.mod.c +LibVNCServer* + +tools/qemu-xen-dir-remote +tools/qemu-xen-dir + +tools/qemu-xen-traditional-dir-remote +tools/qemu-xen-traditional-dir + +tools/firmware/seabios-dir-remote +tools/firmware/seabios-dir + +tools/firmware/rombios/_rombios_.c +tools/firmware/rombios/rombios.s +tools/firmware/rombios/rombios.sym +tools/include/xen-foreign/checker.c +tools/include/xen-foreign/ia64.h +tools/include/xen-foreign/structs.pyc +tools/include/xen-foreign/x86_32.h +tools/include/xen-foreign/x86_64.h + +.git +tools/misc/xen-hptool +tools/libxl/_*.[ch] +tools/libxl/testidl +tools/libxl/testidl.c +tools/libxl/*.pyc +tools/libxl/libxl-save-helper +tools/blktap2/control/tap-ctl +tools/firmware/etherboot/eb-roms.h +tools/firmware/etherboot/gpxe-git-snapshot.tar.gz +tools/misc/xenwatchdogd +tools/misc/xen-hvmcrash +tools/misc/xen-lowmemd +tools/libvchan/vchan-node[12] +tools/ocaml/*/.ocamldep.make +tools/ocaml/*/*.cm[ixao] +tools/ocaml/*/*.cmxa +tools/ocaml/*/*.annot +tools/ocaml/*/*/.ocamldep.make +tools/ocaml/*/*/*.cm[ixao] +tools/ocaml/*/*/*.cmxa +tools/ocaml/*/*/*.annot +tools/ocaml/*/*/META +tools/ocaml/libs/xl/_libxl_types.inc +tools/ocaml/libs/xl/_libxl_types.ml.in +tools/ocaml/libs/xl/_libxl_types.mli.in +tools/ocaml/libs/xl/xenlight.ml +tools/ocaml/libs/xl/xenlight.mli +tools/ocaml/xenstored/oxenstored + +tools/debugger/kdd/kdd +tools/firmware/etherboot/ipxe.tar.gz +tools/firmware/etherboot/ipxe/ +tools/python/xen/lowlevel/xl/_pyxl_types.c +tools/python/xen/lowlevel/xl/_pyxl_types.h +tools/xenstore/xenstore-watch + +docs/txt/misc/*.txt +docs/txt/man/*.txt diff -Nru xen-4.1.3/.hg_archival.txt xen-4.1.5/.hg_archival.txt --- xen-4.1.3/.hg_archival.txt 2012-08-09 22:08:04.000000000 +0200 +++ xen-4.1.5/.hg_archival.txt 1970-01-01 01:00:00.000000000 +0100 @@ -1,4 +0,0 @@ -repo: ab039beb22dc9d53f224a5ef2ef88d534b561898 -node: ce7195d2b80e4df9857e434fa29689fd678a2341 -branch: default -tag: RELEASE-4.1.3 diff -Nru xen-4.1.3/.hgsigs xen-4.1.5/.hgsigs --- xen-4.1.3/.hgsigs 2012-08-09 22:08:04.000000000 +0200 +++ xen-4.1.5/.hgsigs 2013-04-23 
18:44:20.000000000 +0200 @@ -18,3 +18,7 @@ da64f68730cf1c42c06919578e70d8bc01041051 0 iQEcBAABAgAGBQJPp8OUAAoJEIP+FMlX6CvZRkEIAKp5iVEADZyijVw0Jwj1vUWKqHJYVONzNjzRcnavWAEzsuwbAxQ6QfMJIai2ThjF79M2w7fPXY03S/vCV4/bXVE9R9s2/IUmS9B6pK+DAhw3ExuNUfsxq9UZd3Iul6hWifjjouYnBmgUtpF7O5z4pfQ+r1+z58FpIYPrv39NARt5YW7tcPeUJh4gOJ0ugORc5CclZqLLiljjIbVY6DN+jJDzjqCAwbWLGbkVw4kEGAeWI6aP3/5ZDpnk9Yytp9GpZ8d3BpmlHaR/kY6xepmZUqBPFGKUGY437+1jKWGgUYPLt2RC0S88W4iLRW6b9HXd7u3bhrn36ERz8XZ10KqjH7A= acbd3617691397911f34e4574d03385c08aec900 0 iQEcBAABAgAGBQJP3zbaAAoJEIP+FMlX6CvZoMUH/1TQcdw+e/7BmxtXBnMIrpiTJ7/tffSBYurcoQFq1cTaJJgz5in8iq1JWHgru/ToYQ9PaWY0wVQcb1Yj40rCGNnASlSzQqgRQbYMmZpKd0+TESDtMkl6q1FXECrs8ag/HMHwkVYsgdAEmQ/7IouRK4kBOXXzSWhMRU24YkHdJAnQCcXD9L99Yjmrr5oxF/fgVG7WnhfTGlhpu7FaUeWlDjBRlIuw6HeNnXMwubAn569dGXyPdwJnbU0nCLRrQGjQn7DsmeN25gL4R5Pz+uhp4eeGB7ORYT/mj5+xeS2Cjb3XfptV3qAW2FJVYRLit7lp5cmsKvtBnr8mAO8GS0R+8Pg= 5cdcfed7b5b129843e1602b5d43c7651de337092 0 iQEcBAABAgAGBQJQDB6TAAoJEIP+FMlX6CvZ+H8IAJbWR4PrKOt3gMpgEYdADts96vtduD3oet5C+l8FSlo0pDPtF32wPQ5tQz+Ll8OtCFckSIzobsw+9IMrZ38nRwP1UM2LgLUuo6WVVwYZ4DKVIntDrC1DV6Us1CmGiHiTHqPNDypBB2NponJ21rlD8zRY4Q661BgdKXVwqq5H6SDtxNRSn7RPDYnsIvavabr0fvcR38YOHVG4TvfXP+uge0UfEvIurGEBnTn25E0vadLG9la9SGKeEm8HuTDnzuxQmSic7tPdodQ0oQYQ5AAj+/mdW2B9uaCDsmOeP4udDNcV4yXxdLxNA2GkeSSJ/+U0hj2HBaHZvd+hvAeHBZGdMAU= +ce7195d2b80e4df9857e434fa29689fd678a2341 0 iQEcBAABAgAGBQJQI9u1AAoJEIP+FMlX6CvZU88IAKKz8mw53zvvdEFaV39669d+SFATRyatb9OF6L1O6RQczsdxI7koyGyLL64V6NvpMB3RLFrh3dD+1ZIy4W13AWuDSnby45oIijHpcQqZ+Zt1ijxDj/wuViSC+4S+sdkgCDfmMyGb5a8XwnoKIWb6EOJVT8FFjuVJ3UsTXfujCyDdDeKoFH7ZGfq+r7k0DoANqkNi2AIhxUrw/h/9ydNffd4Gh/BUsHHcx5DEfXUplbT3pPUvq6o38rsqM80xZNR7lgL7chxFHjs94OOqX71gEZw0sR1hi+gbL8eJTIsep4kM4Z0aOYJJCqUFYv5C31Nn54GWECKJSNSWtBo8hIqLpI0= +500194a883bdcf77b2a0fc87de291b0c27435207 0 iQEcBAABAgAGBQJQpLjeAAoJEIP+FMlX6CvZaUUIAKXHEVEfhnGEUAnm0gA0Mao1W8yo7BrsIhgZ/TyTYmhaOBzF3ez9+oUDQ6XWAJ9zbyq4ZWBsDbKFJCSwF8nvPL2OLG2DGyqoi7UckPZshQi71h91cT8v/snsp4A2161NEh9Tyti7YrygCzZldgjYy0vVAyFGmML+swvcBI9Zufb2Vk+Lm4R6EgmmTSO+EHEPg0QDtjLcUICSPcQgzRBe2QB/xzn/h8yWJ8FoOCyNDu3zQU1RKoP3lVMT0vK6Pr/l6NC7sy2Npfhe44ysvKHCpzQD3kZMhvzLujVnm1xJs0iKifUgHLsAO7RP55i7AfUgXqEMHaMxwlRSNWBmOsLSeho= +0125069bc1b29b617f1e4f4f2c2fcb8547ecd45d 0 iQEcBAABAgAGBQJQyFD/AAoJEIP+FMlX6CvZdAAH/39wnElEv6/KMSdVNXvX2Fz5Q4jzw23muZsXF7e+vAJ2LOrg2vzzAyq4Clx6h2Rw0cvSHtZC8f+RbFA4cnMcm9yxAu9jO1+GViRMLi2yNaNIMiQ6+w4DCBtscRPa+t5fqVnUcSkC3xm/V1Us4whICL40AFtpi3u78d4DhbnbkH4NAbRutZD4RrI8frrTyu8X6aLVzVK+Am1yb69Y0BDQldI9L3a89o9zH3yASyo98Suvo9p+tipYv2YJMaTZaVFbPhcakg3ZnB5OVOdel1K16Rf3TB1qE+Yj654U0fFx4Tvx28q94n+HzQQiIDSanv6v1GL/lcQtcZwQtgrx4e+3jqc= +12c4c4c0a715c8daf08222f0b4cebcf2ec3bd3a0 0 iQEcBAABAgAGBQJQ0Gc4AAoJEIP+FMlX6CvZmJcH/1QBx8vOWMUt05lO9qFtU7AT/+g0Zxgr8dNL1cI0ntn+EXoRjta4mBuKuLq35zhwzBpKOguOBVwrlwt6YTx/w/f4RtBkwqvsgZT7BJQF/iXnD8XisJIdMB9g97W3b5qLag873MzEvnE1uJJ8y82qpo4IkIEs0SkNhKFI1dPDKyn+uQFOHOQzG5lgV3IKON+R9aCGnhmMfJukufETMtaOG7aBs7a6iftDmywGnAq6zEVNDsV4KTIC21vME6mewg8WQlGf83zTBV4pFUIzyOekSPqYddNUuRu+vAT3VAInAbmb2cHZ3i/xb0tGstbrGnjNkTTHCsy9JwxtTfiqJRhBPT8= diff -Nru xen-4.1.3/.hgtags xen-4.1.5/.hgtags --- xen-4.1.3/.hgtags 2012-08-09 22:08:04.000000000 +0200 +++ xen-4.1.5/.hgtags 2013-04-23 18:44:20.000000000 +0200 @@ -65,3 +65,7 @@ da64f68730cf1c42c06919578e70d8bc01041051 4.1.3-rc1 acbd3617691397911f34e4574d03385c08aec900 4.1.3-rc2 5cdcfed7b5b129843e1602b5d43c7651de337092 4.1.3-rc3 +ce7195d2b80e4df9857e434fa29689fd678a2341 RELEASE-4.1.3 +500194a883bdcf77b2a0fc87de291b0c27435207 4.1.4-rc1 +0125069bc1b29b617f1e4f4f2c2fcb8547ecd45d 4.1.4-rc2 
+12c4c4c0a715c8daf08222f0b4cebcf2ec3bd3a0 RELEASE-4.1.4 diff -Nru xen-4.1.3/MAINTAINERS xen-4.1.5/MAINTAINERS --- xen-4.1.3/MAINTAINERS 2012-08-09 22:08:04.000000000 +0200 +++ xen-4.1.5/MAINTAINERS 2013-04-23 18:44:20.000000000 +0200 @@ -1,5 +1,6 @@ List of maintainers and how to submit changes + ============================================= Please try to follow the guidelines below. This will make things easier on the maintainers. Not all of these guidelines matter for every @@ -15,7 +16,11 @@ 'diff -u' to make the patch easy to merge. Be prepared to get your changes sent back with seemingly silly requests about formatting and variable names. These aren't as silly as they seem. One - job the maintainersdo is to keep things looking the same. + job the maintainers do is to keep things looking the same. + + PLEASE see http://wiki.xen.org/wiki/Submitting_Xen_Patches for + hints on how to submit a patch to xen-unstable in a suitable + form. PLEASE try to include any credit lines you want added with the patch. It avoids people being missed off by mistake and makes @@ -34,6 +39,28 @@ 5. Happy hacking. + + Stable Release Maintenance + ========================== + +The policy for inclusion in a Xen stable release is different to that +for inclusion in xen-unstable. + +Please see http://wiki.xen.org/wiki/Xen_Maintenance_Releases for more +information. + +Remember to copy the stable branch maintainer. The maintainer for this +branch is: + + Jan Beulich <jbeulich@suse.com> + +Tools backport requests should also be copied to: + + Ian Jackson <Ian.Jackson@eu.citrix.com> + + Unstable Subsystem Maintainers + ============================== + Descriptions of section entries: M: Mail patches to: FullName <address@domain> diff -Nru xen-4.1.3/qemu/console.c xen-4.1.5/qemu/console.c --- xen-4.1.3/qemu/console.c 2012-04-24 19:35:40.000000000 +0200 +++ xen-4.1.5/qemu/console.c 2013-01-17 17:01:00.000000000 +0100 @@ -794,6 +794,26 @@ update_xy(s, x, y); } +/* set cursor, checking bounds */ +static void set_cursor(TextConsole *s, int x, int y) +{ + if (x < 0) { + x = 0; + } + if (y < 0) { + y = 0; + } + if (y >= s->height) { + y = s->height - 1; + } + if (x >= s->width) { + x = s->width - 1; + } + + s->x = x; + s->y = y; +} + static void console_putchar(TextConsole *s, int ch) { TextCell *c; @@ -869,7 +889,8 @@ s->esc_params[s->nb_esc_params] * 10 + ch - '0'; } } else { - s->nb_esc_params++; + if (s->nb_esc_params < MAX_ESC_PARAMS) + s->nb_esc_params++; if (ch == ';') break; #ifdef DEBUG_CONSOLE @@ -883,59 +904,37 @@ if (s->esc_params[0] == 0) { s->esc_params[0] = 1; } - s->y -= s->esc_params[0]; - if (s->y < 0) { - s->y = 0; - } + set_cursor(s, s->x, s->y - s->esc_params[0]); break; case 'B': /* move cursor down */ if (s->esc_params[0] == 0) { s->esc_params[0] = 1; } - s->y += s->esc_params[0]; - if (s->y >= s->height) { - s->y = s->height - 1; - } + set_cursor(s, s->x, s->y + s->esc_params[0]); break; case 'C': /* move cursor right */ if (s->esc_params[0] == 0) { s->esc_params[0] = 1; } - s->x += s->esc_params[0]; - if (s->x >= s->width) { - s->x = s->width - 1; - } + set_cursor(s, s->x + s->esc_params[0], s->y); break; case 'D': /* move cursor left */ if (s->esc_params[0] == 0) { s->esc_params[0] = 1; } - s->x -= s->esc_params[0]; - if (s->x < 0) { - s->x = 0; - } + set_cursor(s, s->x - s->esc_params[0], s->y); break; case 'G': /* move cursor to column */ - s->x = s->esc_params[0] - 1; - if (s->x < 0) { - s->x = 0; - } + set_cursor(s, s->esc_params[0] - 1, s->y); break; case 'f': case 'H': /* move cursor to row, column */ - s->x = s->esc_params[1] - 1; - if (s->x < 0) { -
s->x = 0; - } - s->y = s->esc_params[0] - 1; - if (s->y < 0) { - s->y = 0; - } + set_cursor(s, s->esc_params[1] - 1, s->esc_params[0] - 1); break; case 'J': switch (s->esc_params[0]) { diff -Nru xen-4.1.3/qemu/hw/e1000.c xen-4.1.5/qemu/hw/e1000.c --- xen-4.1.3/qemu/hw/e1000.c 2012-04-24 19:35:40.000000000 +0200 +++ xen-4.1.5/qemu/hw/e1000.c 2013-01-17 17:01:00.000000000 +0100 @@ -55,6 +55,11 @@ #define REG_IOADDR 0x0 #define REG_IODATA 0x4 +/* this is the size past which hardware will drop packets when setting LPE=0 */ +#define MAXIMUM_ETHERNET_VLAN_SIZE 1522 +/* this is the size past which hardware will drop packets when setting LPE=1 */ +#define MAXIMUM_ETHERNET_LPE_SIZE 16384 + /* * HW models: * E1000_DEV_ID_82540EM works with Windows and Linux @@ -628,6 +633,15 @@ return; } + /* Discard oversized packets if !LPE and !SBP. */ + if ((size > MAXIMUM_ETHERNET_LPE_SIZE || + (size > MAXIMUM_ETHERNET_VLAN_SIZE + && !(s->mac_reg[RCTL] & E1000_RCTL_LPE))) + && !(s->mac_reg[RCTL] & E1000_RCTL_SBP)) { + DBGOUT(RX, "packet too large for applicable LPE/VLAN size\n"); + return; + } + if (!receive_filter(s, buf, size)) return; diff -Nru xen-4.1.3/qemu/hw/pass-through.c xen-4.1.5/qemu/hw/pass-through.c --- xen-4.1.3/qemu/hw/pass-through.c 2012-04-24 19:35:40.000000000 +0200 +++ xen-4.1.5/qemu/hw/pass-through.c 2013-01-17 17:01:00.000000000 +0100 @@ -3803,21 +3803,18 @@ PT_LOG("guest enabling MSI, disable MSI-INTx translation\n"); pt_disable_msi_translate(ptdev); } - else + /* Init physical one */ + PT_LOG("setup msi for dev %x\n", pd->devfn); + if (pt_msi_setup(ptdev)) { - /* Init physical one */ - PT_LOG("setup msi for dev %x\n", pd->devfn); - if (pt_msi_setup(ptdev)) - { - /* We do not broadcast the error to the framework code, so - * that MSI errors are contained in MSI emulation code and - * QEMU can go on running. - * Guest MSI would be actually not working. - */ - *value &= ~PCI_MSI_FLAGS_ENABLE; - PT_LOG("Warning: Can not map MSI for dev %x\n", pd->devfn); - return 0; - } + /* We do not broadcast the error to the framework code, so + * that MSI errors are contained in MSI emulation code and + * QEMU can go on running. + * Guest MSI would be actually not working. 
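
The e1000 hunk above drops frames that exceed the hardware limits unless the guest has enabled long-packet reception (LPE) or asked to store bad packets (SBP). A minimal sketch of the same predicate; the two RCTL bit positions are assumptions for illustration, not the real E1000_RCTL_* definitions:

    #include <stddef.h>

    /* Nonzero means drop the frame, mirroring the check in e1000.c. */
    static int should_drop_frame(size_t size, unsigned int rctl)
    {
        const unsigned int RCTL_LPE = 1u << 5; /* long packet enable (assumed bit) */
        const unsigned int RCTL_SBP = 1u << 2; /* store bad packets (assumed bit) */
        size_t limit = (rctl & RCTL_LPE) ? 16384 : 1522;

        if (size <= limit)
            return 0;
        return !(rctl & RCTL_SBP); /* SBP overrides the size limit */
    }
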
+ */ + *value &= ~PCI_MSI_FLAGS_ENABLE; + PT_LOG("Warning: Can not map MSI for dev %x\n", pd->devfn); + return 0; } if (pt_msi_update(ptdev)) { diff -Nru xen-4.1.3/qemu/hw/pt-msi.c xen-4.1.5/qemu/hw/pt-msi.c --- xen-4.1.3/qemu/hw/pt-msi.c 2012-04-24 19:35:40.000000000 +0200 +++ xen-4.1.5/qemu/hw/pt-msi.c 2013-01-17 17:01:00.000000000 +0100 @@ -263,16 +263,8 @@ uint8_t e_device = 0; uint8_t e_intx = 0; - /* MSI_ENABLE bit should be disabed until the new handler is set */ - msi_set_enable(dev, 0); - - e_device = PCI_SLOT(dev->dev.devfn); - e_intx = pci_intx(dev); - - if (xc_domain_unbind_pt_irq(xc_handle, domid, dev->msi->pirq, - PT_IRQ_TYPE_MSI_TRANSLATE, 0, - e_device, e_intx, 0)) - PT_LOG("Error: Unbinding pt irq for MSI-INTx failed!\n"); + pt_msi_disable(dev); + dev->msi->flags |= MSI_FLAG_UNINIT; if (dev->machine_irq) { @@ -280,8 +272,6 @@ 0, e_device, e_intx)) PT_LOG("Error: Rebinding of interrupt failed!\n"); } - - dev->msi_trans_en = 0; } /* MSI-X virtulization functions */ diff -Nru xen-4.1.3/qemu/hw/xen_machine_fv.c xen-4.1.5/qemu/hw/xen_machine_fv.c --- xen-4.1.3/qemu/hw/xen_machine_fv.c 2012-04-24 19:35:40.000000000 +0200 +++ xen-4.1.5/qemu/hw/xen_machine_fv.c 2013-01-17 17:01:00.000000000 +0100 @@ -181,9 +181,6 @@ unsigned long paddr_index; int found = 0; - if (last_address_vaddr == buffer) - last_address_index = ~0UL; - TAILQ_FOREACH(reventry, &locked_entries, next) { if (reventry->vaddr_req == buffer) { paddr_index = reventry->paddr_index; @@ -201,6 +198,10 @@ TAILQ_REMOVE(&locked_entries, reventry, next); qemu_free(reventry); + if (last_address_index == paddr_index) { + last_address_index = ~0UL; + } + entry = &mapcache_entry[paddr_index % nr_buckets]; while (entry && entry->paddr_index != paddr_index) { pentry = entry; diff -Nru xen-4.1.3/qemu/vl.c xen-4.1.5/qemu/vl.c --- xen-4.1.3/qemu/vl.c 2012-04-24 19:35:40.000000000 +0200 +++ xen-4.1.5/qemu/vl.c 2013-01-17 17:01:00.000000000 +0100 @@ -4910,7 +4910,7 @@ kernel_cmdline = ""; cyls = heads = secs = 0; translation = BIOS_ATA_TRANSLATION_AUTO; - monitor_device = "vc:80Cx24C"; + monitor_device = "null"; serial_devices[0] = "vc:80Cx24C"; for(i = 1; i < MAX_SERIAL_PORTS; i++) diff -Nru xen-4.1.3/qemu/xenstore.c xen-4.1.5/qemu/xenstore.c --- xen-4.1.3/qemu/xenstore.c 2012-04-24 19:35:40.000000000 +0200 +++ xen-4.1.5/qemu/xenstore.c 2013-01-17 17:01:00.000000000 +0100 @@ -643,7 +643,7 @@ } pstrcpy(bs->filename, sizeof(bs->filename), params); - flags = BDRV_O_NOCACHE; + flags = BDRV_O_CACHE_WB; /* snapshot and write-back */ is_readonly = 0; if (pasprintf(&buf, "%s/mode", bpath) == -1) continue; diff -Nru xen-4.1.3/stubdom/grub/kexec.c xen-4.1.5/stubdom/grub/kexec.c --- xen-4.1.3/stubdom/grub/kexec.c 2012-08-09 22:08:05.000000000 +0200 +++ xen-4.1.5/stubdom/grub/kexec.c 2013-04-23 18:44:20.000000000 +0200 @@ -137,6 +137,10 @@ dom = xc_dom_allocate(xc_handle, cmdline, features); dom->allocate = kexec_allocate; + /* We are using guest owned memory, therefore no limits. 
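
The xen_machine_fv.c change above reorders the mapcache teardown: the cached last-used index is now invalidated after the locked entry has been looked up, and the comparison is on the bucket index rather than the virtual address. A rough outline of the corrected order, with the TAILQ and bucket code reduced to assumed helpers:

    /* Sketch only; these prototypes stand in for the qemu mapcache code. */
    int lookup_locked_entry(void *buffer, unsigned long *paddr_index);
    void drop_locked_entry(void *buffer);
    void evict_mapcache_bucket(unsigned long paddr_index);

    static unsigned long last_address_index = ~0UL;

    void unmap_locked_buffer(void *buffer)
    {
        unsigned long paddr_index;

        if (lookup_locked_entry(buffer, &paddr_index) != 0)
            return;                            /* no such mapping */
        drop_locked_entry(buffer);
        if (last_address_index == paddr_index) /* invalidate by index... */
            last_address_index = ~0UL;         /* ...after the lookup */
        evict_mapcache_bucket(paddr_index);
    }
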
*/ + xc_dom_kernel_max_size(dom, 0); + xc_dom_ramdisk_max_size(dom, 0); + dom->kernel_blob = kernel; dom->kernel_size = kernel_size; diff -Nru xen-4.1.3/tools/blktap2/control/tap-ctl.h xen-4.1.5/tools/blktap2/control/tap-ctl.h --- xen-4.1.3/tools/blktap2/control/tap-ctl.h 2012-08-09 22:08:05.000000000 +0200 +++ xen-4.1.5/tools/blktap2/control/tap-ctl.h 2013-04-23 18:44:20.000000000 +0200 @@ -76,7 +76,7 @@ int tap_ctl_list(tap_list_t ***list); void tap_ctl_free_list(tap_list_t **list); -int tap_ctl_find_minor(const char *type, const char *path); +int tap_ctl_find(const char *type, const char *path, tap_list_t *tap); int tap_ctl_allocate(int *minor, char **devname); int tap_ctl_free(const int minor); diff -Nru xen-4.1.3/tools/blktap2/control/tap-ctl-list.c xen-4.1.5/tools/blktap2/control/tap-ctl-list.c --- xen-4.1.3/tools/blktap2/control/tap-ctl-list.c 2012-08-09 22:08:05.000000000 +0200 +++ xen-4.1.5/tools/blktap2/control/tap-ctl-list.c 2013-04-23 18:44:20.000000000 +0200 @@ -506,17 +506,15 @@ } int -tap_ctl_find_minor(const char *type, const char *path) +tap_ctl_find(const char *type, const char *path, tap_list_t *tap) { tap_list_t **list, **_entry; - int minor, err; + int ret = -ENOENT, err; err = tap_ctl_list(&list); if (err) return err; - minor = -1; - for (_entry = list; *_entry != NULL; ++_entry) { tap_list_t *entry = *_entry; @@ -526,11 +524,13 @@ if (path && (!entry->path || strcmp(entry->path, path))) continue; - minor = entry->minor; + *tap = *entry; + tap->type = tap->path = NULL; + ret = 0; break; } tap_ctl_free_list(list); - return minor >= 0 ? minor : -ENOENT; + return ret; } diff -Nru xen-4.1.3/tools/blktap2/drivers/md5.c xen-4.1.5/tools/blktap2/drivers/md5.c --- xen-4.1.3/tools/blktap2/drivers/md5.c 2012-08-09 22:08:05.000000000 +0200 +++ xen-4.1.5/tools/blktap2/drivers/md5.c 2013-04-23 18:44:20.000000000 +0200 @@ -174,7 +174,7 @@ MD5Transform(ctx->buf, (uint32_t *) ctx->in); byteReverse((unsigned char *) ctx->buf, 4); memcpy(digest, ctx->buf, 16); - memset(ctx, 0, sizeof(ctx)); /* In case it's sensitive */ + memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */ } /* The four core functions - F1 is optimized somewhat */ diff -Nru xen-4.1.3/tools/debugger/kdd/kdd-xen.c xen-4.1.5/tools/debugger/kdd/kdd-xen.c --- xen-4.1.3/tools/debugger/kdd/kdd-xen.c 2012-08-09 22:08:05.000000000 +0200 +++ xen-4.1.5/tools/debugger/kdd/kdd-xen.c 2013-04-23 18:44:20.000000000 +0200 @@ -333,7 +333,7 @@ if (!cpu) return -1; - memset(r, 0, sizeof(r)); + memset(r, 0, sizeof(*r)); if (w64) kdd_get_regs_x86_64(cpu, &r->r64); diff -Nru xen-4.1.3/tools/firmware/hvmloader/xenbus.c xen-4.1.5/tools/firmware/hvmloader/xenbus.c --- xen-4.1.3/tools/firmware/hvmloader/xenbus.c 2012-08-09 22:08:05.000000000 +0200 +++ xen-4.1.5/tools/firmware/hvmloader/xenbus.c 2013-04-23 18:44:20.000000000 +0200 @@ -56,6 +56,8 @@ /* Reset the xenbus connection so the next kernel can start again. */ void xenbus_shutdown(void) { + struct shared_info *shinfo = get_shared_info(); + ASSERT(rings != NULL); /* We zero out the whole ring -- the backend can handle this, and it's @@ -64,7 +66,9 @@ memset(rings, 0, sizeof *rings); /* Clear the event-channel state too. 
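
The md5.c and kdd-xen.c hunks above fix the same C mistake, and it recurs in xc_dom_boot.c, libnetlink.c and xenstat_linux.c elsewhere in this patch: memset(p, 0, sizeof(p)) zeroes only the size of the pointer, 4 or 8 bytes, not the object it points to. In isolation:

    #include <string.h>

    struct md5_ctx { unsigned int buf[4]; unsigned int in[16]; };

    static void wipe(struct md5_ctx *ctx)
    {
        /* Wrong: sizeof(ctx) is sizeof(struct md5_ctx *), i.e. 4 or 8. */
        /* memset(ctx, 0, sizeof(ctx)); */

        /* Right: sizeof(*ctx) is the size of the whole structure. */
        memset(ctx, 0, sizeof(*ctx));
    }
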
*/ - memset(get_shared_info(), 0, PAGE_SIZE); + memset(shinfo->vcpu_info, 0, sizeof(shinfo->vcpu_info)); + memset(shinfo->evtchn_pending, 0, sizeof(shinfo->evtchn_pending)); + memset(shinfo->evtchn_mask, 0, sizeof(shinfo->evtchn_mask)); rings = NULL; } diff -Nru xen-4.1.3/tools/hotplug/Linux/network-nat xen-4.1.5/tools/hotplug/Linux/network-nat --- xen-4.1.3/tools/hotplug/Linux/network-nat 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/hotplug/Linux/network-nat 2013-04-23 18:44:20.000000000 +0200 @@ -1,4 +1,4 @@ -#!/bin/bash -x +#!/bin/bash #============================================================================ # Default Xen network start/stop script when using NAT. # Xend calls a network script when it starts. diff -Nru xen-4.1.3/tools/libxc/xc_cpufeature.h xen-4.1.5/tools/libxc/xc_cpufeature.h --- xen-4.1.3/tools/libxc/xc_cpufeature.h 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxc/xc_cpufeature.h 2013-04-23 18:44:20.000000000 +0200 @@ -96,6 +96,7 @@ #define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */ #define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */ #define X86_FEATURE_PDCM (4*32+15) /* Perf/Debug Capability MSR */ +#define X86_FEATURE_PCID (4*32+17) /* Process Context ID */ #define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */ #define X86_FEATURE_SSE4_1 (4*32+19) /* Streaming SIMD Extensions 4.1 */ #define X86_FEATURE_SSE4_2 (4*32+20) /* Streaming SIMD Extensions 4.2 */ @@ -146,5 +147,6 @@ #define X86_FEATURE_FSGSBASE (7*32+ 0) /* {RD,WR}{FS,GS}BASE instructions */ #define X86_FEATURE_SMEP (7*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_ERMS (7*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID (7*32+10) /* Invalidate Process Context ID */ #endif /* __LIBXC_CPUFEATURE_H */ diff -Nru xen-4.1.3/tools/libxc/xc_cpuid_x86.c xen-4.1.5/tools/libxc/xc_cpuid_x86.c --- xen-4.1.3/tools/libxc/xc_cpuid_x86.c 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxc/xc_cpuid_x86.c 2013-04-23 18:44:20.000000000 +0200 @@ -421,6 +421,7 @@ } clear_bit(X86_FEATURE_XTPR, regs[2]); clear_bit(X86_FEATURE_PDCM, regs[2]); + clear_bit(X86_FEATURE_PCID, regs[2]); clear_bit(X86_FEATURE_DCA, regs[2]); set_bit(X86_FEATURE_HYPERVISOR, regs[2]); break; diff -Nru xen-4.1.3/tools/libxc/xc_dom_boot.c xen-4.1.5/tools/libxc/xc_dom_boot.c --- xen-4.1.3/tools/libxc/xc_dom_boot.c 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxc/xc_dom_boot.c 2013-04-23 18:44:20.000000000 +0200 @@ -265,7 +265,7 @@ return rc; /* let the vm run */ - memset(ctxt, 0, sizeof(ctxt)); + memset(ctxt, 0, sizeof(*ctxt)); if ( (rc = dom->arch_hooks->vcpu(dom, ctxt)) != 0 ) return rc; xc_dom_unmap_all(dom); diff -Nru xen-4.1.3/tools/libxc/xc_dom_bzimageloader.c xen-4.1.5/tools/libxc/xc_dom_bzimageloader.c --- xen-4.1.3/tools/libxc/xc_dom_bzimageloader.c 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxc/xc_dom_bzimageloader.c 2013-04-23 18:44:20.000000000 +0200 @@ -47,13 +47,19 @@ char *out_buf; char *tmp_buf; int retval = -1; - int outsize; + unsigned int outsize; uint64_t total; stream.bzalloc = NULL; stream.bzfree = NULL; stream.opaque = NULL; + if ( dom->kernel_size == 0) + { + DOMPRINTF("BZIP2: Input is 0 size"); + return -1; + } + ret = BZ2_bzDecompressInit(&stream, 0, 0); if ( ret != BZ_OK ) { @@ -66,6 +72,17 @@ * the input buffer to start, and we'll realloc as needed. */ outsize = dom->kernel_size; + + /* + * stream.avail_in and outsize are unsigned int, while kernel_size + * is a size_t. Check we aren't overflowing. 
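
The hvmloader hunk above stops wiping the whole shared-info page on xenbus shutdown and clears only the vcpu_info, evtchn_pending and evtchn_mask arrays, which leaves fields such as the wallclock untouched. The shape of the change, with a cut-down structure standing in for the real shared_info from the public headers:

    #include <string.h>

    /* Illustrative layout only, not the real struct shared_info. */
    struct shared_info_like {
        struct { char pad[64]; } vcpu_info[32];
        unsigned long evtchn_pending[64];
        unsigned long evtchn_mask[64];
        unsigned int wc_version, wc_sec, wc_nsec; /* must survive */
    };

    static void clear_event_state(struct shared_info_like *s)
    {
        memset(s->vcpu_info, 0, sizeof(s->vcpu_info));
        memset(s->evtchn_pending, 0, sizeof(s->evtchn_pending));
        memset(s->evtchn_mask, 0, sizeof(s->evtchn_mask));
        /* deliberately leaves wc_* alone */
    }
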
+ */ + if ( outsize != dom->kernel_size ) + { + DOMPRINTF("BZIP2: Input too large"); + goto bzip2_cleanup; + } + out_buf = malloc(outsize); if ( out_buf == NULL ) { @@ -98,13 +115,20 @@ if ( stream.avail_out == 0 ) { /* Protect against output buffer overflow */ - if ( outsize > INT_MAX / 2 ) + if ( outsize > UINT_MAX / 2 ) { DOMPRINTF("BZIP2: output buffer overflow"); free(out_buf); goto bzip2_cleanup; } + if ( xc_dom_kernel_check_size(dom, outsize * 2) ) + { + DOMPRINTF("BZIP2: output too large"); + free(out_buf); + goto bzip2_cleanup; + } + tmp_buf = realloc(out_buf, outsize * 2); if ( tmp_buf == NULL ) { @@ -163,22 +187,26 @@ #include <lzma.h> -static int xc_try_lzma_decode( - struct xc_dom_image *dom, void **blob, size_t *size) +static int _xc_try_lzma_decode( + struct xc_dom_image *dom, void **blob, size_t *size, + lzma_stream *stream, lzma_ret ret, const char *what) { - lzma_stream stream = LZMA_STREAM_INIT; - lzma_ret ret; lzma_action action = LZMA_RUN; unsigned char *out_buf; unsigned char *tmp_buf; int retval = -1; - int outsize; + size_t outsize; const char *msg; - ret = lzma_alone_decoder(&stream, 128*1024*1024); + if ( dom->kernel_size == 0) + { + DOMPRINTF("%s: Input is 0 size", what); + return -1; + } + if ( ret != LZMA_OK ) { - DOMPRINTF("LZMA: Failed to init stream decoder"); + DOMPRINTF("%s: Failed to init decoder", what); return -1; } @@ -190,22 +218,22 @@ out_buf = malloc(outsize); if ( out_buf == NULL ) { - DOMPRINTF("LZMA: Failed to alloc memory"); + DOMPRINTF("%s: Failed to alloc memory", what); goto lzma_cleanup; } - stream.next_in = dom->kernel_blob; - stream.avail_in = dom->kernel_size; + stream->next_in = dom->kernel_blob; + stream->avail_in = dom->kernel_size; - stream.next_out = out_buf; - stream.avail_out = dom->kernel_size; + stream->next_out = out_buf; + stream->avail_out = dom->kernel_size; for ( ; ; ) { - ret = lzma_code(&stream, action); + ret = lzma_code(stream, action); if ( ret == LZMA_STREAM_END ) { - DOMPRINTF("LZMA: Saw data stream end"); + DOMPRINTF("%s: Saw data stream end", what); retval = 0; break; } @@ -242,18 +270,25 @@ msg = "Internal program error (bug)"; break; } - DOMPRINTF("%s: LZMA decompression error %s", - __FUNCTION__, msg); + DOMPRINTF("%s: %s decompression error %s", + __FUNCTION__, what, msg); free(out_buf); goto lzma_cleanup; } - if ( stream.avail_out == 0 ) + if ( stream->avail_out == 0 ) { /* Protect against output buffer overflow */ - if ( outsize > INT_MAX / 2 ) + if ( outsize > SIZE_MAX / 2 ) + { + DOMPRINTF("%s: output buffer overflow", what); + free(out_buf); + goto lzma_cleanup; + } + + if ( xc_dom_kernel_check_size(dom, outsize * 2) ) { - DOMPRINTF("LZMA: output buffer overflow"); + DOMPRINTF("%s: output too large", what); free(out_buf); goto lzma_cleanup; } @@ -261,32 +296,61 @@ tmp_buf = realloc(out_buf, outsize * 2); if ( tmp_buf == NULL ) { - DOMPRINTF("LZMA: Failed to realloc memory"); + DOMPRINTF("%s: Failed to realloc memory", what); free(out_buf); goto lzma_cleanup; } out_buf = tmp_buf; - stream.next_out = out_buf + outsize; - stream.avail_out = (outsize * 2) - outsize; + stream->next_out = out_buf + outsize; + stream->avail_out = (outsize * 2) - outsize; outsize *= 2; } } - DOMPRINTF("%s: LZMA decompress OK, 0x%zx -> 0x%zx", - __FUNCTION__, *size, (size_t)stream.total_out); + DOMPRINTF("%s: %s decompress OK, 0x%zx -> 0x%zx", - __FUNCTION__, *size, (size_t)stream.total_out); + __FUNCTION__, what, *size, (size_t)stream->total_out); *blob = out_buf; - *size = stream.total_out; + *size = stream->total_out; lzma_cleanup: - lzma_end(&stream); + lzma_end(stream); return retval;
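
The decompression loops above grow the output buffer by doubling and now guard that doubling twice: once against arithmetic overflow of the size type, once against the configurable kernel-size policy. The pattern on its own, with max_size == 0 meaning "no limit" as in xc_dom_kernel_check_size():

    #include <limits.h>
    #include <stdlib.h>

    /* Returns the grown buffer, or NULL (caller still owns buf). */
    static void *grow_output(void *buf, unsigned int *outsize, size_t max_size)
    {
        void *tmp;

        if (*outsize > UINT_MAX / 2)
            return NULL;                           /* doubling would overflow */
        if (max_size && (size_t)*outsize * 2 > max_size)
            return NULL;                           /* over the policy cap */
        tmp = realloc(buf, *outsize * 2);
        if (tmp != NULL)
            *outsize *= 2;
        return tmp;
    }
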
} +/* 128 Mb is the minimum size (half-way) documented to work for all inputs. */ +#define LZMA_BLOCK_SIZE (128*1024*1024) + +static int xc_try_xz_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + lzma_stream stream = LZMA_STREAM_INIT; + lzma_ret ret = lzma_stream_decoder(&stream, LZMA_BLOCK_SIZE, 0); + + return _xc_try_lzma_decode(dom, blob, size, &stream, ret, "XZ"); +} + +static int xc_try_lzma_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + lzma_stream stream = LZMA_STREAM_INIT; + lzma_ret ret = lzma_alone_decoder(&stream, LZMA_BLOCK_SIZE); + + return _xc_try_lzma_decode(dom, blob, size, &stream, ret, "LZMA"); +} + #else /* !defined(HAVE_LZMA) */ +static int xc_try_xz_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + DOMPRINTF("%s: XZ decompress support unavailable", + __FUNCTION__); + return -1; +} + static int xc_try_lzma_decode( struct xc_dom_image *dom, void **blob, size_t *size) { @@ -327,6 +391,12 @@ 0x89, 0x4c, 0x5a, 0x4f, 0x00, 0x0d, 0x0a, 0x1a, 0x0a }; + /* + * lzo_uint should match size_t. Check that this is the case to be + * sure we won't overflow various lzo_uint fields. + */ + XC_BUILD_BUG_ON(sizeof(lzo_uint) != sizeof(size_t)); + ret = lzo_init(); if ( ret != LZO_E_OK ) { @@ -406,6 +476,14 @@ if ( src_len <= 0 || src_len > dst_len || src_len > left ) break; + msg = "Output buffer overflow"; + if ( *size > SIZE_MAX - dst_len ) + break; + + msg = "Decompressed image too large"; + if ( xc_dom_kernel_check_size(dom, *size + dst_len) ) + break; + msg = "Failed to (re)alloc memory"; tmp_buf = realloc(out_buf, *size + dst_len); if ( tmp_buf == NULL ) @@ -607,6 +685,17 @@ __FUNCTION__); return -EINVAL; } + } + else if ( check_magic(dom, "\3757zXZ", 6) ) + { + ret = xc_try_xz_decode(dom, &dom->kernel_blob, &dom->kernel_size); + if ( ret < 0 ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, + "%s unable to XZ decompress kernel", + __FUNCTION__); + return -EINVAL; + } } else if ( check_magic(dom, "\135\000", 2) ) { diff -Nru xen-4.1.3/tools/libxc/xc_dom_core.c xen-4.1.5/tools/libxc/xc_dom_core.c --- xen-4.1.3/tools/libxc/xc_dom_core.c 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxc/xc_dom_core.c 2013-04-23 18:44:20.000000000 +0200 @@ -159,7 +159,8 @@ } void *xc_dom_malloc_filemap(struct xc_dom_image *dom, - const char *filename, size_t * size) + const char *filename, size_t * size, + const size_t max_size) { struct xc_dom_mem *block = NULL; int fd = -1; @@ -171,6 +172,13 @@ lseek(fd, 0, SEEK_SET); *size = lseek(fd, 0, SEEK_END); + if ( max_size && *size > max_size ) + { + xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, + "tried to map file which is too large"); + goto err; + } + block = malloc(sizeof(*block)); if ( block == NULL ) goto err; @@ -222,6 +230,40 @@ } /* ------------------------------------------------------------------------ */ +/* decompression buffer sizing */ +int xc_dom_kernel_check_size(struct xc_dom_image *dom, size_t sz) +{ + /* No limit */ + if ( !dom->max_kernel_size ) + return 0; + + if ( sz > dom->max_kernel_size ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, + "kernel image too large"); + return 1; + } + + return 0; +} + +int xc_dom_ramdisk_check_size(struct xc_dom_image *dom, size_t sz) +{ + /* No limit */ + if ( !dom->max_ramdisk_size ) + return 0; + + if ( sz > dom->max_ramdisk_size ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, + "ramdisk image too large"); + return 1; + } + + return 0; +} + +/* ------------------------------------------------------------------------ */ /* read files, 
copy memory blocks, with transparent gunzip */ size_t xc_dom_check_gzip(xc_interface *xch, void *blob, size_t ziplen) @@ -235,7 +277,7 @@ gzlen = blob + ziplen - 4; unziplen = gzlen[3] << 24 | gzlen[2] << 16 | gzlen[1] << 8 | gzlen[0]; - if ( (unziplen < 0) || (unziplen > (1024*1024*1024)) ) /* 1GB limit */ + if ( (unziplen < 0) || (unziplen > XC_DOM_DECOMPRESS_MAX) ) { xc_dom_printf (xch, @@ -288,6 +330,9 @@ if ( unziplen == 0 ) return 0; + if ( xc_dom_kernel_check_size(dom, unziplen) ) + return 0; + unzip = xc_dom_malloc(dom, unziplen); if ( unzip == NULL ) return -1; @@ -588,6 +633,9 @@ memset(dom, 0, sizeof(*dom)); dom->xch = xch; + dom->max_kernel_size = XC_DOM_DECOMPRESS_MAX; + dom->max_ramdisk_size = XC_DOM_DECOMPRESS_MAX; + if ( cmdline ) dom->cmdline = xc_dom_strdup(dom, cmdline); if ( features ) @@ -608,10 +656,25 @@ return NULL; } +int xc_dom_kernel_max_size(struct xc_dom_image *dom, size_t sz) +{ + DOMPRINTF("%s: kernel_max_size=%zx", __FUNCTION__, sz); + dom->max_kernel_size = sz; + return 0; +} + +int xc_dom_ramdisk_max_size(struct xc_dom_image *dom, size_t sz) +{ + DOMPRINTF("%s: ramdisk_max_size=%zx", __FUNCTION__, sz); + dom->max_ramdisk_size = sz; + return 0; +} + int xc_dom_kernel_file(struct xc_dom_image *dom, const char *filename) { DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename); - dom->kernel_blob = xc_dom_malloc_filemap(dom, filename, &dom->kernel_size); + dom->kernel_blob = xc_dom_malloc_filemap(dom, filename, &dom->kernel_size, + dom->max_kernel_size); if ( dom->kernel_blob == NULL ) return -1; return xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size); @@ -621,7 +684,9 @@ { DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename); dom->ramdisk_blob = - xc_dom_malloc_filemap(dom, filename, &dom->ramdisk_size); + xc_dom_malloc_filemap(dom, filename, &dom->ramdisk_size, + dom->max_ramdisk_size); + if ( dom->ramdisk_blob == NULL ) return -1; // return xc_dom_try_gunzip(dom, &dom->ramdisk_blob, &dom->ramdisk_size); @@ -781,7 +846,11 @@ void *ramdiskmap; unziplen = xc_dom_check_gzip(dom->xch, dom->ramdisk_blob, dom->ramdisk_size); + if ( xc_dom_ramdisk_check_size(dom, unziplen) != 0 ) + unziplen = 0; + ramdisklen = unziplen ? unziplen : dom->ramdisk_size; + if ( xc_dom_alloc_segment(dom, &dom->ramdisk_seg, "ramdisk", 0, ramdisklen) != 0 ) goto err; diff -Nru xen-4.1.3/tools/libxc/xc_dom.h xen-4.1.5/tools/libxc/xc_dom.h --- xen-4.1.3/tools/libxc/xc_dom.h 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxc/xc_dom.h 2013-04-23 18:44:20.000000000 +0200 @@ -52,6 +52,9 @@ void *ramdisk_blob; size_t ramdisk_size; + size_t max_kernel_size; + size_t max_ramdisk_size; + /* arguments and parameters */ char *cmdline; uint32_t f_requested[XENFEAT_NR_SUBMAPS]; @@ -175,6 +178,23 @@ void xc_dom_release(struct xc_dom_image *dom); int xc_dom_mem_init(struct xc_dom_image *dom, unsigned int mem_mb); +/* Set this larger if you have enormous ramdisks/kernels. Note that + * you should trust all kernels not to be maliciously large (e.g. to + * exhaust all dom0 memory) if you do this (see CVE-2012-4544 / + * XSA-25). You can also set the default independently for + * ramdisks/kernels in xc_dom_allocate() or call + * xc_dom_{kernel,ramdisk}_max_size. 
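
xc_dom_check_gzip() above takes the claimed uncompressed length from the last four bytes of the blob (the little-endian ISIZE field of the gzip trailer) and now rejects anything above XC_DOM_DECOMPRESS_MAX, since the trailer is guest-controlled. The trailer read by itself, as a sketch:

    #include <stddef.h>
    #include <stdint.h>

    /* Returns the claimed length, or 0 if implausible or above cap. */
    static size_t gzip_claimed_len(const uint8_t *blob, size_t ziplen, size_t cap)
    {
        const uint8_t *t;
        uint32_t isize;

        if (ziplen < 18)     /* too small to be a gzip member (assumed floor) */
            return 0;
        t = blob + ziplen - 4;
        isize = (uint32_t)t[0] | ((uint32_t)t[1] << 8) |
                ((uint32_t)t[2] << 16) | ((uint32_t)t[3] << 24);
        return (isize != 0 && isize <= cap) ? isize : 0;
    }
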
+ */ +#ifndef XC_DOM_DECOMPRESS_MAX +#define XC_DOM_DECOMPRESS_MAX (1024*1024*1024) /* 1GB */ +#endif + +int xc_dom_kernel_check_size(struct xc_dom_image *dom, size_t sz); +int xc_dom_kernel_max_size(struct xc_dom_image *dom, size_t sz); + +int xc_dom_ramdisk_check_size(struct xc_dom_image *dom, size_t sz); +int xc_dom_ramdisk_max_size(struct xc_dom_image *dom, size_t sz); + size_t xc_dom_check_gzip(xc_interface *xch, void *blob, size_t ziplen); int xc_dom_do_gunzip(xc_interface *xch, @@ -224,7 +244,8 @@ void *xc_dom_malloc(struct xc_dom_image *dom, size_t size); void *xc_dom_malloc_page_aligned(struct xc_dom_image *dom, size_t size); void *xc_dom_malloc_filemap(struct xc_dom_image *dom, - const char *filename, size_t * size); + const char *filename, size_t * size, + const size_t max_size); char *xc_dom_strdup(struct xc_dom_image *dom, const char *str); /* --- alloc memory pool ------------------------------------------- */ diff -Nru xen-4.1.3/tools/libxc/xc_hvm_build.c xen-4.1.5/tools/libxc/xc_hvm_build.c --- xen-4.1.3/tools/libxc/xc_hvm_build.c 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxc/xc_hvm_build.c 2013-04-23 18:44:20.000000000 +0200 @@ -154,7 +154,7 @@ goto error_out; if ( memsize > target ) - pod_mode = 1; + pod_mode = XENMEMF_populate_on_demand; memset(&elf, 0, sizeof(elf)); if ( elf_init(&elf, image, image_size) != 0 ) @@ -194,6 +194,22 @@ for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < nr_pages; i++ ) page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT; + if ( pod_mode ) + { + /* + * Subtract 0x20 from target_pages for the VGA "hole". Xen will + * adjust the PoD cache size so that domain tot_pages will be + * target_pages - 0x20 after this call. + */ + rc = xc_domain_set_pod_target(xch, dom, target_pages - 0x20, + NULL, NULL, NULL); + if ( rc != 0 ) + { + PERROR("Could not set PoD target for HVM guest.\n"); + goto error_out; + } + } + /* * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. * @@ -205,7 +221,7 @@ * ensure that we can be preempted and hence dom0 remains responsive. */ rc = xc_domain_populate_physmap_exact( - xch, dom, 0xa0, 0, 0, &page_array[0x00]); + xch, dom, 0xa0, 0, pod_mode, &page_array[0x00]); cur_pages = 0xc0; stat_normal_pages = 0xc0; while ( (rc == 0) && (nr_pages > cur_pages) ) @@ -243,8 +259,7 @@ sp_extents[i] = page_array[cur_pages+(i<<SUPERPAGE_1GB_SHIFT)]; done = xc_domain_populate_physmap(xch, dom, nr_extents, SUPERPAGE_1GB_SHIFT, - pod_mode ? XENMEMF_populate_on_demand : 0, - sp_extents); + pod_mode, sp_extents); if ( done > 0 ) { @@ -281,8 +296,7 @@ sp_extents[i] = page_array[cur_pages+(i<<SUPERPAGE_2MB_SHIFT)]; done = xc_domain_populate_physmap(xch, dom, nr_extents, SUPERPAGE_2MB_SHIFT, - pod_mode ? XENMEMF_populate_on_demand : 0, - sp_extents); + pod_mode, sp_extents); if ( done > 0 ) { @@ -298,19 +312,12 @@ if ( count != 0 ) { rc = xc_domain_populate_physmap_exact( - xch, dom, count, 0, 0, &page_array[cur_pages]); + xch, dom, count, 0, pod_mode, &page_array[cur_pages]); cur_pages += count; stat_normal_pages += count; } } - /* Subtract 0x20 from target_pages for the VGA "hole". Xen will * adjust the PoD cache size so that domain tot_pages will be * target_pages - 0x20 after this call.
*/ - if ( pod_mode ) - rc = xc_domain_set_pod_target(xch, dom, target_pages - 0x20, - NULL, NULL, NULL); - if ( rc != 0 ) { PERROR("Could not allocate memory for HVM guest."); diff -Nru xen-4.1.3/tools/libxl/libxl_blktap2.c xen-4.1.5/tools/libxl/libxl_blktap2.c --- xen-4.1.3/tools/libxl/libxl_blktap2.c 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxl/libxl_blktap2.c 2013-04-23 18:44:20.000000000 +0200 @@ -18,6 +18,8 @@ #include "tap-ctl.h" +#include <string.h> + int libxl__blktap_enabled(libxl__gc *gc) { const char *msg; @@ -30,12 +32,13 @@ { const char *type; char *params, *devname = NULL; - int minor, err; + tap_list_t tap; + int err; type = libxl__device_disk_string_of_format(format); - minor = tap_ctl_find_minor(type, disk); - if (minor >= 0) { - devname = libxl__sprintf(gc, "/dev/xen/blktap-2/tapdev%d", minor); + err = tap_ctl_find(type, disk, &tap); + if (err == 0) { + devname = libxl__sprintf(gc, "/dev/xen/blktap-2/tapdev%d", tap.minor); if (devname) return devname; } @@ -49,3 +52,28 @@ return NULL; } + + +void libxl__device_destroy_tapdisk(libxl__gc *gc, char *be_path) +{ + char *path, *params, *type, *disk; + int err; + tap_list_t tap; + + path = libxl__sprintf(gc, "%s/tapdisk-params", be_path); + if (!path) return; + + params = libxl__xs_read(gc, XBT_NULL, path); + if (!params) return; + + type = params; + disk = strchr(params, ':'); + if (!disk) return; + + *disk++ = '\0'; + + err = tap_ctl_find(type, disk, &tap); + if (err < 0) return; + + tap_ctl_destroy(tap.id, tap.minor); +} diff -Nru xen-4.1.3/tools/libxl/libxl.c xen-4.1.5/tools/libxl/libxl.c --- xen-4.1.3/tools/libxl/libxl.c 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxl/libxl.c 2013-04-23 18:44:20.000000000 +0200 @@ -1207,7 +1207,8 @@ goto out_free; } if (!(l = libxl__xs_directory(&gc, XBT_NULL, - libxl__sprintf(&gc, "%s/device/vif", dompath), &nb))) { + libxl__sprintf(&gc, "%s/device/vif", dompath), &nb)) || + nb == 0) { nic->devid = 0; } else { nic->devid = strtoul(l[nb - 1], NULL, 10) + 1; } diff -Nru xen-4.1.3/tools/libxl/libxl_device.c xen-4.1.5/tools/libxl/libxl_device.c --- xen-4.1.3/tools/libxl/libxl_device.c 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxl/libxl_device.c 2013-04-23 18:44:20.000000000 +0200 @@ -250,6 +250,7 @@ if (!state) goto out; if (atoi(state) != 4) { + libxl__device_destroy_tapdisk(&gc, be_path); xs_rm(ctx->xsh, XBT_NULL, be_path); goto out; } @@ -368,6 +369,7 @@ } } } + libxl__device_destroy_tapdisk(&gc, be_path); out: libxl__free_all(&gc); return 0; } diff -Nru xen-4.1.3/tools/libxl/libxl_dm.c xen-4.1.5/tools/libxl/libxl_dm.c --- xen-4.1.3/tools/libxl/libxl_dm.c 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxl/libxl_dm.c 2013-04-23 18:44:20.000000000 +0200 @@ -458,7 +458,7 @@ libxl_domain_create_info c_info; libxl_domain_build_info b_info; libxl_domain_build_state state; - uint32_t domid; + uint32_t domid = 0; char **args; struct xs_permissions perm[2]; xs_transaction_t t; diff -Nru xen-4.1.3/tools/libxl/libxl_internal.h xen-4.1.5/tools/libxl/libxl_internal.h --- xen-4.1.3/tools/libxl/libxl_internal.h 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxl/libxl_internal.h 2013-04-23 18:44:20.000000000 +0200 @@ -314,6 +314,12 @@ const char *disk, libxl_disk_format format); +/* libxl__device_destroy_tapdisk: + * Destroys any tapdisk process associated with the backend represented + * by be_path.
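
libxl__device_destroy_tapdisk() above recovers the disk type and path from the single "type:path" string kept in xenstore by splitting it at the first colon, in place. The parsing step in isolation:

    #include <string.h>

    /* Splits "vhd:/some/path" in place; returns 0 on success. */
    static int split_tapdisk_params(char *params, char **type, char **path)
    {
        char *colon = strchr(params, ':');

        if (colon == NULL)
            return -1;
        *colon = '\0';
        *type = params;
        *path = colon + 1;
        return 0;
    }
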
+ */ +_hidden void libxl__device_destroy_tapdisk(libxl__gc *gc, char *be_path); + _hidden char *libxl__uuid2string(libxl__gc *gc, const libxl_uuid uuid); struct libxl__xen_console_reader { diff -Nru xen-4.1.3/tools/libxl/libxl_noblktap2.c xen-4.1.5/tools/libxl/libxl_noblktap2.c --- xen-4.1.3/tools/libxl/libxl_noblktap2.c 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/libxl/libxl_noblktap2.c 2013-04-23 18:44:20.000000000 +0200 @@ -27,3 +27,7 @@ { return NULL; } + +void libxl__device_destroy_tapdisk(libxl__gc *gc, char *be_path) +{ +} diff -Nru xen-4.1.3/tools/ocaml/libs/xb/partial.ml xen-4.1.5/tools/ocaml/libs/xb/partial.ml --- xen-4.1.3/tools/ocaml/libs/xb/partial.ml 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/ocaml/libs/xb/partial.ml 2013-04-23 18:44:20.000000000 +0200 @@ -27,8 +27,15 @@ external header_of_string_internal: string -> int * int * int * int = "stub_header_of_string" +let xenstore_payload_max = 4096 (* xen/include/public/io/xs_wire.h *) + let of_string s = let tid, rid, opint, dlen = header_of_string_internal s in + (* A packet which is bigger than xenstore_payload_max is illegal. + This will leave the guest connection is a bad state and will + be hard to recover from without restarting the connection + (ie rebooting the guest) *) + let dlen = min xenstore_payload_max dlen in { tid = tid; rid = rid; @@ -38,6 +45,7 @@ } let append pkt s sz = + if pkt.len > 4096 then failwith "Buffer.add: cannot grow buffer"; Buffer.add_string pkt.buf (String.sub s 0 sz) let to_complete pkt = diff -Nru xen-4.1.3/tools/ocaml/libs/xb/xs_ring_stubs.c xen-4.1.5/tools/ocaml/libs/xb/xs_ring_stubs.c --- xen-4.1.3/tools/ocaml/libs/xb/xs_ring_stubs.c 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/ocaml/libs/xb/xs_ring_stubs.c 2013-04-23 18:44:20.000000000 +0200 @@ -43,21 +43,23 @@ char *buffer, int len) { struct xenstore_domain_interface *intf = interface->addr; - XENSTORE_RING_IDX cons, prod; + XENSTORE_RING_IDX cons, prod; /* offsets only */ int to_read; - cons = intf->req_cons; - prod = intf->req_prod; + cons = *(volatile uint32*)&intf->req_cons; + prod = *(volatile uint32*)&intf->req_prod; xen_mb(); if (prod == cons) return 0; - if (MASK_XENSTORE_IDX(prod) > MASK_XENSTORE_IDX(cons)) + cons = MASK_XENSTORE_IDX(cons); + prod = MASK_XENSTORE_IDX(prod); + if (prod > cons) to_read = prod - cons; else - to_read = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons); + to_read = XENSTORE_RING_SIZE - cons; if (to_read < len) len = to_read; - memcpy(buffer, intf->req + MASK_XENSTORE_IDX(cons), len); + memcpy(buffer, intf->req + cons, len); xen_mb(); intf->req_cons += len; return len; @@ -70,8 +72,8 @@ XENSTORE_RING_IDX cons, prod; int can_write; - cons = intf->rsp_cons; - prod = intf->rsp_prod; + cons = *(volatile uint32*)&intf->rsp_cons; + prod = *(volatile uint32*)&intf->rsp_prod; xen_mb(); if ( (prod - cons) >= XENSTORE_RING_SIZE ) return 0; diff -Nru xen-4.1.3/tools/pygrub/src/pygrub xen-4.1.5/tools/pygrub/src/pygrub --- xen-4.1.3/tools/pygrub/src/pygrub 2012-08-09 22:08:06.000000000 +0200 +++ xen-4.1.5/tools/pygrub/src/pygrub 2013-04-23 18:44:20.000000000 +0200 @@ -28,6 +28,7 @@ import grub.ExtLinuxConf PYGRUB_VER = 0.6 +FS_READ_MAX = 1024 * 1024 def enable_cursor(ison): if ison: @@ -421,7 +422,8 @@ if self.__dict__.get('cf', None) is None: raise RuntimeError, "couldn't find bootloader config file in the image provided." 
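
The xs_ring_stubs.c change above reads the guest-writable producer and consumer indices exactly once, through volatile casts, masks them to ring offsets, and only then decides how many bytes are safe to copy. The same discipline in a generic ring reader (memory barriers elided for brevity):

    #include <stdint.h>
    #include <string.h>

    #define RING_SIZE 1024u              /* power of two, as for xenstore */
    #define MASK(idx) ((idx) & (RING_SIZE - 1))

    /* ring and the two index words are shared with an untrusted writer. */
    static int ring_read(const char *ring, const volatile uint32_t *prod_p,
                         volatile uint32_t *cons_p, char *buf, int len)
    {
        uint32_t cons = *cons_p, prod = *prod_p;  /* snapshot once */
        uint32_t c, p;
        int avail;

        if (prod == cons)
            return 0;
        c = MASK(cons);
        p = MASK(prod);
        avail = (p > c) ? (int)(p - c) : (int)(RING_SIZE - c);
        if (len > avail)
            len = avail;
        memcpy(buf, ring + c, len);
        *cons_p = cons + len;            /* the unmasked index advances */
        return len;
    }
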
f = fs.open_file(self.cf.filename) - buf = f.read() + # limit read size to avoid pathological cases + buf = f.read(FS_READ_MAX) del f self.cf.parse(buf) @@ -553,7 +555,7 @@ return None -def run_grub(file, entry, fs, arg): +def run_grub(file, entry, fs, cfg_args): global g global sel @@ -583,13 +585,15 @@ except IndexError: img = g.cf.images[0] - grubcfg = { "kernel": None, "ramdisk": None, "args": None } + grubcfg = { "kernel": None, "ramdisk": None, "args": "" } grubcfg["kernel"] = img.kernel[1] if img.initrd: grubcfg["ramdisk"] = img.initrd[1] if img.args: - grubcfg["args"] = img.args + " " + arg + grubcfg["args"] += img.args + if cfg_args: + grubcfg["args"] += " " + cfg_args return grubcfg @@ -670,6 +674,37 @@ def usage(): print >> sys.stderr, "Usage: %s [-q|--quiet] [-i|--interactive] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] [--output-format=sxp|simple|simple0] <image>" %(sys.argv[0],) + def copy_from_image(fs, file_to_read, file_type, output_directory, + not_really): + if not_really: + if fs.file_exists(file_to_read): + return "<%s:%s>" % (file_type, file_to_read) + else: + sys.exit("The requested %s file does not exist" % file_type) + try: + datafile = fs.open_file(file_to_read) + except Exception, e: + print >>sys.stderr, e + sys.exit("Error opening %s in guest" % file_to_read) + (tfd, ret) = tempfile.mkstemp(prefix="boot_"+file_type+".", + dir=output_directory) + dataoff = 0 + while True: + data = datafile.read(FS_READ_MAX, dataoff) + if len(data) == 0: + os.close(tfd) + del datafile + return ret + try: + os.write(tfd, data) + except Exception, e: + print >>sys.stderr, e + os.close(tfd) + os.unlink(ret) + del datafile + sys.exit("Error writing temporary copy of "+file_type) + dataoff += len(data) + try: opts, args = getopt.gnu_getopt(sys.argv[1:], 'qinh::', ["quiet", "interactive", "not-really", "help", @@ -786,24 +821,18 @@ if not fs: raise RuntimeError, "Unable to find partition containing kernel" - if not_really: - bootcfg["kernel"] = "<kernel:%s>" % chosencfg["kernel"] - else: - data = fs.open_file(chosencfg["kernel"]).read() - (tfd, bootcfg["kernel"]) = tempfile.mkstemp(prefix="boot_kernel.", - dir=output_directory) - os.write(tfd, data) - os.close(tfd) + bootcfg["kernel"] = copy_from_image(fs, chosencfg["kernel"], "kernel", + output_directory, not_really) if chosencfg["ramdisk"]: - if not_really: - bootcfg["ramdisk"] = "<ramdisk:%s>" % chosencfg["ramdisk"] - else: - data = fs.open_file(chosencfg["ramdisk"],).read() - (tfd, bootcfg["ramdisk"]) = tempfile.mkstemp( - prefix="boot_ramdisk.", dir=output_directory) - os.write(tfd, data) - os.close(tfd) + try: + bootcfg["ramdisk"] = copy_from_image(fs, chosencfg["ramdisk"], + "ramdisk", output_directory, + not_really) + except: + if not not_really: + os.unlink(bootcfg["kernel"]) + raise else: initrd = None diff -Nru xen-4.1.3/tools/python/xen/lowlevel/netlink/libnetlink.c xen-4.1.5/tools/python/xen/lowlevel/netlink/libnetlink.c --- xen-4.1.3/tools/python/xen/lowlevel/netlink/libnetlink.c 2012-08-09 22:08:07.000000000 +0200 +++ xen-4.1.5/tools/python/xen/lowlevel/netlink/libnetlink.c 2013-04-23 18:44:20.000000000 +0200 @@ -37,7 +37,7 @@ int sndbuf = 32768; int rcvbuf = 32768; - memset(rth, 0, sizeof(rth)); + memset(rth, 0, sizeof(*rth)); rth->fd = socket(AF_NETLINK, SOCK_RAW, protocol); if (rth->fd < 0) { diff -Nru xen-4.1.3/tools/python/xen/util/vscsi_util.py xen-4.1.5/tools/python/xen/util/vscsi_util.py --- xen-4.1.3/tools/python/xen/util/vscsi_util.py 2012-08-09 22:08:07.000000000 +0200 +++
xen-4.1.5/tools/python/xen/util/vscsi_util.py 2013-04-23 18:44:20.000000000 +0200 @@ -105,6 +105,8 @@ devname = None try: sg = s[-1].split('/dev/')[1] + if devname is None: + devname = sg scsi_id = _vscsi_get_scsiid(sg) except IndexError: sg = None @@ -128,27 +130,43 @@ for dirpath, dirnames, files in os.walk(sysfs_mnt + SYSFS_SCSI_PATH): for hctl in dirnames: + if len(hctl.split(':')) != 4: + continue paths = os.path.join(dirpath, hctl) devname = None sg = None scsi_id = None for f in os.listdir(paths): realpath = os.path.realpath(os.path.join(paths, f)) - if re.match('^block', f) or \ - re.match('^tape', f) or \ - re.match('^scsi_changer', f) or \ - re.match('^onstream_tape', f): + if re.match('^block:', f) or \ + re.match('^tape:', f) or \ + re.match('^scsi_changer:', f) or \ + re.match('^onstream_tape:', f): devname = os.path.basename(realpath) + elif f == "block" or \ + f == "tape" or \ + f == "scsi_changer" or \ + f == "onstream_tape": + for dir in os.listdir(os.path.join(paths, f)): + if os.path.exists(os.path.join(paths, f, dir, "dev")): + devname = os.path.basename(dir) - if re.match('^scsi_generic', f): + if re.match('^scsi_generic:', f): sg = os.path.basename(realpath) + elif f == "scsi_generic": + for dir in os.listdir(os.path.join(paths, f)): + if os.path.exists(os.path.join(paths, f, dir, "dev")): + sg = os.path.basename(dir) + if sg: + if devname is None: + devname = sg scsi_id = _vscsi_get_scsiid(sg) devices.append([hctl, devname, sg, scsi_id]) return devices -def vscsi_get_scsidevices(mask=""): +def vscsi_get_scsidevices(mask="*"): """ get all scsi devices information """ devices = _vscsi_get_scsidevices_by_lsscsi("[%s]" % mask) @@ -277,7 +295,7 @@ return _make_scsi_record(scsi_info) return None -def get_all_scsi_devices(mask=""): +def get_all_scsi_devices(mask="*"): scsi_records = [] for scsi_info in vscsi_get_scsidevices(mask): scsi_record = _make_scsi_record(scsi_info) diff -Nru xen-4.1.3/tools/python/xen/xend/server/irqif.py xen-4.1.5/tools/python/xen/xend/server/irqif.py --- xen-4.1.3/tools/python/xen/xend/server/irqif.py 2012-08-09 22:08:07.000000000 +0200 +++ xen-4.1.5/tools/python/xen/xend/server/irqif.py 2013-04-23 18:44:20.000000000 +0200 @@ -73,6 +73,12 @@ pirq = get_param('irq') + rc = xc.physdev_map_pirq(domid = self.getDomid(), + index = pirq, + pirq = pirq) + if rc < 0: + raise VmError('irq: Failed to map irq %x' % (pirq)) + rc = xc.domain_irq_permission(domid = self.getDomid(), pirq = pirq, allow_access = True) @@ -81,12 +87,6 @@ #todo non-fatal raise VmError( 'irq: Failed to configure irq: %d' % (pirq)) - rc = xc.physdev_map_pirq(domid = self.getDomid(), - index = pirq, - pirq = pirq) - if rc < 0: - raise VmError( - 'irq: Failed to map irq %x' % (pirq)) back = dict([(k, config[k]) for k in self.valid_cfg if k in config]) return (self.allocateDeviceID(), back, {}) diff -Nru xen-4.1.3/tools/python/xen/xend/XendStateStore.py xen-4.1.5/tools/python/xen/xend/XendStateStore.py --- xen-4.1.3/tools/python/xen/xend/XendStateStore.py 2012-08-09 22:08:07.000000000 +0200 +++ xen-4.1.5/tools/python/xen/xend/XendStateStore.py 2013-04-23 18:44:20.000000000 +0200 @@ -101,7 +101,7 @@ if not os.path.exists(xml_path): return {} - if not os.path.getsize(xml_path) == 0: + if os.path.getsize(xml_path) == 0: return {} dom = minidom.parse(xml_path) diff -Nru xen-4.1.3/tools/xenballoon/xenballoond.init xen-4.1.5/tools/xenballoon/xenballoond.init --- xen-4.1.3/tools/xenballoon/xenballoond.init 2012-08-09 22:08:07.000000000 +0200 +++ xen-4.1.5/tools/xenballoon/xenballoond.init 
2013-04-23 18:44:20.000000000 +0200 @@ -14,7 +14,7 @@ # Should-Start: # Required-Stop: $syslog $remote_fs # Should-Stop: -# Default-Start: 3 4 5 +# Default-Start: 3 5 # Default-Stop: 0 1 2 6 # Short-Description: Start/stop xenballoond # Description: Starts and stops the Xen ballooning daemon. diff -Nru xen-4.1.3/tools/xenstat/libxenstat/src/xenstat_linux.c xen-4.1.5/tools/xenstat/libxenstat/src/xenstat_linux.c --- xen-4.1.3/tools/xenstat/libxenstat/src/xenstat_linux.c 2012-08-09 22:08:07.000000000 +0200 +++ xen-4.1.5/tools/xenstat/libxenstat/src/xenstat_linux.c 2013-04-23 18:44:20.000000000 +0200 @@ -113,7 +113,7 @@ /* Initialize all variables called has passed as non-NULL to zeros */ if (iface != NULL) - memset(iface, 0, sizeof(iface)); + memset(iface, 0, sizeof(*iface)); if (rxBytes != NULL) *rxBytes = 0; if (rxPackets != NULL) diff -Nru xen-4.1.3/unmodified_drivers/linux-2.6/compat-include/xen/platform-compat.h xen-4.1.5/unmodified_drivers/linux-2.6/compat-include/xen/platform-compat.h --- xen-4.1.3/unmodified_drivers/linux-2.6/compat-include/xen/platform-compat.h 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/unmodified_drivers/linux-2.6/compat-include/xen/platform-compat.h 2013-04-23 18:44:20.000000000 +0200 @@ -13,10 +13,19 @@ #define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED #endif -#if defined(_LINUX_INIT_H) && !defined(__init) +#ifdef _LINUX_INIT_H + +#ifndef __init #define __init #endif +#ifndef __devinit +#define __devinit +#define __devinitdata +#endif + +#endif /* _LINUX_INIT_H */ + #if defined(__LINUX_CACHE_H) && !defined(__read_mostly) #define __read_mostly #endif diff -Nru xen-4.1.3/xen/arch/x86/acpi/cpufreq/powernow.c xen-4.1.5/xen/arch/x86/acpi/cpufreq/powernow.c --- xen-4.1.3/xen/arch/x86/acpi/cpufreq/powernow.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/acpi/cpufreq/powernow.c 2013-04-23 18:44:20.000000000 +0200 @@ -146,6 +146,51 @@ return result; } +static void amd_fixup_frequency(struct xen_processor_px *px) +{ + u32 hi, lo, fid, did; + int index = px->control & 0x00000007; + const struct cpuinfo_x86 *c = &current_cpu_data; + + if ((c->x86 != 0x10 || c->x86_model >= 10) && c->x86 != 0x11) + return; + + rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); + /* + * MSR C001_0064+: + * Bit 63: PstateEn. Read-write. If set, the P-state is valid.
+ */ + if (!(hi & (1U << 31))) + return; + + fid = lo & 0x3f; + did = (lo >> 6) & 7; + if (c->x86 == 0x10) + px->core_frequency = (100 * (fid + 16)) >> did; + else + px->core_frequency = (100 * (fid + 8)) >> did; +} + +struct amd_cpu_data { + struct processor_performance *perf; + u32 max_hw_pstate; +}; + +static void get_cpu_data(void *arg) +{ + struct amd_cpu_data *data = arg; + struct processor_performance *perf = data->perf; + uint64_t msr_content; + unsigned int i; + + rdmsrl(MSR_PSTATE_CUR_LIMIT, msr_content); + data->max_hw_pstate = (msr_content & HW_PSTATE_MAX_MASK) >> + HW_PSTATE_MAX_SHIFT; + + for (i = 0; i < perf->state_count && i <= data->max_hw_pstate; i++) + amd_fixup_frequency(&perf->states[i]); +} + static int powernow_cpufreq_verify(struct cpufreq_policy *policy) { struct powernow_cpufreq_data *data; @@ -192,8 +237,7 @@ struct powernow_cpufreq_data *data; unsigned int result = 0; struct processor_performance *perf; - u32 max_hw_pstate; - uint64_t msr_content; + struct amd_cpu_data info; struct cpuinfo_x86 *c = &cpu_data[policy->cpu]; data = xmalloc(struct powernow_cpufreq_data); @@ -205,7 +249,7 @@ data->acpi_data = &processor_pminfo[cpu]->perf; - perf = data->acpi_data; + info.perf = perf = data->acpi_data; policy->shared_type = perf->shared_type; /* @@ -225,8 +269,6 @@ result = -ENODEV; goto err_unreg; } - rdmsrl(MSR_PSTATE_CUR_LIMIT, msr_content); - max_hw_pstate = (msr_content & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; if (perf->control_register.space_id != perf->status_register.space_id) { result = -ENODEV; @@ -251,9 +293,11 @@ policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR; + on_selected_cpus(cpumask_of(cpu), get_cpu_data, &info, 1); + data->max_freq = perf->states[0].core_frequency * 1000; /* table init */ - for (i = 0; i < perf->state_count && i <= max_hw_pstate; i++) { + for (i = 0; i < perf->state_count && i <= info.max_hw_pstate; i++) { if (i > 0 && perf->states[i].core_frequency >= data->freq_table[valid_states-1].frequency / 1000) continue; diff -Nru xen-4.1.3/xen/arch/x86/acpi/power.c xen-4.1.5/xen/arch/x86/acpi/power.c --- xen-4.1.3/xen/arch/x86/acpi/power.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/acpi/power.c 2013-04-23 18:44:20.000000000 +0200 @@ -96,7 +96,10 @@ rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) + { + restore_vcpu_affinity(d); domain_unpause(d); + } rcu_read_unlock(&domlist_read_lock); } diff -Nru xen-4.1.3/xen/arch/x86/acpi/suspend.c xen-4.1.5/xen/arch/x86/acpi/suspend.c --- xen-4.1.3/xen/arch/x86/acpi/suspend.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/acpi/suspend.c 2013-04-23 18:44:20.000000000 +0200 @@ -81,8 +81,12 @@ } #else /* !defined(CONFIG_X86_64) */ - if ( supervisor_mode_kernel && cpu_has_sep ) - wrmsr(MSR_IA32_SYSENTER_ESP, &this_cpu(init_tss).esp1, 0); + if ( cpu_has_sep ) + { + wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); + if ( supervisor_mode_kernel ) + wrmsr(MSR_IA32_SYSENTER_ESP, &this_cpu(init_tss).esp1, 0); + } #endif /* Maybe load the debug registers. */ diff -Nru xen-4.1.3/xen/arch/x86/boot/cmdline.S xen-4.1.5/xen/arch/x86/boot/cmdline.S --- xen-4.1.3/xen/arch/x86/boot/cmdline.S 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/boot/cmdline.S 2013-04-23 18:44:20.000000000 +0200 @@ -164,13 +164,15 @@ pushl MB_cmdline(%ebx) call .Lfind_option test %eax,%eax - setnz bootsym_phys(skip_realmode) + setnz %al + or %al,bootsym_phys(skip_realmode) /* Check for 'tboot=' command-line option. 
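
powernow.c no longer reads MSR_PSTATE_CUR_LIMIT from whichever CPU happens to run the init path: get_cpu_data() is shipped to the target CPU with on_selected_cpus() and fills an argument block there. The general shape of that pattern, with the Xen plumbing reduced to assumed prototypes:

    #include <stdint.h>

    uint64_t rdmsr_local(uint32_t msr);                      /* assumed helper */
    void run_on_cpu(int cpu, void (*fn)(void *), void *arg); /* assumed IPI glue */

    struct remote_msr {
        uint32_t msr;
        uint64_t value;
    };

    static void read_msr_cb(void *arg)   /* executes on the target CPU */
    {
        struct remote_msr *r = arg;
        r->value = rdmsr_local(r->msr);
    }

    /* Usage: struct remote_msr r = { some_msr, 0 };
     *        run_on_cpu(cpu, read_msr_cb, &r); ...then use r.value... */
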
*/ movl $sym_phys(.Ltboot_opt),4(%esp) call .Lfind_option test %eax,%eax - setnz bootsym_phys(skip_realmode) /* tboot= implies no-real-mode */ + setnz %al + or %al,bootsym_phys(skip_realmode) /* tboot= implies no-real-mode */ .Lparse_edd: /* Check for 'edd=' command-line option. */ diff -Nru xen-4.1.3/xen/arch/x86/boot/edd.S xen-4.1.5/xen/arch/x86/boot/edd.S --- xen-4.1.3/xen/arch/x86/boot/edd.S 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/boot/edd.S 2013-04-23 18:44:20.000000000 +0200 @@ -61,12 +61,16 @@ jc edd_mbr_sig_done # on failure, we're done. cmpb $0, %ah # some BIOSes do not set CF jne edd_mbr_sig_done # on failure, we're done. + cmpw $0xaa55, bootsym(boot_edd_info)+0x1fe + jne .Ledd_mbr_sig_next movl bootsym(boot_edd_info)+EDD_MBR_SIG_OFFSET,%eax movb %dl, (%bx) # store BIOS drive number movl %eax, 4(%bx) # store signature from MBR incb bootsym(boot_mbr_signature_nr) # note that we stored something - incb %dl # increment to next device addw $8, %bx # increment sig buffer ptr +.Ledd_mbr_sig_next: + incb %dl # increment to next device + jz edd_mbr_sig_done cmpb $EDD_MBR_SIG_MAX,bootsym(boot_mbr_signature_nr) jb edd_mbr_sig_read edd_mbr_sig_done: diff -Nru xen-4.1.3/xen/arch/x86/cpu/amd.c xen-4.1.5/xen/arch/x86/cpu/amd.c --- xen-4.1.3/xen/arch/x86/cpu/amd.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/cpu/amd.c 2013-04-23 18:44:20.000000000 +0200 @@ -11,6 +11,7 @@ #include #include /* amd_init_cpu */ #include +#include #include "cpu.h" @@ -32,8 +33,11 @@ static char opt_famrev[14]; string_param("cpuid_mask_cpu", opt_famrev); -static int opt_allow_unsafe; +#ifdef __x86_64__ +/* 1 = allow, 0 = don't allow guest creation, -1 = don't allow boot */ +int __read_mostly opt_allow_unsafe; boolean_param("allow_unsafe", opt_allow_unsafe); +#endif static inline void wrmsr_amd(unsigned int index, unsigned int lo, unsigned int hi) @@ -612,6 +616,14 @@ } } + /* + * The way access filter has a performance penalty on some workloads. + * Disable it on the affected CPUs. + */ + if (c->x86 == 0x15 && c->x86_model >= 0x02 && c->x86_model < 0x20 && + !rdmsr_safe(MSR_AMD64_IC_CFG, value) && (value & 0x1e) != 0x1e) + wrmsr_safe(MSR_AMD64_IC_CFG, value | 0x1e); + amd_get_topology(c); /* Pointless to use MWAIT on Family10 as it does not deep sleep. */ @@ -623,10 +635,19 @@ clear_bit(X86_FEATURE_MCE, c->x86_capability); #ifdef __x86_64__ - if (cpu_has_amd_erratum(c, AMD_ERRATUM_121) && !opt_allow_unsafe) + if (!cpu_has_amd_erratum(c, AMD_ERRATUM_121)) + opt_allow_unsafe = 1; + else if (opt_allow_unsafe < 0) panic("Xen will not boot on this CPU for security reasons.\n" "Pass \"allow_unsafe\" if you're trusting all your" " (PV) guest kernels.\n"); + else if (!opt_allow_unsafe && c == &boot_cpu_data) + printk(KERN_WARNING + "*** Xen will not allow creation of DomU-s on" + " this CPU for security reasons. ***\n" + KERN_WARNING + "*** Pass \"allow_unsafe\" if you're trusting" + " all your (PV) guest kernels. ***\n"); /* AMD CPUs do not support SYSENTER outside of legacy mode. */ clear_bit(X86_FEATURE_SEP, c->x86_capability); @@ -640,6 +661,19 @@ } #endif + if (c->x86 == 0x10) { + /* + * On family 10h BIOS may not have properly enabled WC+ + * support, causing it to be converted to CD memtype. This may + * result in performance degradation for certain nested-paging + * guests. Prevent this conversion by clearing bit 24 in + * MSR_F10_BU_CFG2. 
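
Both AMD workarounds above follow one recipe: read the MSR, check whether the fix is already in force, and write back with the relevant bits changed, using the _safe accessors so a missing MSR cannot fault the boot path. Stripped to the pattern, with the two wrappers standing in for rdmsr_safe/wrmsr_safe:

    #include <stdint.h>

    int msr_read_safe(uint32_t msr, uint64_t *val);  /* 0 on success (assumed) */
    int msr_write_safe(uint32_t msr, uint64_t val);

    static void apply_msr_workaround(uint32_t msr, uint64_t set, uint64_t clear)
    {
        uint64_t v, nv;

        if (msr_read_safe(msr, &v) != 0)
            return;                    /* MSR not implemented: nothing to do */
        nv = (v | set) & ~clear;
        if (nv != v)
            msr_write_safe(msr, nv);   /* write back only when changing */
    }
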
+ */ + rdmsrl(MSR_F10_BU_CFG2, value); + value &= ~(1ULL << 24); + wrmsrl(MSR_F10_BU_CFG2, value); + } + /* * Family 0x12 and above processors have APIC timer * running in deep C states. @@ -647,6 +681,17 @@ if (c->x86 > 0x11) set_bit(X86_FEATURE_ARAT, c->x86_capability); + /* + * Prior to Family 0x14, perf counters are not reset during warm reboot. + * We have to reset them manually. + */ + if (nmi_watchdog != NMI_LOCAL_APIC && c->x86 < 0x14) { + wrmsrl(MSR_K7_PERFCTR0, 0); + wrmsrl(MSR_K7_PERFCTR1, 0); + wrmsrl(MSR_K7_PERFCTR2, 0); + wrmsrl(MSR_K7_PERFCTR3, 0); + } + /* Prevent TSC drift in non single-processor, single-core platforms. */ if ((smp_processor_id() == 1) && c1_ramping_may_cause_clock_drift(c)) disable_c1_ramping(); diff -Nru xen-4.1.3/xen/arch/x86/cpu/centaur.c xen-4.1.5/xen/arch/x86/cpu/centaur.c --- xen-4.1.3/xen/arch/x86/cpu/centaur.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/cpu/centaur.c 2013-04-23 18:44:20.000000000 +0200 @@ -56,6 +56,9 @@ if (c->x86_model >=6 && c->x86_model <9) set_bit(X86_FEATURE_3DNOW, c->x86_capability); + if (cpuid_eax(0x80000000) < 0x80000008) + paddr_bits = 32; + get_model_name(c); display_cacheinfo(c); } diff -Nru xen-4.1.3/xen/arch/x86/cpu/common.c xen-4.1.5/xen/arch/x86/cpu/common.c --- xen-4.1.3/xen/arch/x86/cpu/common.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/cpu/common.c 2013-04-23 18:44:20.000000000 +0200 @@ -43,6 +43,8 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; +unsigned int paddr_bits __read_mostly = 36; + /* * Default host IA32_CR_PAT value to cover all memory types. * BIOS usually sets it to 0x07040600070406. @@ -68,6 +70,7 @@ else if (c->x86 == 3) safe_strcpy(c->x86_model_id, "386"); } + __clear_bit(X86_FEATURE_SEP, c->x86_capability); } static struct cpu_dev default_cpu = { @@ -318,6 +321,8 @@ } if ( xlvl >= 0x80000004 ) get_model_name(c); /* Default name */ + if ( xlvl >= 0x80000008 ) + paddr_bits = cpuid_eax(0x80000008) & 0xff; } /* Intel-defined flags: level 0x00000007 */ @@ -710,8 +715,11 @@ #if defined(CONFIG_X86_32) t->ss0 = __HYPERVISOR_DS; t->esp0 = get_stack_bottom(); - if ( supervisor_mode_kernel && cpu_has_sep ) + if ( cpu_has_sep ) { + wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); + if ( supervisor_mode_kernel ) wrmsr(MSR_IA32_SYSENTER_ESP, &t->esp1, 0); + } #elif defined(CONFIG_X86_64) /* Bottom-of-stack must be 16-byte aligned! 
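
With this patch the physical-address width is probed once in generic CPU setup: CPUID leaf 0x80000008 reports it in bits 7:0 when the leaf exists, 36 bits stay the default otherwise, and vendor quirks (Centaur, Cyrix, the Intel 0F33/0F34 erratum) adjust the value afterwards. The probe itself, using GCC's cpuid.h wrapper:

    #include <cpuid.h>

    static unsigned int probe_paddr_bits(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
            return eax & 0xff;   /* physical address bits */
        return 36;               /* historical default, as above */
    }

__get_cpuid() returns 0 when the requested leaf is above the maximum supported one, so the fallback covers CPUs without leaf 0x80000008.
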
*/ BUG_ON((get_stack_bottom() & 15) != 0); diff -Nru xen-4.1.3/xen/arch/x86/cpu/cyrix.c xen-4.1.5/xen/arch/x86/cpu/cyrix.c --- xen-4.1.3/xen/arch/x86/cpu/cyrix.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/cpu/cyrix.c 2013-04-23 18:44:20.000000000 +0200 @@ -303,7 +303,9 @@ } safe_strcpy(c->x86_model_id, Cx86_model[dir0_msn & 7]); if (p) safe_strcat(c->x86_model_id, p); - return; + + if (cpu_has_cyrix_arr) + paddr_bits = 32; } /* diff -Nru xen-4.1.3/xen/arch/x86/cpu/intel.c xen-4.1.5/xen/arch/x86/cpu/intel.c --- xen-4.1.3/xen/arch/x86/cpu/intel.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/cpu/intel.c 2013-04-23 18:44:20.000000000 +0200 @@ -145,6 +145,11 @@ printk("revised cpuid_level = %d\n", c->cpuid_level); } } + + /* CPUID workaround for Intel 0F33/0F34 CPU */ + if (boot_cpu_data.x86 == 0xF && boot_cpu_data.x86_model == 3 && + (boot_cpu_data.x86_mask == 3 || boot_cpu_data.x86_mask == 4)) + paddr_bits = 36; } /* diff -Nru xen-4.1.3/xen/arch/x86/cpu/mcheck/mce.c xen-4.1.5/xen/arch/x86/cpu/mcheck/mce.c --- xen-4.1.3/xen/arch/x86/cpu/mcheck/mce.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/cpu/mcheck/mce.c 2013-04-23 18:44:20.000000000 +0200 @@ -1117,13 +1117,15 @@ printk("intpose_add: interpose array full - request dropped\n"); } -void intpose_inval(unsigned int cpu_nr, uint64_t msr) +bool_t intpose_inval(unsigned int cpu_nr, uint64_t msr) { - struct intpose_ent *ent; + struct intpose_ent *ent = intpose_lookup(cpu_nr, msr, NULL); - if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) { - ent->cpu_nr = -1; - } + if ( !ent ) + return 0; + + ent->cpu_nr = -1; + return 1; } #define IS_MCA_BANKREG(r) \ @@ -1253,12 +1255,6 @@ __asm__ __volatile__("int $0x12"); } -static void x86_cmci_inject(void *data) -{ - printk("Simulating CMCI on cpu %d\n", smp_processor_id()); - __asm__ __volatile__("int $0xf7"); -} - #if BITS_PER_LONG == 64 #define ID2COOKIE(id) ((mctelem_cookie_t)(id)) @@ -1541,7 +1537,9 @@ if ( !cmci_support ) return x86_mcerr( "No CMCI supported in platform\n", -EINVAL); - on_selected_cpus(&cpumap, x86_cmci_inject, NULL, 1); + if ( cpu_isset(smp_processor_id(), cpumap) ) + send_IPI_self(CMCI_APIC_VECTOR); + send_IPI_mask(&cpumap, CMCI_APIC_VECTOR); break; default: return x86_mcerr("Wrong mca type\n", -EINVAL); diff -Nru xen-4.1.3/xen/arch/x86/cpu/mcheck/mce.h xen-4.1.5/xen/arch/x86/cpu/mcheck/mce.h --- xen-4.1.3/xen/arch/x86/cpu/mcheck/mce.h 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/cpu/mcheck/mce.h 2013-04-23 18:44:20.000000000 +0200 @@ -83,7 +83,7 @@ /* Read an MSR, checking for an interposed value first */ extern struct intpose_ent *intpose_lookup(unsigned int, uint64_t, uint64_t *); -extern void intpose_inval(unsigned int, uint64_t); +extern bool_t intpose_inval(unsigned int, uint64_t); static inline uint64_t mca_rdmsr(unsigned int msr) { @@ -95,9 +95,9 @@ /* Write an MSR, invalidating any interposed value */ #define mca_wrmsr(msr, val) do { \ - intpose_inval(smp_processor_id(), msr); \ - wrmsrl(msr, val); \ -} while (0) + if ( !intpose_inval(smp_processor_id(), msr) ) \ + wrmsrl(msr, val); \ +} while ( 0 ) /* Utility function to "logout" all architectural MCA telemetry from the MCA diff -Nru xen-4.1.3/xen/arch/x86/cpu/mcheck/mce_intel.c xen-4.1.5/xen/arch/x86/cpu/mcheck/mce_intel.c --- xen-4.1.3/xen/arch/x86/cpu/mcheck/mce_intel.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/cpu/mcheck/mce_intel.c 2013-04-23 18:44:20.000000000 +0200 @@ -512,7 +512,7 @@ } /* this function will called 
when CAP(9).MCG_EXT_P = 1 */ - memset(&mc_ext, 0, sizeof(struct mcinfo_extended)); + memset(mc_ext, 0, sizeof(*mc_ext)); mc_ext->common.type = MC_TYPE_EXTENDED; mc_ext->common.size = sizeof(struct mcinfo_extended); diff -Nru xen-4.1.3/xen/arch/x86/cpu/mtrr/main.c xen-4.1.5/xen/arch/x86/cpu/mtrr/main.c --- xen-4.1.3/xen/arch/x86/cpu/mtrr/main.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/cpu/mtrr/main.c 2013-04-23 18:44:20.000000000 +0200 @@ -600,8 +600,6 @@ unsigned long lsize; }; -unsigned int paddr_bits __read_mostly = 36; - /** * mtrr_bp_init - initialize mtrrs on the boot CPU * @@ -615,48 +613,12 @@ if (cpu_has_mtrr) { mtrr_if = &generic_mtrr_ops; - size_or_mask = 0xff000000; /* 36 bits */ - size_and_mask = 0x00f00000; - - /* This is an AMD specific MSR, but we assume(hope?) that - Intel will implement it to when they extend the address - bus of the Xeon. */ - if (cpuid_eax(0x80000000) >= 0x80000008) { - paddr_bits = cpuid_eax(0x80000008) & 0xff; - /* CPUID workaround for Intel 0F33/0F34 CPU */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 0xF && - boot_cpu_data.x86_model == 0x3 && - (boot_cpu_data.x86_mask == 0x3 || - boot_cpu_data.x86_mask == 0x4)) - paddr_bits = 36; - - size_or_mask = ~((1ULL << (paddr_bits - PAGE_SHIFT)) - 1); - size_and_mask = ~size_or_mask & 0xfffff00000ULL; - } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && - boot_cpu_data.x86 == 6) { - /* VIA C* family have Intel style MTRRs, but - don't support PAE */ - size_or_mask = 0xfff00000; /* 32 bits */ - size_and_mask = 0; - } } else { #ifndef CONFIG_X86_64 switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (cpu_has_k6_mtrr) { - /* Pre-Athlon (K6) AMD CPU MTRRs */ - mtrr_if = mtrr_ops[X86_VENDOR_AMD]; - size_or_mask = 0xfff00000; /* 32 bits */ - size_and_mask = 0; - } - break; case X86_VENDOR_CYRIX: - if (cpu_has_cyrix_arr) { + if (cpu_has_cyrix_arr) mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; - size_or_mask = 0xfff00000; /* 32 bits */ - size_and_mask = 0; - } break; default: break; @@ -665,6 +627,8 @@ } if (mtrr_if) { + size_or_mask = ~((1ULL << (paddr_bits - PAGE_SHIFT)) - 1); + size_and_mask = ~size_or_mask & 0xfffff00000ULL; set_num_var_ranges(); init_table(); if (use_intel()) diff -Nru xen-4.1.3/xen/arch/x86/domain_build.c xen-4.1.5/xen/arch/x86/domain_build.c --- xen-4.1.3/xen/arch/x86/domain_build.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/domain_build.c 2013-04-23 18:44:20.000000000 +0200 @@ -1201,7 +1201,7 @@ /* DOM0 is permitted full I/O capabilities. */ rc |= ioports_permit_access(dom0, 0, 0xFFFF); rc |= iomem_permit_access(dom0, 0UL, ~0UL); - rc |= irqs_permit_access(dom0, 0, d->nr_pirqs - 1); + rc |= irqs_permit_access(dom0, 1, nr_irqs_gsi - 1); /* * Modify I/O port access permissions. 
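For concreteness, the single mask computation that replaces the per-vendor MTRR constants above works out as follows for the default paddr_bits = 36 and PAGE_SHIFT = 12 (a freestanding C illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    unsigned int paddr_bits = 36, page_shift = 12;
    uint64_t size_or_mask = ~((1ULL << (paddr_bits - page_shift)) - 1);
    uint64_t size_and_mask = ~size_or_mask & 0xfffff00000ULL;

    /* Prints ffffffffff000000 and 0000000000f00000; the low 32 bits
     * match the 0xff000000/0x00f00000 constants the old code hardcoded. */
    printf("%016llx %016llx\n",
           (unsigned long long)size_or_mask,
           (unsigned long long)size_and_mask);
    return 0;
}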
diff -Nru xen-4.1.3/xen/arch/x86/domain.c xen-4.1.5/xen/arch/x86/domain.c --- xen-4.1.3/xen/arch/x86/domain.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/domain.c 2013-04-23 18:44:20.000000000 +0200 @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_COMPAT @@ -457,6 +458,20 @@ #else /* __x86_64__ */ + if ( d->domain_id && !is_idle_domain(d) && + cpu_has_amd_erratum(&boot_cpu_data, AMD_ERRATUM_121) ) + { + if ( !opt_allow_unsafe ) + { + printk(XENLOG_G_ERR "Xen does not allow DomU creation on this CPU" + " for security reasons.\n"); + return -EPERM; + } + printk(XENLOG_G_WARNING + "Dom%d may compromise security on this CPU.\n", + d->domain_id); + } + BUILD_BUG_ON(PDPT_L2_ENTRIES * sizeof(*d->arch.mm_perdomain_pt_pages) != PAGE_SIZE); pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); @@ -681,6 +696,14 @@ { if ( !compat ) { +#ifdef __x86_64__ + if ( !is_canonical_address(c.nat->user_regs.eip) || + !is_canonical_address(c.nat->event_callback_eip) || + !is_canonical_address(c.nat->syscall_callback_eip) || + !is_canonical_address(c.nat->failsafe_callback_eip) ) + return -EINVAL; +#endif + fixup_guest_stack_selector(d, c.nat->user_regs.ss); fixup_guest_stack_selector(d, c.nat->kernel_ss); fixup_guest_code_selector(d, c.nat->user_regs.cs); @@ -690,7 +713,11 @@ #endif for ( i = 0; i < 256; i++ ) + { + if ( !is_canonical_address(c.nat->trap_ctxt[i].address) ) + return -EINVAL; fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs); + } /* LDT safety checks. */ if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) || diff -Nru xen-4.1.3/xen/arch/x86/domctl.c xen-4.1.5/xen/arch/x86/domctl.c --- xen-4.1.3/xen/arch/x86/domctl.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/domctl.c 2013-04-23 18:44:20.000000000 +0200 @@ -908,9 +908,13 @@ goto bind_out; ret = -EPERM; - if ( !IS_PRIV(current->domain) && - !irq_access_permitted(current->domain, bind->machine_irq) ) - goto bind_out; + if ( !IS_PRIV(current->domain) ) + { + int irq = domain_pirq_to_irq(d, bind->machine_irq); + + if ( irq <= 0 || !irq_access_permitted(current->domain, irq) ) + goto bind_out; + } ret = -ESRCH; if ( iommu_enabled ) @@ -938,9 +942,13 @@ bind = &(domctl->u.bind_pt_irq); ret = -EPERM; - if ( !IS_PRIV(current->domain) && - !irq_access_permitted(current->domain, bind->machine_irq) ) - goto unbind_out; + if ( !IS_PRIV(current->domain) ) + { + int irq = domain_pirq_to_irq(d, bind->machine_irq); + + if ( irq <= 0 || !irq_access_permitted(current->domain, irq) ) + goto unbind_out; + } if ( iommu_enabled ) { @@ -962,10 +970,12 @@ unsigned long gfn = domctl->u.memory_mapping.first_gfn; unsigned long mfn = domctl->u.memory_mapping.first_mfn; unsigned long nr_mfns = domctl->u.memory_mapping.nr_mfns; - int i; + unsigned long i; ret = -EINVAL; - if ( (mfn + nr_mfns - 1) < mfn ) /* wrap? */ + if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */ + ((mfn | (mfn + nr_mfns - 1)) >> (paddr_bits - PAGE_SHIFT)) || + (gfn + nr_mfns - 1) < gfn ) /* wrap? 
*/ break; ret = -EPERM; @@ -977,7 +987,6 @@ if ( unlikely((d = rcu_lock_domain_by_id(domctl->domain)) == NULL) ) break; - ret=0; if ( domctl->u.memory_mapping.add_mapping ) { gdprintk(XENLOG_INFO, @@ -985,18 +994,47 @@ gfn, mfn, nr_mfns); ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); - for ( i = 0; i < nr_mfns; i++ ) - set_mmio_p2m_entry(p2m_get_hostp2m(d), gfn+i, _mfn(mfn+i)); + if ( !ret && paging_mode_translate(d) ) + { + struct p2m_domain *p2m = p2m_get_hostp2m(d); + + for ( i = 0; !ret && i < nr_mfns; i++ ) + if ( !set_mmio_p2m_entry(p2m, gfn + i, _mfn(mfn + i)) ) + ret = -EIO; + if ( ret ) + { + printk(XENLOG_G_WARNING + "memory_map:fail: dom%d gfn=%lx mfn=%lx\n", + d->domain_id, gfn + i, mfn + i); + while ( i-- ) + clear_mmio_p2m_entry(p2m, gfn + i); + if ( iomem_deny_access(d, mfn, mfn + nr_mfns - 1) && + IS_PRIV(current->domain) ) + printk(XENLOG_ERR + "memory_map: failed to deny dom%d access to [%lx,%lx]\n", + d->domain_id, mfn, mfn + nr_mfns - 1); + } + } } else { + bool_t acc = 0; + gdprintk(XENLOG_INFO, "memory_map:remove: gfn=%lx mfn=%lx nr_mfns=%lx\n", gfn, mfn, nr_mfns); - for ( i = 0; i < nr_mfns; i++ ) - clear_mmio_p2m_entry(p2m_get_hostp2m(d), gfn+i); + if ( paging_mode_translate(d) ) + for ( i = 0; i < nr_mfns; i++ ) + acc |= !clear_mmio_p2m_entry(p2m_get_hostp2m(d), gfn + i); ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1); + if ( !ret && acc ) + ret = -EIO; + if ( ret && IS_PRIV(current->domain) ) + printk(XENLOG_ERR + "memory_map: error %ld %s dom%d access to [%lx,%lx]\n", + ret, acc ? "removing" : "denying", d->domain_id, + mfn, mfn + nr_mfns - 1); } rcu_unlock_domain(d); @@ -1015,7 +1053,7 @@ int found = 0; ret = -EINVAL; - if ( (np == 0) || (fgp > MAX_IOPORTS) || (fmp > MAX_IOPORTS) || + if ( ((fgp | fmp | (np - 1)) >= MAX_IOPORTS) || ((fgp + np) > MAX_IOPORTS) || ((fmp + np) > MAX_IOPORTS) ) { gdprintk(XENLOG_ERR, @@ -1048,15 +1086,27 @@ found = 1; break; } + ret = 0; if ( !found ) { g2m_ioport = xmalloc(struct g2m_ioport); + if ( !g2m_ioport ) + ret = -ENOMEM; + } + if ( !found && !ret ) + { g2m_ioport->gport = fgp; g2m_ioport->mport = fmp; g2m_ioport->np = np; list_add_tail(&g2m_ioport->list, &hd->g2m_ioport_list); } - ret = ioports_permit_access(d, fmp, fmp + np - 1); + if ( !ret ) + ret = ioports_permit_access(d, fmp, fmp + np - 1); + if ( ret && !found && g2m_ioport ) + { + list_del(&g2m_ioport->list); + xfree(g2m_ioport); + } } else { @@ -1071,6 +1121,10 @@ break; } ret = ioports_deny_access(d, fmp, fmp + np - 1); + if ( ret && IS_PRIV(current->domain) ) + printk(XENLOG_ERR + "ioport_map: error %ld denying dom%d access to [%x,%x]\n", + ret, d->domain_id, fmp, fmp + np - 1); } rcu_unlock_domain(d); } @@ -1147,6 +1201,9 @@ if ( evc->size != sizeof(*evc) ) goto ext_vcpucontext_out; #ifdef __x86_64__ + if ( !is_canonical_address(evc->sysenter_callback_eip) || + !is_canonical_address(evc->syscall32_callback_eip) ) + goto ext_vcpucontext_out; fixup_guest_code_selector(d, evc->sysenter_callback_cs); v->arch.sysenter_callback_cs = evc->sysenter_callback_cs; v->arch.sysenter_callback_eip = evc->sysenter_callback_eip; diff -Nru xen-4.1.3/xen/arch/x86/hpet.c xen-4.1.5/xen/arch/x86/hpet.c --- xen-4.1.3/xen/arch/x86/hpet.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/hpet.c 2013-04-23 18:44:20.000000000 +0200 @@ -262,7 +262,7 @@ ch = &hpet_events[ch_idx]; cfg = hpet_read32(HPET_Tn_CFG(ch->idx)); - cfg |= HPET_TN_FSB; + cfg |= HPET_TN_ENABLE; hpet_write32(cfg, HPET_Tn_CFG(ch->idx)); } @@ -276,7 +276,7 @@ ch = &hpet_events[ch_idx]; cfg = 
hpet_read32(HPET_Tn_CFG(ch->idx)); - cfg &= ~HPET_TN_FSB; + cfg &= ~HPET_TN_ENABLE; hpet_write32(cfg, HPET_Tn_CFG(ch->idx)); } @@ -367,8 +367,14 @@ int ret; struct msi_msg msg; struct hpet_event_channel *ch = &hpet_events[irq_to_channel(irq)]; + u32 cfg = hpet_read32(HPET_Tn_CFG(ch->idx)); irq_desc_t *desc = irq_to_desc(irq); + /* set HPET Tn as oneshot */ + cfg &= ~(HPET_TN_LEVEL | HPET_TN_PERIODIC); + cfg |= HPET_TN_FSB | HPET_TN_32BIT; + hpet_write32(cfg, HPET_Tn_CFG(ch->idx)); + if ( desc->handler == &no_irq_type ) { desc->handler = &hpet_msi_type; @@ -493,6 +499,16 @@ return ch; } +static void set_channel_irq_affinity(const struct hpet_event_channel *ch) +{ + struct irq_desc *desc = irq_to_desc(ch->irq); + + ASSERT(!local_irq_is_enabled()); + spin_lock(&desc->lock); + desc->handler->set_affinity(ch->irq, cpumask_of_cpu(ch->cpu)); + spin_unlock(&desc->lock); +} + static void hpet_attach_channel(int cpu, struct hpet_event_channel *ch) { ASSERT(spin_is_locked(&ch->lock)); @@ -506,9 +522,7 @@ if ( ch->cpu != cpu ) return; - /* set irq affinity */ - irq_desc[ch->irq].handler-> - set_affinity(ch->irq, cpumask_of_cpu(ch->cpu)); + set_channel_irq_affinity(ch); } static void hpet_detach_channel(int cpu, struct hpet_event_channel *ch) @@ -529,9 +543,7 @@ } ch->cpu = first_cpu(ch->cpumask); - /* set irq affinity */ - irq_desc[ch->irq].handler-> - set_affinity(ch->irq, cpumask_of_cpu(ch->cpu)); + set_channel_irq_affinity(ch); } #include @@ -587,12 +599,6 @@ for ( i = 0; i < num_hpets_used; i++ ) { - /* set HPET Tn as oneshot */ - cfg = hpet_read32(HPET_Tn_CFG(hpet_events[i].idx)); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_write32(cfg, HPET_Tn_CFG(hpet_events[i].idx)); - hpet_events[i].mult = div_sc((unsigned long)hpet_rate, 1000000000ul, 32); hpet_events[i].shift = 32; @@ -620,7 +626,7 @@ /* set HPET T0 as oneshot */ cfg = hpet_read32(HPET_T0_CFG); - cfg &= ~HPET_TN_PERIODIC; + cfg &= ~(HPET_TN_LEVEL | HPET_TN_PERIODIC); cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; hpet_write32(cfg, HPET_T0_CFG); diff -Nru xen-4.1.3/xen/arch/x86/hvm/hvm.c xen-4.1.5/xen/arch/x86/hvm/hvm.c --- xen-4.1.3/xen/arch/x86/hvm/hvm.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/hvm/hvm.c 2013-04-23 18:44:20.000000000 +0200 @@ -217,6 +217,31 @@ hvm_funcs.set_rdtsc_exiting(v, enable); } +void hvm_get_guest_pat(struct vcpu *v, u64 *guest_pat) +{ + if ( !hvm_funcs.get_guest_pat(v, guest_pat) ) + *guest_pat = v->arch.hvm_vcpu.pat_cr; +} + +int hvm_set_guest_pat(struct vcpu *v, u64 guest_pat) +{ + int i; + uint8_t *value = (uint8_t *)&guest_pat; + + for ( i = 0; i < 8; i++ ) + if ( unlikely(!(value[i] == 0 || value[i] == 1 || + value[i] == 4 || value[i] == 5 || + value[i] == 6 || value[i] == 7)) ) { + HVM_DBG_LOG(DBG_LEVEL_MSR, "invalid guest PAT: %"PRIx64"\n", + guest_pat); + return 0; + } + + if ( !hvm_funcs.set_guest_pat(v, guest_pat) ) + v->arch.hvm_vcpu.pat_cr = guest_pat; + return 1; +} + void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc) { uint64_t tsc; @@ -2394,7 +2419,7 @@ break; case MSR_IA32_CR_PAT: - *msr_content = v->arch.hvm_vcpu.pat_cr; + hvm_get_guest_pat(v, msr_content); break; case MSR_MTRRcap: @@ -2510,7 +2535,7 @@ break; case MSR_IA32_CR_PAT: - if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) ) + if ( !hvm_set_guest_pat(v, msr_content) ) goto gp_fault; break; @@ -3446,6 +3471,9 @@ if ( !is_hvm_domain(d) ) goto param_fail2; + if ( a.nr > GB(1) >> PAGE_SHIFT ) + goto param_fail2; + rc = xsm_hvm_param(d, op); if ( rc ) goto param_fail2; @@ -3473,7 +3501,6 
@@ struct xen_hvm_modified_memory a; struct domain *d; struct p2m_domain *p2m; - unsigned long pfn; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; @@ -3501,8 +3528,9 @@ goto param_fail3; p2m = p2m_get_hostp2m(d); - for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ ) + while ( a.nr > 0 ) { + unsigned long pfn = a.first_pfn; p2m_type_t t; mfn_t mfn = gfn_to_mfn(p2m, pfn, &t); if ( p2m_is_paging(t) ) @@ -3523,6 +3551,19 @@ /* don't take a long time and don't die either */ sh_remove_shadows(d->vcpu[0], mfn, 1, 0); } + + a.first_pfn++; + a.nr--; + + /* Check for continuation if it's not the last interation */ + if ( a.nr > 0 && hypercall_preempt_check() ) + { + if ( copy_to_guest(arg, &a, 1) ) + rc = -EFAULT; + else + rc = -EAGAIN; + break; + } } param_fail3: @@ -3553,6 +3594,10 @@ a.mem_type = HVMMEM_ram_ro; else if ( p2m_is_ram(t) ) a.mem_type = HVMMEM_ram_rw; + else if ( p2m_is_magic(t) ) + a.mem_type = HVMMEM_ram_rw; + else if ( p2m_is_grant(t) ) + a.mem_type = HVMMEM_ram_rw; else a.mem_type = HVMMEM_mmio_dm; rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0; @@ -3566,7 +3611,6 @@ struct xen_hvm_set_mem_type a; struct domain *d; struct p2m_domain *p2m; - unsigned long pfn; /* Interface types to internal p2m types */ p2m_type_t memtype[] = { @@ -3596,8 +3640,9 @@ goto param_fail4; p2m = p2m_get_hostp2m(d); - for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ ) + while ( a.nr > 0 ) { + unsigned long pfn = a.first_pfn; p2m_type_t t; p2m_type_t nt; mfn_t mfn; @@ -3633,6 +3678,19 @@ goto param_fail4; } } + + a.first_pfn++; + a.nr--; + + /* Check for continuation if it's not the last interation */ + if ( a.nr > 0 && hypercall_preempt_check() ) + { + if ( copy_to_guest(arg, &a, 1) ) + rc = -EFAULT; + else + rc = -EAGAIN; + goto param_fail4; + } } rc = 0; @@ -3670,7 +3728,7 @@ return rc; rc = -EINVAL; - if ( !is_hvm_domain(d) ) + if ( !is_hvm_domain(d) || a.hvmmem_access >= ARRAY_SIZE(memaccess) ) goto param_fail5; p2m = p2m_get_hostp2m(d); @@ -3690,9 +3748,6 @@ ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) ) goto param_fail5; - if ( a.hvmmem_access >= ARRAY_SIZE(memaccess) ) - goto param_fail5; - for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ ) { p2m_type_t t; diff -Nru xen-4.1.3/xen/arch/x86/hvm/mtrr.c xen-4.1.5/xen/arch/x86/hvm/mtrr.c --- xen-4.1.3/xen/arch/x86/hvm/mtrr.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/hvm/mtrr.c 2013-04-23 18:44:20.000000000 +0200 @@ -406,26 +406,6 @@ return pat_type_2_pte_flags(pat_entry_value); } -/* Helper funtions for seting mtrr/pat */ -bool_t pat_msr_set(uint64_t *pat, uint64_t msr_content) -{ - uint8_t *value = (uint8_t*)&msr_content; - int32_t i; - - if ( *pat != msr_content ) - { - for ( i = 0; i < 8; i++ ) - if ( unlikely(!(value[i] == 0 || value[i] == 1 || - value[i] == 4 || value[i] == 5 || - value[i] == 6 || value[i] == 7)) ) - return 0; - - *pat = msr_content; - } - - return 1; -} - bool_t mtrr_def_type_msr_set(struct mtrr_state *m, uint64_t msr_content) { uint8_t def_type = msr_content & 0xff; @@ -636,7 +616,7 @@ { mtrr_state = &v->arch.hvm_vcpu.mtrr; - hw_mtrr.msr_pat_cr = v->arch.hvm_vcpu.pat_cr; + hvm_get_guest_pat(v, &hw_mtrr.msr_pat_cr); hw_mtrr.msr_mtrr_def_type = mtrr_state->def_type | (mtrr_state->enabled << 10); @@ -681,7 +661,7 @@ mtrr_state = &v->arch.hvm_vcpu.mtrr; - pat_msr_set(&v->arch.hvm_vcpu.pat_cr, hw_mtrr.msr_pat_cr); + hvm_set_guest_pat(v, hw_mtrr.msr_pat_cr); mtrr_state->mtrr_cap = hw_mtrr.msr_mtrr_cap; diff -Nru xen-4.1.3/xen/arch/x86/hvm/svm/svm.c 
xen-4.1.5/xen/arch/x86/hvm/svm/svm.c --- xen-4.1.3/xen/arch/x86/hvm/svm/svm.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/hvm/svm/svm.c 2013-04-23 18:44:20.000000000 +0200 @@ -585,6 +585,28 @@ svm_vmload(vmcb); } +static int svm_set_guest_pat(struct vcpu *v, u64 gpat) +{ + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + + if ( !paging_mode_hap(v->domain) ) + return 0; + + vmcb_set_g_pat(vmcb, gpat); + return 1; +} + +static int svm_get_guest_pat(struct vcpu *v, u64 *gpat) +{ + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + + if ( !paging_mode_hap(v->domain) ) + return 0; + + *gpat = vmcb_get_g_pat(vmcb); + return 1; +} + static void svm_set_tsc_offset(struct vcpu *v, u64 offset) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; @@ -1519,6 +1541,8 @@ .update_host_cr3 = svm_update_host_cr3, .update_guest_cr = svm_update_guest_cr, .update_guest_efer = svm_update_guest_efer, + .set_guest_pat = svm_set_guest_pat, + .get_guest_pat = svm_get_guest_pat, .set_tsc_offset = svm_set_tsc_offset, .inject_exception = svm_inject_exception, .init_hypercall_page = svm_init_hypercall_page, diff -Nru xen-4.1.3/xen/arch/x86/hvm/vlapic.c xen-4.1.5/xen/arch/x86/hvm/vlapic.c --- xen-4.1.3/xen/arch/x86/hvm/vlapic.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/hvm/vlapic.c 2013-04-23 18:44:20.000000000 +0200 @@ -660,10 +660,9 @@ break; case APIC_SELF_IPI: - if ( vlapic_x2apic_mode(vlapic) ) - vlapic_reg_write(v, APIC_ICR, 0x40000 | (val & 0xff)); - else - rc = X86EMUL_UNHANDLEABLE; + rc = vlapic_x2apic_mode(vlapic) + ? vlapic_reg_write(v, APIC_ICR, 0x40000 | (val & 0xff)) + : X86EMUL_UNHANDLEABLE; break; case APIC_ICR: @@ -818,19 +817,18 @@ { struct vlapic *vlapic = vcpu_vlapic(v); uint32_t offset = (msr - MSR_IA32_APICBASE_MSR) << 4; - int rc; if ( !vlapic_x2apic_mode(vlapic) ) - return 1; + return X86EMUL_UNHANDLEABLE; if ( offset == APIC_ICR ) - if ( vlapic_reg_write(v, APIC_ICR2 , (uint32_t)(msr_content >> 32)) ) - return 1; - - rc = vlapic_reg_write(v, offset, (uint32_t)msr_content); + { + int rc = vlapic_reg_write(v, APIC_ICR2, (uint32_t)(msr_content >> 32)); + if ( rc ) + return rc; + } - /* X86EMUL_RETRY for SIPI */ - return ((rc != X86EMUL_OKAY) && (rc != X86EMUL_RETRY)); + return vlapic_reg_write(v, offset, (uint32_t)msr_content); } static int vlapic_range(struct vcpu *v, unsigned long addr) diff -Nru xen-4.1.3/xen/arch/x86/hvm/vmx/vmx.c xen-4.1.5/xen/arch/x86/hvm/vmx/vmx.c --- xen-4.1.3/xen/arch/x86/hvm/vmx/vmx.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/hvm/vmx/vmx.c 2013-04-23 18:44:20.000000000 +0200 @@ -921,6 +921,34 @@ vmx_vmcs_exit(v); } +static int vmx_set_guest_pat(struct vcpu *v, u64 gpat) +{ + if ( !cpu_has_vmx_pat || !paging_mode_hap(v->domain) ) + return 0; + + vmx_vmcs_enter(v); + __vmwrite(GUEST_PAT, gpat); +#ifdef __i386__ + __vmwrite(GUEST_PAT_HIGH, gpat >> 32); +#endif + vmx_vmcs_exit(v); + return 1; +} + +static int vmx_get_guest_pat(struct vcpu *v, u64 *gpat) +{ + if ( !cpu_has_vmx_pat || !paging_mode_hap(v->domain) ) + return 0; + + vmx_vmcs_enter(v); + *gpat = __vmread(GUEST_PAT); +#ifdef __i386__ + *gpat |= (u64)__vmread(GUEST_PAT_HIGH) << 32; +#endif + vmx_vmcs_exit(v); + return 1; +} + static void vmx_set_tsc_offset(struct vcpu *v, u64 offset) { vmx_vmcs_enter(v); @@ -1064,20 +1092,18 @@ if ( paging_mode_hap(v->domain) ) { - /* We manage GUEST_CR3 when guest CR0.PE is zero or when cr3 memevents are on */ + /* Manage GUEST_CR3 when CR0.PE=0. 
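The {svm,vmx}_set_guest_pat hooks in this hunk can trust their input because hvm_set_guest_pat(), added earlier in the patch, filters it first: each of the eight PAT bytes must encode UC(0), WC(1), WT(4), WP(5), WB(6) or UC-(7), with 2 and 3 architecturally reserved. A standalone restatement of that filter:

static int pat_value_valid(uint64_t pat)
{
    unsigned int i;

    for (i = 0; i < 8; i++) {
        uint8_t type = (pat >> (i * 8)) & 0xff;

        if (type == 2 || type == 3 || type > 7)
            return 0; /* reserved PAT memory type */
    }
    return 1;
}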
*/ uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); v->arch.hvm_vmx.exec_control &= ~cr3_ctls; if ( !hvm_paging_enabled(v) ) v->arch.hvm_vmx.exec_control |= cr3_ctls; + /* Trap CR3 updates if CR3 memory events are enabled. */ if ( v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_CR3] ) v->arch.hvm_vmx.exec_control |= CPU_BASED_CR3_LOAD_EXITING; vmx_update_cpu_exec_control(v); - - /* Changing CR0.PE can change some bits in real CR4. */ - vmx_update_guest_cr(v, 4); } if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) @@ -1107,8 +1133,6 @@ { for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ ) vmx_set_segment_register(v, s, ®[s]); - v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME; - __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]); v->arch.hvm_vmx.exception_bitmap = 0xffffffff; vmx_update_exception_bitmap(v); } @@ -1118,10 +1142,6 @@ if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<arch.hvm_vmx.vm86_saved_seg[s]); - v->arch.hvm_vcpu.hw_cr[4] = - ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME) - |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME)); - __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]); v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK | (paging_mode_hap(v->domain) ? 0 : (1U << TRAP_page_fault)) @@ -1135,6 +1155,9 @@ v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask; __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]); __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]); + + /* Changing CR0 can change some bits in real CR4. */ + vmx_update_guest_cr(v, 4); break; } case 2: @@ -1164,6 +1187,16 @@ v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE; v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE; } + if ( !hvm_paging_enabled(v) ) + { + /* + * SMEP is disabled if CPU is in non-paging mode in hardware. + * However Xen always uses paging mode to emulate guest non-paging + * mode. To emulate this behavior, SMEP needs to be manually + * disabled when guest VCPU is in non-paging mode. + */ + v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_SMEP; + } __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]); __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]); break; @@ -1384,6 +1417,8 @@ .update_host_cr3 = vmx_update_host_cr3, .update_guest_cr = vmx_update_guest_cr, .update_guest_efer = vmx_update_guest_efer, + .set_guest_pat = vmx_set_guest_pat, + .get_guest_pat = vmx_get_guest_pat, .set_tsc_offset = vmx_set_tsc_offset, .inject_exception = vmx_inject_exception, .init_hypercall_page = vmx_init_hypercall_page, @@ -2124,6 +2159,13 @@ vector = intr_info & INTR_INFO_VECTOR_MASK; if ( vector == TRAP_machine_check ) do_machine_check(regs); + if ( vector == TRAP_nmi + && ((intr_info & INTR_INFO_INTR_TYPE_MASK) == + (X86_EVENTTYPE_NMI << 8)) ) + { + do_nmi(regs); + enable_nmis(); + } break; case EXIT_REASON_MCE_DURING_VMENTRY: do_machine_check(regs); @@ -2297,7 +2339,7 @@ (X86_EVENTTYPE_NMI << 8) ) goto exit_and_crash; HVMTRACE_0D(NMI); - self_nmi(); /* Real NMI, vector 2: normal processing. */ + /* Already handled above. 
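The NMI rework in this hunk keys off VM_EXIT_INTR_INFO, where bits 7:0 carry the vector and bits 10:8 the event type. A minimal decoding sketch; the constants mirror the names used above and the usual Intel layout, so treat their values as assumptions:

#define INTR_INFO_VECTOR_MASK    0xff   /* bits 7:0 */
#define INTR_INFO_INTR_TYPE_MASK 0x700  /* bits 10:8 */
#define X86_EVENTTYPE_NMI        2
#define TRAP_nmi                 2

static int exit_was_nmi(unsigned int intr_info)
{
    return (intr_info & INTR_INFO_VECTOR_MASK) == TRAP_nmi &&
           (intr_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI << 8);
}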
*/ break; case TRAP_machine_check: HVMTRACE_0D(MCE); diff -Nru xen-4.1.3/xen/arch/x86/i8259.c xen-4.1.5/xen/arch/x86/i8259.c --- xen-4.1.3/xen/arch/x86/i8259.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/i8259.c 2013-04-23 18:44:20.000000000 +0200 @@ -395,6 +395,8 @@ struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg = desc->chip_data; + if ( irq == 2 ) /* IRQ2 doesn't exist */ + continue; desc->handler = &i8259A_irq_type; per_cpu(vector_irq, cpu)[FIRST_LEGACY_VECTOR + irq] = irq; cfg->cpu_mask= cpumask_of_cpu(cpu); diff -Nru xen-4.1.3/xen/arch/x86/io_apic.c xen-4.1.5/xen/arch/x86/io_apic.c --- xen-4.1.3/xen/arch/x86/io_apic.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/io_apic.c 2013-04-23 18:44:20.000000000 +0200 @@ -499,7 +499,9 @@ irq_enter(); me = smp_processor_id(); - for (vector = FIRST_DYNAMIC_VECTOR; vector < NR_VECTORS; vector++) { + for ( vector = FIRST_DYNAMIC_VECTOR; + vector <= LAST_HIPRIORITY_VECTOR; vector++) + { unsigned int irq; unsigned int irr; struct irq_desc *desc; @@ -509,6 +511,9 @@ if (irq == -1) continue; + if ( vector >= FIRST_LEGACY_VECTOR && vector <= LAST_LEGACY_VECTOR ) + continue; + desc = irq_to_desc(irq); if (!desc) continue; @@ -530,7 +535,7 @@ * to myself. */ if (irr & (1 << (vector % 32))) { - genapic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); goto unlock; } __get_cpu_var(vector_irq)[vector] = -1; @@ -556,7 +561,7 @@ cpus_and(cleanup_mask, cfg->old_cpu_mask, cpu_online_map); cfg->move_cleanup_count = cpus_weight(cleanup_mask); - genapic->send_IPI_mask(&cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + send_IPI_mask(&cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); cfg->move_in_progress = 0; } @@ -1094,18 +1099,17 @@ else add_pin_to_irq(irq, apic, pin); - if (!apic && !IO_APIC_IRQ(irq)) + if (!IO_APIC_IRQ(irq)) continue; - if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); - BUG_ON(vector < 0); - entry.vector = vector; - ioapic_register_intr(irq, IOAPIC_AUTO); + vector = assign_irq_vector(irq); + BUG_ON(vector < 0); + entry.vector = vector; + ioapic_register_intr(irq, IOAPIC_AUTO); + + if (platform_legacy_irq(irq)) + disable_8259A_irq(irq); - if (!apic && platform_legacy_irq(irq)) - disable_8259A_irq(irq); - } cfg = irq_cfg(irq); SET_DEST(entry.dest.dest32, entry.dest.logical.logical_dest, cpu_mask_to_apicid(&cfg->cpu_mask)); @@ -2407,18 +2411,15 @@ static int apic_pin_2_gsi_irq(int apic, int pin) { - int idx, irq; + int idx; if (apic < 0) return -EINVAL; - irq = apic_gsi_base(apic) + pin; - if (apic == 0) { - idx = find_irq_entry(apic, pin, mp_INT); - if (idx >= 0) - irq = pin_2_irq(idx, apic, pin); - } - return irq; + idx = find_irq_entry(apic, pin, mp_INT); + + return idx >= 0 ? 
pin_2_irq(idx, apic, pin) + : apic_gsi_base(apic) + pin; } int ioapic_guest_read(unsigned long physbase, unsigned int reg, u32 *pval) diff -Nru xen-4.1.3/xen/arch/x86/irq.c xen-4.1.5/xen/arch/x86/irq.c --- xen-4.1.3/xen/arch/x86/irq.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/irq.c 2013-04-23 18:44:20.000000000 +0200 @@ -174,6 +174,15 @@ out: spin_unlock_irqrestore(&vector_lock, flags); + if ( irq > 0 && dom0 ) + { + ret = irq_permit_access(dom0, irq); + if ( ret ) + printk(XENLOG_G_ERR + "Could not grant Dom0 access to IRQ%d (error %d)\n", + irq, ret); + } + return irq; } @@ -258,6 +267,17 @@ void destroy_irq(unsigned int irq) { BUG_ON(!MSI_IRQ(irq)); + + if ( dom0 ) + { + int err = irq_deny_access(dom0, irq); + + if ( err ) + printk(XENLOG_G_ERR + "Could not revoke Dom0 access to IRQ%u (error %d)\n", + irq, err); + } + dynamic_irq_cleanup(irq); clear_irq_vector(irq); } @@ -1604,7 +1624,7 @@ if ( !IS_PRIV(current->domain) && !(IS_PRIV_FOR(current->domain, d) && - irq_access_permitted(current->domain, pirq))) + irq_access_permitted(current->domain, irq))) return -EPERM; if ( pirq < 0 || pirq >= d->nr_pirqs || irq < 0 || irq >= nr_irqs ) @@ -1625,11 +1645,12 @@ return 0; } - ret = irq_permit_access(d, pirq); + ret = irq_permit_access(d, irq); if ( ret ) { - dprintk(XENLOG_G_ERR, "dom%d: could not permit access to irq %d\n", - d->domain_id, pirq); + printk(XENLOG_G_ERR + "dom%d: could not permit access to IRQ%d (pirq %d)\n", + d->domain_id, irq, pirq); return ret; } @@ -1651,8 +1672,14 @@ spin_lock_irqsave(&desc->lock, flags); if ( desc->handler != &no_irq_type ) + { + spin_unlock_irqrestore(&desc->lock, flags); dprintk(XENLOG_G_ERR, "dom%d: irq %d in use\n", d->domain_id, irq); + pci_disable_msi(msi_desc); + ret = -EBUSY; + goto done; + } desc->handler = &pci_msi_type; if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV && !desc->chip_data->used_vectors ) @@ -1677,12 +1704,13 @@ d->arch.pirq_irq[pirq] = irq; d->arch.irq_pirq[irq] = pirq; spin_unlock_irqrestore(&desc->lock, flags); - - if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV ) - printk(XENLOG_INFO "Per-device vector maps for GSIs not implemented yet.\n"); } done: + if ( ret && irq_deny_access(d, irq) ) + printk(XENLOG_G_ERR + "dom%d: could not revoke access to IRQ%d (pirq %d)\n", + d->domain_id, irq, pirq); return ret; } @@ -1739,10 +1767,11 @@ if (msi_desc) msi_free_irq(msi_desc); - ret = irq_deny_access(d, pirq); + ret = irq_deny_access(d, irq); if ( ret ) - dprintk(XENLOG_G_ERR, "dom%d: could not deny access to irq %d\n", - d->domain_id, pirq); + printk(XENLOG_G_ERR + "dom%d: could not deny access to IRQ%d (pirq %d)\n", + d->domain_id, irq, pirq); if ( desc->handler == &pci_msi_type ) desc->handler = &no_irq_type; diff -Nru xen-4.1.3/xen/arch/x86/mm/hap/hap.c xen-4.1.5/xen/arch/x86/mm/hap/hap.c --- xen-4.1.3/xen/arch/x86/mm/hap/hap.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/mm/hap/hap.c 2013-04-23 18:44:20.000000000 +0200 @@ -686,6 +686,9 @@ d->arch.paging.mode &= ~PG_log_dirty; + xfree(d->arch.hvm_domain.dirty_vram); + d->arch.hvm_domain.dirty_vram = NULL; + hap_unlock(d); } diff -Nru xen-4.1.3/xen/arch/x86/mm/hap/p2m-ept.c xen-4.1.5/xen/arch/x86/mm/hap/p2m-ept.c --- xen-4.1.3/xen/arch/x86/mm/hap/p2m-ept.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/mm/hap/p2m-ept.c 2013-04-23 18:44:20.000000000 +0200 @@ -449,7 +449,7 @@ } /* Track the highest gfn for which we have ever had a valid mapping */ - if ( mfn_valid(mfn_x(mfn)) && + if ( p2mt != p2m_invalid && (gfn + 
(1UL << order) - 1 > p2m->max_mapped_pfn) ) p2m->max_mapped_pfn = gfn + (1UL << order) - 1; @@ -542,13 +542,13 @@ } /* Populate this superpage */ - ASSERT(i == 1); + ASSERT(i <= 2); index = gfn_remainder >> ( i * EPT_TABLE_ORDER); ept_entry = table + index; - if ( !ept_pod_check_and_populate(p2m, gfn, - ept_entry, 9, q) ) + if ( !ept_pod_check_and_populate(p2m, gfn, ept_entry, + i * EPT_TABLE_ORDER, q) ) goto retry; else goto out; diff -Nru xen-4.1.3/xen/arch/x86/mm/p2m.c xen-4.1.5/xen/arch/x86/mm/p2m.c --- xen-4.1.3/xen/arch/x86/mm/p2m.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/mm/p2m.c 2013-04-23 18:44:20.000000000 +0200 @@ -327,7 +327,7 @@ static int p2m_pod_cache_add(struct p2m_domain *p2m, struct page_info *page, - unsigned long order) + unsigned int order) { int i; struct page_info *p; @@ -341,7 +341,7 @@ /* Check to make sure this is a contiguous region */ if( mfn_x(mfn) & ((1 << order) - 1) ) { - printk("%s: mfn %lx not aligned order %lu! (mask %lx)\n", + printk("%s: mfn %lx not aligned order %u! (mask %lx)\n", __func__, mfn_x(mfn), order, ((1UL << order) - 1)); return -1; } @@ -413,7 +413,7 @@ * a superpage is requested and no superpages are available. Must be called * with the d->page_lock held. */ static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m, - unsigned long order) + unsigned int order) { struct page_info *p = NULL; int i; @@ -495,7 +495,7 @@ goto retry; } - printk("%s: Unable to allocate domheap page for pod cache. target %lu cachesize %d\n", + printk("%s: Unable to allocate page for PoD cache (target=%lu cache=%ld)\n", __func__, pod_target, p2m->pod.count); ret = -ENOMEM; goto out; @@ -604,15 +604,15 @@ int p2m_pod_set_mem_target(struct domain *d, unsigned long target) { - unsigned pod_target; struct p2m_domain *p2m = p2m_get_hostp2m(d); int ret = 0; - unsigned long populated; + unsigned long populated, pod_target; p2m_lock(p2m); - /* P == B: Nothing to do. */ - if ( p2m->pod.entry_count == 0 ) + /* P == B: Nothing to do (unless the guest is being created). */ + populated = d->tot_pages - p2m->pod.count; + if ( populated > 0 && p2m->pod.entry_count == 0 ) goto out; /* Don't do anything if the domain is being torn down */ @@ -624,13 +624,11 @@ if ( target < d->tot_pages ) goto out; - populated = d->tot_pages - p2m->pod.count; - pod_target = target - populated; /* B < T': Set the cache size equal to # of outstanding entries, * let the balloon driver fill in the rest. */ - if ( pod_target > p2m->pod.entry_count ) + if ( populated > 0 && pod_target > p2m->pod.entry_count ) pod_target = p2m->pod.entry_count; ASSERT( pod_target >= p2m->pod.count ); @@ -885,7 +883,8 @@ void p2m_pod_dump_data(struct p2m_domain *p2m) { - printk(" PoD entries=%d cachesize=%d\n", + + printk(" PoD entries=%ld cachesize=%ld\n", p2m->pod.entry_count, p2m->pod.count); } @@ -1316,8 +1315,9 @@ out_of_memory: spin_unlock(&d->page_alloc_lock); - printk("%s: Out of populate-on-demand memory! tot_pages %" PRIu32 " pod_entries %" PRIi32 "\n", - __func__, d->tot_pages, p2m->pod.entry_count); + printk("%s: Dom%d out of PoD memory! 
(tot=%"PRIu32" ents=%ld dom%d)\n", + __func__, d->domain_id, d->tot_pages, p2m->pod.entry_count, + current->domain->domain_id); domain_crash(d); out_fail: return -1; @@ -1521,7 +1521,7 @@ } /* Track the highest gfn for which we have ever had a valid mapping */ - if ( mfn_valid(mfn) + if ( p2mt != p2m_invalid && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) ) p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1; @@ -2074,8 +2074,7 @@ { struct page_info *page; struct domain *od; - unsigned long mfn, gfn, m2pfn, lp2mfn = 0; - int entry_count = 0; + unsigned long mfn, gfn, m2pfn, lp2mfn = 0, entry_count = 0; mfn_t p2mfn; unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0; int test_linear; @@ -2314,7 +2313,7 @@ if ( entry_count != p2m->pod.entry_count ) { - printk("%s: refcounted entry count %d, audit count %d!\n", + printk("%s: refcounted entry count %ld, audit count %lu!\n", __func__, p2m->pod.entry_count, entry_count); @@ -2408,13 +2407,16 @@ unsigned int order) { struct p2m_domain *p2m = p2m_get_hostp2m(d); - unsigned long i; + unsigned long i, pod_count = 0; p2m_type_t ot; mfn_t omfn; - int pod_count = 0; int rc = 0; - BUG_ON(!paging_mode_translate(d)); + if ( !IS_PRIV_FOR(current->domain, d) ) + return -EPERM; + + if ( !paging_mode_translate(d) ) + return -EINVAL; rc = gfn_check_limit(d, gfn, order); if ( rc != 0 ) @@ -2431,8 +2433,7 @@ omfn = gfn_to_mfn_query(p2m, gfn + i, &ot); if ( p2m_is_ram(ot) ) { - printk("%s: gfn_to_mfn returned type %d!\n", - __func__, ot); + P2M_DEBUG("gfn_to_mfn returned type %d!\n", ot); rc = -EBUSY; goto out; } @@ -2454,10 +2455,10 @@ BUG_ON(p2m->pod.entry_count < 0); } +out: audit_p2m(p2m, 1); p2m_unlock(p2m); -out: return rc; } @@ -2559,7 +2560,10 @@ if ( mfn_valid(_mfn(mfn)) ) { if ( !set_p2m_entry(p2m, gfn, _mfn(mfn), page_order, t, p2m->default_access) ) + { rc = -EINVAL; + goto out; /* Failed to update p2m, bail without updating m2p. */ + } if ( !p2m_is_grant(t) ) { for ( i = 0; i < (1UL << page_order); i++ ) @@ -2580,6 +2584,7 @@ } } +out: audit_p2m(p2m, 1); p2m_unlock(p2m); diff -Nru xen-4.1.3/xen/arch/x86/mm/paging.c xen-4.1.5/xen/arch/x86/mm/paging.c --- xen-4.1.3/xen/arch/x86/mm/paging.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/mm/paging.c 2013-04-23 18:44:20.000000000 +0200 @@ -529,13 +529,19 @@ if ( !d->arch.paging.log_dirty.fault_count && !d->arch.paging.log_dirty.dirty_count ) { - int size = (nr + BITS_PER_LONG - 1) / BITS_PER_LONG; - unsigned long zeroes[size]; - memset(zeroes, 0x00, size * BYTES_PER_LONG); + static uint8_t zeroes[PAGE_SIZE]; + int off, size; + + size = ((nr + BITS_PER_LONG - 1) / BITS_PER_LONG) * sizeof (long); rv = 0; - if ( copy_to_guest_offset(dirty_bitmap, 0, (uint8_t *) zeroes, - size * BYTES_PER_LONG) != 0 ) - rv = -EFAULT; + off = 0; + while ( !rv && off < size ) + { + int todo = min(size - off, (int) PAGE_SIZE); + if ( copy_to_guest_offset(dirty_bitmap, off, zeroes, todo) ) + rv = -EFAULT; + off += todo; + } goto out; } d->arch.paging.log_dirty.fault_count = 0; diff -Nru xen-4.1.3/xen/arch/x86/mm/shadow/common.c xen-4.1.5/xen/arch/x86/mm/shadow/common.c --- xen-4.1.3/xen/arch/x86/mm/shadow/common.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/mm/shadow/common.c 2013-04-23 18:44:20.000000000 +0200 @@ -2549,10 +2549,13 @@ { /* Don't complain if we're in HVM and there are some extra mappings: * The qemu helper process has an untyped mapping of this dom's RAM - * and the HVM restore program takes another. */ + * and the HVM restore program takes another. 
+ * Also allow one typed refcount for xenheap pages, to match + * share_xen_page_with_guest(). */ if ( !(shadow_mode_external(v->domain) && (page->count_info & PGC_count_mask) <= 3 - && (page->u.inuse.type_info & PGT_count_mask) == 0) ) + && ((page->u.inuse.type_info & PGT_count_mask) + == !!is_xen_heap_page(page))) ) { SHADOW_ERROR("can't find all mappings of mfn %lx: " "c=%08lx t=%08lx\n", mfn_x(gmfn), diff -Nru xen-4.1.3/xen/arch/x86/mm/shadow/multi.c xen-4.1.5/xen/arch/x86/mm/shadow/multi.c --- xen-4.1.3/xen/arch/x86/mm/shadow/multi.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/mm/shadow/multi.c 2013-04-23 18:44:20.000000000 +0200 @@ -3674,7 +3674,7 @@ perfc_incr(shadow_invlpg_fault); return 0; } - if ( (!shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) ) + if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) ) return 0; } #else /* SHADOW_PAGING_LEVELS == 3 */ @@ -4737,8 +4737,12 @@ } for ( i = 0; i < 4; i++ ) { - if ( fast_path ) - smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[i])); + if ( fast_path ) { + if ( pagetable_is_null(v->arch.shadow_table[i]) ) + smfn = _mfn(INVALID_MFN); + else + smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[i])); + } else { /* retrieving the l2s */ diff -Nru xen-4.1.3/xen/arch/x86/mm.c xen-4.1.5/xen/arch/x86/mm.c --- xen-4.1.3/xen/arch/x86/mm.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/mm.c 2013-04-23 18:44:20.000000000 +0200 @@ -91,7 +91,7 @@ #include #include #include -#include +#include #include #include #include @@ -113,6 +113,7 @@ #include #include #include +#include #include #include #include @@ -279,7 +280,7 @@ * their domain field set to dom_xen. */ dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0); - BUG_ON(dom_xen == NULL); + BUG_ON(IS_ERR(dom_xen)); /* * Initialise our DOMID_IO domain. @@ -287,14 +288,14 @@ * array. Mappings occur at the priv of the caller. */ dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0); - BUG_ON(dom_io == NULL); + BUG_ON(IS_ERR(dom_io)); /* - * Initialise our DOMID_IO domain. + * Initialise our COW domain. * This domain owns sharable pages. */ dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0); - BUG_ON(dom_cow == NULL); + BUG_ON(IS_ERR(dom_cow)); /* First 1MB of RAM is historically marked as I/O. */ for ( i = 0; i < 0x100; i++ ) @@ -401,7 +402,7 @@ if ( is_hvm_domain(d) ) return p2m_get_hostp2m(d)->max_mapped_pfn; /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */ - return arch_get_max_pfn(d) - 1; + return (arch_get_max_pfn(d) ?: 1) - 1; } void share_xen_page_with_guest( @@ -827,6 +828,16 @@ return 0; } + if ( pg_owner != l1e_owner && + !iomem_access_permitted(l1e_owner, mfn, mfn) ) + { + if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? 
*/ + MEM_LOG("Dom%u attempted to map I/O space %08lx in dom%u to dom%u", + curr->domain->domain_id, mfn, pg_owner->domain_id, + l1e_owner->domain_id); + return 0; + } + if ( !(l1f & _PAGE_RW) || IS_PRIV(pg_owner) || !rangeset_contains_singleton(mmio_ro_ranges, mfn) ) return 1; @@ -2893,7 +2904,7 @@ #define fixunmap_domain_page(ptr) ((void)(ptr)) #endif -int do_mmuext_op( +long do_mmuext_op( XEN_GUEST_HANDLE(mmuext_op_t) uops, unsigned int count, XEN_GUEST_HANDLE(uint) pdone, @@ -3346,7 +3357,7 @@ return rc; } -int do_mmu_update( +long do_mmu_update( XEN_GUEST_HANDLE(mmu_update_t) ureqs, unsigned int count, XEN_GUEST_HANDLE(uint) pdone, @@ -4344,15 +4355,15 @@ return rc; } -int do_update_va_mapping(unsigned long va, u64 val64, - unsigned long flags) +long do_update_va_mapping(unsigned long va, u64 val64, + unsigned long flags) { return __do_update_va_mapping(va, val64, flags, current->domain); } -int do_update_va_mapping_otherdomain(unsigned long va, u64 val64, - unsigned long flags, - domid_t domid) +long do_update_va_mapping_otherdomain(unsigned long va, u64 val64, + unsigned long flags, + domid_t domid) { struct domain *pg_owner; int rc; @@ -5683,6 +5694,25 @@ memguard_unguard_range(p, PAGE_SIZE); } +const unsigned long *__init get_platform_badpages(unsigned int *array_size) +{ + u32 igd_id; + static unsigned long __initdata bad_pages[] = { + 0x20050000, + 0x20110000, + 0x20130000, + 0x20138000, + 0x40004000, + }; + + *array_size = ARRAY_SIZE(bad_pages); + igd_id = pci_conf_read32(0, 2, 0, 0); + if ( !IS_SNB_GFX(igd_id) ) + return NULL; + + return bad_pages; +} + /* * Local variables: * mode: C diff -Nru xen-4.1.3/xen/arch/x86/msi.c xen-4.1.5/xen/arch/x86/msi.c --- xen-4.1.3/xen/arch/x86/msi.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/msi.c 2013-04-23 18:44:20.000000000 +0200 @@ -201,7 +201,7 @@ BUG(); } - if ( iommu_enabled ) + if ( iommu_intremap ) iommu_read_msi_from_ire(entry, msg); } @@ -222,8 +222,11 @@ { entry->msg = *msg; - if ( iommu_enabled ) + if ( iommu_intremap ) + { + ASSERT(msg != &entry->msg); iommu_update_ire_from_msi(entry, msg); + } switch ( entry->msi_attrib.type ) { @@ -435,7 +438,7 @@ } /* Free the unused IRTE if intr remap enabled */ - if ( iommu_enabled ) + if ( iommu_intremap ) iommu_update_ire_from_msi(entry, NULL); list_del(&entry->list); @@ -599,8 +602,8 @@ * @entries: pointer to an array of struct msix_entry entries * @nvec: number of @entries * - * Setup the MSI-X capability structure of device function with a - * single MSI-X irq. A return of zero indicates the successful setup of + * Setup the MSI-X capability structure of device function with the requested + * number MSI-X irqs. A return of zero indicates the successful setup of * requested MSI-X entries with allocated irqs or non-zero for otherwise. 
**/ static int msix_capability_init(struct pci_dev *dev, @@ -608,84 +611,67 @@ struct msi_desc **desc, unsigned int nr_entries) { - struct msi_desc *entry; - int pos; + struct msi_desc *entry = NULL; + int pos, vf; u16 control; - u64 table_paddr, entry_paddr; - u32 table_offset, entry_offset; - u8 bir; - void __iomem *base; - int idx; + u64 table_paddr; + u32 table_offset; + u8 bir, pbus, pslot, pfunc; u8 bus = dev->bus; u8 slot = PCI_SLOT(dev->devfn); u8 func = PCI_FUNC(dev->devfn); ASSERT(spin_is_locked(&pcidevs_lock)); - ASSERT(desc); pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX); control = pci_conf_read16(bus, slot, func, msix_control_reg(pos)); msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */ - /* MSI-X Table Initialization */ - entry = alloc_msi_entry(); - if ( !entry ) - return -ENOMEM; + if ( desc ) + { + entry = alloc_msi_entry(); + if ( !entry ) + return -ENOMEM; + ASSERT(msi); + } - /* Request & Map MSI-X table region */ + /* Locate MSI-X table region */ table_offset = pci_conf_read32(bus, slot, func, msix_table_offset_reg(pos)); bir = (u8)(table_offset & PCI_MSIX_BIRMASK); table_offset &= ~PCI_MSIX_BIRMASK; - entry_offset = msi->entry_nr * PCI_MSIX_ENTRY_SIZE; - table_paddr = msi->table_base + table_offset; - entry_paddr = table_paddr + entry_offset; - idx = msix_get_fixmap(dev, table_paddr, entry_paddr); - if ( idx < 0 ) - { - xfree(entry); - return idx; - } - base = (void *)(fix_to_virt(idx) + - ((unsigned long)entry_paddr & ((1UL << PAGE_SHIFT) - 1))); - - entry->msi_attrib.type = PCI_CAP_ID_MSIX; - entry->msi_attrib.is_64 = 1; - entry->msi_attrib.entry_nr = msi->entry_nr; - entry->msi_attrib.maskbit = 1; - entry->msi_attrib.masked = 1; - entry->msi_attrib.pos = pos; - entry->irq = msi->irq; - entry->dev = dev; - entry->mask_base = base; - - list_add_tail(&entry->list, &dev->msi_list); - - if ( !dev->msix_nr_entries ) + if ( !dev->info.is_virtfn ) { - u8 pbus, pslot, pfunc; - int vf; - u64 pba_paddr; - u32 pba_offset; + pbus = bus; + pslot = slot; + pfunc = func; + vf = -1; + } + else + { + pbus = dev->info.physfn.bus; + pslot = PCI_SLOT(dev->info.physfn.devfn); + pfunc = PCI_FUNC(dev->info.physfn.devfn); + vf = PCI_BDF2(dev->bus, dev->devfn); + } - if ( !dev->info.is_virtfn ) - { - pbus = bus; - pslot = slot; - pfunc = func; - vf = -1; - } - else + table_paddr = read_pci_mem_bar(pbus, pslot, pfunc, bir, vf); + WARN_ON(msi && msi->table_base != table_paddr); + if ( !table_paddr ) + { + if ( !msi || !msi->table_base ) { - pbus = dev->info.physfn.bus; - pslot = PCI_SLOT(dev->info.physfn.devfn); - pfunc = PCI_FUNC(dev->info.physfn.devfn); - vf = PCI_BDF2(dev->bus, dev->devfn); + xfree(entry); + return -ENXIO; } + table_paddr = msi->table_base; + } + table_paddr += table_offset; - ASSERT(!dev->msix_used_entries); - WARN_ON(msi->table_base != - read_pci_mem_bar(pbus, pslot, pfunc, bir, vf)); + if ( !dev->msix_used_entries ) + { + u64 pba_paddr; + u32 pba_offset; dev->msix_nr_entries = nr_entries; dev->msix_table.first = PFN_DOWN(table_paddr); @@ -706,7 +692,42 @@ BITS_TO_LONGS(nr_entries) - 1); WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_pba.first, dev->msix_pba.last)); + } + + if ( entry ) + { + /* Map MSI-X table region */ + u64 entry_paddr = table_paddr + msi->entry_nr * PCI_MSIX_ENTRY_SIZE; + int idx = msix_get_fixmap(dev, table_paddr, entry_paddr); + void __iomem *base; + + if ( idx < 0 ) + { + xfree(entry); + return idx; + } + base = (void *)(fix_to_virt(idx) + + ((unsigned long)entry_paddr & (PAGE_SIZE - 1))); + + /* Mask 
interrupt here */ + writel(1, base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); + + entry->msi_attrib.type = PCI_CAP_ID_MSIX; + entry->msi_attrib.is_64 = 1; + entry->msi_attrib.entry_nr = msi->entry_nr; + entry->msi_attrib.maskbit = 1; + entry->msi_attrib.masked = 1; + entry->msi_attrib.pos = pos; + entry->irq = msi->irq; + entry->dev = dev; + entry->mask_base = base; + list_add_tail(&entry->list, &dev->msi_list); + *desc = entry; + } + + if ( !dev->msix_used_entries ) + { if ( rangeset_add_range(mmio_ro_ranges, dev->msix_table.first, dev->msix_table.last) ) WARN(); @@ -717,7 +738,7 @@ if ( dev->domain ) p2m_change_entry_type_global(p2m_get_hostp2m(dev->domain), p2m_mmio_direct, p2m_mmio_direct); - if ( !dev->domain || !paging_mode_translate(dev->domain) ) + if ( desc && (!dev->domain || !paging_mode_translate(dev->domain)) ) { struct domain *d = dev->domain; @@ -731,6 +752,13 @@ break; if ( d ) { + if ( !IS_PRIV(d) && dev->msix_warned != d->domain_id ) + { + dev->msix_warned = d->domain_id; + printk(XENLOG_ERR + "Potentially insecure use of MSI-X on %02x:%02x.%u by Dom%d\n", + bus, slot, func, d->domain_id); + } /* XXX How to deal with existing mappings? */ } } @@ -739,10 +767,6 @@ WARN_ON(dev->msix_table.first != (table_paddr >> PAGE_SHIFT)); ++dev->msix_used_entries; - /* Mask interrupt here */ - writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); - - *desc = entry; /* Restore MSI-X enabled bits */ pci_conf_write16(bus, slot, func, msix_control_reg(pos), control); @@ -873,6 +897,19 @@ return status; } +static void _pci_cleanup_msix(struct pci_dev *dev) +{ + if ( !--dev->msix_used_entries ) + { + if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first, + dev->msix_table.last) ) + WARN(); + if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first, + dev->msix_pba.last) ) + WARN(); + } +} + static void __pci_disable_msix(struct msi_desc *entry) { struct pci_dev *dev; @@ -895,15 +932,41 @@ pci_conf_write16(bus, slot, func, msix_control_reg(pos), control); - if ( !--dev->msix_used_entries ) + _pci_cleanup_msix(dev); +} + +int pci_prepare_msix(u8 bus, u8 devfn, bool_t off) +{ + int rc; + struct pci_dev *pdev; + u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn); + unsigned int pos = pci_find_cap_offset(bus, slot, func, + PCI_CAP_ID_MSIX); + + if ( !pos ) + return -ENODEV; + + spin_lock(&pcidevs_lock); + pdev = pci_get_pdev(bus, devfn); + if ( !pdev ) + rc = -ENODEV; + else if ( pdev->msix_used_entries != !!off ) + rc = -EBUSY; + else if ( off ) { - if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first, - dev->msix_table.last) ) - WARN(); - if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first, - dev->msix_pba.last) ) - WARN(); + _pci_cleanup_msix(pdev); + rc = 0; } + else + { + u16 control = pci_conf_read16(bus, slot, func, msix_control_reg(pos)); + + rc = msix_capability_init(pdev, NULL, NULL, + multi_msix_capable(control)); + } + spin_unlock(&pcidevs_lock); + + return rc; } /* @@ -954,6 +1017,7 @@ int irq; struct msi_desc *entry, *tmp; struct irq_desc *desc; + struct msi_msg msg; ASSERT(spin_is_locked(&pcidevs_lock)); @@ -982,7 +1046,8 @@ else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) msix_set_enable(pdev, 0); - write_msi_msg(entry, &entry->msg); + msg = entry->msg; + write_msi_msg(entry, &msg); msi_set_mask_bit(irq, entry->msi_attrib.masked); diff -Nru xen-4.1.3/xen/arch/x86/oprofile/xenoprof.c xen-4.1.5/xen/arch/x86/oprofile/xenoprof.c --- xen-4.1.3/xen/arch/x86/oprofile/xenoprof.c 2012-08-09 22:08:08.000000000 +0200 +++ 
xen-4.1.5/xen/arch/x86/oprofile/xenoprof.c 2013-04-23 18:44:20.000000000 +0200 @@ -25,7 +25,7 @@ if ( copy_from_guest(&counter, arg, 1) ) return -EFAULT; - if ( counter.ind > OP_MAX_COUNTER ) + if ( counter.ind >= OP_MAX_COUNTER ) return -E2BIG; counter_config[counter.ind].count = counter.count; @@ -63,7 +63,7 @@ if ( copy_from_guest(&counter, arg, 1) ) return -EFAULT; - if ( counter.ind > OP_MAX_COUNTER ) + if ( counter.ind >= OP_MAX_COUNTER ) return -E2BIG; counter_config[counter.ind].count = counter.count; @@ -82,10 +82,21 @@ if ( !guest_mode(regs) ) return 2; - if ( is_hvm_vcpu(v) ) - return ((regs->cs & 3) != 3); + if ( !is_hvm_vcpu(v) ) + return guest_kernel_mode(v, regs); - return guest_kernel_mode(v, regs); + switch ( hvm_guest_x86_mode(v) ) + { + struct segment_register ss; + + case 0: /* real mode */ + return 1; + case 1: /* vm86 mode */ + return 0; + default: + hvm_get_segment_register(v, x86_seg_ss, &ss); + return (ss.sel & 3) != 3; + } } /* diff -Nru xen-4.1.3/xen/arch/x86/physdev.c xen-4.1.5/xen/arch/x86/physdev.c --- xen-4.1.3/xen/arch/x86/physdev.c 2012-08-09 22:08:08.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/physdev.c 2013-04-23 18:44:20.000000000 +0200 @@ -40,11 +40,18 @@ struct hvm_girq_dpci_mapping *girq; uint32_t machine_gsi = 0; + if ( map->index < 0 || map->index >= NR_HVM_IRQS ) + { + ret = -EINVAL; + break; + } + /* find the machine gsi corresponding to the * emulated gsi */ hvm_irq_dpci = domain_get_irq_dpci(d); if ( hvm_irq_dpci ) { + BUILD_BUG_ON(ARRAY_SIZE(hvm_irq_dpci->girq) < NR_HVM_IRQS); list_for_each_entry ( girq, &hvm_irq_dpci->girq[map->index], list ) @@ -82,15 +89,11 @@ static int physdev_map_pirq(struct physdev_map_pirq *map) { - struct domain *d; + struct domain *d = current->domain; int pirq, irq, ret = 0; struct msi_info _msi; void *map_data = NULL; - ret = rcu_lock_target_domain_by_id(map->domid, &d); - if ( ret ) - return ret; - if ( map->domid == DOMID_SELF && is_hvm_domain(d) ) { /* @@ -98,14 +101,15 @@ * calls back into itself and deadlocks on hvm_domain.irq_lock. 
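The two xenoprof checks above fix a classic off-by-one: with OP_MAX_COUNTER slots, valid indices run from 0 to OP_MAX_COUNTER - 1, so an index must be rejected with ">=" rather than ">". A sketch (the array size and field are illustrative, not taken from the source):

#define OP_MAX_COUNTER 8 /* illustrative slot count */

static struct { unsigned long count; } counter_config[OP_MAX_COUNTER];

static int set_counter(unsigned int ind, unsigned long count)
{
    if (ind >= OP_MAX_COUNTER) /* ind == OP_MAX_COUNTER is out of range */
        return -1;

    counter_config[ind].count = count;
    return 0;
}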
*/ if ( !is_hvm_pv_evtchn_domain(d) ) - { - ret = -EINVAL; - goto free_domain; - } - ret = physdev_hvm_map_pirq(d, map); - goto free_domain; + return -EINVAL; + + return physdev_hvm_map_pirq(d, map); } + ret = rcu_lock_target_domain_by_id(map->domid, &d); + if ( ret ) + return ret; + if ( !IS_PRIV_FOR(current->domain, d) ) { ret = -EPERM; @@ -143,7 +147,7 @@ if ( irq == -1 ) irq = create_irq(); - if ( irq < 0 || irq >= nr_irqs ) + if ( irq < nr_irqs_gsi || irq >= nr_irqs ) { dprintk(XENLOG_G_ERR, "dom%d: can't create irq for msi!\n", d->domain_id); @@ -230,6 +234,10 @@ if ( ret ) return ret; + ret = -EINVAL; + if ( unmap->pirq < 0 || unmap->pirq >= d->nr_pirqs ) + goto free_domain; + if ( is_hvm_domain(d) ) { spin_lock(&d->event_lock); @@ -540,6 +548,20 @@ break; } + case PHYSDEVOP_prepare_msix: + case PHYSDEVOP_release_msix: { + struct physdev_pci_device dev; + + if ( copy_from_guest(&dev, arg, 1) ) + ret = -EFAULT; + else if ( dev.seg ) + ret = -EOPNOTSUPP; + else + ret = pci_prepare_msix(dev.bus, dev.devfn, + cmd != PHYSDEVOP_prepare_msix); + break; + } + case PHYSDEVOP_restore_msi: { struct physdev_restore_msi restore_msi; struct pci_dev *pdev; @@ -578,22 +600,24 @@ } case PHYSDEVOP_get_free_pirq: { struct physdev_get_free_pirq out; - struct domain *d; + struct domain *d = v->domain; - d = rcu_lock_current_domain(); - ret = -EFAULT; if ( copy_from_guest(&out, arg, 1) != 0 ) break; spin_lock(&d->event_lock); - out.pirq = get_free_pirq(d, out.type, 0); - d->arch.pirq_irq[out.pirq] = PIRQ_ALLOCATED; + ret = get_free_pirq(d, out.type, 0); + if ( ret >= 0 ) + d->arch.pirq_irq[ret] = PIRQ_ALLOCATED; spin_unlock(&d->event_lock); - ret = copy_to_guest(arg, &out, 1) ? -EFAULT : 0; + if ( ret >= 0 ) + { + out.pirq = ret; + ret = copy_to_guest(arg, &out, 1) ? -EFAULT : 0; + } - rcu_unlock_domain(d); break; } default: diff -Nru xen-4.1.3/xen/arch/x86/setup.c xen-4.1.5/xen/arch/x86/setup.c --- xen-4.1.3/xen/arch/x86/setup.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/setup.c 2013-04-23 18:44:20.000000000 +0200 @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -665,17 +666,6 @@ { memmap_type = "Xen-e820"; } - else if ( bootsym(lowmem_kb) ) - { - memmap_type = "Xen-e801"; - e820_raw[0].addr = 0; - e820_raw[0].size = bootsym(lowmem_kb) << 10; - e820_raw[0].type = E820_RAM; - e820_raw[1].addr = 0x100000; - e820_raw[1].size = bootsym(highmem_kb) << 10; - e820_raw[1].type = E820_RAM; - e820_raw_nr = 2; - } else if ( mbi->flags & MBI_MEMMAP ) { memmap_type = "Multiboot-e820"; @@ -713,6 +703,17 @@ bytes += map->size + 4; } } + else if ( bootsym(lowmem_kb) ) + { + memmap_type = "Xen-e801"; + e820_raw[0].addr = 0; + e820_raw[0].size = bootsym(lowmem_kb) << 10; + e820_raw[0].type = E820_RAM; + e820_raw[1].addr = 0x100000; + e820_raw[1].size = bootsym(highmem_kb) << 10; + e820_raw[1].type = E820_RAM; + e820_raw_nr = 2; + } else if ( mbi->flags & MBI_MEMLIMITS ) { memmap_type = "Multiboot-e801"; @@ -812,8 +813,8 @@ l4_pgentry_t *pl4e; l3_pgentry_t *pl3e; l2_pgentry_t *pl2e; + uint64_t load_start; int i, j, k; - void *dst; /* Select relocation address. */ e = end - reloc_size; @@ -826,11 +827,9 @@ * with a barrier(). After this we must *not* modify static/global * data until after we have switched to the relocated pagetables! */ + load_start = (unsigned long)_start - XEN_VIRT_START; barrier(); - dst = move_memory(e, 0, (unsigned long)&_end - XEN_VIRT_START, 1); - - /* Poison low 1MB to detect stray pointers to physical 0-1MB. 
*/ - memset(dst, 0x55, 1U << 20); + move_memory(e + load_start, load_start, _end - _start, 1); /* Walk initial pagetables, relocating page directory entries. */ pl4e = __va(__pa(idle_pg_table)); @@ -1277,7 +1276,7 @@ /* Create initial domain 0. */ dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF); - if ( (dom0 == NULL) || (alloc_dom0_vcpu0() == NULL) ) + if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) ) panic("Error creating domain 0\n"); dom0->is_privileged = 1; diff -Nru xen-4.1.3/xen/arch/x86/smp.c xen-4.1.5/xen/arch/x86/smp.c --- xen-4.1.3/xen/arch/x86/smp.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/smp.c 2013-04-23 18:44:20.000000000 +0200 @@ -43,6 +43,11 @@ genapic->send_IPI_mask(mask, vector); } +void send_IPI_self(int vector) +{ + genapic->send_IPI_self(vector); +} + /* * Some notes on x86 processor bugs affecting SMP operation: * diff -Nru xen-4.1.3/xen/arch/x86/time.c xen-4.1.5/xen/arch/x86/time.c --- xen-4.1.3/xen/arch/x86/time.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/time.c 2013-04-23 18:44:20.000000000 +0200 @@ -140,8 +140,9 @@ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (scale->mul_frac) ); #else asm ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)scale->mul_frac) ); + "mul %2 ; shrd $32,%1,%0" + : "=a" (product), "=d" (delta) + : "rm" (delta), "0" ((u64)scale->mul_frac) ); #endif return product; @@ -820,7 +821,16 @@ if ( is_hvm_domain(d) ) { struct pl_time *pl = &v->domain->arch.hvm_domain.pl_time; + stime += pl->stime_offset + v->arch.hvm_vcpu.stime_offset; + if ( (s64)stime < 0 ) + { + printk(XENLOG_G_WARNING "d%dv%d: bogus time %" PRId64 + " (offsets %" PRId64 "/%" PRId64 ")\n", + d->domain_id, v->vcpu_id, stime, + pl->stime_offset, v->arch.hvm_vcpu.stime_offset); + stime = 0; + } } tsc_stamp = gtime_to_gtsc(d, stime); } diff -Nru xen-4.1.3/xen/arch/x86/traps.c xen-4.1.5/xen/arch/x86/traps.c --- xen-4.1.3/xen/arch/x86/traps.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/traps.c 2013-04-23 18:44:20.000000000 +0200 @@ -834,6 +834,7 @@ __clear_bit(X86_FEATURE_CX16 % 32, &c); __clear_bit(X86_FEATURE_XTPR % 32, &c); __clear_bit(X86_FEATURE_PDCM % 32, &c); + __clear_bit(X86_FEATURE_PCID % 32, &c); __clear_bit(X86_FEATURE_DCA % 32, &c); if ( !xsave_enabled(current) ) { @@ -3108,6 +3109,7 @@ static void pci_serr_softirq(void) { printk("\n\nNMI - PCI system error (SERR)\n"); + outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */ } void async_exception_cleanup(struct vcpu *curr) @@ -3201,9 +3203,20 @@ { outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable the PCI SERR error line. */ - /* Would like to print a diagnostic here but can't call printk() - from NMI context -- raise a softirq instead. */ - raise_softirq(PCI_SERR_SOFTIRQ); + switch ( opt_nmi[0] ) + { + case 'd': /* 'dom0' */ + nmi_dom0_report(_XEN_NMIREASON_pci_serr); + case 'i': /* 'ignore' */ + /* Would like to print a diagnostic here but can't call printk() + from NMI context -- raise a softirq instead. */ + raise_softirq(PCI_SERR_SOFTIRQ); + break; + default: /* 'fatal' */ + console_force_unlock(); + printk("\n\nNMI - PCI system error (SERR)\n"); + fatal_trap(TRAP_nmi, regs); + } } static void io_check_error(struct cpu_user_regs *regs) @@ -3515,6 +3528,9 @@ struct domain *d = v->domain; struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi]; + if ( !is_canonical_address(address) ) + return -EINVAL; + t->vector = TRAP_nmi; t->flags = 0; t->cs = (is_pv_32on64_domain(d) ? 
@@ -3642,6 +3658,9 @@ if ( cur.address == 0 ) break; + if ( !is_canonical_address(cur.address) ) + return -EINVAL; + fixup_guest_code_selector(curr->domain, cur.cs); memcpy(&dst[cur.vector], &cur, sizeof(cur)); diff -Nru xen-4.1.3/xen/arch/x86/x86_32/entry.S xen-4.1.5/xen/arch/x86/x86_32/entry.S --- xen-4.1.3/xen/arch/x86/x86_32/entry.S 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/x86_32/entry.S 2013-04-23 18:44:20.000000000 +0200 @@ -60,6 +60,7 @@ #include #include #include +#include ALIGN restore_all_guest: @@ -215,6 +216,7 @@ jnz process_softirqs testb $1,VCPU_mce_pending(%ebx) jnz process_mce +.Ltest_guest_nmi: testb $1,VCPU_nmi_pending(%ebx) jnz process_nmi test_guest_events: @@ -244,7 +246,7 @@ /* %ebx: struct vcpu */ process_mce: testb $1 << VCPU_TRAP_MCE,VCPU_async_exception_mask(%ebx) - jnz test_guest_events + jnz .Ltest_guest_nmi sti movb $0,VCPU_mce_pending(%ebx) call set_guest_machinecheck_trapbounce @@ -563,6 +565,8 @@ jmp restore_all_xen .popsection +ENTRY(nmi) + pushl $TRAP_nmi<<16 handle_nmi_mce: #ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL # NMI/MCE entry protocol is incompatible with guest kernel in ring 0. @@ -583,7 +587,24 @@ * cases we have put guest DS/ES on the guest stack frame, which will * be detected by SAVE_ALL(), or we have rolled back restore_guest. */ - jmp ret_from_intr + cmpb $TRAP_nmi,UREGS_entry_vector(%esp) + jne ret_from_intr + /* We want to get straight to the IRET on the NMI exit path. */ + GET_CURRENT(%ebx) + movl UREGS_eflags(%esp),%eax + movb UREGS_cs(%esp),%al + testl $(3|X86_EFLAGS_VM),%eax + jz restore_all_xen + /* Send an IPI to ourselves to cover for the lack of event checking. */ + movl VCPU_processor(%ebx),%eax + shll $IRQSTAT_shift,%eax + cmpl $0,irq_stat(%eax) + je restore_all_guest + pushl $EVENT_CHECK_VECTOR + call send_IPI_self + addl $4,%esp + jmp restore_all_guest + .Lnmi_mce_xen: /* Check the outer (guest) context for %ds/%es state validity. */ GET_CPUINFO_FIELD(CPUINFO_guest_cpu_user_regs,%ebx) @@ -615,14 +636,18 @@ jmp .Lnmi_mce_common #endif /* !CONFIG_X86_SUPERVISOR_MODE_KERNEL */ -ENTRY(nmi) - pushl $TRAP_nmi<<16 - jmp handle_nmi_mce - ENTRY(machine_check) pushl $TRAP_machine_check<<16 jmp handle_nmi_mce +/* Enable NMIs. No special register assumptions. All registers are preserved. 
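
Both is_canonical_address() checks above (one for the guest's NMI callback, one for each registered trap table entry) close the "non-canonical addresses set for a callback" hole: bouncing to a non-canonical address would later fault in a context the hypervisor is not prepared to handle. On x86-64 an address is canonical when bits 63 through 47 are all copies of bit 47, which two arithmetic shifts can test. A sketch of the predicate, assuming a 64-bit long and arithmetic right shifts of signed values, as Xen's own macro of the same shape does:

    /* Canonical iff sign-extending from bit 47 reproduces the value,
     * i.e. bits 63..47 are all equal to bit 47. */
    static inline int is_canonical_address(unsigned long addr)
    {
        return ((long)addr >> 47) == ((long)addr >> 63);
    }
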
*/ +ENTRY(enable_nmis) + /* Set up stack frame */ + pushf # EFLAGS + push %cs # CS + push $.Lret # EIP + iret # Disable the hardware NMI latch + ENTRY(setup_vm86_frame) mov %ecx,%ds mov %ecx,%es @@ -636,7 +661,7 @@ .endm copy_vm86_words addl $16,%esp - ret +.Lret: ret .section .rodata, "a", @progbits diff -Nru xen-4.1.3/xen/arch/x86/x86_64/compat/entry.S xen-4.1.5/xen/arch/x86/x86_64/compat/entry.S --- xen-4.1.3/xen/arch/x86/x86_64/compat/entry.S 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/x86_64/compat/entry.S 2013-04-23 18:44:20.000000000 +0200 @@ -104,6 +104,7 @@ jnz compat_process_softirqs testb $1,VCPU_mce_pending(%rbx) jnz compat_process_mce +.Lcompat_test_guest_nmi: testb $1,VCPU_nmi_pending(%rbx) jnz compat_process_nmi compat_test_guest_events: @@ -134,7 +135,7 @@ /* %rbx: struct vcpu */ compat_process_mce: testb $1 << VCPU_TRAP_MCE,VCPU_async_exception_mask(%rbx) - jnz compat_test_guest_events + jnz .Lcompat_test_guest_nmi sti movb $0,VCPU_mce_pending(%rbx) call set_guest_machinecheck_trapbounce @@ -171,7 +172,7 @@ jmp compat_test_all_events /* %rbx: struct vcpu, interrupts disabled */ -compat_restore_all_guest: +ENTRY(compat_restore_all_guest) ASSERT_INTERRUPTS_DISABLED RESTORE_ALL addq $8,%rsp diff -Nru xen-4.1.3/xen/arch/x86/x86_64/compat/traps.c xen-4.1.5/xen/arch/x86/x86_64/compat/traps.c --- xen-4.1.3/xen/arch/x86/x86_64/compat/traps.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/x86_64/compat/traps.c 2013-04-23 18:44:20.000000000 +0200 @@ -20,11 +20,12 @@ if ( v != current ) { struct vcpu *vcpu; + unsigned long mfn; ASSERT(guest_kernel_mode(v, regs)); - addr = read_cr3() >> PAGE_SHIFT; + mfn = read_cr3() >> PAGE_SHIFT; for_each_vcpu( v->domain, vcpu ) - if ( pagetable_get_pfn(vcpu->arch.guest_table) == addr ) + if ( pagetable_get_pfn(vcpu->arch.guest_table) == mfn ) break; if ( !vcpu ) { diff -Nru xen-4.1.3/xen/arch/x86/x86_64/entry.S xen-4.1.5/xen/arch/x86/x86_64/entry.S --- xen-4.1.3/xen/arch/x86/x86_64/entry.S 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/x86_64/entry.S 2013-04-23 18:44:20.000000000 +0200 @@ -11,6 +11,7 @@ #include #include #include +#include ALIGN /* %rbx: struct vcpu */ @@ -204,6 +205,7 @@ jnz process_softirqs testb $1,VCPU_mce_pending(%rbx) jnz process_mce +.Ltest_guest_nmi: testb $1,VCPU_nmi_pending(%rbx) jnz process_nmi test_guest_events: @@ -232,7 +234,7 @@ /* %rbx: struct vcpu */ process_mce: testb $1 << VCPU_TRAP_MCE,VCPU_async_exception_mask(%rbx) - jnz test_guest_events + jnz .Ltest_guest_nmi sti movb $0,VCPU_mce_pending(%rbx) call set_guest_machinecheck_trapbounce @@ -286,7 +288,14 @@ movl $3,UREGS_cs(%rsp) /* ring 3 null cs */ movq VCPU_sysenter_addr(%rbx),%rax setne %cl + testl $X86_EFLAGS_NT,UREGS_eflags(%rsp) leaq VCPU_trap_bounce(%rbx),%rdx +UNLIKELY_START(nz, sysenter_nt_set) + pushfq + andl $~X86_EFLAGS_NT,(%rsp) + popfq + xorl %eax,%eax +UNLIKELY_END(sysenter_nt_set) testq %rax,%rax leal (,%rcx,TBF_INTERRUPT),%ecx UNLIKELY_START(z, sysenter_gpf) @@ -621,6 +630,9 @@ jmp restore_all_xen .popsection +ENTRY(nmi) + pushq $0 + movl $TRAP_nmi,4(%rsp) handle_ist_exception: SAVE_ALL testb $3,UREGS_cs(%rsp) @@ -635,18 +647,47 @@ movl UREGS_entry_vector(%rsp),%eax leaq exception_table(%rip),%rdx callq *(%rdx,%rax,8) - jmp ret_from_intr + cmpb $TRAP_nmi,UREGS_entry_vector(%rsp) + jne ret_from_intr -ENTRY(nmi) - pushq $0 - movl $TRAP_nmi,4(%rsp) - jmp handle_ist_exception + /* We want to get straight to the IRET on the NMI exit path. 
*/ + testb $3,UREGS_cs(%rsp) + jz restore_all_xen + GET_CURRENT(%rbx) + /* Send an IPI to ourselves to cover for the lack of event checking. */ + movl VCPU_processor(%rbx),%eax + shll $IRQSTAT_shift,%eax + leaq irq_stat(%rip),%rcx + cmpl $0,(%rcx,%rax,1) + je 1f + movl $EVENT_CHECK_VECTOR,%edi + call send_IPI_self +1: movq VCPU_domain(%rbx),%rax + cmpb $0,DOMAIN_is_32bit_pv(%rax) + je restore_all_guest + jmp compat_restore_all_guest ENTRY(machine_check) pushq $0 movl $TRAP_machine_check,4(%rsp) jmp handle_ist_exception +/* Enable NMIs. No special register assumptions. Only %rax is not preserved. */ +ENTRY(enable_nmis) + movq %rsp, %rax /* Grab RSP before pushing */ + + /* Set up stack frame */ + pushq $0 /* SS */ + pushq %rax /* RSP */ + pushfq /* RFLAGS */ + pushq $__HYPERVISOR_CS /* CS */ + leaq 1f(%rip),%rax + pushq %rax /* RIP */ + + iretq /* Disable the hardware NMI latch */ +1: + retq + .section .rodata, "a", @progbits ENTRY(exception_table) diff -Nru xen-4.1.3/xen/arch/x86/x86_64/mmconfig_64.c xen-4.1.5/xen/arch/x86/x86_64/mmconfig_64.c --- xen-4.1.3/xen/arch/x86/x86_64/mmconfig_64.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/arch/x86/x86_64/mmconfig_64.c 2013-04-23 18:44:20.000000000 +0200 @@ -25,7 +25,7 @@ static struct mmcfg_virt *pci_mmcfg_virt; static int __initdata mmcfg_pci_segment_shift; -static char __iomem *get_virt(unsigned int seg, unsigned bus) +static char __iomem *get_virt(unsigned int seg, unsigned int *bus) { struct acpi_mcfg_allocation *cfg; int cfg_num; @@ -33,9 +33,11 @@ for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { cfg = pci_mmcfg_virt[cfg_num].cfg; if (cfg->pci_segment == seg && - (cfg->start_bus_number <= bus) && - (cfg->end_bus_number >= bus)) + (cfg->start_bus_number <= *bus) && + (cfg->end_bus_number >= *bus)) { + *bus -= cfg->start_bus_number; return pci_mmcfg_virt[cfg_num].virt; + } } /* Fall back to type 0 */ @@ -46,7 +48,7 @@ { char __iomem *addr; - addr = get_virt(seg, bus); + addr = get_virt(seg, &bus); if (!addr) return NULL; return addr + ((bus << 20) | (devfn << 12)); @@ -121,8 +123,11 @@ if (virt + size < virt || virt + size > PCI_MCFG_VIRT_END) return NULL; - map_pages_to_xen(virt, cfg->address >> PAGE_SHIFT, - size >> PAGE_SHIFT, PAGE_HYPERVISOR_NOCACHE); + if (map_pages_to_xen(virt, + (cfg->address >> PAGE_SHIFT) + + (cfg->start_bus_number << (20 - PAGE_SHIFT)), + size >> PAGE_SHIFT, PAGE_HYPERVISOR_NOCACHE)) + return NULL; return (void __iomem *) virt; } diff -Nru xen-4.1.3/xen/common/compat/grant_table.c xen-4.1.5/xen/common/compat/grant_table.c --- xen-4.1.3/xen/common/compat/grant_table.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/compat/grant_table.c 2013-04-23 18:44:20.000000000 +0200 @@ -310,6 +310,8 @@ #undef XLAT_gnttab_get_status_frames_HNDL_frame_list if ( unlikely(__copy_to_guest(cmp_uop, &cmp.get_status, 1)) ) rc = -EFAULT; + else + i = 1; } break; } diff -Nru xen-4.1.3/xen/common/compat/memory.c xen-4.1.5/xen/common/compat/memory.c --- xen-4.1.3/xen/common/compat/memory.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/compat/memory.c 2013-04-23 18:44:20.000000000 +0200 @@ -15,7 +15,8 @@ int compat_memory_op(unsigned int cmd, XEN_GUEST_HANDLE(void) compat) { - int rc, split, op = cmd & MEMOP_CMD_MASK; + int split, op = cmd & MEMOP_CMD_MASK; + long rc; unsigned int start_extent = cmd >> MEMOP_EXTENT_SHIFT; do @@ -114,6 +115,12 @@ (cmp.xchg.out.nr_extents << cmp.xchg.out.extent_order)) ) return -EINVAL; + if ( !compat_handle_okay(cmp.xchg.in.extent_start, + 
cmp.xchg.in.nr_extents) || + !compat_handle_okay(cmp.xchg.out.extent_start, + cmp.xchg.out.nr_extents) ) + return -EFAULT; + start_extent = cmp.xchg.nr_exchanged; end_extent = (COMPAT_ARG_XLAT_SIZE - sizeof(*nat.xchg)) / (((1U << ABS(order_delta)) + 1) * @@ -165,7 +172,7 @@ if ( order_delta >= 0 ) nat.xchg->out.nr_extents = end_extent >> order_delta; else - nat.xchg->out.nr_extents = end_extent << order_delta; + nat.xchg->out.nr_extents = end_extent << -order_delta; ++split; } @@ -185,7 +192,7 @@ rc = do_memory_op(cmd, nat.hnd); if ( rc < 0 ) - return rc; + break; cmd = 0; if ( hypercall_xlat_continuation(&cmd, 0x02, nat.hnd, compat) ) @@ -298,5 +305,11 @@ __HYPERVISOR_memory_op, "ih", cmd, compat); } while ( split > 0 ); + if ( unlikely(rc > INT_MAX) ) + return INT_MAX; + + if ( unlikely(rc < INT_MIN) ) + return INT_MIN; + return rc; } diff -Nru xen-4.1.3/xen/common/compat/xenoprof.c xen-4.1.5/xen/common/compat/xenoprof.c --- xen-4.1.3/xen/common/compat/xenoprof.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/compat/xenoprof.c 2013-04-23 18:44:20.000000000 +0200 @@ -5,6 +5,7 @@ #include #define COMPAT +#define ret_t int #define do_xenoprof_op compat_xenoprof_op diff -Nru xen-4.1.3/xen/common/cpupool.c xen-4.1.5/xen/common/cpupool.c --- xen-4.1.3/xen/common/cpupool.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/cpupool.c 2013-04-23 18:44:20.000000000 +0200 @@ -355,14 +355,18 @@ int cpupool_add_domain(struct domain *d, int poolid) { struct cpupool *c; - int rc = 1; + int rc; int n_dom = 0; if ( poolid == CPUPOOLID_NONE ) return 0; spin_lock(&cpupool_lock); c = cpupool_find_by_id(poolid); - if ( (c != NULL) && cpus_weight(c->cpu_valid) ) + if ( c == NULL ) + rc = -ESRCH; + else if ( !cpus_weight(c->cpu_valid) ) + rc = -ENODEV; + else { c->n_dom++; n_dom = c->n_dom; diff -Nru xen-4.1.3/xen/common/decompress.c xen-4.1.5/xen/common/decompress.c --- xen-4.1.3/xen/common/decompress.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/decompress.c 2013-04-23 18:44:20.000000000 +0200 @@ -20,6 +20,9 @@ if ( len >= 3 && !memcmp(inbuf, "\x42\x5a\x68", 3) ) return bunzip2(inbuf, len, NULL, NULL, outbuf, NULL, error); + if ( len >= 6 && !memcmp(inbuf, "\3757zXZ", 6) ) + return unxz(inbuf, len, NULL, NULL, outbuf, NULL, error); + if ( len >= 2 && !memcmp(inbuf, "\135\000", 2) ) return unlzma(inbuf, len, NULL, NULL, outbuf, NULL, error); diff -Nru xen-4.1.3/xen/common/decompress.h xen-4.1.5/xen/common/decompress.h --- xen-4.1.3/xen/common/decompress.h 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/decompress.h 2013-04-23 18:44:20.000000000 +0200 @@ -8,6 +8,7 @@ #define STATIC #define INIT __init +#define INITDATA __initdata static void(*__initdata error)(const char *); #define set_error_fn(x) error = x; diff -Nru xen-4.1.3/xen/common/domain.c xen-4.1.5/xen/common/domain.c --- xen-4.1.3/xen/common/domain.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/domain.c 2013-04-23 18:44:20.000000000 +0200 @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -217,17 +217,17 @@ struct domain *d, **pd; enum { INIT_xsm = 1u<<0, INIT_watchdog = 1u<<1, INIT_rangeset = 1u<<2, INIT_evtchn = 1u<<3, INIT_gnttab = 1u<<4, INIT_arch = 1u<<5 }; - int init_status = 0; + int err, init_status = 0; int poolid = CPUPOOLID_NONE; if ( (d = alloc_domain_struct()) == NULL ) - return NULL; + return ERR_PTR(-ENOMEM); d->domain_id = domid; lock_profile_register_struct(LOCKPROF_TYPE_PERDOM, d, domid, "Domain"); - if ( 
xsm_alloc_security_domain(d) != 0 ) + if ( (err = xsm_alloc_security_domain(d)) != 0 ) goto fail; init_status |= INIT_xsm; @@ -261,14 +261,14 @@ d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex); d->irq_caps = rangeset_new(d, "Interrupts", 0); if ( (d->iomem_caps == NULL) || (d->irq_caps == NULL) ) - goto fail; + goto nomem; if ( domcr_flags & DOMCRF_dummy ) return d; if ( !is_idle_domain(d) ) { - if ( xsm_domain_create(d, ssidref) != 0 ) + if ( (err = xsm_domain_create(d, ssidref)) != 0 ) goto fail; d->is_paused_by_controller = 1; @@ -285,29 +285,29 @@ d->pirq_mask = xmalloc_array( unsigned long, BITS_TO_LONGS(d->nr_pirqs)); if ( (d->pirq_to_evtchn == NULL) || (d->pirq_mask == NULL) ) - goto fail; + goto nomem; memset(d->pirq_to_evtchn, 0, d->nr_pirqs * sizeof(*d->pirq_to_evtchn)); bitmap_zero(d->pirq_mask, d->nr_pirqs); - if ( evtchn_init(d) != 0 ) + if ( (err = evtchn_init(d)) != 0 ) goto fail; init_status |= INIT_evtchn; - if ( grant_table_create(d) != 0 ) + if ( (err = grant_table_create(d)) != 0 ) goto fail; init_status |= INIT_gnttab; poolid = 0; } - if ( arch_domain_create(d, domcr_flags) != 0 ) + if ( (err = arch_domain_create(d, domcr_flags)) != 0 ) goto fail; init_status |= INIT_arch; - if ( cpupool_add_domain(d, poolid) != 0 ) + if ( (err = cpupool_add_domain(d, poolid)) != 0 ) goto fail; - if ( sched_init_domain(d) != 0 ) + if ( (err = sched_init_domain(d)) != 0 ) goto fail; if ( !is_idle_domain(d) ) @@ -326,6 +326,8 @@ return d; + nomem: + err = -ENOMEM; fail: d->is_dying = DOMDYING_dead; atomic_set(&d->refcnt, DOMAIN_DESTROYED); @@ -347,7 +349,7 @@ xfree(d->pirq_mask); xfree(d->pirq_to_evtchn); free_domain_struct(d); - return NULL; + return ERR_PTR(err); } @@ -871,6 +873,9 @@ if ( set.period_ns < MILLISECS(1) ) return -EINVAL; + if ( set.period_ns > STIME_DELTA_MAX ) + return -EINVAL; + v->periodic_period = set.period_ns; vcpu_force_reschedule(v); diff -Nru xen-4.1.3/xen/common/domctl.c xen-4.1.5/xen/common/domctl.c --- xen-4.1.3/xen/common/domctl.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/domctl.c 2013-04-23 18:44:20.000000000 +0200 @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -435,10 +436,12 @@ if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_oos_off ) domcr_flags |= DOMCRF_oos_off; - ret = -ENOMEM; d = domain_create(dom, domcr_flags, op->u.createdomain.ssidref); - if ( d == NULL ) + if ( IS_ERR(d) ) + { + ret = PTR_ERR(d); break; + } ret = 0; @@ -851,9 +854,9 @@ if ( pirq >= d->nr_pirqs ) ret = -EINVAL; else if ( op->u.irq_permission.allow_access ) - ret = irq_permit_access(d, pirq); + ret = pirq_permit_access(d, pirq); else - ret = irq_deny_access(d, pirq); + ret = pirq_deny_access(d, pirq); rcu_unlock_domain(d); } diff -Nru xen-4.1.3/xen/common/event_channel.c xen-4.1.5/xen/common/event_channel.c --- xen-4.1.3/xen/common/event_channel.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/event_channel.c 2013-04-23 18:44:20.000000000 +0200 @@ -104,7 +104,6 @@ if ( unlikely(chn == NULL) ) return -ENOMEM; memset(chn, 0, EVTCHNS_PER_BUCKET * sizeof(*chn)); - bucket_from_port(d, port) = chn; for ( i = 0; i < EVTCHNS_PER_BUCKET; i++ ) { @@ -117,6 +116,8 @@ } } + bucket_from_port(d, port) = chn; + return port; } @@ -331,7 +332,7 @@ if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) return -EINVAL; - if ( !is_hvm_domain(d) && !irq_access_permitted(d, pirq) ) + if ( !is_hvm_domain(d) && !pirq_access_permitted(d, pirq) ) return -EPERM; spin_lock(&d->event_lock); diff -Nru 
xen-4.1.3/xen/common/grant_table.c xen-4.1.5/xen/common/grant_table.c --- xen-4.1.3/xen/common/grant_table.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/grant_table.c 2013-04-23 18:44:20.000000000 +0200 @@ -24,7 +24,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include +#include #include #include #include @@ -172,6 +172,30 @@ return rc; } +static struct domain *gt_lock_target_domain_by_id(domid_t dom) +{ + struct domain *d; + int rc = GNTST_general_error; + + switch ( rcu_lock_target_domain_by_id(dom, &d) ) + { + case 0: + return d; + + case -ESRCH: + gdprintk(XENLOG_INFO, "Bad domid %d.\n", dom); + rc = GNTST_bad_domain; + break; + + case -EPERM: + rc = GNTST_permission_denied; + break; + } + + ASSERT(rc < 0 && -rc <= MAX_ERRNO); + return ERR_PTR(rc); +} + static inline int __get_maptrack_handle( struct grant_table *t) @@ -574,7 +598,7 @@ act->start = 0; act->length = PAGE_SIZE; act->is_sub_page = 0; - act->trans_dom = rd->domain_id; + act->trans_domain = rd; act->trans_gref = op->ref; } } @@ -1102,12 +1126,13 @@ } static int -gnttab_populate_status_frames(struct domain *d, struct grant_table *gt) +gnttab_populate_status_frames(struct domain *d, struct grant_table *gt, + unsigned int req_nr_frames) { unsigned i; unsigned req_status_frames; - req_status_frames = grant_to_status_frames(gt->nr_grant_frames); + req_status_frames = grant_to_status_frames(req_nr_frames); for ( i = nr_status_frames(gt); i < req_status_frames; i++ ) { if ( (gt->status[i] = alloc_xenheap_page()) == NULL ) @@ -1138,7 +1163,12 @@ for ( i = 0; i < nr_status_frames(gt); i++ ) { - page_set_owner(virt_to_page(gt->status[i]), dom_xen); + struct page_info *pg = virt_to_page(gt->status[i]); + + BUG_ON(page_get_owner(pg) != d); + if ( test_and_clear_bit(_PGC_allocated, &pg->count_info) ) + put_page(pg); + BUG_ON(pg->count_info & ~PGC_xen_heap); free_xenheap_page(gt->status[i]); gt->status[i] = NULL; } @@ -1176,19 +1206,18 @@ clear_page(gt->shared_raw[i]); } - /* Share the new shared frames with the recipient domain */ - for ( i = nr_grant_frames(gt); i < req_nr_frames; i++ ) - gnttab_create_shared_page(d, gt, i); - - gt->nr_grant_frames = req_nr_frames; - /* Status pages - version 2 */ if (gt->gt_version > 1) { - if ( gnttab_populate_status_frames(d, gt) ) + if ( gnttab_populate_status_frames(d, gt, req_nr_frames) ) goto shared_alloc_failed; } + /* Share the new shared frames with the recipient domain */ + for ( i = nr_grant_frames(gt); i < req_nr_frames; i++ ) + gnttab_create_shared_page(d, gt, i); + gt->nr_grant_frames = req_nr_frames; + return 1; shared_alloc_failed: @@ -1216,7 +1245,6 @@ struct domain *d; int i; unsigned long gmfn; - domid_t dom; if ( count != 1 ) return -EINVAL; @@ -1236,25 +1264,11 @@ goto out1; } - dom = op.dom; - if ( dom == DOMID_SELF ) - { - d = rcu_lock_current_domain(); - } - else + d = gt_lock_target_domain_by_id(op.dom); + if ( IS_ERR(d) ) { - if ( unlikely((d = rcu_lock_domain_by_id(dom)) == NULL) ) - { - gdprintk(XENLOG_INFO, "Bad domid %d.\n", dom); - op.status = GNTST_bad_domain; - goto out1; - } - - if ( unlikely(!IS_PRIV_FOR(current->domain, d)) ) - { - op.status = GNTST_permission_denied; - goto out2; - } + op.status = PTR_ERR(d); + goto out1; } if ( xsm_grant_setup(current->domain, d) ) @@ -1309,7 +1323,6 @@ { struct gnttab_query_size op; struct domain *d; - domid_t dom; int rc; if ( count != 1 ) @@ -1321,25 +1334,11 @@ return -EFAULT; } - dom = op.dom; - if ( dom == DOMID_SELF ) - { - d = rcu_lock_current_domain(); - } - else + d 
= gt_lock_target_domain_by_id(op.dom); + if ( IS_ERR(d) ) { - if ( unlikely((d = rcu_lock_domain_by_id(dom)) == NULL) ) - { - gdprintk(XENLOG_INFO, "Bad domid %d.\n", dom); - op.status = GNTST_bad_domain; - goto query_out; - } - - if ( unlikely(!IS_PRIV_FOR(current->domain, d)) ) - { - op.status = GNTST_permission_denied; - goto query_out_unlock; - } + op.status = PTR_ERR(d); + goto query_out; } rc = xsm_grant_query_size(current->domain, d); @@ -1630,11 +1629,10 @@ struct active_grant_entry *act; unsigned long r_frame; uint16_t *status; - domid_t trans_domid; grant_ref_t trans_gref; int released_read; int released_write; - struct domain *trans_dom; + struct domain *td; released_read = 0; released_write = 0; @@ -1648,15 +1646,13 @@ if (rd->grant_table->gt_version == 1) { status = &sha->flags; - trans_domid = rd->domain_id; - /* Shut the compiler up. This'll never be used, because - trans_domid == rd->domain_id, but gcc doesn't know that. */ - trans_gref = 0x1234567; + td = rd; + trans_gref = gref; } else { status = &status_entry(rd->grant_table, gref); - trans_domid = act->trans_dom; + td = act->trans_domain; trans_gref = act->trans_gref; } @@ -1684,21 +1680,16 @@ spin_unlock(&rd->grant_table->lock); - if ( trans_domid != rd->domain_id ) + if ( td != rd ) { - if ( released_write || released_read ) - { - trans_dom = rcu_lock_domain_by_id(trans_domid); - if ( trans_dom != NULL ) - { - /* Recursive calls, but they're tail calls, so it's - okay. */ - if ( released_write ) - __release_grant_for_copy(trans_dom, trans_gref, 0); - else if ( released_read ) - __release_grant_for_copy(trans_dom, trans_gref, 1); - } - } + /* Recursive calls, but they're tail calls, so it's + okay. */ + if ( released_write ) + __release_grant_for_copy(td, trans_gref, 0); + else if ( released_read ) + __release_grant_for_copy(td, trans_gref, 1); + + rcu_unlock_domain(td); } } @@ -1707,14 +1698,14 @@ under the domain's grant table lock. */ /* Only safe on transitive grants. Even then, note that we don't attempt to drop any pin on the referent grant. 
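
gt_lock_target_domain_by_id() above, like the reworked domain_create() earlier in this diff, folds its error code into the returned pointer with ERR_PTR(), so callers get the object and the failure reason from a single return value. The encoding works because the top page of the virtual address space never holds a real object, so pointer values in the range [-MAX_ERRNO, -1] are unambiguous. A self-contained sketch of the err.h idiom this release adopts:

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error)   /* error is a negative errno */
    {
        return (void *)error;
    }

    static inline long PTR_ERR(const void *ptr)
    {
        return (long)ptr;
    }

    static inline int IS_ERR(const void *ptr)
    {
        /* Negative errnos occupy the top MAX_ERRNO addresses. */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }
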
*/ -static void __fixup_status_for_pin(const struct active_grant_entry *act, +static void __fixup_status_for_copy_pin(const struct active_grant_entry *act, uint16_t *status) { if ( !(act->pin & GNTPIN_hstw_mask) ) - *status &= ~GTF_writing; + gnttab_clear_flag(_GTF_writing, status); if ( !(act->pin & GNTPIN_hstr_mask) ) - *status &= ~GTF_reading; + gnttab_clear_flag(_GTF_reading, status); } /* Grab a frame number from a grant entry and update the flags and pin @@ -1735,7 +1726,7 @@ uint32_t old_pin; domid_t trans_domid; grant_ref_t trans_gref; - struct domain *rrd; + struct domain *td; unsigned long gfn; unsigned long grant_frame; unsigned trans_page_off; @@ -1789,12 +1780,12 @@ status) ) != GNTST_okay ) goto unlock_out; - trans_domid = ld->domain_id; - trans_gref = 0; + td = rd; + trans_gref = gref; if ( sha2 && (shah->flags & GTF_type_mask) == GTF_transitive ) { if ( !allow_transitive ) - PIN_FAIL(unlock_out, GNTST_general_error, + PIN_FAIL(unlock_out_clear, GNTST_general_error, "transitive grant when transitivity not allowed\n"); trans_domid = sha2->transitive.trans_domid; @@ -1802,7 +1793,7 @@ barrier(); /* Stop the compiler from re-loading trans_domid from shared memory */ if ( trans_domid == rd->domain_id ) - PIN_FAIL(unlock_out, GNTST_general_error, + PIN_FAIL(unlock_out_clear, GNTST_general_error, "transitive grants cannot be self-referential\n"); /* We allow the trans_domid == ld->domain_id case, which @@ -1812,21 +1803,23 @@ that you don't need to go out of your way to avoid it in the guest. */ - rrd = rcu_lock_domain_by_id(trans_domid); - if ( rrd == NULL ) - PIN_FAIL(unlock_out, GNTST_general_error, + /* We need to leave the rrd locked during the grant copy */ + td = rcu_lock_domain_by_id(trans_domid); + if ( td == NULL ) + PIN_FAIL(unlock_out_clear, GNTST_general_error, "transitive grant referenced bad domain %d\n", trans_domid); spin_unlock(&rd->grant_table->lock); - rc = __acquire_grant_for_copy(rrd, trans_gref, rd, + rc = __acquire_grant_for_copy(td, trans_gref, rd, readonly, &grant_frame, &trans_page_off, &trans_length, 0, &ignore); spin_lock(&rd->grant_table->lock); if ( rc != GNTST_okay ) { - __fixup_status_for_pin(act, status); + __fixup_status_for_copy_pin(act, status); + rcu_unlock_domain(td); spin_unlock(&rd->grant_table->lock); return rc; } @@ -1837,7 +1830,8 @@ and try again. */ if ( act->pin != old_pin ) { - __fixup_status_for_pin(act, status); + __fixup_status_for_copy_pin(act, status); + rcu_unlock_domain(td); spin_unlock(&rd->grant_table->lock); return __acquire_grant_for_copy(rd, gref, ld, readonly, frame, page_off, length, @@ -1849,7 +1843,7 @@ sub-page, but we always treat it as one because that blocks mappings of transitive grants. 
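
Renaming aside, the substantive change in __fixup_status_for_copy_pin() above is that the plain read-modify-write updates (*status &= ~GTF_writing) become gnttab_clear_flag() calls, i.e. locked bit operations: the status word lives in memory shared with the granting guest, and a non-atomic and-not can silently undo a concurrent update to a neighbouring bit. A sketch of the difference, with GCC builtins standing in for Xen's asm bitop helpers:

    /* Racy: load, modify, store. A concurrent writer that changes another
     * bit of *status between our load and our store has its update lost. */
    static void clear_flag_racy(unsigned short *status, unsigned int bit)
    {
        *status &= (unsigned short)~(1u << bit);
    }

    /* Safe: the and-not is one atomic operation (a locked op on x86). */
    static void clear_flag_atomic(unsigned short *status, unsigned int bit)
    {
        __atomic_fetch_and(status, (unsigned short)~(1u << bit),
                           __ATOMIC_SEQ_CST);
    }
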
*/ is_sub_page = 1; - *owning_domain = rrd; + *owning_domain = td; act->gfn = -1ul; } else if ( sha1 ) @@ -1857,7 +1851,7 @@ gfn = sha1->frame; rc = __get_paged_frame(gfn, &grant_frame, readonly, rd); if ( rc != GNTST_okay ) - goto unlock_out; + goto unlock_out_clear; act->gfn = gfn; is_sub_page = 0; trans_page_off = 0; @@ -1869,7 +1863,7 @@ gfn = sha2->full_page.frame; rc = __get_paged_frame(gfn, &grant_frame, readonly, rd); if ( rc != GNTST_okay ) - goto unlock_out; + goto unlock_out_clear; act->gfn = gfn; is_sub_page = 0; trans_page_off = 0; @@ -1881,7 +1875,7 @@ gfn = sha2->sub_page.frame; rc = __get_paged_frame(gfn, &grant_frame, readonly, rd); if ( rc != GNTST_okay ) - goto unlock_out; + goto unlock_out_clear; act->gfn = gfn; is_sub_page = 1; trans_page_off = sha2->sub_page.page_off; @@ -1895,7 +1889,7 @@ act->is_sub_page = is_sub_page; act->start = trans_page_off; act->length = trans_length; - act->trans_dom = trans_domid; + act->trans_domain = td; act->trans_gref = trans_gref; act->frame = grant_frame; } @@ -1911,6 +1905,17 @@ *length = act->length; *frame = act->frame; + spin_unlock(&rd->grant_table->lock); + return rc; + + unlock_out_clear: + if ( !(readonly) && + !(act->pin & GNTPIN_hstw_mask) ) + gnttab_clear_flag(_GTF_writing, status); + + if ( !act->pin ) + gnttab_clear_flag(_GTF_reading, status); + unlock_out: spin_unlock(&rd->grant_table->lock); return rc; @@ -2129,7 +2134,7 @@ if ( op.version == 2 && gt->gt_version < 2 ) { - res = gnttab_populate_status_frames(d, gt); + res = gnttab_populate_status_frames(d, gt, nr_grant_frames(gt)); if ( res < 0) goto out_unlock; } @@ -2168,7 +2173,6 @@ struct grant_table *gt; uint64_t gmfn; int i; - int rc; if ( count != 1 ) return -EINVAL; @@ -2180,15 +2184,10 @@ return -EFAULT; } - rc = rcu_lock_target_domain_by_id(op.dom, &d); - if ( rc < 0 ) + d = gt_lock_target_domain_by_id(op.dom); + if ( IS_ERR(d) ) { - if ( rc == -ESRCH ) - op.status = GNTST_bad_domain; - else if ( rc == -EPERM ) - op.status = GNTST_permission_denied; - else - op.status = GNTST_general_error; + op.status = PTR_ERR(d); goto out1; } @@ -2450,9 +2449,6 @@ clear_page(t->shared_raw[i]); } - for ( i = 0; i < INITIAL_NR_GRANT_FRAMES; i++ ) - gnttab_create_shared_page(d, t, i); - /* Status pages for grant table - for version 2 */ t->status = xmalloc_array(grant_status_t *, grant_to_status_frames(max_nr_grant_frames)); @@ -2460,6 +2456,10 @@ goto no_mem_4; memset(t->status, 0, grant_to_status_frames(max_nr_grant_frames) * sizeof(t->status[0])); + + for ( i = 0; i < INITIAL_NR_GRANT_FRAMES; i++ ) + gnttab_create_shared_page(d, t, i); + t->nr_status_frames = 0; /* Okay, install the structure. 
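
The grant_table_create() reordering above, like the matching get_free_port() fix in event_channel.c earlier in this diff, enforces the same publish-after-initialise rule: a structure must be fully set up before the pointer (or shared page) that makes it reachable from another CPU or from the guest is installed. A reduced sketch of the idiom; the release store below stands in for the barriers Xen relies on:

    struct bucket {
        int data[16];
    };

    static struct bucket *table[128];

    static void publish(unsigned int slot, struct bucket *b)
    {
        for (int i = 0; i < 16; i++)     /* 1. finish building the object */
            b->data[i] = i;

        /* 2. only now make it visible; the release ordering keeps the
         * initialisation writes before the pointer write. */
        __atomic_store_n(&table[slot], b, __ATOMIC_RELEASE);
    }
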
*/ diff -Nru xen-4.1.3/xen/common/kexec.c xen-4.1.5/xen/common/kexec.c --- xen-4.1.3/xen/common/kexec.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/kexec.c 2013-04-23 18:44:20.000000000 +0200 @@ -601,8 +601,8 @@ return -EINVAL; /* never reached */ } -int do_kexec_op_internal(unsigned long op, XEN_GUEST_HANDLE(void) uarg, - int compat) +static int do_kexec_op_internal(unsigned long op, XEN_GUEST_HANDLE(void) uarg, + bool_t compat) { unsigned long flags; int ret = -EINVAL; diff -Nru xen-4.1.3/xen/common/Makefile xen-4.1.5/xen/common/Makefile --- xen-4.1.3/xen/common/Makefile 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/Makefile 2013-04-23 18:44:20.000000000 +0200 @@ -43,7 +43,7 @@ obj-y += rbtree.o obj-y += lzo.o -obj-$(CONFIG_X86) += decompress.o bunzip2.o unlzma.o unlzo.o +obj-$(CONFIG_X86) += decompress.o bunzip2.o unxz.o unlzma.o unlzo.o obj-$(perfc) += perfc.o obj-$(crash_debug) += gdbstub.o diff -Nru xen-4.1.3/xen/common/memory.c xen-4.1.5/xen/common/memory.c --- xen-4.1.3/xen/common/memory.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/memory.c 2013-04-23 18:44:20.000000000 +0200 @@ -117,7 +117,8 @@ if ( a->memflags & MEMF_populate_on_demand ) { - if ( guest_physmap_mark_populate_on_demand(d, gpfn, + if ( a->extent_order > MAX_ORDER || + guest_physmap_mark_populate_on_demand(d, gpfn, a->extent_order) < 0 ) goto out; } @@ -216,7 +217,8 @@ xen_pfn_t gmfn; if ( !guest_handle_subrange_okay(a->extent_list, a->nr_done, - a->nr_extents-1) ) + a->nr_extents-1) || + a->extent_order > MAX_ORDER ) return; for ( i = a->nr_done; i < a->nr_extents; i++ ) @@ -278,6 +280,9 @@ if ( (exch.nr_exchanged > exch.in.nr_extents) || /* Input and output domain identifiers match? */ (exch.in.domid != exch.out.domid) || + /* Extent orders are sensible? */ + (exch.in.extent_order > MAX_ORDER) || + (exch.out.extent_order > MAX_ORDER) || /* Sizes of input and output lists do not overflow a long? */ ((~0UL >> exch.in.extent_order) < exch.in.nr_extents) || ((~0UL >> exch.out.extent_order) < exch.out.nr_extents) || @@ -289,6 +294,13 @@ goto fail_early; } + if ( !guest_handle_okay(exch.in.extent_start, exch.in.nr_extents) || + !guest_handle_okay(exch.out.extent_start, exch.out.nr_extents) ) + { + rc = -EFAULT; + goto fail_early; + } + /* Only privileged guests can allocate multi-page contiguous extents. 
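
The new MAX_ORDER comparisons above complement the pre-existing "(~0UL >> extent_order) < nr_extents" tests in memory_exchange(): the latter catch nr_extents << extent_order wrapping around an unsigned long, but only when the order is small enough for the shift itself to be defined, which a guest-supplied order was not guaranteed to be. Both guards together, reduced to a sketch (the MAX_ORDER value is illustrative):

    #define MAX_ORDER 20   /* largest order the allocator accepts */

    /* Returns 1 iff (nr << order) is well defined and does not wrap. */
    static int extent_count_ok(unsigned long nr, unsigned int order)
    {
        if (order > MAX_ORDER)      /* also keeps the shift itself defined */
            return 0;
        return nr <= (~0UL >> order);
    }
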
*/ if ( !multipage_allocation_permitted(current->domain, exch.in.extent_order) || !multipage_allocation_permitted(current->domain, exch.out.extent_order) ) { rc = -EPERM; goto fail_early; } @@ -310,22 +322,9 @@ out_chunk_order = exch.in.extent_order - exch.out.extent_order; } - if ( likely(exch.in.domid == DOMID_SELF) ) - { - d = rcu_lock_current_domain(); - } - else - { - if ( (d = rcu_lock_domain_by_id(exch.in.domid)) == NULL ) - goto fail_early; - - if ( !IS_PRIV_FOR(current->domain, d) ) - { - rcu_unlock_domain(d); - rc = -EPERM; - goto fail_early; - } - } + rc = rcu_lock_target_domain_by_id(exch.in.domid, &d); + if ( rc ) + goto fail_early; memflags |= MEMF_bits(domain_clamp_alloc_bitsize( d, @@ -506,14 +505,13 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE(void) arg) { struct domain *d; - int rc, op; + long rc; unsigned int address_bits; unsigned long start_extent; struct xen_memory_reservation reservation; struct memop_args args; domid_t domid; - - op = cmd & MEMOP_CMD_MASK; + int op = cmd & MEMOP_CMD_MASK; switch ( op ) { @@ -556,20 +554,8 @@ && (reservation.mem_flags & XENMEMF_populate_on_demand) ) args.memflags |= MEMF_populate_on_demand; - if ( likely(reservation.domid == DOMID_SELF) ) - { - d = rcu_lock_current_domain(); - } - else - { - if ( (d = rcu_lock_domain_by_id(reservation.domid)) == NULL ) - return start_extent; - if ( !IS_PRIV_FOR(current->domain, d) ) - { - rcu_unlock_domain(d); - return start_extent; - } - } + if ( unlikely(rcu_lock_target_domain_by_id(reservation.domid, &d)) ) + return start_extent; args.domain = d; rc = xsm_memory_adjust_reservation(current->domain, d); diff -Nru xen-4.1.3/xen/common/page_alloc.c xen-4.1.5/xen/common/page_alloc.c --- xen-4.1.3/xen/common/page_alloc.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/page_alloc.c 2013-04-23 18:44:20.000000000 +0200 @@ -144,6 +144,10 @@ { unsigned long bad_spfn, bad_epfn; const char *p; +#ifdef CONFIG_X86 + const unsigned long *badpage = NULL; + unsigned int i, array_size; +#endif ps = round_pgup(ps); pe = round_pgdown(pe); @@ -154,6 +158,25 @@ bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT); +#ifdef CONFIG_X86 + /* + * Here we put platform-specific memory range workarounds, i.e. + * memory known to be corrupt or otherwise in need of being reserved + * on specific platforms. + * We remove these pages from the memory region list. + */ + badpage = get_platform_badpages(&array_size); + if ( badpage ) + { + for ( i = 0; i < array_size; i++ ) + { + bootmem_region_zap(*badpage >> PAGE_SHIFT, + (*badpage >> PAGE_SHIFT) + 1); + badpage++; + } + } +#endif + /* Check new pages against the bad-page list. */ p = opt_badpage; while ( *p != '\0' ) @@ -303,9 +326,10 @@ unsigned int first_node, i, j, zone = 0, nodemask_retry = 0; unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1); unsigned long request = 1UL << order; - cpumask_t extra_cpus_mask, mask; struct page_info *pg; nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map; + bool_t need_tlbflush = 0; + uint32_t tlbflush_timestamp = 0; if ( node == NUMA_NO_NODE ) { @@ -417,20 +441,19 @@ if ( d != NULL ) d->last_alloc_node = node; - cpus_clear(mask); - for ( i = 0; i < (1 << order); i++ ) { /* Reference count must continuously be zero for free pages. */ BUG_ON(pg[i].count_info != PGC_state_free); pg[i].count_info = PGC_state_inuse; - if ( pg[i].u.free.need_tlbflush ) + if ( pg[i].u.free.need_tlbflush && + (pg[i].tlbflush_timestamp <= tlbflush_current_time()) && + (!need_tlbflush || + (pg[i].tlbflush_timestamp > tlbflush_timestamp)) ) { - /* Add in extra CPUs that need flushing because of this page.
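
The alloc_heap_pages() change being made here replaces the per-page cpumask accumulation (the removed cpus_andnot/cpus_or lines) with two scalars: a flag saying whether any freshly allocated page still needs a TLB flush, and the newest tlbflush_timestamp among such pages. A single tlbflush_filter() pass over the online mask at the end then yields the same set of CPUs at a fraction of the cost. The essential shape of the loop; the helper names are placeholders, not Xen's:

    struct page { int need_tlbflush; unsigned int stamp; };

    static void scan_allocated(const struct page *pg, unsigned int count,
                               unsigned int now)
    {
        int need_tlbflush = 0;
        unsigned int newest = 0;

        for (unsigned int i = 0; i < count; i++) {
            /* Remember only the newest relevant timestamp, not a mask. */
            if (pg[i].need_tlbflush && pg[i].stamp <= now &&
                (!need_tlbflush || pg[i].stamp > newest)) {
                need_tlbflush = 1;
                newest = pg[i].stamp;
            }
        }

        if (need_tlbflush) {
            /* tlbflush_filter(online_mask, newest) now drops every CPU
             * that has flushed since 'newest'; flush the rest, once. */
        }
    }
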
*/ cpus_andnot(extra_cpus_mask, cpu_online_map, mask); - tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp); - cpus_or(mask, mask, extra_cpus_mask); + need_tlbflush = 1; + tlbflush_timestamp = pg[i].tlbflush_timestamp; } /* Initialise fields which have other uses for free pages. */ @@ -440,10 +463,15 @@ spin_unlock(&heap_lock); - if ( unlikely(!cpus_empty(mask)) ) + if ( need_tlbflush ) { - perfc_incr(need_flush_tlb_flush); - flush_tlb_mask(&mask); + cpumask_t mask = cpu_online_map; + tlbflush_filter(mask, tlbflush_timestamp); + if ( !cpus_empty(mask) ) + { + perfc_incr(need_flush_tlb_flush); + flush_tlb_mask(&mask); + } } return pg; diff -Nru xen-4.1.3/xen/common/sched_credit.c xen-4.1.5/xen/common/sched_credit.c --- xen-4.1.3/xen/common/sched_credit.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/sched_credit.c 2013-04-23 18:44:20.000000000 +0200 @@ -58,8 +58,8 @@ /* * Flags */ -#define CSCHED_FLAG_VCPU_PARKED 0x0001 /* VCPU over capped credits */ -#define CSCHED_FLAG_VCPU_YIELD 0x0002 /* VCPU yielding */ +#define CSCHED_FLAG_VCPU_PARKED 0x0 /* VCPU over capped credits */ +#define CSCHED_FLAG_VCPU_YIELD 0x1 /* VCPU yielding */ /* @@ -72,6 +72,9 @@ #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv) #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) #define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq)) +/* Is the first element of _cpu's runq its idle vcpu? */ +#define IS_RUNQ_IDLE(_cpu) (list_empty(RUNQ(_cpu)) || \ + is_idle_vcpu(__runq_elem(RUNQ(_cpu)->next)->vcpu)) #define CSCHED_CPUONLINE(_pool) \ (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid) @@ -131,7 +134,7 @@ struct vcpu *vcpu; atomic_t credit; s_time_t start_time; /* When we were scheduled (used for credit) */ - uint16_t flags; + unsigned flags; int16_t pri; #ifdef CSCHED_STATS struct { @@ -212,7 +215,7 @@ /* If the vcpu yielded, try to put it behind one lower-priority * runnable vcpu if we can. The next runq_sort will bring it forward * within 30ms if the queue is too long. */ - if ( svc->flags & CSCHED_FLAG_VCPU_YIELD + if ( test_bit(CSCHED_FLAG_VCPU_YIELD, &svc->flags) && __runq_elem(iter)->pri > CSCHED_PRI_IDLE ) { iter=iter->next; @@ -488,9 +491,14 @@ * distinct cores first and guarantees we don't do something stupid * like run two VCPUs on co-hyperthreads while there are idle cores * or sockets. + * + * Notice that, when computing the "idleness" of cpu, we may want to + * discount vc. That is, iff vc is the currently running and the only + * runnable vcpu on cpu, we add cpu to the idlers. */ cpus_and(idlers, cpu_online_map, CSCHED_PRIV(ops)->idlers); - cpu_set(cpu, idlers); + if ( vc->processor == cpu && IS_RUNQ_IDLE(cpu) ) + cpu_set(cpu, idlers); cpus_and(cpus, cpus, idlers); cpu_clear(cpu, cpus); @@ -769,7 +777,7 @@ * those.
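
The credit-scheduler hunks here convert CSCHED_FLAG_VCPU_PARKED and CSCHED_FLAG_VCPU_YIELD from mask values into bit numbers precisely so the flags word can be driven with test_and_set_bit()/test_and_clear_bit(): the flags are touched from paths that hold no common lock, and making the test and the update one atomic step guarantees that exactly one racing caller performs the matching vcpu_pause_nosync() or vcpu_unpause(). A sketch of those primitives and their use, with GCC builtins standing in for Xen's asm bitops:

    static int test_and_set_bit(unsigned int nr, unsigned long *addr)
    {
        unsigned long mask = 1UL << nr;
        return (__atomic_fetch_or(addr, mask, __ATOMIC_SEQ_CST) & mask) != 0;
    }

    static int test_and_clear_bit(unsigned int nr, unsigned long *addr)
    {
        unsigned long mask = 1UL << nr;
        return (__atomic_fetch_and(addr, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
    }

    #define FLAG_PARKED 0

    static void park_once(unsigned long *flags)
    {
        if (!test_and_set_bit(FLAG_PARKED, flags)) {
            /* Only the 0 -> 1 winner gets here, so pause and unpause stay
             * balanced even if two paths race: vcpu_pause_nosync(v). */
        }
    }
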
*/ if ( svc->pri == CSCHED_PRI_TS_UNDER && - !(svc->flags & CSCHED_FLAG_VCPU_PARKED) ) + !test_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) ) { svc->pri = CSCHED_PRI_TS_BOOST; } @@ -782,12 +790,12 @@ static void csched_vcpu_yield(const struct scheduler *ops, struct vcpu *vc) { - struct csched_vcpu * const sv = CSCHED_VCPU(vc); + struct csched_vcpu * const svc = CSCHED_VCPU(vc); if ( !sched_credit_default_yield ) { /* Let the scheduler know that this vcpu is trying to yield */ - sv->flags |= CSCHED_FLAG_VCPU_YIELD; + set_bit(CSCHED_FLAG_VCPU_YIELD, &svc->flags); } } @@ -1082,11 +1090,10 @@ /* Park running VCPUs of capped-out domains */ if ( sdom->cap != 0U && credit < -credit_cap && - !(svc->flags & CSCHED_FLAG_VCPU_PARKED) ) + !test_and_set_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) ) { CSCHED_STAT_CRANK(vcpu_park); vcpu_pause_nosync(svc->vcpu); - svc->flags |= CSCHED_FLAG_VCPU_PARKED; } /* Lower bound on credits */ @@ -1102,7 +1109,7 @@ svc->pri = CSCHED_PRI_TS_UNDER; /* Unpark any capped domains whose credits go positive */ - if ( svc->flags & CSCHED_FLAG_VCPU_PARKED) + if ( test_and_clear_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) ) { /* * It's important to unset the flag AFTER the unpause() @@ -1111,7 +1118,6 @@ */ CSCHED_STAT_CRANK(vcpu_unpark); vcpu_unpause(svc->vcpu); - svc->flags &= ~CSCHED_FLAG_VCPU_PARKED; } /* Upper bound on credits means VCPU stops earning */ @@ -1370,8 +1376,7 @@ /* * Clear YIELD flag before scheduling out */ - if ( scurr->flags & CSCHED_FLAG_VCPU_YIELD ) - scurr->flags &= ~(CSCHED_FLAG_VCPU_YIELD); + clear_bit(CSCHED_FLAG_VCPU_YIELD, &scurr->flags); /* * SMP Load balance: diff -Nru xen-4.1.3/xen/common/sched_sedf.c xen-4.1.5/xen/common/sched_sedf.c --- xen-4.1.3/xen/common/sched_sedf.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/sched_sedf.c 2013-04-23 18:44:20.000000000 +0200 @@ -452,7 +452,8 @@ online = SEDF_CPUONLINE(v->domain->cpupool); cpus_and(online_affinity, v->cpu_affinity, *online); - return first_cpu(online_affinity); + return cycle_cpu(v->vcpu_id % cpus_weight(online_affinity) - 1, + online_affinity); } /* diff -Nru xen-4.1.3/xen/common/schedule.c xen-4.1.5/xen/common/schedule.c --- xen-4.1.3/xen/common/schedule.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/schedule.c 2013-04-23 18:44:20.000000000 +0200 @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include @@ -227,7 +227,7 @@ if ( v->sched_priv == NULL ) return 1; - SCHED_OP(VCPU2OP(v), insert_vcpu, v); + SCHED_OP(DOM2OP(d), insert_vcpu, v); return 0; } @@ -238,6 +238,9 @@ unsigned int new_p; void **vcpu_priv; void *domdata; + void *vcpudata; + struct scheduler *old_ops; + void *old_domdata; domdata = SCHED_OP(c->sched, alloc_domdata, d); if ( domdata == NULL ) @@ -269,16 +272,26 @@ domain_pause(d); + old_ops = DOM2OP(d); + old_domdata = d->sched_priv; + + for_each_vcpu ( d, v ) + { + SCHED_OP(old_ops, remove_vcpu, v); + } + + d->cpupool = c; + d->sched_priv = domdata; + new_p = first_cpu(c->cpu_valid); for_each_vcpu ( d, v ) { + vcpudata = v->sched_priv; + migrate_timer(&v->periodic_timer, new_p); migrate_timer(&v->singleshot_timer, new_p); migrate_timer(&v->poll_timer, new_p); - SCHED_OP(VCPU2OP(v), remove_vcpu, v); - SCHED_OP(VCPU2OP(v), free_vdata, v->sched_priv); - cpus_setall(v->cpu_affinity); v->processor = new_p; v->sched_priv = vcpu_priv[v->vcpu_id]; @@ -286,16 +299,16 @@ new_p = cycle_cpu(new_p, c->cpu_valid); - SCHED_OP(VCPU2OP(v), insert_vcpu, v); + SCHED_OP(c->sched, insert_vcpu, v); + + SCHED_OP(old_ops, free_vdata, 
vcpudata); } domain_update_node_affinity(d); - d->cpupool = c; - SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv); - d->sched_priv = domdata; - domain_unpause(d); + SCHED_OP(old_ops, free_domdata, old_domdata); + xfree(vcpu_priv); return 0; @@ -532,6 +545,38 @@ } } +void restore_vcpu_affinity(struct domain *d) +{ + struct vcpu *v; + + for_each_vcpu ( d, v ) + { + vcpu_schedule_lock_irq(v); + + if ( v->affinity_broken ) + { + printk(XENLOG_DEBUG "Restoring affinity for d%dv%d\n", + d->domain_id, v->vcpu_id); + cpus_copy(v->cpu_affinity, v->cpu_affinity_saved); + v->affinity_broken = 0; + } + + if ( v->processor == smp_processor_id() ) + { + set_bit(_VPF_migrating, &v->pause_flags); + vcpu_schedule_unlock_irq(v); + vcpu_sleep_nosync(v); + vcpu_migrate(v); + } + else + { + vcpu_schedule_unlock_irq(v); + } + } + + domain_update_node_affinity(d); +} + /* * This function is used by cpu_hotplug code from stop_machine context * and from cpupools to switch schedulers on a cpu. @@ -546,7 +591,7 @@ bool_t affinity_broken; c = per_cpu(cpupool, cpu); - if ( (c == NULL) || (system_state == SYS_STATE_suspend) ) + if ( c == NULL ) return ret; for_each_domain ( d ) @@ -564,8 +609,15 @@ if ( cpus_empty(online_affinity) && cpu_isset(cpu, v->cpu_affinity) ) { - printk("Breaking vcpu affinity for domain %d vcpu %d\n", - v->domain->domain_id, v->vcpu_id); + printk(XENLOG_DEBUG "Breaking affinity for d%dv%d\n", + d->domain_id, v->vcpu_id); + + if (system_state == SYS_STATE_suspend) + { + cpus_copy(v->cpu_affinity_saved, v->cpu_affinity); + v->affinity_broken = 1; + } + cpus_setall(v->cpu_affinity); affinity_broken = 1; } @@ -615,7 +667,8 @@ old_affinity = v->cpu_affinity; v->cpu_affinity = *affinity; *affinity = old_affinity; - if ( !cpu_isset(v->processor, v->cpu_affinity) ) + if ( VCPU2OP(v)->sched_id == XEN_SCHEDULER_SEDF || + !cpu_isset(v->processor, v->cpu_affinity) ) set_bit(_VPF_migrating, &v->pause_flags); vcpu_schedule_unlock_irq(v); @@ -1357,7 +1410,7 @@ panic("scheduler returned error on init\n"); idle_domain = domain_create(DOMID_IDLE, 0, 0); - BUG_ON(idle_domain == NULL); + BUG_ON(IS_ERR(idle_domain)); idle_domain->vcpu = idle_vcpu; idle_domain->max_vcpus = NR_CPUS; if ( alloc_vcpu(idle_domain, 0, 0) == NULL ) diff -Nru xen-4.1.3/xen/common/tmem.c xen-4.1.5/xen/common/tmem.c --- xen-4.1.3/xen/common/tmem.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/tmem.c 2013-04-23 18:44:20.000000000 +0200 @@ -387,11 +387,13 @@ pcd = pgp->pcd; if ( pgp->size < PAGE_SIZE && pgp->size != 0 && pcd->size < PAGE_SIZE && pcd->size != 0 ) - ret = tmh_decompress_to_client(cmfn, pcd->cdata, pcd->size, NULL); + ret = tmh_decompress_to_client(cmfn, pcd->cdata, pcd->size, + tmh_cli_buf_null); else if ( tmh_tze_enabled() && pcd->size < PAGE_SIZE ) ret = tmh_copy_tze_to_client(cmfn, pcd->tze, pcd->size); else - ret = tmh_copy_to_client(cmfn, pcd->pfp, 0, 0, PAGE_SIZE, NULL); + ret = tmh_copy_to_client(cmfn, pcd->pfp, 0, 0, PAGE_SIZE, + tmh_cli_buf_null); tmem_read_unlock(&pcd_tree_rwlocks[firstbyte]); return ret; } @@ -1108,7 +1110,7 @@ sl->client = new_client; list_add_tail(&sl->share_list, &pool->share_list); if ( new_client->cli_id != pool->client->cli_id ) - printk("adding new %s %d to shared pool owned by %s %d\n", + tmh_client_info("adding new %s %d to shared pool owned by %s %d\n", client_str, new_client->cli_id, client_str, pool->client->cli_id); return ++pool->shared_count; } @@ -1138,7 +1140,7 @@ old_client->eph_count -= _atomic_read(pool->pgp_count); list_splice_init(&old_client->ephemeral_page_list, 
&new_client->ephemeral_page_list); - printk("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n", + tmh_client_info("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n", cli_id_str, old_client->cli_id, cli_id_str, new_client->cli_id, poolid); pool->pool_id = poolid; } @@ -1174,7 +1176,7 @@ } return 0; } - printk("tmem: no match unsharing pool, %s=%d\n", + tmh_client_warn("tmem: no match unsharing pool, %s=%d\n", cli_id_str,pool->client->cli_id); return -1; } @@ -1185,17 +1187,18 @@ ASSERT(pool != NULL); if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) ) { - printk("tmem: %s=%d no longer using shared pool %d owned by %s=%d\n", + tmh_client_warn("tmem: %s=%d no longer using shared pool %d owned by %s=%d\n", cli_id_str, cli_id, pool->pool_id, cli_id_str,pool->client->cli_id); return; } - printk("%s %s-%s tmem pool ",destroy?"destroying":"flushing", - is_persistent(pool) ? "persistent" : "ephemeral" , - is_shared(pool) ? "shared" : "private"); - printk("%s=%d pool_id=%d\n", cli_id_str,pool->client->cli_id,pool->pool_id); + tmh_client_info("%s %s-%s tmem pool %s=%d pool_id=%d\n", + destroy ? "destroying" : "flushing", + is_persistent(pool) ? "persistent" : "ephemeral" , + is_shared(pool) ? "shared" : "private", + cli_id_str, pool->client->cli_id, pool->pool_id); if ( pool->client->live_migrating ) { - printk("can't %s pool while %s is live-migrating\n", + tmh_client_warn("can't %s pool while %s is live-migrating\n", destroy?"destroy":"flush", client_str); return; } @@ -1214,21 +1217,22 @@ client_t *client = tmh_alloc_infra(sizeof(client_t),__alignof__(client_t)); int i; - printk("tmem: initializing tmem capability for %s=%d...",cli_id_str,cli_id); + tmh_client_info("tmem: initializing tmem capability for %s=%d...", + cli_id_str, cli_id); if ( client == NULL ) { - printk("failed... out of memory\n"); + tmh_client_err("failed... out of memory\n"); goto fail; } memset(client,0,sizeof(client_t)); if ( (client->tmh = tmh_client_init(cli_id)) == NULL ) { - printk("failed... can't allocate host-dependent part of client\n"); + tmh_client_err("failed... can't allocate host-dependent part of client\n"); goto fail; } if ( !tmh_set_client_from_id(client, client->tmh, cli_id) ) { - printk("failed... can't set client\n"); + tmh_client_err("failed... 
can't set client\n"); goto fail; } client->cli_id = cli_id; @@ -1250,7 +1254,7 @@ client->eph_count = client->eph_count_max = 0; client->total_cycles = 0; client->succ_pers_puts = 0; client->succ_eph_gets = 0; client->succ_pers_gets = 0; - printk("ok\n"); + tmh_client_info("ok\n"); return client; fail: @@ -1447,7 +1451,7 @@ /************ TMEM CORE OPERATIONS ************************************/ static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn, - void *cva) + tmem_cli_va_t clibuf) { void *dst, *p; size_t size; @@ -1466,8 +1470,8 @@ if ( pgp->pfp != NULL ) pgp_free_data(pgp, pgp->us.obj->pool); START_CYC_COUNTER(compress); - ret = tmh_compress_from_client(cmfn, &dst, &size, cva); - if ( (ret == -EFAULT) || (ret == 0) ) + ret = tmh_compress_from_client(cmfn, &dst, &size, clibuf); + if ( ret <= 0 ) goto out; else if ( (size == 0) || (size >= tmem_subpage_maxsize()) ) { ret = 0; @@ -1493,7 +1497,8 @@ } static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn, - pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cva) + pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, + tmem_cli_va_t clibuf) { pool_t *pool; obj_t *obj; @@ -1515,7 +1520,7 @@ /* can we successfully manipulate pgp to change out the data? */ if ( len != 0 && client->compress && pgp->size != 0 ) { - ret = do_tmem_put_compress(pgp,cmfn,cva); + ret = do_tmem_put_compress(pgp, cmfn, clibuf); if ( ret == 1 ) goto done; else if ( ret == 0 ) @@ -1533,8 +1538,9 @@ goto failed_dup; pgp->size = 0; /* tmh_copy_from_client properly handles len==0 and offsets != 0 */ - ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0); - if ( ret == -EFAULT ) + ret = tmh_copy_from_client(pgp->pfp, cmfn, tmem_offset, pfn_offset, len, + tmh_cli_buf_null); + if ( ret < 0 ) goto bad_copy; if ( tmh_dedup_enabled() && !is_persistent(pool) ) { @@ -1555,9 +1561,7 @@ return 1; bad_copy: - /* this should only happen if the client passed a bad mfn */ failed_copies++; - ret = -EFAULT; goto cleanup; failed_dup: @@ -1585,7 +1589,7 @@ static NOINLINE int do_tmem_put(pool_t *pool, OID *oidp, uint32_t index, tmem_cli_mfn_t cmfn, pagesize_t tmem_offset, - pagesize_t pfn_offset, pagesize_t len, void *cva) + pagesize_t pfn_offset, pagesize_t len, tmem_cli_va_t clibuf) { obj_t *obj = NULL, *objfound = NULL, *objnew = NULL; pgp_t *pgp = NULL, *pgpdel = NULL; @@ -1599,7 +1603,8 @@ { ASSERT_SPINLOCK(&objfound->obj_spinlock); if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL) - return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len,cva); + return do_tmem_dup_put(pgp, cmfn, tmem_offset, pfn_offset, len, + clibuf); } /* no puts allowed into a frozen pool (except dup puts) */ @@ -1634,7 +1639,7 @@ if ( len != 0 && client->compress ) { ASSERT(pgp->pfp == NULL); - ret = do_tmem_put_compress(pgp,cmfn,cva); + ret = do_tmem_put_compress(pgp, cmfn, clibuf); if ( ret == 1 ) goto insert_page; if ( ret == -ENOMEM ) @@ -1658,8 +1663,9 @@ goto delete_and_free; } /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */ - ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva); - if ( ret == -EFAULT ) + ret = tmh_copy_from_client(pgp->pfp, cmfn, tmem_offset, pfn_offset, len, + clibuf); + if ( ret < 0 ) goto bad_copy; if ( tmh_dedup_enabled() && !is_persistent(pool) ) { @@ -1699,8 +1705,6 @@ return 1; bad_copy: - /* this should only happen if the client passed a bad mfn */ - ret = -EFAULT; failed_copies++; delete_and_free: @@ -1728,12 +1732,13 @@ static NOINLINE int 
do_tmem_get(pool_t *pool, OID *oidp, uint32_t index, tmem_cli_mfn_t cmfn, pagesize_t tmem_offset, - pagesize_t pfn_offset, pagesize_t len, void *cva) + pagesize_t pfn_offset, pagesize_t len, tmem_cli_va_t clibuf) { obj_t *obj; pgp_t *pgp; client_t *client = pool->client; DECL_LOCAL_CYC_COUNTER(decompress); + int rc; if ( !_atomic_read(pool->pgp_count) ) return -EEMPTY; @@ -1757,18 +1762,20 @@ ASSERT(pgp->size != -1); if ( tmh_dedup_enabled() && !is_persistent(pool) && pgp->firstbyte != NOT_SHAREABLE ) + rc = pcd_copy_to_client(cmfn, pgp); + else if ( pgp->size != 0 ) { - if ( pcd_copy_to_client(cmfn, pgp) == -EFAULT ) - goto bad_copy; - } else if ( pgp->size != 0 ) { START_CYC_COUNTER(decompress); - if ( tmh_decompress_to_client(cmfn, pgp->cdata, - pgp->size, cva) == -EFAULT ) - goto bad_copy; + rc = tmh_decompress_to_client(cmfn, pgp->cdata, + pgp->size, clibuf); END_CYC_COUNTER(decompress); - } else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset, - pfn_offset, len, cva) == -EFAULT) + } + else + rc = tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset, + pfn_offset, len, clibuf); + if ( rc <= 0 ) goto bad_copy; + if ( is_ephemeral(pool) ) { if ( is_private(pool) ) @@ -1788,7 +1795,6 @@ list_del(&pgp->us.client_eph_pages); list_add_tail(&pgp->us.client_eph_pages,&client->ephemeral_page_list); tmem_spin_unlock(&eph_lists_spinlock); - ASSERT(obj != NULL); obj->last_client = tmh_get_cli_id_from_current(); } } @@ -1805,10 +1811,10 @@ return 1; bad_copy: - /* this should only happen if the client passed a bad mfn */ + obj->no_evict = 0; + tmem_spin_unlock(&obj->obj_spinlock); failed_copies++; - return -EFAULT; - + return rc; } static NOINLINE int do_tmem_flush_page(pool_t *pool, OID *oidp, uint32_t index) @@ -1873,6 +1879,8 @@ if ( client->pools == NULL ) return 0; + if ( pool_id >= MAX_POOLS_PER_DOMAIN ) + return 0; if ( (pool = client->pools[pool_id]) == NULL ) return 0; client->pools[pool_id] = NULL; @@ -1900,32 +1908,33 @@ cli_id = tmh_get_cli_id_from_current(); else cli_id = this_cli_id; - printk("tmem: allocating %s-%s tmem pool for %s=%d...", + tmh_client_info("tmem: allocating %s-%s tmem pool for %s=%d...", persistent ? "persistent" : "ephemeral" , shared ? "shared" : "private", cli_id_str, cli_id); if ( specversion != TMEM_SPEC_VERSION ) { - printk("failed... unsupported spec version\n"); + tmh_client_err("failed... unsupported spec version\n"); return -EPERM; } if ( pagebits != (PAGE_SHIFT - 12) ) { - printk("failed... unsupported pagesize %d\n",1<<(pagebits+12)); + tmh_client_err("failed... unsupported pagesize %d\n", + 1 << (pagebits + 12)); return -EPERM; } if ( flags & TMEM_POOL_PRECOMPRESSED ) { - printk("failed... precompression flag set but unsupported\n"); + tmh_client_err("failed... precompression flag set but unsupported\n"); return -EPERM; } if ( flags & TMEM_POOL_RESERVED_BITS ) { - printk("failed... reserved bits must be zero\n"); + tmh_client_err("failed... reserved bits must be zero\n"); return -EPERM; } if ( (pool = pool_alloc()) == NULL ) { - printk("failed... out of memory\n"); + tmh_client_err("failed... out of memory\n"); return -ENOMEM; } if ( this_cli_id != CLI_ID_NULL ) @@ -1944,7 +1953,7 @@ break; if ( d_poolid >= MAX_POOLS_PER_DOMAIN ) { - printk("failed... no more pool slots available for this %s\n", + tmh_client_err("failed... 
no more pool slots available for this %s\n", client_str); goto fail; } @@ -1974,9 +1983,8 @@ { if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi ) { - printk("(matches shared pool uuid=%"PRIx64".%"PRIx64") ", - uuid_hi, uuid_lo); - printk("pool_id=%d\n",d_poolid); + tmh_client_info("(matches shared pool uuid=%"PRIx64".%"PRIx64") pool_id=%d\n", + uuid_hi, uuid_lo, d_poolid); client->pools[d_poolid] = global_shared_pools[s_poolid]; shared_pool_join(global_shared_pools[s_poolid], client); pool_free(pool); @@ -1988,7 +1996,7 @@ } if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS ) { - printk("tmem: failed... no global shared pool slots available\n"); + tmh_client_warn("tmem: failed... no global shared pool slots available\n"); goto fail; } else @@ -2004,7 +2012,7 @@ pool->pool_id = d_poolid; pool->persistent = persistent; pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi; - printk("pool_id=%d\n",d_poolid); + tmh_client_info("pool_id=%d\n", d_poolid); return d_poolid; fail: @@ -2027,14 +2035,15 @@ { list_for_each_entry(client,&global_client_list,client_list) client_freeze(client,freeze); - printk("tmem: all pools %s for all %ss\n",s,client_str); + tmh_client_info("tmem: all pools %s for all %ss\n", s, client_str); } else { if ( (client = tmh_client_from_cli_id(cli_id)) == NULL) return -1; client_freeze(client,freeze); - printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id); + tmh_client_info("tmem: all pools %s for %s=%d\n", + s, cli_id_str, cli_id); } return 0; } @@ -2045,7 +2054,7 @@ if ( cli_id != CLI_ID_NULL ) { - printk("tmem: %s-specific flush not supported yet, use --all\n", + tmh_client_warn("tmem: %s-specific flush not supported yet, use --all\n", client_str); return -1; } @@ -2258,13 +2267,15 @@ case TMEMC_SET_WEIGHT: old_weight = client->weight; client->weight = arg1; - printk("tmem: weight set to %d for %s=%d\n",arg1,cli_id_str,cli_id); + tmh_client_info("tmem: weight set to %d for %s=%d\n", + arg1, cli_id_str, cli_id); atomic_sub(old_weight,&client_weight_total); atomic_add(client->weight,&client_weight_total); break; case TMEMC_SET_CAP: client->cap = arg1; - printk("tmem: cap set to %d for %s=%d\n",arg1,cli_id_str,cli_id); + tmh_client_info("tmem: cap set to %d for %s=%d\n", + arg1, cli_id_str, cli_id); break; case TMEMC_SET_COMPRESS: #ifdef __i386__ @@ -2272,17 +2283,17 @@ #endif if ( tmh_dedup_enabled() ) { - printk("tmem: compression %s for all %ss, cannot be changed " - "when tmem_dedup is enabled\n", - tmh_compression_enabled() ? "enabled" : "disabled",client_str); + tmh_client_warn("tmem: compression %s for all %ss, cannot be changed when tmem_dedup is enabled\n", + tmh_compression_enabled() ? "enabled" : "disabled", + client_str); return -1; } client->compress = arg1 ? 1 : 0; - printk("tmem: compression %s for %s=%d\n", + tmh_client_info("tmem: compression %s for %s=%d\n", arg1 ? "enabled" : "disabled",cli_id_str,cli_id); break; default: - printk("tmem: unknown subop %d for tmemc_set_var\n",subop); + tmh_client_warn("tmem: unknown subop %d for tmemc_set_var\n", subop); return -1; } return 0; @@ -2346,7 +2357,6 @@ pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN) ? NULL : client->pools[pool_id]; uint32_t p; - uint64_t *uuid; pgp_t *pgp, *pgp2; int rc = -1; @@ -2380,12 +2390,18 @@ rc = MAX_POOLS_PER_DOMAIN; break; case TMEMC_SAVE_GET_CLIENT_WEIGHT: + if ( client == NULL ) + break; rc = client->weight == -1 ? -2 : client->weight; break; case TMEMC_SAVE_GET_CLIENT_CAP: + if ( client == NULL ) + break; rc = client->cap == -1 ? 
-2 : client->cap; break; case TMEMC_SAVE_GET_CLIENT_FLAGS: + if ( client == NULL ) + break; rc = (client->compress ? TMEM_CLIENT_COMPRESS : 0 ) | (client->was_frozen ? TMEM_CLIENT_FROZEN : 0 ); break; @@ -2394,7 +2410,8 @@ break; rc = (pool->persistent ? TMEM_POOL_PERSIST : 0) | (pool->shared ? TMEM_POOL_SHARED : 0) | - (pool->pageshift << TMEM_POOL_PAGESIZE_SHIFT); + (pool->pageshift << TMEM_POOL_PAGESIZE_SHIFT) | + (TMEM_SPEC_VERSION << TMEM_POOL_VERSION_SHIFT); break; case TMEMC_SAVE_GET_POOL_NPAGES: if ( pool == NULL ) @@ -2404,11 +2421,12 @@ case TMEMC_SAVE_GET_POOL_UUID: if ( pool == NULL ) break; - uuid = (uint64_t *)buf.p; - *uuid++ = pool->uuid[0]; - *uuid = pool->uuid[1]; + tmh_copy_to_client_buf(buf, pool->uuid, 2); rc = 0; + break; case TMEMC_SAVE_END: + if ( client == NULL ) + break; client->live_migrating = 0; if ( !list_empty(&client->persistent_invalidated_list) ) list_for_each_entry_safe(pgp,pgp2, @@ -2416,11 +2434,12 @@ pgp_free_from_inv_list(client,pgp); client->frozen = client->was_frozen; rc = 0; + break; } return rc; } -static NOINLINE int tmemc_save_get_next_page(int cli_id, int pool_id, +static NOINLINE int tmemc_save_get_next_page(int cli_id, uint32_t pool_id, tmem_cli_va_t buf, uint32_t bufsize) { client_t *client = tmh_client_from_cli_id(cli_id); @@ -2429,11 +2448,13 @@ pgp_t *pgp; OID oid; int ret = 0; - struct tmem_handle *h; - unsigned int pagesize = 1 << (pool->pageshift+12); + struct tmem_handle h; + unsigned int pagesize; if ( pool == NULL || is_ephemeral(pool) ) return -1; + + pagesize = 1 << (pool->pageshift + 12); if ( bufsize < pagesize + sizeof(struct tmem_handle) ) return -ENOMEM; @@ -2460,11 +2481,13 @@ pgp_t,us.pool_pers_pages); pool->cur_pgp = pgp; oid = pgp->us.obj->oid; - h = (struct tmem_handle *)buf.p; - *(OID *)&h->oid[0] = oid; - h->index = pgp->index; - buf.p = (void *)(h+1); - ret = do_tmem_get(pool, &oid, h->index,0,0,0,pagesize,buf.p); + h.pool_id = pool_id; + BUILD_BUG_ON(sizeof(h.oid) != sizeof(oid)); + memcpy(h.oid, oid.oid, sizeof(h.oid)); + h.index = pgp->index; + tmh_copy_to_client_buf(buf, &h, 1); + tmh_client_buf_add(buf, sizeof(h)); + ret = do_tmem_get(pool, &oid, pgp->index, 0, 0, 0, pagesize, buf); out: tmem_spin_unlock(&pers_lists_spinlock); @@ -2476,7 +2499,7 @@ { client_t *client = tmh_client_from_cli_id(cli_id); pgp_t *pgp; - struct tmem_handle *h; + struct tmem_handle h; int ret = 0; if ( client == NULL ) @@ -2502,17 +2525,18 @@ pgp_t,client_inv_pages); client->cur_pgp = pgp; } - h = (struct tmem_handle *)buf.p; - h->pool_id = pgp->pool_id; - *(OID *)&h->oid = pgp->inv_oid; - h->index = pgp->index; + h.pool_id = pgp->pool_id; + BUILD_BUG_ON(sizeof(h.oid) != sizeof(pgp->inv_oid)); + memcpy(h.oid, pgp->inv_oid.oid, sizeof(h.oid)); + h.index = pgp->index; + tmh_copy_to_client_buf(buf, &h, 1); ret = 1; out: tmem_spin_unlock(&pers_lists_spinlock); return ret; } -static int tmemc_restore_put_page(int cli_id, int pool_id, OID *oidp, +static int tmemc_restore_put_page(int cli_id, uint32_t pool_id, OID *oidp, uint32_t index, tmem_cli_va_t buf, uint32_t bufsize) { client_t *client = tmh_client_from_cli_id(cli_id); @@ -2521,10 +2545,10 @@ if ( pool == NULL ) return -1; - return do_tmem_put(pool,oidp,index,0,0,0,bufsize,buf.p); + return do_tmem_put(pool, oidp, index, 0, 0, 0, bufsize, buf); } -static int tmemc_restore_flush_page(int cli_id, int pool_id, OID *oidp, +static int tmemc_restore_flush_page(int cli_id, uint32_t pool_id, OID *oidp, uint32_t index) { client_t *client = tmh_client_from_cli_id(cli_id); @@ -2544,10 +2568,8 @@ OID 
*oidp = (OID *)(&op->u.ctrl.oid[0]); if (!tmh_current_is_privileged()) - { - /* don't fail... mystery: sometimes dom0 fails here */ - /* return -EPERM; */ - } + return -EPERM; + switch(subop) { case TMEMC_THAW: @@ -2650,13 +2672,19 @@ if ( client != NULL && tmh_client_is_dying(client) ) { rc = -ENODEV; - goto out; + if ( tmh_lock_all ) + goto out; + simple_error: + errored_tmem_ops++; + return rc; } if ( unlikely(tmh_get_tmemop_from_client(&op, uops) != 0) ) { - printk("tmem: can't get tmem struct from %s\n",client_str); + tmh_client_err("tmem: can't get tmem struct from %s\n", client_str); rc = -EFAULT; + if ( !tmh_lock_all ) + goto simple_error; goto out; } @@ -2687,7 +2715,8 @@ tmem_write_lock_set = 1; if ( (client = client_create(tmh_get_cli_id_from_current())) == NULL ) { - printk("tmem: can't create tmem structure for %s\n",client_str); + tmh_client_err("tmem: can't create tmem structure for %s\n", + client_str); rc = -ENOMEM; goto out; } @@ -2711,8 +2740,8 @@ if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) || ((pool = client->pools[op.pool_id]) == NULL) ) { + tmh_client_err("tmem: operation requested on uncreated pool\n"); rc = -ENODEV; - printk("tmem: operation requested on uncreated pool\n"); goto out; } ASSERT_SENTINEL(pool,POOL); @@ -2727,19 +2756,19 @@ break; case TMEM_NEW_PAGE: tmem_ensure_avail_pages(); - rc = do_tmem_put(pool, oidp, - op.u.gen.index, op.u.gen.cmfn, 0, 0, 0, NULL); + rc = do_tmem_put(pool, oidp, op.u.gen.index, op.u.gen.cmfn, 0, 0, 0, + tmh_cli_buf_null); break; case TMEM_PUT_PAGE: tmem_ensure_avail_pages(); - rc = do_tmem_put(pool, oidp, - op.u.gen.index, op.u.gen.cmfn, 0, 0, PAGE_SIZE, NULL); + rc = do_tmem_put(pool, oidp, op.u.gen.index, op.u.gen.cmfn, 0, 0, + PAGE_SIZE, tmh_cli_buf_null); if (rc == 1) succ_put = 1; else non_succ_put = 1; break; case TMEM_GET_PAGE: rc = do_tmem_get(pool, oidp, op.u.gen.index, op.u.gen.cmfn, - 0, 0, PAGE_SIZE, 0); + 0, 0, PAGE_SIZE, tmh_cli_buf_null); if (rc == 1) succ_get = 1; else non_succ_get = 1; break; @@ -2758,21 +2787,21 @@ case TMEM_READ: rc = do_tmem_get(pool, oidp, op.u.gen.index, op.u.gen.cmfn, op.u.gen.tmem_offset, op.u.gen.pfn_offset, - op.u.gen.len,0); + op.u.gen.len, tmh_cli_buf_null); break; case TMEM_WRITE: rc = do_tmem_put(pool, oidp, op.u.gen.index, op.u.gen.cmfn, op.u.gen.tmem_offset, op.u.gen.pfn_offset, - op.u.gen.len, NULL); + op.u.gen.len, tmh_cli_buf_null); break; case TMEM_XCHG: /* need to hold global lock to ensure xchg is atomic */ - printk("tmem_xchg op not implemented yet\n"); + tmh_client_warn("tmem_xchg op not implemented yet\n"); rc = 0; break; default: - printk("tmem: op %d not implemented\n", op.cmd); + tmh_client_warn("tmem: op %d not implemented\n", op.cmd); rc = 0; break; } diff -Nru xen-4.1.3/xen/common/tmem_xen.c xen-4.1.5/xen/common/tmem_xen.c --- xen-4.1.3/xen/common/tmem_xen.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/tmem_xen.c 2013-04-23 18:44:20.000000000 +0200 @@ -50,6 +50,7 @@ #define LZO_DSTMEM_PAGES 2 static DEFINE_PER_CPU_READ_MOSTLY(unsigned char *, workmem); static DEFINE_PER_CPU_READ_MOSTLY(unsigned char *, dstmem); +static DEFINE_PER_CPU_READ_MOSTLY(void *, scratch_page); #ifdef COMPARE_COPY_PAGE_SSE2 #include /* REMOVE ME AFTER TEST */ @@ -120,7 +121,7 @@ if ( !ret ) return NULL; *pcli_mfn = cli_mfn; - *pcli_pfp = (pfp_t *)page; + *pcli_pfp = page; return map_domain_page(cli_mfn); } @@ -129,24 +130,26 @@ { if ( mark_dirty ) { - put_page_and_type((struct page_info *)cli_pfp); + put_page_and_type(cli_pfp); 
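As a standalone illustration of the pattern the tmh_copy_*() conversions in this patch adopt — validate guest-supplied offsets and lengths before touching any page — consider this minimal sketch; the function and buffer names are illustrative, not the Xen helpers:

/* Standalone sketch (illustrative, not the Xen API): the shape of the
 * bounds check tmh_copy_{from,to}_client() gains in this patch,
 * rejecting guest-supplied offsets/lengths before any copy is done. */
#include <assert.h>
#include <stddef.h>
#include <string.h>

#define PAGE_SIZE 4096

static int copy_within_page(char *dst, size_t dst_off,
                            const char *src, size_t src_off, size_t len)
{
    /* reject oversized values early (the new -EINVAL path) */
    if (dst_off > PAGE_SIZE || src_off > PAGE_SIZE || len > PAGE_SIZE)
        return -1;
    /* reject copies that would straddle the page boundary */
    if (dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE)
        return -1;
    memcpy(dst + dst_off, src + src_off, len);
    return 0;
}

int main(void)
{
    static char dst[PAGE_SIZE], src[PAGE_SIZE];
    assert(copy_within_page(dst, 0, src, 0, PAGE_SIZE) == 0);
    assert(copy_within_page(dst, 1, src, 0, PAGE_SIZE) == -1);
    return 0;
}

Checking each value against PAGE_SIZE first keeps the subsequent additions from overflowing, which is why the patch rejects oversized values before testing the sums.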
paging_mark_dirty(current->domain,cli_mfn); } else - put_page((struct page_info *)cli_pfp); + put_page(cli_pfp); unmap_domain_page(cli_va); } #endif EXPORT int tmh_copy_from_client(pfp_t *pfp, tmem_cli_mfn_t cmfn, pagesize_t tmem_offset, - pagesize_t pfn_offset, pagesize_t len, void *cli_va) + pagesize_t pfn_offset, pagesize_t len, tmem_cli_va_t clibuf) { unsigned long tmem_mfn, cli_mfn = 0; - void *tmem_va; + char *tmem_va, *cli_va = NULL; pfp_t *cli_pfp = NULL; - bool_t tmemc = cli_va != NULL; /* if true, cli_va is control-op buffer */ + int rc = 1; + if ( tmem_offset > PAGE_SIZE || pfn_offset > PAGE_SIZE || len > PAGE_SIZE ) + return -EINVAL; ASSERT(pfp != NULL); tmem_mfn = page_to_mfn(pfp); tmem_va = map_domain_page(tmem_mfn); @@ -156,62 +159,80 @@ unmap_domain_page(tmem_va); return 1; } - if ( !tmemc ) + if ( guest_handle_is_null(clibuf) ) { cli_va = cli_get_page(cmfn, &cli_mfn, &cli_pfp, 0); if ( cli_va == NULL ) + { + unmap_domain_page(tmem_va); return -EFAULT; + } } mb(); - if (len == PAGE_SIZE && !tmem_offset && !pfn_offset) + if ( len == PAGE_SIZE && !tmem_offset && !pfn_offset && cli_va ) tmh_copy_page(tmem_va, cli_va); else if ( (tmem_offset+len <= PAGE_SIZE) && (pfn_offset+len <= PAGE_SIZE) ) - memcpy((char *)tmem_va+tmem_offset,(char *)cli_va+pfn_offset,len); - if ( !tmemc ) + { + if ( cli_va ) + memcpy(tmem_va + tmem_offset, cli_va + pfn_offset, len); + else if ( copy_from_guest_offset(tmem_va + tmem_offset, clibuf, + pfn_offset, len) ) + rc = -EFAULT; + } + else if ( len ) + rc = -EINVAL; + if ( cli_va ) cli_put_page(cli_va, cli_pfp, cli_mfn, 0); unmap_domain_page(tmem_va); - return 1; + return rc; } EXPORT int tmh_compress_from_client(tmem_cli_mfn_t cmfn, - void **out_va, size_t *out_len, void *cli_va) + void **out_va, size_t *out_len, tmem_cli_va_t clibuf) { int ret = 0; unsigned char *dmem = this_cpu(dstmem); unsigned char *wmem = this_cpu(workmem); + char *scratch = this_cpu(scratch_page); pfp_t *cli_pfp = NULL; unsigned long cli_mfn = 0; - bool_t tmemc = cli_va != NULL; /* if true, cli_va is control-op buffer */ + void *cli_va = NULL; if ( dmem == NULL || wmem == NULL ) return 0; /* no buffer, so can't compress */ - if ( !tmemc ) + if ( guest_handle_is_null(clibuf) ) { cli_va = cli_get_page(cmfn, &cli_mfn, &cli_pfp, 0); if ( cli_va == NULL ) return -EFAULT; } + else if ( !scratch ) + return 0; + else if ( copy_from_guest(scratch, clibuf, PAGE_SIZE) ) + return -EFAULT; mb(); - ret = lzo1x_1_compress(cli_va, PAGE_SIZE, dmem, out_len, wmem); + ret = lzo1x_1_compress(cli_va ?: scratch, PAGE_SIZE, dmem, out_len, wmem); ASSERT(ret == LZO_E_OK); *out_va = dmem; - if ( !tmemc ) + if ( cli_va ) cli_put_page(cli_va, cli_pfp, cli_mfn, 0); - unmap_domain_page(cli_va); return 1; } EXPORT int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp, - pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cli_va) + pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, + tmem_cli_va_t clibuf) { unsigned long tmem_mfn, cli_mfn = 0; - void *tmem_va; + char *tmem_va, *cli_va = NULL; pfp_t *cli_pfp = NULL; - bool_t tmemc = cli_va != NULL; /* if true, cli_va is control-op buffer */ + int rc = 1; + if ( tmem_offset > PAGE_SIZE || pfn_offset > PAGE_SIZE || len > PAGE_SIZE ) + return -EINVAL; ASSERT(pfp != NULL); - if ( !tmemc ) + if ( guest_handle_is_null(clibuf) ) { cli_va = cli_get_page(cmfn, &cli_mfn, &cli_pfp, 1); if ( cli_va == NULL ) @@ -219,37 +240,50 @@ } tmem_mfn = page_to_mfn(pfp); tmem_va = map_domain_page(tmem_mfn); - if (len == PAGE_SIZE && !tmem_offset && 
!pfn_offset) + if ( len == PAGE_SIZE && !tmem_offset && !pfn_offset && cli_va ) tmh_copy_page(cli_va, tmem_va); else if ( (tmem_offset+len <= PAGE_SIZE) && (pfn_offset+len <= PAGE_SIZE) ) - memcpy((char *)cli_va+pfn_offset,(char *)tmem_va+tmem_offset,len); + { + if ( cli_va ) + memcpy(cli_va + pfn_offset, tmem_va + tmem_offset, len); + else if ( copy_to_guest_offset(clibuf, pfn_offset, + tmem_va + tmem_offset, len) ) + rc = -EFAULT; + } + else if ( len ) + rc = -EINVAL; unmap_domain_page(tmem_va); - if ( !tmemc ) + if ( cli_va ) cli_put_page(cli_va, cli_pfp, cli_mfn, 1); mb(); - return 1; + return rc; } EXPORT int tmh_decompress_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, - size_t size, void *cli_va) + size_t size, tmem_cli_va_t clibuf) { unsigned long cli_mfn = 0; pfp_t *cli_pfp = NULL; + void *cli_va = NULL; + char *scratch = this_cpu(scratch_page); size_t out_len = PAGE_SIZE; - bool_t tmemc = cli_va != NULL; /* if true, cli_va is control-op buffer */ int ret; - if ( !tmemc ) + if ( guest_handle_is_null(clibuf) ) { cli_va = cli_get_page(cmfn, &cli_mfn, &cli_pfp, 1); if ( cli_va == NULL ) return -EFAULT; } - ret = lzo1x_decompress_safe(tmem_va, size, cli_va, &out_len); + else if ( !scratch ) + return 0; + ret = lzo1x_decompress_safe(tmem_va, size, cli_va ?: scratch, &out_len); ASSERT(ret == LZO_E_OK); ASSERT(out_len == PAGE_SIZE); - if ( !tmemc ) + if ( cli_va ) cli_put_page(cli_va, cli_pfp, cli_mfn, 1); + else if ( copy_to_guest(clibuf, scratch, PAGE_SIZE) ) + return -EFAULT; mb(); return 1; } @@ -419,6 +453,11 @@ struct page_info *p = alloc_domheap_pages(0, workmem_order, 0); per_cpu(workmem, cpu) = p ? page_to_virt(p) : NULL; } + if ( per_cpu(scratch_page, cpu) == NULL ) + { + struct page_info *p = alloc_domheap_page(NULL, 0); + per_cpu(scratch_page, cpu) = p ? page_to_virt(p) : NULL; + } break; } case CPU_DEAD: @@ -435,6 +474,11 @@ free_domheap_pages(p, workmem_order); per_cpu(workmem, cpu) = NULL; } + if ( per_cpu(scratch_page, cpu) != NULL ) + { + free_domheap_page(virt_to_page(per_cpu(scratch_page, cpu))); + per_cpu(scratch_page, cpu) = NULL; + } break; } default: diff -Nru xen-4.1.3/xen/common/unxz.c xen-4.1.5/xen/common/unxz.c --- xen-4.1.3/xen/common/unxz.c 1970-01-01 01:00:00.000000000 +0100 +++ xen-4.1.5/xen/common/unxz.c 2013-04-23 18:44:20.000000000 +0200 @@ -0,0 +1,306 @@ +/* + * Wrapper for decompressing XZ-compressed kernel, initramfs, and initrd + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +/* + * Important notes about in-place decompression + * + * At least on x86, the kernel is decompressed in place: the compressed data + * is placed to the end of the output buffer, and the decompressor overwrites + * most of the compressed data. There must be enough safety margin to + * guarantee that the write position is always behind the read position. + * + * The safety margin for XZ with LZMA2 or BCJ+LZMA2 is calculated below. + * Note that the margin with XZ is bigger than with Deflate (gzip)! + * + * The worst case for in-place decompression is that the beginning of + * the file is compressed extremely well, and the rest of the file is + * uncompressible. Thus, we must look for worst-case expansion when the + * compressor is encoding uncompressible data. + * + * The structure of the .xz file in case of a compressed kernel is as follows. + * Sizes (as bytes) of the fields are in parentheses.
+ * + * Stream Header (12) + * Block Header: + * Block Header (8-12) + * Compressed Data (N) + * Block Padding (0-3) + * CRC32 (4) + * Index (8-20) + * Stream Footer (12) + * + * Normally there is exactly one Block, but let's assume that there are + * 2-4 Blocks just in case. Because Stream Header and also Block Header + * of the first Block don't make the decompressor produce any uncompressed + * data, we can ignore them from our calculations. Block Headers of possible + * additional Blocks have to be taken into account still. With these + * assumptions, it is safe to assume that the total header overhead is + * less than 128 bytes. + * + * Compressed Data contains LZMA2 or BCJ+LZMA2 encoded data. Since BCJ + * doesn't change the size of the data, it is enough to calculate the + * safety margin for LZMA2. + * + * LZMA2 stores the data in chunks. Each chunk has a header whose size is + * a maximum of 6 bytes, but to get round 2^n numbers, let's assume that + * the maximum chunk header size is 8 bytes. After the chunk header, there + * may be up to 64 KiB of actual payload in the chunk. Often the payload is + * quite a bit smaller though; to be safe, let's assume that an average + * chunk has only 32 KiB of payload. + * + * The maximum uncompressed size of the payload is 2 MiB. The minimum + * uncompressed size of the payload is in practice never less than the + * payload size itself. The LZMA2 format would allow uncompressed size + * to be less than the payload size, but no sane compressor creates such + * files. LZMA2 supports storing uncompressible data in uncompressed form, + * so there's never a need to create payloads whose uncompressed size is + * smaller than the compressed size. + * + * The assumption, that the uncompressed size of the payload is never + * smaller than the payload itself, is valid only when talking about + * the payload as a whole. It is possible that the payload has parts where + * the decompressor consumes more input than it produces output. Calculating + * the worst case for this would be tricky. Instead of trying to do that, + * let's simply make sure that the decompressor never overwrites any bytes + * of the payload which it is currently reading. + * + * Now we have enough information to calculate the safety margin. We need + * - 128 bytes for the .xz file format headers; + * - 8 bytes per every 32 KiB of uncompressed size (one LZMA2 chunk header + * per chunk, each chunk having average payload size of 32 KiB); and + * - 64 KiB (biggest possible LZMA2 chunk payload size) to make sure that + * the decompressor never overwrites anything from the LZMA2 chunk + * payload it is currently reading. + * + * We get the following formula: + * + * safety_margin = 128 + uncompressed_size * 8 / 32768 + 65536 + * = 128 + (uncompressed_size >> 12) + 65536 + * + * For comparison, according to arch/x86/boot/compressed/misc.c, the + * equivalent formula for Deflate is this: + * + * safety_margin = 18 + (uncompressed_size >> 12) + 32768 + * + * Thus, when updating Deflate-only in-place kernel decompressor to + * support XZ, the fixed overhead has to be increased from 18+32768 bytes + * to 128+65536 bytes. + */ + +#include "decompress.h" + +#define XZ_EXTERN STATIC + +/* + * For boot time use, we enable only the BCJ filter of the current + * architecture or none if no BCJ filter is available for the architecture.
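A minimal sketch of the safety-margin formula derived above, useful for sanity-checking concrete sizes (the function name is illustrative, not part of the patch):

/* Sketch: the in-place decompression safety margin derived in the
 * comment above (128 bytes of .xz headers, one 8-byte LZMA2 chunk
 * header per average 32 KiB of payload, plus one maximal 64 KiB
 * chunk payload). */
#include <stdint.h>
#include <stdio.h>

static uint64_t xz_inplace_margin(uint64_t uncompressed_size)
{
    return 128 + (uncompressed_size >> 12) + 65536;
}

int main(void)
{
    /* e.g. a 16 MiB kernel image needs 128 + 4096 + 65536 bytes */
    printf("%llu\n", (unsigned long long)xz_inplace_margin(16u << 20));
    return 0;
}

For a 16 MiB image this yields 128 + 4096 + 65536 = 69760 bytes of margin.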
+ */ +#ifdef CONFIG_X86 +# define XZ_DEC_X86 +#endif +#ifdef CONFIG_PPC +# define XZ_DEC_POWERPC +#endif +#ifdef CONFIG_ARM +# define XZ_DEC_ARM +#endif +#ifdef CONFIG_IA64 +# define XZ_DEC_IA64 +#endif +#ifdef CONFIG_SPARC +# define XZ_DEC_SPARC +#endif + +/* + * This will get the basic headers so that memeq() and others + * can be defined. + */ +#include "xz/private.h" + +/* + * memeq and memzero are not used much and any remotely sane implementation + * is fast enough. memcpy/memmove speed matters in multi-call mode, but + * the kernel image is decompressed in single-call mode, in which only + * memcpy speed can matter and only if there is a lot of uncompressible data + * (LZMA2 stores uncompressible chunks in uncompressed form). Thus, the + * functions below should just be kept small; it's probably not worth + * optimizing for speed. + */ + +#ifndef memeq +#define memeq(p1, p2, sz) (memcmp(p1, p2, sz) == 0) +#endif + +#ifndef memzero +#define memzero(p, sz) memset(p, 0, sz) +#endif + +#include "xz/crc32.c" +#include "xz/dec_stream.c" +#include "xz/dec_lzma2.c" +#include "xz/dec_bcj.c" + +/* Size of the input and output buffers in multi-call mode */ +#define XZ_IOBUF_SIZE 4096 + +/* + * This function implements the API defined in . + * + * This wrapper will automatically choose single-call or multi-call mode + * of the native XZ decoder API. The single-call mode can be used only when + * both input and output buffers are available as a single chunk, i.e. when + * fill() and flush() won't be used. + */ +STATIC int INIT unxz(unsigned char *in, unsigned int in_size, + int (*fill)(void *dest, unsigned int size), + int (*flush)(void *src, unsigned int size), + unsigned char *out, unsigned int *in_used, + void (*error_fn)(const char *x)) +{ + struct xz_buf b; + struct xz_dec *s; + enum xz_ret ret; + bool_t must_free_in = false; + + set_error_fn(error_fn); + + xz_crc32_init(); + + if (in_used != NULL) + *in_used = 0; + + if (fill == NULL && flush == NULL) + s = xz_dec_init(XZ_SINGLE, 0); + else + s = xz_dec_init(XZ_DYNALLOC, (uint32_t)-1); + + if (s == NULL) + goto error_alloc_state; + + if (flush == NULL) { + b.out = out; + b.out_size = (size_t)-1; + } else { + b.out_size = XZ_IOBUF_SIZE; + b.out = malloc(XZ_IOBUF_SIZE); + if (b.out == NULL) + goto error_alloc_out; + } + + if (in == NULL) { + must_free_in = true; + in = malloc(XZ_IOBUF_SIZE); + if (in == NULL) + goto error_alloc_in; + } + + b.in = in; + b.in_pos = 0; + b.in_size = in_size; + b.out_pos = 0; + + if (fill == NULL && flush == NULL) { + ret = xz_dec_run(s, &b); + } else { + do { + if (b.in_pos == b.in_size && fill != NULL) { + if (in_used != NULL) + *in_used += b.in_pos; + + b.in_pos = 0; + + in_size = fill(in, XZ_IOBUF_SIZE); + if ((int)in_size < 0) { + /* + * This isn't an optimal error code + * but it probably isn't worth making + * a new one either. + */ + ret = XZ_BUF_ERROR; + break; + } + + b.in_size = in_size; + } + + ret = xz_dec_run(s, &b); + + if (flush != NULL && (b.out_pos == b.out_size + || (ret != XZ_OK && b.out_pos > 0))) { + /* + * Setting ret here may hide an error + * returned by xz_dec_run(), but probably + * it's not too bad. + */ + if (flush(b.out, b.out_pos) != (int)b.out_pos) + ret = XZ_BUF_ERROR; + + b.out_pos = 0; + } + } while (ret == XZ_OK); + + if (must_free_in) + free(in); + + if (flush != NULL) + free(b.out); + } + + if (in_used != NULL) + *in_used += b.in_pos; + + xz_dec_end(s); + + switch (ret) { + case XZ_STREAM_END: + return 0; + + case XZ_MEM_ERROR: + /* This can occur only in multi-call mode. 
*/ + error("XZ decompressor ran out of memory"); + break; + + case XZ_FORMAT_ERROR: + error("Input is not in the XZ format (wrong magic bytes)"); + break; + + case XZ_OPTIONS_ERROR: + error("Input was encoded with settings that are not " + "supported by this XZ decoder"); + break; + + case XZ_DATA_ERROR: + case XZ_BUF_ERROR: + error("XZ-compressed data is corrupt"); + break; + + default: + error("Bug in the XZ decompressor"); + break; + } + + return -1; + +error_alloc_in: + if (flush != NULL) + free(b.out); + +error_alloc_out: + xz_dec_end(s); + +error_alloc_state: + error("XZ decompressor ran out of memory"); + return -1; +} + +/* + * This macro is used by architecture-specific files to decompress + * the kernel image. + */ +#define decompress unxz diff -Nru xen-4.1.3/xen/common/xenoprof.c xen-4.1.5/xen/common/xenoprof.c --- xen-4.1.3/xen/common/xenoprof.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/common/xenoprof.c 2013-04-23 18:44:20.000000000 +0200 @@ -192,6 +192,13 @@ unsigned max_max_samples; int i; + nvcpu = 0; + for_each_vcpu ( d, v ) + nvcpu++; + + if ( !nvcpu ) + return -EINVAL; + d->xenoprof = xmalloc(struct xenoprof); if ( d->xenoprof == NULL ) @@ -213,10 +220,6 @@ memset(d->xenoprof->vcpu, 0, d->max_vcpus * sizeof(*d->xenoprof->vcpu)); - nvcpu = 0; - for_each_vcpu ( d, v ) - nvcpu++; - bufsize = sizeof(struct xenoprof_buf); i = sizeof(struct event_log); #ifdef CONFIG_COMPAT @@ -612,6 +615,8 @@ return (copy_to_guest(arg, &xenoprof_init, 1) ? -EFAULT : 0); } +#define ret_t long + #endif /* !COMPAT */ static int xenoprof_op_get_buffer(XEN_GUEST_HANDLE(void) arg) @@ -665,7 +670,7 @@ || (op == XENOPROF_disable_virq) \ || (op == XENOPROF_get_buffer)) -int do_xenoprof_op(int op, XEN_GUEST_HANDLE(void) arg) +ret_t do_xenoprof_op(int op, XEN_GUEST_HANDLE(void) arg) { int ret = 0; @@ -909,6 +914,7 @@ } #if defined(CONFIG_COMPAT) && !defined(COMPAT) +#undef ret_t #include "compat/xenoprof.c" #endif diff -Nru xen-4.1.3/xen/common/xz/crc32.c xen-4.1.5/xen/common/xz/crc32.c --- xen-4.1.3/xen/common/xz/crc32.c 1970-01-01 01:00:00.000000000 +0100 +++ xen-4.1.5/xen/common/xz/crc32.c 2013-04-23 18:44:20.000000000 +0200 @@ -0,0 +1,51 @@ +/* + * CRC32 using the polynomial from IEEE-802.3 + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +/* + * This is not the fastest implementation, but it is pretty compact. + * The fastest versions of xz_crc32() on modern CPUs without hardware + * accelerated CRC instruction are 3-5 times as fast as this version, + * but they are bigger and use more memory for the lookup table. 
+ */ + +#include "private.h" + +XZ_EXTERN uint32_t INITDATA xz_crc32_table[256]; + +XZ_EXTERN void INIT xz_crc32_init(void) +{ + const uint32_t poly = 0xEDB88320; + + uint32_t i; + uint32_t j; + uint32_t r; + + for (i = 0; i < 256; ++i) { + r = i; + for (j = 0; j < 8; ++j) + r = (r >> 1) ^ (poly & ~((r & 1) - 1)); + + xz_crc32_table[i] = r; + } + + return; +} + +XZ_EXTERN uint32_t INIT xz_crc32(const uint8_t *buf, size_t size, uint32_t crc) +{ + crc = ~crc; + + while (size != 0) { + crc = xz_crc32_table[*buf++ ^ (crc & 0xFF)] ^ (crc >> 8); + --size; + } + + return ~crc; +} diff -Nru xen-4.1.3/xen/common/xz/dec_bcj.c xen-4.1.5/xen/common/xz/dec_bcj.c --- xen-4.1.3/xen/common/xz/dec_bcj.c 1970-01-01 01:00:00.000000000 +0100 +++ xen-4.1.5/xen/common/xz/dec_bcj.c 2013-04-23 18:44:20.000000000 +0200 @@ -0,0 +1,575 @@ +/* + * Branch/Call/Jump (BCJ) filter decoders + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include "private.h" + +/* + * The rest of the file is inside this ifdef. It makes things a little more + * convenient when building without support for any BCJ filters. + */ +#ifdef XZ_DEC_BCJ + +struct xz_dec_bcj { + /* Type of the BCJ filter being used */ + enum { + BCJ_X86 = 4, /* x86 or x86-64 */ + BCJ_POWERPC = 5, /* Big endian only */ + BCJ_IA64 = 6, /* Big or little endian */ + BCJ_ARM = 7, /* Little endian only */ + BCJ_ARMTHUMB = 8, /* Little endian only */ + BCJ_SPARC = 9 /* Big or little endian */ + } type; + + /* + * Return value of the next filter in the chain. We need to preserve + * this information across calls, because we must not call the next + * filter anymore once it has returned XZ_STREAM_END. + */ + enum xz_ret ret; + + /* True if we are operating in single-call mode. */ + bool_t single_call; + + /* + * Absolute position relative to the beginning of the uncompressed + * data (in a single .xz Block). We care only about the lowest 32 + * bits so this doesn't need to be uint64_t even with big files. + */ + uint32_t pos; + + /* x86 filter state */ + uint32_t x86_prev_mask; + + /* Temporary space to hold the variables from struct xz_buf */ + uint8_t *out; + size_t out_pos; + size_t out_size; + + struct { + /* Amount of already filtered data in the beginning of buf */ + size_t filtered; + + /* Total amount of data currently stored in buf */ + size_t size; + + /* + * Buffer to hold a mix of filtered and unfiltered data. This + * needs to be big enough to hold Alignment + 2 * Look-ahead: + * + * Type Alignment Look-ahead + * x86 1 4 + * PowerPC 4 0 + * IA-64 16 0 + * ARM 4 0 + * ARM-Thumb 2 2 + * SPARC 4 0 + */ + uint8_t buf[16]; + } temp; +}; + +#ifdef XZ_DEC_X86 +/* + * This is used to test the most significant byte of a memory address + * in an x86 instruction. 
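The table generator in xz_crc32_init() above is branch-free: (poly & ~((r & 1) - 1)) evaluates to poly when the low bit of r is set and to 0 otherwise, so each step is the classic reflected CRC-32 update. A small self-contained check of that equivalence:

/* Sketch: why the table generator above needs no branch.
 * When r & 1 == 1, (r & 1) - 1 is 0, so ~0 selects poly;
 * when r & 1 == 0, the mask is all-zero and nothing is XORed. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
    const uint32_t poly = 0xEDB88320;
    for (uint32_t r = 0; r < 16; r++) {
        uint32_t branchless = (r >> 1) ^ (poly & ~((r & 1) - 1));
        uint32_t branching  = (r & 1) ? (r >> 1) ^ poly : (r >> 1);
        assert(branchless == branching);
    }
    return 0;
}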
+ */ +static inline int INIT bcj_x86_test_msbyte(uint8_t b) +{ + return b == 0x00 || b == 0xFF; +} + +static size_t INIT bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + static /*const*/ bool_t INITDATA mask_to_allowed_status[8] + = { true, true, true, false, true, false, false, false }; + + static /*const*/ uint8_t INITDATA mask_to_bit_num[8] + = { 0, 1, 2, 2, 3, 3, 3, 3 }; + + size_t i; + size_t prev_pos = (size_t)-1; + uint32_t prev_mask = s->x86_prev_mask; + uint32_t src; + uint32_t dest; + uint32_t j; + uint8_t b; + + if (size <= 4) + return 0; + + size -= 4; + for (i = 0; i < size; ++i) { + if ((buf[i] & 0xFE) != 0xE8) + continue; + + prev_pos = i - prev_pos; + if (prev_pos > 3) { + prev_mask = 0; + } else { + prev_mask = (prev_mask << (prev_pos - 1)) & 7; + if (prev_mask != 0) { + b = buf[i + 4 - mask_to_bit_num[prev_mask]]; + if (!mask_to_allowed_status[prev_mask] + || bcj_x86_test_msbyte(b)) { + prev_pos = i; + prev_mask = (prev_mask << 1) | 1; + continue; + } + } + } + + prev_pos = i; + + if (bcj_x86_test_msbyte(buf[i + 4])) { + src = get_unaligned_le32(buf + i + 1); + while (true) { + dest = src - (s->pos + (uint32_t)i + 5); + if (prev_mask == 0) + break; + + j = mask_to_bit_num[prev_mask] * 8; + b = (uint8_t)(dest >> (24 - j)); + if (!bcj_x86_test_msbyte(b)) + break; + + src = dest ^ (((uint32_t)1 << (32 - j)) - 1); + } + + dest &= 0x01FFFFFF; + dest |= (uint32_t)0 - (dest & 0x01000000); + put_unaligned_le32(dest, buf + i + 1); + i += 4; + } else { + prev_mask = (prev_mask << 1) | 1; + } + } + + prev_pos = i - prev_pos; + s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1); + return i; +} +#endif + +#ifdef XZ_DEC_POWERPC +static size_t INIT bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t instr; + + for (i = 0; i + 4 <= size; i += 4) { + instr = get_unaligned_be32(buf + i); + if ((instr & 0xFC000003) == 0x48000001) { + instr &= 0x03FFFFFC; + instr -= s->pos + (uint32_t)i; + instr &= 0x03FFFFFC; + instr |= 0x48000001; + put_unaligned_be32(instr, buf + i); + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_IA64 +static size_t INIT bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + static const uint8_t branch_table[32] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 4, 4, 6, 6, 0, 0, 7, 7, + 4, 4, 0, 0, 4, 4, 0, 0 + }; + + /* + * The local variables take a little bit stack space, but it's less + * than what LZMA2 decoder takes, so it doesn't make sense to reduce + * stack usage here without doing that for the LZMA2 decoder too. 
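The per-architecture filters above all share one idea: the encoder rewrote relative branch targets as absolute addresses (which compress better), so the decoder subtracts each instruction's position to restore the original displacement. A minimal sketch using the PowerPC case, the simplest of them (the standalone function and sample values are illustrative):

/* Sketch of the branch conversion done by bcj_powerpc() above: a
 * 'bl' instruction (top bits 010010, AA=0, LK=1) carries a 24-bit
 * displacement; the encoder stored an absolute target, so decoding
 * subtracts the instruction's position to get back the relative one. */
#include <assert.h>
#include <stdint.h>

static uint32_t bcj_ppc_decode_one(uint32_t instr, uint32_t pos)
{
    if ((instr & 0xFC000003) == 0x48000001) {   /* bl with AA=0, LK=1 */
        uint32_t addr = instr & 0x03FFFFFC;     /* absolute target */
        addr = (addr - pos) & 0x03FFFFFC;       /* back to relative */
        instr = 0x48000001 | addr;
    }
    return instr;
}

int main(void)
{
    /* a branch at position 0x100 whose displacement was 0x40: the
     * filter stored target 0x140; decoding restores 0x40 */
    uint32_t filtered = 0x48000001 | 0x140;
    assert(bcj_ppc_decode_one(filtered, 0x100) == (0x48000001 | 0x40));
    return 0;
}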
+ */ + + /* Loop counters */ + size_t i; + size_t j; + + /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */ + uint32_t slot; + + /* Bitwise offset of the instruction indicated by slot */ + uint32_t bit_pos; + + /* bit_pos split into byte and bit parts */ + uint32_t byte_pos; + uint32_t bit_res; + + /* Address part of an instruction */ + uint32_t addr; + + /* Mask used to detect which instructions to convert */ + uint32_t mask; + + /* 41-bit instruction stored somewhere in the lowest 48 bits */ + uint64_t instr; + + /* Instruction normalized with bit_res for easier manipulation */ + uint64_t norm; + + for (i = 0; i + 16 <= size; i += 16) { + mask = branch_table[buf[i] & 0x1F]; + for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) { + if (((mask >> slot) & 1) == 0) + continue; + + byte_pos = bit_pos >> 3; + bit_res = bit_pos & 7; + instr = 0; + for (j = 0; j < 6; ++j) + instr |= (uint64_t)(buf[i + j + byte_pos]) + << (8 * j); + + norm = instr >> bit_res; + + if (((norm >> 37) & 0x0F) == 0x05 + && ((norm >> 9) & 0x07) == 0) { + addr = (norm >> 13) & 0x0FFFFF; + addr |= ((uint32_t)(norm >> 36) & 1) << 20; + addr <<= 4; + addr -= s->pos + (uint32_t)i; + addr >>= 4; + + norm &= ~((uint64_t)0x8FFFFF << 13); + norm |= (uint64_t)(addr & 0x0FFFFF) << 13; + norm |= (uint64_t)(addr & 0x100000) + << (36 - 20); + + instr &= (1 << bit_res) - 1; + instr |= norm << bit_res; + + for (j = 0; j < 6; j++) + buf[i + j + byte_pos] + = (uint8_t)(instr >> (8 * j)); + } + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_ARM +static size_t INIT bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t addr; + + for (i = 0; i + 4 <= size; i += 4) { + if (buf[i + 3] == 0xEB) { + addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8) + | ((uint32_t)buf[i + 2] << 16); + addr <<= 2; + addr -= s->pos + (uint32_t)i + 8; + addr >>= 2; + buf[i] = (uint8_t)addr; + buf[i + 1] = (uint8_t)(addr >> 8); + buf[i + 2] = (uint8_t)(addr >> 16); + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_ARMTHUMB +static size_t INIT bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t addr; + + for (i = 0; i + 4 <= size; i += 2) { + if ((buf[i + 1] & 0xF8) == 0xF0 + && (buf[i + 3] & 0xF8) == 0xF8) { + addr = (((uint32_t)buf[i + 1] & 0x07) << 19) + | ((uint32_t)buf[i] << 11) + | (((uint32_t)buf[i + 3] & 0x07) << 8) + | (uint32_t)buf[i + 2]; + addr <<= 1; + addr -= s->pos + (uint32_t)i + 4; + addr >>= 1; + buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07)); + buf[i] = (uint8_t)(addr >> 11); + buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07)); + buf[i + 2] = (uint8_t)addr; + i += 2; + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_SPARC +static size_t INIT bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t instr; + + for (i = 0; i + 4 <= size; i += 4) { + instr = get_unaligned_be32(buf + i); + if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) { + instr <<= 2; + instr -= s->pos + (uint32_t)i; + instr >>= 2; + instr = ((uint32_t)0x40000000 - (instr & 0x400000)) + | 0x40000000 | (instr & 0x3FFFFF); + put_unaligned_be32(instr, buf + i); + } + } + + return i; +} +#endif + +/* + * Apply the selected BCJ filter. Update *pos and s->pos to match the amount + * of data that got filtered. + * + * NOTE: This is implemented as a switch statement to avoid using function + * pointers, which could be problematic in the kernel boot code, which must + * avoid pointers to static data (at least on x86). 
+ */ +static void INIT bcj_apply(struct xz_dec_bcj *s, + uint8_t *buf, size_t *pos, size_t size) +{ + size_t filtered; + + buf += *pos; + size -= *pos; + + switch (s->type) { +#ifdef XZ_DEC_X86 + case BCJ_X86: + filtered = bcj_x86(s, buf, size); + break; +#endif +#ifdef XZ_DEC_POWERPC + case BCJ_POWERPC: + filtered = bcj_powerpc(s, buf, size); + break; +#endif +#ifdef XZ_DEC_IA64 + case BCJ_IA64: + filtered = bcj_ia64(s, buf, size); + break; +#endif +#ifdef XZ_DEC_ARM + case BCJ_ARM: + filtered = bcj_arm(s, buf, size); + break; +#endif +#ifdef XZ_DEC_ARMTHUMB + case BCJ_ARMTHUMB: + filtered = bcj_armthumb(s, buf, size); + break; +#endif +#ifdef XZ_DEC_SPARC + case BCJ_SPARC: + filtered = bcj_sparc(s, buf, size); + break; +#endif + default: + /* Never reached but silence compiler warnings. */ + filtered = 0; + break; + } + + *pos += filtered; + s->pos += filtered; +} + +/* + * Flush pending filtered data from temp to the output buffer. + * Move the remaining mixture of possibly filtered and unfiltered + * data to the beginning of temp. + */ +static void INIT bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b) +{ + size_t copy_size; + + copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos); + memcpy(b->out + b->out_pos, s->temp.buf, copy_size); + b->out_pos += copy_size; + + s->temp.filtered -= copy_size; + s->temp.size -= copy_size; + memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size); +} + +/* + * The BCJ filter functions are primitive in the sense that they process the + * data in chunks of 1-16 bytes. To hide this issue, this function does + * some buffering. + */ +XZ_EXTERN enum xz_ret INIT xz_dec_bcj_run(struct xz_dec_bcj *s, + struct xz_dec_lzma2 *lzma2, + struct xz_buf *b) +{ + size_t out_start; + + /* + * Flush pending already filtered data to the output buffer. Return + * immediately if we couldn't flush everything, or if the next + * filter in the chain had already returned XZ_STREAM_END. + */ + if (s->temp.filtered > 0) { + bcj_flush(s, b); + if (s->temp.filtered > 0) + return XZ_OK; + + if (s->ret == XZ_STREAM_END) + return XZ_STREAM_END; + } + + /* + * If we have more output space than what is currently pending in + * temp, copy the unfiltered data from temp to the output buffer + * and try to fill the output buffer by decoding more data from the + * next filter in the chain. Apply the BCJ filter on the new data + * in the output buffer. If everything cannot be filtered, copy it + * to temp and rewind the output buffer position accordingly. + * + * This needs to be always run when temp.size == 0 to handle a special + * case where the output buffer is full and the next filter has no + * more output coming but hasn't returned XZ_STREAM_END yet. + */ + if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) { + out_start = b->out_pos; + memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size); + b->out_pos += s->temp.size; + + s->ret = xz_dec_lzma2_run(lzma2, b); + if (s->ret != XZ_STREAM_END + && (s->ret != XZ_OK || s->single_call)) + return s->ret; + + bcj_apply(s, b->out, &out_start, b->out_pos); + + /* + * As an exception, if the next filter returned XZ_STREAM_END, + * we can do that too, since the last few bytes that remain + * unfiltered are meant to remain unfiltered.
+ */ + if (s->ret == XZ_STREAM_END) + return XZ_STREAM_END; + + s->temp.size = b->out_pos - out_start; + b->out_pos -= s->temp.size; + memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size); + + /* + * If there wasn't enough input to the next filter to fill + * the output buffer with unfiltered data, there's no point + * to try decoding more data to temp. + */ + if (b->out_pos + s->temp.size < b->out_size) + return XZ_OK; + } + + /* + * We have unfiltered data in temp. If the output buffer isn't full + * yet, try to fill the temp buffer by decoding more data from the + * next filter. Apply the BCJ filter on temp. Then we hopefully can + * fill the actual output buffer by copying filtered data from temp. + * A mix of filtered and unfiltered data may be left in temp; it will + * be taken care on the next call to this function. + */ + if (b->out_pos < b->out_size) { + /* Make b->out{,_pos,_size} temporarily point to s->temp. */ + s->out = b->out; + s->out_pos = b->out_pos; + s->out_size = b->out_size; + b->out = s->temp.buf; + b->out_pos = s->temp.size; + b->out_size = sizeof(s->temp.buf); + + s->ret = xz_dec_lzma2_run(lzma2, b); + + s->temp.size = b->out_pos; + b->out = s->out; + b->out_pos = s->out_pos; + b->out_size = s->out_size; + + if (s->ret != XZ_OK && s->ret != XZ_STREAM_END) + return s->ret; + + bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size); + + /* + * If the next filter returned XZ_STREAM_END, we mark that + * everything is filtered, since the last unfiltered bytes + * of the stream are meant to be left as is. + */ + if (s->ret == XZ_STREAM_END) + s->temp.filtered = s->temp.size; + + bcj_flush(s, b); + if (s->temp.filtered > 0) + return XZ_OK; + } + + return s->ret; +} + +XZ_EXTERN struct xz_dec_bcj *INIT xz_dec_bcj_create(bool_t single_call) +{ + struct xz_dec_bcj *s = malloc(sizeof(*s)); + if (s != NULL) + s->single_call = single_call; + + return s; +} + +XZ_EXTERN enum xz_ret INIT xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) +{ + switch (id) { +#ifdef XZ_DEC_X86 + case BCJ_X86: +#endif +#ifdef XZ_DEC_POWERPC + case BCJ_POWERPC: +#endif +#ifdef XZ_DEC_IA64 + case BCJ_IA64: +#endif +#ifdef XZ_DEC_ARM + case BCJ_ARM: +#endif +#ifdef XZ_DEC_ARMTHUMB + case BCJ_ARMTHUMB: +#endif +#ifdef XZ_DEC_SPARC + case BCJ_SPARC: +#endif + break; + + default: + /* Unsupported Filter ID */ + return XZ_OPTIONS_ERROR; + } + + s->type = id; + s->ret = XZ_OK; + s->pos = 0; + s->x86_prev_mask = 0; + s->temp.filtered = 0; + s->temp.size = 0; + + return XZ_OK; +} + +#endif diff -Nru xen-4.1.3/xen/common/xz/dec_lzma2.c xen-4.1.5/xen/common/xz/dec_lzma2.c --- xen-4.1.3/xen/common/xz/dec_lzma2.c 1970-01-01 01:00:00.000000000 +0100 +++ xen-4.1.5/xen/common/xz/dec_lzma2.c 2013-04-23 18:44:20.000000000 +0200 @@ -0,0 +1,1171 @@ +/* + * LZMA2 decoder + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include "private.h" +#include "lzma2.h" + +/* + * Range decoder initialization eats the first five bytes of each LZMA chunk. + */ +#define RC_INIT_BYTES 5 + +/* + * Minimum number of usable input buffer to safely decode one LZMA symbol. + * The worst case is that we decode 22 bits using probabilities and 26 + * direct bits. This may decode at maximum of 20 bytes of input. However, + * lzma_main() does an extra normalization before returning, thus we + * need to put 21 here. 
+ */ +#define LZMA_IN_REQUIRED 21 + +/* + * Dictionary (history buffer) + * + * These are always true: + * start <= pos <= full <= end + * pos <= limit <= end + * + * In multi-call mode, also these are true: + * end == size + * size <= size_max + * allocated <= size + * + * Most of these variables are size_t to support single-call mode, + * in which the dictionary variables address the actual output + * buffer directly. + */ +struct dictionary { + /* Beginning of the history buffer */ + uint8_t *buf; + + /* Old position in buf (before decoding more data) */ + size_t start; + + /* Position in buf */ + size_t pos; + + /* + * How full dictionary is. This is used to detect corrupt input that + * would read beyond the beginning of the uncompressed stream. + */ + size_t full; + + /* Write limit; we don't write to buf[limit] or later bytes. */ + size_t limit; + + /* + * End of the dictionary buffer. In multi-call mode, this is + * the same as the dictionary size. In single-call mode, this + * indicates the size of the output buffer. + */ + size_t end; + + /* + * Size of the dictionary as specified in Block Header. This is used + * together with "full" to detect corrupt input that would make us + * read beyond the beginning of the uncompressed stream. + */ + uint32_t size; + + /* + * Maximum allowed dictionary size in multi-call mode. + * This is ignored in single-call mode. + */ + uint32_t size_max; + + /* + * Amount of memory currently allocated for the dictionary. + * This is used only with XZ_DYNALLOC. (With XZ_PREALLOC, + * size_max is always the same as the allocated size.) + */ + uint32_t allocated; + + /* Operation mode */ + enum xz_mode mode; +}; + +/* Range decoder */ +struct rc_dec { + uint32_t range; + uint32_t code; + + /* + * Number of initializing bytes remaining to be read + * by rc_read_init(). + */ + uint32_t init_bytes_left; + + /* + * Buffer from which we read our input. It can be either + * temp.buf or the caller-provided input buffer. + */ + const uint8_t *in; + size_t in_pos; + size_t in_limit; +}; + +/* Probabilities for a length decoder. */ +struct lzma_len_dec { + /* Probability of match length being at least 10 */ + uint16_t choice; + + /* Probability of match length being at least 18 */ + uint16_t choice2; + + /* Probabilities for match lengths 2-9 */ + uint16_t low[POS_STATES_MAX][LEN_LOW_SYMBOLS]; + + /* Probabilities for match lengths 10-17 */ + uint16_t mid[POS_STATES_MAX][LEN_MID_SYMBOLS]; + + /* Probabilities for match lengths 18-273 */ + uint16_t high[LEN_HIGH_SYMBOLS]; +}; + +struct lzma_dec { + /* Distances of latest four matches */ + uint32_t rep0; + uint32_t rep1; + uint32_t rep2; + uint32_t rep3; + + /* Types of the most recently seen LZMA symbols */ + enum lzma_state state; + + /* + * Length of a match. This is updated so that dict_repeat can + * be called again to finish repeating the whole match. + */ + uint32_t len; + + /* + * LZMA properties or related bit masks (number of literal + * context bits, a mask derived from the number of literal + * position bits, and a mask derived from the number of + * position bits) + */ + uint32_t lc; + uint32_t literal_pos_mask; /* (1 << lp) - 1 */ + uint32_t pos_mask; /* (1 << pb) - 1 */ + + /* If 1, it's a match. Otherwise it's a single 8-bit literal. */ + uint16_t is_match[STATES][POS_STATES_MAX]; + + /* If 1, it's a repeated match. The distance is one of rep0 .. rep3. */ + uint16_t is_rep[STATES]; + + /* + * If 0, distance of a repeated match is rep0. + * Otherwise check is_rep1.
+ */ + uint16_t is_rep0[STATES]; + + /* + * If 0, distance of a repeated match is rep1. + * Otherwise check is_rep2. + */ + uint16_t is_rep1[STATES]; + + /* If 0, distance of a repeated match is rep2. Otherwise it is rep3. */ + uint16_t is_rep2[STATES]; + + /* + * If 1, the repeated match has length of one byte. Otherwise + * the length is decoded from rep_len_decoder. + */ + uint16_t is_rep0_long[STATES][POS_STATES_MAX]; + + /* + * Probability tree for the highest two bits of the match + * distance. There is a separate probability tree for match + * lengths of 2 (i.e. MATCH_LEN_MIN), 3, 4, and [5, 273]. + */ + uint16_t dist_slot[DIST_STATES][DIST_SLOTS]; + + /* + * Probability trees for additional bits for match distance + * when the distance is in the range [4, 127]. + */ + uint16_t dist_special[FULL_DISTANCES - DIST_MODEL_END]; + + /* + * Probability tree for the lowest four bits of a match + * distance that is equal to or greater than 128. + */ + uint16_t dist_align[ALIGN_SIZE]; + + /* Length of a normal match */ + struct lzma_len_dec match_len_dec; + + /* Length of a repeated match */ + struct lzma_len_dec rep_len_dec; + + /* Probabilities of literals */ + uint16_t literal[LITERAL_CODERS_MAX][LITERAL_CODER_SIZE]; +}; + +struct lzma2_dec { + /* Position in xz_dec_lzma2_run(). */ + enum lzma2_seq { + SEQ_CONTROL, + SEQ_UNCOMPRESSED_1, + SEQ_UNCOMPRESSED_2, + SEQ_COMPRESSED_0, + SEQ_COMPRESSED_1, + SEQ_PROPERTIES, + SEQ_LZMA_PREPARE, + SEQ_LZMA_RUN, + SEQ_COPY + } sequence; + + /* Next position after decoding the compressed size of the chunk. */ + enum lzma2_seq next_sequence; + + /* Uncompressed size of LZMA chunk (2 MiB at maximum) */ + uint32_t uncompressed; + + /* + * Compressed size of LZMA chunk or compressed/uncompressed + * size of uncompressed chunk (64 KiB at maximum) + */ + uint32_t compressed; + + /* + * True if dictionary reset is needed. This is false before + * the first chunk (LZMA or uncompressed). + */ + bool_t need_dict_reset; + + /* + * True if new LZMA properties are needed. This is false + * before the first LZMA chunk. + */ + bool_t need_props; +}; + +struct xz_dec_lzma2 { + /* + * The order below is important on x86 to reduce code size and + * it shouldn't hurt on other platforms. Everything up to and + * including lzma.pos_mask are in the first 128 bytes on x86-32, + * which allows using smaller instructions to access those + * variables. On x86-64, fewer variables fit into the first 128 + * bytes, but this is still the best order without sacrificing + * the readability by splitting the structures. + */ + struct rc_dec rc; + struct dictionary dict; + struct lzma2_dec lzma2; + struct lzma_dec lzma; + + /* + * Temporary buffer which holds small number of input bytes between + * decoder calls. See lzma2_lzma() for details. + */ + struct { + uint32_t size; + uint8_t buf[3 * LZMA_IN_REQUIRED]; + } temp; +}; + +/************** + * Dictionary * + **************/ + +/* + * Reset the dictionary state. When in single-call mode, set up the beginning + * of the dictionary to point to the actual output buffer.
+ */ +static void INIT dict_reset(struct dictionary *dict, struct xz_buf *b) +{ + if (DEC_IS_SINGLE(dict->mode)) { + dict->buf = b->out + b->out_pos; + dict->end = b->out_size - b->out_pos; + } + + dict->start = 0; + dict->pos = 0; + dict->limit = 0; + dict->full = 0; +} + +/* Set dictionary write limit */ +static void INIT dict_limit(struct dictionary *dict, size_t out_max) +{ + if (dict->end - dict->pos <= out_max) + dict->limit = dict->end; + else + dict->limit = dict->pos + out_max; +} + +/* Return true if at least one byte can be written into the dictionary. */ +static inline bool_t INIT dict_has_space(const struct dictionary *dict) +{ + return dict->pos < dict->limit; +} + +/* + * Get a byte from the dictionary at the given distance. The distance is + * assumed to be valid, or as a special case, zero when the dictionary is + * still empty. This special case is needed for single-call decoding to + * avoid writing a '\0' to the end of the destination buffer. + */ +static inline uint32_t INIT dict_get(const struct dictionary *dict, uint32_t dist) +{ + size_t offset = dict->pos - dist - 1; + + if (dist >= dict->pos) + offset += dict->end; + + return dict->full > 0 ? dict->buf[offset] : 0; +} + +/* + * Put one byte into the dictionary. It is assumed that there is space for it. + */ +static inline void INIT dict_put(struct dictionary *dict, uint8_t byte) +{ + dict->buf[dict->pos++] = byte; + + if (dict->full < dict->pos) + dict->full = dict->pos; +} + +/* + * Repeat given number of bytes from the given distance. If the distance is + * invalid, false is returned. On success, true is returned and *len is + * updated to indicate how many bytes were left to be repeated. + */ +static bool_t INIT dict_repeat(struct dictionary *dict, uint32_t *len, uint32_t dist) +{ + size_t back; + uint32_t left; + + if (dist >= dict->full || dist >= dict->size) + return false; + + left = min_t(size_t, dict->limit - dict->pos, *len); + *len -= left; + + back = dict->pos - dist - 1; + if (dist >= dict->pos) + back += dict->end; + + do { + dict->buf[dict->pos++] = dict->buf[back++]; + if (back == dict->end) + back = 0; + } while (--left > 0); + + if (dict->full < dict->pos) + dict->full = dict->pos; + + return true; +} + +/* Copy uncompressed data as is from input to dictionary and output buffers. */ +static void INIT dict_uncompressed(struct dictionary *dict, struct xz_buf *b, + uint32_t *left) +{ + size_t copy_size; + + while (*left > 0 && b->in_pos < b->in_size + && b->out_pos < b->out_size) { + copy_size = min(b->in_size - b->in_pos, + b->out_size - b->out_pos); + if (copy_size > dict->end - dict->pos) + copy_size = dict->end - dict->pos; + if (copy_size > *left) + copy_size = *left; + + *left -= copy_size; + + memcpy(dict->buf + dict->pos, b->in + b->in_pos, copy_size); + dict->pos += copy_size; + + if (dict->full < dict->pos) + dict->full = dict->pos; + + if (DEC_IS_MULTI(dict->mode)) { + if (dict->pos == dict->end) + dict->pos = 0; + + memcpy(b->out + b->out_pos, b->in + b->in_pos, + copy_size); + } + + dict->start = dict->pos; + + b->out_pos += copy_size; + b->in_pos += copy_size; + } +} + +/* + * Flush pending data from dictionary to b->out. It is assumed that there is + * enough space in b->out. This is guaranteed because caller uses dict_limit() + * before decoding data into the dictionary.
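The byte-at-a-time loop in dict_repeat() above is deliberate: it makes overlapping matches (distance smaller than length, the LZ77 run-length trick) read bytes that the same call has just written. A standalone sketch of that behaviour, simplified to omit the ring-buffer wrap and limit handling:

/* Sketch (simplified from dict_repeat() above): copying one byte at
 * a time is what makes overlapping back-references come out right. */
#include <assert.h>
#include <stddef.h>
#include <string.h>

static void lz_repeat(unsigned char *buf, size_t *pos,
                      size_t dist, size_t len)
{
    size_t back = *pos - dist - 1;    /* same origin as dict_repeat() */
    while (len-- > 0)
        buf[(*pos)++] = buf[back++];
}

int main(void)
{
    unsigned char buf[16] = "ab";
    size_t pos = 2;
    lz_repeat(buf, &pos, 1, 6);       /* source two bytes back: "ab" repeats */
    assert(memcmp(buf, "abababab", 8) == 0);
    return 0;
}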
+ */ +static uint32_t INIT dict_flush(struct dictionary *dict, struct xz_buf *b) +{ + size_t copy_size = dict->pos - dict->start; + + if (DEC_IS_MULTI(dict->mode)) { + if (dict->pos == dict->end) + dict->pos = 0; + + memcpy(b->out + b->out_pos, dict->buf + dict->start, + copy_size); + } + + dict->start = dict->pos; + b->out_pos += copy_size; + return copy_size; +} + +/***************** + * Range decoder * + *****************/ + +/* Reset the range decoder. */ +static void INIT rc_reset(struct rc_dec *rc) +{ + rc->range = (uint32_t)-1; + rc->code = 0; + rc->init_bytes_left = RC_INIT_BYTES; +} + +/* + * Read the first five initial bytes into rc->code if they haven't been + * read already. (Yes, the first byte gets completely ignored.) + */ +static bool_t INIT rc_read_init(struct rc_dec *rc, struct xz_buf *b) +{ + while (rc->init_bytes_left > 0) { + if (b->in_pos == b->in_size) + return false; + + rc->code = (rc->code << 8) + b->in[b->in_pos++]; + --rc->init_bytes_left; + } + + return true; +} + +/* Return true if there may not be enough input for the next decoding loop. */ +static inline bool_t INIT rc_limit_exceeded(const struct rc_dec *rc) +{ + return rc->in_pos > rc->in_limit; +} + +/* + * Return true if it is possible (from point of view of range decoder) that + * we have reached the end of the LZMA chunk. + */ +static inline bool_t INIT rc_is_finished(const struct rc_dec *rc) +{ + return rc->code == 0; +} + +/* Read the next input byte if needed. */ +static always_inline void rc_normalize(struct rc_dec *rc) +{ + if (rc->range < RC_TOP_VALUE) { + rc->range <<= RC_SHIFT_BITS; + rc->code = (rc->code << RC_SHIFT_BITS) + rc->in[rc->in_pos++]; + } +} + +/* + * Decode one bit. In some versions, this function has been split into three + * functions so that the compiler is supposed to be able to more easily avoid + * an extra branch. In this particular version of the LZMA decoder, this + * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3 + * on x86). Using a non-split version results in nicer looking code too. + * + * NOTE: This must return an int. Do not make it return a bool or the speed + * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care, + * and it generates 10-20 % faster code than GCC 3.x from this file anyway.) + */ +static always_inline int rc_bit(struct rc_dec *rc, uint16_t *prob) +{ + uint32_t bound; + int bit; + + rc_normalize(rc); + bound = (rc->range >> RC_BIT_MODEL_TOTAL_BITS) * *prob; + if (rc->code < bound) { + rc->range = bound; + *prob += (RC_BIT_MODEL_TOTAL - *prob) >> RC_MOVE_BITS; + bit = 0; + } else { + rc->range -= bound; + rc->code -= bound; + *prob -= *prob >> RC_MOVE_BITS; + bit = 1; + } + + return bit; +} + +/* Decode a bittree starting from the most significant bit. */ +static always_inline uint32_t rc_bittree(struct rc_dec *rc, + uint16_t *probs, uint32_t limit) +{ + uint32_t symbol = 1; + + do { + if (rc_bit(rc, &probs[symbol])) + symbol = (symbol << 1) + 1; + else + symbol <<= 1; + } while (symbol < limit); + + return symbol; +} + +/* Decode a bittree starting from the least significant bit.
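rc_bit() above is an adaptive binary arithmetic decoder: each decoded bit nudges its context's probability toward the observed value by a fixed fraction of the remaining gap. A self-contained sketch of just the model update, assuming the standard LZMA constants (RC_BIT_MODEL_TOTAL = 1 << 11, RC_MOVE_BITS = 5, which live in headers not shown in this hunk):

/* Sketch of the probability adaptation in rc_bit() above: an 11-bit
 * probability moves toward the observed bit by 1/32 of the gap. */
#include <stdint.h>
#include <stdio.h>

#define RC_BIT_MODEL_TOTAL (1u << 11)
#define RC_MOVE_BITS 5

static void update(uint16_t *prob, int bit)
{
    if (bit == 0)
        *prob += (RC_BIT_MODEL_TOTAL - *prob) >> RC_MOVE_BITS;
    else
        *prob -= *prob >> RC_MOVE_BITS;
}

int main(void)
{
    uint16_t p = RC_BIT_MODEL_TOTAL / 2;   /* 1024: 50/50 initial state */
    for (int i = 0; i < 8; i++)
        update(&p, 0);                     /* a run of zero bits... */
    printf("P(bit=0) ~ %u/2048\n", (unsigned)p);  /* ...pushes p above 1024 */
    return 0;
}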
*/ +static always_inline void rc_bittree_reverse(struct rc_dec *rc, + uint16_t *probs, + uint32_t *dest, uint32_t limit) +{ + uint32_t symbol = 1; + uint32_t i = 0; + + do { + if (rc_bit(rc, &probs[symbol])) { + symbol = (symbol << 1) + 1; + *dest += 1 << i; + } else { + symbol <<= 1; + } + } while (++i < limit); +} + +/* Decode direct bits (fixed fifty-fifty probability) */ +static inline void INIT rc_direct(struct rc_dec *rc, uint32_t *dest, uint32_t limit) +{ + uint32_t mask; + + do { + rc_normalize(rc); + rc->range >>= 1; + rc->code -= rc->range; + mask = (uint32_t)0 - (rc->code >> 31); + rc->code += rc->range & mask; + *dest = (*dest << 1) + (mask + 1); + } while (--limit > 0); +} + +/******** + * LZMA * + ********/ + +/* Get pointer to literal coder probability array. */ +static uint16_t *INIT lzma_literal_probs(struct xz_dec_lzma2 *s) +{ + uint32_t prev_byte = dict_get(&s->dict, 0); + uint32_t low = prev_byte >> (8 - s->lzma.lc); + uint32_t high = (s->dict.pos & s->lzma.literal_pos_mask) << s->lzma.lc; + return s->lzma.literal[low + high]; +} + +/* Decode a literal (one 8-bit byte) */ +static void INIT lzma_literal(struct xz_dec_lzma2 *s) +{ + uint16_t *probs; + uint32_t symbol; + uint32_t match_byte; + uint32_t match_bit; + uint32_t offset; + uint32_t i; + + probs = lzma_literal_probs(s); + + if (lzma_state_is_literal(s->lzma.state)) { + symbol = rc_bittree(&s->rc, probs, 0x100); + } else { + symbol = 1; + match_byte = dict_get(&s->dict, s->lzma.rep0) << 1; + offset = 0x100; + + do { + match_bit = match_byte & offset; + match_byte <<= 1; + i = offset + match_bit + symbol; + + if (rc_bit(&s->rc, &probs[i])) { + symbol = (symbol << 1) + 1; + offset &= match_bit; + } else { + symbol <<= 1; + offset &= ~match_bit; + } + } while (symbol < 0x100); + } + + dict_put(&s->dict, (uint8_t)symbol); + lzma_state_literal(&s->lzma.state); +} + +/* Decode the length of the match into s->lzma.len. */ +static void INIT lzma_len(struct xz_dec_lzma2 *s, struct lzma_len_dec *l, + uint32_t pos_state) +{ + uint16_t *probs; + uint32_t limit; + + if (!rc_bit(&s->rc, &l->choice)) { + probs = l->low[pos_state]; + limit = LEN_LOW_SYMBOLS; + s->lzma.len = MATCH_LEN_MIN; + } else { + if (!rc_bit(&s->rc, &l->choice2)) { + probs = l->mid[pos_state]; + limit = LEN_MID_SYMBOLS; + s->lzma.len = MATCH_LEN_MIN + LEN_LOW_SYMBOLS; + } else { + probs = l->high; + limit = LEN_HIGH_SYMBOLS; + s->lzma.len = MATCH_LEN_MIN + LEN_LOW_SYMBOLS + + LEN_MID_SYMBOLS; + } + } + + s->lzma.len += rc_bittree(&s->rc, probs, limit) - limit; +} + +/* Decode a match. The distance will be stored in s->lzma.rep0. 
*/ +static void INIT lzma_match(struct xz_dec_lzma2 *s, uint32_t pos_state) +{ + uint16_t *probs; + uint32_t dist_slot; + uint32_t limit; + + lzma_state_match(&s->lzma.state); + + s->lzma.rep3 = s->lzma.rep2; + s->lzma.rep2 = s->lzma.rep1; + s->lzma.rep1 = s->lzma.rep0; + + lzma_len(s, &s->lzma.match_len_dec, pos_state); + + probs = s->lzma.dist_slot[lzma_get_dist_state(s->lzma.len)]; + dist_slot = rc_bittree(&s->rc, probs, DIST_SLOTS) - DIST_SLOTS; + + if (dist_slot < DIST_MODEL_START) { + s->lzma.rep0 = dist_slot; + } else { + limit = (dist_slot >> 1) - 1; + s->lzma.rep0 = 2 + (dist_slot & 1); + + if (dist_slot < DIST_MODEL_END) { + s->lzma.rep0 <<= limit; + probs = s->lzma.dist_special + s->lzma.rep0 + - dist_slot - 1; + rc_bittree_reverse(&s->rc, probs, + &s->lzma.rep0, limit); + } else { + rc_direct(&s->rc, &s->lzma.rep0, limit - ALIGN_BITS); + s->lzma.rep0 <<= ALIGN_BITS; + rc_bittree_reverse(&s->rc, s->lzma.dist_align, + &s->lzma.rep0, ALIGN_BITS); + } + } +} + +/* + * Decode a repeated match. The distance is one of the four most recently + * seen matches. The distance will be stored in s->lzma.rep0. + */ +static void INIT lzma_rep_match(struct xz_dec_lzma2 *s, uint32_t pos_state) +{ + uint32_t tmp; + + if (!rc_bit(&s->rc, &s->lzma.is_rep0[s->lzma.state])) { + if (!rc_bit(&s->rc, &s->lzma.is_rep0_long[ + s->lzma.state][pos_state])) { + lzma_state_short_rep(&s->lzma.state); + s->lzma.len = 1; + return; + } + } else { + if (!rc_bit(&s->rc, &s->lzma.is_rep1[s->lzma.state])) { + tmp = s->lzma.rep1; + } else { + if (!rc_bit(&s->rc, &s->lzma.is_rep2[s->lzma.state])) { + tmp = s->lzma.rep2; + } else { + tmp = s->lzma.rep3; + s->lzma.rep3 = s->lzma.rep2; + } + + s->lzma.rep2 = s->lzma.rep1; + } + + s->lzma.rep1 = s->lzma.rep0; + s->lzma.rep0 = tmp; + } + + lzma_state_long_rep(&s->lzma.state); + lzma_len(s, &s->lzma.rep_len_dec, pos_state); +} + +/* LZMA decoder core */ +static bool_t INIT lzma_main(struct xz_dec_lzma2 *s) +{ + uint32_t pos_state; + + /* + * If the dictionary was reached during the previous call, try to + * finish the possibly pending repeat in the dictionary. + */ + if (dict_has_space(&s->dict) && s->lzma.len > 0) + dict_repeat(&s->dict, &s->lzma.len, s->lzma.rep0); + + /* + * Decode more LZMA symbols. One iteration may consume up to + * LZMA_IN_REQUIRED - 1 bytes. + */ + while (dict_has_space(&s->dict) && !rc_limit_exceeded(&s->rc)) { + pos_state = s->dict.pos & s->lzma.pos_mask; + + if (!rc_bit(&s->rc, &s->lzma.is_match[ + s->lzma.state][pos_state])) { + lzma_literal(s); + } else { + if (rc_bit(&s->rc, &s->lzma.is_rep[s->lzma.state])) + lzma_rep_match(s, pos_state); + else + lzma_match(s, pos_state); + + if (!dict_repeat(&s->dict, &s->lzma.len, s->lzma.rep0)) + return false; + } + } + + /* + * Having the range decoder always normalized when we are outside + * this function makes it easier to correctly handle end of the chunk. + */ + rc_normalize(&s->rc); + + return true; +} + +/* + * Reset the LZMA decoder and range decoder state. Dictionary is not reset + * here, because LZMA state may be reset without resetting the dictionary. + */ +static void INIT lzma_reset(struct xz_dec_lzma2 *s) +{ + uint16_t *probs; + size_t i; + + s->lzma.state = STATE_LIT_LIT; + s->lzma.rep0 = 0; + s->lzma.rep1 = 0; + s->lzma.rep2 = 0; + s->lzma.rep3 = 0; + + /* + * All probabilities are initialized to the same value. This hack + * makes the code smaller by avoiding a separate loop for each + * probability array.
+ * + * This could be optimized so that only the part of the literal + * probabilities that is actually required gets initialized. In the + * common case we would write 12 KiB less. + */ + probs = s->lzma.is_match[0]; + for (i = 0; i < PROBS_TOTAL; ++i) + probs[i] = RC_BIT_MODEL_TOTAL / 2; + + rc_reset(&s->rc); +} + +/* + * Decode and validate LZMA properties (lc/lp/pb) and calculate the bit masks + * from the decoded lp and pb values. On success, the LZMA decoder state is + * reset and true is returned. + */ +static bool_t INIT lzma_props(struct xz_dec_lzma2 *s, uint8_t props) +{ + if (props > (4 * 5 + 4) * 9 + 8) + return false; + + s->lzma.pos_mask = 0; + while (props >= 9 * 5) { + props -= 9 * 5; + ++s->lzma.pos_mask; + } + + s->lzma.pos_mask = (1 << s->lzma.pos_mask) - 1; + + s->lzma.literal_pos_mask = 0; + while (props >= 9) { + props -= 9; + ++s->lzma.literal_pos_mask; + } + + s->lzma.lc = props; + + if (s->lzma.lc + s->lzma.literal_pos_mask > 4) + return false; + + s->lzma.literal_pos_mask = (1 << s->lzma.literal_pos_mask) - 1; + + lzma_reset(s); + + return true; +}
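lzma_props() above unpacks the packed properties byte props = (pb * 5 + lp) * 9 + lc. A standalone sketch of the same split (hypothetical helper, not part of the patch); the byte 0x5D decodes to the familiar LZMA defaults lc=3, lp=0, pb=2:

#include <stdint.h>
#include <assert.h>

/* Split an LZMA properties byte into lc/lp/pb, mirroring lzma_props(). */
static int split_props(uint8_t props, uint32_t *lc, uint32_t *lp, uint32_t *pb)
{
    if (props > (4 * 5 + 4) * 9 + 8)    /* 224: pb <= 4, lp <= 4, lc <= 8 */
        return 0;

    *pb = props / (9 * 5);
    props %= 9 * 5;
    *lp = props / 9;
    *lc = props % 9;
    return *lc + *lp <= 4;              /* extra limit enforced by the decoder */
}

int main(void)
{
    uint32_t lc, lp, pb;

    assert(split_props(0x5D, &lc, &lp, &pb));
    assert(lc == 3 && lp == 0 && pb == 2);
    return 0;
}

+ +/********* + * LZMA2 * + *********/ + +/* + * The LZMA decoder assumes that if the input limit (s->rc.in_limit) hasn't + * been exceeded, it is safe to read up to LZMA_IN_REQUIRED bytes. This + * wrapper function takes care of making the LZMA decoder's assumption safe. + * + * As long as there is plenty of input left to be decoded in the current LZMA + * chunk, we decode directly from the caller-supplied input buffer until + * there are LZMA_IN_REQUIRED bytes left. Those remaining bytes are copied into + * s->temp.buf, which (hopefully) gets filled on the next call to this + * function. We decode a few bytes from the temporary buffer so that we can + * continue decoding from the caller-supplied input buffer again.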
+ */ +static bool_t INIT lzma2_lzma(struct xz_dec_lzma2 *s, struct xz_buf *b) +{ + size_t in_avail; + uint32_t tmp; + + in_avail = b->in_size - b->in_pos; + if (s->temp.size > 0 || s->lzma2.compressed == 0) { + tmp = 2 * LZMA_IN_REQUIRED - s->temp.size; + if (tmp > s->lzma2.compressed - s->temp.size) + tmp = s->lzma2.compressed - s->temp.size; + if (tmp > in_avail) + tmp = in_avail; + + memcpy(s->temp.buf + s->temp.size, b->in + b->in_pos, tmp); + + if (s->temp.size + tmp == s->lzma2.compressed) { + memzero(s->temp.buf + s->temp.size + tmp, + sizeof(s->temp.buf) + - s->temp.size - tmp); + s->rc.in_limit = s->temp.size + tmp; + } else if (s->temp.size + tmp < LZMA_IN_REQUIRED) { + s->temp.size += tmp; + b->in_pos += tmp; + return true; + } else { + s->rc.in_limit = s->temp.size + tmp - LZMA_IN_REQUIRED; + } + + s->rc.in = s->temp.buf; + s->rc.in_pos = 0; + + if (!lzma_main(s) || s->rc.in_pos > s->temp.size + tmp) + return false; + + s->lzma2.compressed -= s->rc.in_pos; + + if (s->rc.in_pos < s->temp.size) { + s->temp.size -= s->rc.in_pos; + memmove(s->temp.buf, s->temp.buf + s->rc.in_pos, + s->temp.size); + return true; + } + + b->in_pos += s->rc.in_pos - s->temp.size; + s->temp.size = 0; + } + + in_avail = b->in_size - b->in_pos; + if (in_avail >= LZMA_IN_REQUIRED) { + s->rc.in = b->in; + s->rc.in_pos = b->in_pos; + + if (in_avail >= s->lzma2.compressed + LZMA_IN_REQUIRED) + s->rc.in_limit = b->in_pos + s->lzma2.compressed; + else + s->rc.in_limit = b->in_size - LZMA_IN_REQUIRED; + + if (!lzma_main(s)) + return false; + + in_avail = s->rc.in_pos - b->in_pos; + if (in_avail > s->lzma2.compressed) + return false; + + s->lzma2.compressed -= in_avail; + b->in_pos = s->rc.in_pos; + } + + in_avail = b->in_size - b->in_pos; + if (in_avail < LZMA_IN_REQUIRED) { + if (in_avail > s->lzma2.compressed) + in_avail = s->lzma2.compressed; + + memcpy(s->temp.buf, b->in + b->in_pos, in_avail); + s->temp.size = in_avail; + b->in_pos += in_avail; + } + + return true; +}
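The control-byte table documented in xz_dec_lzma2_run() below reduces to a small classifier. A standalone sketch (hypothetical helper and enum, not part of the patch):

#include <stdint.h>
#include <assert.h>

enum chunk_kind { END_MARKER, UNCOMP_DICT_RESET, UNCOMP, LZMA_CHUNK, INVALID };

/* Classify an LZMA2 control byte; for LZMA chunks also extract
 * bits 16-20 of the uncompressed chunk size. */
static enum chunk_kind classify_control(uint8_t c, uint32_t *unc_size_high)
{
    if (c == 0x00)
        return END_MARKER;
    if (c == 0x01)
        return UNCOMP_DICT_RESET;
    if (c == 0x02)
        return UNCOMP;
    if (c >= 0x80) {
        *unc_size_high = (uint32_t)(c & 0x1F) << 16;
        return LZMA_CHUNK;
    }
    return INVALID;                      /* 0x03-0x7F */
}

int main(void)
{
    uint32_t high = 0;

    assert(classify_control(0xE0, &high) == LZMA_CHUNK && high == 0);
    assert(classify_control(0x9F, &high) == LZMA_CHUNK && high == 0x1F0000);
    assert(classify_control(0x7F, &high) == INVALID);
    return 0;
}

+ +/* + * Take care of the LZMA2 control layer, and forward the job of actual LZMA + * decoding or copying of uncompressed chunks to other functions. + */ +XZ_EXTERN enum xz_ret INIT xz_dec_lzma2_run(struct xz_dec_lzma2 *s, + struct xz_buf *b) +{ + uint32_t tmp; + + while (b->in_pos < b->in_size || s->lzma2.sequence == SEQ_LZMA_RUN) { + switch (s->lzma2.sequence) { + case SEQ_CONTROL: + /* + * LZMA2 control byte + * + * Exact values: + * 0x00 End marker + * 0x01 Dictionary reset followed by + * an uncompressed chunk + * 0x02 Uncompressed chunk (no dictionary reset) + * + * Highest three bits (s->control & 0xE0): + * 0xE0 Dictionary reset, new properties and state + * reset, followed by LZMA compressed chunk + * 0xC0 New properties and state reset, followed + * by LZMA compressed chunk (no dictionary + * reset) + * 0xA0 State reset using old properties, + * followed by LZMA compressed chunk (no + * dictionary reset) + * 0x80 LZMA chunk (no dictionary or state reset) + * + * For LZMA compressed chunks, the lowest five bits + * (s->control & 0x1F) are the highest bits of the + * uncompressed size (bits 16-20). + * + * A new LZMA2 stream must begin with a dictionary + * reset. The first LZMA chunk must set new + * properties and reset the LZMA state. + * + * Values that don't match anything described above + * are invalid and we return XZ_DATA_ERROR.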
+ */ + tmp = b->in[b->in_pos++]; + + if (tmp == 0x00) + return XZ_STREAM_END; + + if (tmp >= 0xE0 || tmp == 0x01) { + s->lzma2.need_props = true; + s->lzma2.need_dict_reset = false; + dict_reset(&s->dict, b); + } else if (s->lzma2.need_dict_reset) { + return XZ_DATA_ERROR; + } + + if (tmp >= 0x80) { + s->lzma2.uncompressed = (tmp & 0x1F) << 16; + s->lzma2.sequence = SEQ_UNCOMPRESSED_1; + + if (tmp >= 0xC0) { + /* + * When there are new properties, + * state reset is done at + * SEQ_PROPERTIES. + */ + s->lzma2.need_props = false; + s->lzma2.next_sequence + = SEQ_PROPERTIES; + + } else if (s->lzma2.need_props) { + return XZ_DATA_ERROR; + + } else { + s->lzma2.next_sequence + = SEQ_LZMA_PREPARE; + if (tmp >= 0xA0) + lzma_reset(s); + } + } else { + if (tmp > 0x02) + return XZ_DATA_ERROR; + + s->lzma2.sequence = SEQ_COMPRESSED_0; + s->lzma2.next_sequence = SEQ_COPY; + } + + break; + + case SEQ_UNCOMPRESSED_1: + s->lzma2.uncompressed + += (uint32_t)b->in[b->in_pos++] << 8; + s->lzma2.sequence = SEQ_UNCOMPRESSED_2; + break; + + case SEQ_UNCOMPRESSED_2: + s->lzma2.uncompressed + += (uint32_t)b->in[b->in_pos++] + 1; + s->lzma2.sequence = SEQ_COMPRESSED_0; + break; + + case SEQ_COMPRESSED_0: + s->lzma2.compressed + = (uint32_t)b->in[b->in_pos++] << 8; + s->lzma2.sequence = SEQ_COMPRESSED_1; + break; + + case SEQ_COMPRESSED_1: + s->lzma2.compressed + += (uint32_t)b->in[b->in_pos++] + 1; + s->lzma2.sequence = s->lzma2.next_sequence; + break; + + case SEQ_PROPERTIES: + if (!lzma_props(s, b->in[b->in_pos++])) + return XZ_DATA_ERROR; + + s->lzma2.sequence = SEQ_LZMA_PREPARE; + + case SEQ_LZMA_PREPARE: + if (s->lzma2.compressed < RC_INIT_BYTES) + return XZ_DATA_ERROR; + + if (!rc_read_init(&s->rc, b)) + return XZ_OK; + + s->lzma2.compressed -= RC_INIT_BYTES; + s->lzma2.sequence = SEQ_LZMA_RUN; + + case SEQ_LZMA_RUN: + /* + * Set dictionary limit to indicate how much we want + * to be decoded at maximum. Decode new data into the + * dictionary. Flush the new data from dictionary to + * b->out. Check if we finished decoding this chunk. + * In case the dictionary got full but we didn't fill + * the output buffer yet, we may run this loop + * multiple times without changing s->lzma2.sequence.
+ */ + dict_limit(&s->dict, min_t(size_t, + b->out_size - b->out_pos, + s->lzma2.uncompressed)); + if (!lzma2_lzma(s, b)) + return XZ_DATA_ERROR; + + s->lzma2.uncompressed -= dict_flush(&s->dict, b); + + if (s->lzma2.uncompressed == 0) { + if (s->lzma2.compressed > 0 || s->lzma.len > 0 + || !rc_is_finished(&s->rc)) + return XZ_DATA_ERROR; + + rc_reset(&s->rc); + s->lzma2.sequence = SEQ_CONTROL; + + } else if (b->out_pos == b->out_size + || (b->in_pos == b->in_size + && s->temp.size + < s->lzma2.compressed)) { + return XZ_OK; + } + + break; + + case SEQ_COPY: + dict_uncompressed(&s->dict, b, &s->lzma2.compressed); + if (s->lzma2.compressed > 0) + return XZ_OK; + + s->lzma2.sequence = SEQ_CONTROL; + break; + } + } + + return XZ_OK; +} + +XZ_EXTERN struct xz_dec_lzma2 *INIT xz_dec_lzma2_create(enum xz_mode mode, + uint32_t dict_max) +{ + struct xz_dec_lzma2 *s = malloc(sizeof(*s)); + if (s == NULL) + return NULL; + + s->dict.mode = mode; + s->dict.size_max = dict_max; + + if (DEC_IS_PREALLOC(mode)) { + s->dict.buf = large_malloc(dict_max); + if (s->dict.buf == NULL) { + free(s); + return NULL; + } + } else if (DEC_IS_DYNALLOC(mode)) { + s->dict.buf = NULL; + s->dict.allocated = 0; + } + + return s; +} + +XZ_EXTERN enum xz_ret INIT xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) +{ + /* This limits dictionary size to 3 GiB to keep parsing simpler. */ + if (props > 39) + return XZ_OPTIONS_ERROR; + + s->dict.size = 2 + (props & 1); + s->dict.size <<= (props >> 1) + 11; + + if (DEC_IS_MULTI(s->dict.mode)) { + if (s->dict.size > s->dict.size_max) + return XZ_MEMLIMIT_ERROR; + + s->dict.end = s->dict.size; + + if (DEC_IS_DYNALLOC(s->dict.mode)) { + if (s->dict.allocated < s->dict.size) { + large_free(s->dict.buf); + s->dict.buf = large_malloc(s->dict.size); + if (s->dict.buf == NULL) { + s->dict.allocated = 0; + return XZ_MEM_ERROR; + } + } + } + } + + s->lzma.len = 0; + + s->lzma2.sequence = SEQ_CONTROL; + s->lzma2.need_dict_reset = true; + + s->temp.size = 0; + + return XZ_OK; +} + +XZ_EXTERN void INIT xz_dec_lzma2_end(struct xz_dec_lzma2 *s) +{ + if (DEC_IS_MULTI(s->dict.mode)) + large_free(s->dict.buf); + + free(s); +}
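xz_dec_lzma2_reset() above derives the whole dictionary size from one properties byte, size = (2 | (props & 1)) << (props / 2 + 11), which is why props > 39 (a 3 GiB dictionary) is rejected. A standalone sketch (hypothetical helper, not part of the patch):

#include <stdint.h>
#include <assert.h>

/* Dictionary size from the LZMA2 dictionary-size properties byte. */
static uint64_t dict_size_from_props(uint8_t props)
{
    uint64_t size = 2 + (props & 1);
    return size << ((props >> 1) + 11);
}

int main(void)
{
    assert(dict_size_from_props(0) == 4096);        /* 2 << 11, minimum */
    assert(dict_size_from_props(1) == 6144);        /* 3 << 11 */
    assert(dict_size_from_props(39) == 3ULL << 30); /* 3 GiB, maximum */
    return 0;
}

diff -Nru xen-4.1.3/xen/common/xz/dec_stream.c xen-4.1.5/xen/common/xz/dec_stream.c --- xen-4.1.3/xen/common/xz/dec_stream.c 1970-01-01 01:00:00.000000000 +0100 +++ xen-4.1.5/xen/common/xz/dec_stream.c 2013-04-23 18:44:20.000000000 +0200 @@ -0,0 +1,821 @@ +/* + * .xz Stream decoder + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include "private.h" +#include "stream.h" + +/* Hash used to validate the Index field */ +struct xz_dec_hash { + vli_type unpadded; + vli_type uncompressed; + uint32_t crc32; +}; + +struct xz_dec { + /* Position in dec_main() */ + enum { + SEQ_STREAM_HEADER, + SEQ_BLOCK_START, + SEQ_BLOCK_HEADER, + SEQ_BLOCK_UNCOMPRESS, + SEQ_BLOCK_PADDING, + SEQ_BLOCK_CHECK, + SEQ_INDEX, + SEQ_INDEX_PADDING, + SEQ_INDEX_CRC32, + SEQ_STREAM_FOOTER + } sequence; + + /* Position in variable-length integers and Check fields */ + uint32_t pos; + + /* Variable-length integer decoded by dec_vli() */ + vli_type vli; + + /* Saved in_pos and out_pos */ + size_t in_start; + size_t out_start; + + /* CRC32 value in Block or Index */ + uint32_t crc32; + + /* Type of the integrity check calculated from uncompressed data */ + enum xz_check check_type; + + /* Operation mode */ + enum xz_mode mode; + + /* + * True if the next call to xz_dec_run() is allowed to return + * XZ_BUF_ERROR.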
+ */ + bool_t allow_buf_error; + + /* Information stored in Block Header */ + struct { + /* + * Value stored in the Compressed Size field, or + * VLI_UNKNOWN if Compressed Size is not present. + */ + vli_type compressed; + + /* + * Value stored in the Uncompressed Size field, or + * VLI_UNKNOWN if Uncompressed Size is not present. + */ + vli_type uncompressed; + + /* Size of the Block Header field */ + uint32_t size; + } block_header; + + /* Information collected when decoding Blocks */ + struct { + /* Observed compressed size of the current Block */ + vli_type compressed; + + /* Observed uncompressed size of the current Block */ + vli_type uncompressed; + + /* Number of Blocks decoded so far */ + vli_type count; + + /* + * Hash calculated from the Block sizes. This is used to + * validate the Index field. + */ + struct xz_dec_hash hash; + } block; + + /* Variables needed when verifying the Index field */ + struct { + /* Position in dec_index() */ + enum { + SEQ_INDEX_COUNT, + SEQ_INDEX_UNPADDED, + SEQ_INDEX_UNCOMPRESSED + } sequence; + + /* Size of the Index in bytes */ + vli_type size; + + /* Number of Records (matches block.count in valid files) */ + vli_type count; + + /* + * Hash calculated from the Records (matches block.hash in + * valid files). + */ + struct xz_dec_hash hash; + } index; + + /* + * Temporary buffer needed to hold Stream Header, Block Header, + * and Stream Footer. The Block Header is the biggest (1 KiB) + * so we reserve space according to that. buf[] has to be aligned + * to a multiple of four bytes; the size_t variables before it + * should guarantee this. + */ + struct { + size_t pos; + size_t size; + uint8_t buf[1024]; + } temp; + + struct xz_dec_lzma2 *lzma2; + +#ifdef XZ_DEC_BCJ + struct xz_dec_bcj *bcj; + bool_t bcj_active; +#endif +}; + +#ifdef XZ_DEC_ANY_CHECK +/* Sizes of the Check field with different Check IDs */ +static const uint8_t check_sizes[16] = { + 0, + 4, 4, 4, + 8, 8, 8, + 16, 16, 16, + 32, 32, 32, + 64, 64, 64 +}; +#endif + +/* + * Fill s->temp by copying data starting from b->in[b->in_pos]. Caller + * must have set s->temp.pos to indicate how much data we are supposed + * to copy into s->temp.buf. Return true once s->temp.pos has reached + * s->temp.size. + */ +static bool_t INIT fill_temp(struct xz_dec *s, struct xz_buf *b) +{ + size_t copy_size = min_t(size_t, + b->in_size - b->in_pos, s->temp.size - s->temp.pos); + + memcpy(s->temp.buf + s->temp.pos, b->in + b->in_pos, copy_size); + b->in_pos += copy_size; + s->temp.pos += copy_size; + + if (s->temp.pos == s->temp.size) { + s->temp.pos = 0; + return true; + } + + return false; +} + +/* Decode a variable-length integer (little-endian base-128 encoding) */ +static enum xz_ret INIT dec_vli(struct xz_dec *s, const uint8_t *in, + size_t *in_pos, size_t in_size) +{ + uint8_t byte; + + if (s->pos == 0) + s->vli = 0; + + while (*in_pos < in_size) { + byte = in[*in_pos]; + ++*in_pos; + + s->vli |= (vli_type)(byte & 0x7F) << s->pos; + + if ((byte & 0x80) == 0) { + /* Don't allow non-minimal encodings. */ + if (byte == 0 && s->pos != 0) + return XZ_DATA_ERROR; + + s->pos = 0; + return XZ_STREAM_END; + } + + s->pos += 7; + if (s->pos == 7 * VLI_BYTES_MAX) + return XZ_DATA_ERROR; + } + + return XZ_OK; +}
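dec_vli() above is restartable across input buffers, which obscures the underlying encoding a little. The same little-endian base-128 scheme in one-shot form (hypothetical helper, not part of the patch):

#include <stdint.h>
#include <stddef.h>
#include <assert.h>

/* One-shot VLI decoder: 7 payload bits per byte, least-significant
 * group first, bit 7 set on every byte except the last. */
static int vli_decode(const uint8_t *in, size_t in_size, uint64_t *out)
{
    uint64_t value = 0;
    unsigned int pos = 0;
    size_t i;

    for (i = 0; i < in_size && pos < 7 * 9; ++i, pos += 7) {
        value |= (uint64_t)(in[i] & 0x7F) << pos;
        if (!(in[i] & 0x80)) {
            if (in[i] == 0 && pos != 0)
                return 0;       /* non-minimal encoding, as rejected above */
            *out = value;
            return 1;
        }
    }
    return 0;                   /* truncated or too long */
}

int main(void)
{
    static const uint8_t enc[] = { 0xE5, 0x74 };  /* 0x65, then 0x74 << 7 */
    uint64_t v = 0;
    int ok = vli_decode(enc, sizeof(enc), &v);

    assert(ok && v == 0x3A65);
    return 0;
}

+ +/* + * Decode the Compressed Data field from a Block.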
Update and validate + * the observed compressed and uncompressed sizes of the Block so that + * they don't exceed the values possibly stored in the Block Header + * (validation assumes that no integer overflow occurs, since vli_type + * is normally uint64_t). Update the CRC32 if presence of the CRC32 + * field was indicated in Stream Header. + * + * Once the decoding is finished, validate that the observed sizes match + * the sizes possibly stored in the Block Header. Update the hash and + * Block count, which are later used to validate the Index field. + */ +static enum xz_ret INIT dec_block(struct xz_dec *s, struct xz_buf *b) +{ + enum xz_ret ret; + + s->in_start = b->in_pos; + s->out_start = b->out_pos; + +#ifdef XZ_DEC_BCJ + if (s->bcj_active) + ret = xz_dec_bcj_run(s->bcj, s->lzma2, b); + else +#endif + ret = xz_dec_lzma2_run(s->lzma2, b); + + s->block.compressed += b->in_pos - s->in_start; + s->block.uncompressed += b->out_pos - s->out_start; + + /* + * There is no need to separately check for VLI_UNKNOWN, since + * the observed sizes are always smaller than VLI_UNKNOWN. + */ + if (s->block.compressed > s->block_header.compressed + || s->block.uncompressed + > s->block_header.uncompressed) + return XZ_DATA_ERROR; + + if (s->check_type == XZ_CHECK_CRC32) + s->crc32 = xz_crc32(b->out + s->out_start, + b->out_pos - s->out_start, s->crc32); + + if (ret == XZ_STREAM_END) { + if (s->block_header.compressed != VLI_UNKNOWN + && s->block_header.compressed + != s->block.compressed) + return XZ_DATA_ERROR; + + if (s->block_header.uncompressed != VLI_UNKNOWN + && s->block_header.uncompressed + != s->block.uncompressed) + return XZ_DATA_ERROR; + + s->block.hash.unpadded += s->block_header.size + + s->block.compressed; + +#ifdef XZ_DEC_ANY_CHECK + s->block.hash.unpadded += check_sizes[s->check_type]; +#else + if (s->check_type == XZ_CHECK_CRC32) + s->block.hash.unpadded += 4; +#endif + + s->block.hash.uncompressed += s->block.uncompressed; + s->block.hash.crc32 = xz_crc32( + (const uint8_t *)&s->block.hash, + sizeof(s->block.hash), s->block.hash.crc32); + + ++s->block.count; + } + + return ret; +} + +/* Update the Index size and the CRC32 value. */ +static void INIT index_update(struct xz_dec *s, const struct xz_buf *b) +{ + size_t in_used = b->in_pos - s->in_start; + s->index.size += in_used; + s->crc32 = xz_crc32(b->in + s->in_start, in_used, s->crc32); +} + +/* + * Decode the Number of Records, Unpadded Size, and Uncompressed Size + * fields from the Index field. That is, Index Padding and CRC32 are not + * decoded by this function. + * + * This can return XZ_OK (more input needed), XZ_STREAM_END (everything + * successfully decoded), or XZ_DATA_ERROR (input is corrupt). + */ +static enum xz_ret INIT dec_index(struct xz_dec *s, struct xz_buf *b) +{ + enum xz_ret ret; + + do { + ret = dec_vli(s, b->in, &b->in_pos, b->in_size); + if (ret != XZ_STREAM_END) { + index_update(s, b); + return ret; + } + + switch (s->index.sequence) { + case SEQ_INDEX_COUNT: + s->index.count = s->vli; + + /* + * Validate that the Number of Records field + * indicates the same number of Records as + * there were Blocks in the Stream. 
+ */ + if (s->index.count != s->block.count) + return XZ_DATA_ERROR; + + s->index.sequence = SEQ_INDEX_UNPADDED; + break; + + case SEQ_INDEX_UNPADDED: + s->index.hash.unpadded += s->vli; + s->index.sequence = SEQ_INDEX_UNCOMPRESSED; + break; + + case SEQ_INDEX_UNCOMPRESSED: + s->index.hash.uncompressed += s->vli; + s->index.hash.crc32 = xz_crc32( + (const uint8_t *)&s->index.hash, + sizeof(s->index.hash), + s->index.hash.crc32); + --s->index.count; + s->index.sequence = SEQ_INDEX_UNPADDED; + break; + } + } while (s->index.count > 0); + + return XZ_STREAM_END; +} + +/* + * Validate that the next four input bytes match the value of s->crc32. + * s->pos must be zero when starting to validate the first byte. + */ +static enum xz_ret INIT crc32_validate(struct xz_dec *s, struct xz_buf *b) +{ + do { + if (b->in_pos == b->in_size) + return XZ_OK; + + if (((s->crc32 >> s->pos) & 0xFF) != b->in[b->in_pos++]) + return XZ_DATA_ERROR; + + s->pos += 8; + + } while (s->pos < 32); + + s->crc32 = 0; + s->pos = 0; + + return XZ_STREAM_END; +} + +#ifdef XZ_DEC_ANY_CHECK +/* + * Skip over the Check field when the Check ID is not supported. + * Returns true once the whole Check field has been skipped over. + */ +static bool_t INIT check_skip(struct xz_dec *s, struct xz_buf *b) +{ + while (s->pos < check_sizes[s->check_type]) { + if (b->in_pos == b->in_size) + return false; + + ++b->in_pos; + ++s->pos; + } + + s->pos = 0; + + return true; +} +#endif + +/* Decode the Stream Header field (the first 12 bytes of the .xz Stream). */ +static enum xz_ret INIT dec_stream_header(struct xz_dec *s) +{ + if (!memeq(s->temp.buf, HEADER_MAGIC, HEADER_MAGIC_SIZE)) + return XZ_FORMAT_ERROR; + + if (xz_crc32(s->temp.buf + HEADER_MAGIC_SIZE, 2, 0) + != get_le32(s->temp.buf + HEADER_MAGIC_SIZE + 2)) + return XZ_DATA_ERROR; + + if (s->temp.buf[HEADER_MAGIC_SIZE] != 0) + return XZ_OPTIONS_ERROR; + + /* + * Of integrity checks, we support only none (Check ID = 0) and + * CRC32 (Check ID = 1). However, if XZ_DEC_ANY_CHECK is defined, + * we will accept other check types too, but then the check won't + * be verified and a warning (XZ_UNSUPPORTED_CHECK) will be given. + */ + s->check_type = s->temp.buf[HEADER_MAGIC_SIZE + 1]; + +#ifdef XZ_DEC_ANY_CHECK + if (s->check_type > XZ_CHECK_MAX) + return XZ_OPTIONS_ERROR; + + if (s->check_type > XZ_CHECK_CRC32) + return XZ_UNSUPPORTED_CHECK; +#else + if (s->check_type > XZ_CHECK_CRC32) + return XZ_OPTIONS_ERROR; +#endif + + return XZ_OK; +} + +/* Decode the Stream Footer field (the last 12 bytes of the .xz Stream) */ +static enum xz_ret INIT dec_stream_footer(struct xz_dec *s) +{ + if (!memeq(s->temp.buf + 10, FOOTER_MAGIC, FOOTER_MAGIC_SIZE)) + return XZ_DATA_ERROR; + + if (xz_crc32(s->temp.buf + 4, 6, 0) != get_le32(s->temp.buf)) + return XZ_DATA_ERROR; + + /* + * Validate Backward Size. Note that we never added the size of the + * Index CRC32 field to s->index.size, thus we use s->index.size / 4 + * instead of s->index.size / 4 - 1. + */ + if ((s->index.size >> 2) != get_le32(s->temp.buf + 4)) + return XZ_DATA_ERROR; + + if (s->temp.buf[8] != 0 || s->temp.buf[9] != s->check_type) + return XZ_DATA_ERROR; + + /* + * Use XZ_STREAM_END instead of XZ_OK to be more convenient + * for the caller. + */ + return XZ_STREAM_END; +} + +/* Decode the Block Header and initialize the filter chain. */ +static enum xz_ret INIT dec_block_header(struct xz_dec *s) +{ + enum xz_ret ret; + + /* + * Validate the CRC32. We know that the temp buffer is at least + * eight bytes so this is safe. 
+ */ + s->temp.size -= 4; + if (xz_crc32(s->temp.buf, s->temp.size, 0) + != get_le32(s->temp.buf + s->temp.size)) + return XZ_DATA_ERROR; + + s->temp.pos = 2; + + /* + * Catch unsupported Block Flags. We support only one or two filters + * in the chain, so we catch that with the same test. + */ +#ifdef XZ_DEC_BCJ + if (s->temp.buf[1] & 0x3E) +#else + if (s->temp.buf[1] & 0x3F) +#endif + return XZ_OPTIONS_ERROR; + + /* Compressed Size */ + if (s->temp.buf[1] & 0x40) { + if (dec_vli(s, s->temp.buf, &s->temp.pos, s->temp.size) + != XZ_STREAM_END) + return XZ_DATA_ERROR; + + s->block_header.compressed = s->vli; + } else { + s->block_header.compressed = VLI_UNKNOWN; + } + + /* Uncompressed Size */ + if (s->temp.buf[1] & 0x80) { + if (dec_vli(s, s->temp.buf, &s->temp.pos, s->temp.size) + != XZ_STREAM_END) + return XZ_DATA_ERROR; + + s->block_header.uncompressed = s->vli; + } else { + s->block_header.uncompressed = VLI_UNKNOWN; + } + +#ifdef XZ_DEC_BCJ + /* If there are two filters, the first one must be a BCJ filter. */ + s->bcj_active = s->temp.buf[1] & 0x01; + if (s->bcj_active) { + if (s->temp.size - s->temp.pos < 2) + return XZ_OPTIONS_ERROR; + + ret = xz_dec_bcj_reset(s->bcj, s->temp.buf[s->temp.pos++]); + if (ret != XZ_OK) + return ret; + + /* + * We don't support custom start offset, + * so Size of Properties must be zero. + */ + if (s->temp.buf[s->temp.pos++] != 0x00) + return XZ_OPTIONS_ERROR; + } +#endif + + /* Valid Filter Flags always take at least two bytes. */ + if (s->temp.size - s->temp.pos < 2) + return XZ_DATA_ERROR; + + /* Filter ID = LZMA2 */ + if (s->temp.buf[s->temp.pos++] != 0x21) + return XZ_OPTIONS_ERROR; + + /* Size of Properties = 1-byte Filter Properties */ + if (s->temp.buf[s->temp.pos++] != 0x01) + return XZ_OPTIONS_ERROR; + + /* Filter Properties contains LZMA2 dictionary size. */ + if (s->temp.size - s->temp.pos < 1) + return XZ_DATA_ERROR; + + ret = xz_dec_lzma2_reset(s->lzma2, s->temp.buf[s->temp.pos++]); + if (ret != XZ_OK) + return ret; + + /* The rest must be Header Padding. */ + while (s->temp.pos < s->temp.size) + if (s->temp.buf[s->temp.pos++] != 0x00) + return XZ_OPTIONS_ERROR; + + s->temp.pos = 0; + s->block.compressed = 0; + s->block.uncompressed = 0; + + return XZ_OK; +} + +static enum xz_ret INIT dec_main(struct xz_dec *s, struct xz_buf *b) +{ + enum xz_ret ret; + + /* + * Store the start position for the case when we are in the middle + * of the Index field. + */ + s->in_start = b->in_pos; + + while (true) { + switch (s->sequence) { + case SEQ_STREAM_HEADER: + /* + * Stream Header is copied to s->temp, and then + * decoded from there. This way if the caller + * gives us only a little input at a time, we can + * still keep the Stream Header decoding code + * simple. A similar approach is used in many + * places in this file. + */ + if (!fill_temp(s, b)) + return XZ_OK; + + /* + * If dec_stream_header() returns + * XZ_UNSUPPORTED_CHECK, it is still possible + * to continue decoding if working in multi-call + * mode. Thus, update s->sequence before calling + * dec_stream_header(). + */ + s->sequence = SEQ_BLOCK_START; + + ret = dec_stream_header(s); + if (ret != XZ_OK) + return ret; + + case SEQ_BLOCK_START: + /* We need one byte of input to continue. */ + if (b->in_pos == b->in_size) + return XZ_OK; + + /* See if this is the beginning of the Index field. */ + if (b->in[b->in_pos] == 0) { + s->in_start = b->in_pos++; + s->sequence = SEQ_INDEX; + break; + } + + /* + * Calculate the size of the Block Header and + * prepare to decode it.
+ */ + s->block_header.size + = ((uint32_t)b->in[b->in_pos] + 1) * 4; + + s->temp.size = s->block_header.size; + s->temp.pos = 0; + s->sequence = SEQ_BLOCK_HEADER; + + case SEQ_BLOCK_HEADER: + if (!fill_temp(s, b)) + return XZ_OK; + + ret = dec_block_header(s); + if (ret != XZ_OK) + return ret; + + s->sequence = SEQ_BLOCK_UNCOMPRESS; + + case SEQ_BLOCK_UNCOMPRESS: + ret = dec_block(s, b); + if (ret != XZ_STREAM_END) + return ret; + + s->sequence = SEQ_BLOCK_PADDING; + + case SEQ_BLOCK_PADDING: + /* + * Size of Compressed Data + Block Padding + * must be a multiple of four. We don't need + * s->block.compressed for anything else + * anymore, so we use it here to test the size + * of the Block Padding field. + */ + while (s->block.compressed & 3) { + if (b->in_pos == b->in_size) + return XZ_OK; + + if (b->in[b->in_pos++] != 0) + return XZ_DATA_ERROR; + + ++s->block.compressed; + } + + s->sequence = SEQ_BLOCK_CHECK; + + case SEQ_BLOCK_CHECK: + if (s->check_type == XZ_CHECK_CRC32) { + ret = crc32_validate(s, b); + if (ret != XZ_STREAM_END) + return ret; + } +#ifdef XZ_DEC_ANY_CHECK + else if (!check_skip(s, b)) { + return XZ_OK; + } +#endif + + s->sequence = SEQ_BLOCK_START; + break; + + case SEQ_INDEX: + ret = dec_index(s, b); + if (ret != XZ_STREAM_END) + return ret; + + s->sequence = SEQ_INDEX_PADDING; + + case SEQ_INDEX_PADDING: + while ((s->index.size + (b->in_pos - s->in_start)) + & 3) { + if (b->in_pos == b->in_size) { + index_update(s, b); + return XZ_OK; + } + + if (b->in[b->in_pos++] != 0) + return XZ_DATA_ERROR; + } + + /* Finish the CRC32 value and Index size. */ + index_update(s, b); + + /* Compare the hashes to validate the Index field. */ + if (!memeq(&s->block.hash, &s->index.hash, + sizeof(s->block.hash))) + return XZ_DATA_ERROR; + + s->sequence = SEQ_INDEX_CRC32; + + case SEQ_INDEX_CRC32: + ret = crc32_validate(s, b); + if (ret != XZ_STREAM_END) + return ret; + + s->temp.size = STREAM_HEADER_SIZE; + s->sequence = SEQ_STREAM_FOOTER; + + case SEQ_STREAM_FOOTER: + if (!fill_temp(s, b)) + return XZ_OK; + + return dec_stream_footer(s); + } + } + + /* Never reached */ +} + +XZ_EXTERN void INIT xz_dec_reset(struct xz_dec *s) +{ + s->sequence = SEQ_STREAM_HEADER; + s->allow_buf_error = false; + s->pos = 0; + s->crc32 = 0; + memzero(&s->block, sizeof(s->block)); + memzero(&s->index, sizeof(s->index)); + s->temp.pos = 0; + s->temp.size = STREAM_HEADER_SIZE; +} + +/* + * xz_dec_run() is a wrapper for dec_main() to handle some special cases in + * multi-call and single-call decoding. + * + * In multi-call mode, we must return XZ_BUF_ERROR when it seems clear that we + * are not going to make any progress anymore. This is to prevent the caller + * from calling us infinitely when the input file is truncated or otherwise + * corrupt. Since the zlib-style API allows the caller to fill the input buffer + * only when the decoder doesn't produce any new output, we have to be careful + * to avoid returning XZ_BUF_ERROR too easily: XZ_BUF_ERROR is returned only + * after the second consecutive call to xz_dec_run() that makes no progress. + * + * In single-call mode, if we couldn't decode everything and no error + * occurred, either the input is truncated or the output buffer is too small. + * Since we know that the last input byte never produces any output, we know + * that if all the input was consumed and decoding wasn't finished, the file + * must be corrupt. Otherwise the output buffer has to be too small or the + * file is corrupt in a way that decoding it produces too big an output.
+ * + * If single-call decoding fails, we reset b->in_pos and b->out_pos back to + * their original values. This is because with some filter chains there won't + * be any valid uncompressed data in the output buffer unless the decoding + * actually succeeds (that's the price to pay for using the output buffer as + * the workspace). + */ +XZ_EXTERN enum xz_ret INIT xz_dec_run(struct xz_dec *s, struct xz_buf *b) +{ + size_t in_start; + size_t out_start; + enum xz_ret ret; + + if (DEC_IS_SINGLE(s->mode)) + xz_dec_reset(s); + + in_start = b->in_pos; + out_start = b->out_pos; + ret = dec_main(s, b); + + if (DEC_IS_SINGLE(s->mode)) { + if (ret == XZ_OK) + ret = b->in_pos == b->in_size + ? XZ_DATA_ERROR : XZ_BUF_ERROR; + + if (ret != XZ_STREAM_END) { + b->in_pos = in_start; + b->out_pos = out_start; + } + + } else if (ret == XZ_OK && in_start == b->in_pos + && out_start == b->out_pos) { + if (s->allow_buf_error) + ret = XZ_BUF_ERROR; + + s->allow_buf_error = true; + } else { + s->allow_buf_error = false; + } + + return ret; +} + +XZ_EXTERN struct xz_dec *INIT xz_dec_init(enum xz_mode mode, uint32_t dict_max) +{ + struct xz_dec *s = malloc(sizeof(*s)); + if (s == NULL) + return NULL; + + s->mode = mode; + +#ifdef XZ_DEC_BCJ + s->bcj = xz_dec_bcj_create(DEC_IS_SINGLE(mode)); + if (s->bcj == NULL) + goto error_bcj; +#endif + + s->lzma2 = xz_dec_lzma2_create(mode, dict_max); + if (s->lzma2 == NULL) + goto error_lzma2; + + xz_dec_reset(s); + return s; + +error_lzma2: +#ifdef XZ_DEC_BCJ + xz_dec_bcj_end(s->bcj); +error_bcj: +#endif + free(s); + return NULL; +} + +XZ_EXTERN void INIT xz_dec_end(struct xz_dec *s) +{ + if (s != NULL) { + xz_dec_lzma2_end(s->lzma2); +#ifdef XZ_DEC_BCJ + xz_dec_bcj_end(s->bcj); +#endif + free(s); + } +}
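Taken together, xz_dec_init(), xz_dec_run() and xz_dec_end() above form the complete multi-call API. Below is a minimal sketch of a hypothetical caller (read_more() and consume() are assumed glue, and the buffer and dictionary sizes are arbitrary); note that on truncated input the no-progress rule above makes the loop end with XZ_BUF_ERROR after two idle calls instead of spinning forever:

#include <stdint.h>
#include <stddef.h>

/* Hypothetical glue, not part of the patch. */
extern size_t read_more(uint8_t *buf, size_t max);
extern void consume(const uint8_t *buf, size_t len);

static int decompress_stream(void)
{
    static uint8_t inbuf[4096], outbuf[4096];
    struct xz_buf b = { .in = inbuf, .out = outbuf, .out_size = sizeof(outbuf) };
    struct xz_dec *s = xz_dec_init(XZ_DYNALLOC, 1u << 26 /* 64 MiB cap */);
    enum xz_ret ret = XZ_OK;

    if (s == NULL)
        return -1;

    while (ret == XZ_OK || ret == XZ_UNSUPPORTED_CHECK) {
        if (b.in_pos == b.in_size) {        /* refill only when drained */
            b.in_size = read_more(inbuf, sizeof(inbuf));
            b.in_pos = 0;
        }
        ret = xz_dec_run(s, &b);
        consume(outbuf, b.out_pos);         /* flush whatever was produced */
        b.out_pos = 0;
    }

    xz_dec_end(s);
    return ret == XZ_STREAM_END ? 0 : -1;
}

diff -Nru xen-4.1.3/xen/common/xz/lzma2.h xen-4.1.5/xen/common/xz/lzma2.h --- xen-4.1.3/xen/common/xz/lzma2.h 1970-01-01 01:00:00.000000000 +0100 +++ xen-4.1.5/xen/common/xz/lzma2.h 2013-04-23 18:44:20.000000000 +0200 @@ -0,0 +1,204 @@ +/* + * LZMA2 definitions + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef XZ_LZMA2_H +#define XZ_LZMA2_H + +/* Range coder constants */ +#define RC_SHIFT_BITS 8 +#define RC_TOP_BITS 24 +#define RC_TOP_VALUE (1 << RC_TOP_BITS) +#define RC_BIT_MODEL_TOTAL_BITS 11 +#define RC_BIT_MODEL_TOTAL (1 << RC_BIT_MODEL_TOTAL_BITS) +#define RC_MOVE_BITS 5 + +/* + * Maximum number of position states. A position state is the lowest pb + * number of bits of the current uncompressed offset. In some places there + * are different sets of probabilities for different position states. + */ +#define POS_STATES_MAX (1 << 4) + +/* + * This enum is used to track which LZMA symbols have occurred most recently + * and in which order. This information is used to predict the next symbol. + * + * Symbols: + * - Literal: One 8-bit byte + * - Match: Repeat a chunk of data at some distance + * - Long repeat: Multi-byte match at a recently seen distance + * - Short repeat: One-byte repeat at a recently seen distance + * + * The symbol names are in the form STATE_oldest_older_previous. REP means + * either short or long repeated match, and NONLIT means any non-literal.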
+ */ +enum lzma_state { + STATE_LIT_LIT, + STATE_MATCH_LIT_LIT, + STATE_REP_LIT_LIT, + STATE_SHORTREP_LIT_LIT, + STATE_MATCH_LIT, + STATE_REP_LIT, + STATE_SHORTREP_LIT, + STATE_LIT_MATCH, + STATE_LIT_LONGREP, + STATE_LIT_SHORTREP, + STATE_NONLIT_MATCH, + STATE_NONLIT_REP +}; + +/* Total number of states */ +#define STATES 12 + +/* The lowest 7 states indicate that the previous state was a literal. */ +#define LIT_STATES 7 + +/* Indicate that the latest symbol was a literal. */ +static inline void INIT lzma_state_literal(enum lzma_state *state) +{ + if (*state <= STATE_SHORTREP_LIT_LIT) + *state = STATE_LIT_LIT; + else if (*state <= STATE_LIT_SHORTREP) + *state -= 3; + else + *state -= 6; +} + +/* Indicate that the latest symbol was a match. */ +static inline void INIT lzma_state_match(enum lzma_state *state) +{ + *state = *state < LIT_STATES ? STATE_LIT_MATCH : STATE_NONLIT_MATCH; +} + +/* Indicate that the latest symbol was a long repeated match. */ +static inline void INIT lzma_state_long_rep(enum lzma_state *state) +{ + *state = *state < LIT_STATES ? STATE_LIT_LONGREP : STATE_NONLIT_REP; +} + +/* Indicate that the latest symbol was a short repeated match. */ +static inline void INIT lzma_state_short_rep(enum lzma_state *state) +{ + *state = *state < LIT_STATES ? STATE_LIT_SHORTREP : STATE_NONLIT_REP; +} + +/* Test if the previous symbol was a literal. */ +static inline bool_t INIT lzma_state_is_literal(enum lzma_state state) +{ + return state < LIT_STATES; +} + +/* Each literal coder is divided into three sections: + * - 0x001-0x0FF: Without match byte + * - 0x101-0x1FF: With match byte; match bit is 0 + * - 0x201-0x2FF: With match byte; match bit is 1 + * + * The match byte is used when the previous LZMA symbol was something other + * than a literal (that is, it was some kind of match). + */ +#define LITERAL_CODER_SIZE 0x300 + +/* Maximum number of literal coders */ +#define LITERAL_CODERS_MAX (1 << 4) + +/* Minimum length of a match is two bytes. */ +#define MATCH_LEN_MIN 2 + +/* Match length is encoded with 4, 5, or 10 bits. + * + * Length Bits + * 2-9 4 = Choice=0 + 3 bits + * 10-17 5 = Choice=1 + Choice2=0 + 3 bits + * 18-273 10 = Choice=1 + Choice2=1 + 8 bits + */ +#define LEN_LOW_BITS 3 +#define LEN_LOW_SYMBOLS (1 << LEN_LOW_BITS) +#define LEN_MID_BITS 3 +#define LEN_MID_SYMBOLS (1 << LEN_MID_BITS) +#define LEN_HIGH_BITS 8 +#define LEN_HIGH_SYMBOLS (1 << LEN_HIGH_BITS) +#define LEN_SYMBOLS (LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS + LEN_HIGH_SYMBOLS) + +/* + * Maximum length of a match is 273 which is a result of the encoding + * described above. + */ +#define MATCH_LEN_MAX (MATCH_LEN_MIN + LEN_SYMBOLS - 1) + +/* + * Different sets of probabilities are used for match distances that have + * very short match length: Lengths of 2, 3, and 4 bytes have a separate + * set of probabilities for each length. The matches with longer length + * use a shared set of probabilities. + */ +#define DIST_STATES 4 + +/* + * Get the index of the appropriate probability array for decoding + * the distance slot. + */ +static inline uint32_t INIT lzma_get_dist_state(uint32_t len) +{ + return len < DIST_STATES + MATCH_LEN_MIN + ? len - MATCH_LEN_MIN : DIST_STATES - 1; +}
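The length table above maps the two choice bits plus a 3/3/8-bit symbol onto lengths 2-273, exactly as lzma_len() consumes them in dec_lzma2.c. A standalone sketch (hypothetical helper, not part of the patch):

#include <stdint.h>
#include <assert.h>

/* Final match length from the choice bits and the group-local symbol. */
static uint32_t len_from_symbol(int choice, int choice2, uint32_t symbol)
{
    if (!choice)
        return 2 + symbol;            /* lengths 2-9 (3-bit symbol) */
    if (!choice2)
        return 2 + 8 + symbol;        /* lengths 10-17 (3-bit symbol) */
    return 2 + 8 + 8 + symbol;        /* lengths 18-273 (8-bit symbol) */
}

int main(void)
{
    assert(len_from_symbol(0, 0, 0) == 2);      /* MATCH_LEN_MIN */
    assert(len_from_symbol(1, 0, 7) == 17);
    assert(len_from_symbol(1, 1, 255) == 273);  /* MATCH_LEN_MAX */
    return 0;
}

+ +/* + * The highest two bits of a 32-bit match distance are encoded using six bits. + * This six-bit value is called a distance slot. This way encoding a 32-bit + * value takes 6-36 bits, larger values taking more bits.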
+ */ +#define DIST_SLOT_BITS 6 +#define DIST_SLOTS (1 << DIST_SLOT_BITS) + +/* Match distances up to 127 are fully encoded using probabilities. Since + * the highest two bits (distance slot) are always encoded using six bits, + * the distances 0-3 don't need any additional bits to encode, since the + * distance slot itself is the same as the actual distance. DIST_MODEL_START + * indicates the first distance slot where at least one additional bit is + * needed. + */ +#define DIST_MODEL_START 4 + +/* + * Match distances greater than 127 are encoded in three pieces: + * - distance slot: the highest two bits + * - direct bits: 2-26 bits below the highest two bits + * - alignment bits: four lowest bits + * + * Direct bits don't use any probabilities. + * + * The distance slot value of 14 is for distances 128-191. + */ +#define DIST_MODEL_END 14 + +/* Distance slots that indicate a distance <= 127. */ +#define FULL_DISTANCES_BITS (DIST_MODEL_END / 2) +#define FULL_DISTANCES (1 << FULL_DISTANCES_BITS) + +/* + * For match distances greater than 127, only the highest two bits and the + * lowest four bits (alignment) are encoded using probabilities. + */ +#define ALIGN_BITS 4 +#define ALIGN_SIZE (1 << ALIGN_BITS) +#define ALIGN_MASK (ALIGN_SIZE - 1) + +/* Total number of all probability variables */ +#define PROBS_TOTAL (1846 + LITERAL_CODERS_MAX * LITERAL_CODER_SIZE) + +/* + * LZMA remembers the four most recent match distances. Reusing these + * distances tends to take less space than re-encoding the actual + * distance value. + */ +#define REPS 4 + +#endif
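The RC_* constants above define the adaptive bit model used by rc_bit() in dec_lzma2.c: an 11-bit probability moves 1/32 of the remaining distance toward each observed bit. A standalone sketch of just the update rule (hypothetical helper, not part of the patch; the range-coder arithmetic itself is omitted):

#include <stdint.h>
#include <stdio.h>

#define BIT_MODEL_TOTAL (1u << 11)   /* RC_BIT_MODEL_TOTAL */
#define MOVE_BITS 5                  /* RC_MOVE_BITS */

/* Move the probability estimate toward the bit that was just decoded. */
static void model_update(uint16_t *prob, int bit)
{
    if (bit == 0)
        *prob += (BIT_MODEL_TOTAL - *prob) >> MOVE_BITS;
    else
        *prob -= *prob >> MOVE_BITS;
}

int main(void)
{
    uint16_t prob = BIT_MODEL_TOTAL / 2;    /* 1024, as set by lzma_reset() */
    int i;

    for (i = 0; i < 8; i++)
        model_update(&prob, 0);             /* a run of zeros... */
    printf("P(0) estimate: %u/2048\n", prob); /* ...pushes prob toward 2048 */
    return 0;
}

diff -Nru xen-4.1.3/xen/common/xz/private.h xen-4.1.5/xen/common/xz/private.h --- xen-4.1.3/xen/common/xz/private.h 1970-01-01 01:00:00.000000000 +0100 +++ xen-4.1.5/xen/common/xz/private.h 2013-04-23 18:44:20.000000000 +0200 @@ -0,0 +1,271 @@ +/* + * Private includes and definitions + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef XZ_PRIVATE_H +#define XZ_PRIVATE_H + +#include <xen/kernel.h> +#include <asm/byteorder.h> +#define get_le32(p) le32_to_cpup((const uint32_t *)(p)) + +#if 1 /* ndef CONFIG_??? */ +static inline u32 INIT get_unaligned_le32(void *p) +{ + return le32_to_cpup(p); +} + +static inline void INIT put_unaligned_le32(u32 val, void *p) +{ + *(__force __le32*)p = cpu_to_le32(val); +} +#else +#include <asm/unaligned.h> + +static inline u32 INIT get_unaligned_le32(void *p) +{ + return le32_to_cpu(__get_unaligned(p, 4)); +} + +static inline void INIT put_unaligned_le32(u32 val, void *p) +{ + __put_unaligned(cpu_to_le32(val), p, 4); +} +#endif + +#define false 0 +#define true 1 + +/** + * enum xz_mode - Operation mode + * + * @XZ_SINGLE: Single-call mode. This uses less RAM than + * the multi-call modes, because the LZMA2 + * dictionary doesn't need to be allocated as + * part of the decoder state. All required data + * structures are allocated at initialization, + * so xz_dec_run() cannot return XZ_MEM_ERROR. + * @XZ_PREALLOC: Multi-call mode with preallocated LZMA2 + * dictionary buffer. All data structures are + * allocated at initialization, so xz_dec_run() + * cannot return XZ_MEM_ERROR. + * @XZ_DYNALLOC: Multi-call mode. The LZMA2 dictionary is + * allocated once the required size has been + * parsed from the stream headers. If the + * allocation fails, xz_dec_run() will return + * XZ_MEM_ERROR. + * + * It is possible to enable support only for a subset of the above + * modes at compile time by defining XZ_DEC_SINGLE, XZ_DEC_PREALLOC, + * or XZ_DEC_DYNALLOC.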
The xz_dec kernel module is always compiled + * with support for all operation modes, but the preboot code may + * be built with fewer features to minimize code size. + */ +enum xz_mode { + XZ_SINGLE, + XZ_PREALLOC, + XZ_DYNALLOC +}; + +/** + * enum xz_ret - Return codes + * @XZ_OK: Everything is OK so far. More input or more + * output space is required to continue. This + * return code is possible only in multi-call mode + * (XZ_PREALLOC or XZ_DYNALLOC). + * @XZ_STREAM_END: Operation finished successfully. + * @XZ_UNSUPPORTED_CHECK: Integrity check type is not supported. Decoding + * is still possible in multi-call mode by simply + * calling xz_dec_run() again. + * Note that this return value is used only if + * XZ_DEC_ANY_CHECK was defined at build time, + * which is not used in the kernel. Unsupported + * check types return XZ_OPTIONS_ERROR if + * XZ_DEC_ANY_CHECK was not defined at build time. + * @XZ_MEM_ERROR: Allocating memory failed. This return code is + * possible only if the decoder was initialized + * with XZ_DYNALLOC. The amount of memory the + * decoder tried to allocate was no more than the + * dict_max argument given to xz_dec_init(). + * @XZ_MEMLIMIT_ERROR: A bigger LZMA2 dictionary would be needed than + * allowed by the dict_max argument given to + * xz_dec_init(). This return value is possible + * only in multi-call mode (XZ_PREALLOC or + * XZ_DYNALLOC); the single-call mode (XZ_SINGLE) + * ignores the dict_max argument. + * @XZ_FORMAT_ERROR: File format was not recognized (wrong magic + * bytes). + * @XZ_OPTIONS_ERROR: This implementation doesn't support the requested + * compression options. In the decoder this means + * that the header CRC32 matches, but the header + * itself specifies something that we don't support. + * @XZ_DATA_ERROR: Compressed data is corrupt. + * @XZ_BUF_ERROR: Cannot make any progress. Details are slightly + * different between multi-call and single-call + * mode; more information below. + * + * In multi-call mode, XZ_BUF_ERROR is returned when two consecutive calls + * to XZ code cannot consume any input and cannot produce any new output. + * This happens when there is no new input available, or the output buffer + * is full while at least one output byte is still pending. Assuming your + * code is not buggy, you can get this error only when decoding a compressed + * stream that is truncated or otherwise corrupt. + * + * In single-call mode, XZ_BUF_ERROR is returned only when the output buffer + * is too small or the compressed input is corrupt in a way that makes the + * decoder produce more output than the caller expected. When it is + * (relatively) clear that the compressed input is truncated, XZ_DATA_ERROR + * is used instead of XZ_BUF_ERROR. + */ +enum xz_ret { + XZ_OK, + XZ_STREAM_END, + XZ_UNSUPPORTED_CHECK, + XZ_MEM_ERROR, + XZ_MEMLIMIT_ERROR, + XZ_FORMAT_ERROR, + XZ_OPTIONS_ERROR, + XZ_DATA_ERROR, + XZ_BUF_ERROR +}; + +/** + * struct xz_buf - Passing input and output buffers to XZ code + * @in: Beginning of the input buffer. This may be NULL if and only + * if in_pos is equal to in_size. + * @in_pos: Current position in the input buffer. This must not exceed + * in_size. + * @in_size: Size of the input buffer + * @out: Beginning of the output buffer. This may be NULL if and only + * if out_pos is equal to out_size. + * @out_pos: Current position in the output buffer. This must not exceed + * out_size.
+ * @out_size: Size of the output buffer + * + * Only the contents of the output buffer from out[out_pos] onward, and + * the variables in_pos and out_pos are modified by the XZ code. + */ +struct xz_buf { + const uint8_t *in; + size_t in_pos; + size_t in_size; + + uint8_t *out; + size_t out_pos; + size_t out_size; +}; + +/** + * struct xz_dec - Opaque type to hold the XZ decoder state + */ +struct xz_dec; + +/* If no specific decoding mode is requested, enable support for all modes. */ +#if !defined(XZ_DEC_SINGLE) && !defined(XZ_DEC_PREALLOC) \ + && !defined(XZ_DEC_DYNALLOC) +# define XZ_DEC_SINGLE +# define XZ_DEC_PREALLOC +# define XZ_DEC_DYNALLOC +#endif + +/* + * The DEC_IS_foo(mode) macros are used in "if" statements. If only some + * of the supported modes are enabled, these macros will evaluate to true or + * false at compile time and thus allow the compiler to omit unneeded code. + */ +#ifdef XZ_DEC_SINGLE +# define DEC_IS_SINGLE(mode) ((mode) == XZ_SINGLE) +#else +# define DEC_IS_SINGLE(mode) (false) +#endif + +#ifdef XZ_DEC_PREALLOC +# define DEC_IS_PREALLOC(mode) ((mode) == XZ_PREALLOC) +#else +# define DEC_IS_PREALLOC(mode) (false) +#endif + +#ifdef XZ_DEC_DYNALLOC +# define DEC_IS_DYNALLOC(mode) ((mode) == XZ_DYNALLOC) +#else +# define DEC_IS_DYNALLOC(mode) (false) +#endif + +#if !defined(XZ_DEC_SINGLE) +# define DEC_IS_MULTI(mode) (true) +#elif defined(XZ_DEC_PREALLOC) || defined(XZ_DEC_DYNALLOC) +# define DEC_IS_MULTI(mode) ((mode) != XZ_SINGLE) +#else +# define DEC_IS_MULTI(mode) (false) +#endif + +/* + * If any of the BCJ filter decoders are wanted, define XZ_DEC_BCJ. + * XZ_DEC_BCJ is used to enable generic support for BCJ decoders. + */ +#ifndef XZ_DEC_BCJ +# if defined(XZ_DEC_X86) || defined(XZ_DEC_POWERPC) \ + || defined(XZ_DEC_IA64) || defined(XZ_DEC_ARM) \ + || defined(XZ_DEC_ARMTHUMB) || defined(XZ_DEC_SPARC) +# define XZ_DEC_BCJ +# endif +#endif + +/* + * Allocate memory for LZMA2 decoder. xz_dec_lzma2_reset() must be used + * before calling xz_dec_lzma2_run(). + */ +XZ_EXTERN struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, + uint32_t dict_max); + +/* + * Decode the LZMA2 properties (one byte) and reset the decoder. Return + * XZ_OK on success, XZ_MEMLIMIT_ERROR if the preallocated dictionary is not + * big enough, and XZ_OPTIONS_ERROR if props indicates something that this + * decoder doesn't support. + */ +XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, + uint8_t props); + +/* Decode raw LZMA2 stream from b->in to b->out. */ +XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, + struct xz_buf *b); + +/* Free the memory allocated for the LZMA2 decoder. */ +XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s); + +#ifdef XZ_DEC_BCJ +/* + * Allocate memory for BCJ decoders. xz_dec_bcj_reset() must be used before + * calling xz_dec_bcj_run(). + */ +XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool_t single_call); + +/* + * Decode the Filter ID of a BCJ filter. This implementation doesn't + * support custom start offsets, so no decoding of Filter Properties + * is needed. Returns XZ_OK if the given Filter ID is supported. + * Otherwise XZ_OPTIONS_ERROR is returned. + */ +XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id); + +/* + * Decode raw BCJ + LZMA2 stream. This must be used only if there actually is + * a BCJ filter in the chain. If the chain has only LZMA2, xz_dec_lzma2_run() + * must be called directly.
+ */ +XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, + struct xz_dec_lzma2 *lzma2, + struct xz_buf *b); + +/* Free the memory allocated for the BCJ filters. */ +#define xz_dec_bcj_end(s) free(s) +#endif + +#endif diff -Nru xen-4.1.3/xen/common/xz/stream.h xen-4.1.5/xen/common/xz/stream.h --- xen-4.1.3/xen/common/xz/stream.h 1970-01-01 01:00:00.000000000 +0100 +++ xen-4.1.5/xen/common/xz/stream.h 2013-04-23 18:44:20.000000000 +0200 @@ -0,0 +1,55 @@ +/* + * Definitions for handling the .xz file format + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef XZ_STREAM_H +#define XZ_STREAM_H + +/* + * See the .xz file format specification at + * http://tukaani.org/xz/xz-file-format.txt + * to understand the container format. + */ + +#define STREAM_HEADER_SIZE 12 + +#define HEADER_MAGIC "\3757zXZ" +#define HEADER_MAGIC_SIZE 6 + +#define FOOTER_MAGIC "YZ" +#define FOOTER_MAGIC_SIZE 2 + +/* + * Variable-length integer can hold a 63-bit unsigned integer or a special + * value indicating that the value is unknown. + * + * Experimental: vli_type can be defined to uint32_t to save a few bytes + * in code size (no effect on speed). Doing so limits the uncompressed and + * compressed size of the file to less than 256 MiB and may also weaken + * error detection slightly. + */ +typedef uint64_t vli_type; + +#define VLI_MAX ((vli_type)-1 / 2) +#define VLI_UNKNOWN ((vli_type)-1) + +/* Maximum encoded size of a VLI */ +#define VLI_BYTES_MAX (sizeof(vli_type) * 8 / 7) + +/* Integrity Check types */ +enum xz_check { + XZ_CHECK_NONE = 0, + XZ_CHECK_CRC32 = 1, + XZ_CHECK_CRC64 = 4, + XZ_CHECK_SHA256 = 10 +}; + +/* Maximum possible Check ID */ +#define XZ_CHECK_MAX 15 + +#endif diff -Nru xen-4.1.3/xen/drivers/acpi/apei/apei-base.c xen-4.1.5/xen/drivers/acpi/apei/apei-base.c --- xen-4.1.3/xen/drivers/acpi/apei/apei-base.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/drivers/acpi/apei/apei-base.c 2013-04-23 18:44:20.000000000 +0200 @@ -154,9 +154,10 @@ * Interpret the specified action. Go through whole action table, * execute all instructions belong to the action. */ -int apei_exec_run(struct apei_exec_context *ctx, u8 action) +int __apei_exec_run(struct apei_exec_context *ctx, u8 action, + bool_t optional) { - int rc; + int rc = -ENOENT; u32 i, ip; struct acpi_whea_header *entry; apei_exec_ins_func_t run; @@ -195,7 +196,7 @@ goto rewind; } - return 0; + return !optional && rc < 0 ? 
rc : 0; } typedef int (*apei_exec_entry_func_t)(struct apei_exec_context *ctx, diff -Nru xen-4.1.3/xen/drivers/acpi/apei/apei-internal.h xen-4.1.5/xen/drivers/acpi/apei/apei-internal.h --- xen-4.1.3/xen/drivers/acpi/apei/apei-internal.h 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/drivers/acpi/apei/apei-internal.h 2013-04-23 18:44:20.000000000 +0200 @@ -48,7 +48,18 @@ return ctx->value; } -int apei_exec_run(struct apei_exec_context *ctx, u8 action); +int __apei_exec_run(struct apei_exec_context *ctx, u8 action, bool_t optional); + +static inline int apei_exec_run(struct apei_exec_context *ctx, u8 action) +{ + return __apei_exec_run(ctx, action, 0); +} + +/* It is optional whether the firmware provides the action */ +static inline int apei_exec_run_optional(struct apei_exec_context *ctx, u8 action) +{ + return __apei_exec_run(ctx, action, 1); +} /* Common instruction implementation */ diff -Nru xen-4.1.3/xen/drivers/acpi/apei/apei-io.c xen-4.1.5/xen/drivers/acpi/apei/apei-io.c --- xen-4.1.3/xen/drivers/acpi/apei/apei-io.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/drivers/acpi/apei/apei-io.c 2013-04-23 18:44:20.000000000 +0200 @@ -146,10 +146,8 @@ spin_lock_irqsave(&apei_iomaps_lock, flags); map = __apei_find_iomap(paddr, size); - if (!map) - return; - - list_del(&map->list); + if (map) + list_del(&map->list); spin_unlock_irqrestore(&apei_iomaps_lock, flags); xfree(map); diff -Nru xen-4.1.3/xen/drivers/acpi/apei/erst.c xen-4.1.5/xen/drivers/acpi/apei/erst.c --- xen-4.1.3/xen/drivers/acpi/apei/erst.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/drivers/acpi/apei/erst.c 2013-04-23 18:44:20.000000000 +0200 @@ -247,15 +247,64 @@ { int rc; u64 offset; +#ifdef CONFIG_X86 + enum fixed_addresses idx; +#endif + void *src, *dst; + + /* ioremap does not work in interrupt context */ + if (in_irq()) { + printk(KERN_WARNING + "MOVE_DATA cannot be used in interrupt context\n"); + return -EBUSY; + } rc = __apei_exec_read_register(entry, &offset); if (rc) return rc; - memmove((void *)(unsigned long)(ctx->dst_base + offset), - (void *)(unsigned long)(ctx->src_base + offset), - ctx->var2); - return 0; +#ifdef CONFIG_X86 + switch (ctx->var2) { + case 0: + return 0; + case 1 ... 
PAGE_SIZE: + break; + default: + printk(KERN_WARNING + "MOVE_DATA cannot be used for %#"PRIx64" bytes of data\n", + ctx->var2); + return -EOPNOTSUPP; + } + + src = __acpi_map_table(ctx->src_base + offset, ctx->var2); +#else + src = ioremap(ctx->src_base + offset, ctx->var2); +#endif + if (!src) + return -ENOMEM; + +#ifdef CONFIG_X86 + BUILD_BUG_ON(FIX_ACPI_PAGES < 4); + idx = virt_to_fix((unsigned long)src + 2 * PAGE_SIZE); + offset += ctx->dst_base; + dst = (void *)fix_to_virt(idx) + (offset & ~PAGE_MASK); + set_fixmap(idx, offset); + if (PFN_DOWN(offset) != PFN_DOWN(offset + ctx->var2 - 1)) { + idx = virt_to_fix((unsigned long)dst + PAGE_SIZE); + set_fixmap(idx, offset + PAGE_SIZE); + } +#else + dst = ioremap(ctx->dst_base + offset, ctx->var2); +#endif + if (dst) { + memmove(dst, src, ctx->var2); + iounmap(dst); + } else + rc = -ENOMEM; + + iounmap(src); + + return rc; } static struct apei_exec_ins_type erst_ins_type[] = { @@ -715,12 +764,23 @@ static int __init erst_check_table(struct acpi_table_erst *erst_tab) { - if (erst_tab->header_length != sizeof(struct acpi_table_erst)) + if (erst_tab->header.length < sizeof(*erst_tab)) return -EINVAL; - if (erst_tab->header.length < sizeof(struct acpi_table_erst)) + + switch (erst_tab->header_length) { + case sizeof(*erst_tab) - sizeof(erst_tab->header): + /* + * While invalid per specification, there are (early?) systems + * indicating the full header size here, so accept that value too. + */ + case sizeof(*erst_tab): + break; + default: return -EINVAL; + } + if (erst_tab->entries != - (erst_tab->header.length - sizeof(struct acpi_table_erst)) / + (erst_tab->header.length - sizeof(*erst_tab)) / sizeof(struct acpi_erst_entry)) return -EINVAL; @@ -739,11 +799,11 @@ status = acpi_get_table(ACPI_SIG_ERST, 0, (struct acpi_table_header **)&erst_tab); if (status == AE_NOT_FOUND) { - printk(KERN_ERR "Table is not found!\n"); + printk(KERN_INFO "ERST table was not found\n"); return -ENODEV; } else if (ACPI_FAILURE(status)) { const char *msg = acpi_format_exception(status); - printk(KERN_ERR "Failed to get table, %s\n", msg); + printk(KERN_WARNING "Failed to get ERST table: %s\n", msg); return -EINVAL; } diff -Nru xen-4.1.3/xen/drivers/acpi/pmstat.c xen-4.1.5/xen/drivers/acpi/pmstat.c --- xen-4.1.3/xen/drivers/acpi/pmstat.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/drivers/acpi/pmstat.c 2013-04-23 18:44:20.000000000 +0200 @@ -203,8 +203,6 @@ struct list_head *pos; uint32_t cpu, i, j = 0; - if ( !op || !cpu_online(op->cpuid) ) - return -EINVAL; pmpt = processor_pminfo[op->cpuid]; policy = per_cpu(cpufreq_cpu_policy, op->cpuid); @@ -312,9 +310,6 @@ { struct cpufreq_policy new_policy, *old_policy; - if ( !op || !cpu_online(op->cpuid) ) - return -EINVAL; - old_policy = per_cpu(cpufreq_cpu_policy, op->cpuid); if ( !old_policy ) return -EINVAL; @@ -333,8 +328,6 @@ int ret = 0; struct cpufreq_policy *policy; - if ( !op || !cpu_online(op->cpuid) ) - return -EINVAL; policy = per_cpu(cpufreq_cpu_policy, op->cpuid); if ( !policy || !policy->governor ) @@ -411,22 +404,12 @@ return ret; } -static int get_cpufreq_avgfreq(struct xen_sysctl_pm_op *op) -{ - if ( !op || !cpu_online(op->cpuid) ) - return -EINVAL; - - op->u.get_avgfreq = cpufreq_driver_getavg(op->cpuid, USR_GETAVG); - - return 0; -} - int do_pm_op(struct xen_sysctl_pm_op *op) { int ret = 0; const struct processor_pminfo *pmpt; - if ( !op || !cpu_online(op->cpuid) ) + if ( !op || op->cpuid >= NR_CPUS || !cpu_online(op->cpuid) ) return -EINVAL; pmpt = processor_pminfo[op->cpuid]; @@ -462,7 +445,7 
@@ case GET_CPUFREQ_AVGFREQ: { - ret = get_cpufreq_avgfreq(op); + op->u.get_avgfreq = cpufreq_driver_getavg(op->cpuid, USR_GETAVG); break; } diff -Nru xen-4.1.3/xen/drivers/acpi/tables.c xen-4.1.5/xen/drivers/acpi/tables.c --- xen-4.1.3/xen/drivers/acpi/tables.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/drivers/acpi/tables.c 2013-04-23 18:44:20.000000000 +0200 @@ -267,7 +267,7 @@ * @handler: handler to run * * Scan the ACPI System Descriptor Table (STD) for a table matching @id, - * run @handler on it. Return 0 if table found, return on if not. + * run @handler on it. */ int acpi_table_parse(char *id, acpi_table_handler handler) { @@ -282,8 +282,7 @@ acpi_get_table(id, 0, &table); if (table) { - handler(table); - return 0; + return handler(table); } else return 1; } diff -Nru xen-4.1.3/xen/drivers/passthrough/amd/iommu_acpi.c xen-4.1.5/xen/drivers/passthrough/amd/iommu_acpi.c --- xen-4.1.3/xen/drivers/passthrough/amd/iommu_acpi.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/drivers/passthrough/amd/iommu_acpi.c 2013-04-23 18:44:20.000000000 +0200 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -29,7 +30,6 @@ extern unsigned short ivrs_bdf_entries; extern struct ivrs_mappings *ivrs_mappings; extern unsigned short last_bdf; -extern int ioapic_bdf[MAX_IO_APICS]; extern void *shared_intremap_table; static void add_ivrs_mapping_entry( @@ -346,9 +346,8 @@ base = start_addr & PAGE_MASK; limit = (start_addr + mem_length - 1) & PAGE_MASK; - AMD_IOMMU_DEBUG("IVMD Block: Type 0x%x\n",ivmd_block->header.type); - AMD_IOMMU_DEBUG(" Start_Addr_Phys 0x%lx\n", start_addr); - AMD_IOMMU_DEBUG(" Mem_Length 0x%lx\n", mem_length); + AMD_IOMMU_DEBUG("IVMD Block: type %#x phys %#lx len %#lx\n", + ivmd_block->header.type, start_addr, mem_length); if ( get_field_from_byte(ivmd_block->header.flags, AMD_IOMMU_ACPI_EXCLUSION_RANGE_MASK, @@ -550,8 +549,8 @@ return 0; } - AMD_IOMMU_DEBUG(" Dev_Id Range: 0x%x -> 0x%x\n", first_bdf, last_bdf); - AMD_IOMMU_DEBUG(" Dev_Id Alias: 0x%x\n", alias_id); + AMD_IOMMU_DEBUG(" Dev_Id Range: %#x -> %#x alias %#x\n", + first_bdf, last_bdf, alias_id); for ( bdf = first_bdf; bdf <= last_bdf; bdf++ ) add_ivrs_mapping_entry(bdf, alias_id, ivhd_device->header.flags, iommu); @@ -636,6 +635,7 @@ u16 header_length, u16 block_length, struct amd_iommu *iommu) { u16 dev_length, bdf; + int apic; dev_length = sizeof(struct acpi_ivhd_device_special); if ( header_length < (block_length + dev_length) ) @@ -651,10 +651,65 @@ return 0; } + AMD_IOMMU_DEBUG("IVHD Special: %02x:%02x.%u variety %#x handle %#x\n", + PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf), + ivhd_device->special.variety, ivhd_device->special.handle); add_ivrs_mapping_entry(bdf, bdf, ivhd_device->header.flags, iommu); - /* set device id of ioapic */ - ioapic_bdf[ivhd_device->special.handle] = bdf; - return dev_length; + + if ( ivhd_device->special.variety != 1 /* ACPI_IVHD_IOAPIC */ ) + { + if ( ivhd_device->special.variety != 2 /* ACPI_IVHD_HPET */ ) + printk(XENLOG_ERR "Unrecognized IVHD special variety %#x\n", + ivhd_device->special.variety); + return dev_length; + } + + if ( !iommu_intremap ) + return dev_length; + + /* + * Some BIOSes have IOAPIC broken entries so we check for IVRS + * consistency here --- whether entry's IOAPIC ID is valid and + * whether there are conflicting/duplicated entries. 
+ */ + for ( apic = 0; apic < nr_ioapics; apic++ ) + { + if ( IO_APIC_ID(apic) != ivhd_device->special.handle ) + continue; + + if ( ioapic_bdf[ivhd_device->special.handle].pin_setup ) + { + if ( ioapic_bdf[ivhd_device->special.handle].bdf == bdf ) + AMD_IOMMU_DEBUG("IVHD Warning: Duplicate IO-APIC %#x entries\n", + ivhd_device->special.handle); + else + { + printk(XENLOG_ERR "IVHD Error: Conflicting IO-APIC %#x entries\n", + ivhd_device->special.handle); + if ( amd_iommu_perdev_intremap ) + return 0; + } + } + else + { + /* set device id of ioapic */ + ioapic_bdf[ivhd_device->special.handle].bdf = bdf; + + ioapic_bdf[ivhd_device->special.handle].pin_setup = xzalloc_array( + unsigned long, BITS_TO_LONGS(nr_ioapic_registers[apic])); + if ( nr_ioapic_registers[apic] && + !ioapic_bdf[IO_APIC_ID(apic)].pin_setup ) + { + printk(XENLOG_ERR "IVHD Error: Out of memory\n"); + return 0; + } + } + return dev_length; + } + + printk(XENLOG_ERR "IVHD Error: Invalid IO-APIC %#x\n", + ivhd_device->special.handle); + return 0; } static int __init parse_ivhd_block(struct acpi_ivhd_block_header *ivhd_block) @@ -687,10 +742,9 @@ ivhd_device = (union acpi_ivhd_device *) ((u8 *)ivhd_block + block_length); - AMD_IOMMU_DEBUG( "IVHD Device Entry:\n"); - AMD_IOMMU_DEBUG( " Type 0x%x\n", ivhd_device->header.type); - AMD_IOMMU_DEBUG( " Dev_Id 0x%x\n", ivhd_device->header.dev_id); - AMD_IOMMU_DEBUG( " Flags 0x%x\n", ivhd_device->header.flags); + AMD_IOMMU_DEBUG("IVHD Device Entry: type %#x id %#x flags %#x\n", + ivhd_device->header.type, ivhd_device->header.dev_id, + ivhd_device->header.flags); switch ( ivhd_device->header.type ) { @@ -817,6 +871,7 @@ { struct acpi_ivrs_block_header *ivrs_block; unsigned long length; + unsigned int apic; int error = 0; struct acpi_table_header *table = (struct acpi_table_header *)_table; @@ -832,11 +887,9 @@ ivrs_block = (struct acpi_ivrs_block_header *) ((u8 *)table + length); - AMD_IOMMU_DEBUG("IVRS Block:\n"); - AMD_IOMMU_DEBUG(" Type 0x%x\n", ivrs_block->type); - AMD_IOMMU_DEBUG(" Flags 0x%x\n", ivrs_block->flags); - AMD_IOMMU_DEBUG(" Length 0x%x\n", ivrs_block->length); - AMD_IOMMU_DEBUG(" Dev_Id 0x%x\n", ivrs_block->dev_id); + AMD_IOMMU_DEBUG("IVRS Block: type %#x flags %#x len %#x id %#x\n", + ivrs_block->type, ivrs_block->flags, + ivrs_block->length, ivrs_block->dev_id); if ( table->length < (length + ivrs_block->length) ) { @@ -851,6 +904,29 @@ length += ivrs_block->length; } + /* Each IO-APIC must have been mentioned in the table. 
@@ -851,6 +904,29 @@
         length += ivrs_block->length;
     }
 
+    /* Each IO-APIC must have been mentioned in the table. */
+    for ( apic = 0; !error && iommu_intremap && apic < nr_ioapics; ++apic )
+    {
+        if ( !nr_ioapic_registers[apic] ||
+             ioapic_bdf[IO_APIC_ID(apic)].pin_setup )
+            continue;
+
+        printk(XENLOG_ERR "IVHD Error: no information for IO-APIC %#x\n",
+               IO_APIC_ID(apic));
+        if ( amd_iommu_perdev_intremap )
+            error = -ENXIO;
+        else
+        {
+            ioapic_bdf[IO_APIC_ID(apic)].pin_setup = xzalloc_array(
+                unsigned long, BITS_TO_LONGS(nr_ioapic_registers[apic]));
+            if ( !ioapic_bdf[IO_APIC_ID(apic)].pin_setup )
+            {
+                printk(XENLOG_ERR "IVHD Error: Out of memory\n");
+                error = -ENOMEM;
+            }
+        }
+    }
+
     return error;
 }
diff -Nru xen-4.1.3/xen/drivers/passthrough/amd/iommu_init.c xen-4.1.5/xen/drivers/passthrough/amd/iommu_init.c
--- xen-4.1.3/xen/drivers/passthrough/amd/iommu_init.c	2012-08-09 22:08:09.000000000 +0200
+++ xen-4.1.5/xen/drivers/passthrough/amd/iommu_init.c	2013-04-23 18:44:20.000000000 +0200
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 
 static struct amd_iommu **irq_to_iommu;
 static int nr_amd_iommus;
@@ -467,6 +468,7 @@
     u16 domain_id, device_id, bdf, cword;
     u32 code;
     u64 *addr;
+    int count = 0;
     char * event_str[] = {"ILLEGAL_DEV_TABLE_ENTRY",
                           "IO_PAGE_FAULT",
                           "DEV_TABLE_HW_ERROR",
@@ -479,6 +481,25 @@
     code = get_field_from_reg_u32(entry[1], IOMMU_EVENT_CODE_MASK,
                                   IOMMU_EVENT_CODE_SHIFT);
 
+    /*
+     * Workaround for erratum 732:
+     * It can happen that the tail pointer is updated before the actual entry
+     * got written. As suggested by RevGuide, we initialize the event log
+     * buffer to all zeros and clear event log entries after processing them.
+     */
+    while ( code == 0 )
+    {
+        if ( unlikely(++count == IOMMU_LOG_ENTRY_TIMEOUT) )
+        {
+            AMD_IOMMU_DEBUG("AMD-Vi: No event written to log\n");
+            return;
+        }
+        udelay(1);
+        rmb();
+        code = get_field_from_reg_u32(entry[1], IOMMU_EVENT_CODE_MASK,
+                                      IOMMU_EVENT_CODE_SHIFT);
+    }
+
     if ( (code > IOMMU_EVENT_INVALID_DEV_REQUEST) ||
         (code < IOMMU_EVENT_ILLEGAL_DEV_TABLE_ENTRY) )
     {
@@ -517,6 +538,8 @@
         AMD_IOMMU_DEBUG("event 0x%08x 0x%08x 0x%08x 0x%08x\n",
                         entry[0], entry[1], entry[2], entry[3]);
     }
+
+    memset(entry, 0, IOMMU_EVENT_LOG_ENTRY_SIZE);
 }
 
 static void do_amd_iommu_irq(unsigned long data)
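The erratum 732 workaround above relies on the log buffer being pre-zeroed, so a code field that still reads 0 means the tail pointer ran ahead of the actual entry write; the reader spins briefly and gives up after a bound. A simulated, compilable sketch of that poll-with-timeout shape (the fence stands in for Xen's rmb(), and the "device" write is faked in main):

    #include <stdint.h>
    #include <stdio.h>

    #define LOG_ENTRY_TIMEOUT 1000

    /* Simulated log-entry code field; in the hypervisor this is a slot in
     * a DMA-written, pre-zeroed event log buffer. */
    static volatile uint32_t entry_code;

    static uint32_t wait_for_log_entry(void)
    {
        int count = 0;
        uint32_t code = entry_code;

        while (code == 0) {
            if (++count == LOG_ENTRY_TIMEOUT)
                return 0;                            /* nothing ever arrived */
            /* udelay(1) would sit here in the hypervisor */
            __atomic_thread_fence(__ATOMIC_ACQUIRE); /* stands in for rmb() */
            code = entry_code;
        }
        return code;                                 /* entry became visible */
    }

    int main(void)
    {
        entry_code = 0x5;    /* pretend the IOMMU completed its write */
        printf("code=%#x\n", (unsigned)wait_for_log_entry());
        return 0;
    }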
@@ -613,6 +636,42 @@
     return irq;
 }
 
+/*
+ * Family15h Model 10h-1fh erratum 746 (IOMMU Logging May Stall Translations)
+ * Workaround:
+ *     BIOS should disable L2B micellaneous clock gating by setting
+ *     L2_L2B_CK_GATE_CONTROL[CKGateL2BMiscDisable](D0F2xF4_x90[2]) = 1b
+ */
+static void amd_iommu_erratum_746_workaround(struct amd_iommu *iommu)
+{
+    u32 value;
+    u8 bus = PCI_BUS(iommu->bdf);
+    u8 dev = PCI_SLOT(iommu->bdf);
+    u8 func = PCI_FUNC(iommu->bdf);
+
+    if ( (boot_cpu_data.x86 != 0x15) ||
+         (boot_cpu_data.x86_model < 0x10) ||
+         (boot_cpu_data.x86_model > 0x1f) )
+        return;
+
+    pci_conf_write32(bus, dev, func, 0xf0, 0x90);
+    value = pci_conf_read32(bus, dev, func, 0xf4);
+
+    if ( value & (1 << 2) )
+        return;
+
+    /* Select NB indirect register 0x90 and enable writing */
+    pci_conf_write32(bus, dev, func, 0xf0, 0x90 | (1 << 8));
+
+    pci_conf_write32(bus, dev, func, 0xf4, value | (1 << 2));
+    printk(XENLOG_INFO
+           "AMD-Vi: Applying erratum 746 workaround for IOMMU at %02x:%02x.%u\n",
+           bus, dev, func);
+
+    /* Clear the enable writing bit */
+    pci_conf_write32(bus, dev, func, 0xf0, 0x90);
+}
+
 static void enable_iommu(struct amd_iommu *iommu)
 {
     unsigned long flags;
@@ -625,6 +684,8 @@
         return;
     }
 
+    amd_iommu_erratum_746_workaround(iommu);
+
     register_iommu_dev_table_in_mmio_space(iommu);
     register_iommu_cmd_buffer_in_mmio_space(iommu);
     register_iommu_event_log_in_mmio_space(iommu);
@@ -874,12 +935,46 @@
     return 0;
 }
 
+/* Check whether SP5100 SATA Combined mode is on */
+static bool_t __init amd_sp5100_erratum28(void)
+{
+    u32 bus, id;
+    u16 vendor_id, dev_id;
+    u8 byte;
+
+    for (bus = 0; bus < 256; bus++)
+    {
+        id = pci_conf_read32(bus, 0x14, 0, PCI_VENDOR_ID);
+
+        vendor_id = id & 0xffff;
+        dev_id = (id >> 16) & 0xffff;
+
+        /* SP5100 SMBus module sets Combined mode on */
+        if (vendor_id != 0x1002 || dev_id != 0x4385)
+            continue;
+
+        byte = pci_conf_read8(bus, 0x14, 0, 0xad);
+        if ( (byte >> 3) & 1 )
+        {
+            printk(XENLOG_WARNING "AMD-Vi: SP5100 erratum 28 detected, disabling IOMMU.\n"
+                   "If possible, disable SATA Combined mode in BIOS or contact your vendor for BIOS update.\n");
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
 int __init amd_iommu_init(void)
 {
     struct amd_iommu *iommu;
 
     BUG_ON( !iommu_found() );
 
+    if ( iommu_intremap && amd_iommu_perdev_intremap &&
+         amd_sp5100_erratum28() )
+        goto error_out;
+
     irq_to_iommu = xmalloc_array(struct amd_iommu *, nr_irqs);
     if ( irq_to_iommu == NULL )
         goto error_out;
@@ -897,7 +992,7 @@
         goto error_out;
 
     /* initialize io-apic interrupt remapping entries */
-    if ( amd_iommu_setup_ioapic_remapping() != 0 )
+    if ( iommu_intremap && amd_iommu_setup_ioapic_remapping() != 0 )
         goto error_out;
 
     /* allocate and initialize a global device table shared by all iommus */
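amd_iommu_erratum_746_workaround() above follows the classic index/data indirect-register sequence: select register 0x90 through offset 0xf0, read the value back through 0xf4, and only rewrite it with bit 8 of the index acting as a write enable. A toy model of that access pattern against a fake config space (it mimics the sequence only, not real indirection or PCI access):

    #include <stdint.h>
    #include <stdio.h>

    /* Fake 256-byte config space; offsets 0xf0/0xf4 play the index/data
     * pair.  Purely illustrative -- nothing here touches hardware. */
    static uint32_t cfg[64];

    static void cfg_write32(unsigned int off, uint32_t v) { cfg[off / 4] = v; }
    static uint32_t cfg_read32(unsigned int off) { return cfg[off / 4]; }

    static void apply_workaround(void)
    {
        uint32_t value;

        cfg_write32(0xf0, 0x90);              /* select indirect reg 0x90 */
        value = cfg_read32(0xf4);             /* read its current value */

        if (value & (1u << 2))                /* BIOS already set the bit */
            return;

        cfg_write32(0xf0, 0x90 | (1u << 8));  /* re-select w/ write enable */
        cfg_write32(0xf4, value | (1u << 2)); /* set clock-gating disable */
        cfg_write32(0xf0, 0x90);              /* drop the write enable */
    }

    int main(void)
    {
        apply_workaround();
        printf("data register now %#x\n", (unsigned)cfg_read32(0xf4));
        return 0;
    }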
diff -Nru xen-4.1.3/xen/drivers/passthrough/amd/iommu_intr.c xen-4.1.5/xen/drivers/passthrough/amd/iommu_intr.c
--- xen-4.1.3/xen/drivers/passthrough/amd/iommu_intr.c	2012-08-09 22:08:09.000000000 +0200
+++ xen-4.1.5/xen/drivers/passthrough/amd/iommu_intr.c	2013-04-23 18:44:20.000000000 +0200
@@ -27,7 +27,7 @@
 #define INTREMAP_LENGTH 0xB
 #define INTREMAP_ENTRIES (1 << INTREMAP_LENGTH)
 
-int ioapic_bdf[MAX_IO_APICS];
+struct ioapic_bdf ioapic_bdf[MAX_IO_APICS];
 extern struct ivrs_mappings *ivrs_mappings;
 extern unsigned short ivrs_bdf_entries;
 void *shared_intremap_table;
@@ -117,12 +117,12 @@
 static void update_intremap_entry_from_ioapic(
     int bdf,
     struct amd_iommu *iommu,
-    struct IO_APIC_route_entry *ioapic_rte)
+    const struct IO_APIC_route_entry *rte,
+    const struct IO_APIC_route_entry *old_rte)
 {
     unsigned long flags;
     u32* entry;
     u8 delivery_mode, dest, vector, dest_mode;
-    struct IO_APIC_route_entry *rte = ioapic_rte;
     int req_id;
     spinlock_t *lock;
     int offset;
@@ -138,6 +138,14 @@
     spin_lock_irqsave(lock, flags);
 
     offset = get_intremap_offset(vector, delivery_mode);
+    if ( old_rte )
+    {
+        int old_offset = get_intremap_offset(old_rte->vector,
+                                             old_rte->delivery_mode);
+
+        if ( offset != old_offset )
+            free_intremap_entry(bdf, old_offset);
+    }
     entry = (u32*)get_intremap_entry(req_id, offset);
     update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest);
@@ -176,7 +184,7 @@
             continue;
 
         /* get device id of ioapic devices */
-        bdf = ioapic_bdf[IO_APIC_ID(apic)];
+        bdf = ioapic_bdf[IO_APIC_ID(apic)].bdf;
         iommu = find_iommu_for_device(bdf);
         if ( !iommu )
         {
@@ -207,6 +215,7 @@
                 flush_command_buffer(iommu);
                 spin_unlock_irqrestore(&iommu->lock, flags);
             }
+            set_bit(pin, ioapic_bdf[IO_APIC_ID(apic)].pin_setup);
         }
     }
     return 0;
@@ -218,6 +227,7 @@
     struct IO_APIC_route_entry old_rte = { 0 };
     struct IO_APIC_route_entry new_rte = { 0 };
     unsigned int rte_lo = (reg & 1) ? reg - 1 : reg;
+    unsigned int pin = (reg - 0x10) / 2;
     int saved_mask, bdf;
     struct amd_iommu *iommu;
@@ -228,7 +238,7 @@
     }
 
     /* get device id of ioapic devices */
-    bdf = ioapic_bdf[IO_APIC_ID(apic)];
+    bdf = ioapic_bdf[IO_APIC_ID(apic)].bdf;
     iommu = find_iommu_for_device(bdf);
     if ( !iommu )
     {
@@ -254,6 +264,14 @@
         *(((u32 *)&new_rte) + 1) = value;
     }
 
+    if ( new_rte.mask &&
+         !test_bit(pin, ioapic_bdf[IO_APIC_ID(apic)].pin_setup) )
+    {
+        ASSERT(saved_mask);
+        __io_apic_write(apic, reg, value);
+        return;
+    }
+
     /* mask the interrupt while we change the intremap table */
     if ( !saved_mask )
     {
@@ -262,7 +280,11 @@
     }
 
     /* Update interrupt remapping entry */
-    update_intremap_entry_from_ioapic(bdf, iommu, &new_rte);
+    update_intremap_entry_from_ioapic(
+        bdf, iommu, &new_rte,
+        test_and_set_bit(pin,
+                         ioapic_bdf[IO_APIC_ID(apic)].pin_setup) ? &old_rte
+                                                                 : NULL);
 
     /* Forward write access to IO-APIC RTE */
     __io_apic_write(apic, reg, value);
@@ -373,6 +395,12 @@
         return;
     }
 
+    if ( msi_desc->remap_index >= 0 )
+        update_intremap_entry_from_msi_msg(iommu, pdev, msi_desc, NULL);
+
+    if ( !msg )
+        return;
+
     update_intremap_entry_from_msi_msg(iommu, pdev, msi_desc, msg);
 }
diff -Nru xen-4.1.3/xen/drivers/passthrough/amd/pci_amd_iommu.c xen-4.1.5/xen/drivers/passthrough/amd/pci_amd_iommu.c
--- xen-4.1.3/xen/drivers/passthrough/amd/pci_amd_iommu.c	2012-08-09 22:08:09.000000000 +0200
+++ xen-4.1.5/xen/drivers/passthrough/amd/pci_amd_iommu.c	2013-04-23 18:44:20.000000000 +0200
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -30,8 +31,7 @@
 
 struct amd_iommu *find_iommu_for_device(int bdf)
 {
-    BUG_ON ( bdf >= ivrs_bdf_entries );
-    return ivrs_mappings[bdf].iommu;
+    return bdf < ivrs_bdf_entries ? ivrs_mappings[bdf].iommu : NULL;
 }
 
 /*
@@ -195,6 +195,8 @@
     {
         printk("AMD-Vi: Not overriding irq_vector_map setting\n");
    }
+    if ( !amd_iommu_perdev_intremap )
+        printk(XENLOG_WARNING "AMD-Vi: Using global interrupt remap table is not recommended (see XSA-36)!\n");
     return scan_pci_devices();
 }
 
@@ -270,6 +272,9 @@
              * a pfn_valid() check would seem desirable here.
              */
             amd_iommu_map_page(d, pfn, pfn, IOMMUF_readable|IOMMUF_writable);
+
+            if ( !(i & 0xfffff) )
+                process_pending_softirqs();
         }
     }
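The process_pending_softirqs() call added above bounds how long the dom0 passthrough mapping loop can run without servicing deferred work: it yields once every 0x100000 iterations. A trivial standalone illustration of that batching test (the softirq call is a stub):

    #include <stdio.h>

    static void process_pending_softirqs_stub(void) { /* run deferred work */ }

    int main(void)
    {
        unsigned long i, n = 1UL << 22, yields = 0;

        for (i = 0; i < n; i++) {
            /* amd_iommu_map_page(d, i, i, ...) would go here */

            if (!(i & 0xfffff)) {          /* every 2^20 iterations... */
                process_pending_softirqs_stub();
                yields++;
            }
        }
        printf("yielded %lu times across %lu mappings\n", yields, n);
        return 0;
    }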
"Strict" : "Relaxed"); + printk("Interrupt remapping %sabled\n", iommu_intremap ? "en" : "dis"); + } return rc; } @@ -429,7 +436,7 @@ const struct iommu_ops *ops = iommu_get_ops(); if ( iommu_enabled ) ops->crash_shutdown(); - iommu_enabled = 0; + iommu_enabled = iommu_intremap = 0; } /* diff -Nru xen-4.1.3/xen/drivers/passthrough/pci.c xen-4.1.5/xen/drivers/passthrough/pci.c --- xen-4.1.3/xen/drivers/passthrough/pci.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/drivers/passthrough/pci.c 2013-04-23 18:44:20.000000000 +0200 @@ -332,7 +332,6 @@ pci_clean_dpci_irqs(d); while ( (pdev = pci_get_pdev_by_domain(d, -1, -1)) ) { - pci_cleanup_msi(pdev); bus = pdev->bus; devfn = pdev->devfn; if ( deassign_device(d, bus, devfn) ) printk("domain %d: deassign device (%02x:%02x.%x) failed!\n", @@ -346,16 +345,13 @@ int pdev_type(u8 bus, u8 devfn) { - u16 class_device; - u16 status, creg; - int pos; + u16 class_device, creg; u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn); + int pos = pci_find_cap_offset(bus, d, f, PCI_CAP_ID_EXP); class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE); if ( class_device == PCI_CLASS_BRIDGE_PCI ) { - pos = pci_find_next_cap(bus, devfn, - PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP); if ( !pos ) return DEV_TYPE_LEGACY_PCI_BRIDGE; creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS); @@ -363,14 +359,7 @@ DEV_TYPE_PCIe2PCI_BRIDGE : DEV_TYPE_PCIe_BRIDGE; } - status = pci_conf_read16(bus, d, f, PCI_STATUS); - if ( !(status & PCI_STATUS_CAP_LIST) ) - return DEV_TYPE_PCI; - - if ( pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP) ) - return DEV_TYPE_PCIe_ENDPOINT; - - return DEV_TYPE_PCI; + return pos ? DEV_TYPE_PCIe_ENDPOINT : DEV_TYPE_PCI; } /* diff -Nru xen-4.1.3/xen/drivers/passthrough/vtd/intremap.c xen-4.1.5/xen/drivers/passthrough/vtd/intremap.c --- xen-4.1.3/xen/drivers/passthrough/vtd/intremap.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/drivers/passthrough/vtd/intremap.c 2013-04-23 18:44:20.000000000 +0200 @@ -499,7 +499,7 @@ set_ire_sid(ire, SVT_VERIFY_BUS, SQ_ALL_16, (bus << 8) | pdev->bus); else if ( pdev_type(bus, devfn) == DEV_TYPE_LEGACY_PCI_BRIDGE ) - set_ire_sid(ire, SVT_VERIFY_BUS, SQ_ALL_16, + set_ire_sid(ire, SVT_VERIFY_SID_SQ, SQ_ALL_16, PCI_BDF2(bus, devfn)); } break; diff -Nru xen-4.1.3/xen/drivers/passthrough/vtd/iommu.c xen-4.1.5/xen/drivers/passthrough/vtd/iommu.c --- xen-4.1.3/xen/drivers/passthrough/vtd/iommu.c 2012-08-09 22:08:09.000000000 +0200 +++ xen-4.1.5/xen/drivers/passthrough/vtd/iommu.c 2013-04-23 18:44:20.000000000 +0200 @@ -1017,9 +1017,6 @@ { unsigned long flags; struct iommu *iommu = irq_to_iommu[irq]; - struct irq_desc *desc = irq_to_desc(irq); - - irq_complete_move(&desc); /* mask it */ spin_lock_irqsave(&iommu->register_lock, flags); @@ -1033,6 +1030,15 @@ return 0; } +static void dma_msi_ack(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + irq_complete_move(&desc); + dma_msi_mask(irq); + move_masked_irq(irq); +} + static void dma_msi_end(unsigned int irq, u8 vector) { dma_msi_unmask(irq); @@ -1097,7 +1103,7 @@ .shutdown = dma_msi_mask, .enable = dma_msi_unmask, .disable = dma_msi_mask, - .ack = dma_msi_mask, + .ack = dma_msi_ack, .end = dma_msi_end, .set_affinity = dma_msi_set_affinity, }; @@ -2095,6 +2101,9 @@ break; } } + if ( !iommu_intremap ) + for_each_drhd_unit ( drhd ) + disable_intremap(drhd->iommu); } /* @@ -2142,6 +2151,8 @@ return -ENODEV; platform_quirks_init(); + if ( !iommu_enabled ) + return -ENODEV; irq_to_iommu = xmalloc_array(struct iommu*, nr_irqs); 
diff -Nru xen-4.1.3/xen/drivers/passthrough/vtd/quirks.c xen-4.1.5/xen/drivers/passthrough/vtd/quirks.c
--- xen-4.1.3/xen/drivers/passthrough/vtd/quirks.c	2012-08-09 22:08:09.000000000 +0200
+++ xen-4.1.5/xen/drivers/passthrough/vtd/quirks.c	2013-04-23 18:44:20.000000000 +0200
@@ -47,7 +47,6 @@
 #define IS_CTG(id)    (id == 0x2a408086)
 #define IS_ILK(id)    (id == 0x00408086 || id == 0x00448086 || id== 0x00628086 || id == 0x006A8086)
 #define IS_CPT(id)    (id == 0x01008086 || id == 0x01048086)
-#define IS_SNB_GFX(id) (id == 0x01068086 || id == 0x01168086 || id == 0x01268086 || id == 0x01028086 || id == 0x01128086 || id == 0x01228086 || id == 0x010A8086)
 
 u32 ioh_id;
 u32 igd_id;
@@ -249,6 +248,29 @@
     }
 }
 
+/* 5500/5520/X58 Chipset Interrupt remapping errata, for stepping B-3.
+ * Fixed in stepping C-2. */
+static void __init tylersburg_intremap_quirk(void)
+{
+    uint32_t bus, device;
+    uint8_t rev;
+
+    for ( bus = 0; bus < 0x100; bus++ )
+    {
+        /* Match on System Management Registers on Device 20 Function 0 */
+        device = pci_conf_read32(bus, 20, 0, PCI_VENDOR_ID);
+        rev = pci_conf_read8(bus, 20, 0, PCI_REVISION_ID);
+
+        if ( rev == 0x13 && device == 0x342e8086 )
+        {
+            printk(XENLOG_WARNING VTDPREFIX
+                   "Disabling IOMMU due to Intel 5500/5520/X58 Chipset errata #47, #53\n");
+            iommu_enabled = 0;
+            break;
+        }
+    }
+}
+
 /* initialize platform identification flags */
 void __init platform_quirks_init(void)
 {
@@ -269,6 +291,10 @@
 
     /* ioremap IGD MMIO+0x2000 page */
     map_igd_reg();
+
+    /* Tylersburg interrupt remap quirk */
+    if ( iommu_intremap )
+        tylersburg_intremap_quirk();
 }
 
 /*
diff -Nru xen-4.1.3/xen/drivers/passthrough/vtd/x86/ats.c xen-4.1.5/xen/drivers/passthrough/vtd/x86/ats.c
--- xen-4.1.3/xen/drivers/passthrough/vtd/x86/ats.c	2012-08-09 22:08:09.000000000 +0200
+++ xen-4.1.5/xen/drivers/passthrough/vtd/x86/ats.c	2013-04-23 18:44:20.000000000 +0200
@@ -32,7 +32,7 @@
 #define ATS_REG_CAP    4
 #define ATS_REG_CTL    6
-#define ATS_QUEUE_DEPTH_MASK     0xF
+#define ATS_QUEUE_DEPTH_MASK     0x1f
 #define ATS_ENABLE        (1<<15)
 
 struct pci_ats_dev {
@@ -178,7 +178,8 @@
         pdev->devfn = devfn;
         value = pci_conf_read16(bus, PCI_SLOT(devfn),
                                 PCI_FUNC(devfn), pos + ATS_REG_CAP);
-        pdev->ats_queue_depth = value & ATS_QUEUE_DEPTH_MASK;
+        pdev->ats_queue_depth = value & ATS_QUEUE_DEPTH_MASK ?:
+                                ATS_QUEUE_DEPTH_MASK + 1;
         list_add(&pdev->list, &ats_devices);
     }
 
diff -Nru xen-4.1.3/xen/include/asm-x86/amd.h xen-4.1.5/xen/include/asm-x86/amd.h
--- xen-4.1.3/xen/include/asm-x86/amd.h	2012-08-09 22:08:09.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/amd.h	2013-04-23 18:44:20.000000000 +0200
@@ -151,6 +151,8 @@
 int cpu_has_amd_erratum(const struct cpuinfo_x86 *, int, ...);
 
 #ifdef __x86_64__
+extern int opt_allow_unsafe;
+
 void fam10h_check_enable_mmcfg(void);
 void check_enable_amd_mmconf_dmi(void);
 #endif
diff -Nru xen-4.1.3/xen/include/asm-x86/config.h xen-4.1.5/xen/include/asm-x86/config.h
--- xen-4.1.3/xen/include/asm-x86/config.h	2012-08-09 22:08:09.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/config.h	2013-04-23 18:44:20.000000000 +0200
@@ -108,6 +108,9 @@
 extern unsigned char trampoline_cpu_started;
 extern char wakeup_start[];
 extern unsigned int video_mode, video_flags;
+
+#define GB(_gb) (_gb ## UL << 30)
+
 #endif
 
 #define asmlinkage
@@ -123,7 +126,6 @@
 #define PML4_ADDR(_slot)                             \
     ((((_slot ## UL) >> 8) * 0xffff000000000000UL) | \
      (_slot ## UL << PML4_ENTRY_BITS))
-#define GB(_gb) (_gb ## UL << 30)
 #else
 #define PML4_ENTRY_BYTES (1 << PML4_ENTRY_BITS)
 #define PML4_ADDR(_slot) \
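The ats.c hunk above widens the queue-depth mask to 5 bits and uses GCC's x ?: y extension to decode the PCIe ATS convention that a zero field means the full 32-entry invalidate queue. A small sketch of that decode:

    #include <stdio.h>

    #define QUEUE_DEPTH_MASK 0x1f

    static unsigned int decode_queue_depth(unsigned int cap)
    {
        unsigned int depth = cap & QUEUE_DEPTH_MASK;

        return depth ? depth : QUEUE_DEPTH_MASK + 1;   /* 0 decodes as 32 */
    }

    int main(void)
    {
        /* prints "7 32": a zero field selects the full queue depth */
        printf("%u %u\n", decode_queue_depth(0x07), decode_queue_depth(0x20));
        return 0;
    }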
diff -Nru xen-4.1.3/xen/include/asm-x86/cpufeature.h xen-4.1.5/xen/include/asm-x86/cpufeature.h
--- xen-4.1.3/xen/include/asm-x86/cpufeature.h	2012-08-09 22:08:09.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/cpufeature.h	2013-04-23 18:44:20.000000000 +0200
@@ -96,6 +96,7 @@
 #define X86_FEATURE_CX16    (4*32+13) /* CMPXCHG16B */
 #define X86_FEATURE_XTPR    (4*32+14) /* Send Task Priority Messages */
 #define X86_FEATURE_PDCM    (4*32+15) /* Perf/Debug Capability MSR */
+#define X86_FEATURE_PCID    (4*32+17) /* Process Context ID */
 #define X86_FEATURE_DCA     (4*32+18) /* Direct Cache Access */
 #define X86_FEATURE_SSE4_1  (4*32+19) /* Streaming SIMD Extensions 4.1 */
 #define X86_FEATURE_SSE4_2  (4*32+20) /* Streaming SIMD Extensions 4.2 */
@@ -146,6 +147,7 @@
 #define X86_FEATURE_FSGSBASE (7*32+ 0) /* {RD,WR}{FS,GS}BASE instructions */
 #define X86_FEATURE_SMEP     (7*32+ 7) /* Supervisor Mode Execution Protection */
 #define X86_FEATURE_ERMS     (7*32+ 9) /* Enhanced REP MOVSB/STOSB */
+#define X86_FEATURE_INVPCID  (7*32+10) /* Invalidate Process Context ID */
 
 #define cpu_has(c, bit)     test_bit(bit, (c)->x86_capability)
 #define boot_cpu_has(bit)   test_bit(bit, boot_cpu_data.x86_capability)
diff -Nru xen-4.1.3/xen/include/asm-x86/debugreg.h xen-4.1.5/xen/include/asm-x86/debugreg.h
--- xen-4.1.3/xen/include/asm-x86/debugreg.h	2012-08-09 22:08:09.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/debugreg.h	2013-04-23 18:44:20.000000000 +0200
@@ -58,7 +58,7 @@
    We can slow the instruction pipeline for instructions coming via the
    gdt or the ldt if we want to.  I am not sure why this is an advantage */
 
-#define DR_CONTROL_RESERVED_ZERO (0x0000d800ul) /* Reserved, read as zero */
+#define DR_CONTROL_RESERVED_ZERO (~0xffff27fful) /* Reserved, read as zero */
 #define DR_CONTROL_RESERVED_ONE  (0x00000400ul) /* Reserved, read as one */
 #define DR_LOCAL_EXACT_ENABLE    (0x00000100ul) /* Local exact enable */
 #define DR_GLOBAL_EXACT_ENABLE   (0x00000200ul) /* Global exact enable */
diff -Nru xen-4.1.3/xen/include/asm-x86/hpet.h xen-4.1.5/xen/include/asm-x86/hpet.h
--- xen-4.1.3/xen/include/asm-x86/hpet.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/hpet.h	2013-04-23 18:44:20.000000000 +0200
@@ -42,6 +42,7 @@
 #define HPET_LEGACY_8254    2
 #define HPET_LEGACY_RTC     8
 
+#define HPET_TN_LEVEL       0x002
 #define HPET_TN_ENABLE      0x004
 #define HPET_TN_PERIODIC    0x008
 #define HPET_TN_PERIODIC_CAP 0x010
diff -Nru xen-4.1.3/xen/include/asm-x86/hvm/hvm.h xen-4.1.5/xen/include/asm-x86/hvm/hvm.h
--- xen-4.1.3/xen/include/asm-x86/hvm/hvm.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/hvm/hvm.h	2013-04-23 18:44:20.000000000 +0200
@@ -116,6 +116,9 @@
     void (*update_guest_cr)(struct vcpu *v, unsigned int cr);
     void (*update_guest_efer)(struct vcpu *v);
 
+    int (*get_guest_pat)(struct vcpu *v, u64 *);
+    int (*set_guest_pat)(struct vcpu *v, u64);
+
     void (*set_tsc_offset)(struct vcpu *v, u64 offset);
 
     void (*inject_exception)(unsigned int trapnr, int errcode,
@@ -166,6 +169,9 @@
 
 bool_t hvm_send_assist_req(struct vcpu *v);
 
+void hvm_get_guest_pat(struct vcpu *v, u64 *guest_pat);
+int hvm_set_guest_pat(struct vcpu *v, u64 guest_pat);
+
 void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc);
 u64 hvm_get_guest_tsc(struct vcpu *v);
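cpufeature.h encodes each feature as word*32+bit, so X86_FEATURE_PCID == 4*32+17 selects bit 17 of capability word 4 (which holds CPUID leaf 1 ECX in this numbering). A compact sketch of how such flat feature numbers index a bitmap (simplified; Xen itself uses test_bit() on x86_capability):

    #include <stdio.h>

    #define FEATURE_PCID    (4 * 32 + 17)   /* word 4: CPUID leaf 1 ECX */
    #define FEATURE_INVPCID (7 * 32 + 10)   /* word 7: leaf 7 EBX */

    static unsigned int x86_capability[8];

    static void set_feature(unsigned int f)
    {
        x86_capability[f / 32] |= 1u << (f % 32);
    }

    static int has_feature(unsigned int f)
    {
        return (x86_capability[f / 32] >> (f % 32)) & 1;
    }

    int main(void)
    {
        set_feature(FEATURE_PCID);
        printf("word %u bit %u present=%d invpcid=%d\n",
               FEATURE_PCID / 32, FEATURE_PCID % 32,
               has_feature(FEATURE_PCID), has_feature(FEATURE_INVPCID));
        return 0;
    }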
diff -Nru xen-4.1.3/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h xen-4.1.5/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
--- xen-4.1.3/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h	2013-04-23 18:44:20.000000000 +0200
@@ -269,6 +269,8 @@
 #define IOMMU_EVENT_DEVICE_ID_MASK	0x0000FFFF
 #define IOMMU_EVENT_DEVICE_ID_SHIFT	0
 
+#define IOMMU_LOG_ENTRY_TIMEOUT		1000
+
 /* Control Register */
 #define IOMMU_CONTROL_MMIO_OFFSET	0x18
 #define IOMMU_CONTROL_TRANSLATION_ENABLE_MASK	0x00000001
diff -Nru xen-4.1.3/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h xen-4.1.5/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
--- xen-4.1.3/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h	2013-04-23 18:44:20.000000000 +0200
@@ -88,6 +88,11 @@
 unsigned int amd_iommu_read_ioapic_from_ire(
     unsigned int apic, unsigned int reg);
 
+extern struct ioapic_bdf {
+    u16 bdf;
+    unsigned long *pin_setup;
+} ioapic_bdf[];
+
 /* power management support */
 void amd_iommu_resume(void);
 void amd_iommu_suspend(void);
diff -Nru xen-4.1.3/xen/include/asm-x86/hypercall.h xen-4.1.5/xen/include/asm-x86/hypercall.h
--- xen-4.1.3/xen/include/asm-x86/hypercall.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/hypercall.h	2013-04-23 18:44:20.000000000 +0200
@@ -25,7 +25,7 @@
 do_set_trap_table(
     XEN_GUEST_HANDLE(const_trap_info_t) traps);
 
-extern int
+extern long
 do_mmu_update(
     XEN_GUEST_HANDLE(mmu_update_t) ureqs,
     unsigned int count,
@@ -63,7 +63,7 @@
 extern long
 do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc);
 
-extern int
+extern long
 do_update_va_mapping(
     unsigned long va,
     u64 val64,
@@ -73,14 +73,14 @@
 do_physdev_op(
     int cmd, XEN_GUEST_HANDLE(void) arg);
 
-extern int
+extern long
 do_update_va_mapping_otherdomain(
     unsigned long va,
     u64 val64,
     unsigned long flags,
     domid_t domid);
 
-extern int
+extern long
 do_mmuext_op(
     XEN_GUEST_HANDLE(mmuext_op_t) uops,
     unsigned int count,
@@ -101,10 +101,6 @@
     struct xen_domctl *domctl,
     XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);
 
-extern int
-do_kexec(
-    unsigned long op, unsigned arg1, XEN_GUEST_HANDLE(void) uarg);
-
 #ifdef __x86_64__
 
 extern long
diff -Nru xen-4.1.3/xen/include/asm-x86/io_apic.h xen-4.1.5/xen/include/asm-x86/io_apic.h
--- xen-4.1.3/xen/include/asm-x86/io_apic.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/io_apic.h	2013-04-23 18:44:20.000000000 +0200
@@ -129,7 +129,7 @@
 extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
 
 /* Only need to remap ioapic RTE (reg: 10~3Fh) */
-#define ioapic_reg_remapped(reg) (iommu_enabled && ((reg) >= 0x10))
+#define ioapic_reg_remapped(reg) (iommu_intremap && ((reg) >= 0x10))
 
 static inline unsigned int __io_apic_read(unsigned int apic, unsigned int reg)
 {
diff -Nru xen-4.1.3/xen/include/asm-x86/mm.h xen-4.1.5/xen/include/asm-x86/mm.h
--- xen-4.1.3/xen/include/asm-x86/mm.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/mm.h	2013-04-23 18:44:20.000000000 +0200
@@ -238,7 +238,7 @@
 #endif
 
 #if defined(__i386__)
-#define is_xen_heap_page(page) is_xen_heap_mfn(page_to_mfn(page))
+#define is_xen_heap_page(page) is_xen_heap_mfn(__page_to_mfn(page))
 #define is_xen_heap_mfn(mfn) ({                         \
     unsigned long _mfn = (mfn);                         \
     (_mfn < paddr_to_pfn(xenheap_phys_end));            \
@@ -329,6 +329,7 @@
 
 void clear_superpage_mark(struct page_info *page);
 
+const unsigned long *get_platform_badpages(unsigned int *array_size);
 struct domain *page_get_owner_and_reference(struct page_info *page);
 void put_page(struct page_info *page);
 int get_page(struct page_info *page, struct domain *domain);
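The hypercall.h hunk above widens several return types from int to long, keeping the full 64-bit register value intact on x86-64 instead of truncating it. A tiny demonstration of the truncation a 32-bit return invites (illustrative values only, not an actual hypercall; assumes an LP64 target where long is 64-bit):

    #include <stdio.h>

    static int narrow_return(void)
    {
        long rc = (1L << 32) | 5;     /* imagine a large success value */
        return (int)rc;               /* upper 32 bits silently dropped */
    }

    static long wide_return(void)
    {
        long rc = (1L << 32) | 5;
        return rc;                    /* preserved end to end */
    }

    int main(void)
    {
        printf("int: %d, long: %ld\n", narrow_return(), wide_return());
        return 0;
    }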
diff -Nru xen-4.1.3/xen/include/asm-x86/msi.h xen-4.1.5/xen/include/asm-x86/msi.h
--- xen-4.1.3/xen/include/asm-x86/msi.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/msi.h	2013-04-23 18:44:20.000000000 +0200
@@ -75,6 +75,7 @@
 extern void set_msi_affinity(unsigned int vector, cpumask_t mask);
 extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc);
 extern void pci_disable_msi(struct msi_desc *desc);
+extern int pci_prepare_msix(u8 bus, u8 devfn, bool_t off);
 extern void pci_cleanup_msi(struct pci_dev *pdev);
 extern int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq);
 extern void teardown_msi_irq(int irq);
diff -Nru xen-4.1.3/xen/include/asm-x86/msr-index.h xen-4.1.5/xen/include/asm-x86/msr-index.h
--- xen-4.1.3/xen/include/asm-x86/msr-index.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/msr-index.h	2013-04-23 18:44:20.000000000 +0200
@@ -245,6 +245,7 @@
 
 /* AMD64 MSRs */
 #define MSR_AMD64_NB_CFG		0xc001001f
+#define MSR_AMD64_IC_CFG		0xc0011021
 #define MSR_AMD64_DC_CFG		0xc0011022
 #define AMD64_NB_CFG_CF8_EXT_ENABLE_BIT	46
 
@@ -253,8 +254,9 @@
 #define MSR_F10_MC4_MISC2		0xc0000409
 #define MSR_F10_MC4_MISC3		0xc000040A
 
-/* AMD Family10h MMU control MSRs */
-#define MSR_F10_BU_CFG			0xc0011023
+/* AMD Family10h Bus Unit MSRs */
+#define MSR_F10_BU_CFG			0xc0011023
+#define MSR_F10_BU_CFG2			0xc001102a
 
 /* Other AMD Fam10h MSRs */
 #define MSR_FAM10H_MMIO_CONF_BASE	0xc0010058
diff -Nru xen-4.1.3/xen/include/asm-x86/p2m.h xen-4.1.5/xen/include/asm-x86/p2m.h
--- xen-4.1.3/xen/include/asm-x86/p2m.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/p2m.h	2013-04-23 18:44:20.000000000 +0200
@@ -255,11 +255,11 @@
     struct {
         struct page_list_head super,   /* List of superpages             */
                          single;       /* Non-super lists                */
-        int              count,        /* # of pages in cache lists      */
+        long             count,        /* # of pages in cache lists      */
                          entry_count;  /* # of pages in p2m marked pod   */
-        unsigned         reclaim_super;  /* Last gpfn of a scan */
-        unsigned         reclaim_single; /* Last gpfn of a scan */
-        unsigned         max_guest;    /* gpfn of max guest demand-populate */
+        unsigned long    reclaim_super;  /* Last gpfn of a scan */
+        unsigned long    reclaim_single; /* Last gpfn of a scan */
+        unsigned long    max_guest;    /* gpfn of max guest demand-populate */
     } pod;
 };
 
diff -Nru xen-4.1.3/xen/include/asm-x86/processor.h xen-4.1.5/xen/include/asm-x86/processor.h
--- xen-4.1.3/xen/include/asm-x86/processor.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/processor.h	2013-04-23 18:44:20.000000000 +0200
@@ -587,6 +587,8 @@
 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
 #undef DECLARE_TRAP_HANDLER
 
+void enable_nmis(void);
+
 int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
           uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
 int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val);
diff -Nru xen-4.1.3/xen/include/asm-x86/smp.h xen-4.1.5/xen/include/asm-x86/smp.h
--- xen-4.1.3/xen/include/asm-x86/smp.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/smp.h	2013-04-23 18:44:20.000000000 +0200
@@ -30,7 +30,8 @@
 
 void smp_send_nmi_allbutself(void);
 
-void send_IPI_mask(const cpumask_t *mask, int vector);
+void send_IPI_mask(const cpumask_t *, int vector);
+void send_IPI_self(int vector);
 
 extern void (*mtrr_hook) (void);
diff -Nru xen-4.1.3/xen/include/asm-x86/x86_64/uaccess.h xen-4.1.5/xen/include/asm-x86/x86_64/uaccess.h
--- xen-4.1.3/xen/include/asm-x86/x86_64/uaccess.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/asm-x86/x86_64/uaccess.h	2013-04-23 18:44:20.000000000 +0200
@@ -21,7 +21,7 @@
  * non-canonical address (and thus fault) before ever reaching VIRT_START.
  */
 #define __addr_ok(addr) \
-    (((unsigned long)(addr) < (1UL<<48)) || \
+    (((unsigned long)(addr) < (1UL<<47)) || \
     ((unsigned long)(addr) >= HYPERVISOR_VIRT_END))
 
 #define access_ok(addr, size) \
diff -Nru xen-4.1.3/xen/include/public/nmi.h xen-4.1.5/xen/include/public/nmi.h
--- xen-4.1.3/xen/include/public/nmi.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/public/nmi.h	2013-04-23 18:44:20.000000000 +0200
@@ -36,9 +36,14 @@
  /* I/O-check error reported via ISA port 0x61, bit 6. */
 #define _XEN_NMIREASON_io_error     0
 #define XEN_NMIREASON_io_error      (1UL << _XEN_NMIREASON_io_error)
+ /* PCI SERR reported via ISA port 0x61, bit 7. */
+#define _XEN_NMIREASON_pci_serr     1
+#define XEN_NMIREASON_pci_serr      (1UL << _XEN_NMIREASON_pci_serr)
+#if __XEN_INTERFACE_VERSION__ < 0x00040300 /* legacy alias of the above */
  /* Parity error reported via ISA port 0x61, bit 7. */
 #define _XEN_NMIREASON_parity_error 1
 #define XEN_NMIREASON_parity_error  (1UL << _XEN_NMIREASON_parity_error)
+#endif
  /* Unknown hardware-generated NMI. */
 #define _XEN_NMIREASON_unknown      2
 #define XEN_NMIREASON_unknown       (1UL << _XEN_NMIREASON_unknown)
diff -Nru xen-4.1.3/xen/include/public/physdev.h xen-4.1.5/xen/include/public/physdev.h
--- xen-4.1.3/xen/include/public/physdev.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/public/physdev.h	2013-04-23 18:44:20.000000000 +0200
@@ -264,6 +264,21 @@
 DEFINE_XEN_GUEST_HANDLE(physdev_get_free_pirq_t);
 
 /*
+ * Dom0 should use these two to announce MMIO resources assigned to
+ * MSI-X capable devices won't (prepare) or may (release) change.
+ */
+#define PHYSDEVOP_prepare_msix          30
+#define PHYSDEVOP_release_msix          31
+struct physdev_pci_device {
+    /* IN */
+    uint16_t seg;
+    uint8_t bus;
+    uint8_t devfn;
+};
+typedef struct physdev_pci_device physdev_pci_device_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_pci_device_t);
+
+/*
  * Notify that some PIRQ-bound event channels have been unmasked.
  * ** This command is obsolete since interface version 0x00030202 and is **
  * ** unsupported by newer versions of Xen.                              **
diff -Nru xen-4.1.3/xen/include/xen/decompress.h xen-4.1.5/xen/include/xen/decompress.h
--- xen-4.1.3/xen/include/xen/decompress.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/xen/decompress.h	2013-04-23 18:44:20.000000000 +0200
@@ -31,7 +31,7 @@
  * dependent).
  */
 
-decompress_fn bunzip2, unlzma, unlzo;
+decompress_fn bunzip2, unxz, unlzma, unlzo;
 
 int decompress(void *inbuf, unsigned int len, void *outbuf);
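The __addr_ok() fix above replaces 1UL<<48 with 1UL<<47: x86-64 canonical addresses sign-extend bit 47, so the guest-usable low half ends at 2^47 and the old test wrongly accepted non-canonical addresses. A standalone check showing the boundary:

    #include <stdint.h>
    #include <stdio.h>

    /* User-reachable x86-64 addresses are the low canonical half: < 2^47. */
    static int addr_is_low_canonical(uint64_t addr)
    {
        return addr < (1ULL << 47);
    }

    int main(void)
    {
        printf("%d %d\n",
               addr_is_low_canonical(0x00007fffffffffffULL),  /* 1: last valid */
               addr_is_low_canonical(0x0000800000000000ULL)); /* 0: hole */
        return 0;
    }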
diff -Nru xen-4.1.3/xen/include/xen/err.h xen-4.1.5/xen/include/xen/err.h
--- xen-4.1.3/xen/include/xen/err.h	1970-01-01 01:00:00.000000000 +0100
+++ xen-4.1.5/xen/include/xen/err.h	2013-04-23 18:44:20.000000000 +0200
@@ -0,0 +1,57 @@
+#if !defined(__XEN_ERR_H__) && !defined(__ASSEMBLY__)
+#define __XEN_ERR_H__
+
+#include
+#include
+
+/*
+ * Kernel pointers have redundant information, so we can use a
+ * scheme where we can return either an error code or a dentry
+ * pointer with the same return value.
+ *
+ * This could be a per-architecture thing, to allow different
+ * error and pointer decisions.
+ */
+#define MAX_ERRNO	4095
+
+#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
+
+static inline void *__must_check ERR_PTR(long error)
+{
+	return (void *)error;
+}
+
+static inline long __must_check PTR_ERR(const void *ptr)
+{
+	return (long)ptr;
+}
+
+static inline long __must_check IS_ERR(const void *ptr)
+{
+	return IS_ERR_VALUE((unsigned long)ptr);
+}
+
+static inline long __must_check IS_ERR_OR_NULL(const void *ptr)
+{
+	return !ptr || IS_ERR_VALUE((unsigned long)ptr);
+}
+
+/**
+ * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
+ * @ptr: The pointer to cast.
+ *
+ * Explicitly cast an error-valued pointer to another pointer type in such a
+ * way as to make it clear that's what's going on.
+ */
+static inline void * __must_check ERR_CAST(const void *ptr)
+{
+	/* cast away the const */
+	return (void *)ptr;
+}
+
+static inline int __must_check PTR_RET(const void *ptr)
+{
+	return IS_ERR(ptr) ? PTR_ERR(ptr) : 0;
+}
+
+#endif /* __XEN_ERR_H__ */
diff -Nru xen-4.1.3/xen/include/xen/grant_table.h xen-4.1.5/xen/include/xen/grant_table.h
--- xen-4.1.3/xen/include/xen/grant_table.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/xen/grant_table.h	2013-04-23 18:44:20.000000000 +0200
@@ -32,7 +32,7 @@
 struct active_grant_entry {
     u32           pin;    /* Reference count information.             */
     domid_t       domid;  /* Domain being granted access.             */
-    domid_t       trans_dom;
+    struct domain *trans_domain;
     uint32_t      trans_gref;
     unsigned long frame;  /* Frame being granted.                     */
     unsigned long gfn;    /* Guest's idea of the frame being granted. */
diff -Nru xen-4.1.3/xen/include/xen/hypercall.h xen-4.1.5/xen/include/xen/hypercall.h
--- xen-4.1.3/xen/include/xen/hypercall.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/xen/hypercall.h	2013-04-23 18:44:20.000000000 +0200
@@ -121,7 +121,7 @@
 do_tmem_op(
     XEN_GUEST_HANDLE(tmem_op_t) uops);
 
-extern int
+extern long
 do_xenoprof_op(int op, XEN_GUEST_HANDLE(void) arg);
 
 #ifdef CONFIG_COMPAT
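Callers of the new xen/err.h above return either a valid pointer or a small negative errno squeezed into the same pointer value, relying on the top MAX_ERRNO addresses never being valid pointers. A compilable usage sketch with local copies of the macros (EINVAL_ is a stand-in errno value):

    #include <stdio.h>

    #define MAX_ERRNO 4095
    #define EINVAL_   22

    #define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO)

    static void *ERR_PTR(long error) { return (void *)error; }
    static long PTR_ERR(const void *ptr) { return (long)ptr; }
    static long IS_ERR(const void *ptr)
    {
        return IS_ERR_VALUE((unsigned long)ptr);
    }

    static int the_answer = 42;

    /* Either a valid pointer or an encoded errno in the same return slot. */
    static int *lookup(int key)
    {
        return key == 0 ? &the_answer : ERR_PTR(-EINVAL_);
    }

    int main(void)
    {
        int *p = lookup(1);

        if (IS_ERR(p))
            printf("error %ld\n", PTR_ERR(p));   /* prints "error -22" */
        else
            printf("value %d\n", *p);
        return 0;
    }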
diff -Nru xen-4.1.3/xen/include/xen/iocap.h xen-4.1.5/xen/include/xen/iocap.h
--- xen-4.1.3/xen/include/xen/iocap.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/xen/iocap.h	2013-04-23 18:44:20.000000000 +0200
@@ -28,4 +28,22 @@
 #define irq_access_permitted(d, i)              \
     rangeset_contains_singleton((d)->irq_caps, i)
 
+#define pirq_permit_access(d, i) ({                     \
+    struct domain *d__ = (d);                           \
+    int i__ = domain_pirq_to_irq(d__, i);               \
+    i__ > 0 ? rangeset_add_singleton(d__->irq_caps, i__)\
+            : -EINVAL;                                  \
+})
+#define pirq_deny_access(d, i) ({                       \
+    struct domain *d__ = (d);                           \
+    int i__ = domain_pirq_to_irq(d__, i);               \
+    i__ > 0 ? rangeset_remove_singleton(d__->irq_caps, i__)\
+            : -EINVAL;                                  \
+})
+#define pirq_access_permitted(d, i) ({                  \
+    struct domain *d__ = (d);                           \
+    rangeset_contains_singleton(d__->irq_caps,          \
+                                domain_pirq_to_irq(d__, i));\
+})
+
 #endif /* __XEN_IOCAP_H__ */
diff -Nru xen-4.1.3/xen/include/xen/mm.h xen-4.1.5/xen/include/xen/mm.h
--- xen-4.1.3/xen/include/xen/mm.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/xen/mm.h	2013-04-23 18:44:20.000000000 +0200
@@ -268,7 +268,8 @@
     last = list->tail;
     at = head->next;
 
-    first->list.prev = page_to_pdx(head->next);
+    ASSERT(first->list.prev == PAGE_LIST_NULL);
+    ASSERT(first->list.prev == at->list.prev);
     head->next = first;
 
     last->list.next = page_to_pdx(at);
diff -Nru xen-4.1.3/xen/include/xen/pci.h xen-4.1.5/xen/include/xen/pci.h
--- xen-4.1.3/xen/include/xen/pci.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/xen/pci.h	2013-04-23 18:44:20.000000000 +0200
@@ -57,6 +57,7 @@
     int msix_table_refcnt[MAX_MSIX_TABLE_PAGES];
     int msix_table_idx[MAX_MSIX_TABLE_PAGES];
     spinlock_t msix_table_lock;
+    domid_t msix_warned;
 
     struct domain *domain;
     const u8 bus;
@@ -126,4 +127,9 @@
 void msixtbl_pt_unregister(struct domain *d, int pirq);
 void pci_enable_acs(struct pci_dev *pdev);
 
+#define IS_SNB_GFX(id) (id == 0x01068086 || id == 0x01168086 \
+                        || id == 0x01268086 || id == 0x01028086 \
+                        || id == 0x01128086 || id == 0x01228086 \
+                        || id == 0x010A8086 )
+
 #endif /* __XEN_PCI_H__ */
diff -Nru xen-4.1.3/xen/include/xen/sched.h xen-4.1.5/xen/include/xen/sched.h
--- xen-4.1.3/xen/include/xen/sched.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/xen/sched.h	2013-04-23 18:44:20.000000000 +0200
@@ -143,6 +143,9 @@
     bool_t           defer_shutdown;
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
    bool_t           paused_for_shutdown;
+    /* VCPU need affinity restored */
+    bool_t           affinity_broken;
+
    /*
     * > 0: a single port is being polled;
@@ -165,6 +168,8 @@
     cpumask_t        cpu_affinity;
     /* Used to change affinity temporarily. */
     cpumask_t        cpu_affinity_tmp;
+    /* Used to restore affinity across S3. */
+    cpumask_t        cpu_affinity_saved;
 
     /* Bitmask of CPUs which are holding onto this VCPU's state. */
     cpumask_t        vcpu_dirty_cpumask;
@@ -619,6 +624,7 @@
 void vcpu_force_reschedule(struct vcpu *v);
 int cpu_disable_scheduler(unsigned int cpu);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
+void restore_vcpu_affinity(struct domain *d);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
 uint64_t get_cpu_idle_time(unsigned int cpu);
diff -Nru xen-4.1.3/xen/include/xen/time.h xen-4.1.5/xen/include/xen/time.h
--- xen-4.1.3/xen/include/xen/time.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/xen/time.h	2013-04-23 18:44:20.000000000 +0200
@@ -53,6 +53,8 @@
 #define MILLISECS(_ms)  ((s_time_t)((_ms) * 1000000ULL))
 #define MICROSECS(_us)  ((s_time_t)((_us) * 1000ULL))
 #define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1))
+/* Chosen so (NOW() + delta) wont overflow without an uptime of 200 years */
+#define STIME_DELTA_MAX ((s_time_t)((uint64_t)~0ull>>2))
 
 extern void update_vcpu_system_time(struct vcpu *v);
 extern void update_domain_wallclock_time(struct domain *d);
diff -Nru xen-4.1.3/xen/include/xen/tmem_xen.h xen-4.1.5/xen/include/xen/tmem_xen.h
--- xen-4.1.3/xen/include/xen/tmem_xen.h	2012-08-09 22:08:10.000000000 +0200
+++ xen-4.1.5/xen/include/xen/tmem_xen.h	2013-04-23 18:44:20.000000000 +0200
@@ -482,30 +482,39 @@
     return copy_from_guest(op, uops, 1);
 }
 
+#define tmh_cli_buf_null guest_handle_from_ptr(NULL, char)
+
 static inline void tmh_copy_to_client_buf_offset(tmem_cli_va_t clibuf, int off,
                                            char *tmembuf, int len)
 {
     copy_to_guest_offset(clibuf,off,tmembuf,len);
 }
 
+#define tmh_copy_to_client_buf(clibuf, tmembuf, cnt) \
+    copy_to_guest(guest_handle_cast(clibuf, void), tmembuf, cnt)
+
+#define tmh_client_buf_add guest_handle_add_offset
+
 #define TMH_CLI_ID_NULL ((cli_id_t)((domid_t)-1L))
 
 #define tmh_cli_id_str "domid"
 #define tmh_client_str "domain"
 
-extern int tmh_decompress_to_client(tmem_cli_mfn_t,void*,size_t,void*);
+int tmh_decompress_to_client(tmem_cli_mfn_t, void *, size_t, tmem_cli_va_t);
 
-extern int tmh_compress_from_client(tmem_cli_mfn_t,void**,size_t *,void*);
+int tmh_compress_from_client(tmem_cli_mfn_t, void **, size_t *, tmem_cli_va_t);
 
-extern int tmh_copy_from_client(pfp_t *pfp,
-    tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
-    pagesize_t pfn_offset, pagesize_t len, void *cva);
+int tmh_copy_from_client(pfp_t *, tmem_cli_mfn_t, pagesize_t tmem_offset,
+    pagesize_t pfn_offset, pagesize_t len, tmem_cli_va_t);
 
-extern int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
-    pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cva);
+int tmh_copy_to_client(tmem_cli_mfn_t, pfp_t *, pagesize_t tmem_offset,
+    pagesize_t pfn_offset, pagesize_t len, tmem_cli_va_t);
 
 extern int tmh_copy_tze_to_client(tmem_cli_mfn_t cmfn, void *tmem_va,
     pagesize_t len);
 
+#define tmh_client_err(fmt, args...)  printk(XENLOG_G_ERR fmt, ##args)
+#define tmh_client_warn(fmt, args...) printk(XENLOG_G_WARNING fmt, ##args)
+#define tmh_client_info(fmt, args...) printk(XENLOG_G_INFO fmt, ##args)
+
 #define TMEM_PERF
 #ifdef TMEM_PERF
diff -Nru xen-4.1.3/xen/Makefile xen-4.1.5/xen/Makefile
--- xen-4.1.3/xen/Makefile	2012-08-09 22:08:08.000000000 +0200
+++ xen-4.1.5/xen/Makefile	2013-04-23 18:44:20.000000000 +0200
@@ -2,7 +2,7 @@
 # All other places this is stored (eg. compile.h) should be autogenerated.
 export XEN_VERSION        = 4
 export XEN_SUBVERSION     = 1
-export XEN_EXTRAVERSION ?= .3$(XEN_VENDORVERSION)
+export XEN_EXTRAVERSION ?= .5$(XEN_VENDORVERSION)
 export XEN_FULLVERSION    = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION)
 
 -include xen-version
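The STIME_DELTA_MAX comment above ("200 years") can be sanity-checked: ~0ull>>2 is 2^62-1 nanoseconds, roughly 146 years, so clamping deltas there keeps NOW() + delta below the signed 64-bit limit until uptime itself reaches the same century-plus scale. A one-liner to verify the arithmetic:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long delta_max = ~0ull >> 2;   /* STIME_DELTA_MAX */
        double years = (double)delta_max / 1e9 / 3600 / 24 / 365.25;

        printf("max delta: %llu ns (~%.0f years)\n", delta_max, years);
        return 0;
    }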