Commit f39fc760 authored by Leonardo Arena

main/xen: security fixes

CVE-2018-19961, CVE-2018-19962, XSA-275
CVE-2018-18883, XSA-278
CVE-2018-19965, XSA-279
CVE-2018-19966, XSA-280
CVE-2018-19967, XSA-282

Fixes #9845
parent d39be709
@@ -3,7 +3,7 @@
# Maintainer: William Pitcock <nenolod@dereferenced.org>
pkgname=xen
pkgver=4.9.3
-pkgrel=0
+pkgrel=1
pkgdesc="Xen hypervisor"
url="http://www.xen.org/"
arch="x86_64 armhf aarch64"
@@ -130,6 +130,13 @@ options="!strip"
# - CVE-2018-15468 XSA-269
# - CVE-2018-15470 XSA-272
# - CVE-2018-3620 CVE-2018-3646 XSA-273
+# 4.9.3-r1:
+# - CVE-2018-19961 XSA-275
+# - CVE-2018-19962 XSA-275
+# - CVE-2018-18883 XSA-278
+# - CVE-2018-19965 XSA-279
+# - CVE-2018-19966 XSA-280
+# - CVE-2018-19967 XSA-282
case "$CARCH" in
x86*)
@@ -194,6 +201,15 @@ source="https://downloads.xenproject.org/release/$pkgname/$pkgver/$pkgname-$pkgv
xenqemu-xattr-size-max.patch
+	xsa275-4.11-1.patch
+	xsa275-4.11-2.patch
+	xsa278-4.11.patch
+	xsa282-4.9-1.patch
+	xsa282-2.patch
+	xsa279-4.9.patch
+	xsa280-4.10-2.patch
+	xsa280-4.9-1.patch
xenstored.initd
xenstored.confd
xenconsoled.initd
@@ -446,6 +462,14 @@ a3197d9c2455983554610031702ea95dc31f1b375b8c1291207d33c9e6114c6928417b4c8138cb53
e76816c6ad0e91dc5f81947f266da3429b20e6d976c3e8c41202c6179532eec878a3f0913921ef3ac853c5dbad8082da3c9cd53b65081910516feb492577b7fc xen-fd-is-file.c
69dfa60628ca838678862383528654ecbdf4269cbb5c9cfb6b84d976202a8dea85d711aa65a52fa1b477fb0b30604ca70cf1337192d6fb9388a08bbe7fe56077 xenstore_client_transaction_fix.patch
2094ea964fa610b2bf72fd2c7ede7e954899a75c0f5b08030cf1d74460fb759ade84866176e32f8fe29c921dfdc6dafd2b31e23ab9b0a3874d3dceeabdd1913b xenqemu-xattr-size-max.patch
+158054c37d9df6b3576246ecf43505fb5417febad175650dce954151ed52b8ce27729a59ac873ce1cf210e6613c8315378fb5ac9ab1667e9b844fe0d007c776d  xsa275-4.11-1.patch
+6f118663e5e5c86449e05a22a84a400bb9d1e8ef6b351cbba00fafcf52932924392c44a9664d1f3d720473cc7afbb04abff0a60ec7be75109bf355f8fe95fa59  xsa275-4.11-2.patch
+35c8c90b78856ce364cac0ddfd759aa807480bb57136e609a1462ad0f53e867a6a2acafbec5dad586d6d5159e2e377e5b6aa2ffe659d83a7684b7bb6fddba1a6  xsa278-4.11.patch
+7050af051031c499170bb42a2060678297f6e3ff5b9079b646b84a9ad137ed478fe319ba43b9bccde56b9c4a341672403458c12d2adbf8e208995b7e09a5ca14  xsa279-4.9.patch
+5eb30e29e22cf7c76a777f99e1e8035be1d6d645ddb616446a7840ef93fd4e2d2fedda9e7a3708b31e42c12b14178aa424c50b3e3f585b93052fcbc9a357f21d  xsa280-4.10-2.patch
+0517d9ab5dd0e1faef5126fbd012306da503a23d95143b232ca61aba2bf92a15ebced3c4a4b9bb3c5105a089ea7dff2059e861c80a82975372d78ecdbc32a4c4  xsa280-4.9-1.patch
+a2cb124aab729931617e10a6a34900c21ef7f846926447a8752adb343ef7bf32f3625059f25c6487df27337eee03701da9a3009154d82a2cd1c8fb4be58cbc2e  xsa282-2.patch
+d5dd53d66fc45dfccd51adf81e8864b70c6c35922479002419e6e984738f4a2695d528be4d871d9aa9f4ddf60987990580a6f8ebf0a7b99e5845984f0f36755b  xsa282-4.9-1.patch
52c43beb2596d645934d0f909f2d21f7587b6898ed5e5e7046799a8ed6d58f7a09c5809e1634fa26152f3fd4f3e7cfa07da7076f01b4a20cc8f5df8b9cb77e50 xenstored.initd
093f7fbd43faf0a16a226486a0776bade5dc1681d281c5946a3191c32d74f9699c6bf5d0ab8de9d1195a2461165d1660788e92a3156c9b3c7054d7b2d52d7ff0 xenstored.confd
3c86ed48fbee0af4051c65c4a3893f131fa66e47bf083caf20c9b6aa4b63fdead8832f84a58d0e27964bc49ec8397251b34e5be5c212c139f556916dc8da9523 xenconsoled.initd
From: Roger Pau Monné <roger.pau@citrix.com>
Subject: amd/iommu: fix flush checks

Flush checking for AMD IOMMU didn't check whether the previous entry
was present, or whether the flags (writable/readable) changed in order
to decide whether a flush should be executed.

Fix this by taking the writable/readable/next-level fields into account,
together with the present bit.

Along these lines the flushing in amd_iommu_map_page() must not be
omitted for PV domains. The comment there was simply wrong: Mappings may
very well change, both their addresses and their permissions. Ultimately
this should honor iommu_dont_flush_iotlb, but to achieve this
amd_iommu_ops first needs to gain an .iotlb_flush hook.

Also make clear_iommu_pte_present() static, to demonstrate there's no
caller omitting the (subsequent) flush.

This is part of XSA-275.

Reported-by: Paul Durrant <paul.durrant@citrix.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
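
For reference, a minimal stand-alone C sketch of the flush decision the patch
implements; the struct and field names are illustrative stand-ins rather than
the real Xen PTE layout or accessors.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative stand-in for the decoded fields of an IOMMU PDE/PTE. */
    struct pte_fields {
        bool present;
        bool readable;
        bool writable;
        unsigned int next_level;
        uint64_t maddr;        /* machine address of the mapped frame */
    };

    /*
     * A flush is only needed when a present entry is being replaced and its
     * address, permissions or next-level field actually changes -- the same
     * set of checks the fixed set_iommu_pde_present() performs.
     */
    static bool needs_flush(const struct pte_fields *old,
                            const struct pte_fields *next)
    {
        if ( !old->present )
            return false;      /* nothing was mapped, nothing to flush */

        return old->maddr != next->maddr ||
               old->writable != next->writable ||
               old->readable != next->readable ||
               old->next_level != next->next_level;
    }

    int main(void)
    {
        struct pte_fields old  = { true, true, false, 0, 0x1000 };
        struct pte_fields next = { true, true, true,  0, 0x1000 };

        /* Permission change on a present entry -> flush required. */
        printf("flush needed: %d\n", needs_flush(&old, &next));
        return 0;
    }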
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -35,7 +35,7 @@ static unsigned int pfn_to_pde_idx(unsig
return idx;
}
-void clear_iommu_pte_present(unsigned long l1_mfn, unsigned long gfn)
+static void clear_iommu_pte_present(unsigned long l1_mfn, unsigned long gfn)
{
u64 *table, *pte;
@@ -49,23 +49,42 @@ static bool_t set_iommu_pde_present(u32
unsigned int next_level,
bool_t iw, bool_t ir)
{
- u64 addr_lo, addr_hi, maddr_old, maddr_next;
+ uint64_t addr_lo, addr_hi, maddr_next;
u32 entry;
- bool_t need_flush = 0;
+ bool need_flush = false, old_present;
maddr_next = (u64)next_mfn << PAGE_SHIFT;
- addr_hi = get_field_from_reg_u32(pde[1],
- IOMMU_PTE_ADDR_HIGH_MASK,
- IOMMU_PTE_ADDR_HIGH_SHIFT);
- addr_lo = get_field_from_reg_u32(pde[0],
- IOMMU_PTE_ADDR_LOW_MASK,
- IOMMU_PTE_ADDR_LOW_SHIFT);
-
- maddr_old = (addr_hi << 32) | (addr_lo << PAGE_SHIFT);
-
- if ( maddr_old != maddr_next )
- need_flush = 1;
+ old_present = get_field_from_reg_u32(pde[0], IOMMU_PTE_PRESENT_MASK,
+ IOMMU_PTE_PRESENT_SHIFT);
+ if ( old_present )
+ {
+ bool old_r, old_w;
+ unsigned int old_level;
+ uint64_t maddr_old;
+
+ addr_hi = get_field_from_reg_u32(pde[1],
+ IOMMU_PTE_ADDR_HIGH_MASK,
+ IOMMU_PTE_ADDR_HIGH_SHIFT);
+ addr_lo = get_field_from_reg_u32(pde[0],
+ IOMMU_PTE_ADDR_LOW_MASK,
+ IOMMU_PTE_ADDR_LOW_SHIFT);
+ old_level = get_field_from_reg_u32(pde[0],
+ IOMMU_PDE_NEXT_LEVEL_MASK,
+ IOMMU_PDE_NEXT_LEVEL_SHIFT);
+ old_w = get_field_from_reg_u32(pde[1],
+ IOMMU_PTE_IO_WRITE_PERMISSION_MASK,
+ IOMMU_PTE_IO_WRITE_PERMISSION_SHIFT);
+ old_r = get_field_from_reg_u32(pde[1],
+ IOMMU_PTE_IO_READ_PERMISSION_MASK,
+ IOMMU_PTE_IO_READ_PERMISSION_SHIFT);
+
+ maddr_old = (addr_hi << 32) | (addr_lo << PAGE_SHIFT);
+
+ if ( maddr_old != maddr_next || iw != old_w || ir != old_r ||
+ old_level != next_level )
+ need_flush = true;
+ }
addr_lo = maddr_next & DMA_32BIT_MASK;
addr_hi = maddr_next >> 32;
@@ -687,10 +706,7 @@ int amd_iommu_map_page(struct domain *d,
if ( !need_flush )
goto out;
- /* 4K mapping for PV guests never changes,
- * no need to flush if we trust non-present bits */
- if ( is_hvm_domain(d) )
- amd_iommu_flush_pages(d, gfn, 0);
+ amd_iommu_flush_pages(d, gfn, 0);
for ( merge_level = IOMMU_PAGING_MODE_LEVEL_2;
merge_level <= hd->arch.paging_mode; merge_level++ )
From: Jan Beulich <jbeulich@suse.com>
Subject: AMD/IOMMU: suppress PTE merging after initial table creation

The logic is not fit for this purpose, so simply disable its use until
it can be fixed / replaced. Note that this re-enables merging for the
table creation case, which was disabled as a (perhaps unintended) side
effect of the earlier "amd/iommu: fix flush checks". It relies on no
page getting mapped more than once (with different properties) in this
process, as that would still be beyond what the merging logic can cope
with. But arch_iommu_populate_page_table() guarantees this afaict.

This is part of XSA-275.

Reported-by: Paul Durrant <paul.durrant@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
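
The resulting gating of the merge path can be summarised in a small
self-contained model; the flag and field names below are stand-ins for
hd->arch.no_merge and d->creation_finished, not the actual Xen structures.

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-alone model of the merge gating; names are illustrative. */
    struct domain_iommu_model {
        bool no_merge;            /* set once merging can no longer be trusted */
        bool creation_finished;   /* domain is past initial table population */
    };

    #define F_READABLE 1u
    #define F_WRITABLE 2u

    /* Returns true when the merge loop may run for this mapping. */
    static bool may_merge(struct domain_iommu_model *hd, bool need_flush,
                          unsigned int flags)
    {
        if ( need_flush )
            hd->no_merge = true;      /* a changed mapping: logic can't cope */

        if ( hd->no_merge || flags != (F_READABLE | F_WRITABLE) )
            return false;

        if ( hd->creation_finished )
        {
            hd->no_merge = true;      /* only merge while building the table */
            return false;
        }

        return true;
    }

    int main(void)
    {
        struct domain_iommu_model hd = { false, false };

        printf("during creation: %d\n",
               may_merge(&hd, false, F_READABLE | F_WRITABLE));
        hd.creation_finished = true;
        printf("after creation:  %d\n",
               may_merge(&hd, false, F_READABLE | F_WRITABLE));
        return 0;
    }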
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -702,11 +702,24 @@ int amd_iommu_map_page(struct domain *d,
!!(flags & IOMMUF_writable),
!!(flags & IOMMUF_readable));
- /* Do not increase pde count if io mapping has not been changed */
- if ( !need_flush )
- goto out;
+ if ( need_flush )
+ {
+ amd_iommu_flush_pages(d, gfn, 0);
+ /* No further merging, as the logic doesn't cope. */
+ hd->arch.no_merge = true;
+ }
- amd_iommu_flush_pages(d, gfn, 0);
+ /*
+ * Suppress merging of non-R/W mappings or after initial table creation,
+ * as the merge logic does not cope with this.
+ */
+ if ( hd->arch.no_merge || flags != (IOMMUF_writable | IOMMUF_readable) )
+ goto out;
+ if ( d->creation_finished )
+ {
+ hd->arch.no_merge = true;
+ goto out;
+ }
for ( merge_level = IOMMU_PAGING_MODE_LEVEL_2;
merge_level <= hd->arch.paging_mode; merge_level++ )
@@ -780,6 +793,10 @@ int amd_iommu_unmap_page(struct domain *
/* mark PTE as 'page not present' */
clear_iommu_pte_present(pt_mfn[1], gfn);
+
+ /* No further merging in amd_iommu_map_page(), as the logic doesn't cope. */
+ hd->arch.no_merge = true;
+
spin_unlock(&hd->arch.mapping_lock);
amd_iommu_flush_pages(d, gfn, 0);
--- a/xen/include/asm-x86/iommu.h
+++ b/xen/include/asm-x86/iommu.h
@@ -40,6 +40,7 @@ struct arch_iommu
/* amd iommu support */
int paging_mode;
+ bool no_merge;
struct page_info *root_table;
struct guest_iommu *g_iommu;
};
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/vvmx: Disallow the use of VT-x instructions when nested virt is disabled

c/s ac6a4500b "vvmx: set vmxon_region_pa of vcpu out of VMX operation to an
invalid address" was a real bugfix as described, but has a very subtle bug
which results in all VT-x instructions being usable by a guest.

The toolstack constructs a guest by issuing:

  XEN_DOMCTL_createdomain
  XEN_DOMCTL_max_vcpus

and optionally later, HVMOP_set_param to enable nested virt.

As a result, the call to nvmx_vcpu_initialise() in hvm_vcpu_initialise()
(which is what makes the above patch look correct during review) is actually
dead code. In practice, nvmx_vcpu_initialise() first gets called when nested
virt is enabled, which is typically never.

As a result, the zeroed memory of struct vcpu causes nvmx_vcpu_in_vmx() to
return true before nested virt is enabled for the guest.

Fixing the order of initialisation is a work in progress for other reasons,
but not viable for security backports.

A compounding factor is that the vmexit handlers for all instructions, other
than VMXON, pass 0 into vmx_inst_check_privilege()'s vmxop_check parameter,
which skips the CR4.VMXE check. (This is one of many reasons why nested virt
isn't a supported feature yet.)

However, the overall result is that when nested virt is not enabled by the
toolstack (i.e. the default configuration for all production guests), the VT-x
instructions (other than VMXON) are actually usable, and Xen very quickly
falls over the fact that the nvmx structure is uninitialised.

In order to fail safe in the supported case, re-implement all the VT-x
instruction handling using a single function with a common prologue, covering
all the checks which should cause #UD or #GP faults. This deliberately
doesn't use any state from the nvmx structure, in case there are other lurking
issues.

This is XSA-278.

Reported-by: Sergey Dyasli <sergey.dyasli@citrix.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Sergey Dyasli <sergey.dyasli@citrix.com>
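
A simplified, self-contained model of the common-prologue idea follows. The
struct and its fields are stand-ins for the real vcpu state, and the actual
prologue in the patch additionally rejects real/virtual-8086 mode via
vmx_guest_x86_mode().

    #include <stdbool.h>
    #include <stdio.h>

    /* Fail-safe checks done once, before dispatching to any individual
     * VT-x instruction handler.  All names here are illustrative. */
    enum fault { FAULT_NONE, FAULT_UD, FAULT_GP };

    struct vcpu_model {
        bool cr4_vmxe;            /* guest CR4.VMXE */
        bool nested_enabled;      /* nested virt enabled by the toolstack */
        unsigned int cpl;         /* current privilege level */
    };

    static enum fault vmx_insn_prologue(const struct vcpu_model *v)
    {
        /* Without CR4.VMXE or nested virt, every VT-x instruction gets #UD. */
        if ( !v->cr4_vmxe || !v->nested_enabled )
            return FAULT_UD;

        /* VT-x instructions are privileged: CPL > 0 gets #GP(0). */
        if ( v->cpl > 0 )
            return FAULT_GP;

        return FAULT_NONE;        /* safe to dispatch to the per-insn handler */
    }

    int main(void)
    {
        struct vcpu_model guest = { .cr4_vmxe = false,
                                    .nested_enabled = false, .cpl = 0 };

        /* Default production configuration: nested virt off -> #UD. */
        printf("fault class: %d\n", vmx_insn_prologue(&guest));
        return 0;
    }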
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index a6415f0..a4d2829 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3982,57 +3982,17 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
break;
case EXIT_REASON_VMXOFF:
- if ( nvmx_handle_vmxoff(regs) == X86EMUL_OKAY )
- update_guest_eip();
- break;
-
case EXIT_REASON_VMXON:
- if ( nvmx_handle_vmxon(regs) == X86EMUL_OKAY )
- update_guest_eip();
- break;
-
case EXIT_REASON_VMCLEAR:
- if ( nvmx_handle_vmclear(regs) == X86EMUL_OKAY )
- update_guest_eip();
- break;
-
case EXIT_REASON_VMPTRLD:
- if ( nvmx_handle_vmptrld(regs) == X86EMUL_OKAY )
- update_guest_eip();
- break;
-
case EXIT_REASON_VMPTRST:
- if ( nvmx_handle_vmptrst(regs) == X86EMUL_OKAY )
- update_guest_eip();
- break;
-
case EXIT_REASON_VMREAD:
- if ( nvmx_handle_vmread(regs) == X86EMUL_OKAY )
- update_guest_eip();
- break;
-
case EXIT_REASON_VMWRITE:
- if ( nvmx_handle_vmwrite(regs) == X86EMUL_OKAY )
- update_guest_eip();
- break;
-
case EXIT_REASON_VMLAUNCH:
- if ( nvmx_handle_vmlaunch(regs) == X86EMUL_OKAY )
- update_guest_eip();
- break;
-
case EXIT_REASON_VMRESUME:
- if ( nvmx_handle_vmresume(regs) == X86EMUL_OKAY )
- update_guest_eip();
- break;
-
case EXIT_REASON_INVEPT:
- if ( nvmx_handle_invept(regs) == X86EMUL_OKAY )
- update_guest_eip();
- break;
-
case EXIT_REASON_INVVPID:
- if ( nvmx_handle_invvpid(regs) == X86EMUL_OKAY )
+ if ( nvmx_handle_vmx_insn(regs, exit_reason) == X86EMUL_OKAY )
update_guest_eip();
break;
diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index e97db33..88cb58c 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1470,7 +1470,7 @@ void nvmx_switch_guest(void)
* VMX instructions handling
*/
-int nvmx_handle_vmxon(struct cpu_user_regs *regs)
+static int nvmx_handle_vmxon(struct cpu_user_regs *regs)
{
struct vcpu *v=current;
struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
@@ -1522,7 +1522,7 @@ int nvmx_handle_vmxon(struct cpu_user_regs *regs)
return X86EMUL_OKAY;
}
-int nvmx_handle_vmxoff(struct cpu_user_regs *regs)
+static int nvmx_handle_vmxoff(struct cpu_user_regs *regs)
{
struct vcpu *v=current;
struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
@@ -1611,7 +1611,7 @@ static int nvmx_vmresume(struct vcpu *v, struct cpu_user_regs *regs)
return X86EMUL_OKAY;
}
-int nvmx_handle_vmresume(struct cpu_user_regs *regs)
+static int nvmx_handle_vmresume(struct cpu_user_regs *regs)
{
bool_t launched;
struct vcpu *v = current;
@@ -1645,7 +1645,7 @@ int nvmx_handle_vmresume(struct cpu_user_regs *regs)
return nvmx_vmresume(v,regs);
}
-int nvmx_handle_vmlaunch(struct cpu_user_regs *regs)
+static int nvmx_handle_vmlaunch(struct cpu_user_regs *regs)
{
bool_t launched;
struct vcpu *v = current;
@@ -1688,7 +1688,7 @@ int nvmx_handle_vmlaunch(struct cpu_user_regs *regs)
return rc;
}
-int nvmx_handle_vmptrld(struct cpu_user_regs *regs)
+static int nvmx_handle_vmptrld(struct cpu_user_regs *regs)
{
struct vcpu *v = current;
struct vmx_inst_decoded decode;
@@ -1759,7 +1759,7 @@ int nvmx_handle_vmptrld(struct cpu_user_regs *regs)
return X86EMUL_OKAY;
}
-int nvmx_handle_vmptrst(struct cpu_user_regs *regs)
+static int nvmx_handle_vmptrst(struct cpu_user_regs *regs)
{
struct vcpu *v = current;
struct vmx_inst_decoded decode;
@@ -1784,7 +1784,7 @@ int nvmx_handle_vmptrst(struct cpu_user_regs *regs)
return X86EMUL_OKAY;
}
-int nvmx_handle_vmclear(struct cpu_user_regs *regs)
+static int nvmx_handle_vmclear(struct cpu_user_regs *regs)
{
struct vcpu *v = current;
struct vmx_inst_decoded decode;
@@ -1836,7 +1836,7 @@ int nvmx_handle_vmclear(struct cpu_user_regs *regs)
return X86EMUL_OKAY;
}
-int nvmx_handle_vmread(struct cpu_user_regs *regs)
+static int nvmx_handle_vmread(struct cpu_user_regs *regs)
{
struct vcpu *v = current;
struct vmx_inst_decoded decode;
@@ -1878,7 +1878,7 @@ int nvmx_handle_vmread(struct cpu_user_regs *regs)
return X86EMUL_OKAY;
}
-int nvmx_handle_vmwrite(struct cpu_user_regs *regs)
+static int nvmx_handle_vmwrite(struct cpu_user_regs *regs)
{
struct vcpu *v = current;
struct vmx_inst_decoded decode;
@@ -1926,7 +1926,7 @@ int nvmx_handle_vmwrite(struct cpu_user_regs *regs)
return X86EMUL_OKAY;
}
-int nvmx_handle_invept(struct cpu_user_regs *regs)
+static int nvmx_handle_invept(struct cpu_user_regs *regs)
{
struct vmx_inst_decoded decode;
unsigned long eptp;
@@ -1954,7 +1954,7 @@ int nvmx_handle_invept(struct cpu_user_regs *regs)
return X86EMUL_OKAY;
}
-int nvmx_handle_invvpid(struct cpu_user_regs *regs)
+static int nvmx_handle_invvpid(struct cpu_user_regs *regs)
{
struct vmx_inst_decoded decode;
unsigned long vpid;
@@ -1980,6 +1980,81 @@ int nvmx_handle_invvpid(struct cpu_user_regs *regs)
return X86EMUL_OKAY;
}
+int nvmx_handle_vmx_insn(struct cpu_user_regs *regs, unsigned int exit_reason)
+{
+ struct vcpu *curr = current;
+ int ret;
+
+ if ( !(curr->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VMXE) ||
+ !nestedhvm_enabled(curr->domain) ||
+ (vmx_guest_x86_mode(curr) < (hvm_long_mode_active(curr) ? 8 : 2)) )
+ {
+ hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
+ return X86EMUL_EXCEPTION;
+ }
+
+ if ( vmx_get_cpl() > 0 )
+ {
+ hvm_inject_hw_exception(TRAP_gp_fault, 0);
+ return X86EMUL_EXCEPTION;
+ }
+
+ switch ( exit_reason )
+ {
+ case EXIT_REASON_VMXOFF:
+ ret = nvmx_handle_vmxoff(regs);
+ break;
+
+ case EXIT_REASON_VMXON:
+ ret = nvmx_handle_vmxon(regs);
+ break;
+
+ case EXIT_REASON_VMCLEAR:
+ ret = nvmx_handle_vmclear(regs);
+ break;
+
+ case EXIT_REASON_VMPTRLD:
+ ret = nvmx_handle_vmptrld(regs);
+ break;
+
+ case EXIT_REASON_VMPTRST:
+ ret = nvmx_handle_vmptrst(regs);
+ break;
+
+ case EXIT_REASON_VMREAD:
+ ret = nvmx_handle_vmread(regs);
+ break;
+
+ case EXIT_REASON_VMWRITE:
+ ret = nvmx_handle_vmwrite(regs);
+ break;
+
+ case EXIT_REASON_VMLAUNCH:
+ ret = nvmx_handle_vmlaunch(regs);
+ break;
+
+ case EXIT_REASON_VMRESUME:
+ ret = nvmx_handle_vmresume(regs);
+ break;
+
+ case EXIT_REASON_INVEPT:
+ ret = nvmx_handle_invept(regs);
+ break;
+
+ case EXIT_REASON_INVVPID:
+ ret = nvmx_handle_invvpid(regs);
+ break;
+
+ default:
+ ASSERT_UNREACHABLE();
+ domain_crash(curr->domain);
+ ret = X86EMUL_UNHANDLEABLE;
+ break;
+ }
+
+ return ret;
+}
+
#define __emul_value(enable1, default1) \
((enable1 | default1) << 32 | (default1))
diff --git a/xen/include/asm-x86/hvm/vmx/vvmx.h b/xen/include/asm-x86/hvm/vmx/vvmx.h
index 9ea35eb..fc4a8d1 100644
--- a/xen/include/asm-x86/hvm/vmx/vvmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vvmx.h
@@ -94,9 +94,6 @@ void nvmx_domain_relinquish_resources(struct domain *d);
bool_t nvmx_ept_enabled(struct vcpu *v);
-int nvmx_handle_vmxon(struct cpu_user_regs *regs);
-int nvmx_handle_vmxoff(struct cpu_user_regs *regs);
-
#define EPT_TRANSLATE_SUCCEED 0
#define EPT_TRANSLATE_VIOLATION 1
#define EPT_TRANSLATE_MISCONFIG 2
@@ -191,15 +188,7 @@ enum vmx_insn_errno set_vvmcs_real_safe(const struct vcpu *, u32 encoding,
uint64_t get_shadow_eptp(struct vcpu *v);
void nvmx_destroy_vmcs(struct vcpu *v);
-int nvmx_handle_vmptrld(struct cpu_user_regs *regs);
-int nvmx_handle_vmptrst(struct cpu_user_regs *regs);
-int nvmx_handle_vmclear(struct cpu_user_regs *regs);
-int nvmx_handle_vmread(struct cpu_user_regs *regs);
-int nvmx_handle_vmwrite(struct cpu_user_regs *regs);
-int nvmx_handle_vmresume(struct cpu_user_regs *regs);
-int nvmx_handle_vmlaunch(struct cpu_user_regs *regs);
-int nvmx_handle_invept(struct cpu_user_regs *regs);
-int nvmx_handle_invvpid(struct cpu_user_regs *regs);
+int nvmx_handle_vmx_insn(struct cpu_user_regs *regs, unsigned int exit_reason);
int nvmx_msr_read_intercept(unsigned int msr,
u64 *msr_content);
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/mm: Don't perform flush after failing to update a guest's L1e

If the L1e update hasn't occurred, the flush cannot do anything useful. This
skips the potentially expensive vcpumask_to_pcpumask() conversion, and
broadcast TLB shootdown.

More importantly however, we might be in the error path due to a bad va
parameter from the guest, and this should not propagate into the TLB flushing
logic. The INVPCID instruction for example raises #GP for a non-canonical
address.

This is XSA-279.

Reported-by: Matthew Daley <mattd@bugfuzz.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
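
A minimal sketch of the control flow the patch enforces, with hypothetical
helper names standing in for the __do_update_va_mapping() internals.

    #include <errno.h>
    #include <stdio.h>

    /* If updating the L1e failed, return before any flush handling, so a
     * guest-controlled (possibly non-canonical) va never reaches the
     * TLB-flush logic.  Names below are stand-ins, not the Xen functions. */
    static int update_l1e(unsigned long va)
    {
        return va ? 0 : -EINVAL;        /* pretend va == 0 is a bad update */
    }

    static int do_update_va_mapping(unsigned long va, unsigned int flush_flags)
    {
        int rc = update_l1e(va);

        if ( rc )
            return rc;                  /* nothing changed: skip the flush */

        if ( flush_flags )
        {
            /* ... local or global TLB flush would happen here ... */
        }

        return 0;
    }

    int main(void)
    {
        printf("bad va:  %d\n", do_update_va_mapping(0, 1));      /* no flush */
        printf("good va: %d\n", do_update_va_mapping(0x1000, 1)); /* flushes  */
        return 0;
    }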
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4894,6 +4894,14 @@ static int __do_update_va_mapping(
if ( pl1e )
guest_unmap_l1e(pl1e);
+ /*
+ * Any error at this point means that we haven't changed the l1e. Skip the
+ * flush, as it won't do anything useful. Furthermore, va is guest
+ * controlled and not necessarily audited by this point.
+ */
+ if ( rc )
+ return rc;
+
switch ( flags & UVMF_FLUSHTYPE_MASK )
{
case UVMF_TLB_FLUSH:
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/shadow: shrink struct page_info's shadow_flags to 16 bits

This is to avoid it overlapping the linear_pt_count field needed for PV
domains. Introduce a separate, HVM-only pagetable_dying field to replace
the sole one left in the upper 16 bits.

Note that the accesses to ->shadow_flags in shadow_{pro,de}mote() get
switched to non-atomic, non-bitops operations, as {test,set,clear}_bit()
are not allowed on uint16_t fields and hence their use would have
required ugly casts. This is fine because all updates of the field ought
to occur with the paging lock held, and other updates of it use |= and
&= as well (i.e. using atomic operations here didn't really guard
against potentially racing updates elsewhere).

This is part of XSA-280.

Reported-by: Prgmr.com Security <security@prgmr.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Tim Deegan <tim@xen.org>
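
The layout change can be illustrated with a stand-alone model; this mirrors
the idea only and is not the real struct page_info, which nests these fields
inside a larger union.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* A 32-bit shadow_flags becomes a 16-bit field plus a separate bool,
     * so shadow state no longer spills into space that PV guests use for
     * linear_pt_count. */
    struct shadow_state_old {
        uint32_t shadow_flags;          /* bit 31 doubled as "pagetable dying" */
    };

    struct shadow_state_new {
        uint16_t shadow_flags;          /* SHF_* type bits only */
        bool     pagetable_dying;       /* HVM-only, formerly bit 31 above */
    };

    int main(void)
    {
        struct shadow_state_new s = { .shadow_flags = 0,
                                      .pagetable_dying = true };

        printf("old: %zu bytes, new: %zu bytes, dying: %d\n",
               sizeof(struct shadow_state_old), sizeof s, s.pagetable_dying);
        return 0;
    }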
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -957,10 +957,14 @@ void shadow_promote(struct domain *d, mf
/* Is the page already shadowed? */
if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
+ {
page->shadow_flags = 0;
+ if ( is_hvm_domain(d) )
+ page->pagetable_dying = false;
+ }
- ASSERT(!test_bit(type, &page->shadow_flags));
- set_bit(type, &page->shadow_flags);
+ ASSERT(!(page->shadow_flags & (1u << type)));
+ page->shadow_flags |= 1u << type;
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE);
}
@@ -969,9 +973,9 @@ void shadow_demote(struct domain *d, mfn
struct page_info *page = mfn_to_page(gmfn);
ASSERT(test_bit(_PGC_page_table, &page->count_info));
- ASSERT(test_bit(type, &page->shadow_flags));
+ ASSERT(page->shadow_flags & (1u << type));
- clear_bit(type, &page->shadow_flags);
+ page->shadow_flags &= ~(1u << type);
if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
{
@@ -2801,7 +2805,7 @@ void sh_remove_shadows(struct domain *d,
if ( !fast && all && (pg->count_info & PGC_page_table) )
{
SHADOW_ERROR("can't find all shadows of mfn %"PRI_mfn" "
- "(shadow_flags=%08x)\n",
+ "(shadow_flags=%04x)\n",
mfn_x(gmfn), pg->shadow_flags);
domain_crash(d);
}
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3328,8 +3328,8 @@ static int sh_page_fault(struct vcpu *v,
/* Unshadow if we are writing to a toplevel pagetable that is
* flagged as a dying process, and that is not currently used. */
- if ( sh_mfn_is_a_page_table(gmfn)
- && (mfn_to_page(gmfn)->shadow_flags & SHF_pagetable_dying) )
+ if ( sh_mfn_is_a_page_table(gmfn) && is_hvm_domain(d) &&
+ mfn_to_page(gmfn)->pagetable_dying )
{
int used = 0;
struct vcpu *tmp;
@@ -4301,9 +4301,9 @@ int sh_rm_write_access_from_sl1p(struct
ASSERT(mfn_valid(smfn));
/* Remember if we've been told that this process is being torn down */
- if ( curr->domain == d )
+ if ( curr->domain == d && is_hvm_domain(d) )
curr->arch.paging.shadow.pagetable_dying
- = !!(mfn_to_page(gmfn)->shadow_flags & SHF_pagetable_dying);
+ = mfn_to_page(gmfn)->pagetable_dying;
sp = mfn_to_page(smfn);
@@ -4619,10 +4619,10 @@ static void sh_pagetable_dying(struct vc
: shadow_hash_lookup(d, mfn_x(gmfn), SH_type_l2_pae_shadow);
}
- if ( mfn_valid(smfn) )
+ if ( mfn_valid(smfn) && is_hvm_domain(d) )
{
gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
- mfn_to_page(gmfn)->shadow_flags |= SHF_pagetable_dying;
+ mfn_to_page(gmfn)->pagetable_dying = true;
shadow_unhook_mappings(d, smfn, 1/* user pages only */);
flush = 1;
}
@@ -4659,9 +4659,9 @@ static void sh_pagetable_dying(struct vc
smfn = shadow_hash_lookup(d, mfn_x(gmfn), SH_type_l4_64_shadow);
#endif
- if ( mfn_valid(smfn) )
+ if ( mfn_valid(smfn) && is_hvm_domain(d) )
{
- mfn_to_page(gmfn)->shadow_flags |= SHF_pagetable_dying;
+ mfn_to_page(gmfn)->pagetable_dying = true;
shadow_unhook_mappings(d, smfn, 1/* user pages only */);
/* Now flush the TLB: we removed toplevel mappings. */
flush_tlb_mask(d->domain_dirty_cpumask);
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -292,8 +292,6 @@ static inline void sh_terminate_list(str
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
-#define SHF_pagetable_dying (1u<<31)
-
static inline int sh_page_has_multiple_shadows(struct page_info *pg)
{
u32 shadows;
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -188,8 +188,15 @@ struct page_info
* Guest pages with a shadow. This does not conflict with
* tlbflush_timestamp since page table pages are explicitly not
* tracked for TLB-flush avoidance when a guest runs in shadow mode.
+ *
+ * pagetable_dying is used for HVM domains only. The layout here has
+ * to avoid re-use of the space used by linear_pt_count, which (only)
+ * PV guests use.
*/
- u32 shadow_flags;
+ struct {
+ uint16_t shadow_flags;
+ bool pagetable_dying;
+ };
/* When in use as a shadow, next shadow in this hash chain. */
__pdx_t next_shadow;
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/shadow: move OOS flag bit positions

In preparation for reducing struct page_info's shadow_flags field to 16
bits, lower the bit positions used for SHF_out_of_sync and
SHF_oos_may_write.

Instead of also adjusting the open-coded use in _get_page_type(),
introduce shadow_prepare_page_type_change() to contain knowledge of the
bit positions to shadow code.

This is part of XSA-280.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Tim Deegan <tim@xen.org>
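
A stand-alone model of the encapsulation being introduced; the flag value and
helper body below are illustrative stand-ins, not the real
shadow_prepare_page_type_change().

    #include <stdint.h>
    #include <stdio.h>

    /* Generic MM code stops open-coding shadow flag bit positions and
     * instead calls a shadow-side helper that knows the layout. */
    #define SHF_OOS            (1u << 12)   /* hypothetical out-of-sync flag */
    #define PGT_WRITABLE_PAGE  1u

    static void remove_all_shadows(uint16_t *shadow_flags)
    {
        *shadow_flags = 0;                  /* model: drop every shadow */
    }

    /* Only shadow code needs to know which bit means "out of sync". */
    static void prepare_page_type_change(uint16_t *shadow_flags,
                                         unsigned int type)
    {
        /* Out-of-sync shadowed pages may legitimately become writable;
         * any other type change causes the shadows to be dropped. */
        if ( (*shadow_flags & SHF_OOS) && type == PGT_WRITABLE_PAGE )
            return;
        remove_all_shadows(shadow_flags);
    }

    int main(void)
    {
        uint16_t flags = SHF_OOS;

        prepare_page_type_change(&flags, PGT_WRITABLE_PAGE);
        printf("flags after writable transition: %#x\n", (unsigned int)flags);
        return 0;
    }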
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2799,15 +2799,8 @@ static int __get_page_type(struct page_i
{
struct domain *d = page_get_owner(page);
- /* Normally we should never let a page go from type count 0
- * to type count 1 when it is shadowed. One exception:
- * out-of-sync shadowed pages are allowed to become
- * writeable. */
- if ( d && shadow_mode_enabled(d)
- && (page->count_info & PGC_page_table)
- && !((page->shadow_flags & (1u<<29))
- && type == PGT_writable_page) )
- shadow_remove_all_shadows(d, _mfn(page_to_mfn(page)));
+ if ( d && shadow_mode_enabled(d) )
+ shadow_prepare_page_type_change(d, page, type);
ASSERT(!(x & PGT_pae_xen_l2));
if ( (x & PGT_type_mask) != type )
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -919,6 +919,9 @@ int sh_unsync(struct vcpu *v, mfn_t gmfn
|| !v->domain->arch.paging.shadow.oos_active )
return 0;