Commit a977efc9 authored by Daniel Sabogal, committed by Natanael Copa

main/xen: security fixes (xsa237 - xsa244)

CVE-2017-15590 XSA-237
XSA-238
CVE-2017-15589 XSA-239
CVE-2017-15595 XSA-240
CVE-2017-15588 XSA-241
CVE-2017-15593 XSA-242
CVE-2017-15592 XSA-243
CVE-2017-15594 XSA-244
parent 18a6777d
@@ -3,7 +3,7 @@
 # Maintainer: William Pitcock <nenolod@dereferenced.org>
 pkgname=xen
 pkgver=4.9.0
-pkgrel=5
+pkgrel=6
 pkgdesc="Xen hypervisor"
 url="http://www.xen.org/"
 arch="x86_64 armhf aarch64"
@@ -87,6 +87,15 @@ options="!strip"
 # - CVE-2017-14319 XSA-234
 # 4.9.0-r5:
 # - XSA-245
+# 4.9.0-r6:
+# - CVE-2017-15590 XSA-237
+# - XSA-238
+# - CVE-2017-15589 XSA-239
+# - CVE-2017-15595 XSA-240
+# - CVE-2017-15588 XSA-241
+# - CVE-2017-15593 XSA-242
+# - CVE-2017-15592 XSA-243
+# - CVE-2017-15594 XSA-244
 case "$CARCH" in
 x86*)
@@ -144,6 +153,19 @@ source="https://downloads.xenproject.org/release/$pkgname/$pkgver/$pkgname-$pkgv
 xsa233.patch
 xsa234-4.9.patch
 xsa235-4.9.patch
+xsa237-1.patch
+xsa237-2.patch
+xsa237-3.patch
+xsa237-4.patch
+xsa237-5.patch
+xsa238.patch
+xsa239.patch
+xsa240-1.patch
+xsa240-2.patch
+xsa241-4.9.patch
+xsa242-4.9.patch
+xsa243.patch
+xsa244.patch
 xsa245-1.patch
 xsa245-2.patch
@@ -408,6 +430,19 @@ fb742225a4f3dbf2a574c4a6e3ef61a5da0c91aaeed77a2247023bdefcd4e0b6c08f1c9ffb42eaac
 a322ac6c5ac2f858a59096108032fd42974eaaeeebd8f4966119149665f32bed281e333e743136e79add2e6f3844d88b6a3e4d5a685c2808702fd3a9e6396cd4 xsa233.patch
 cafeef137cd82cefc3e974b42b974c6562e822c9b359efb654ac374e663d9fc123be210eec17b278f40eabb77c93d3bf0ff03e445607159ad0712808a609a906 xsa234-4.9.patch
 8bab6e59577b51f0c6b8a547c9a37a257bd0460e7219512e899d25f80a74084745d2a4c54e55ad12526663d40f218cb8f833b71350220d36e3750d002ff43d29 xsa235-4.9.patch
+a447b4f0a5379da46b5f0eb5b77eab07c3cfe8d303af6e116e03c7d88a9fc9ea154043165631d29248c07516ab8fdfd5de4da1ccf0ab7358d90fb7f9c87bf221 xsa237-1.patch
+10f2d84f783fb8bae5a39c463a32f4ac5d4d2614a7eecf109dcccd5418b8ec5e523691e79b3578d9c7b113f368a94d360acb9534808c440852a91c36369f88fd xsa237-2.patch
+50607fca2e02eed322927e0288c77e7a6c541794fa2c70c78ada0c2fa762b5ad0f3b5108ecb9f01d8826f89dab492d56c502236c70234e6ba741e94a39356ea3 xsa237-3.patch
+c8aab545cd4118e74cbb2010f0a844e88608d44bd3117e765221445a8007a0dfa9ee0f4d5c10c3cd7f307a8b7718d896b928c4e06449dc63f7cfec7a10d356bb xsa237-4.patch
+d3dfb9208deb6fc2be949e34fc1be84fbed1269bd99b8b31c176da34cc916c50955c7768b4aac75dbe4377634ae5d8390a03e0ff5ffa2ad00b9fab788449deaa xsa237-5.patch
+b154c0925bbceab40e8f3b689e2d1fb321b42c685fdcb6bd29b0411ccd856731480a2fbb8025c633f9edf34cec938e5d8888cc71e8158212c078bb595d07a29d xsa238.patch
+8b09cd12c7adfef69a02a2965cda22ef6499fd42c8a84a20a6af231f422a6e8a0e597501c327532e1580c1067ee4bf35579e3cf98dee9302ed34ba87f74bf6d2 xsa239.patch
+e209e629757b3471eae415913c34c662882172daad634083ee29823c2cb3f00e98886352085c1afc5d0f622781e65fae7b41ebfcbe6fd5e37c428337f5d2506c xsa240-1.patch
+344519cd83ad13245de0e183b7afe564597b30d20756e44f98c0a00df55020d7ef85b92e71701c9791842a48cec93e0fcb9bfba2443313b3aafd8d21ea36abf4 xsa240-2.patch
+560d8062b5683b6533a67eebafdd81e6a9d2c9194cc9f9b1404544503238d4f1d98bccb1afac477f6a55ffbc67cf9629a43fd67a745ca9093e2adc0478dd0ddb xsa241-4.9.patch
+86aa763949ca36a36120a40eafbdf3a8e8bc04acd32ee6bc1e3ae90b189b86b9b166b81a9e0a4f86a7eb1fcc8723ae8ba6bd0f84fa9732e7e4e1ccea45d0b7c1 xsa242-4.9.patch
+9f269e262aa67ff9a304ed6fc64ee9c5c9f6fd606d520fc2614cd173cddc9735ad42f91a97b91f1b9c5368d54d514820937edd0ce302dc3839b426398dc6b700 xsa243.patch
+0fd35e74be6f049f1f376aa8295b14f57b92f5e45e7487e5b485c2b8f6faa2950d0fe7d8a863692b3dab8a3a7ef1d9dd94be2c6b55d01802b0d86c84d2fa9e29 xsa244.patch
 b19197934e8685fc2af73f404b5c8cbed66d9241e5ff902d1a77fdc227e001a13b775a53d6e303d5f27419f5590561c84ec69409152d9773a5e6050c16e92f1b xsa245-1.patch
 75369673232b2107b59dc0242d6fc224c016b9dcbf3299eab90a1d7c365d617fbc91f7b25075b394fee92782db37ce83c416387fa5ad4c4fcd51d0775a8a754f xsa245-2.patch
 c3c46f232f0bd9f767b232af7e8ce910a6166b126bd5427bb8dc325aeb2c634b956de3fc225cab5af72649070c8205cc8e1cab7689fc266c204f525086f1a562 qemu-coroutine-gthread.patch
...
From: Jan Beulich <jbeulich@suse.com>
Subject: x86: don't allow MSI pIRQ mapping on unowned device
MSI setup should be permitted only for existing devices owned by the
respective guest (the operation may still be carried out by the domain
controlling that guest).
This is part of XSA-237.
Reported-by: HW42 <hw42@ipsumj.de>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1963,7 +1963,10 @@ int map_domain_pirq(
if ( !cpu_has_apic )
goto done;
- pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
+ pdev = pci_get_pdev_by_domain(d, msi->seg, msi->bus, msi->devfn);
+ if ( !pdev )
+ goto done;
+
ret = pci_enable_msi(msi, &msi_desc);
if ( ret )
{
From: Jan Beulich <jbeulich@suse.com>
Subject: x86: enforce proper privilege when (un)mapping pIRQ-s
(Un)mapping of IRQs, just like other RESOURCE__ADD* / RESOURCE__REMOVE*
actions (in FLASK terms) should be XSM_DM_PRIV rather than XSM_TARGET.
This in turn requires bypassing the XSM check in physdev_unmap_pirq()
for the HVM emuirq case just like is being done in physdev_map_pirq().
The primary goal, security-wise, however, is to no longer allow HVM
guests, by specifying their own domain ID instead of DOMID_SELF, to
enter code paths intended only for PV guests and for the control domains
of HVM guests.
This is part of XSA-237.
Reported-by: HW42 <hw42@ipsumj.de>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -111,7 +111,7 @@ int physdev_map_pirq(domid_t domid, int
if ( d == NULL )
return -ESRCH;
- ret = xsm_map_domain_pirq(XSM_TARGET, d);
+ ret = xsm_map_domain_pirq(XSM_DM_PRIV, d);
if ( ret )
goto free_domain;
@@ -256,13 +256,14 @@ int physdev_map_pirq(domid_t domid, int
int physdev_unmap_pirq(domid_t domid, int pirq)
{
struct domain *d;
- int ret;
+ int ret = 0;
d = rcu_lock_domain_by_any_id(domid);
if ( d == NULL )
return -ESRCH;
- ret = xsm_unmap_domain_pirq(XSM_TARGET, d);
+ if ( domid != DOMID_SELF || !is_hvm_domain(d) || !has_pirq(d) )
+ ret = xsm_unmap_domain_pirq(XSM_DM_PRIV, d);
if ( ret )
goto free_domain;
--- a/xen/include/xsm/dummy.h
+++ b/xen/include/xsm/dummy.h
@@ -453,7 +453,7 @@ static XSM_INLINE char *xsm_show_irq_sid
static XSM_INLINE int xsm_map_domain_pirq(XSM_DEFAULT_ARG struct domain *d)
{
- XSM_ASSERT_ACTION(XSM_TARGET);
+ XSM_ASSERT_ACTION(XSM_DM_PRIV);
return xsm_default_action(action, current->domain, d);
}
@@ -465,7 +465,7 @@ static XSM_INLINE int xsm_map_domain_irq
static XSM_INLINE int xsm_unmap_domain_pirq(XSM_DEFAULT_ARG struct domain *d)
{
- XSM_ASSERT_ACTION(XSM_TARGET);
+ XSM_ASSERT_ACTION(XSM_DM_PRIV);
return xsm_default_action(action, current->domain, d);
}
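
The hunks above only swap XSM_TARGET for XSM_DM_PRIV, so the standalone C
sketch below spells out what that changes. It is an editorial illustration
paraphrasing the dummy policy in xen/include/xsm/dummy.h, not part of this
commit, and its names, types and return values are simplified: XSM_TARGET
lets the target domain act on itself, while XSM_DM_PRIV admits only the
target's device model or the control domain, which is exactly the
distinction that stops an HVM guest from reaching these paths by passing
its own domain ID.

/* Simplified, illustrative paraphrase of Xen's dummy XSM default action;
 * not the real implementation. */
#include <stdbool.h>
#include <stdio.h>

typedef enum { XSM_HOOK, XSM_TARGET, XSM_DM_PRIV, XSM_PRIV } xsm_default_t;

struct domain {
    int id;
    bool is_control;           /* dom0 / the control domain */
    struct domain *target;     /* domain this one acts as device model for */
};

static int xsm_default_action(xsm_default_t action,
                              struct domain *src, struct domain *tgt)
{
    switch ( action )
    {
    case XSM_HOOK:                       /* always allowed */
        return 0;
    case XSM_TARGET:
        if ( src == tgt )                /* the target domain itself */
            return 0;
        /* fall through */
    case XSM_DM_PRIV:
        if ( tgt && src->target == tgt ) /* the target's device model */
            return 0;
        /* fall through */
    case XSM_PRIV:
        if ( src->is_control )           /* the control domain */
            return 0;
        return -1;                       /* -EPERM in the real code */
    }
    return -1;
}

int main(void)
{
    struct domain dom0 = { 0, true, NULL };
    struct domain hvm  = { 7, false, NULL };

    /* An HVM guest naming itself: allowed with XSM_TARGET, refused with XSM_DM_PRIV. */
    printf("XSM_TARGET : %d\n", xsm_default_action(XSM_TARGET, &hvm, &hvm));
    printf("XSM_DM_PRIV: %d\n", xsm_default_action(XSM_DM_PRIV, &hvm, &hvm));
    /* The control domain may still (un)map pIRQs for the guest. */
    printf("dom0       : %d\n", xsm_default_action(XSM_DM_PRIV, &dom0, &hvm));
    return 0;
}
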
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/MSI: disallow redundant enabling
At the moment, Xen attempts to allow redundant enabling of MSI by
having pci_enable_msi() return 0, and point to the existing MSI
descriptor, when the MSI already exists.
Unfortunately, if subsequent errors are encountered, the cleanup
paths assume pci_enable_msi() had done full initialization, and
hence undo everything that was assumed to be done by that
function without also undoing other setup that would normally
occur only after that function was called (in map_domain_pirq()
itself).
Rather than try to make the redundant enabling case work properly, just
forbid it entirely by having pci_enable_msi() return -EEXIST when MSI
is already set up.
This is part of XSA-237.
Reported-by: HW42 <hw42@ipsumj.de>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -1050,11 +1050,10 @@ static int __pci_enable_msi(struct msi_i
old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSI);
if ( old_desc )
{
- printk(XENLOG_WARNING "irq %d already mapped to MSI on %04x:%02x:%02x.%u\n",
+ printk(XENLOG_ERR "irq %d already mapped to MSI on %04x:%02x:%02x.%u\n",
msi->irq, msi->seg, msi->bus,
PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
- *desc = old_desc;
- return 0;
+ return -EEXIST;
}
old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX);
@@ -1118,11 +1117,10 @@ static int __pci_enable_msix(struct msi_
old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX);
if ( old_desc )
{
- printk(XENLOG_WARNING "irq %d already mapped to MSI-X on %04x:%02x:%02x.%u\n",
+ printk(XENLOG_ERR "irq %d already mapped to MSI-X on %04x:%02x:%02x.%u\n",
msi->irq, msi->seg, msi->bus,
PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
- *desc = old_desc;
- return 0;
+ return -EEXIST;
}
old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI);
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/IRQ: conditionally preserve irq <-> pirq mapping on map error paths
Mappings that had been set up before should not be torn down when
handling unrelated errors.
This is part of XSA-237.
Reported-by: HW42 <hw42@ipsumj.de>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1251,7 +1251,8 @@ static int prepare_domain_irq_pirq(struc
return -ENOMEM;
}
*pinfo = info;
- return 0;
+
+ return !!err;
}
static void set_domain_irq_pirq(struct domain *d, int irq, struct pirq *pirq)
@@ -1294,7 +1295,10 @@ int init_domain_irq_mapping(struct domai
continue;
err = prepare_domain_irq_pirq(d, i, i, &info);
if ( err )
+ {
+ ASSERT(err < 0);
break;
+ }
set_domain_irq_pirq(d, i, info);
}
@@ -1902,6 +1906,7 @@ int map_domain_pirq(
struct pirq *info;
struct irq_desc *desc;
unsigned long flags;
+ DECLARE_BITMAP(prepared, MAX_MSI_IRQS) = {};
ASSERT(spin_is_locked(&d->event_lock));
@@ -1945,8 +1950,10 @@ int map_domain_pirq(
}
ret = prepare_domain_irq_pirq(d, irq, pirq, &info);
- if ( ret )
+ if ( ret < 0 )
goto revoke;
+ if ( !ret )
+ __set_bit(0, prepared);
desc = irq_to_desc(irq);
@@ -2018,8 +2025,10 @@ int map_domain_pirq(
irq = create_irq(NUMA_NO_NODE);
ret = irq >= 0 ? prepare_domain_irq_pirq(d, irq, pirq + nr, &info)
: irq;
- if ( ret )
+ if ( ret < 0 )
break;
+ if ( !ret )
+ __set_bit(nr, prepared);
msi_desc[nr].irq = irq;
if ( irq_permit_access(d, irq) != 0 )
@@ -2052,15 +2061,15 @@ int map_domain_pirq(
desc->msi_desc = NULL;
spin_unlock_irqrestore(&desc->lock, flags);
}
- while ( nr-- )
+ while ( nr )
{
if ( irq >= 0 && irq_deny_access(d, irq) )
printk(XENLOG_G_ERR
"dom%d: could not revoke access to IRQ%d (pirq %d)\n",
d->domain_id, irq, pirq);
- if ( info )
+ if ( info && test_bit(nr, prepared) )
cleanup_domain_irq_pirq(d, irq, info);
- info = pirq_info(d, pirq + nr);
+ info = pirq_info(d, pirq + --nr);
irq = info->arch.irq;
}
msi_desc->irq = -1;
@@ -2076,12 +2085,14 @@ int map_domain_pirq(
spin_lock_irqsave(&desc->lock, flags);
set_domain_irq_pirq(d, irq, info);
spin_unlock_irqrestore(&desc->lock, flags);
+ ret = 0;
}
done:
if ( ret )
{
- cleanup_domain_irq_pirq(d, irq, info);
+ if ( test_bit(0, prepared) )
+ cleanup_domain_irq_pirq(d, irq, info);
revoke:
if ( irq_deny_access(d, irq) )
printk(XENLOG_G_ERR
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -186,7 +186,7 @@ int physdev_map_pirq(domid_t domid, int
}
else if ( type == MAP_PIRQ_TYPE_MULTI_MSI )
{
- if ( msi->entry_nr <= 0 || msi->entry_nr > 32 )
+ if ( msi->entry_nr <= 0 || msi->entry_nr > MAX_MSI_IRQS )
ret = -EDOM;
else if ( msi->entry_nr != 1 && !iommu_intremap )
ret = -EOPNOTSUPP;
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -56,6 +56,8 @@
/* MAX fixed pages reserved for mapping MSIX tables. */
#define FIX_MSIX_MAX_PAGES 512
+#define MAX_MSI_IRQS 32 /* limited by MSI capability struct properties */
+
struct msi_info {
u16 seg;
u8 bus;
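
The DECLARE_BITMAP(prepared, MAX_MSI_IRQS) added to map_domain_pirq() above
follows a general pattern: record which sub-steps this particular call
actually prepared, and on error undo only those, so mappings that existed
before the call survive an unrelated failure. Below is a small,
self-contained C sketch of that pattern; it is illustrative only, and the
helpers, the plain bool array and the simulated failure are inventions, not
Xen code.

/* Illustrative sketch of "clean up only what this call prepared"; not Xen code. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_ENTRIES 32                 /* stands in for MAX_MSI_IRQS */

static bool mapped[MAX_ENTRIES];       /* pre-existing global state */

/* Returns 1 if the entry was already set up before this call, 0 if we set it up. */
static int prepare_entry(int i)
{
    if ( mapped[i] )
        return 1;
    mapped[i] = true;
    return 0;
}

static void cleanup_entry(int i)
{
    mapped[i] = false;
}

static int map_range(int nr, int fail_at)
{
    bool prepared[MAX_ENTRIES] = { false };
    int i;

    for ( i = 0; i < nr; i++ )
    {
        if ( i == fail_at )            /* simulate an unrelated later error */
            goto undo;
        if ( !prepare_entry(i) )
            prepared[i] = true;        /* remember: *we* set this one up */
    }
    return 0;

 undo:
    while ( i-- )
        if ( prepared[i] )             /* only undo what this call prepared */
            cleanup_entry(i);
    return -1;
}

int main(void)
{
    mapped[0] = true;                  /* entry 0 existed before the call */
    map_range(4, 3);                   /* try to set up 0..3, fail at 3 */
    printf("pre-existing entry 0 kept  : %d\n", mapped[0]); /* 1 */
    printf("freshly made entry 1 undone: %d\n", mapped[1]); /* 0 */
    return 0;
}
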
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/FLASK: fix unmap-domain-IRQ XSM hook
The caller and the FLASK implementation of xsm_unmap_domain_irq()
disagreed about what the "data" argument points to in the MSI case:
Change both sides to pass/take a PCI device.
This is part of XSA-237.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -2143,7 +2143,8 @@ int unmap_domain_pirq(struct domain *d,
nr = msi_desc->msi.nvec;
}
- ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, msi_desc);
+ ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq,
+ msi_desc ? msi_desc->dev : NULL);
if ( ret )
goto done;
--- a/xen/xsm/flask/hooks.c
+++ b/xen/xsm/flask/hooks.c
@@ -918,8 +918,8 @@ static int flask_unmap_domain_msi (struc
u32 *sid, struct avc_audit_data *ad)
{
#ifdef CONFIG_HAS_PCI
- struct msi_info *msi = data;
- u32 machine_bdf = (msi->seg << 16) | (msi->bus << 8) | msi->devfn;
+ const struct pci_dev *pdev = data;
+ u32 machine_bdf = (pdev->seg << 16) | (pdev->bus << 8) | pdev->devfn;
AVC_AUDIT_DATA_INIT(ad, DEV);
ad->device = machine_bdf;
From cdc2887076b19b39fab9faec495082586f3113df Mon Sep 17 00:00:00 2001
From: XenProject Security Team <security@xenproject.org>
Date: Tue, 5 Sep 2017 13:41:37 +0200
Subject: x86/ioreq server: correctly handle bogus
XEN_DMOP_{,un}map_io_range_to_ioreq_server arguments
A misbehaving device model can pass incorrect XEN_DMOP_map/
unmap_io_range_to_ioreq_server arguments, namely end < start when
specifying an address range. When this happens we hit ASSERT(s <= e) in
rangeset_contains_range()/rangeset_overlaps_range() in debug builds.
Production builds will not trap right away but may misbehave later
while handling such bogus ranges.
This is XSA-238.
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
xen/arch/x86/hvm/ioreq.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index b2a8b0e986..8c8bf1f0ec 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -820,6 +820,9 @@ int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id,
struct hvm_ioreq_server *s;
int rc;
+ if ( start > end )
+ return -EINVAL;
+
spin_lock_recursive(&d->arch.hvm_domain.ioreq_server.lock);
rc = -ENOENT;
@@ -872,6 +875,9 @@ int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
struct hvm_ioreq_server *s;
int rc;
+ if ( start > end )
+ return -EINVAL;
+
spin_lock_recursive(&d->arch.hvm_domain.ioreq_server.lock);
rc = -ENOENT;
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/HVM: prefill partially used variable on emulation paths
Certain handlers ignore the access size (vioapic_write() being the
example this was found with), perhaps leading to subsequent reads
seeing data that wasn't actually written by the guest. For
consistency and extra safety also do this on the read path of
hvm_process_io_intercept(), even if this doesn't directly affect what
guests get to see, as we've supposedly already dealt with read handlers
leaving data completely uninitialized.
This is XSA-239.
Reported-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -129,7 +129,7 @@ static int hvmemul_do_io(
.count = *reps,
.dir = dir,
.df = df,
- .data = data,
+ .data = data_is_addr ? data : 0,
.data_is_ptr = data_is_addr, /* ioreq_t field name is misleading */
.state = STATE_IOREQ_READY,
};
--- a/xen/arch/x86/hvm/intercept.c
+++ b/xen/arch/x86/hvm/intercept.c
@@ -127,6 +127,7 @@ int hvm_process_io_intercept(const struc
addr = (p->type == IOREQ_TYPE_COPY) ?
p->addr + step * i :
p->addr;
+ data = 0;
rc = ops->read(handler, addr, p->size, &data);
if ( rc != X86EMUL_OKAY )
break;
@@ -161,6 +162,7 @@ int hvm_process_io_intercept(const struc
{
if ( p->data_is_ptr )
{
+ data = 0;
switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
p->size) )
{
From 867988237d3e472fe2c99e81ae733e103422566c Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Thu, 28 Sep 2017 15:17:25 +0100
Subject: [PATCH 1/2] x86: limit linear page table use to a single level
That's the only way that they're meant to be used. Without such a
restriction arbitrarily long chains of same-level page tables can be
built, tearing down of which may then cause arbitrarily deep recursion,
causing a stack overflow. To facilitate this restriction, a counter is
being introduced to track both the number of same-level entries in a
page table as well as the number of uses of a page table in another
same-level one (counting into positive and negative direction
respectively, utilizing the fact that both counts can't be non-zero at
the same time).
Note that the added accounting introduces a restriction on the number
of times a page can be used in other same-level page tables - more than
32k of such uses are no longer possible.
Note also that some put_page_and_type[_preemptible]() calls are
replaced with open-coded equivalents. This seemed preferable to
adding "parent_table" to the matrix of functions.
Note further that cross-domain same-level page table references are no
longer permitted (they probably never should have been).
This is XSA-240.
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
---
xen/arch/x86/domain.c | 1 +
xen/arch/x86/mm.c | 171 ++++++++++++++++++++++++++++++++++++++-----
xen/include/asm-x86/domain.h | 2 +
xen/include/asm-x86/mm.h | 25 +++++--
4 files changed, 175 insertions(+), 24 deletions(-)
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index d7e699228c..d7ed72c246 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1226,6 +1226,7 @@ int arch_set_info_guest(
rc = -ERESTART;
/* Fallthrough */
case -ERESTART:
+ v->arch.old_guest_ptpg = NULL;
v->arch.old_guest_table =
pagetable_get_page(v->arch.guest_table);
v->arch.guest_table = pagetable_null();
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 86f5eda52d..1e469bd354 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -747,6 +747,61 @@ static void put_data_page(
put_page(page);
}
+static bool inc_linear_entries(struct page_info *pg)
+{
+ typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+ do {
+ /*
+ * The check below checks for the "linear use" count being non-zero
+ * as well as overflow. Signed integer overflow is undefined behavior
+ * according to the C spec. However, as long as linear_pt_count is
+ * smaller in size than 'int', the arithmetic operation of the
+ * increment below won't overflow; rather the result will be truncated
+ * when stored. Ensure that this is always true.
+ */
+ BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+ oc = nc++;
+ if ( nc <= 0 )
+ return false;
+ nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+ } while ( oc != nc );
+
+ return true;
+}
+
+static void dec_linear_entries(struct page_info *pg)
+{
+ typeof(pg->linear_pt_count) oc;
+
+ oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
+ ASSERT(oc > 0);
+}
+
+static bool inc_linear_uses(struct page_info *pg)
+{
+ typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+ do {
+ /* See the respective comment in inc_linear_entries(). */
+ BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+ oc = nc--;
+ if ( nc >= 0 )
+ return false;
+ nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+ } while ( oc != nc );
+
+ return true;
+}
+
+static void dec_linear_uses(struct page_info *pg)
+{
+ typeof(pg->linear_pt_count) oc;
+
+ oc = arch_fetch_and_add(&pg->linear_pt_count, 1);
+ ASSERT(oc < 0);
+}
+
/*
* We allow root tables to map each other (a.k.a. linear page tables). It
* needs some special care with reference counts and access permissions:
@@ -777,15 +832,35 @@ get_##level##_linear_pagetable( \
\
if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
{ \
+ struct page_info *ptpg = mfn_to_page(pde_pfn); \
+ \
+ /* Make sure the page table belongs to the correct domain. */ \
+ if ( unlikely(page_get_owner(ptpg) != d) ) \
+ return 0; \
+ \
/* Make sure the mapped frame belongs to the correct domain. */ \
if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
return 0; \
\
/* \
- * Ensure that the mapped frame is an already-validated page table. \
+ * Ensure that the mapped frame is an already-validated page table \
+ * and is not itself having linear entries, as well as that the \
+ * containing page table is not iself in use as a linear page table \
+ * elsewhere. \
* If so, atomically increment the count (checking for overflow). \
*/ \
page = mfn_to_page(pfn); \
+ if ( !inc_linear_entries(ptpg) ) \
+ { \
+ put_page(page); \
+ return 0; \
+ } \
+ if ( !inc_linear_uses(page) ) \
+ { \
+ dec_linear_entries(ptpg); \
+ put_page(page); \
+ return 0; \
+ } \
y = page->u.inuse.type_info; \
do { \
x = y; \
@@ -793,6 +868,8 @@ get_##level##_linear_pagetable( \
unlikely((x & (PGT_type_mask|PGT_validated)) != \
(PGT_##level##_page_table|PGT_validated)) ) \
{ \
+ dec_linear_uses(page); \
+ dec_linear_entries(ptpg); \
put_page(page); \
return 0; \
} \
@@ -1226,6 +1303,9 @@ get_page_from_l4e(
l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
} while ( 0 )
+static int _put_page_type(struct page_info *page, bool preemptible,
+ struct page_info *ptpg);
+
void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
{
unsigned long pfn = l1e_get_pfn(l1e);
@@ -1296,17 +1376,22 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
if ( l2e_get_flags(l2e) & _PAGE_PSE )
put_superpage(l2e_get_pfn(l2e));
else
- put_page_and_type(l2e_get_page(l2e));
+ {
+ struct page_info *pg = l2e_get_page(l2e);
+ int rc = _put_page_type(pg, false, mfn_to_page(pfn));
+
+ ASSERT(!rc);
+ put_page(pg);
+ }
return 0;
}
-static int __put_page_type(struct page_info *, int preemptible);
-
static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
int partial, bool_t defer)
{
struct page_info *pg;
+ int rc;
if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
return 1;
@@ -1329,21 +1414,28 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
if ( unlikely(partial > 0) )
{
ASSERT(!defer);
- return __put_page_type(pg, 1);
+ return _put_page_type(pg, true, mfn_to_page(pfn));
}
if ( defer )
{
+ current->arch.old_guest_ptpg = mfn_to_page(pfn);
current->arch.old_guest_table = pg;
return 0;
}
- return put_page_and_type_preemptible(pg);
+ rc = _put_page_type(pg, true, mfn_to_page(pfn));
+ if ( likely(!rc) )
+ put_page(pg);
+
+ return rc;
}
static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
int partial, bool_t defer)
{
+ int rc = 1;
+
if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
(l4e_get_pfn(l4e) != pfn) )
{
@@ -1352,18 +1444,22 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
if ( unlikely(partial > 0) )
{
ASSERT(!defer);
- return __put_page_type(pg, 1);
+ return _put_page_type(pg, true, mfn_to_page(pfn));
}
if ( defer )
{
+ current->arch.old_guest_ptpg = mfn_to_page(pfn);
current->arch.old_guest_table = pg;
return 0;
}
- return put_page_and_type_preemptible(pg);
+ rc = _put_page_type(pg, true, mfn_to_page(pfn));
+ if ( likely(!rc) )
+ put_page(pg);
}
- return 1;
+
+ return rc;
}
static int alloc_l1_table(struct page_info *page)
@@ -1561,6 +1657,7 @@ static int alloc_l3_table(struct page_info *page)
{
page->nr_validated_ptes = i;
page->partial_pte = 0;
+ current->arch.old_guest_ptpg = NULL;
current->arch.old_guest_table = page;
}
while ( i-- > 0 )
@@ -1654,6 +1751,7 @@ static int alloc_l4_table(struct page_info *page)
{
if ( current->arch.old_guest_table )
page->nr_validated_ptes++;
+ current->arch.old_guest_ptpg = NULL;
current->arch.old_guest_table = page;
}
}
@@ -2403,14 +2501,20 @@ int free_page_type(struct page_info *pag
}
-static int __put_final_page_type(
- struct page_info *page, unsigned long type, int preemptible)
+static int _put_final_page_type(struct page_info *page, unsigned long type,
+ bool preemptible, struct page_info *ptpg)
{
int rc = free_page_type(page, type, preemptible);
/* No need for atomic update of type_info here: noone else updates it. */
if ( rc == 0 )
{
+ if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
+ {
+ dec_linear_uses(page);
+ dec_linear_entries(ptpg);
+ }
+ ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
/*
* Record TLB information for flush later. We do not stamp page tables
* when running in shadow mode:
@@ -2446,8 +2550,8 @@ static int __put_final_page_type(
}
-static int __put_page_type(struct page_info *page,
- int preemptible)
+static int _put_page_type(struct page_info *page, bool preemptible,
+ struct page_info *ptpg)
{
unsigned long nx, x, y = page->u.inuse.type_info;
int rc = 0;
@@ -2474,12 +2578,28 @@ static int __put_page_type(struct page_info *page,
x, nx)) != x) )
continue;
/* We cleared the 'valid bit' so we do the clean up. */
- rc = __put_final_page_type(page, x, preemptible);
+ rc = _put_final_page_type(page, x, preemptible, ptpg);
+ ptpg = NULL;
if ( x & PGT_partial )
put_page(page);
break;
}
+ if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
+ {
+ /*
+ * page_set_tlbflush_timestamp() accesses the same union
+ * linear_pt_count lives in. Unvalidated page table pages,
+ * however, should occur during domain destruction only
+ * anyway. Updating of linear_pt_count luckily is not
+ * necessary anymore for a dying domain.
+ */
+ ASSERT(page_get_owner(page)->is_dying);
+ ASSERT(page->linear_pt_count < 0);
+ ASSERT(ptpg->linear_pt_count > 0);
+ ptpg = NULL;
+ }
+
/*
* Record TLB information for flush later. We do not stamp page
* tables when running in shadow mode:
@@ -2499,6 +2619,13 @@ static int __put_page_type(struct page_info *page,
return -EINTR;
}
+ if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
+ {
+ ASSERT(!rc);
+ dec_linear_uses(page);
+ dec_linear_entries(ptpg);
+ }
+
return rc;
}
@@ -2638,6 +2765,7 @@ static int __get_page_type(struct page_info *page, unsigned long type,
page->nr_validated_ptes = 0;
page->partial_pte = 0;
}
+ page->linear_pt_count = 0;
rc = alloc_page_type(page, type, preemptible);
}
@@ -2652,7 +2780,7 @@ static int __get_page_type(struct page_info *page, unsigned long type,
void put_page_type(struct page_info *page)
{
- int rc = __put_page_type(page, 0);
+ int rc = _put_page_type(page, false, NULL);
ASSERT(rc == 0);
(void)rc;
}
@@ -2668,7 +2796,7 @@ int get_page_type(struct page_info *page, unsigned long type)
int put_page_type_preemptible(struct page_info *page)
{
- return __put_page_type(page, 1);
+ return _put_page_type(page, true, NULL);
}
int get_page_type_preemptible(struct page_info *page, unsigned long type)
@@ -2878,11 +3006,14 @@ int put_old_guest_table(struct vcpu *v)
if ( !v->arch.old_guest_table )
return 0;
- switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table) )
+ switch ( rc = _put_page_type(v->arch.old_guest_table, true,
+ v->arch.old_guest_ptpg) )
{
case -EINTR:
case -ERESTART:
return -ERESTART;
+ case 0:
+ put_page(v->arch.old_guest_table);
}
v->arch.old_guest_table = NULL;
@@ -3042,6 +3173,7 @@ int new_guest_cr3(unsigned long mfn)
rc = -ERESTART;
/* fallthrough */
case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
curr->arch.old_guest_table = page;
break;
default:
@@ -3310,7 +3442,10 @@ long do_mmuext_op(
if ( type == PGT_l1_page_table )
put_page_and_type(page);
else
+ {
+ curr->arch.old_guest_ptpg = NULL;
curr->arch.old_guest_table = page;
+ }
}
}
@@ -3346,6 +3481,7 @@ long do_mmuext_op(
{
case -EINTR:
case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
curr->arch.old_guest_table = page;
rc = 0;
break;
@@ -3425,6 +3561,7 @@ long do_mmuext_op(
rc = -ERESTART;
/* fallthrough */
case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
curr->arch.old_guest_table = page;
break;
default:
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 924caac834..5a512918cc 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -527,6 +527,8 @@ struct arch_vcpu
pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */
pagetable_t guest_table; /* (MFN) guest notion of cr3 */
struct page_info *old_guest_table; /* partially destructed pagetable */
+ struct page_info *old_guest_ptpg; /* containing page table of the */
+ /* former, if any */
/* guest_table holds a ref to the page, and also a type-count unless
* shadow refcounts are in use */
pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 119d7dec6b..445da50d47 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -124,11 +124,11 @@ struct page_info
u32 tlbflush_timestamp;
/*
- * When PGT_partial is true then this field is valid and indicates
- * that PTEs in the range [0, @nr_validated_ptes) have been validated.
- * An extra page reference must be acquired (or not dropped) whenever
- * PGT_partial gets set, and it must be dropped when the flag gets
- * cleared. This is so that a get() leaving a page in partially
+ * When PGT_partial is true then the first two fields are valid and
+ * indicate that PTEs in the range [0, @nr_validated_ptes) have been
+ * validated. An extra page reference must be acquired (or not dropped)
+ * whenever PGT_partial gets set, and it must be dropped when the flag
+ * gets cleared. This is so that a get() leaving a page in partially
* validated state (where the caller would drop the reference acquired
* due to the getting of the type [apparently] failing [-ERESTART])
* would not accidentally result in a page left with zero general
@@ -152,10 +152,18 @@ struct page_info
* put_page_from_lNe() (due to the apparent failure), and hence it
* must be dropped when the put operation is resumed (and completes),
* but it must not be acquired if picking up the page for validation.
+ *
+ * The 3rd field, @linear_pt_count, indicates
+ * - by a positive value, how many same-level page table entries a page
+ * table has,
+ * - by a negative value, in how many same-level page tables a page is
+ * in use.
*/
struct {
- u16 nr_validated_ptes;
- s8 partial_pte;
+ u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
+ u16 :16 - PAGETABLE_ORDER - 1 - 2;
+ s16 partial_pte:2;
+ s16 linear_pt_count;
};
/*
@@ -206,6 +214,9 @@ struct page_info
#define PGT_count_width PG_shift(9)
#define PGT_count_mask ((1UL<<PGT_count_width)-1)
+/* Are the 'type mask' bits identical? */
+#define PGT_type_equal(x, y) (!(((x) ^ (y)) & PGT_type_mask))
+
/* Cleared when the owning guest 'frees' this page. */
#define _PGC_allocated PG_shift(1)
#define PGC_allocated PG_mask(1, 1)
--
2.14.1
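
To make the sign convention of the new linear_pt_count field concrete, here
is a standalone C model of inc_linear_entries()/inc_linear_uses() with the
atomics (read_atomic()/cmpxchg()) stripped out. It is an editorial
illustration of the invariant, not the patch's code: a table whose count is
negative (it is already the target of a same-level mapping) can no longer
accept same-level entries of its own, and vice versa, which is what rules
out the arbitrarily long same-level chains described above.

/* Editorial sketch of the linear_pt_count invariant; not Xen code. */
#include <stdbool.h>
#include <stdio.h>

struct page_info { short linear_pt_count; };  /* s16 field in the real struct */

/* This table gains a same-level ("linear") entry: only valid while count >= 0. */
static bool inc_linear_entries(struct page_info *pg)
{
    if ( pg->linear_pt_count < 0 )     /* already used as a linear target */
        return false;
    pg->linear_pt_count++;
    return true;
}

/* This table becomes the target of a same-level entry: only valid while count <= 0. */
static bool inc_linear_uses(struct page_info *pg)
{
    if ( pg->linear_pt_count > 0 )     /* already holds linear entries itself */
        return false;
    pg->linear_pt_count--;
    return true;
}

int main(void)
{
    struct page_info A = { 0 }, B = { 0 };

    /* A maps B at the same level: A's count goes to +1, B's to -1. */
    printf("A -> B allowed: %d\n", inc_linear_entries(&A) && inc_linear_uses(&B));

    /* B may not itself gain a same-level entry while it is mapped that way... */
    printf("B -> C allowed: %d\n", inc_linear_entries(&B));   /* 0 */
    /* ...and A may not become the target of one while it holds such an entry. */
    printf("C -> A allowed: %d\n", inc_linear_uses(&A));      /* 0 */
    return 0;
}
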
From e614979ce054044d9e19023f1ef10dae6e38baf4 Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Fri, 22 Sep 2017 11:46:55 +0100
Subject: [PATCH 2/2] x86/mm: Disable PV linear pagetables by default
Allowing pagetables to point to other pagetables of the same level
(often called 'linear pagetables') has been included in Xen since its
inception. But it is not used by the most common PV guests (Linux,
NetBSD, minios), and has been the source of a number of subtle
reference-counting bugs.
Add a command-line option to control whether PV linear pagetables are
allowed (disabled by default).
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
Changes since v2:
- s/_/-/; in command-line option
- Added __read_mostly
---
docs/misc/xen-command-line.markdown | 15 +++++++++++++++
xen/arch/x86/mm.c | 10 ++++++++++
2 files changed, 25 insertions(+)
diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 44d99852aa..45ef873abb 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1374,6 +1374,21 @@ The following resources are available:
CDP, one COS will corespond two CBMs other than one with CAT, due to the
sum of CBMs is fixed, that means actual `cos_max` in use will automatically
reduce to half when CDP is enabled.
+
+### pv-linear-pt
+> `= <boolean>`
+
+> Default: `false`
+
+Allow PV guests to have pagetable entries pointing to other pagetables
+of the same level (i.e., allowing L2 PTEs to point to other L2 pages).
+This technique is often called "linear pagetables", and is sometimes
+used to allow operating systems a simple way to consistently map the
+current process's pagetables into its own virtual address space.
+
+None of the most common PV operating systems (Linux, NetBSD, MiniOS)
+use this technique, but there may be custom operating systems which
+do.
### reboot
> `= t[riple] | k[bd] | a[cpi] | p[ci] | P[ower] | e[fi] | n[o] [, [w]arm | [c]old]`
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 1e469bd354..32952a46b9 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -814,6 +814,9 @@ static void dec_linear_uses(struct page_info *pg)
* frame if it is mapped by a different root table. This is sufficient and
* also necessary to allow validation of a root table mapping itself.
*/
+static bool __read_mostly pv_linear_pt_enable = false;
+boolean_param("pv-linear-pt", pv_linear_pt_enable);
+
#define define_get_linear_pagetable(level) \
static int \
get_##level##_linear_pagetable( \
@@ -823,6 +826,13 @@ get_##level##_linear_pagetable( \
struct page_info *page; \
unsigned long pfn; \
\
+ if ( !pv_linear_pt_enable ) \
+ { \
+ gdprintk(XENLOG_WARNING, \
+ "Attempt to create linear p.t. (feature disabled)\n"); \
+ return 0; \
+ } \
+ \
if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
{ \
gdprintk(XENLOG_WARNING, \
--
2.14.1
From: Jan Beulich <jbeulich@suse.com>
Subject: x86: don't store possibly stale TLB flush time stamp
While the timing window is extremely narrow, it is theoretically
possible for an update to the TLB flush clock and a subsequent flush
IPI to happen between the read and write parts of the update of the
per-page stamp. Exclude this possibility by disabling interrupts
across the update, preventing the IPI from being serviced in the middle.
This is XSA-241.
Reported-by: Jann Horn <jannh@google.com>
Suggested-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
--- a/xen/arch/arm/smp.c
+++ b/xen/arch/arm/smp.c
@@ -1,3 +1,4 @@
+#include <xen/mm.h>
#include <asm/system.h>
#include <asm/smp.h>
#include <asm/cpregs.h>
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2524,7 +2524,7 @@ static int _put_final_page_type(struct p
*/
if ( !(shadow_mode_enabled(page_get_owner(page)) &&
(page->count_info & PGC_page_table)) )
- page->tlbflush_timestamp = tlbflush_current_time();
+ page_set_tlbflush_timestamp(page);
wmb();
page->u.inuse.type_info--;
}
@@ -2534,7 +2534,7 @@ static int _put_final_page_type(struct p
(PGT_count_mask|PGT_validated|PGT_partial)) == 1);
if ( !(shadow_mode_enabled(page_get_owner(page)) &&
(page->count_info & PGC_page_table)) )
- page->tlbflush_timestamp = tlbflush_current_time();
+ page_set_tlbflush_timestamp(page);
wmb();
page->u.inuse.type_info |= PGT_validated;
}
@@ -2588,7 +2588,7 @@ static int _put_page_type(struct page_in
if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
{
/*
- * page_set_tlbflush_timestamp() accesses the same union
+ * set_tlbflush_timestamp() accesses the same union
* linear_pt_count lives in. Unvalidated page table pages,
* however, should occur during domain destruction only
* anyway. Updating of linear_pt_count luckily is not
@@ -2609,7 +2609,7 @@ static int _put_page_type(struct page_in
*/
if ( !(shadow_mode_enabled(page_get_owner(page)) &&
(page->count_info & PGC_page_table)) )
- page->tlbflush_timestamp = tlbflush_current_time();
+ page_set_tlbflush_timestamp(page);
}
if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -1464,7 +1464,7 @@ void shadow_free(struct domain *d, mfn_t
* TLBs when we reuse the page. Because the destructors leave the
* contents of the pages in place, we can delay TLB flushes until
* just before the allocator hands the page out again. */
- sp->tlbflush_timestamp = tlbflush_current_time();
+ page_set_tlbflush_timestamp(sp);
perfc_decr(shadow_alloc_count);
page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
sp = next;
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -960,7 +960,7 @@ static void free_heap_pages(
/* If a page has no owner it will need no safety TLB flush. */
pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
if ( pg[i].u.free.need_tlbflush )
- pg[i].tlbflush_timestamp = tlbflush_current_time();
+ page_set_tlbflush_timestamp(&pg[i]);
/* This page is not a guest frame any more. */
page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
--- a/xen/include/asm-arm/flushtlb.h
+++ b/xen/include/asm-arm/flushtlb.h
@@ -12,6 +12,11 @@ static inline void tlbflush_filter(cpuma
#define tlbflush_current_time() (0)
+static inline void page_set_tlbflush_timestamp(struct page_info *page)
+{
+ page->tlbflush_timestamp = tlbflush_current_time();
+}
+
#if defined(CONFIG_ARM_32)
# include <asm/arm32/flushtlb.h>
#elif defined(CONFIG_ARM_64)
--- a/xen/include/asm-x86/flushtlb.h
+++ b/xen/include/asm-x86/flushtlb.h
@@ -23,6 +23,20 @@ DECLARE_PER_CPU(u32, tlbflush_time);
#define tlbflush_current_time() tlbflush_clock
+static inline void page_set_tlbflush_timestamp(struct page_info *page)
+{
+ /*
+ * Prevent storing a stale time stamp, which could happen if an update
+ * to tlbflush_clock plus a subsequent flush IPI happen between the
+ * reading of tlbflush_clock and the writing of the struct page_info
+ * field.
+ */
+ ASSERT(local_irq_is_enabled());
+ local_irq_disable();
+ page->tlbflush_timestamp = tlbflush_current_time();
+ local_irq_enable();
+}
+
/*
* @cpu_stamp is the timestamp at last TLB flush for the CPU we are testing.
* @lastuse_stamp is a timestamp taken when the PFN we are testing was last
From: Jan Beulich <jbeulich@suse.com>
Subject: x86: don't allow page_unlock() to drop the last type reference
Only _put_page_type() does the necessary cleanup, and hence not all
domain pages can be released during guest cleanup (leaving around
zombie domains) if we get this wrong.
This is XSA-242.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1923,7 +1923,11 @@ void page_unlock(struct page_info *page)
do {
x = y;
+ ASSERT((x & PGT_count_mask) && (x & PGT_locked));
+
nx = x - (1 | PGT_locked);
+ /* We must not drop the last reference here. */
+ ASSERT(nx & PGT_count_mask);
} while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
}
@@ -2611,6 +2615,17 @@ static int _put_page_type(struct page_in
(page->count_info & PGC_page_table)) )
page_set_tlbflush_timestamp(page);
}
+ else if ( unlikely((nx & (PGT_locked | PGT_count_mask)) ==
+ (PGT_locked | 1)) )
+ {
+ /*
+ * We must not drop the second to last reference when the page is
+ * locked, as page_unlock() doesn't do any cleanup of the type.
+ */
+ cpu_relax();
+ y = page->u.inuse.type_info;
+ continue;
+ }
if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
break;
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/shadow: Don't create self-linear shadow mappings for 4-level translated guests
When initially creating a monitor table for 4-level translated guests, don't
install a shadow-linear mapping. This mapping is actually self-linear, and
trips up the writeable heuristic logic into following Xen's mappings, not the
guests' shadows it was expecting to follow.
A consequence of this is that sh_guess_wrmap() needs to cope with there being
no shadow-linear mapping present, which in practice occurs once each time a
vcpu switches to 4-level paging from a different paging mode.
An appropriate shadow-linear slot will be inserted into the monitor table
either while constructing lower level monitor tables, or by sh_update_cr3().
While fixing this, clarify the safety of the other mappings. Despite
appearing unsafe, it is correct to create a guest-linear mapping for
translated domains; this is self-linear and doesn't point into the translated
domain. Drop a dead clause for translate != external guests.
This is XSA-243.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Tim Deegan <tim@xen.org>
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 8d4f244..a18d286 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -1485,26 +1485,38 @@ void sh_install_xen_entries_in_l4(struct domain *d, mfn_t gl4mfn, mfn_t sl4mfn)
sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] = shadow_l4e_empty();
}
- /* Shadow linear mapping for 4-level shadows. N.B. for 3-level
- * shadows on 64-bit xen, this linear mapping is later replaced by the
- * monitor pagetable structure, which is built in make_monitor_table
- * and maintained by sh_update_linear_entries. */
- sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
- shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR_RW);
-
- /* Self linear mapping. */
- if ( shadow_mode_translate(d) && !shadow_mode_external(d) )
+ /*
+ * Linear mapping slots:
+ *
+ * Calling this function with gl4mfn == sl4mfn is used to construct a
+ * monitor table for translated domains. In this case, gl4mfn forms the
+ * self-linear mapping (i.e. not pointing into the translated domain), and
+ * the shadow-linear slot is skipped. The shadow-linear slot is either
+ * filled when constructing lower level monitor tables, or via
+ * sh_update_cr3() for 4-level guests.
+ *
+ * Calling this function with gl4mfn != sl4mfn is used for non-translated
+ * guests, where the shadow-linear slot is actually self-linear, and the
+ * guest-linear slot points into the guests view of its pagetables.
+ */
+ if ( shadow_mode_translate(d) )
{
- // linear tables may not be used with translated PV guests
- sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
+ ASSERT(mfn_eq(gl4mfn, sl4mfn));
+
+ sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
shadow_l4e_empty();
}
else
{
- sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
- shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR_RW);
+ ASSERT(!mfn_eq(gl4mfn, sl4mfn));
+
+ sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR_RW);
}
+ sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
+ shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR_RW);
+
unmap_domain_page(sl4e);
}
#endif
@@ -4405,6 +4417,11 @@ static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
/* Carefully look in the shadow linear map for the l1e we expect */
#if SHADOW_PAGING_LEVELS >= 4
+ /* Is a shadow linear map is installed in the first place? */
+ sl4p = v->arch.paging.shadow.guest_vtable;
+ sl4p += shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
+ if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
+ return 0;
sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
return 0;
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: [PATCH] x86/cpu: Fix IST handling during PCPU bringup
Clear IST references in newly allocated IDTs. Nothing good will come of
having them set before the TSS is suitably constructed (although the chances
of the CPU surviving such an IST interrupt/exception are extremely slim).
Uniformly set the IST references after the TSS is in place. This fixes an
issue on AMD hardware, where onlining a PCPU while PCPU0 is in HVM context
will cause IST_NONE to be copied into the new IDT, making that PCPU vulnerable
to privilege escalation from PV guests until it subsequently schedules an HVM
guest.
This is XSA-244.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
xen/arch/x86/cpu/common.c | 5 +++++
xen/arch/x86/smpboot.c | 3 +++
2 files changed, 8 insertions(+)
diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 78f5667..6cf3628 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -640,6 +640,7 @@ void __init early_cpu_init(void)
* - Sets up TSS with stack pointers, including ISTs
* - Inserts TSS selector into regular and compat GDTs
* - Loads GDT, IDT, TR then null LDT
+ * - Sets up IST references in the IDT
*/
void load_system_tables(void)
{
@@ -702,6 +703,10 @@ void load_system_tables(void)
asm volatile ("ltr %w0" : : "rm" (TSS_ENTRY << 3) );
asm volatile ("lldt %w0" : : "rm" (0) );
+ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF);
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
+
/*
* Bottom-of-stack must be 16-byte aligned!
*
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 3ca716c..1609b62 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -724,6 +724,9 @@ static int cpu_smpboot_alloc(unsigned int cpu)
if ( idt_tables[cpu] == NULL )
goto oom;
memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
+ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
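
As a rough, editorial illustration of the ordering the two hunks above
establish (copy the boot CPU's IDT with the IST fields forced to IST_NONE at
allocation time, then program the ISTs only once the new CPU's TSS is in
place), the toy C model below walks through the same two steps; the
structures and constants are simplified stand-ins, not Xen's real
definitions.

/* Toy model of the XSA-244 bringup ordering; not Xen code. The boot CPU's
 * IDT may legitimately have its ISTs parked at IST_NONE (e.g. while an HVM
 * vCPU runs there), so the copy is never trusted: ISTs are cleared at
 * allocation and set only after the TSS exists. */
#include <stdio.h>

enum { IST_NONE = 0, IST_DF = 1, IST_NMI = 2, IST_MCE = 3 };

struct idt_entry { int ist; };
#define IDT_ENTRIES 3                   /* just #DF, NMI, #MC for the demo */

static void bringup(struct idt_entry *new_idt,
                    const struct idt_entry *boot_idt, int tss_ready)
{
    /* cpu_smpboot_alloc(): copy the template, then force IST_NONE. */
    for ( int i = 0; i < IDT_ENTRIES; i++ )
    {
        new_idt[i] = boot_idt[i];
        new_idt[i].ist = IST_NONE;      /* never inherit the boot CPU's state */
    }

    /* load_system_tables(): only now is it safe to point at the new TSS. */
    if ( tss_ready )
    {
        new_idt[0].ist = IST_DF;
        new_idt[1].ist = IST_NMI;
        new_idt[2].ist = IST_MCE;
    }
}

int main(void)
{
    /* Boot CPU currently in HVM context: its ISTs happen to be IST_NONE. */
    struct idt_entry boot_idt[IDT_ENTRIES] = { {IST_NONE}, {IST_NONE}, {IST_NONE} };
    struct idt_entry new_idt[IDT_ENTRIES];

    bringup(new_idt, boot_idt, 1);
    printf("#DF ist after bringup: %d (expect %d)\n", new_idt[0].ist, IST_DF);
    return 0;
}
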