Commit 6a020fa1 authored by Leonardo Arena

main/xen: security fixes

CVE-2019-19579 XSA-306
CVE-2019-19582 XSA-307
CVE-2019-19583 XSA-308
CVE-2019-19578 XSA-309
CVE-2019-19580 XSA-310
CVE-2019-19577 XSA-311

ref #11132
parent f9cad747
@@ -3,7 +3,7 @@
# Maintainer: William Pitcock <nenolod@dereferenced.org>
pkgname=xen
pkgver=4.11.3
pkgrel=0
pkgrel=1
pkgdesc="Xen hypervisor"
url="https://www.xenproject.org/"
arch="x86_64 armhf aarch64" # enable armv7 when builds with gcc8
@@ -167,6 +167,13 @@ options="!strip"
# - CVE-2019-11135 XSA-305
# 4.11.3-r0:
# - CVE-2019-19579 XSA-306
# 4.11.3-r1:
# - CVE-2019-19579 XSA-306
# - CVE-2019-19582 XSA-307
# - CVE-2019-19583 XSA-308
# - CVE-2019-19578 XSA-309
# - CVE-2019-19580 XSA-310
# - CVE-2019-19577 XSA-311
case "$CARCH" in
x86*)
@@ -234,6 +241,14 @@ source="https://downloads.xenproject.org/release/$pkgname/$pkgver/$pkgname-$pkgv
hotplug-Linux-iscsi-block-handle-lun-1.patch
xsa307.patch
xsa308.patch
xsa309.patch
xsa310-0001-x86-mm-Set-old_guest_table-when-destroying-vcpu-page.patch
xsa310-0002-x86-mm-alloc-free_lN_table-Retain-partial_flags-on-E.patch
xsa310-0003-x86-mm-relinquish_memory-Grab-an-extra-type-ref-when.patch
xsa311-4.11.patch
xenstored.initd
xenstored.confd
xenconsoled.initd
@@ -490,6 +505,13 @@ e76816c6ad0e91dc5f81947f266da3429b20e6d976c3e8c41202c6179532eec878a3f0913921ef3a
69dfa60628ca838678862383528654ecbdf4269cbb5c9cfb6b84d976202a8dea85d711aa65a52fa1b477fb0b30604ca70cf1337192d6fb9388a08bbe7fe56077 xenstore_client_transaction_fix.patch
2094ea964fa610b2bf72fd2c7ede7e954899a75c0f5b08030cf1d74460fb759ade84866176e32f8fe29c921dfdc6dafd2b31e23ab9b0a3874d3dceeabdd1913b xenqemu-xattr-size-max.patch
8c9cfc6afca325df1d8026e21ed03fa8cd2c7e1a21a56cc1968301c5ab634bfe849951899e75d328951d7a41273d1e49a2448edbadec0029ed410c43c0549812 hotplug-Linux-iscsi-block-handle-lun-1.patch
984185e513e0688edca932f434ace78daf99094b563dc3f6cd1c94c7f60842e860dc9490296a4bc716c42f544e0bdf2e3c58cd46b4b490b61dbbe8389c5674c4 xsa307.patch
3650ab4d75ba65764edacb379b67c6bc08df5cada0c2b039fd8641f212ba462246ef838dee3390fddd0725cec8676197136ef99dfbd7d9ef1a7c4d78a873639b xsa308.patch
ad6468c55c13a259b8baa15f251a77ae5ff0524434201caeb1780ca58e637a9e4be398f264c010913d940a248ca619a6878cd6109180de653afadb923fc38fee xsa309.patch
806c3cd3895f6573195d3ae85f314c8b7b7bc9ac4b1663b113e1c7fb8a7d949855fab09ba794b838a1cebdb40017ebfbaed932fd23ee33cc7bef8381a8ed2584 xsa310-0001-x86-mm-Set-old_guest_table-when-destroying-vcpu-page.patch
6e713158f693c1d38f1044e1e9adea3d9338c47e9c2fec10b95a04a36cbc7c8e2841d593cb6e39b44976b6c29b7eec9919dec738e5fddaedddaaeade220185d8 xsa310-0002-x86-mm-alloc-free_lN_table-Retain-partial_flags-on-E.patch
bef47261b61f2f9f10d649c8de1ad076517ac5ecea5f26a3a61ded91ced3f274ddeb8a41592edfe7dfd5439b010b647f6c15afeb7cd2b8c6065cd2281413b614 xsa310-0003-x86-mm-relinquish_memory-Grab-an-extra-type-ref-when.patch
6e786287e21cd8f7371b75b05067428656cc5985ef98902fab577b9dff3a187d130675063db127a9c2210c935b2eb1f6288d784d595c9bdee30f0c904a81afb4 xsa311-4.11.patch
52c43beb2596d645934d0f909f2d21f7587b6898ed5e5e7046799a8ed6d58f7a09c5809e1634fa26152f3fd4f3e7cfa07da7076f01b4a20cc8f5df8b9cb77e50 xenstored.initd
093f7fbd43faf0a16a226486a0776bade5dc1681d281c5946a3191c32d74f9699c6bf5d0ab8de9d1195a2461165d1660788e92a3156c9b3c7054d7b2d52d7ff0 xenstored.confd
3c86ed48fbee0af4051c65c4a3893f131fa66e47bf083caf20c9b6aa4b63fdead8832f84a58d0e27964bc49ec8397251b34e5be5c212c139f556916dc8da9523 xenconsoled.initd
......
From: Jan Beulich <jbeulich@suse.com>
Subject: x86+Arm32: make find_next_{,zero_}bit() have well defined behavior
Calling these functions with the 2nd and 3rd arguments being equal wasn't
well defined: Arm64 reliably returns the value of the 2nd
argument in this case, while on x86 for bitmaps up to 64 bits wide the
return value was undefined (due to the undefined behavior of a shift of
a value by the number of bits it's wide) when the incoming value was 64.
On Arm32 an actual out of bounds access would happen when the
size/offset value is a multiple of 32; if this access doesn't fault, the
return value would have been sufficiently correct afaict.
Make the functions consistently tolerate the last two arguments being
equal (and in fact the 3rd argument being greater or equal to the 2nd),
in favor of finding and fixing all the use sites that violate the
original more strict assumption.
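As an illustration of the contract the fix establishes, here is a minimal,
self-contained C sketch; it is purely illustrative and is not Xen's
implementation (the real changes are in the hunks below). The function name
and the simple word-by-bit loop are assumptions made for clarity only.

/*
 * Sketch: a find_next_bit-style scan that tolerates offset >= size,
 * returning `size` instead of shifting a word by its full width
 * (which is undefined behaviour in C).
 */
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned int find_next_bit_sketch(const unsigned long *addr,
                                         unsigned int size,
                                         unsigned int offset)
{
    if (offset >= size)        /* the case the patch makes well defined */
        return size;

    for (unsigned int i = offset; i < size; i++)
        if (addr[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
            return i;

    return size;               /* no set bit found */
}

int main(void)
{
    unsigned long bitmap = 1UL << 5;

    printf("%u\n", find_next_bit_sketch(&bitmap, 64, 64)); /* 64: offset == size */
    printf("%u\n", find_next_bit_sketch(&bitmap, 64, 0));  /* 5 */
    return 0;
}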
This is XSA-307.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Julien Grall <julien@xen.org>
---
The most obvious (albeit still indirect) exposure to guests is
evtchn_check_pollers(), which imo makes this a security issue at least
for Arm32.
This was originally already discussed between (at least) Andrew and me,
and I don't really recall who brought up the issue first.
Note that Arm's Linux origin of the code may call for syncing
publication with them. Then again I don't want to tell them just to see
them go public ahead of us.
--- a/xen/arch/arm/arm32/lib/findbit.S
+++ b/xen/arch/arm/arm32/lib/findbit.S
@@ -42,8 +42,8 @@ ENDPROC(_find_first_zero_bit_le)
* Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
*/
ENTRY(_find_next_zero_bit_le)
- teq r1, #0
- beq 3b
+ cmp r1, r2
+ bls 3b
ands ip, r2, #7
beq 1b @ If new byte, goto old routine
ARM( ldrb r3, [r0, r2, lsr #3] )
@@ -83,8 +83,8 @@ ENDPROC(_find_first_bit_le)
* Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
*/
ENTRY(_find_next_bit_le)
- teq r1, #0
- beq 3b
+ cmp r1, r2
+ bls 3b
ands ip, r2, #7
beq 1b @ If new byte, goto old routine
ARM( ldrb r3, [r0, r2, lsr #3] )
@@ -117,8 +117,8 @@ ENTRY(_find_first_zero_bit_be)
ENDPROC(_find_first_zero_bit_be)
ENTRY(_find_next_zero_bit_be)
- teq r1, #0
- beq 3b
+ cmp r1, r2
+ bls 3b
ands ip, r2, #7
beq 1b @ If new byte, goto old routine
eor r3, r2, #0x18 @ big endian byte ordering
@@ -151,8 +151,8 @@ ENTRY(_find_first_bit_be)
ENDPROC(_find_first_bit_be)
ENTRY(_find_next_bit_be)
- teq r1, #0
- beq 3b
+ cmp r1, r2
+ bls 3b
ands ip, r2, #7
beq 1b @ If new byte, goto old routine
eor r3, r2, #0x18 @ big endian byte ordering
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -358,7 +358,7 @@ static always_inline unsigned int __scan
const unsigned long *a__ = (addr); \
unsigned int s__ = (size); \
unsigned int o__ = (off); \
- if ( __builtin_constant_p(size) && !s__ ) \
+ if ( o__ >= s__ ) \
r__ = s__; \
else if ( __builtin_constant_p(size) && s__ <= BITS_PER_LONG ) \
r__ = o__ + __scanbit(*(const unsigned long *)(a__) >> o__, s__); \
@@ -390,7 +390,7 @@ static always_inline unsigned int __scan
const unsigned long *a__ = (addr); \
unsigned int s__ = (size); \
unsigned int o__ = (off); \
- if ( __builtin_constant_p(size) && !s__ ) \
+ if ( o__ >= s__ ) \
r__ = s__; \
else if ( __builtin_constant_p(size) && s__ <= BITS_PER_LONG ) \
r__ = o__ + __scanbit(~*(const unsigned long *)(a__) >> o__, s__); \
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/vtx: Work around SingleStep + STI/MovSS VMEntry failures
See patch comment for technical details.
Concerning the timeline, this was first discovered in the aftermath of
XSA-156 which caused #DB to be intercepted unconditionally, but only in
its SingleStep + STI form which is restricted to privileged software.
After working with Intel and identifying the problematic vmentry check,
this workaround was suggested, and the patch was posted in an RFC
series. Outstanding work for that series (not breaking Introspection)
is still pending, and this fix from it (which wouldn't have been good
enough in its original form) wasn't committed.
A vmentry failure was reported to xen-devel, and debugging identified
this bug in its SingleStep + MovSS form by way of INT1, which does not
involve the use of any privileged instructions, thereby proving this to be
a security issue.
This is XSA-308.
Reported-by: Håkon Alstadheim <hakon@alstadheim.priv.no>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 6a5eeb5c13..59b836f43f 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3816,6 +3816,42 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
HVMTRACE_1D(TRAP_DEBUG, exit_qualification);
__restore_debug_registers(v);
write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE);
+
+ /*
+ * Work around SingleStep + STI/MovSS VMEntry failures.
+ *
+ * We intercept #DB unconditionally to work around CVE-2015-8104 /
+ * XSA-156 (guest-kernel induced host DoS).
+ *
+ * STI/MovSS shadows block/defer interrupts/exceptions (exact
+ * details are complicated and poorly documented). Debug
+ * exceptions delayed for any reason are stored in the
+ * PENDING_DBG_EXCEPTIONS field.
+ *
+ * The falling edge of PENDING_DBG causes #DB to be delivered,
+ * resulting in a VMExit, as #DB is intercepted. The VMCS still
+ * reports blocked-by-STI/MovSS.
+ *
+ * The VMEntry checks when EFLAGS.TF is set don't like a VMCS in
+ * this state. Despite a #DB queued in VMENTRY_INTR_INFO, the
+ * state is rejected as DR6.BS isn't pending. Fix this up.
+ */
+ if ( unlikely(regs->eflags & X86_EFLAGS_TF) )
+ {
+ unsigned long int_info;
+
+ __vmread(GUEST_INTERRUPTIBILITY_INFO, &int_info);
+
+ if ( int_info & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
+ {
+ unsigned long pending_dbg;
+
+ __vmread(GUEST_PENDING_DBG_EXCEPTIONS, &pending_dbg);
+ __vmwrite(GUEST_PENDING_DBG_EXCEPTIONS,
+ pending_dbg | DR_STEP);
+ }
+ }
+
if ( !v->domain->debugger_attached )
{
unsigned long insn_len = 0;
From 523e3974ed2213719a19218f5b246e382ceef18a Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Wed, 30 Oct 2019 17:05:28 +0000
Subject: [PATCH] x86/mm: Don't reset linear_pt_count on partial validation
"Linear pagetables" is a technique which involves either pointing a
pagetable at itself, or to another pagetable the same or higher level.
Xen has limited support for linear pagetables: A page may either point
to itself, or point to another page of the same level (i.e., L2 to L2,
L3 to L3, and so on).
XSA-240 introduced an additional restriction that limited the "depth"
of such chains by allowing pages to either *point to* other pages of
the same level, or *be pointed to* by other pages of the same level,
but not both. To implement this, we keep track of the number of
outstanding times a page points to or is pointed to by another page
table, to prevent both from happening at the same time.
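The accounting idea can be shown with a small stand-alone sketch. The field
name and the sign convention below are illustrative assumptions; Xen's
actual bookkeeping lives in xen/arch/x86/mm.c and differs in detail.

#include <stdbool.h>
#include <stdio.h>

struct pt_page {
    /* > 0: this page points at other same-level pages;
     * < 0: other same-level pages point at this page. */
    int linear_pt_count;
};

/* `pg` gains an entry pointing at another page of the same level. */
static bool inc_linear_uses(struct pt_page *pg)
{
    if (pg->linear_pt_count < 0)   /* already pointed *to*: refuse */
        return false;
    pg->linear_pt_count++;
    return true;
}

/* Another page of the same level gains an entry pointing at `pg`. */
static bool inc_linear_entries(struct pt_page *pg)
{
    if (pg->linear_pt_count > 0)   /* already points *at* others: refuse */
        return false;
    pg->linear_pt_count--;
    return true;
}

int main(void)
{
    struct pt_page l2 = { 0 };

    printf("%d\n", inc_linear_uses(&l2));     /* 1: first use is fine */
    printf("%d\n", inc_linear_entries(&l2));  /* 0: both at once is refused */
    return 0;
}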
Unfortunately, the commit that originally introduced this reset the count
when resuming validation of a partially-validated pagetable, dropping
some "linear_pt_entry" counts.
On debug builds on systems where guests used this feature, this might
lead to crashes that look like this:
Assertion 'oc > 0' failed at mm.c:874
Worse, if an attacker could engineer such a situation to occur, they
might be able to make loops or other arbitrary chains of linear
pagetables, leading to the denial-of-service situation outlined in
XSA-240.
This is XSA-309.
Reported-by: Manuel Bouyer <bouyer@antioche.eu.org>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
xen/arch/x86/mm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 7d4dd80a85..01393fb0da 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -3059,8 +3059,8 @@ static int _get_page_type(struct page_info *page, unsigned long type,
{
page->nr_validated_ptes = 0;
page->partial_flags = 0;
+ page->linear_pt_count = 0;
}
- page->linear_pt_count = 0;
rc = alloc_page_type(page, type, preemptible);
}
--
2.24.0
From 7c537dc8d28a03064a14171ed5c6fc329531816a Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Tue, 19 Nov 2019 11:40:34 +0000
Subject: [PATCH 1/3] x86/mm: Set old_guest_table when destroying vcpu
pagetables
Changeset 6c4efc1eba ("x86/mm: Don't drop a type ref unless you held a
ref to begin with"), part of XSA-299, changed the calling discipline
of put_page_type() such that if put_page_type() returned -ERESTART
(indicating a partially de-validated page), subsequent calls to
put_page_type() must be called with PTF_partial_set. If called on a
partially de-validated page but without PTF_partial_set, Xen will
BUG(), because to do otherwise would risk opening up the kind of
privilege escalation bug described in XSA-299.
One place this was missed was in vcpu_destroy_pagetables().
put_page_and_type_preemptible() is called, but on -ERESTART, the
entire operation is simply restarted, causing put_page_type() to be
called on a partially de-validated page without PTF_partial_set. The
result was that if such an operation were interrupted, Xen would hit a
BUG().
Fix this by having vcpu_destroy_pagetables() consistently pass off
interrupted de-validations to put_old_page_type():
- Unconditionally clear references to the page, even if
put_page_and_type failed
- Set old_guest_table and old_guest_table_partial appropriately
While here, do some refactoring:
- Move clearing of arch.cr3 to the top of the function
- Now that clearing is unconditional, move the unmap to the same
conditional as the l4tab mapping. This also allows us to reduce
the scope of the l4tab variable.
- Avoid code duplication by looping to drop references on
guest_table_user
This is part of XSA-310.
Reported-by: Sarah Newman <srn@prgmr.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
Added in v2.
Changes in v3:
- Minor comment / whitespace fixes
---
xen/arch/x86/mm.c | 75 +++++++++++++++++++++++++++++------------------
1 file changed, 47 insertions(+), 28 deletions(-)
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 01393fb0da..a759afc9e3 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -3142,40 +3142,36 @@ int put_old_guest_table(struct vcpu *v)
int vcpu_destroy_pagetables(struct vcpu *v)
{
unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
- struct page_info *page;
- l4_pgentry_t *l4tab = NULL;
+ struct page_info *page = NULL;
int rc = put_old_guest_table(v);
+ bool put_guest_table_user = false;
if ( rc )
return rc;
+ v->arch.cr3 = 0;
+
+ /*
+ * Get the top-level guest page; either the guest_table itself, for
+ * 64-bit, or the top-level l4 entry for 32-bit. Either way, remove
+ * the reference to that page.
+ */
if ( is_pv_32bit_vcpu(v) )
{
- l4tab = map_domain_page(_mfn(mfn));
- mfn = l4e_get_pfn(*l4tab);
- }
+ l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn));
- if ( mfn )
- {
- page = mfn_to_page(_mfn(mfn));
- if ( paging_mode_refcounts(v->domain) )
- put_page(page);
- else
- rc = put_page_and_type_preemptible(page);
- }
-
- if ( l4tab )
- {
- if ( !rc )
- l4e_write(l4tab, l4e_empty());
+ mfn = l4e_get_pfn(*l4tab);
+ l4e_write(l4tab, l4e_empty());
unmap_domain_page(l4tab);
}
- else if ( !rc )
+ else
{
v->arch.guest_table = pagetable_null();
+ put_guest_table_user = true;
+ }
- /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
- mfn = pagetable_get_pfn(v->arch.guest_table_user);
+ /* Free that page if non-zero */
+ do {
if ( mfn )
{
page = mfn_to_page(_mfn(mfn));
@@ -3183,18 +3179,41 @@ int vcpu_destroy_pagetables(struct vcpu *v)
put_page(page);
else
rc = put_page_and_type_preemptible(page);
+ mfn = 0;
}
- if ( !rc )
- v->arch.guest_table_user = pagetable_null();
- }
- v->arch.cr3 = 0;
+ if ( !rc && put_guest_table_user )
+ {
+ /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
+ mfn = pagetable_get_pfn(v->arch.guest_table_user);
+ v->arch.guest_table_user = pagetable_null();
+ put_guest_table_user = false;
+ }
+ } while ( mfn );
/*
- * put_page_and_type_preemptible() is liable to return -EINTR. The
- * callers of us expect -ERESTART so convert it over.
+ * If a "put" operation was interrupted, finish things off in
+ * put_old_guest_table() when the operation is restarted.
*/
- return rc != -EINTR ? rc : -ERESTART;
+ switch ( rc )
+ {
+ case -EINTR:
+ case -ERESTART:
+ v->arch.old_guest_ptpg = NULL;
+ v->arch.old_guest_table = page;
+ v->arch.old_guest_table_partial = (rc == -ERESTART);
+ rc = -ERESTART;
+ break;
+ default:
+ /*
+ * Failure to 'put' a page may cause it to leak, but that's
+ * less bad than a crash.
+ */
+ ASSERT(rc == 0);
+ break;
+ }
+
+ return rc;
}
int new_guest_cr3(mfn_t mfn)
--
2.24.0
From 128cb126aee9b4a2855ab898fdfbfe7009fbf1f5 Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Thu, 31 Oct 2019 11:17:38 +0000
Subject: [PATCH 2/3] x86/mm: alloc/free_lN_table: Retain partial_flags on
-EINTR
When validating or de-validating pages (in alloc_lN_table and
free_lN_table respectively), the `partial_flags` local variable is
used to keep track of whether the "current" PTE started the entire
operation in a "may be partial" state.
One of the patches in XSA-299 addressed the fact that it is possible
for a previously-partially-validated entry to subsequently be found to
have invalid entries (indicated by returning -EINVAL); in which case
page->partial_flags needs to be set to indicate that the current PTE
may have the partial bit set (and thus _put_page_type() should be
called with PTF_partial_set).
Unfortunately, the patches in XSA-299 assumed that once
put_page_from_lNe() returned -ERESTART on a page, it was not possible
for it to return -EINTR. This turns out to be true for
alloc_lN_table() and free_lN_table(), but not for _get_page_type() and
_put_page_type(): both can return -EINTR when called on pages with
PGT_partial set. In these cases, the page's PGT_partial will still be
set; failing to set partial_flags appropriately may allow an attacker
to do a privilege escalation similar to those described in XSA-299.
Fix this by always copying the local partial_flags variable into
page->partial_flags when exiting early.
NB that on the "get" side, no adjustment to nr_validated_entries is
needed: whether pte[i] is partially validated or entirely
un-validated, we want nr_validated_entries = i. On the "put" side,
however, we need to adjust nr_validated_entries appropriately: if
pte[i] is entirely validated, we want nr_validated_entries = i + 1; if
pte[i] is partially validated, we want nr_validated_entries = i.
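A trivial stand-alone snippet showing how the adjustment used in the hunks
below behaves; the PTF_partial_set value is an assumed placeholder, not
Xen's definition.

#include <stdio.h>

#define PTF_partial_set 0x1u   /* placeholder value for illustration */

int main(void)
{
    unsigned int i = 7, partial_flags;

    partial_flags = 0;                 /* pte[i] entirely validated */
    printf("%u\n", i + !(partial_flags & PTF_partial_set));   /* 8 = i + 1 */

    partial_flags = PTF_partial_set;   /* pte[i] only partially validated */
    printf("%u\n", i + !(partial_flags & PTF_partial_set));   /* 7 = i */
    return 0;
}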
This is part of XSA-310.
Reported-by: Sarah Newman <srn@prgmr.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
xen/arch/x86/mm.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index a759afc9e3..97c8d73b7b 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1557,7 +1557,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
if ( rc == -EINTR && i )
{
page->nr_validated_ptes = i;
- page->partial_flags = 0;
+ page->partial_flags = partial_flags;
rc = -ERESTART;
}
else if ( rc < 0 && rc != -EINTR )
@@ -1660,7 +1660,7 @@ static int alloc_l3_table(struct page_info *page)
else if ( rc == -EINTR && i )
{
page->nr_validated_ptes = i;
- page->partial_flags = 0;
+ page->partial_flags = partial_flags;
rc = -ERESTART;
}
if ( rc < 0 )
@@ -1982,8 +1982,8 @@ static int free_l2_table(struct page_info *page)
}
else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
{
- page->nr_validated_ptes = i + 1;
- page->partial_flags = 0;
+ page->nr_validated_ptes = i + !(partial_flags & PTF_partial_set);
+ page->partial_flags = partial_flags;
rc = -ERESTART;
}
@@ -2030,8 +2030,8 @@ static int free_l3_table(struct page_info *page)
}
else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
{
- page->nr_validated_ptes = i + 1;
- page->partial_flags = 0;
+ page->nr_validated_ptes = i + !(partial_flags & PTF_partial_set);
+ page->partial_flags = partial_flags;
rc = -ERESTART;
}
return rc > 0 ? 0 : rc;
@@ -2061,8 +2061,8 @@ static int free_l4_table(struct page_info *page)
}
else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
{
- page->nr_validated_ptes = i + 1;
- page->partial_flags = 0;
+ page->nr_validated_ptes = i + !(partial_flags & PTF_partial_set);
+ page->partial_flags = partial_flags;
rc = -ERESTART;
}
--
2.24.0
From e9f835982a726ae16997c566b5eafab74f8b4cb7 Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Mon, 28 Oct 2019 14:33:51 +0000
Subject: [PATCH 3/3] x86/mm: relinquish_memory: Grab an extra type ref when
setting PGT_partial
The PGT_partial bit in page->type_info holds both a type count and a
general ref count. During domain tear-down, when free_page_type()
returns -ERESTART, relinquish_memory() correctly handles the general
ref count, but fails to grab an extra type count when setting
PGT_partial. When this bit is eventually cleared, type_count underflows
and triggers the following BUG in page_alloc.c:free_domheap_pages():
BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
As far as we can tell, this page underflow cannot be exploited any
other way: The page can't be used as a pagetable by the dying domain
because it's dying; it can't be used as a pagetable by any other
domain since it belongs to the dying domain; and ownership can't
transfer to any other domain without hitting the BUG_ON() in
free_domheap_pages().
(steal_page() won't work on a page in this state, since it requires
PGC_allocated to be set, and PGC_allocated will already have been
cleared.)
Fix this by grabbing an extra type ref if setting PGT_partial in
relinquish_memory.
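The underflow, and the effect of the extra reference, can be modelled with
a small stand-alone program. The constants, struct layout and helper names
below are illustrative assumptions rather than Xen's definitions.

#include <stdio.h>

#define PGT_partial    0x80000000u
#define PGT_count_mask 0x0000ffffu

struct page { unsigned int type_info; };

static void set_partial(struct page *pg, int take_extra_ref)
{
    if (take_extra_ref)
        pg->type_info++;       /* the type ref the fix adds */
    pg->type_info |= PGT_partial;
}

static void clear_partial(struct page *pg)
{
    pg->type_info &= ~PGT_partial;
    pg->type_info--;           /* drop the ref backing PGT_partial */
}

int main(void)
{
    /* free_page_type() returned -ERESTART with one type ref outstanding. */
    struct page pg = { .type_info = 1 };

    set_partial(&pg, /* take_extra_ref = */ 1);
    pg.type_info--;            /* tear-down drops its own type ref */
    clear_partial(&pg);

    /* With take_extra_ref = 0 the count would wrap below zero here,
     * tripping the BUG_ON((type_info & PGT_count_mask) != 0) quoted above. */
    printf("remaining type count: %u\n", pg.type_info & PGT_count_mask);
    return 0;
}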
This is part of XSA-310.
Reported-by: Sarah Newman <srn@prgmr.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
---
v2:
- Move discussion of potential exploits into the commit message
- Keep PGT_partial and put_page() ordering
---
xen/arch/x86/domain.c | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index f1dd86e12e..51880fc50d 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2049,6 +2049,25 @@ static int relinquish_memory(
goto out;
case -ERESTART:
page_list_add(page, list);
+ /*
+ * PGT_partial holds a type ref and a general ref.
+ * If we came in with PGT_partial set, then we 1)
+ * don't need to grab an extra type count, and 2)
+ * do need to drop the extra page ref we grabbed
+ * at the top of the loop. If we didn't come in
+ * with PGT_partial set, we 1) do need to grab an
+ * extra type count, but 2) can transfer the page
+ * ref we grabbed above to it.
+ *
+ * Note that we must increment type_info before
+ * setting PGT_partial. Theoretically it should
+ * be safe to drop the page ref before setting
+ * PGT_partial, but do it afterwards just to be