diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD index 8845418d5cb0c8e79f1a40b9c4cf0a6eda57a7a4..0945e70d92a0eb051d6a565d2f736f00f4c4cf1e 100644 --- a/main/xen/APKBUILD +++ b/main/xen/APKBUILD @@ -1,8 +1,8 @@ # Contributor: Roger Pau Monne <roger.pau@entel.upc.edu> # Maintainer: Natanael Copa <ncopa@alpinelinux.org> pkgname=xen -pkgver=4.18.0 -pkgrel=5 +pkgver=4.18.2 +pkgrel=0 pkgdesc="Xen hypervisor" url="https://www.xenproject.org/" arch="x86_64 armv7 aarch64" @@ -367,6 +367,10 @@ options="!strip" # 4.18.0-r5: # - CVE-2023-28746 XSA-452 # - CVE-2024-2193 XSA-453 +# 4.18.2-r0: +# - CVE-2023-46842 XSA-454 +# - CVE-2024-31142 XSA-455 +# - CVE-2024-2201 XSA-456 case "$CARCH" in x86*) @@ -412,8 +416,6 @@ source="https://downloads.xenproject.org/release/xen/$pkgver/xen-$pkgver.tar.gz https://xenbits.xen.org/xen-extfiles/zlib-$_ZLIB_VERSION.tar.gz https://xenbits.xen.org/xen-extfiles/ipxe-git-$_IPXE_GIT_TAG.tar.gz - xen-stable-4.18-20240312.patch - mini-os-__divmoddi4.patch qemu-xen_paths.patch @@ -700,8 +702,7 @@ qemu_openrc() { } sha512sums=" -4cc9fd155144045a173c5f8ecc45f149817f1034eec618cb6f8b0494ef2fb5b95c4c60cf0bf4bec4bef8a622c35b6a3cb7dedc38e6d95e726f1611c73ddb3273 xen-4.18.0.tar.gz -8df958195290a39b54493766e7555d71c68083d75edd13a2f77ad237d6b6fb52bce816b9e975c0c14024a01042e599415360dcf475f7d2e0c6bee8f9fd2ed6ef xen-stable-4.18-20240312.patch +c5feb450155883b5d2e7f43b05a64e7215b661b7d2f438d8f5a0896bd57283379ee11ca8e2e7a1d8787813cc6f1a260253fcb8688ed7d61a2bfb636db1626941 xen-4.18.2.tar.gz 2e0b0fd23e6f10742a5517981e5171c6e88b0a93c83da701b296f5c0861d72c19782daab589a7eac3f9032152a0fc7eff7f5362db8fccc4859564a9aa82329cf gmp-4.3.2.tar.bz2 c2bc9ffc8583aeae71cee9ddcc4418969768d4e3764d47307da54f93981c0109fb07d84b061b3a3628bd00ba4d14a54742bc04848110eb3ae8ca25dbfbaabadb grub-0.97.tar.gz 1465b58279af1647f909450e394fe002ca165f0ff4a0254bfa9fe0e64316f50facdde2729d79a4e632565b4500cf4d6c74192ac0dd3bc9fe09129bbd67ba089d lwip-1.3.0.tar.gz diff --git a/main/xen/xen-stable-4.18-20240312.patch b/main/xen/xen-stable-4.18-20240312.patch deleted file mode 100644 index 78d40c444599019de20dfbd739a99b80d92d4ea8..0000000000000000000000000000000000000000 --- a/main/xen/xen-stable-4.18-20240312.patch +++ /dev/null @@ -1,8490 +0,0 @@ -From 52be29df793f282822436c8c13e0948a01aee1ad Mon Sep 17 00:00:00 2001 -From: Tamas K Lengyel <tamas@tklengyel.com> -Date: Thu, 23 Nov 2023 12:10:46 +0100 -Subject: [PATCH 01/70] x86/mem_sharing: add missing m2p entry when mapping - shared_info page - -When mapping in the shared_info page to a fork the m2p entry wasn't set -resulting in the shared_info being reset even when the fork reset was called -with only reset_state and not reset_memory. This results in an extra -unnecessary TLB flush. 
- -Fixes: 1a0000ac775 ("mem_sharing: map shared_info page to same gfn during fork") -Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 23eb39acf011ef9bbe02ed4619c55f208fbcd39b -master date: 2023-10-31 16:10:14 +0000 ---- - xen/arch/x86/mm/mem_sharing.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c -index 94b6b782ef..142258f16a 100644 ---- a/xen/arch/x86/mm/mem_sharing.c -+++ b/xen/arch/x86/mm/mem_sharing.c -@@ -1847,6 +1847,8 @@ static int copy_special_pages(struct domain *cd, struct domain *d) - p2m_ram_rw, p2m->default_access, -1); - if ( rc ) - return rc; -+ -+ set_gpfn_from_mfn(mfn_x(new_mfn), gfn_x(old_gfn)); - } - } - --- -2.44.0 - - -From 880e06fdea401493a3f408deb0f411f7aeccee27 Mon Sep 17 00:00:00 2001 -From: David Woodhouse <dwmw@amazon.co.uk> -Date: Thu, 23 Nov 2023 12:11:21 +0100 -Subject: [PATCH 02/70] x86/pv-shim: fix grant table operations for 32-bit - guests - -When switching to call the shim functions from the normal handlers, the -compat_grant_table_op() function was omitted, leaving it calling the -real grant table operations in !PV_SHIM_EXCLUSIVE builds. This leaves a -32-bit shim guest failing to set up its real grant table with the parent -hypervisor. - -Fixes: e7db635f4428 ("x86/pv-shim: Don't modify the hypercall table") -Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 93ec30bc545f15760039c23ee4b97b80c0b3b3b3 -master date: 2023-10-31 16:10:14 +0000 ---- - xen/common/compat/grant_table.c | 5 +++++ - 1 file changed, 5 insertions(+) - -diff --git a/xen/common/compat/grant_table.c b/xen/common/compat/grant_table.c -index e00bc24a34..af98eade17 100644 ---- a/xen/common/compat/grant_table.c -+++ b/xen/common/compat/grant_table.c -@@ -63,6 +63,11 @@ int compat_grant_table_op( - unsigned int i, cmd_op; - XEN_GUEST_HANDLE_PARAM(void) cnt_uop; - -+#ifdef CONFIG_PV_SHIM -+ if ( unlikely(pv_shim) ) -+ return pv_shim_grant_table_op(cmd, uop, count); -+#endif -+ - set_xen_guest_handle(cnt_uop, NULL); - cmd_op = cmd & GNTTABOP_CMD_MASK; - if ( cmd_op != GNTTABOP_cache_flush ) --- -2.44.0 - - -From 9e8edd4c75564530a6fb98f5abba267edb906313 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Thu, 23 Nov 2023 12:12:18 +0100 -Subject: [PATCH 03/70] x86/x2apic: remove usage of ACPI_FADT_APIC_CLUSTER -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The ACPI FADT APIC_CLUSTER flag mandates that when the interrupt delivery is -Logical mode APIC must be configured for Cluster destination model. However in -apic_x2apic_probe() such flag is incorrectly used to gate whether Physical mode -can be used. - -Since Xen when in x2APIC mode only uses Logical mode together with Cluster -model completely remove checking for ACPI_FADT_APIC_CLUSTER, as Xen always -fulfills the requirement signaled by the flag. 
- -Fixes: eb40ae41b658 ('x86/Kconfig: add option for default x2APIC destination mode') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 26a449ce32cef33f2cb50602be19fcc0c4223ba9 -master date: 2023-11-02 10:50:26 +0100 ---- - xen/arch/x86/genapic/x2apic.c | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c -index ca1db27157..707deef98c 100644 ---- a/xen/arch/x86/genapic/x2apic.c -+++ b/xen/arch/x86/genapic/x2apic.c -@@ -231,8 +231,7 @@ const struct genapic *__init apic_x2apic_probe(void) - */ - x2apic_phys = iommu_intremap != iommu_intremap_full || - (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) || -- (IS_ENABLED(CONFIG_X2APIC_PHYSICAL) && -- !(acpi_gbl_FADT.flags & ACPI_FADT_APIC_CLUSTER)); -+ IS_ENABLED(CONFIG_X2APIC_PHYSICAL); - } - else if ( !x2apic_phys ) - switch ( iommu_intremap ) --- -2.44.0 - - -From fcb1016bbd476e17c72b1837ae2a3eaac517fa52 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Thu, 23 Nov 2023 12:12:47 +0100 -Subject: [PATCH 04/70] x86/i8259: do not assume interrupts always target CPU0 -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Sporadically we have seen the following during AP bringup on AMD platforms -only: - -microcode: CPU59 updated from revision 0x830107a to 0x830107a, date = 2023-05-17 -microcode: CPU60 updated from revision 0x830104d to 0x830107a, date = 2023-05-17 -CPU60: No irq handler for vector 27 (IRQ -2147483648) -microcode: CPU61 updated from revision 0x830107a to 0x830107a, date = 2023-05-17 - -This is similar to the issue raised on Linux commit 36e9e1eab777e, where they -observed i8259 (active) vectors getting delivered to CPUs different than 0. - -On AMD or Hygon platforms adjust the target CPU mask of i8259 interrupt -descriptors to contain all possible CPUs, so that APs will reserve the vector -at startup if any legacy IRQ is still delivered through the i8259. Note that -if the IO-APIC takes over those interrupt descriptors the CPU mask will be -reset. - -Spurious i8259 interrupt vectors however (IRQ7 and IRQ15) can be injected even -when all i8259 pins are masked, and hence would need to be handled on all CPUs. - -Continue to reserve PIC vectors on CPU0 only, but do check for such spurious -interrupts on all CPUs if the vendor is AMD or Hygon. Note that once the -vectors get used by devices detecting PIC spurious interrupts will no longer be -possible, however the device driver should be able to cope with spurious -interrupts. Such PIC spurious interrupts occurring when the vector is in use -by a local APIC routed source will lead to an extra EOI, which might -unintentionally clear a different vector from ISR. Note this is already the -current behavior, so assume it's infrequent enough to not cause real issues. 
- -Finally, adjust the printed message to display the CPU where the spurious -interrupt has been received, so it looks like: - -microcode: CPU1 updated from revision 0x830107a to 0x830107a, date = 2023-05-17 -cpu1: spurious 8259A interrupt: IRQ7 -microcode: CPU2 updated from revision 0x830104d to 0x830107a, date = 2023-05-17 - -Amends: 3fba06ba9f8b ('x86/IRQ: re-use legacy vector ranges on APs') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 87f37449d586b4d407b75235bb0a171e018e25ec -master date: 2023-11-02 10:50:59 +0100 ---- - xen/arch/x86/i8259.c | 21 +++++++++++++++++++-- - xen/arch/x86/irq.c | 11 ++++++++++- - 2 files changed, 29 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/i8259.c b/xen/arch/x86/i8259.c -index ed9f55abe5..e0fa1f96b4 100644 ---- a/xen/arch/x86/i8259.c -+++ b/xen/arch/x86/i8259.c -@@ -222,7 +222,8 @@ static bool _mask_and_ack_8259A_irq(unsigned int irq) - is_real_irq = false; - /* Report spurious IRQ, once per IRQ line. */ - if (!(spurious_irq_mask & irqmask)) { -- printk("spurious 8259A interrupt: IRQ%d.\n", irq); -+ printk("cpu%u: spurious 8259A interrupt: IRQ%u\n", -+ smp_processor_id(), irq); - spurious_irq_mask |= irqmask; - } - /* -@@ -349,7 +350,23 @@ void __init init_IRQ(void) - continue; - desc->handler = &i8259A_irq_type; - per_cpu(vector_irq, cpu)[LEGACY_VECTOR(irq)] = irq; -- cpumask_copy(desc->arch.cpu_mask, cpumask_of(cpu)); -+ -+ /* -+ * The interrupt affinity logic never targets interrupts to offline -+ * CPUs, hence it's safe to use cpumask_all here. -+ * -+ * Legacy PIC interrupts are only targeted to CPU0, but depending on -+ * the platform they can be distributed to any online CPU in hardware. -+ * Note this behavior has only been observed on AMD hardware. In order -+ * to cope install all active legacy vectors on all CPUs. -+ * -+ * IO-APIC will change the destination mask if/when taking ownership of -+ * the interrupt. -+ */ -+ cpumask_copy(desc->arch.cpu_mask, -+ (boot_cpu_data.x86_vendor & -+ (X86_VENDOR_AMD | X86_VENDOR_HYGON) ? &cpumask_all -+ : cpumask_of(cpu))); - desc->arch.vector = LEGACY_VECTOR(irq); - } - -diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c -index f42ad539dc..16d9fceba1 100644 ---- a/xen/arch/x86/irq.c -+++ b/xen/arch/x86/irq.c -@@ -1920,7 +1920,16 @@ void do_IRQ(struct cpu_user_regs *regs) - kind = ""; - if ( !(vector >= FIRST_LEGACY_VECTOR && - vector <= LAST_LEGACY_VECTOR && -- !smp_processor_id() && -+ (!smp_processor_id() || -+ /* -+ * For AMD/Hygon do spurious PIC interrupt -+ * detection on all CPUs, as it has been observed -+ * that during unknown circumstances spurious PIC -+ * interrupts have been delivered to CPUs -+ * different than the BSP. -+ */ -+ (boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | -+ X86_VENDOR_HYGON))) && - bogus_8259A_irq(vector - FIRST_LEGACY_VECTOR)) ) - { - printk("CPU%u: No irq handler for vector %02x (IRQ %d%s)\n", --- -2.44.0 - - -From 40bfa9dd57f1efdd0f0dc974e80a438d9db90874 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Thu, 23 Nov 2023 12:13:31 +0100 -Subject: [PATCH 05/70] x86/spec-ctrl: Add SRSO whitepaper URL - -... now that it exists in public. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 78a86b26868c12ae1cc3dd2a8bb9aa5eebaa41fd -master date: 2023-11-07 17:47:34 +0000 ---- - xen/arch/x86/spec_ctrl.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 6fd7d44ce4..a8d8af22f6 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -903,6 +903,9 @@ static bool __init should_use_eager_fpu(void) - } - } - -+/* -+ * https://www.amd.com/content/dam/amd/en/documents/corporate/cr/speculative-return-stack-overflow-whitepaper.pdf -+ */ - static void __init srso_calculations(bool hw_smt_enabled) - { - if ( !(boot_cpu_data.x86_vendor & --- -2.44.0 - - -From 3f9390fea5c51a6d64596d295902d28931eeca4c Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Thu, 23 Nov 2023 12:13:53 +0100 -Subject: [PATCH 06/70] xen/sched: fix sched_move_domain() - -When moving a domain out of a cpupool running with the credit2 -scheduler and having multiple run-queues, the following ASSERT() can -be observed: - -(XEN) Xen call trace: -(XEN) [<ffff82d04023a700>] R credit2.c#csched2_unit_remove+0xe3/0xe7 -(XEN) [<ffff82d040246adb>] S sched_move_domain+0x2f3/0x5b1 -(XEN) [<ffff82d040234cf7>] S cpupool.c#cpupool_move_domain_locked+0x1d/0x3b -(XEN) [<ffff82d040236025>] S cpupool_move_domain+0x24/0x35 -(XEN) [<ffff82d040206513>] S domain_kill+0xa5/0x116 -(XEN) [<ffff82d040232b12>] S do_domctl+0xe5f/0x1951 -(XEN) [<ffff82d0402276ba>] S timer.c#timer_lock+0x69/0x143 -(XEN) [<ffff82d0402dc71b>] S pv_hypercall+0x44e/0x4a9 -(XEN) [<ffff82d0402012b7>] S lstar_enter+0x137/0x140 -(XEN) -(XEN) -(XEN) **************************************** -(XEN) Panic on CPU 1: -(XEN) Assertion 'svc->rqd == c2rqd(sched_unit_master(unit))' failed at common/sched/credit2.c:1159 -(XEN) **************************************** - -This is happening as sched_move_domain() is setting a different cpu -for a scheduling unit without telling the scheduler. When this unit is -removed from the scheduler, the ASSERT() will trigger. - -In non-debug builds the result is usually a clobbered pointer, leading -to another crash a short time later. - -Fix that by swapping the two involved actions (setting another cpu and -removing the unit from the scheduler). - -Link: https://github.com/Dasharo/dasharo-issues/issues/488 -Fixes: 70fadc41635b ("xen/cpupool: support moving domain between cpupools with different granularity") -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: George Dunlap <george.dunlap@cloud.com> -master commit: 4709ec82917668c2df958ef91b4f21c049c76bee -master date: 2023-11-20 10:49:29 +0100 ---- - xen/common/sched/core.c | 12 +++++++----- - 1 file changed, 7 insertions(+), 5 deletions(-) - -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index 12deefa745..eba0cea4bb 100644 ---- a/xen/common/sched/core.c -+++ b/xen/common/sched/core.c -@@ -732,18 +732,20 @@ int sched_move_domain(struct domain *d, struct cpupool *c) - old_domdata = d->sched_priv; - - /* -- * Temporarily move all units to same processor to make locking -- * easier when moving the new units to the new processors. -+ * Remove all units from the old scheduler, and temporarily move them to -+ * the same processor to make locking easier when moving the new units to -+ * new processors. 
- */ - new_p = cpumask_first(d->cpupool->cpu_valid); - for_each_sched_unit ( d, unit ) - { -- spinlock_t *lock = unit_schedule_lock_irq(unit); -+ spinlock_t *lock; -+ -+ sched_remove_unit(old_ops, unit); - -+ lock = unit_schedule_lock_irq(unit); - sched_set_res(unit, get_sched_res(new_p)); - spin_unlock_irq(lock); -- -- sched_remove_unit(old_ops, unit); - } - - old_units = d->sched_unit_list; --- -2.44.0 - - -From 90a6d821757edf1202c527143b8a05b0d2a3dfaa Mon Sep 17 00:00:00 2001 -From: Frediano Ziglio <frediano.ziglio@cloud.com> -Date: Wed, 6 Dec 2023 10:37:13 +0100 -Subject: [PATCH 07/70] x86/mem_sharing: Release domain if we are not able to - enable memory sharing - -In case it's not possible to enable memory sharing (mem_sharing_control -fails) we just return the error code without releasing the domain -acquired some lines above by rcu_lock_live_remote_domain_by_id(). - -Fixes: 72f8d45d69b8 ("x86/mem_sharing: enable mem_sharing on first memop") -Signed-off-by: Frediano Ziglio <frediano.ziglio@cloud.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Tamas K Lengyel <tamas@tklengyel.com> -master commit: fbcec32d6d3ea0ac329301925b317478316209ed -master date: 2023-11-27 12:06:13 +0000 ---- - xen/arch/x86/mm/mem_sharing.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c -index 142258f16a..429d27ef85 100644 ---- a/xen/arch/x86/mm/mem_sharing.c -+++ b/xen/arch/x86/mm/mem_sharing.c -@@ -2013,7 +2013,7 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg) - - if ( !mem_sharing_enabled(d) && - (rc = mem_sharing_control(d, true, 0)) ) -- return rc; -+ goto out; - - switch ( mso.op ) - { --- -2.44.0 - - -From 480168fcb3135f0da6e7a6b3b754c78fabc24d4f Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Wed, 6 Dec 2023 10:38:03 +0100 -Subject: [PATCH 08/70] livepatch: do not use .livepatch.funcs section to store - internal state -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Currently the livepatch logic inside of Xen will use fields of struct -livepatch_func in order to cache internal state of patched functions. Note -this is a field that is part of the payload, and is loaded as an ELF section -(.livepatch.funcs), taking into account the SHF_* flags in the section -header. - -The flags for the .livepatch.funcs section, as set by livepatch-build-tools, -are SHF_ALLOC, which leads to its contents (the array of livepatch_func -structures) being placed in read-only memory: - -Section Headers: - [Nr] Name Type Address Offset - Size EntSize Flags Link Info Align -[...] - [ 4] .livepatch.funcs PROGBITS 0000000000000000 00000080 - 0000000000000068 0000000000000000 A 0 0 8 - -This previously went unnoticed, as all writes to the fields of livepatch_func -happen in the critical region that had WP disabled in CR0. After 8676092a0f16 -however WP is no longer toggled in CR0 for patch application, and only the -hypervisor .text mappings are made write-accessible. That leads to the -following page fault when attempting to apply a livepatch: - -----[ Xen-4.19-unstable x86_64 debug=y Tainted: C ]---- -CPU: 4 -RIP: e008:[<ffff82d040221e81>] common/livepatch.c#apply_payload+0x45/0x1e1 -[...] 
-Xen call trace: - [<ffff82d040221e81>] R common/livepatch.c#apply_payload+0x45/0x1e1 - [<ffff82d0402235b2>] F check_for_livepatch_work+0x385/0xaa5 - [<ffff82d04032508f>] F arch/x86/domain.c#idle_loop+0x92/0xee - -Pagetable walk from ffff82d040625079: - L4[0x105] = 000000008c6c9063 ffffffffffffffff - L3[0x141] = 000000008c6c6063 ffffffffffffffff - L2[0x003] = 000000086a1e7063 ffffffffffffffff - L1[0x025] = 800000086ca5d121 ffffffffffffffff - -**************************************** -Panic on CPU 4: -FATAL PAGE FAULT -[error_code=0003] -Faulting linear address: ffff82d040625079 -**************************************** - -Fix this by moving the internal Xen function patching state out of -livepatch_func into an area not allocated as part of the ELF payload. While -there also constify the array of livepatch_func structures in order to prevent -further surprises. - -Note there's still one field (old_addr) that gets set during livepatch load. I -consider this fine since the field is read-only after load, and at the point -the field gets set the underlying mapping hasn't been made read-only yet. - -Fixes: 8676092a0f16 ('x86/livepatch: Fix livepatch application when CET is active') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com> - -xen/livepatch: fix livepatch tests - -The current set of in-tree livepatch tests in xen/test/livepatch started -failing after the constify of the payload funcs array, and the movement of the -status data into a separate array. - -Fix the tests so they respect the constness of the funcs array and also make -use of the new location of the per-func state data. - -Fixes: 82182ad7b46e ('livepatch: do not use .livepatch.funcs section to store internal state') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com> -master commit: 82182ad7b46e0f7a3856bb12c7a9bf2e2a4570bc -master date: 2023-11-27 15:16:01 +0100 -master commit: 902377b690f42ddf44ae91c4b0751d597f1cd694 -master date: 2023-11-29 10:46:42 +0000 ---- - xen/arch/arm/arm32/livepatch.c | 9 +++-- - xen/arch/arm/arm64/livepatch.c | 9 +++-- - xen/arch/arm/livepatch.c | 9 +++-- - xen/arch/x86/livepatch.c | 26 +++++++------ - xen/common/livepatch.c | 25 ++++++++---- - xen/include/public/sysctl.h | 5 +-- - xen/include/xen/livepatch.h | 38 +++++++++++++------ - xen/include/xen/livepatch_payload.h | 3 +- - xen/test/livepatch/xen_action_hooks.c | 12 +++--- - xen/test/livepatch/xen_action_hooks_marker.c | 20 ++++++---- - xen/test/livepatch/xen_action_hooks_noapply.c | 22 ++++++----- - xen/test/livepatch/xen_action_hooks_nofunc.c | 6 +-- - .../livepatch/xen_action_hooks_norevert.c | 24 +++++++----- - xen/test/livepatch/xen_prepost_hooks.c | 8 ++-- - xen/test/livepatch/xen_prepost_hooks_fail.c | 2 +- - 15 files changed, 130 insertions(+), 88 deletions(-) - -diff --git a/xen/arch/arm/arm32/livepatch.c b/xen/arch/arm/arm32/livepatch.c -index 3c50283b2a..80d2659b78 100644 ---- a/xen/arch/arm/arm32/livepatch.c -+++ b/xen/arch/arm/arm32/livepatch.c -@@ -11,23 +11,24 @@ - #include <asm/page.h> - #include <asm/livepatch.h> - --void arch_livepatch_apply(struct livepatch_func *func) -+void arch_livepatch_apply(const struct livepatch_func *func, -+ struct livepatch_fstate *state) - { - uint32_t insn; - uint32_t *new_ptr; - unsigned int i, len; - -- BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE > sizeof(func->opaque)); -+ BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE > 
sizeof(state->insn_buffer)); - BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE != sizeof(insn)); - - ASSERT(vmap_of_xen_text); - -- len = livepatch_insn_len(func); -+ len = livepatch_insn_len(func, state); - if ( !len ) - return; - - /* Save old ones. */ -- memcpy(func->opaque, func->old_addr, len); -+ memcpy(state->insn_buffer, func->old_addr, len); - - if ( func->new_addr ) - { -diff --git a/xen/arch/arm/arm64/livepatch.c b/xen/arch/arm/arm64/livepatch.c -index 62d2ef373a..df2cebedde 100644 ---- a/xen/arch/arm/arm64/livepatch.c -+++ b/xen/arch/arm/arm64/livepatch.c -@@ -15,23 +15,24 @@ - #include <asm/insn.h> - #include <asm/livepatch.h> - --void arch_livepatch_apply(struct livepatch_func *func) -+void arch_livepatch_apply(const struct livepatch_func *func, -+ struct livepatch_fstate *state) - { - uint32_t insn; - uint32_t *new_ptr; - unsigned int i, len; - -- BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE > sizeof(func->opaque)); -+ BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE > sizeof(state->insn_buffer)); - BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE != sizeof(insn)); - - ASSERT(vmap_of_xen_text); - -- len = livepatch_insn_len(func); -+ len = livepatch_insn_len(func, state); - if ( !len ) - return; - - /* Save old ones. */ -- memcpy(func->opaque, func->old_addr, len); -+ memcpy(state->insn_buffer, func->old_addr, len); - - if ( func->new_addr ) - insn = aarch64_insn_gen_branch_imm((unsigned long)func->old_addr, -diff --git a/xen/arch/arm/livepatch.c b/xen/arch/arm/livepatch.c -index d646379c8c..bbca1e5a5e 100644 ---- a/xen/arch/arm/livepatch.c -+++ b/xen/arch/arm/livepatch.c -@@ -69,7 +69,7 @@ void arch_livepatch_revive(void) - int arch_livepatch_verify_func(const struct livepatch_func *func) - { - /* If NOPing only do up to maximum amount we can put in the ->opaque. */ -- if ( !func->new_addr && (func->new_size > sizeof(func->opaque) || -+ if ( !func->new_addr && (func->new_size > LIVEPATCH_OPAQUE_SIZE || - func->new_size % ARCH_PATCH_INSN_SIZE) ) - return -EOPNOTSUPP; - -@@ -79,15 +79,16 @@ int arch_livepatch_verify_func(const struct livepatch_func *func) - return 0; - } - --void arch_livepatch_revert(const struct livepatch_func *func) -+void arch_livepatch_revert(const struct livepatch_func *func, -+ struct livepatch_fstate *state) - { - uint32_t *new_ptr; - unsigned int len; - - new_ptr = func->old_addr - (void *)_start + vmap_of_xen_text; - -- len = livepatch_insn_len(func); -- memcpy(new_ptr, func->opaque, len); -+ len = livepatch_insn_len(func, state); -+ memcpy(new_ptr, state->insn_buffer, len); - - clean_and_invalidate_dcache_va_range(new_ptr, len); - } -diff --git a/xen/arch/x86/livepatch.c b/xen/arch/x86/livepatch.c -index a54d991c5f..ee539f001b 100644 ---- a/xen/arch/x86/livepatch.c -+++ b/xen/arch/x86/livepatch.c -@@ -95,7 +95,7 @@ int arch_livepatch_verify_func(const struct livepatch_func *func) - if ( !func->new_addr ) - { - /* Only do up to maximum amount we can put in the ->opaque. */ -- if ( func->new_size > sizeof(func->opaque) ) -+ if ( func->new_size > LIVEPATCH_OPAQUE_SIZE ) - return -EOPNOTSUPP; - - if ( func->old_size < func->new_size ) -@@ -123,13 +123,14 @@ int arch_livepatch_verify_func(const struct livepatch_func *func) - * "noinline" to cause control flow change and thus invalidate I$ and - * cause refetch after modification. 
- */ --void noinline arch_livepatch_apply(struct livepatch_func *func) -+void noinline arch_livepatch_apply(const struct livepatch_func *func, -+ struct livepatch_fstate *state) - { - uint8_t *old_ptr; -- uint8_t insn[sizeof(func->opaque)]; -+ uint8_t insn[sizeof(state->insn_buffer)]; - unsigned int len; - -- func->patch_offset = 0; -+ state->patch_offset = 0; - old_ptr = func->old_addr; - - /* -@@ -141,14 +142,14 @@ void noinline arch_livepatch_apply(struct livepatch_func *func) - * ENDBR64 or similar instructions). - */ - if ( is_endbr64(old_ptr) || is_endbr64_poison(func->old_addr) ) -- func->patch_offset += ENDBR64_LEN; -+ state->patch_offset += ENDBR64_LEN; - - /* This call must be done with ->patch_offset already set. */ -- len = livepatch_insn_len(func); -+ len = livepatch_insn_len(func, state); - if ( !len ) - return; - -- memcpy(func->opaque, old_ptr + func->patch_offset, len); -+ memcpy(state->insn_buffer, old_ptr + state->patch_offset, len); - if ( func->new_addr ) - { - int32_t val; -@@ -156,7 +157,7 @@ void noinline arch_livepatch_apply(struct livepatch_func *func) - BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE != (1 + sizeof(val))); - - insn[0] = 0xe9; /* Relative jump. */ -- val = func->new_addr - (func->old_addr + func->patch_offset + -+ val = func->new_addr - (func->old_addr + state->patch_offset + - ARCH_PATCH_INSN_SIZE); - - memcpy(&insn[1], &val, sizeof(val)); -@@ -164,17 +165,18 @@ void noinline arch_livepatch_apply(struct livepatch_func *func) - else - add_nops(insn, len); - -- memcpy(old_ptr + func->patch_offset, insn, len); -+ memcpy(old_ptr + state->patch_offset, insn, len); - } - - /* - * "noinline" to cause control flow change and thus invalidate I$ and - * cause refetch after modification. - */ --void noinline arch_livepatch_revert(const struct livepatch_func *func) -+void noinline arch_livepatch_revert(const struct livepatch_func *func, -+ struct livepatch_fstate *state) - { -- memcpy(func->old_addr + func->patch_offset, func->opaque, -- livepatch_insn_len(func)); -+ memcpy(func->old_addr + state->patch_offset, state->insn_buffer, -+ livepatch_insn_len(func, state)); - } - - /* -diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c -index d89a904bd4..e635606c10 100644 ---- a/xen/common/livepatch.c -+++ b/xen/common/livepatch.c -@@ -260,6 +260,9 @@ static void free_payload_data(struct payload *payload) - vfree((void *)payload->text_addr); - - payload->pages = 0; -+ -+ /* fstate gets allocated strictly after move_payload. 
*/ -+ XFREE(payload->fstate); - } - - /* -@@ -656,6 +659,7 @@ static int prepare_payload(struct payload *payload, - { - const struct livepatch_elf_sec *sec; - unsigned int i; -+ struct livepatch_func *funcs; - struct livepatch_func *f; - struct virtual_region *region; - const Elf_Note *n; -@@ -666,14 +670,19 @@ static int prepare_payload(struct payload *payload, - if ( !section_ok(elf, sec, sizeof(*payload->funcs)) ) - return -EINVAL; - -- payload->funcs = sec->load_addr; -+ payload->funcs = funcs = sec->load_addr; - payload->nfuncs = sec->sec->sh_size / sizeof(*payload->funcs); - -+ payload->fstate = xzalloc_array(typeof(*payload->fstate), -+ payload->nfuncs); -+ if ( !payload->fstate ) -+ return -ENOMEM; -+ - for ( i = 0; i < payload->nfuncs; i++ ) - { - int rc; - -- f = &(payload->funcs[i]); -+ f = &(funcs[i]); - - if ( f->version != LIVEPATCH_PAYLOAD_VERSION ) - { -@@ -1361,7 +1370,7 @@ static int apply_payload(struct payload *data) - ASSERT(!local_irq_is_enabled()); - - for ( i = 0; i < data->nfuncs; i++ ) -- common_livepatch_apply(&data->funcs[i]); -+ common_livepatch_apply(&data->funcs[i], &data->fstate[i]); - - arch_livepatch_revive(); - -@@ -1397,7 +1406,7 @@ static int revert_payload(struct payload *data) - } - - for ( i = 0; i < data->nfuncs; i++ ) -- common_livepatch_revert(&data->funcs[i]); -+ common_livepatch_revert(&data->funcs[i], &data->fstate[i]); - - /* - * Since we are running with IRQs disabled and the hooks may call common -@@ -1438,9 +1447,10 @@ static inline bool was_action_consistent(const struct payload *data, livepatch_f - - for ( i = 0; i < data->nfuncs; i++ ) - { -- struct livepatch_func *f = &(data->funcs[i]); -+ const struct livepatch_func *f = &(data->funcs[i]); -+ const struct livepatch_fstate *s = &(data->fstate[i]); - -- if ( f->applied != expected_state ) -+ if ( s->applied != expected_state ) - { - printk(XENLOG_ERR LIVEPATCH "%s: Payload has a function: '%s' with inconsistent applied state.\n", - data->name, f->name ?: "noname"); -@@ -2157,7 +2167,8 @@ static void cf_check livepatch_printall(unsigned char key) - - for ( i = 0; i < data->nfuncs; i++ ) - { -- struct livepatch_func *f = &(data->funcs[i]); -+ const struct livepatch_func *f = &(data->funcs[i]); -+ - printk(" %s patch %p(%u) with %p (%u)\n", - f->name, f->old_addr, f->old_size, f->new_addr, f->new_size); - -diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h -index f1eba78405..9b19679cae 100644 ---- a/xen/include/public/sysctl.h -+++ b/xen/include/public/sysctl.h -@@ -991,10 +991,7 @@ struct livepatch_func { - uint32_t new_size; - uint32_t old_size; - uint8_t version; /* MUST be LIVEPATCH_PAYLOAD_VERSION. 
*/ -- uint8_t opaque[LIVEPATCH_OPAQUE_SIZE]; -- uint8_t applied; -- uint8_t patch_offset; -- uint8_t _pad[6]; -+ uint8_t _pad[39]; - livepatch_expectation_t expect; - }; - typedef struct livepatch_func livepatch_func_t; -diff --git a/xen/include/xen/livepatch.h b/xen/include/xen/livepatch.h -index 9fdb29c382..537d3d58b6 100644 ---- a/xen/include/xen/livepatch.h -+++ b/xen/include/xen/livepatch.h -@@ -13,6 +13,9 @@ struct xen_sysctl_livepatch_op; - - #include <xen/elfstructs.h> - #include <xen/errno.h> /* For -ENOSYS or -EOVERFLOW */ -+ -+#include <public/sysctl.h> /* For LIVEPATCH_OPAQUE_SIZE */ -+ - #ifdef CONFIG_LIVEPATCH - - /* -@@ -51,6 +54,12 @@ struct livepatch_symbol { - bool_t new_symbol; - }; - -+struct livepatch_fstate { -+ unsigned int patch_offset; -+ enum livepatch_func_state applied; -+ uint8_t insn_buffer[LIVEPATCH_OPAQUE_SIZE]; -+}; -+ - int livepatch_op(struct xen_sysctl_livepatch_op *); - void check_for_livepatch_work(void); - unsigned long livepatch_symbols_lookup_by_name(const char *symname); -@@ -87,10 +96,11 @@ void arch_livepatch_init(void); - int arch_livepatch_verify_func(const struct livepatch_func *func); - - static inline --unsigned int livepatch_insn_len(const struct livepatch_func *func) -+unsigned int livepatch_insn_len(const struct livepatch_func *func, -+ const struct livepatch_fstate *state) - { - if ( !func->new_addr ) -- return func->new_size - func->patch_offset; -+ return func->new_size - state->patch_offset; - - return ARCH_PATCH_INSN_SIZE; - } -@@ -117,39 +127,43 @@ int arch_livepatch_safety_check(void); - int arch_livepatch_quiesce(void); - void arch_livepatch_revive(void); - --void arch_livepatch_apply(struct livepatch_func *func); --void arch_livepatch_revert(const struct livepatch_func *func); -+void arch_livepatch_apply(const struct livepatch_func *func, -+ struct livepatch_fstate *state); -+void arch_livepatch_revert(const struct livepatch_func *func, -+ struct livepatch_fstate *state); - void arch_livepatch_post_action(void); - - void arch_livepatch_mask(void); - void arch_livepatch_unmask(void); - --static inline void common_livepatch_apply(struct livepatch_func *func) -+static inline void common_livepatch_apply(const struct livepatch_func *func, -+ struct livepatch_fstate *state) - { - /* If the action has been already executed on this function, do nothing. */ -- if ( func->applied == LIVEPATCH_FUNC_APPLIED ) -+ if ( state->applied == LIVEPATCH_FUNC_APPLIED ) - { - printk(XENLOG_WARNING LIVEPATCH "%s: %s has been already applied before\n", - __func__, func->name); - return; - } - -- arch_livepatch_apply(func); -- func->applied = LIVEPATCH_FUNC_APPLIED; -+ arch_livepatch_apply(func, state); -+ state->applied = LIVEPATCH_FUNC_APPLIED; - } - --static inline void common_livepatch_revert(struct livepatch_func *func) -+static inline void common_livepatch_revert(const struct livepatch_func *func, -+ struct livepatch_fstate *state) - { - /* If the apply action hasn't been executed on this function, do nothing. 
*/ -- if ( !func->old_addr || func->applied == LIVEPATCH_FUNC_NOT_APPLIED ) -+ if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED ) - { - printk(XENLOG_WARNING LIVEPATCH "%s: %s has not been applied before\n", - __func__, func->name); - return; - } - -- arch_livepatch_revert(func); -- func->applied = LIVEPATCH_FUNC_NOT_APPLIED; -+ arch_livepatch_revert(func, state); -+ state->applied = LIVEPATCH_FUNC_NOT_APPLIED; - } - #else - -diff --git a/xen/include/xen/livepatch_payload.h b/xen/include/xen/livepatch_payload.h -index 9f5f064205..b9cd4f2096 100644 ---- a/xen/include/xen/livepatch_payload.h -+++ b/xen/include/xen/livepatch_payload.h -@@ -52,7 +52,8 @@ struct payload { - size_t ro_size; /* .. and its size (if any). */ - unsigned int pages; /* Total pages for [text,rw,ro]_addr */ - struct list_head applied_list; /* Linked to 'applied_list'. */ -- struct livepatch_func *funcs; /* The array of functions to patch. */ -+ const struct livepatch_func *funcs; /* The array of functions to patch. */ -+ struct livepatch_fstate *fstate; /* State of patched functions. */ - unsigned int nfuncs; /* Nr of functions to patch. */ - const struct livepatch_symbol *symtab; /* All symbols. */ - const char *strtab; /* Pointer to .strtab. */ -diff --git a/xen/test/livepatch/xen_action_hooks.c b/xen/test/livepatch/xen_action_hooks.c -index 39b5313027..fa0b3ab35f 100644 ---- a/xen/test/livepatch/xen_action_hooks.c -+++ b/xen/test/livepatch/xen_action_hooks.c -@@ -26,9 +26,10 @@ static int apply_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - -- func->applied = LIVEPATCH_FUNC_APPLIED; -+ fstate->applied = LIVEPATCH_FUNC_APPLIED; - apply_cnt++; - - printk(KERN_DEBUG "%s: applying: %s\n", __func__, func->name); -@@ -47,9 +48,10 @@ static int revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - -- func->applied = LIVEPATCH_FUNC_NOT_APPLIED; -+ fstate->applied = LIVEPATCH_FUNC_NOT_APPLIED; - revert_cnt++; - - printk(KERN_DEBUG "%s: reverting: %s\n", __func__, func->name); -@@ -68,7 +70,7 @@ static void post_revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; - - printk(KERN_DEBUG "%s: reverted: %s\n", __func__, func->name); - } -diff --git a/xen/test/livepatch/xen_action_hooks_marker.c b/xen/test/livepatch/xen_action_hooks_marker.c -index 4f807a577f..d2e22f70d1 100644 ---- a/xen/test/livepatch/xen_action_hooks_marker.c -+++ b/xen/test/livepatch/xen_action_hooks_marker.c -@@ -23,9 +23,10 @@ static int pre_apply_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - -- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); - printk(KERN_DEBUG "%s: pre applied: %s\n", __func__, func->name); - } - -@@ -42,9 +43,10 @@ static void post_apply_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) 
- { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - -- BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); - printk(KERN_DEBUG "%s: post applied: %s\n", __func__, func->name); - } - -@@ -59,9 +61,10 @@ static int pre_revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - -- BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); - printk(KERN_DEBUG "%s: pre reverted: %s\n", __func__, func->name); - } - -@@ -78,9 +81,10 @@ static void post_revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - -- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); - printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name); - } - -diff --git a/xen/test/livepatch/xen_action_hooks_noapply.c b/xen/test/livepatch/xen_action_hooks_noapply.c -index 4c55c156a6..646a5fd2f0 100644 ---- a/xen/test/livepatch/xen_action_hooks_noapply.c -+++ b/xen/test/livepatch/xen_action_hooks_noapply.c -@@ -25,9 +25,10 @@ static int pre_apply_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - -- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); - printk(KERN_DEBUG "%s: pre applied: %s\n", __func__, func->name); - } - -@@ -44,7 +45,7 @@ static int apply_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; - - apply_cnt++; - printk(KERN_DEBUG "%s: applying: %s\n", __func__, func->name); -@@ -63,10 +64,11 @@ static void post_apply_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - - BUG_ON(apply_cnt != 1); -- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); - printk(KERN_DEBUG "%s: post applied: %s\n", __func__, func->name); - } - -@@ -81,9 +83,10 @@ static int pre_revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - -- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); - printk(KERN_DEBUG "%s: pre reverted: %s\n", __func__, func->name); - } - -@@ -100,9 +103,10 @@ static void post_revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = 
&payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - -- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); - printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name); - } - -diff --git a/xen/test/livepatch/xen_action_hooks_nofunc.c b/xen/test/livepatch/xen_action_hooks_nofunc.c -index 2b4e90436f..077c4c1738 100644 ---- a/xen/test/livepatch/xen_action_hooks_nofunc.c -+++ b/xen/test/livepatch/xen_action_hooks_nofunc.c -@@ -23,7 +23,7 @@ static int apply_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; - - apply_cnt++; - printk(KERN_DEBUG "%s: applying: %s\n", __func__, func->name); -@@ -42,7 +42,7 @@ static int revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; - - revert_cnt++; - printk(KERN_DEBUG "%s: reverting: %s\n", __func__, func->name); -@@ -61,7 +61,7 @@ static void post_revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; - - printk(KERN_DEBUG "%s: reverted: %s\n", __func__, func->name); - } -diff --git a/xen/test/livepatch/xen_action_hooks_norevert.c b/xen/test/livepatch/xen_action_hooks_norevert.c -index ef77e72071..3e21ade6ab 100644 ---- a/xen/test/livepatch/xen_action_hooks_norevert.c -+++ b/xen/test/livepatch/xen_action_hooks_norevert.c -@@ -25,9 +25,10 @@ static int pre_apply_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - -- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); - printk(KERN_DEBUG "%s: pre applied: %s\n", __func__, func->name); - } - -@@ -44,9 +45,10 @@ static void post_apply_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - -- BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); - printk(KERN_DEBUG "%s: post applied: %s\n", __func__, func->name); - } - -@@ -61,9 +63,10 @@ static int pre_revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - -- BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); - printk(KERN_DEBUG "%s: pre reverted: %s\n", __func__, func->name); - } - -@@ -80,7 +83,7 @@ static int revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; - - revert_cnt++; - printk(KERN_DEBUG "%s: reverting: %s\n", __func__, func->name); -@@ -99,16 +102,17 @@ static void post_revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; 
i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; -+ struct livepatch_fstate *fstate = &payload->fstate[i]; - - BUG_ON(revert_cnt != 1); -- BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); - - /* Outside of quiesce zone: MAY TRIGGER HOST CRASH/UNDEFINED BEHAVIOR */ - arch_livepatch_quiesce(); - common_livepatch_revert(payload); - arch_livepatch_revive(); -- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); -+ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); - - printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name); - } -diff --git a/xen/test/livepatch/xen_prepost_hooks.c b/xen/test/livepatch/xen_prepost_hooks.c -index 889377d6eb..17f5af6a19 100644 ---- a/xen/test/livepatch/xen_prepost_hooks.c -+++ b/xen/test/livepatch/xen_prepost_hooks.c -@@ -30,7 +30,7 @@ static int pre_apply_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; - - pre_apply_cnt++; - printk(KERN_DEBUG "%s: applying: %s\n", __func__, func->name); -@@ -49,7 +49,7 @@ static void post_apply_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; - - post_apply_cnt++; - printk(KERN_DEBUG "%s: applied: %s\n", __func__, func->name); -@@ -66,7 +66,7 @@ static int pre_revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; - - pre_revert_cnt++; - printk(KERN_DEBUG "%s: reverting: %s\n", __func__, func->name); -@@ -86,7 +86,7 @@ static void post_revert_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; - - post_revert_cnt++; - printk(KERN_DEBUG "%s: reverted: %s\n", __func__, func->name); -diff --git a/xen/test/livepatch/xen_prepost_hooks_fail.c b/xen/test/livepatch/xen_prepost_hooks_fail.c -index c6feb5d32d..52fd7f642e 100644 ---- a/xen/test/livepatch/xen_prepost_hooks_fail.c -+++ b/xen/test/livepatch/xen_prepost_hooks_fail.c -@@ -24,7 +24,7 @@ static int pre_apply_hook(livepatch_payload_t *payload) - - for (i = 0; i < payload->nfuncs; i++) - { -- struct livepatch_func *func = &payload->funcs[i]; -+ const struct livepatch_func *func = &payload->funcs[i]; - - printk(KERN_DEBUG "%s: pre applying: %s\n", __func__, func->name); - } --- -2.44.0 - - -From 61d032e322b178a49983359b0dfd64a42c1f5fca Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Date: Wed, 6 Dec 2023 10:39:15 +0100 -Subject: [PATCH 09/70] xen/x86: In x2APIC mode, derive LDR from APIC ID -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Both Intel and AMD manuals agree that in x2APIC mode, the APIC LDR and ID -registers are derivable from each other through a fixed formula. - -Xen uses that formula, but applies it to vCPU IDs (which are sequential) -rather than x2APIC IDs (which are not, at the moment). As I understand it, -this is an attempt to tightly pack vCPUs into clusters so each cluster has -16 vCPUs rather than 8, but this is a spec violation. 
- -This patch fixes the implementation so we follow the x2APIC spec for new -VMs, while preserving the behaviour (buggy or fixed) for migrated-in VMs. - -While touching that area, remove the existing printk statement in -vlapic_load_fixup() (as the checks it performed didn't make sense in x2APIC -mode and wouldn't affect the outcome) and put another printk as an else -branch so we get warnings trying to load nonsensical LDR values we don't -know about. - -Fixes: f9e0cccf7b35 ("x86/HVM: fix ID handling of x2APIC emulation") -Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 90309854fd2440fb08b4c808f47d7670ba0d250d -master date: 2023-11-29 10:05:55 +0100 ---- - xen/arch/x86/hvm/vlapic.c | 64 +++++++++++++++++++-------- - xen/arch/x86/include/asm/hvm/domain.h | 3 ++ - 2 files changed, 48 insertions(+), 19 deletions(-) - -diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c -index c7ce82d064..ba569043ea 100644 ---- a/xen/arch/x86/hvm/vlapic.c -+++ b/xen/arch/x86/hvm/vlapic.c -@@ -1061,13 +1061,26 @@ static const struct hvm_mmio_ops vlapic_mmio_ops = { - .write = vlapic_mmio_write, - }; - -+static uint32_t x2apic_ldr_from_id(uint32_t id) -+{ -+ return ((id & ~0xf) << 12) | (1 << (id & 0xf)); -+} -+ - static void set_x2apic_id(struct vlapic *vlapic) - { -- u32 id = vlapic_vcpu(vlapic)->vcpu_id; -- u32 ldr = ((id & ~0xf) << 12) | (1 << (id & 0xf)); -+ const struct vcpu *v = vlapic_vcpu(vlapic); -+ uint32_t apic_id = v->vcpu_id * 2; -+ uint32_t apic_ldr = x2apic_ldr_from_id(apic_id); - -- vlapic_set_reg(vlapic, APIC_ID, id * 2); -- vlapic_set_reg(vlapic, APIC_LDR, ldr); -+ /* -+ * Workaround for migrated domains to derive LDRs as the source host -+ * would've. -+ */ -+ if ( v->domain->arch.hvm.bug_x2apic_ldr_vcpu_id ) -+ apic_ldr = x2apic_ldr_from_id(v->vcpu_id); -+ -+ vlapic_set_reg(vlapic, APIC_ID, apic_id); -+ vlapic_set_reg(vlapic, APIC_LDR, apic_ldr); - } - - int guest_wrmsr_apic_base(struct vcpu *v, uint64_t val) -@@ -1498,27 +1511,40 @@ static int cf_check lapic_save_regs(struct vcpu *v, hvm_domain_context_t *h) - */ - static void lapic_load_fixup(struct vlapic *vlapic) - { -- uint32_t id = vlapic->loaded.id; -+ const struct vcpu *v = vlapic_vcpu(vlapic); -+ uint32_t good_ldr = x2apic_ldr_from_id(vlapic->loaded.id); - -- if ( vlapic_x2apic_mode(vlapic) && id && vlapic->loaded.ldr == 1 ) -+ /* Skip fixups on xAPIC mode, or if the x2APIC LDR is already correct */ -+ if ( !vlapic_x2apic_mode(vlapic) || -+ (vlapic->loaded.ldr == good_ldr) ) -+ return; -+ -+ if ( vlapic->loaded.ldr == 1 ) - { -- /* -- * This is optional: ID != 0 contradicts LDR == 1. It's being added -- * to aid in eventual debugging of issues arising from the fixup done -- * here, but can be dropped as soon as it is found to conflict with -- * other (future) changes. -- */ -- if ( GET_xAPIC_ID(id) != vlapic_vcpu(vlapic)->vcpu_id * 2 || -- id != SET_xAPIC_ID(GET_xAPIC_ID(id)) ) -- printk(XENLOG_G_WARNING "%pv: bogus APIC ID %#x loaded\n", -- vlapic_vcpu(vlapic), id); -+ /* -+ * Xen <= 4.4 may have a bug by which all the APICs configured in -+ * x2APIC mode got LDR = 1, which is inconsistent on every vCPU -+ * except for the one with ID = 0. We'll fix the bug now and assign -+ * an LDR value consistent with the APIC ID. -+ */ - set_x2apic_id(vlapic); - } -- else /* Undo an eventual earlier fixup. 
*/ -+ else if ( vlapic->loaded.ldr == x2apic_ldr_from_id(v->vcpu_id) ) - { -- vlapic_set_reg(vlapic, APIC_ID, id); -- vlapic_set_reg(vlapic, APIC_LDR, vlapic->loaded.ldr); -+ /* -+ * Migrations from Xen 4.4 to date (4.19 dev window, Nov 2023) may -+ * have LDR drived from the vCPU ID, not the APIC ID. We must preserve -+ * LDRs so new vCPUs use consistent derivations and existing guests, -+ * which may have already read the LDR at the source host, aren't -+ * surprised when interrupts stop working the way they did at the -+ * other end. -+ */ -+ v->domain->arch.hvm.bug_x2apic_ldr_vcpu_id = true; - } -+ else -+ printk(XENLOG_G_WARNING -+ "%pv: bogus x2APIC record: ID %#x, LDR %#x, expected LDR %#x\n", -+ v, vlapic->loaded.id, vlapic->loaded.ldr, good_ldr); - } - - static int cf_check lapic_load_hidden(struct domain *d, hvm_domain_context_t *h) -diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h -index 6e53ce4449..dd9d837e84 100644 ---- a/xen/arch/x86/include/asm/hvm/domain.h -+++ b/xen/arch/x86/include/asm/hvm/domain.h -@@ -106,6 +106,9 @@ struct hvm_domain { - - bool is_s3_suspended; - -+ /* Compatibility setting for a bug in x2APIC LDR */ -+ bool bug_x2apic_ldr_vcpu_id; -+ - /* hypervisor intercepted msix table */ - struct list_head msixtbl_list; - --- -2.44.0 - - -From 3af9d1cbb602a9dcbab2e43fab74a881c2e05d81 Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Date: Wed, 6 Dec 2023 10:39:55 +0100 -Subject: [PATCH 10/70] tools/xg: Fix potential memory leak in cpu policy - getters/setters - -They allocate two different hypercall buffers, but leak the first -allocation if the second one failed due to an early return that bypasses -cleanup. - -Remove the early exit and go through _post() instead. Invoking _post() is -benign even if _pre() failed. 
- -Fixes: 6b85e427098c ('x86/sysctl: Implement XEN_SYSCTL_get_cpu_policy') -Fixes: 60529dfeca14 ('x86/domctl: Implement XEN_DOMCTL_get_cpu_policy') -Fixes: 14ba07e6f816 ('x86/domctl: Implement XEN_DOMCTL_set_cpumsr_policy') -Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: 1571ff7a987b88b20598a6d49910457f3b2c59f1 -master date: 2023-12-01 10:53:07 +0100 ---- - tools/libs/guest/xg_cpuid_x86.c | 86 +++++++++++++++------------------ - 1 file changed, 39 insertions(+), 47 deletions(-) - -diff --git a/tools/libs/guest/xg_cpuid_x86.c b/tools/libs/guest/xg_cpuid_x86.c -index f2b1e80901..3a74bb2b37 100644 ---- a/tools/libs/guest/xg_cpuid_x86.c -+++ b/tools/libs/guest/xg_cpuid_x86.c -@@ -136,20 +136,20 @@ static int get_system_cpu_policy(xc_interface *xch, uint32_t index, - DECLARE_HYPERCALL_BOUNCE(msrs, - *nr_msrs * sizeof(*msrs), - XC_HYPERCALL_BUFFER_BOUNCE_OUT); -- int ret; -- -- if ( xc_hypercall_bounce_pre(xch, leaves) || -- xc_hypercall_bounce_pre(xch, msrs) ) -- return -1; -+ int ret = -1; - -- sysctl.cmd = XEN_SYSCTL_get_cpu_policy; -- sysctl.u.cpu_policy.index = index; -- sysctl.u.cpu_policy.nr_leaves = *nr_leaves; -- set_xen_guest_handle(sysctl.u.cpu_policy.leaves, leaves); -- sysctl.u.cpu_policy.nr_msrs = *nr_msrs; -- set_xen_guest_handle(sysctl.u.cpu_policy.msrs, msrs); -- -- ret = do_sysctl(xch, &sysctl); -+ if ( !xc_hypercall_bounce_pre(xch, leaves) && -+ !xc_hypercall_bounce_pre(xch, msrs) ) -+ { -+ sysctl.cmd = XEN_SYSCTL_get_cpu_policy; -+ sysctl.u.cpu_policy.index = index; -+ sysctl.u.cpu_policy.nr_leaves = *nr_leaves; -+ set_xen_guest_handle(sysctl.u.cpu_policy.leaves, leaves); -+ sysctl.u.cpu_policy.nr_msrs = *nr_msrs; -+ set_xen_guest_handle(sysctl.u.cpu_policy.msrs, msrs); -+ -+ ret = do_sysctl(xch, &sysctl); -+ } - - xc_hypercall_bounce_post(xch, leaves); - xc_hypercall_bounce_post(xch, msrs); -@@ -174,20 +174,20 @@ static int get_domain_cpu_policy(xc_interface *xch, uint32_t domid, - DECLARE_HYPERCALL_BOUNCE(msrs, - *nr_msrs * sizeof(*msrs), - XC_HYPERCALL_BUFFER_BOUNCE_OUT); -- int ret; -- -- if ( xc_hypercall_bounce_pre(xch, leaves) || -- xc_hypercall_bounce_pre(xch, msrs) ) -- return -1; -- -- domctl.cmd = XEN_DOMCTL_get_cpu_policy; -- domctl.domain = domid; -- domctl.u.cpu_policy.nr_leaves = *nr_leaves; -- set_xen_guest_handle(domctl.u.cpu_policy.leaves, leaves); -- domctl.u.cpu_policy.nr_msrs = *nr_msrs; -- set_xen_guest_handle(domctl.u.cpu_policy.msrs, msrs); -+ int ret = -1; - -- ret = do_domctl(xch, &domctl); -+ if ( !xc_hypercall_bounce_pre(xch, leaves) && -+ !xc_hypercall_bounce_pre(xch, msrs) ) -+ { -+ domctl.cmd = XEN_DOMCTL_get_cpu_policy; -+ domctl.domain = domid; -+ domctl.u.cpu_policy.nr_leaves = *nr_leaves; -+ set_xen_guest_handle(domctl.u.cpu_policy.leaves, leaves); -+ domctl.u.cpu_policy.nr_msrs = *nr_msrs; -+ set_xen_guest_handle(domctl.u.cpu_policy.msrs, msrs); -+ -+ ret = do_domctl(xch, &domctl); -+ } - - xc_hypercall_bounce_post(xch, leaves); - xc_hypercall_bounce_post(xch, msrs); -@@ -214,32 +214,24 @@ int xc_set_domain_cpu_policy(xc_interface *xch, uint32_t domid, - DECLARE_HYPERCALL_BOUNCE(msrs, - nr_msrs * sizeof(*msrs), - XC_HYPERCALL_BUFFER_BOUNCE_IN); -- int ret; -- -- if ( err_leaf_p ) -- *err_leaf_p = -1; -- if ( err_subleaf_p ) -- *err_subleaf_p = -1; -- if ( err_msr_p ) -- *err_msr_p = -1; -+ int ret = -1; - -- if ( xc_hypercall_bounce_pre(xch, leaves) ) -- return -1; -- -- if ( xc_hypercall_bounce_pre(xch, msrs) ) -- return -1; -- -- domctl.cmd = 
XEN_DOMCTL_set_cpu_policy; -- domctl.domain = domid; -- domctl.u.cpu_policy.nr_leaves = nr_leaves; -- set_xen_guest_handle(domctl.u.cpu_policy.leaves, leaves); -- domctl.u.cpu_policy.nr_msrs = nr_msrs; -- set_xen_guest_handle(domctl.u.cpu_policy.msrs, msrs); - domctl.u.cpu_policy.err_leaf = -1; - domctl.u.cpu_policy.err_subleaf = -1; - domctl.u.cpu_policy.err_msr = -1; - -- ret = do_domctl(xch, &domctl); -+ if ( !xc_hypercall_bounce_pre(xch, leaves) && -+ !xc_hypercall_bounce_pre(xch, msrs) ) -+ { -+ domctl.cmd = XEN_DOMCTL_set_cpu_policy; -+ domctl.domain = domid; -+ domctl.u.cpu_policy.nr_leaves = nr_leaves; -+ set_xen_guest_handle(domctl.u.cpu_policy.leaves, leaves); -+ domctl.u.cpu_policy.nr_msrs = nr_msrs; -+ set_xen_guest_handle(domctl.u.cpu_policy.msrs, msrs); -+ -+ ret = do_domctl(xch, &domctl); -+ } - - xc_hypercall_bounce_post(xch, leaves); - xc_hypercall_bounce_post(xch, msrs); --- -2.44.0 - - -From 18f900b77b3a85acadc2fe152ea354a02569acab Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Wed, 6 Dec 2023 10:40:19 +0100 -Subject: [PATCH 11/70] x86emul: avoid triggering event related assertions - -The assertion at the end of x86_emulate_wrapper() as well as the ones -in x86_emul_{hw_exception,pagefault}() can trigger if we ignore -X86EMUL_EXCEPTION coming back from certain hook functions. Squash -exceptions when merely probing MSRs, plus on SWAPGS'es "best effort" -error handling path. - -In adjust_bnd() add another assertion after the read_xcr(0, ...) -invocation, paralleling the one in x86emul_get_fpu() - XCR0 reads should -never fault when XSAVE is (implicitly) known to be available. - -Also update the respective comment in x86_emulate_wrapper(). - -Fixes: 14a6be89ec04 ("x86emul: correct EFLAGS.TF handling") -Fixes: cb2626c75813 ("x86emul: conditionally clear BNDn for branches") -Fixes: 6eb43fcf8a0b ("x86emul: support SWAPGS") -Reported-by: AFL -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 787d11c5aaf4d3411d4658cff137cd49b0bd951b -master date: 2023-12-05 09:57:05 +0100 ---- - xen/arch/x86/x86_emulate/0f01.c | 6 ++++-- - xen/arch/x86/x86_emulate/0fae.c | 3 +++ - xen/arch/x86/x86_emulate/x86_emulate.c | 28 +++++++++++++++++++++----- - 3 files changed, 30 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/x86/x86_emulate/0f01.c b/xen/arch/x86/x86_emulate/0f01.c -index ba43fc394b..1ba99609d6 100644 ---- a/xen/arch/x86/x86_emulate/0f01.c -+++ b/xen/arch/x86/x86_emulate/0f01.c -@@ -200,8 +200,10 @@ int x86emul_0f01(struct x86_emulate_state *s, - if ( (rc = ops->write_segment(x86_seg_gs, &sreg, - ctxt)) != X86EMUL_OKAY ) - { -- /* Best effort unwind (i.e. no error checking). */ -- ops->write_msr(MSR_SHADOW_GS_BASE, msr_val, ctxt); -+ /* Best effort unwind (i.e. no real error checking). 
*/ -+ if ( ops->write_msr(MSR_SHADOW_GS_BASE, msr_val, -+ ctxt) == X86EMUL_EXCEPTION ) -+ x86_emul_reset_event(ctxt); - goto done; - } - break; -diff --git a/xen/arch/x86/x86_emulate/0fae.c b/xen/arch/x86/x86_emulate/0fae.c -index 00840b1d07..ba77af58f2 100644 ---- a/xen/arch/x86/x86_emulate/0fae.c -+++ b/xen/arch/x86/x86_emulate/0fae.c -@@ -55,7 +55,10 @@ int x86emul_0fae(struct x86_emulate_state *s, - cr4 = X86_CR4_OSFXSR; - if ( !ops->read_msr || - ops->read_msr(MSR_EFER, &msr_val, ctxt) != X86EMUL_OKAY ) -+ { -+ x86_emul_reset_event(ctxt); - msr_val = 0; -+ } - if ( !(cr4 & X86_CR4_OSFXSR) || - (mode_64bit() && mode_ring0() && (msr_val & EFER_FFXSE)) ) - s->op_bytes = offsetof(struct x86_fxsr, xmm[0]); -diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c -index 94caec1d14..cf780da501 100644 ---- a/xen/arch/x86/x86_emulate/x86_emulate.c -+++ b/xen/arch/x86/x86_emulate/x86_emulate.c -@@ -1143,10 +1143,18 @@ static bool is_branch_step(struct x86_emulate_ctxt *ctxt, - const struct x86_emulate_ops *ops) - { - uint64_t debugctl; -+ int rc = X86EMUL_UNHANDLEABLE; - -- return ops->read_msr && -- ops->read_msr(MSR_IA32_DEBUGCTLMSR, &debugctl, ctxt) == X86EMUL_OKAY && -- (debugctl & IA32_DEBUGCTLMSR_BTF); -+ if ( !ops->read_msr || -+ (rc = ops->read_msr(MSR_IA32_DEBUGCTLMSR, &debugctl, -+ ctxt)) != X86EMUL_OKAY ) -+ { -+ if ( rc == X86EMUL_EXCEPTION ) -+ x86_emul_reset_event(ctxt); -+ debugctl = 0; -+ } -+ -+ return debugctl & IA32_DEBUGCTLMSR_BTF; - } - - static void adjust_bnd(struct x86_emulate_ctxt *ctxt, -@@ -1160,13 +1168,21 @@ static void adjust_bnd(struct x86_emulate_ctxt *ctxt, - - if ( !ops->read_xcr || ops->read_xcr(0, &xcr0, ctxt) != X86EMUL_OKAY || - !(xcr0 & X86_XCR0_BNDREGS) || !(xcr0 & X86_XCR0_BNDCSR) ) -+ { -+ ASSERT(!ctxt->event_pending); - return; -+ } - - if ( !mode_ring0() ) - bndcfg = read_bndcfgu(); - else if ( !ops->read_msr || -- ops->read_msr(MSR_IA32_BNDCFGS, &bndcfg, ctxt) != X86EMUL_OKAY ) -+ (rc = ops->read_msr(MSR_IA32_BNDCFGS, &bndcfg, -+ ctxt)) != X86EMUL_OKAY ) -+ { -+ if ( rc == X86EMUL_EXCEPTION ) -+ x86_emul_reset_event(ctxt); - return; -+ } - if ( (bndcfg & IA32_BNDCFGS_ENABLE) && !(bndcfg & IA32_BNDCFGS_PRESERVE) ) - { - /* -@@ -8677,7 +8693,9 @@ int x86_emulate_wrapper( - * An event being pending should exactly match returning - * X86EMUL_EXCEPTION. (If this trips, the chances are a codepath has - * called hvm_inject_hw_exception() rather than using -- * x86_emul_hw_exception().) -+ * x86_emul_hw_exception(), or the invocation of a hook has caused an -+ * exception to be raised, while the caller was only checking for -+ * success/failure.) - */ - ASSERT(ctxt->event_pending == (rc == X86EMUL_EXCEPTION)); - --- -2.44.0 - - -From 5ac87c8afd2ae2b1a9fd46a9b80d9152d650fb26 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Wed, 6 Dec 2023 10:40:54 +0100 -Subject: [PATCH 12/70] xen/sched: fix adding offline cpu to cpupool -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Trying to add an offline cpu to a cpupool can crash the hypervisor, -as the probably non-existing percpu area of the cpu is accessed before -the availability of the cpu is being tested. This can happen in case -the cpupool's granularity is "core" or "socket". - -Fix that by testing the cpu to be online. 
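
A minimal sketch of the ordering this fix enforces — validate that the CPU is
online before touching its per-CPU state. Only cpu_online() comes from the hunk
below; use_percpu_state() is a hypothetical stand-in for the later accesses:

    #include <errno.h>
    #include <stdbool.h>

    extern bool cpu_online(unsigned int cpu);
    extern int use_percpu_state(unsigned int cpu);   /* hypothetical */

    int add_cpu_to_pool(unsigned int cpu, unsigned int nr_cpu_ids)
    {
        if ( cpu >= nr_cpu_ids )
            return -EINVAL;
        /* The added guard: fail with -ENODEV before the per-CPU area of a
         * possibly offline CPU is ever dereferenced. */
        if ( !cpu_online(cpu) )
            return -ENODEV;
        return use_percpu_state(cpu);
    }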
- -Fixes: cb563d7665f2 ("xen/sched: support core scheduling for moving cpus to/from cpupools") -Reported-by: René Winther Højgaard <renewin@proton.me> -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 06e8d65d33896aa90f5b6d9b2bce7f11433b33c9 -master date: 2023-12-05 09:57:38 +0100 ---- - xen/common/sched/cpupool.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c -index 2e094b0cfa..ad8f608462 100644 ---- a/xen/common/sched/cpupool.c -+++ b/xen/common/sched/cpupool.c -@@ -892,6 +892,8 @@ int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op) - if ( cpu >= nr_cpu_ids ) - goto addcpu_out; - ret = -ENODEV; -+ if ( !cpu_online(cpu) ) -+ goto addcpu_out; - cpus = sched_get_opt_cpumask(c->gran, cpu); - if ( !cpumask_subset(cpus, &cpupool_free_cpus) || - cpumask_intersects(cpus, &cpupool_locked_cpus) ) --- -2.44.0 - - -From 25b7f9ed0f8c7e138a2cecb113bd377c613153d7 Mon Sep 17 00:00:00 2001 -From: Stewart Hildebrand <stewart.hildebrand@amd.com> -Date: Wed, 6 Dec 2023 10:41:19 +0100 -Subject: [PATCH 13/70] xen/domain: fix error path in domain_create() - -If rangeset_new() fails, err would not be set to an appropriate error -code. Set it to -ENOMEM. - -Fixes: 580c458699e3 ("xen/domain: Call arch_domain_create() as early as possible in domain_create()") -Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: ff1178062094837d55ef342070e58316c43a54c9 -master date: 2023-12-05 10:00:51 +0100 ---- - xen/common/domain.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/xen/common/domain.c b/xen/common/domain.c -index 8f9ab01c0c..003f4ab125 100644 ---- a/xen/common/domain.c -+++ b/xen/common/domain.c -@@ -703,6 +703,7 @@ struct domain *domain_create(domid_t domid, - watchdog_domain_init(d); - init_status |= INIT_watchdog; - -+ err = -ENOMEM; - d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex); - d->irq_caps = rangeset_new(d, "Interrupts", 0); - if ( !d->iomem_caps || !d->irq_caps ) --- -2.44.0 - - -From a56d598e13db413f98e149f8e10cc13e8d4c1635 Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 12 Dec 2023 14:26:18 +0100 -Subject: [PATCH 14/70] Only compile the hypervisor with - -Wdeclaration-after-statement - -Right now, all tools and hypervisor will be complied with the option --Wdeclaration-after-statement. While most of the code in the hypervisor -is controlled by us, for tools we may import external libraries. - -The build will fail if one of them are using the construct we are -trying to prevent. 
This is the case when building against Python 3.12 -and Yocto: - -| In file included from /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/Python.h:44, -| from xen/lowlevel/xc/xc.c:8: -| /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/object.h: In function 'Py_SIZE': -| /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/object.h:233:5: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement] -| 233 | PyVarObject *var_ob = _PyVarObject_CAST(ob); -| | ^~~~~~~~~~~ -| In file included from /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/Python.h:53: -| /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/cpython/longintrepr.h: In function '_PyLong_CompactValue': -| /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/cpython/longintrepr.h:121:5: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement] -| 121 | Py_ssize_t sign = 1 - (op->long_value.lv_tag & _PyLong_SIGN_MASK); -| | ^~~~~~~~~~ -| cc1: all warnings being treated as errors - -Looking at the tools directory, a fair few directory already add --Wno-declaration-after-statement to inhibit the default behavior. - -We have always build the hypervisor with the flag, so for now remove -only the flag for anything but the hypervisor. We can decide at later -time whether we want to relax. - -Also remove the -Wno-declaration-after-statement in some subdirectory -as the flag is now unnecessary. - -Part of the commit message was take from Alexander's first proposal: - -Link: https://lore.kernel.org/xen-devel/20231128174729.3880113-1-alex@linutronix.de/ -Reported-by: Alexander Kanavin <alex@linutronix.de> -Acked-by: Anthony PERARD <anthony.perard@citrix.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Tested-by: Jason Andryuk <jandryuk@gmail.com> -Signed-off-by: Julien Grall <jgrall@amazon.com> - -xen/hypervisor: Don't use cc-option-add for -Wdeclaration-after-statement - -Per Andrew's comment in [1] all the compilers we support should -recognize the flag. - -I forgot to address the comment while committing. 
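
For reference, a minimal example of the construct -Wdeclaration-after-statement
diagnoses, compiled with e.g. gcc -std=gnu99 -Werror=declaration-after-statement:

    int f(int x)
    {
        int a = x + 1;   /* declaration before any statement: fine */

        a *= 2;          /* statement */
        int b = a - x;   /* "ISO C90 forbids mixed declarations and code" */
        return a + b;
    }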
- -[1] fcf00090-304a-49f7-8a61-a54347e90a3b@citrix.com - -Signed-off-by: Julien Grall <jgrall@amazon.com> -master commit: 40be6307ec005539635e7b8fcef67e989dc441f6 -master date: 2023-12-06 19:12:40 +0000 -master commit: d4bfd3899886d0fbe259c20660dadb1e00170f2d -master date: 2023-12-06 19:19:59 +0000 ---- - Config.mk | 2 -- - stubdom/Makefile | 2 +- - stubdom/vtpmmgr/Makefile | 2 +- - tools/libs/light/Makefile | 3 +-- - tools/libs/util/Makefile | 3 +-- - tools/tests/depriv/Makefile | 2 -- - tools/xl/Makefile | 3 +-- - xen/Makefile | 1 + - 8 files changed, 6 insertions(+), 12 deletions(-) - -diff --git a/Config.mk b/Config.mk -index 29b0d1e12a..2a3e16d0bd 100644 ---- a/Config.mk -+++ b/Config.mk -@@ -177,8 +177,6 @@ CFLAGS += -std=gnu99 - - CFLAGS += -Wall -Wstrict-prototypes - --$(call cc-option-add,HOSTCFLAGS,HOSTCC,-Wdeclaration-after-statement) --$(call cc-option-add,CFLAGS,CC,-Wdeclaration-after-statement) - $(call cc-option-add,CFLAGS,CC,-Wno-unused-but-set-variable) - $(call cc-option-add,CFLAGS,CC,-Wno-unused-local-typedefs) - -diff --git a/stubdom/Makefile b/stubdom/Makefile -index 0ddfce1ba2..888fa20d72 100644 ---- a/stubdom/Makefile -+++ b/stubdom/Makefile -@@ -245,7 +245,7 @@ tpm_emulator-$(XEN_TARGET_ARCH): tpm_emulator-$(TPMEMU_VERSION).tar.gz - patch -d $@ -p1 < vtpm-command-duration.patch - patch -d $@ -p1 < vtpm-tpm_bn_t-addr.patch - mkdir $@/build -- cd $@/build; CC=${CC} $(CMAKE) .. -DCMAKE_C_FLAGS:STRING="-std=c99 -DTPM_NO_EXTERN $(TARGET_CPPFLAGS) $(TARGET_CFLAGS) -Wno-declaration-after-statement" -+ cd $@/build; CC=${CC} $(CMAKE) .. -DCMAKE_C_FLAGS:STRING="-std=c99 -DTPM_NO_EXTERN $(TARGET_CPPFLAGS) $(TARGET_CFLAGS)" - touch $@ - - TPMEMU_STAMPFILE=$(CROSS_ROOT)/$(GNU_TARGET_ARCH)-xen-elf/lib/libtpm.a -diff --git a/stubdom/vtpmmgr/Makefile b/stubdom/vtpmmgr/Makefile -index 6dae034a07..c29bb49838 100644 ---- a/stubdom/vtpmmgr/Makefile -+++ b/stubdom/vtpmmgr/Makefile -@@ -17,7 +17,7 @@ OBJS += vtpm_disk.o disk_tpm.o disk_io.o disk_crypto.o disk_read.o disk_write.o - OBJS += mgmt_authority.o - - CFLAGS+=-Werror -Iutil -Icrypto -Itcs --CFLAGS+=-Wno-declaration-after-statement -Wno-unused-label -+CFLAGS+=-Wno-unused-label - - build: $(TARGET) - $(TARGET): $(OBJS) -diff --git a/tools/libs/light/Makefile b/tools/libs/light/Makefile -index ba4c1b7933..37e4d16709 100644 ---- a/tools/libs/light/Makefile -+++ b/tools/libs/light/Makefile -@@ -38,8 +38,7 @@ vpath static_tables.c $(ACPI_PATH)/ - - OBJS-$(CONFIG_X86) += $(ACPI_OBJS) - --CFLAGS += -Wno-format-zero-length -Wmissing-declarations \ -- -Wno-declaration-after-statement -Wformat-nonliteral -+CFLAGS += -Wno-format-zero-length -Wmissing-declarations -Wformat-nonliteral - - CFLAGS-$(CONFIG_X86) += -DCONFIG_PCI_SUPP_LEGACY_IRQ - -diff --git a/tools/libs/util/Makefile b/tools/libs/util/Makefile -index c3b21875dc..936ec90a31 100644 ---- a/tools/libs/util/Makefile -+++ b/tools/libs/util/Makefile -@@ -9,8 +9,7 @@ OBJS-y += libxlu_disk.o - OBJS-y += libxlu_vif.o - OBJS-y += libxlu_pci.o - --CFLAGS += -Wno-format-zero-length -Wmissing-declarations \ -- -Wno-declaration-after-statement -Wformat-nonliteral -+CFLAGS += -Wno-format-zero-length -Wmissing-declarations -Wformat-nonliteral - CFLAGS += $(CFLAGS_libxenctrl) - - CFLAGS += $(PTHREAD_CFLAGS) -diff --git a/tools/tests/depriv/Makefile b/tools/tests/depriv/Makefile -index 7d9e3b01bb..5404a12f47 100644 ---- a/tools/tests/depriv/Makefile -+++ b/tools/tests/depriv/Makefile -@@ -1,8 +1,6 @@ - XEN_ROOT=$(CURDIR)/../../.. 
- include $(XEN_ROOT)/tools/Rules.mk - --CFLAGS += -Wno-declaration-after-statement -- - CFLAGS += $(CFLAGS_xeninclude) - CFLAGS += $(CFLAGS_libxenctrl) - CFLAGS += $(CFLAGS_libxencall) -diff --git a/tools/xl/Makefile b/tools/xl/Makefile -index 5f7aa5f46c..d742e96a5b 100644 ---- a/tools/xl/Makefile -+++ b/tools/xl/Makefile -@@ -5,8 +5,7 @@ - XEN_ROOT = $(CURDIR)/../.. - include $(XEN_ROOT)/tools/Rules.mk - --CFLAGS += -Wno-format-zero-length -Wmissing-declarations \ -- -Wno-declaration-after-statement -Wformat-nonliteral -+CFLAGS += -Wno-format-zero-length -Wmissing-declarations -Wformat-nonliteral - CFLAGS += -fPIC - - CFLAGS += $(PTHREAD_CFLAGS) -diff --git a/xen/Makefile b/xen/Makefile -index e39290f638..a92709b43e 100644 ---- a/xen/Makefile -+++ b/xen/Makefile -@@ -392,6 +392,7 @@ CFLAGS-$(CONFIG_CC_SPLIT_SECTIONS) += -ffunction-sections -fdata-sections - - CFLAGS += -nostdinc -fno-builtin -fno-common - CFLAGS += -Werror -Wredundant-decls -Wno-pointer-arith -+CFLAGS += -Wdeclaration-after-statement - $(call cc-option-add,CFLAGS,CC,-Wvla) - CFLAGS += -pipe -D__XEN__ -include $(srctree)/include/xen/config.h - CFLAGS-$(CONFIG_DEBUG_INFO) += -g --- -2.44.0 - - -From 48eb9e91990b3fd42f8e847780f6cdb188245b4a Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 12 Dec 2023 14:26:35 +0100 -Subject: [PATCH 15/70] xen/sched: fix sched_move_domain() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Do cleanup in sched_move_domain() in a dedicated service function, -which is called either in error case with newly allocated data, or in -success case with the old data to be freed. - -This will at once fix some subtle bugs which sneaked in due to -forgetting to overwrite some pointers in the error case. - -Fixes: 70fadc41635b ("xen/cpupool: support moving domain between cpupools with different granularity") -Reported-by: René Winther Højgaard <renewin@proton.me> -Initial-fix-by: Jan Beulich <jbeulich@suse.com> -Initial-fix-by: George Dunlap <george.dunlap@cloud.com> -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Acked-by: George Dunlap <george.dunlap@cloud.com> -master commit: 23792cc0f22cff4e106d838b83aa9ae1cb6ffaf4 -master date: 2023-12-07 13:37:25 +0000 ---- - xen/common/sched/core.c | 47 +++++++++++++++++++++++------------------ - 1 file changed, 27 insertions(+), 20 deletions(-) - -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index eba0cea4bb..901782bbb4 100644 ---- a/xen/common/sched/core.c -+++ b/xen/common/sched/core.c -@@ -647,6 +647,24 @@ static void sched_move_irqs(const struct sched_unit *unit) - vcpu_move_irqs(v); - } - -+static void sched_move_domain_cleanup(const struct scheduler *ops, -+ struct sched_unit *units, -+ void *domdata) -+{ -+ struct sched_unit *unit, *old_unit; -+ -+ for ( unit = units; unit; ) -+ { -+ if ( unit->priv ) -+ sched_free_udata(ops, unit->priv); -+ old_unit = unit; -+ unit = unit->next_in_list; -+ xfree(old_unit); -+ } -+ -+ sched_free_domdata(ops, domdata); -+} -+ - /* - * Move a domain from one cpupool to another. - * -@@ -686,7 +704,6 @@ int sched_move_domain(struct domain *d, struct cpupool *c) - void *old_domdata; - unsigned int gran = cpupool_get_granularity(c); - unsigned int n_units = d->vcpu[0] ? 
DIV_ROUND_UP(d->max_vcpus, gran) : 0; -- int ret = 0; - - for_each_vcpu ( d, v ) - { -@@ -699,8 +716,9 @@ int sched_move_domain(struct domain *d, struct cpupool *c) - domdata = sched_alloc_domdata(c->sched, d); - if ( IS_ERR(domdata) ) - { -- ret = PTR_ERR(domdata); -- goto out; -+ rcu_read_unlock(&sched_res_rculock); -+ -+ return PTR_ERR(domdata); - } - - for ( unit_idx = 0; unit_idx < n_units; unit_idx++ ) -@@ -718,10 +736,10 @@ int sched_move_domain(struct domain *d, struct cpupool *c) - - if ( !unit || !unit->priv ) - { -- old_units = new_units; -- old_domdata = domdata; -- ret = -ENOMEM; -- goto out_free; -+ sched_move_domain_cleanup(c->sched, new_units, domdata); -+ rcu_read_unlock(&sched_res_rculock); -+ -+ return -ENOMEM; - } - - unit_ptr = &unit->next_in_list; -@@ -808,22 +826,11 @@ int sched_move_domain(struct domain *d, struct cpupool *c) - - domain_unpause(d); - -- out_free: -- for ( unit = old_units; unit; ) -- { -- if ( unit->priv ) -- sched_free_udata(c->sched, unit->priv); -- old_unit = unit; -- unit = unit->next_in_list; -- xfree(old_unit); -- } -- -- sched_free_domdata(old_ops, old_domdata); -+ sched_move_domain_cleanup(old_ops, old_units, old_domdata); - -- out: - rcu_read_unlock(&sched_res_rculock); - -- return ret; -+ return 0; - } - - void sched_destroy_vcpu(struct vcpu *v) --- -2.44.0 - - -From a4f3f5a62c10a5adc898cf45261783209f5bc037 Mon Sep 17 00:00:00 2001 -From: Michal Orzel <michal.orzel@amd.com> -Date: Tue, 12 Dec 2023 14:27:10 +0100 -Subject: [PATCH 16/70] xen/arm: page: Avoid pointer overflow on cache clean & - invalidate - -On Arm32, after cleaning and invalidating the last dcache line of the top -domheap page i.e. VA = 0xfffff000 (as a result of flushing the page to -RAM), we end up adding the value of a dcache line size to the pointer -once again, which results in a pointer arithmetic overflow (with 64B line -size, operation 0xffffffc0 + 0x40 overflows to 0x0). Such behavior is -undefined and given the wide range of compiler versions we support, it is -difficult to determine what could happen in such scenario. - -Modify clean_and_invalidate_dcache_va_range() as well as -clean_dcache_va_range() and invalidate_dcache_va_range() due to similarity -of handling to prevent pointer arithmetic overflow. Modify the loops to -use an additional variable to store the index of the next cacheline. -Add an assert to prevent passing a region that wraps around which is -illegal and would end up in a page fault anyway (region 0-2MB is -unmapped). Lastly, return early if size passed is 0. - -Note that on Arm64, we don't have this problem given that the max VA -space we support is 48-bits. - -This is XSA-447 / CVE-2023-46837. 
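
The loop transformation is easier to see stripped of the cache-maintenance asm;
a hedged sketch, where do_line() is a hypothetical per-line operation:

    #include <stddef.h>

    extern void do_line(const char *line);   /* hypothetical */

    void walk_lines(const char *p, unsigned long size, size_t line_bytes)
    {
        unsigned long idx = 0;

        /* Advancing an index instead of p itself means p + idx is only
         * formed for lines inside the range, so a range ending at the top
         * of the VA space cannot overflow the pointer. */
        for ( ; size >= line_bytes; idx += line_bytes, size -= line_bytes )
            do_line(p + idx);
    }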
- -Signed-off-by: Michal Orzel <michal.orzel@amd.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -master commit: 190b7f49af6487a9665da63d43adc9d9a5fbd01e -master date: 2023-12-12 14:01:00 +0100 ---- - xen/arch/arm/include/asm/page.h | 35 ++++++++++++++++++++++++++------- - 1 file changed, 28 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h -index aa0080e8d7..645331fc89 100644 ---- a/xen/arch/arm/include/asm/page.h -+++ b/xen/arch/arm/include/asm/page.h -@@ -162,6 +162,13 @@ static inline size_t read_dcache_line_bytes(void) - static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - { - size_t cacheline_mask = dcache_line_bytes - 1; -+ unsigned long idx = 0; -+ -+ if ( !size ) -+ return 0; -+ -+ /* Passing a region that wraps around is illegal */ -+ ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p); - - dsb(sy); /* So the CPU issues all writes to the range */ - -@@ -174,11 +181,11 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - } - - for ( ; size >= dcache_line_bytes; -- p += dcache_line_bytes, size -= dcache_line_bytes ) -- asm volatile (__invalidate_dcache_one(0) : : "r" (p)); -+ idx += dcache_line_bytes, size -= dcache_line_bytes ) -+ asm volatile (__invalidate_dcache_one(0) : : "r" (p + idx)); - - if ( size > 0 ) -- asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); -+ asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx)); - - dsb(sy); /* So we know the flushes happen before continuing */ - -@@ -188,14 +195,21 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - static inline int clean_dcache_va_range(const void *p, unsigned long size) - { - size_t cacheline_mask = dcache_line_bytes - 1; -+ unsigned long idx = 0; -+ -+ if ( !size ) -+ return 0; -+ -+ /* Passing a region that wraps around is illegal */ -+ ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p); - - dsb(sy); /* So the CPU issues all writes to the range */ - size += (uintptr_t)p & cacheline_mask; - size = (size + cacheline_mask) & ~cacheline_mask; - p = (void *)((uintptr_t)p & ~cacheline_mask); - for ( ; size >= dcache_line_bytes; -- p += dcache_line_bytes, size -= dcache_line_bytes ) -- asm volatile (__clean_dcache_one(0) : : "r" (p)); -+ idx += dcache_line_bytes, size -= dcache_line_bytes ) -+ asm volatile (__clean_dcache_one(0) : : "r" (p + idx)); - dsb(sy); /* So we know the flushes happen before continuing */ - /* ARM callers assume that dcache_* functions cannot fail. */ - return 0; -@@ -205,14 +219,21 @@ static inline int clean_and_invalidate_dcache_va_range - (const void *p, unsigned long size) - { - size_t cacheline_mask = dcache_line_bytes - 1; -+ unsigned long idx = 0; -+ -+ if ( !size ) -+ return 0; -+ -+ /* Passing a region that wraps around is illegal */ -+ ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p); - - dsb(sy); /* So the CPU issues all writes to the range */ - size += (uintptr_t)p & cacheline_mask; - size = (size + cacheline_mask) & ~cacheline_mask; - p = (void *)((uintptr_t)p & ~cacheline_mask); - for ( ; size >= dcache_line_bytes; -- p += dcache_line_bytes, size -= dcache_line_bytes ) -- asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); -+ idx += dcache_line_bytes, size -= dcache_line_bytes ) -+ asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx)); - dsb(sy); /* So we know the flushes happen before continuing */ - /* ARM callers assume that dcache_* functions cannot fail. 
*/ - return 0; --- -2.44.0 - - -From 1792d1723b7fb45a20b145d2de4d233913b22c09 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 12 Dec 2023 14:45:52 +0100 -Subject: [PATCH 17/70] x86/x2apic: introduce a mixed physical/cluster mode -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The current implementation of x2APIC requires to either use Cluster Logical or -Physical mode for all interrupts. However the selection of Physical vs Logical -is not done at APIC setup, an APIC can be addressed both in Physical or Logical -destination modes concurrently. - -Introduce a new x2APIC mode called Mixed, which uses Logical Cluster mode for -IPIs, and Physical mode for external interrupts, thus attempting to use the -best method for each interrupt type. - -Using Physical mode for external interrupts allows more vectors to be used, and -interrupt balancing to be more accurate. - -Using Logical Cluster mode for IPIs allows fewer accesses to the ICR register -when sending those, as multiple CPUs can be targeted with a single ICR register -write. - -A simple test calling flush_tlb_all() 10000 times on a tight loop on AMD EPYC -9754 with 512 CPUs gives the following figures in nano seconds: - -x mixed -+ phys -* cluster - N Min Max Median Avg Stddev -x 25 3.5131328e+08 3.5716441e+08 3.5410987e+08 3.5432659e+08 1566737.4 -+ 12 1.231082e+09 1.238824e+09 1.2370528e+09 1.2357981e+09 2853892.9 -Difference at 95.0% confidence - 8.81472e+08 +/- 1.46849e+06 - 248.774% +/- 0.96566% - (Student's t, pooled s = 2.05985e+06) -* 11 3.5099276e+08 3.5561459e+08 3.5461234e+08 3.5415668e+08 1415071.9 -No difference proven at 95.0% confidence - -So Mixed has no difference when compared to Cluster mode, and Physical mode is -248% slower when compared to either Mixed or Cluster modes with a 95% -confidence. - -Note that Xen uses Cluster mode by default, and hence is already using the -fastest way for IPI delivery at the cost of reducing the amount of vectors -available system-wide. - -Make the newly introduced mode the default one. - -Note the printing of the APIC addressing mode done in connect_bsp_APIC() has -been removed, as with the newly introduced mixed mode this would require more -fine grained printing, or else would be incorrect. The addressing mode can -already be derived from the APIC driver in use, which is printed by different -helpers. - -Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Henry Wang <Henry.Wang@arm.com> -master commit: e3c409d59ac87ccdf97b8c7708c81efa8069cb31 -master date: 2023-11-07 09:59:48 +0000 ---- - CHANGELOG.md | 7 +++ - docs/misc/xen-command-line.pandoc | 12 ++++ - xen/arch/x86/Kconfig | 35 +++++++++-- - xen/arch/x86/apic.c | 6 +- - xen/arch/x86/genapic/x2apic.c | 98 +++++++++++++++++++++++-------- - 5 files changed, 123 insertions(+), 35 deletions(-) - -diff --git a/CHANGELOG.md b/CHANGELOG.md -index 7fb4d366c3..5aa01dae5d 100644 ---- a/CHANGELOG.md -+++ b/CHANGELOG.md -@@ -4,6 +4,13 @@ Notable changes to Xen will be documented in this file. 
- - The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - -+## [4.18.1](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.18.1) -+ -+### Added -+ - On x86: -+ - Introduce a new x2APIC driver that uses Cluster Logical addressing mode -+ for IPIs and Physical addressing mode for external interrupts. -+ - ## [4.18.0](https://xenbits.xenproject.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.18.0) - 2023-11-16 - - ### Changed -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 9a19a04157..8e65f8bd18 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2804,6 +2804,15 @@ the watchdog. - - Permit use of x2apic setup for SMP environments. - -+### x2apic-mode (x86) -+> `= physical | cluster | mixed` -+ -+> Default: `physical` if **FADT** mandates physical mode, otherwise set at -+> build time by CONFIG_X2APIC_{PHYSICAL,LOGICAL,MIXED}. -+ -+In the case that x2apic is in use, this option switches between modes to -+address APICs in the system as interrupt destinations. -+ - ### x2apic_phys (x86) - > `= <boolean>` - -@@ -2814,6 +2823,9 @@ In the case that x2apic is in use, this option switches between physical and - clustered mode. The default, given no hint from the **FADT**, is cluster - mode. - -+**WARNING: `x2apic_phys` is deprecated and superseded by `x2apic-mode`. -+The latter takes precedence if both are set.** -+ - ### xenheap_megabytes (arm32) - > `= <size>` - -diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig -index eac77573bd..1acdffc51c 100644 ---- a/xen/arch/x86/Kconfig -+++ b/xen/arch/x86/Kconfig -@@ -228,11 +228,18 @@ config XEN_ALIGN_2M - - endchoice - --config X2APIC_PHYSICAL -- bool "x2APIC Physical Destination mode" -+choice -+ prompt "x2APIC Driver default" -+ default X2APIC_MIXED - help -- Use x2APIC Physical Destination mode by default when available. -+ Select APIC addressing when x2APIC is enabled. -+ -+ The default mode is mixed which should provide the best aspects -+ of both physical and cluster modes. - -+config X2APIC_PHYSICAL -+ bool "Physical Destination mode" -+ help - When using this mode APICs are addressed using the Physical - Destination mode, which allows using all dynamic vectors on each - CPU independently. -@@ -242,9 +249,27 @@ config X2APIC_PHYSICAL - destination inter processor interrupts (IPIs) slightly slower than - Logical Destination mode. - -- The mode when this option is not selected is Logical Destination. -+config X2APIC_CLUSTER -+ bool "Cluster Destination mode" -+ help -+ When using this mode APICs are addressed using the Cluster Logical -+ Destination mode. -+ -+ Cluster Destination has the benefit of sending IPIs faster since -+ multiple APICs can be targeted as destinations of a single IPI. -+ However the vector space is shared between all CPUs on the cluster, -+ and hence using this mode reduces the number of available vectors -+ when compared to Physical mode. - -- If unsure, say N. -+config X2APIC_MIXED -+ bool "Mixed Destination mode" -+ help -+ When using this mode APICs are addressed using the Cluster Logical -+ Destination mode for IPIs and Physical mode for external interrupts. -+ -+ Should provide the best of both modes. 
-+ -+endchoice - - config GUEST - bool -diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c -index f1264ce7ed..6acdd0ec14 100644 ---- a/xen/arch/x86/apic.c -+++ b/xen/arch/x86/apic.c -@@ -229,11 +229,7 @@ void __init connect_bsp_APIC(void) - outb(0x01, 0x23); - } - -- printk("Enabling APIC mode: %s. Using %d I/O APICs\n", -- !INT_DEST_MODE ? "Physical" -- : init_apic_ldr == init_apic_ldr_flat ? "Flat" -- : "Clustered", -- nr_ioapics); -+ printk("Enabling APIC mode. Using %d I/O APICs\n", nr_ioapics); - enable_apic_mode(); - } - -diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c -index 707deef98c..b88c7a96fe 100644 ---- a/xen/arch/x86/genapic/x2apic.c -+++ b/xen/arch/x86/genapic/x2apic.c -@@ -180,6 +180,36 @@ static const struct genapic __initconstrel apic_x2apic_cluster = { - .send_IPI_self = send_IPI_self_x2apic - }; - -+/* -+ * Mixed x2APIC mode: use physical for external (device) interrupts, and -+ * cluster for inter processor interrupts. Such mode has the benefits of not -+ * sharing the vector space with all CPUs on the cluster, while still allowing -+ * IPIs to be more efficiently delivered by not having to perform an ICR write -+ * for each target CPU. -+ */ -+static const struct genapic __initconstrel apic_x2apic_mixed = { -+ APIC_INIT("x2apic_mixed", NULL), -+ -+ /* -+ * The following fields are exclusively used by external interrupts and -+ * hence are set to use Physical destination mode handlers. -+ */ -+ .int_delivery_mode = dest_Fixed, -+ .int_dest_mode = 0 /* physical delivery */, -+ .vector_allocation_cpumask = vector_allocation_cpumask_phys, -+ .cpu_mask_to_apicid = cpu_mask_to_apicid_phys, -+ -+ /* -+ * The following fields are exclusively used by IPIs and hence are set to -+ * use Cluster Logical destination mode handlers. Note that init_apic_ldr -+ * is not used by IPIs, but the per-CPU fields it initializes are only used -+ * by the IPI hooks. -+ */ -+ .init_apic_ldr = init_apic_ldr_x2apic_cluster, -+ .send_IPI_mask = send_IPI_mask_x2apic_cluster, -+ .send_IPI_self = send_IPI_self_x2apic, -+}; -+ - static int cf_check update_clusterinfo( - struct notifier_block *nfb, unsigned long action, void *hcpu) - { -@@ -220,38 +250,56 @@ static struct notifier_block x2apic_cpu_nfb = { - static int8_t __initdata x2apic_phys = -1; - boolean_param("x2apic_phys", x2apic_phys); - -+enum { -+ unset, physical, cluster, mixed -+} static __initdata x2apic_mode = unset; -+ -+static int __init cf_check parse_x2apic_mode(const char *s) -+{ -+ if ( !cmdline_strcmp(s, "physical") ) -+ x2apic_mode = physical; -+ else if ( !cmdline_strcmp(s, "cluster") ) -+ x2apic_mode = cluster; -+ else if ( !cmdline_strcmp(s, "mixed") ) -+ x2apic_mode = mixed; -+ else -+ return -EINVAL; -+ -+ return 0; -+} -+custom_param("x2apic-mode", parse_x2apic_mode); -+ - const struct genapic *__init apic_x2apic_probe(void) - { -- if ( x2apic_phys < 0 ) -+ /* Honour the legacy cmdline setting if it's the only one provided. */ -+ if ( x2apic_mode == unset && x2apic_phys >= 0 ) -+ x2apic_mode = x2apic_phys ? physical : cluster; -+ -+ if ( x2apic_mode == unset ) - { -- /* -- * Force physical mode if there's no (full) interrupt remapping support: -- * The ID in clustered mode requires a 32 bit destination field due to -- * the usage of the high 16 bits to hold the cluster ID. 
-- */ -- x2apic_phys = iommu_intremap != iommu_intremap_full || -- (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) || -- IS_ENABLED(CONFIG_X2APIC_PHYSICAL); -- } -- else if ( !x2apic_phys ) -- switch ( iommu_intremap ) -+ if ( acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL ) - { -- case iommu_intremap_off: -- case iommu_intremap_restricted: -- printk("WARNING: x2APIC cluster mode is not supported %s interrupt remapping -" -- " forcing phys mode\n", -- iommu_intremap == iommu_intremap_off ? "without" -- : "with restricted"); -- x2apic_phys = true; -- break; -- -- case iommu_intremap_full: -- break; -+ printk(XENLOG_INFO "ACPI FADT forcing x2APIC physical mode\n"); -+ x2apic_mode = physical; - } -+ else -+ x2apic_mode = IS_ENABLED(CONFIG_X2APIC_MIXED) ? mixed -+ : (IS_ENABLED(CONFIG_X2APIC_PHYSICAL) ? physical -+ : cluster); -+ } - -- if ( x2apic_phys ) -+ if ( x2apic_mode == physical ) - return &apic_x2apic_phys; - -+ if ( x2apic_mode == cluster && iommu_intremap != iommu_intremap_full ) -+ { -+ printk("WARNING: x2APIC cluster mode is not supported %s interrupt remapping -" -+ " forcing mixed mode\n", -+ iommu_intremap == iommu_intremap_off ? "without" -+ : "with restricted"); -+ x2apic_mode = mixed; -+ } -+ - if ( !this_cpu(cluster_cpus) ) - { - update_clusterinfo(NULL, CPU_UP_PREPARE, -@@ -260,7 +308,7 @@ const struct genapic *__init apic_x2apic_probe(void) - register_cpu_notifier(&x2apic_cpu_nfb); - } - -- return &apic_x2apic_cluster; -+ return x2apic_mode == cluster ? &apic_x2apic_cluster : &apic_x2apic_mixed; - } - - void __init check_x2apic_preenabled(void) --- -2.44.0 - - -From 637da04812fba259a5d06591ec535345637a4407 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 30 Jan 2024 14:33:48 +0100 -Subject: [PATCH 18/70] pci: fail device assignment if phantom functions cannot - be assigned -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The current behavior is that no error is reported if (some) phantom functions -fail to be assigned during device add or assignment, so the operation succeeds -even if some phantom functions are not correctly setup. - -This can lead to devices possibly being successfully assigned to a domU while -some of the device phantom functions are still assigned to dom0. Even when the -device is assigned domIO before being assigned to a domU phantom functions -might fail to be assigned to domIO, and also fail to be assigned to the domU, -leaving them assigned to dom0. - -Since the device can generate requests using the IDs of those phantom -functions, given the scenario above a device in such state would be in control -of a domU, but still capable of generating transactions that use a context ID -targeting dom0 owned memory. - -Modify device assign in order to attempt to deassign the device if phantom -functions failed to be assigned. - -Note that device addition is not modified in the same way, as in that case the -device is assigned to a trusted domain, and hence partial assign can lead to -device malfunction but not a security issue. 
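
The control flow of the fix, reduced to a hedged sketch; assign_one(),
deassign_all() and mark_broken_and_crash() are illustrative stand-ins for the
IOMMU calls in the hunk below:

    extern int assign_one(unsigned int fn);     /* illustrative stand-ins */
    extern int deassign_all(void);
    extern void mark_broken_and_crash(void);

    int assign_with_phantoms(unsigned int nr_fns)
    {
        unsigned int i;
        int rc = 0;

        /* Stop at the first failure instead of pressing on. */
        for ( i = 0; i < nr_fns && !rc; i++ )
            rc = assign_one(i);

        /* A phantom function (i > 1) failed: roll the whole device back;
         * if even that fails, the assignment state is unknown. */
        if ( rc && i > 1 && deassign_all() )
            mark_broken_and_crash();

        return rc;
    }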
- -This is XSA-449 / CVE-2023-46839 - -Fixes: 4e9950dc1bd2 ('IOMMU: add phantom function support') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: cb4ecb3cc17b02c2814bc817efd05f3f3ba33d1e -master date: 2024-01-30 14:28:01 +0100 ---- - xen/drivers/passthrough/pci.c | 27 +++++++++++++++++++++------ - 1 file changed, 21 insertions(+), 6 deletions(-) - -diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c -index 04d00c7c37..e99837b6e1 100644 ---- a/xen/drivers/passthrough/pci.c -+++ b/xen/drivers/passthrough/pci.c -@@ -1439,11 +1439,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) - - pdev->fault.count = 0; - -- if ( (rc = iommu_call(hd->platform_ops, assign_device, d, devfn, -- pci_to_dev(pdev), flag)) ) -- goto done; -+ rc = iommu_call(hd->platform_ops, assign_device, d, devfn, pci_to_dev(pdev), -+ flag); - -- for ( ; pdev->phantom_stride; rc = 0 ) -+ while ( pdev->phantom_stride && !rc ) - { - devfn += pdev->phantom_stride; - if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) -@@ -1454,8 +1453,24 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) - - done: - if ( rc ) -- printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n", -- d, &PCI_SBDF(seg, bus, devfn), rc); -+ { -+ printk(XENLOG_G_WARNING "%pd: assign %s(%pp) failed (%d)\n", -+ d, devfn != pdev->devfn ? "phantom function " : "", -+ &PCI_SBDF(seg, bus, devfn), rc); -+ -+ if ( devfn != pdev->devfn && deassign_device(d, seg, bus, pdev->devfn) ) -+ { -+ /* -+ * Device with phantom functions that failed to both assign and -+ * rollback. Mark the device as broken and crash the target domain, -+ * as the state of the functions at this point is unknown and Xen -+ * has no way to assert consistent context assignment among them. -+ */ -+ pdev->broken = true; -+ if ( !is_hardware_domain(d) && d != dom_io ) -+ domain_crash(d); -+ } -+ } - /* The device is assigned to dom_io so mark it as quarantined */ - else if ( d == dom_io ) - pdev->quarantine = true; --- -2.44.0 - - -From c7ac596a575a05d6ff1e35c3ff98bc4d143712d2 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 30 Jan 2024 14:34:40 +0100 -Subject: [PATCH 19/70] VT-d: Fix "else" vs "#endif" misplacement - -In domain_pgd_maddr() the "#endif" is misplaced with respect to "else". This -generates incorrect logic when CONFIG_HVM is compiled out, as the "else" body -is executed unconditionally. - -Rework the logic to use IS_ENABLED() instead of explicit #ifdef-ary, as it's -clearer to follow. This in turn involves adjusting p2m_get_pagetable() to -compile when CONFIG_HVM is disabled. - -This is XSA-450 / CVE-2023-46840. 
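
The hazard is worth a standalone illustration. In this compilable sketch
(WANT_MIDDLE is a made-up option, not a Xen CONFIG_ symbol), building without
WANT_MIDDLE deletes the else keyword, so the final block runs unconditionally:

    #include <stdio.h>

    /* #define WANT_MIDDLE 1 */

    static void classify(int x)
    {
        if ( x > 0 )
            /* nothing */;
    #ifdef WANT_MIDDLE
        else if ( x == 0 )
            printf("zero\n");
        else
    #endif
        {
            /* Intended as the else branch; without WANT_MIDDLE it executes
             * for every x, including x > 0. */
            printf("fallback\n");
        }
    }

    int main(void)
    {
        classify(1);   /* prints "fallback" in the !WANT_MIDDLE build */
        return 0;
    }

Rewriting the condition with IS_ENABLED() keeps every branch visible to the
compiler in all configurations, so a misplaced preprocessor guard can no longer
silently change control flow.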
- -Fixes: 033ff90aa9c1 ("x86/P2M: p2m_{alloc,free}_ptp() and p2m_alloc_table() are HVM-only") -Reported-by: Teddy Astie <teddy.astie@vates.tech> -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: cc6ba68edf6dcd18c3865e7d7c0f1ed822796426 -master date: 2024-01-30 14:29:15 +0100 ---- - xen/arch/x86/include/asm/p2m.h | 9 ++++++++- - xen/drivers/passthrough/vtd/iommu.c | 4 +--- - 2 files changed, 9 insertions(+), 4 deletions(-) - -diff --git a/xen/arch/x86/include/asm/p2m.h b/xen/arch/x86/include/asm/p2m.h -index 40545f5fa8..1e0b0e2dcc 100644 ---- a/xen/arch/x86/include/asm/p2m.h -+++ b/xen/arch/x86/include/asm/p2m.h -@@ -435,7 +435,14 @@ static inline bool_t p2m_is_altp2m(const struct p2m_domain *p2m) - return p2m->p2m_class == p2m_alternate; - } - --#define p2m_get_pagetable(p2m) ((p2m)->phys_table) -+#ifdef CONFIG_HVM -+static inline pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m) -+{ -+ return p2m->phys_table; -+} -+#else -+pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m); -+#endif - - /* - * Ensure any deferred p2m TLB flush has been completed on all VCPUs. -diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c -index e13b7d99db..9ed616e211 100644 ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -438,15 +438,13 @@ static paddr_t domain_pgd_maddr(struct domain *d, paddr_t pgd_maddr, - - if ( pgd_maddr ) - /* nothing */; --#ifdef CONFIG_HVM -- else if ( iommu_use_hap_pt(d) ) -+ else if ( IS_ENABLED(CONFIG_HVM) && iommu_use_hap_pt(d) ) - { - pagetable_t pgt = p2m_get_pagetable(p2m_get_hostp2m(d)); - - pgd_maddr = pagetable_get_paddr(pgt); - } - else --#endif - { - if ( !hd->arch.vtd.pgd_maddr ) - { --- -2.44.0 - - -From 62b3d7f8e45a7ec1597f0ed61a99d1f423b22315 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Thu, 1 Feb 2024 17:58:17 +0100 -Subject: [PATCH 20/70] x86/amd: Extend CPU erratum #1474 fix to more affected - models -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Erratum #1474 has now been extended to cover models from family 17h ranges -00-2Fh, so the errata now covers all the models released under Family -17h (Zen, Zen+ and Zen2). - -Additionally extend the workaround to Family 18h (Hygon), since it's based on -the Zen architecture and very likely affected. - -Rename all the zen2 related symbols to fam17, since the errata doesn't -exclusively affect Zen2 anymore. - -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 23db507a01a4ec5259ec0ab43d296a41b1c326ba -master date: 2023-12-21 12:19:40 +0000 ---- - xen/arch/x86/cpu/amd.c | 27 ++++++++++++++------------- - 1 file changed, 14 insertions(+), 13 deletions(-) - -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index 0f305312ff..d43288ae97 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -54,7 +54,7 @@ bool __read_mostly amd_acpi_c1e_quirk; - bool __ro_after_init amd_legacy_ssbd; - bool __initdata amd_virt_spec_ctrl; - --static bool __read_mostly zen2_c6_disabled; -+static bool __read_mostly fam17_c6_disabled; - - static inline int rdmsr_amd_safe(unsigned int msr, unsigned int *lo, - unsigned int *hi) -@@ -978,24 +978,24 @@ void amd_check_zenbleed(void) - val & chickenbit ? 
"chickenbit" : "microcode"); - } - --static void cf_check zen2_disable_c6(void *arg) -+static void cf_check fam17_disable_c6(void *arg) - { - /* Disable C6 by clearing the CCR{0,1,2}_CC6EN bits. */ - const uint64_t mask = ~((1ul << 6) | (1ul << 14) | (1ul << 22)); - uint64_t val; - -- if (!zen2_c6_disabled) { -+ if (!fam17_c6_disabled) { - printk(XENLOG_WARNING - "Disabling C6 after 1000 days apparent uptime due to AMD errata 1474\n"); -- zen2_c6_disabled = true; -+ fam17_c6_disabled = true; - /* - * Prevent CPU hotplug so that started CPUs will either see -- * zen2_c6_disabled set, or will be handled by -+ * zen_c6_disabled set, or will be handled by - * smp_call_function(). - */ - while (!get_cpu_maps()) - process_pending_softirqs(); -- smp_call_function(zen2_disable_c6, NULL, 0); -+ smp_call_function(fam17_disable_c6, NULL, 0); - put_cpu_maps(); - } - -@@ -1294,8 +1294,8 @@ static void cf_check init_amd(struct cpuinfo_x86 *c) - amd_check_zenbleed(); - amd_check_erratum_1485(); - -- if (zen2_c6_disabled) -- zen2_disable_c6(NULL); -+ if (fam17_c6_disabled) -+ fam17_disable_c6(NULL); - - check_syscfg_dram_mod_en(); - -@@ -1307,7 +1307,7 @@ const struct cpu_dev amd_cpu_dev = { - .c_init = init_amd, - }; - --static int __init cf_check zen2_c6_errata_check(void) -+static int __init cf_check amd_check_erratum_1474(void) - { - /* - * Errata #1474: A Core May Hang After About 1044 Days -@@ -1315,7 +1315,8 @@ static int __init cf_check zen2_c6_errata_check(void) - */ - s_time_t delta; - -- if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17 || !is_zen2_uarch()) -+ if (cpu_has_hypervisor || -+ (boot_cpu_data.x86 != 0x17 && boot_cpu_data.x86 != 0x18)) - return 0; - - /* -@@ -1330,10 +1331,10 @@ static int __init cf_check zen2_c6_errata_check(void) - if (delta > 0) { - static struct timer errata_c6; - -- init_timer(&errata_c6, zen2_disable_c6, NULL, 0); -+ init_timer(&errata_c6, fam17_disable_c6, NULL, 0); - set_timer(&errata_c6, NOW() + delta); - } else -- zen2_disable_c6(NULL); -+ fam17_disable_c6(NULL); - - return 0; - } -@@ -1341,4 +1342,4 @@ static int __init cf_check zen2_c6_errata_check(void) - * Must be executed after early_time_init() for tsc_ticks2ns() to have been - * calibrated. That prevents us doing the check in init_amd(). - */ --presmp_initcall(zen2_c6_errata_check); -+presmp_initcall(amd_check_erratum_1474); --- -2.44.0 - - -From b26c30a408255454f8ceb4e49e3c4385aa32fbc3 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Thu, 1 Feb 2024 17:58:59 +0100 -Subject: [PATCH 21/70] CirrusCI: drop FreeBSD 12 -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Went EOL by the end of December 2023, and the pkg repos have been shut down. 
- -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: c2ce3466472e9c9eda79f5dc98eb701bc6fdba20 -master date: 2024-01-15 12:20:11 +0100 ---- - .cirrus.yml | 6 ------ - 1 file changed, 6 deletions(-) - -diff --git a/.cirrus.yml b/.cirrus.yml -index 7e0beb200d..63f3afb104 100644 ---- a/.cirrus.yml -+++ b/.cirrus.yml -@@ -14,12 +14,6 @@ freebsd_template: &FREEBSD_TEMPLATE - - ./configure --with-system-seabios=/usr/local/share/seabios/bios.bin - - gmake -j`sysctl -n hw.ncpu` clang=y - --task: -- name: 'FreeBSD 12' -- freebsd_instance: -- image_family: freebsd-12-4 -- << : *FREEBSD_TEMPLATE -- - task: - name: 'FreeBSD 13' - freebsd_instance: --- -2.44.0 - - -From 6ccf064b0ce1d06449565129ab944b4fd9531b3a Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Thu, 1 Feb 2024 17:59:25 +0100 -Subject: [PATCH 22/70] x86/intel: ensure Global Performance Counter Control is - setup correctly -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -When Architectural Performance Monitoring is available, the PERF_GLOBAL_CTRL -MSR contains per-counter enable bits that is ANDed with the enable bit in the -counter EVNTSEL MSR in order for a PMC counter to be enabled. - -So far the watchdog code seems to have relied on the PERF_GLOBAL_CTRL enable -bits being set by default, but at least on some Intel Sapphire and Emerald -Rapids this is no longer the case, and Xen reports: - -Testing NMI watchdog on all CPUs: 0 40 stuck - -The first CPU on each package is started with PERF_GLOBAL_CTRL zeroed, so PMC0 -doesn't start counting when the enable bit in EVNTSEL0 is set, due to the -relevant enable bit in PERF_GLOBAL_CTRL not being set. - -Check and adjust PERF_GLOBAL_CTRL during CPU initialization so that all the -general-purpose PMCs are enabled. Doing so brings the state of the package-BSP -PERF_GLOBAL_CTRL in line with the rest of the CPUs on the system. - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -master commit: 6bdb965178bbb3fc50cd4418d4770a7789956e2c -master date: 2024-01-17 10:40:52 +0100 ---- - xen/arch/x86/cpu/intel.c | 23 ++++++++++++++++++++++- - 1 file changed, 22 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c -index a8ba3191e6..aef8e4506c 100644 ---- a/xen/arch/x86/cpu/intel.c -+++ b/xen/arch/x86/cpu/intel.c -@@ -533,9 +533,30 @@ static void cf_check init_intel(struct cpuinfo_x86 *c) - init_intel_cacheinfo(c); - if (c->cpuid_level > 9) { - unsigned eax = cpuid_eax(10); -+ unsigned int cnt = (eax >> 8) & 0xff; -+ - /* Check for version and the number of counters */ -- if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) -+ if ((eax & 0xff) && (cnt > 1) && (cnt <= 32)) { -+ uint64_t global_ctrl; -+ unsigned int cnt_mask = (1UL << cnt) - 1; -+ -+ /* -+ * On (some?) Sapphire/Emerald Rapids platforms each -+ * package-BSP starts with all the enable bits for the -+ * general-purpose PMCs cleared. Adjust so counters -+ * can be enabled from EVNTSEL. 
-+ */ -+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_ctrl); -+ if ((global_ctrl & cnt_mask) != cnt_mask) { -+ printk("CPU%u: invalid PERF_GLOBAL_CTRL: %#" -+ PRIx64 " adjusting to %#" PRIx64 "\n", -+ smp_processor_id(), global_ctrl, -+ global_ctrl | cnt_mask); -+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, -+ global_ctrl | cnt_mask); -+ } - __set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability); -+ } - } - - if ( !cpu_has(c, X86_FEATURE_XTOPOLOGY) ) --- -2.44.0 - - -From 4cc0f88c42f374c7a8e2d05e38777fa18619482e Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Thu, 1 Feb 2024 17:59:57 +0100 -Subject: [PATCH 23/70] x86/vmx: Fix IRQ handling for EXIT_REASON_INIT - -When receiving an INIT, a prior bugfix tried to ignore the INIT and continue -onwards. - -Unfortunately it's not safe to return at that point in vmx_vmexit_handler(). -Just out of context in the first hunk is a local_irqs_enabled() which is -depended-upon by the return-to-guest path, causing the following checklock -failure in debug builds: - - (XEN) Error: INIT received - ignoring - (XEN) CHECKLOCK FAILURE: prev irqsafe: 0, curr irqsafe 1 - (XEN) Xen BUG at common/spinlock.c:132 - (XEN) ----[ Xen-4.19-unstable x86_64 debug=y Tainted: H ]---- - ... - (XEN) Xen call trace: - (XEN) [<ffff82d040238e10>] R check_lock+0xcd/0xe1 - (XEN) [<ffff82d040238fe3>] F _spin_lock+0x1b/0x60 - (XEN) [<ffff82d0402ed6a8>] F pt_update_irq+0x32/0x3bb - (XEN) [<ffff82d0402b9632>] F vmx_intr_assist+0x3b/0x51d - (XEN) [<ffff82d040206447>] F vmx_asm_vmexit_handler+0xf7/0x210 - -Luckily, this is benign in release builds. Accidentally having IRQs disabled -when trying to take an IRQs-on lock isn't a deadlock-vulnerable pattern. - -Drop the problematic early return. In hindsight, it's wrong to skip other -normal VMExit steps. - -Fixes: b1f11273d5a7 ("x86/vmx: Don't spuriously crash the domain when INIT is received") -Reported-by: Reima ISHII <ishiir@g.ecc.u-tokyo.ac.jp> -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: d1f8883aebe00f6a9632d77ab0cd5c6d02c9cbe4 -master date: 2024-01-18 20:59:06 +0000 ---- - xen/arch/x86/hvm/vmx/vmx.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c -index 1edc7f1e91..964891934b 100644 ---- a/xen/arch/x86/hvm/vmx/vmx.c -+++ b/xen/arch/x86/hvm/vmx/vmx.c -@@ -4100,7 +4100,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) - - case EXIT_REASON_INIT: - printk(XENLOG_ERR "Error: INIT received - ignoring\n"); -- return; /* Renter the guest without further processing */ -+ break; - } - - /* Now enable interrupts so it's safe to take locks. */ -@@ -4385,6 +4385,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) - break; - } - case EXIT_REASON_EXTERNAL_INTERRUPT: -+ case EXIT_REASON_INIT: - /* Already handled above. */ - break; - case EXIT_REASON_TRIPLE_FAULT: --- -2.44.0 - - -From 00550e808c10c67710ebb8867200eda1fbee332c Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Thu, 1 Feb 2024 18:00:32 +0100 -Subject: [PATCH 24/70] x86/vmx: Disallow the use of inactivity states - -Right now, vvmx will blindly copy L12's ACTIVITY_STATE into the L02 VMCS and -enter the vCPU. Luckily for us, nested-virt is explicitly unsupported for -security bugs. 
- -The inactivity states are HLT, SHUTDOWN and WAIT-FOR-SIPI, and as noted by the -SDM in Vol3 27.7 "Special Features of VM Entry": - - If VM entry ends with the logical processor in an inactive activity state, - the VM entry generates any special bus cycle that is normally generated when - that activity state is entered from the active state. - -Also, - - Some activity states unconditionally block certain events. - -I.e. A VMEntry with ACTIVITY=SHUTDOWN will initiate a platform reset, while a -VMEntry with ACTIVITY=WAIT-FOR-SIPI will really block everything other than -SIPIs. - -Both of these activity states are for the TXT ACM to use, not for regular -hypervisors, and Xen doesn't support dropping the HLT intercept either. - -There are two paths in Xen which operate on ACTIVITY_STATE. - -1) The vmx_{get,set}_nonreg_state() helpers for VM-Fork. - - As regular VMs can't use any inactivity states, this is just duplicating - the 0 from construct_vmcs(). Retain the ability to query activity_state, - but crash the domain on any attempt to set an inactivity state. - -2) Nested virt, because of ACTIVITY_STATE in vmcs_gstate_field[]. - - Explicitly hide the inactivity states in the guest's view of MSR_VMX_MISC, - and remove ACTIVITY_STATE from vmcs_gstate_field[]. - - In virtual_vmentry(), we should trigger a VMEntry failure for the use of - any inactivity states, but there's no support for that in the code at all - so leave a TODO for when we finally start working on nested-virt in - earnest. - -Reported-by: Reima Ishii <ishiir@g.ecc.u-tokyo.ac.jp> -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Tamas K Lengyel <tamas@tklengyel.com> -master commit: 3643bb53a05b7c8fbac072c63bef1538f2a6d0d2 -master date: 2024-01-18 20:59:06 +0000 ---- - xen/arch/x86/hvm/vmx/vmx.c | 5 ++++- - xen/arch/x86/hvm/vmx/vvmx.c | 9 +++++++-- - xen/arch/x86/include/asm/hvm/vmx/vmcs.h | 1 + - 3 files changed, 12 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c -index 964891934b..28dece7c6b 100644 ---- a/xen/arch/x86/hvm/vmx/vmx.c -+++ b/xen/arch/x86/hvm/vmx/vmx.c -@@ -1558,7 +1558,10 @@ static void cf_check vmx_set_nonreg_state(struct vcpu *v, - { - vmx_vmcs_enter(v); - -- __vmwrite(GUEST_ACTIVITY_STATE, nrs->vmx.activity_state); -+ if ( nrs->vmx.activity_state ) -+ domain_crash(v->domain, "Attempt to set %pv activity_state %#lx\n", -+ v, nrs->vmx.activity_state); -+ - __vmwrite(GUEST_INTERRUPTIBILITY_INFO, nrs->vmx.interruptibility_info); - __vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, nrs->vmx.pending_dbg); - -diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c -index 16b0ef82b6..fd0ae39166 100644 ---- a/xen/arch/x86/hvm/vmx/vvmx.c -+++ b/xen/arch/x86/hvm/vmx/vvmx.c -@@ -899,7 +899,10 @@ static const u16 vmcs_gstate_field[] = { - GUEST_LDTR_AR_BYTES, - GUEST_TR_AR_BYTES, - GUEST_INTERRUPTIBILITY_INFO, -+ /* -+ * ACTIVITY_STATE is handled specially. 
- GUEST_ACTIVITY_STATE, -+ */ - GUEST_SYSENTER_CS, - GUEST_PREEMPTION_TIMER, - /* natural */ -@@ -1200,6 +1203,8 @@ static void virtual_vmentry(struct cpu_user_regs *regs) - nvcpu->nv_vmentry_pending = 0; - nvcpu->nv_vmswitch_in_progress = 1; - -+ /* TODO: Fail VMentry for GUEST_ACTIVITY_STATE != 0 */ -+ - /* - * EFER handling: - * hvm_set_efer won't work if CR0.PG = 1, so we change the value -@@ -2316,8 +2321,8 @@ int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content) - data = hvm_cr4_guest_valid_bits(d); - break; - case MSR_IA32_VMX_MISC: -- /* Do not support CR3-target feature now */ -- data = host_data & ~VMX_MISC_CR3_TARGET; -+ /* Do not support CR3-targets or activity states. */ -+ data = host_data & ~(VMX_MISC_CR3_TARGET | VMX_MISC_ACTIVITY_MASK); - break; - case MSR_IA32_VMX_EPT_VPID_CAP: - data = nept_get_ept_vpid_cap(); -diff --git a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h -index d07fcb2bc9..8de9977eb3 100644 ---- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h -+++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h -@@ -277,6 +277,7 @@ extern u32 vmx_secondary_exec_control; - #define VMX_VPID_INVVPID_SINGLE_CONTEXT_RETAINING_GLOBAL 0x80000000000ULL - extern u64 vmx_ept_vpid_cap; - -+#define VMX_MISC_ACTIVITY_MASK 0x000001c0 - #define VMX_MISC_PROC_TRACE 0x00004000 - #define VMX_MISC_CR3_TARGET 0x01ff0000 - #define VMX_MISC_VMWRITE_ALL 0x20000000 --- -2.44.0 - - -From 579a622eb41cf4e1ae4d94100985a81eebda23b9 Mon Sep 17 00:00:00 2001 -From: Michal Orzel <michal.orzel@amd.com> -Date: Thu, 1 Feb 2024 18:01:27 +0100 -Subject: [PATCH 25/70] lib{fdt,elf}: move lib{fdt,elf}-temp.o and their deps - to $(targets) - -At the moment, trying to run xencov read/reset (calling SYSCTL_coverage_op -under the hood) results in a crash. This is due to a profiler trying to -access data in the .init.* sections (libfdt for Arm and libelf for x86) -that are stripped after boot. Normally, the build system compiles any -*.init.o file without COV_FLAGS. However, these two libraries are -handled differently as sections will be renamed to init after linking. - -To override COV_FLAGS to empty for these libraries, lib{fdt,elf}.o were -added to nocov-y. This worked until e321576f4047 ("xen/build: start using -if_changed") that added lib{fdt,elf}-temp.o and their deps to extra-y. -This way, even though these objects appear as prerequisites of -lib{fdt,elf}.o and the settings should propagate to them, make can also -build them as a prerequisite of __build, in which case COV_FLAGS would -still have the unwanted flags. Fix it by switching to $(targets) instead. - -Also, for libfdt, append libfdt.o to nocov-y only if CONFIG_OVERLAY_DTB -is not set. Otherwise, there is no section renaming and we should be able -to run the coverage. 
- -Fixes: e321576f4047 ("xen/build: start using if_changed") -Signed-off-by: Michal Orzel <michal.orzel@amd.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -master commit: 79519fcfa0605bbf19d8c02b979af3a2c8afed68 -master date: 2024-01-23 12:02:44 +0100 ---- - xen/common/libelf/Makefile | 2 +- - xen/common/libfdt/Makefile | 4 ++-- - 2 files changed, 3 insertions(+), 3 deletions(-) - -diff --git a/xen/common/libelf/Makefile b/xen/common/libelf/Makefile -index 8a4522e4e1..917d12b006 100644 ---- a/xen/common/libelf/Makefile -+++ b/xen/common/libelf/Makefile -@@ -13,4 +13,4 @@ $(obj)/libelf.o: $(obj)/libelf-temp.o FORCE - $(obj)/libelf-temp.o: $(addprefix $(obj)/,$(libelf-objs)) FORCE - $(call if_changed,ld) - --extra-y += libelf-temp.o $(libelf-objs) -+targets += libelf-temp.o $(libelf-objs) -diff --git a/xen/common/libfdt/Makefile b/xen/common/libfdt/Makefile -index d50487aa6e..6ce679f98f 100644 ---- a/xen/common/libfdt/Makefile -+++ b/xen/common/libfdt/Makefile -@@ -5,10 +5,10 @@ SECTIONS := text data $(SPECIAL_DATA_SECTIONS) - # For CONFIG_OVERLAY_DTB, libfdt functionalities will be needed during runtime. - ifneq ($(CONFIG_OVERLAY_DTB),y) - OBJCOPYFLAGS := $(foreach s,$(SECTIONS),--rename-section .$(s)=.init.$(s)) -+nocov-y += libfdt.o - endif - - obj-y += libfdt.o --nocov-y += libfdt.o - - CFLAGS-y += -I$(srctree)/include/xen/libfdt/ - -@@ -18,4 +18,4 @@ $(obj)/libfdt.o: $(obj)/libfdt-temp.o FORCE - $(obj)/libfdt-temp.o: $(addprefix $(obj)/,$(LIBFDT_OBJS)) FORCE - $(call if_changed,ld) - --extra-y += libfdt-temp.o $(LIBFDT_OBJS) -+targets += libfdt-temp.o $(LIBFDT_OBJS) --- -2.44.0 - - -From 295ab8060d95ed8c365077946c7faf8793099ef8 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Thu, 1 Feb 2024 18:01:52 +0100 -Subject: [PATCH 26/70] x86/p2m-pt: fix off by one in entry check assert -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The MMIO RO rangeset overlap check is bogus: the rangeset is inclusive so the -passed end mfn should be the last mfn to be mapped (not last + 1). - -Fixes: 6fa1755644d0 ('amd/npt/shadow: replace assert that prevents creating 2M/1G MMIO entries') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: George Dunlap <george.dunlap@cloud.com> -master commit: 610775d0dd61c1bd2f4720c755986098e6a5bafd -master date: 2024-01-25 16:09:04 +0100 ---- - xen/arch/x86/mm/p2m-pt.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c -index b2b14746c1..88d3733891 100644 ---- a/xen/arch/x86/mm/p2m-pt.c -+++ b/xen/arch/x86/mm/p2m-pt.c -@@ -552,7 +552,7 @@ static void check_entry(mfn_t mfn, p2m_type_t new, p2m_type_t old, - if ( new == p2m_mmio_direct ) - ASSERT(!mfn_eq(mfn, INVALID_MFN) && - !rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn), -- mfn_x(mfn) + (1UL << order))); -+ mfn_x(mfn) + (1UL << order) - 1)); - else if ( p2m_allows_invalid_mfn(new) || new == p2m_invalid || - new == p2m_mmio_dm ) - ASSERT(mfn_valid(mfn) || mfn_eq(mfn, INVALID_MFN)); --- -2.44.0 - - -From b1fdd7d0e47e0831ac7a99d0417385fc10d3068c Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Thu, 1 Feb 2024 18:02:24 +0100 -Subject: [PATCH 27/70] x86/ucode: Fix stability of the raw CPU Policy rescan - -Always run microcode_update_helper() on the BSP, so the the updated Raw CPU -policy doesn't get non-BSP topology details included. 
- -Have calculate_raw_cpu_policy() clear the instantanious XSTATE sizes. The -value XCR0 | MSR_XSS had when we scanned the policy isn't terribly interesting -to report. - -When CPUID Masking is active, it affects CPUID instructions issued by Xen -too. Transiently disable masking to get a clean scan. - -Fixes: 694d79ed5aac ("x86/ucode: Refresh raw CPU policy after microcode load") -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: cf7fe8b72deaa94157ddf97d4bb391480205e9c2 -master date: 2024-01-25 17:46:57 +0000 ---- - xen/arch/x86/cpu-policy.c | 7 +++++++ - xen/arch/x86/cpu/microcode/core.c | 20 +++++++++++++++++--- - 2 files changed, 24 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c -index 81e574390f..bcb17b7ce3 100644 ---- a/xen/arch/x86/cpu-policy.c -+++ b/xen/arch/x86/cpu-policy.c -@@ -353,6 +353,13 @@ void calculate_raw_cpu_policy(void) - /* Nothing good will come from Xen and libx86 disagreeing on vendor. */ - ASSERT(p->x86_vendor == boot_cpu_data.x86_vendor); - -+ /* -+ * Clear the truly dynamic fields. These vary with the in-context XCR0 -+ * and MSR_XSS, and aren't interesting fields in the raw policy. -+ */ -+ p->xstate.raw[0].b = 0; -+ p->xstate.raw[1].b = 0; -+ - /* 0x000000ce MSR_INTEL_PLATFORM_INFO */ - /* Was already added by probe_cpuid_faulting() */ - } -diff --git a/xen/arch/x86/cpu/microcode/core.c b/xen/arch/x86/cpu/microcode/core.c -index 65ebeb50de..4e011cdc41 100644 ---- a/xen/arch/x86/cpu/microcode/core.c -+++ b/xen/arch/x86/cpu/microcode/core.c -@@ -680,8 +680,18 @@ static long cf_check microcode_update_helper(void *data) - microcode_update_cache(patch); - spin_unlock(µcode_mutex); - -- /* Refresh the raw CPU policy, in case the features have changed. */ -+ /* -+ * Refresh the raw CPU policy, in case the features have changed. -+ * Disable CPUID masking if in use, to avoid having current's -+ * cpu_policy affect the rescan. -+ */ -+ if ( ctxt_switch_masking ) -+ alternative_vcall(ctxt_switch_masking, NULL); -+ - calculate_raw_cpu_policy(); -+ -+ if ( ctxt_switch_masking ) -+ alternative_vcall(ctxt_switch_masking, current); - } - else - microcode_free_patch(patch); -@@ -721,8 +731,12 @@ int microcode_update(XEN_GUEST_HANDLE(const_void) buf, unsigned long len) - } - buffer->len = len; - -- return continue_hypercall_on_cpu(smp_processor_id(), -- microcode_update_helper, buffer); -+ /* -+ * Always queue microcode_update_helper() on CPU0. Most of the logic -+ * won't care, but the update of the Raw CPU policy wants to (re)run on -+ * the BSP. -+ */ -+ return continue_hypercall_on_cpu(0, microcode_update_helper, buffer); - } - - static int __init cf_check microcode_init(void) --- -2.44.0 - - -From 184d723e7a5d1c021d297e14d19fe5344eac7a56 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Cyril=20R=C3=A9bert=20=28zithro=29?= <slack@rabbit.lu> -Date: Tue, 27 Feb 2024 13:53:42 +0100 -Subject: [PATCH 28/70] tools/xentop: fix sorting bug for some columns -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Sort doesn't work on columns VBD_OO, VBD_RD, VBD_WR and VBD_RSECT. -Fix by adjusting variables names in compare functions. -Bug fix only. No functional change. 
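A minimal standalone sketch of the bug class fixed here (the struct and helper are simplified stand-ins, not the real xenstat API): with both totals read from domain1, the comparator always sees two equal values, returns 0, and the affected columns never actually sort.

#include <stdio.h>

struct dom { unsigned long long vbd_rd; };

static int compare(unsigned long long i1, unsigned long long i2)
{
    return i1 < i2 ? -1 : i1 > i2 ? 1 : 0;
}

/* Buggy shape: the second total is (re)read from d1. */
static int compare_vbd_rd_buggy(const struct dom *d1, const struct dom *d2)
{
    unsigned long long v1 = d1->vbd_rd;
    unsigned long long v2 = d1->vbd_rd;   /* d1 twice: always compares equal */
    (void)d2;
    return -compare(v1, v2);              /* always 0, so sorting is a no-op */
}

/* Fixed shape: each total comes from its own domain. */
static int compare_vbd_rd_fixed(const struct dom *d1, const struct dom *d2)
{
    return -compare(d1->vbd_rd, d2->vbd_rd);
}

int main(void)
{
    struct dom a = { .vbd_rd = 10 }, b = { .vbd_rd = 20 };

    printf("buggy=%d fixed=%d\n",
           compare_vbd_rd_buggy(&a, &b), compare_vbd_rd_fixed(&a, &b));
    return 0;
}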
- -Fixes: 91c3e3dc91d6 ("tools/xentop: Display '-' when stats are not available.") -Signed-off-by: Cyril Rébert (zithro) <slack@rabbit.lu> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: 29f17d837421f13c0e0010802de1b2d51d2ded4a -master date: 2024-02-05 17:58:23 +0000 ---- - tools/xentop/xentop.c | 10 +++++----- - 1 file changed, 5 insertions(+), 5 deletions(-) - -diff --git a/tools/xentop/xentop.c b/tools/xentop/xentop.c -index 950e8935c4..545bd5e96d 100644 ---- a/tools/xentop/xentop.c -+++ b/tools/xentop/xentop.c -@@ -684,7 +684,7 @@ static int compare_vbd_oo(xenstat_domain *domain1, xenstat_domain *domain2) - unsigned long long dom1_vbd_oo = 0, dom2_vbd_oo = 0; - - tot_vbd_reqs(domain1, FIELD_VBD_OO, &dom1_vbd_oo); -- tot_vbd_reqs(domain1, FIELD_VBD_OO, &dom2_vbd_oo); -+ tot_vbd_reqs(domain2, FIELD_VBD_OO, &dom2_vbd_oo); - - return -compare(dom1_vbd_oo, dom2_vbd_oo); - } -@@ -711,9 +711,9 @@ static int compare_vbd_rd(xenstat_domain *domain1, xenstat_domain *domain2) - unsigned long long dom1_vbd_rd = 0, dom2_vbd_rd = 0; - - tot_vbd_reqs(domain1, FIELD_VBD_RD, &dom1_vbd_rd); -- tot_vbd_reqs(domain1, FIELD_VBD_RD, &dom2_vbd_rd); -+ tot_vbd_reqs(domain2, FIELD_VBD_RD, &dom2_vbd_rd); - -- return -compare(dom1_vbd_rd, dom1_vbd_rd); -+ return -compare(dom1_vbd_rd, dom2_vbd_rd); - } - - /* Prints number of total VBD READ requests statistic */ -@@ -738,7 +738,7 @@ static int compare_vbd_wr(xenstat_domain *domain1, xenstat_domain *domain2) - unsigned long long dom1_vbd_wr = 0, dom2_vbd_wr = 0; - - tot_vbd_reqs(domain1, FIELD_VBD_WR, &dom1_vbd_wr); -- tot_vbd_reqs(domain1, FIELD_VBD_WR, &dom2_vbd_wr); -+ tot_vbd_reqs(domain2, FIELD_VBD_WR, &dom2_vbd_wr); - - return -compare(dom1_vbd_wr, dom2_vbd_wr); - } -@@ -765,7 +765,7 @@ static int compare_vbd_rsect(xenstat_domain *domain1, xenstat_domain *domain2) - unsigned long long dom1_vbd_rsect = 0, dom2_vbd_rsect = 0; - - tot_vbd_reqs(domain1, FIELD_VBD_RSECT, &dom1_vbd_rsect); -- tot_vbd_reqs(domain1, FIELD_VBD_RSECT, &dom2_vbd_rsect); -+ tot_vbd_reqs(domain2, FIELD_VBD_RSECT, &dom2_vbd_rsect); - - return -compare(dom1_vbd_rsect, dom2_vbd_rsect); - } --- -2.44.0 - - -From fa9950a527a70971bf9279be62d445cf9c83aedf Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 27 Feb 2024 13:54:04 +0100 -Subject: [PATCH 29/70] amd-vi: fix IVMD memory type checks -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The current code that parses the IVMD blocks is relaxed with regard to the -restriction that such unity regions should always fall into memory ranges -marked as reserved in the memory map. - -However the type checks for the IVMD addresses are inverted, and as a result -IVMD ranges falling into RAM areas are accepted. Note that having such ranges -in the first place is a firmware bug, as IVMD should always fall into reserved -ranges. 
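A standalone sketch of the inverted predicate (the RAM_TYPE_* values below are illustrative only; the real constants live in Xen's e820 handling): the old check accepted exactly the pages that must be rejected, namely plain RAM, while rejecting the reserved/ACPI/unusable pages that IVMD ranges are supposed to cover.

#include <stdbool.h>
#include <stdio.h>

#define RAM_TYPE_CONVENTIONAL 0x1u   /* illustrative flag values */
#define RAM_TYPE_RESERVED     0x2u
#define RAM_TYPE_UNUSABLE     0x4u
#define RAM_TYPE_ACPI         0x8u

/* Old, inverted logic: "good" when the page is NOT reserved/ACPI/unusable,
 * i.e. it accepted conventional RAM pages. */
static bool ivmd_page_ok_old(unsigned int type)
{
    return !(type & (RAM_TYPE_RESERVED | RAM_TYPE_ACPI | RAM_TYPE_UNUSABLE));
}

/* Fixed logic: "good" only when the page is one of the non-RAM types. */
static bool ivmd_page_ok_new(unsigned int type)
{
    return type & (RAM_TYPE_RESERVED | RAM_TYPE_ACPI | RAM_TYPE_UNUSABLE);
}

int main(void)
{
    printf("RAM page:      old=%d new=%d\n",
           ivmd_page_ok_old(RAM_TYPE_CONVENTIONAL),
           ivmd_page_ok_new(RAM_TYPE_CONVENTIONAL));
    printf("reserved page: old=%d new=%d\n",
           ivmd_page_ok_old(RAM_TYPE_RESERVED),
           ivmd_page_ok_new(RAM_TYPE_RESERVED));
    return 0;
}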
- -Fixes: ed6c77ebf0c1 ('AMD/IOMMU: check / convert IVMD ranges for being / to be reserved') -Reported-by: Ox <oxjo@proton.me> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Tested-by: oxjo <oxjo@proton.me> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 83afa313583019d9f159c122cecf867735d27ec5 -master date: 2024-02-06 11:56:13 +0100 ---- - xen/drivers/passthrough/amd/iommu_acpi.c | 11 ++++++++--- - 1 file changed, 8 insertions(+), 3 deletions(-) - -diff --git a/xen/drivers/passthrough/amd/iommu_acpi.c b/xen/drivers/passthrough/amd/iommu_acpi.c -index 699d33f429..96d8879e7b 100644 ---- a/xen/drivers/passthrough/amd/iommu_acpi.c -+++ b/xen/drivers/passthrough/amd/iommu_acpi.c -@@ -426,9 +426,14 @@ static int __init parse_ivmd_block(const struct acpi_ivrs_memory *ivmd_block) - return -EIO; - } - -- /* Types which won't be handed out are considered good enough. */ -- if ( !(type & (RAM_TYPE_RESERVED | RAM_TYPE_ACPI | -- RAM_TYPE_UNUSABLE)) ) -+ /* -+ * Types which aren't RAM are considered good enough. -+ * Note that a page being partially RESERVED, ACPI or UNUSABLE will -+ * force Xen into assuming the whole page as having that type in -+ * practice. -+ */ -+ if ( type & (RAM_TYPE_RESERVED | RAM_TYPE_ACPI | -+ RAM_TYPE_UNUSABLE) ) - continue; - - AMD_IOMMU_ERROR("IVMD: page at %lx can't be converted\n", addr); --- -2.44.0 - - -From 16475909baa2bcfda3ebc07ced5e5cd0ca8172d6 Mon Sep 17 00:00:00 2001 -From: Jason Andryuk <jandryuk@gmail.com> -Date: Tue, 27 Feb 2024 13:55:03 +0100 -Subject: [PATCH 30/70] block-common: Fix same_vm for no targets - -same_vm is broken when the two main domains do not have targets. otvm -and targetvm are both missing, which means they get set to -1 and then -converted to empty strings: - -++10697+ local targetvm=-1 -++10697+ local otvm=-1 -++10697+ otvm= -++10697+ othervm=/vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -++10697+ targetvm= -++10697+ local frontend_uuid=/vm/844dea4e-44f8-4e3e-8145-325132a31ca5 - -The final comparison returns true since the two empty strings match: - -++10697+ '[' /vm/844dea4e-44f8-4e3e-8145-325132a31ca5 = /vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -o '' = /vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -o /vm/844dea4e-44f8-4e3e-8145-325132a31ca5 = '' -o '' = '' ']' - -Replace -1 with distinct strings indicating the lack of a value and -remove the collescing to empty stings. The strings themselves will no -longer match, and that is correct. 
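The failure mode can be sketched as a standalone program (read_default below is a stand-in for the script's xenstore_read_default, with NULL modelling a missing node): when two independent lookups fail and both fall back to the same default, "missing" spuriously matches "missing"; distinct sentinels cannot match each other.

#include <stdio.h>
#include <string.h>

/* Stand-in for xenstore_read_default(): NULL models an absent node. */
static const char *read_default(const char *val, const char *def)
{
    return val ? val : def;
}

int main(void)
{
    const char *targetvm, *otvm;

    /* Old scheme: the "-1" defaults were trimmed away, leaving two empty
     * strings that compare equal even though neither value exists. */
    targetvm = read_default(NULL, "");
    otvm     = read_default(NULL, "");
    printf("shared default:     match=%d\n", !strcmp(targetvm, otvm));

    /* Fixed scheme: two distinct sentinels never match each other. */
    targetvm = read_default(NULL, "No Target");
    otvm     = read_default(NULL, "No Other Target");
    printf("distinct sentinels: match=%d\n", !strcmp(targetvm, otvm));

    return 0;
}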
- -++12364+ '[' /vm/844dea4e-44f8-4e3e-8145-325132a31ca5 = /vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -o 'No target' = /vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -o /vm/844dea4e-44f8-4e3e-8145-325132a31ca5 = 'No other target' -o 'No target' = 'No other target' ']' - -Signed-off-by: Jason Andryuk <jandryuk@gmail.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: e8f1bb803fdf44db708991593568a9e3e6b3d130 -master date: 2024-02-07 13:46:52 +0100 ---- - tools/hotplug/Linux/block-common.sh | 8 +++----- - 1 file changed, 3 insertions(+), 5 deletions(-) - -diff --git a/tools/hotplug/Linux/block-common.sh b/tools/hotplug/Linux/block-common.sh -index f86a88c4eb..5c80237d99 100644 ---- a/tools/hotplug/Linux/block-common.sh -+++ b/tools/hotplug/Linux/block-common.sh -@@ -112,14 +112,12 @@ same_vm() - "$FRONTEND_UUID") - local target=$(xenstore_read_default "/local/domain/$FRONTEND_ID/target" \ - "-1") -- local targetvm=$(xenstore_read_default "/local/domain/$target/vm" "-1") -+ local targetvm=$(xenstore_read_default "/local/domain/$target/vm" "No Target") - local otarget=$(xenstore_read_default "/local/domain/$otherdom/target" \ - "-1") - local otvm=$(xenstore_read_default "/local/domain/$otarget/vm" \ -- "-1") -- otvm=${otvm%-1} -- othervm=${othervm%-1} -- targetvm=${targetvm%-1} -+ "No Other Target") -+ - local frontend_uuid=${FRONTEND_UUID%-1} - - [ "$frontend_uuid" = "$othervm" -o "$targetvm" = "$othervm" -o \ --- -2.44.0 - - -From b51fd78aed865033413178f5953147effedc7ce0 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Petr=20Bene=C5=A1?= <w1benny@gmail.com> -Date: Tue, 27 Feb 2024 13:55:25 +0100 -Subject: [PATCH 31/70] x86/hvm: Fix fast singlestep state persistence -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -This patch addresses an issue where the fast singlestep setting would persist -despite xc_domain_debug_control being called with XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF. -Specifically, if fast singlestep was enabled in a VMI session and that session -stopped before the MTF trap occurred, the fast singlestep setting remained -active even though MTF itself was disabled. This led to a situation where, upon -starting a new VMI session, the first event to trigger an EPT violation would -cause the corresponding EPT event callback to be skipped due to the lingering -fast singlestep setting. - -The fix ensures that the fast singlestep setting is properly reset when -disabling single step debugging operations. 
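A compact sketch of the state-hygiene rule the fix enforces (the struct loosely mirrors the fields named in the patch and is not the real vCPU structure): turning a feature off must also clear any auxiliary state it latched, or the next session silently inherits stale behaviour.

#include <stdbool.h>
#include <stdio.h>

struct vcpu_debug {
    bool single_step;
    struct { bool enabled; unsigned int p2midx; } fast_single_step;
};

static void single_step_off(struct vcpu_debug *v)
{
    v->single_step = false;
    /* The fix: also drop the fast-singlestep latch left by a prior session. */
    v->fast_single_step.enabled = false;
    v->fast_single_step.p2midx = 0;
}

int main(void)
{
    struct vcpu_debug v = { true, { true, 5 } };

    single_step_off(&v);
    printf("fast_single_step: enabled=%d p2midx=%u\n",
           v.fast_single_step.enabled, v.fast_single_step.p2midx);
    return 0;
}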
- -Signed-off-by: Petr BeneÅ¡ <w1benny@gmail.com> -Reviewed-by: Tamas K Lengyel <tamas@tklengyel.com> -master commit: 897def94b56175ce569673a05909d2f223e1e749 -master date: 2024-02-12 09:37:58 +0100 ---- - xen/arch/x86/hvm/hvm.c | 34 ++++++++++++++++++++++++---------- - 1 file changed, 24 insertions(+), 10 deletions(-) - -diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c -index 482eebbabf..a70b351373 100644 ---- a/xen/arch/x86/hvm/hvm.c -+++ b/xen/arch/x86/hvm/hvm.c -@@ -5167,26 +5167,40 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg) - - int hvm_debug_op(struct vcpu *v, int32_t op) - { -- int rc; -+ int rc = 0; - - switch ( op ) - { - case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON: - case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF: -- rc = -EOPNOTSUPP; - if ( !cpu_has_monitor_trap_flag ) -- break; -- rc = 0; -- vcpu_pause(v); -- v->arch.hvm.single_step = -- (op == XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON); -- vcpu_unpause(v); /* guest will latch new state */ -+ return -EOPNOTSUPP; - break; - default: -- rc = -ENOSYS; -- break; -+ return -ENOSYS; -+ } -+ -+ vcpu_pause(v); -+ -+ switch ( op ) -+ { -+ case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON: -+ v->arch.hvm.single_step = true; -+ break; -+ -+ case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF: -+ v->arch.hvm.single_step = false; -+ v->arch.hvm.fast_single_step.enabled = false; -+ v->arch.hvm.fast_single_step.p2midx = 0; -+ break; -+ -+ default: /* Excluded above */ -+ ASSERT_UNREACHABLE(); -+ return -ENOSYS; - } - -+ vcpu_unpause(v); /* guest will latch new state */ -+ - return rc; - } - --- -2.44.0 - - -From 59e6ad6597dc9930c966b20485a9d0b369ff71a5 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 27 Feb 2024 13:55:56 +0100 -Subject: [PATCH 32/70] x86/HVM: tidy state on hvmemul_map_linear_addr()'s - error path - -While in the vast majority of cases failure of the function will not -be followed by re-invocation with the same emulation context, a few -very specific insns - involving multiple independent writes, e.g. ENTER -and PUSHA - exist where this can happen. Since failure of the function -only signals to the caller that it ought to try an MMIO write instead, -such failure also cannot be assumed to result in wholesale failure of -emulation of the current insn. Instead we have to maintain internal -state such that another invocation of the function with the same -emulation context remains possible. To achieve that we need to reset MFN -slots after putting page references on the error path. - -Note that all of this affects debugging code only, in causing an -assertion to trigger (higher up in the function). There's otherwise no -misbehavior - such a "leftover" slot would simply be overwritten by new -contents in a release build. - -Also extend the related unmap() assertion, to further check for MFN 0. - -Fixes: 8cbd4fb0b7ea ("x86/hvm: implement hvmemul_write() using real mappings") -Reported-by: Manuel Andreas <manuel.andreas@tum.de> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Paul Durrant <paul@xen.org> -master commit: e72f951df407bc3be82faac64d8733a270036ba1 -master date: 2024-02-13 09:36:14 +0100 ---- - xen/arch/x86/hvm/emulate.c | 7 ++++++- - 1 file changed, 6 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c -index 254716c766..865aa08bbc 100644 ---- a/xen/arch/x86/hvm/emulate.c -+++ b/xen/arch/x86/hvm/emulate.c -@@ -696,7 +696,12 @@ static void *hvmemul_map_linear_addr( - out: - /* Drop all held references. 
*/ - while ( mfn-- > hvmemul_ctxt->mfn ) -+ { - put_page(mfn_to_page(*mfn)); -+#ifndef NDEBUG /* Clean slot for a subsequent map()'s error checking. */ -+ *mfn = _mfn(0); -+#endif -+ } - - return err; - } -@@ -718,7 +723,7 @@ static void hvmemul_unmap_linear_addr( - - for ( i = 0; i < nr_frames; i++ ) - { -- ASSERT(mfn_valid(*mfn)); -+ ASSERT(mfn_x(*mfn) && mfn_valid(*mfn)); - paging_mark_dirty(currd, *mfn); - put_page(mfn_to_page(*mfn)); - --- -2.44.0 - - -From 006764b871db75d5d025500a079ad246d1d418a1 Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Tue, 27 Feb 2024 13:56:25 +0100 -Subject: [PATCH 33/70] build: Replace `which` with `command -v` -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The `which` command is not standard, may not exist on the build host, -or may not behave as expected by the build system. It is recommended -to use `command -v` to find out if a command exist and have its path, -and it's part of a POSIX shell standard (at least, it seems to be -mandatory since IEEE Std 1003.1-2008, but was optional before). - -Fixes: c8a8645f1efe ("xen/build: Automatically locate a suitable python interpreter") -Fixes: 3b47bcdb6d38 ("xen/build: Use a distro version of figlet") -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Tested-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: f93629b18b528a5ab1b1092949c5420069c7226c -master date: 2024-02-19 12:45:48 +0100 ---- - xen/Makefile | 4 ++-- - xen/build.mk | 2 +- - 2 files changed, 3 insertions(+), 3 deletions(-) - -diff --git a/xen/Makefile b/xen/Makefile -index a92709b43e..59d368e4d8 100644 ---- a/xen/Makefile -+++ b/xen/Makefile -@@ -25,8 +25,8 @@ export XEN_BUILD_HOST := $(shell hostname) - endif - - # Best effort attempt to find a python interpreter, defaulting to Python 3 if --# available. Fall back to just `python` if `which` is nowhere to be found. --PYTHON_INTERPRETER := $(word 1,$(shell which python3 python python2 2>/dev/null) python) -+# available. Fall back to just `python`. -+PYTHON_INTERPRETER := $(word 1,$(shell command -v python3 || command -v python || command -v python2) python) - export PYTHON ?= $(PYTHON_INTERPRETER) - - export CHECKPOLICY ?= checkpolicy -diff --git a/xen/build.mk b/xen/build.mk -index 26dd5a8e87..0f490ca71b 100644 ---- a/xen/build.mk -+++ b/xen/build.mk -@@ -1,6 +1,6 @@ - quiet_cmd_banner = BANNER $@ - define cmd_banner -- if which figlet >/dev/null 2>&1 ; then \ -+ if command -v figlet >/dev/null 2>&1 ; then \ - echo " Xen $(XEN_FULLVERSION)" | figlet -f $< > $@.tmp; \ - else \ - echo " Xen $(XEN_FULLVERSION)" > $@.tmp; \ --- -2.44.0 - - -From 489c2b9ba173376e978c0ef3de416a2f09452e85 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= - <marmarek@invisiblethingslab.com> -Date: Tue, 27 Feb 2024 13:57:07 +0100 -Subject: [PATCH 34/70] libxl: Disable relocating memory for qemu-xen in - stubdomain too -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -According to comments (and experiments) qemu-xen cannot handle memory -reolcation done by hvmloader. The code was already disabled when running -qemu-xen in dom0 (see libxl__spawn_local_dm()), but it was missed when -adding qemu-xen support to stubdomain. Adjust libxl__spawn_stub_dm() to -be consistent in this regard. 
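The policy itself reduces to a small predicate; a hedged standalone sketch follows (the enum and function names are illustrative, not the libxl API): relocation is only safe with qemu-traditional, and only when vNUMA has not already fixed the memory layout.

#include <stdbool.h>
#include <stdio.h>

enum dm_version { QEMU_XEN, QEMU_XEN_TRADITIONAL };   /* illustrative */

static bool allow_memory_relocate(enum dm_version dm, bool vnuma)
{
    /* Only qemu-traditional copes with hvmloader moving guest memory to
     * enlarge the MMIO hole, and only when vNUMA hasn't pinned the layout. */
    return dm == QEMU_XEN_TRADITIONAL && !vnuma;
}

int main(void)
{
    printf("qemu-xen stubdom:    %d\n",
           allow_memory_relocate(QEMU_XEN, false));
    printf("qemu-trad, no vNUMA: %d\n",
           allow_memory_relocate(QEMU_XEN_TRADITIONAL, false));
    printf("qemu-trad, vNUMA:    %d\n",
           allow_memory_relocate(QEMU_XEN_TRADITIONAL, true));
    return 0;
}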
- -Reported-by: Neowutran <xen@neowutran.ovh> -Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> -Reviewed-by: Jason Andryuk <jandryuk@gmail.com> -Acked-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: 97883aa269f6745a6ded232be3a855abb1297e0d -master date: 2024-02-22 11:48:22 +0100 ---- - tools/libs/light/libxl_dm.c | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c -index 14b593110f..ed620a9d8e 100644 ---- a/tools/libs/light/libxl_dm.c -+++ b/tools/libs/light/libxl_dm.c -@@ -2432,6 +2432,16 @@ void libxl__spawn_stub_dm(libxl__egc *egc, libxl__stub_dm_spawn_state *sdss) - "%s", - libxl_bios_type_to_string(guest_config->b_info.u.hvm.bios)); - } -+ /* Disable relocating memory to make the MMIO hole larger -+ * unless we're running qemu-traditional and vNUMA is not -+ * configured. */ -+ libxl__xs_printf(gc, XBT_NULL, -+ libxl__sprintf(gc, "%s/hvmloader/allow-memory-relocate", -+ libxl__xs_get_dompath(gc, guest_domid)), -+ "%d", -+ guest_config->b_info.device_model_version -+ == LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL && -+ !libxl__vnuma_configured(&guest_config->b_info)); - ret = xc_domain_set_target(ctx->xch, dm_domid, guest_domid); - if (ret<0) { - LOGED(ERROR, guest_domid, "setting target domain %d -> %d", --- -2.44.0 - - -From 5fda82641461a5234ab9bf0575423dfb8bfc5657 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 27 Feb 2024 13:57:31 +0100 -Subject: [PATCH 35/70] build: make sure build fails when running kconfig fails -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Because of using "-include", failure to (re)build auto.conf (with -auto.conf.cmd produced as a secondary target) won't stop make from -continuing the build. Arrange for it being possible to drop the - from -Rules.mk, requiring that the include be skipped for tools-only targets. -Note that relying on the inclusion in those cases wouldn't be correct -anyway, as it might be a stale file (yet to be rebuilt) which would be -included, while during initial build, the file would be absent -altogether. - -Fixes: 8d4c17a90b0a ("xen/build: silence make warnings about missing auto.conf*") -Reported-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: d34e5fa2e8db19f23081f46a3e710bb122130691 -master date: 2024-02-22 11:52:47 +0100 ---- - xen/Makefile | 1 + - xen/Rules.mk | 4 +++- - 2 files changed, 4 insertions(+), 1 deletion(-) - -diff --git a/xen/Makefile b/xen/Makefile -index 59d368e4d8..fdf9fd3f22 100644 ---- a/xen/Makefile -+++ b/xen/Makefile -@@ -374,6 +374,7 @@ $(KCONFIG_CONFIG): tools_fixdep - # This exploits the 'multi-target pattern rule' trick. - # The syncconfig should be executed only once to make all the targets. 
- include/config/%.conf include/config/%.conf.cmd: $(KCONFIG_CONFIG) -+ $(Q)rm -f include/config/auto.conf - $(Q)$(MAKE) $(build)=tools/kconfig syncconfig - - ifeq ($(CONFIG_DEBUG),y) -diff --git a/xen/Rules.mk b/xen/Rules.mk -index 8af3dd7277..d759cccee3 100644 ---- a/xen/Rules.mk -+++ b/xen/Rules.mk -@@ -15,7 +15,9 @@ srcdir := $(srctree)/$(src) - PHONY := __build - __build: - ---include $(objtree)/include/config/auto.conf -+ifneq ($(firstword $(subst /, ,$(obj))),tools) -+include $(objtree)/include/config/auto.conf -+endif - - include $(XEN_ROOT)/Config.mk - include $(srctree)/scripts/Kbuild.include --- -2.44.0 - - -From a751d1321f6e1491d6ec2134d59eefa9f9752b86 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 27 Feb 2024 13:57:50 +0100 -Subject: [PATCH 36/70] x86emul: add missing EVEX.R' checks - -EVEX.R' is not ignored in 64-bit code when encoding a GPR or mask -register. While for mask registers suitable checks are in place (there -also covering EVEX.R), they were missing for the few cases where in -EVEX-encoded instructions ModR/M.reg encodes a GPR. While for VPEXTRW -the bit is replaced before an emulation stub is invoked, for -VCVT{,T}{S,D,H}2{,U}SI this actually would have led to #UD from inside -an emulation stub, in turn raising #UD to the guest, but accompanied by -log messages indicating something's wrong in Xen nevertheless. - -Fixes: 001bd91ad864 ("x86emul: support AVX512{F,BW,DQ} extract insns") -Fixes: baf4a376f550 ("x86emul: support AVX512F legacy-equivalent scalar int/FP conversion insns") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: cb319824bfa8d3c9ea0410cc71daaedc3e11aa2a -master date: 2024-02-22 11:54:07 +0100 ---- - xen/arch/x86/x86_emulate/x86_emulate.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c -index cf780da501..d6b60f0539 100644 ---- a/xen/arch/x86/x86_emulate/x86_emulate.c -+++ b/xen/arch/x86/x86_emulate/x86_emulate.c -@@ -3686,7 +3686,8 @@ x86_emulate( - CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2d): /* vcvts{s,d}2si xmm/mem,reg */ - CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x78): /* vcvtts{s,d}2usi xmm/mem,reg */ - CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x79): /* vcvts{s,d}2usi xmm/mem,reg */ -- generate_exception_if((evex.reg != 0xf || !evex.RX || evex.opmsk || -+ generate_exception_if((evex.reg != 0xf || !evex.RX || !evex.R || -+ evex.opmsk || - (ea.type != OP_REG && evex.brs)), - X86_EXC_UD); - host_and_vcpu_must_have(avx512f); -@@ -7295,7 +7296,7 @@ x86_emulate( - goto pextr; - - case X86EMUL_OPC_EVEX_66(0x0f, 0xc5): /* vpextrw $imm8,xmm,reg */ -- generate_exception_if(ea.type != OP_REG, X86_EXC_UD); -+ generate_exception_if(ea.type != OP_REG || !evex.R, X86_EXC_UD); - /* Convert to alternative encoding: We want to use a memory operand. */ - evex.opcx = ext_0f3a; - b = 0x15; --- -2.44.0 - - -From 33a0368d3beb82ddb0cf7ed398b047325bb7be1c Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 27 Feb 2024 13:58:21 +0100 -Subject: [PATCH 37/70] xen/livepatch: fix norevert test hook setup typo -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The test code has a typo in using LIVEPATCH_APPLY_HOOK() instead of -LIVEPATCH_REVERT_HOOK(). 
- -Fixes: 6047104c3ccc ('livepatch: Add per-function applied/reverted state tracking marker') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com> -master commit: f0622dd4fd6ae6ddb523a45d89ed9b8f3a9a8f36 -master date: 2024-02-26 10:13:46 +0100 ---- - xen/test/livepatch/xen_action_hooks_norevert.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/test/livepatch/xen_action_hooks_norevert.c b/xen/test/livepatch/xen_action_hooks_norevert.c -index 3e21ade6ab..c173855192 100644 ---- a/xen/test/livepatch/xen_action_hooks_norevert.c -+++ b/xen/test/livepatch/xen_action_hooks_norevert.c -@@ -120,7 +120,7 @@ static void post_revert_hook(livepatch_payload_t *payload) - printk(KERN_DEBUG "%s: Hook done.\n", __func__); - } - --LIVEPATCH_APPLY_HOOK(revert_hook); -+LIVEPATCH_REVERT_HOOK(revert_hook); - - LIVEPATCH_PREAPPLY_HOOK(pre_apply_hook); - LIVEPATCH_POSTAPPLY_HOOK(post_apply_hook); --- -2.44.0 - - -From f6e5ab5fa7257783fdbbaabf6010d8d97656c11f Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 27 Feb 2024 13:58:36 +0100 -Subject: [PATCH 38/70] xen/cmdline: fix printf format specifier in - no_config_param() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -'*' sets the width field, which is the minimum number of characters to output, -but what we want in no_config_param() is the precision instead, which is '.*' -as it imposes a maximum limit on the output. - -Fixes: 68d757df8dd2 ('x86/pv: Options to disable and/or compile out 32bit PV support') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: ef101f525173cf51dc70f4c77862f6f10a8ddccf -master date: 2024-02-26 10:17:40 +0100 ---- - xen/include/xen/param.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/include/xen/param.h b/xen/include/xen/param.h -index 93c3fe7cb7..e02e49635c 100644 ---- a/xen/include/xen/param.h -+++ b/xen/include/xen/param.h -@@ -191,7 +191,7 @@ static inline void no_config_param(const char *cfg, const char *param, - { - int len = e ? ({ ASSERT(e >= s); e - s; }) : strlen(s); - -- printk(XENLOG_INFO "CONFIG_%s disabled - ignoring '%s=%*s' setting\n", -+ printk(XENLOG_INFO "CONFIG_%s disabled - ignoring '%s=%.*s' setting\n", - cfg, param, len, s); - } - --- -2.44.0 - - -From 19fd9ff9981732995b1028f9e7e406061b723651 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 27 Feb 2024 13:59:05 +0100 -Subject: [PATCH 39/70] x86/altcall: use a union as register type for function - parameters on clang -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The current code for alternative calls uses the caller parameter types as the -types for the register variables that serve as function parameters: - -uint8_t foo; -[...] -alternative_call(myfunc, foo); - -Would expand roughly into: - -register unint8_t a1_ asm("rdi") = foo; -register unsigned long a2_ asm("rsi"); -[...] 
-asm volatile ("call *%c[addr](%%rip)"...); - -However with -O2 clang will generate incorrect code, given the following -example: - -unsigned int func(uint8_t t) -{ - return t; -} - -static void bar(uint8_t b) -{ - int ret_; - register uint8_t di asm("rdi") = b; - register unsigned long si asm("rsi"); - register unsigned long dx asm("rdx"); - register unsigned long cx asm("rcx"); - register unsigned long r8 asm("r8"); - register unsigned long r9 asm("r9"); - register unsigned long r10 asm("r10"); - register unsigned long r11 asm("r11"); - - asm volatile ( "call %c[addr]" - : "+r" (di), "=r" (si), "=r" (dx), - "=r" (cx), "=r" (r8), "=r" (r9), - "=r" (r10), "=r" (r11), "=a" (ret_) - : [addr] "i" (&(func)), "g" (func) - : "memory" ); -} - -void foo(unsigned int a) -{ - bar(a); -} - -Clang generates the following assembly code: - -func: # @func - movl %edi, %eax - retq -foo: # @foo - callq func - retq - -Note the truncation of the unsigned int parameter 'a' of foo() to uint8_t when -passed into bar() is lost. clang doesn't zero extend the parameters in the -callee when required, as the psABI mandates. - -The above can be worked around by using a union when defining the register -variables, so that `di` becomes: - -register union { - uint8_t e; - unsigned long r; -} di asm("rdi") = { .e = b }; - -Which results in following code generated for `foo()`: - -foo: # @foo - movzbl %dil, %edi - callq func - retq - -So the truncation is not longer lost. Apply such workaround only when built -with clang. - -Reported-by: Matthew Grooms <mgrooms@shrew.net> -Link: https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=277200 -Link: https://github.com/llvm/llvm-project/issues/12579 -Link: https://github.com/llvm/llvm-project/issues/82598 -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -master commit: 2ce562b2a413cbdb2e1128989ed1722290a27c4e -master date: 2024-02-26 10:18:01 +0100 ---- - xen/arch/x86/include/asm/alternative.h | 25 +++++++++++++++++++++++++ - 1 file changed, 25 insertions(+) - -diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h -index a1cd6a9fe5..3c14db5078 100644 ---- a/xen/arch/x86/include/asm/alternative.h -+++ b/xen/arch/x86/include/asm/alternative.h -@@ -167,9 +167,34 @@ extern void alternative_branches(void); - #define ALT_CALL_arg5 "r8" - #define ALT_CALL_arg6 "r9" - -+#ifdef CONFIG_CC_IS_CLANG -+/* -+ * Use a union with an unsigned long in order to prevent clang from -+ * skipping a possible truncation of the value. By using the union any -+ * truncation is carried before the call instruction, in turn covering -+ * for ABI-non-compliance in that the necessary clipping / extension of -+ * the value is supposed to be carried out in the callee. -+ * -+ * Note this behavior is not mandated by the standard, and hence could -+ * stop being a viable workaround, or worse, could cause a different set -+ * of code-generation issues in future clang versions. 
-+ * -+ * This has been reported upstream: -+ * https://github.com/llvm/llvm-project/issues/12579 -+ * https://github.com/llvm/llvm-project/issues/82598 -+ */ -+#define ALT_CALL_ARG(arg, n) \ -+ register union { \ -+ typeof(arg) e; \ -+ unsigned long r; \ -+ } a ## n ## _ asm ( ALT_CALL_arg ## n ) = { \ -+ .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); }) \ -+ } -+#else - #define ALT_CALL_ARG(arg, n) \ - register typeof(arg) a ## n ## _ asm ( ALT_CALL_arg ## n ) = \ - ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); }) -+#endif - #define ALT_CALL_NO_ARG(n) \ - register unsigned long a ## n ## _ asm ( ALT_CALL_arg ## n ) - --- -2.44.0 - - -From 4d47dca20dcfdca2340c8cda6f50dcdcafb1c054 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 27 Feb 2024 13:59:42 +0100 -Subject: [PATCH 40/70] x86/spec: fix BRANCH_HARDEN option to only be set when - build-enabled -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The current logic to handle the BRANCH_HARDEN option will report it as enabled -even when build-time disabled. Fix this by only allowing the option to be set -when support for it is built into Xen. - -Fixes: 2d6f36daa086 ('x86/nospec: Introduce CONFIG_SPECULATIVE_HARDEN_BRANCH') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 60e00f77a5cc671d30c5ef3318f5b8e9b74e4aa3 -master date: 2024-02-26 16:06:42 +0100 ---- - xen/arch/x86/spec_ctrl.c | 14 ++++++++++++-- - 1 file changed, 12 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index a8d8af22f6..01ba59cff7 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -50,7 +50,8 @@ static int8_t __initdata opt_psfd = -1; - int8_t __ro_after_init opt_ibpb_ctxt_switch = -1; - int8_t __read_mostly opt_eager_fpu = -1; - int8_t __read_mostly opt_l1d_flush = -1; --static bool __initdata opt_branch_harden = true; -+static bool __initdata opt_branch_harden = -+ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH); - - bool __initdata bsp_delay_spec_ctrl; - uint8_t __read_mostly default_xen_spec_ctrl; -@@ -268,7 +269,16 @@ static int __init cf_check parse_spec_ctrl(const char *s) - else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) - opt_l1d_flush = val; - else if ( (val = parse_boolean("branch-harden", s, ss)) >= 0 ) -- opt_branch_harden = val; -+ { -+ if ( IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) ) -+ opt_branch_harden = val; -+ else -+ { -+ no_config_param("SPECULATIVE_HARDEN_BRANCH", "spec-ctrl", s, -+ ss); -+ rc = -EINVAL; -+ } -+ } - else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) - opt_srb_lock = val; - else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) --- -2.44.0 - - -From 58bb8115104c9fca749ee4cfcd3579ac1ed644db Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 27 Feb 2024 14:00:22 +0100 -Subject: [PATCH 41/70] x86: account for shadow stack in exception-from-stub - recovery - -Dealing with exceptions raised from within emulation stubs involves -discarding return address (replaced by exception related information). -Such discarding of course also requires removing the corresponding entry -from the shadow stack. - -Also amend the comment in fixup_exception_return(), to further clarify -why use of ptr[1] can't be an out-of-bounds access. - -While touching do_invalid_op() also add a missing fall-through -annotation. - -This is CVE-2023-46841 / XSA-451. 
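A toy model of the recovery step, using a plain array instead of real CET state (the actual fix must use wrss/incsspd, since shadow-stack memory is not ordinarily writable, and it also validates the frame first): dropping the stub's return address from the middle of the shadow stack means shifting every newer entry up one slot and advancing SSP by one.

#include <stdio.h>

#define SLOTS 8

int main(void)
{
    /* Lower index == newer entry, mimicking a downward-growing stack. */
    unsigned long shstk[SLOTS] = { 0 };
    unsigned long *ssp = &shstk[3];       /* current top of stack */
    unsigned long *victim = &shstk[5];    /* stub return address to drop */

    shstk[3] = 0xaaaa;                    /* entries newer than the stub RA */
    shstk[4] = 0xbbbb;
    shstk[5] = 0xdead;                    /* the stub's return address */
    shstk[6] = 0x1111;                    /* older entries stay untouched */
    shstk[7] = 0x2222;

    /* Shift every entry newer than the victim up by one slot... */
    for ( unsigned long *p = victim; p != ssp; p-- )
        p[0] = p[-1];
    /* ... and account for the stack having shifted up (cf. incsspd). */
    ssp++;

    for ( unsigned int i = 0; i < SLOTS; i++ )
        printf("%s[%u] %#lx\n",
               &shstk[i] == ssp ? "SSP->" : "     ", i, shstk[i]);

    return 0;
}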
- -Fixes: 209fb9919b50 ("x86/extable: Adjust extable handling to be shadow stack compatible") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 91f5f7a9154919a765c3933521760acffeddbf28 -master date: 2024-02-27 13:49:22 +0100 ---- - xen/arch/x86/extable.c | 20 ++++++---- - xen/arch/x86/include/asm/uaccess.h | 3 +- - xen/arch/x86/traps.c | 62 +++++++++++++++++++++++++++--- - 3 files changed, 71 insertions(+), 14 deletions(-) - -diff --git a/xen/arch/x86/extable.c b/xen/arch/x86/extable.c -index 74b14246e9..8ffcd346d7 100644 ---- a/xen/arch/x86/extable.c -+++ b/xen/arch/x86/extable.c -@@ -86,26 +86,29 @@ search_one_extable(const struct exception_table_entry *first, - } - - unsigned long --search_exception_table(const struct cpu_user_regs *regs) -+search_exception_table(const struct cpu_user_regs *regs, unsigned long *stub_ra) - { - const struct virtual_region *region = find_text_region(regs->rip); - unsigned long stub = this_cpu(stubs.addr); - - if ( region && region->ex ) -+ { -+ *stub_ra = 0; - return search_one_extable(region->ex, region->ex_end, regs->rip); -+ } - - if ( regs->rip >= stub + STUB_BUF_SIZE / 2 && - regs->rip < stub + STUB_BUF_SIZE && - regs->rsp > (unsigned long)regs && - regs->rsp < (unsigned long)get_cpu_info() ) - { -- unsigned long retptr = *(unsigned long *)regs->rsp; -+ unsigned long retaddr = *(unsigned long *)regs->rsp, fixup; - -- region = find_text_region(retptr); -- retptr = region && region->ex -- ? search_one_extable(region->ex, region->ex_end, retptr) -- : 0; -- if ( retptr ) -+ region = find_text_region(retaddr); -+ fixup = region && region->ex -+ ? search_one_extable(region->ex, region->ex_end, retaddr) -+ : 0; -+ if ( fixup ) - { - /* - * Put trap number and error code on the stack (in place of the -@@ -117,7 +120,8 @@ search_exception_table(const struct cpu_user_regs *regs) - }; - - *(unsigned long *)regs->rsp = token.raw; -- return retptr; -+ *stub_ra = retaddr; -+ return fixup; - } - } - -diff --git a/xen/arch/x86/include/asm/uaccess.h b/xen/arch/x86/include/asm/uaccess.h -index 684fccd95c..74bb222c03 100644 ---- a/xen/arch/x86/include/asm/uaccess.h -+++ b/xen/arch/x86/include/asm/uaccess.h -@@ -421,7 +421,8 @@ union stub_exception_token { - unsigned long raw; - }; - --extern unsigned long search_exception_table(const struct cpu_user_regs *regs); -+extern unsigned long search_exception_table(const struct cpu_user_regs *regs, -+ unsigned long *stub_ra); - extern void sort_exception_tables(void); - extern void sort_exception_table(struct exception_table_entry *start, - const struct exception_table_entry *stop); -diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c -index e1356f696a..45e1b277ea 100644 ---- a/xen/arch/x86/traps.c -+++ b/xen/arch/x86/traps.c -@@ -845,7 +845,7 @@ void do_unhandled_trap(struct cpu_user_regs *regs) - } - - static void fixup_exception_return(struct cpu_user_regs *regs, -- unsigned long fixup) -+ unsigned long fixup, unsigned long stub_ra) - { - if ( IS_ENABLED(CONFIG_XEN_SHSTK) ) - { -@@ -862,7 +862,8 @@ static void fixup_exception_return(struct cpu_user_regs *regs, - /* - * Search for %rip. The shstk currently looks like this: - * -- * ... [Likely pointed to by SSP] -+ * tok [Supervisor token, == &tok | BUSY, only with FRED inactive] -+ * ... 
[Pointed to by SSP for most exceptions, empty in IST cases] - * %cs [== regs->cs] - * %rip [== regs->rip] - * SSP [Likely points to 3 slots higher, above %cs] -@@ -880,7 +881,56 @@ static void fixup_exception_return(struct cpu_user_regs *regs, - */ - if ( ptr[0] == regs->rip && ptr[1] == regs->cs ) - { -+ unsigned long primary_shstk = -+ (ssp & ~(STACK_SIZE - 1)) + -+ (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8; -+ - wrss(fixup, ptr); -+ -+ if ( !stub_ra ) -+ goto shstk_done; -+ -+ /* -+ * Stub recovery ought to happen only when the outer context -+ * was on the main shadow stack. We need to also "pop" the -+ * stub's return address from the interrupted context's shadow -+ * stack. That is, -+ * - if we're still on the main stack, we need to move the -+ * entire stack (up to and including the exception frame) -+ * up by one slot, incrementing the original SSP in the -+ * exception frame, -+ * - if we're on an IST stack, we need to increment the -+ * original SSP. -+ */ -+ BUG_ON((ptr[-1] ^ primary_shstk) >> PAGE_SHIFT); -+ -+ if ( (ssp ^ primary_shstk) >> PAGE_SHIFT ) -+ { -+ /* -+ * We're on an IST stack. First make sure the two return -+ * addresses actually match. Then increment the interrupted -+ * context's SSP. -+ */ -+ BUG_ON(stub_ra != *(unsigned long*)ptr[-1]); -+ wrss(ptr[-1] + 8, &ptr[-1]); -+ goto shstk_done; -+ } -+ -+ /* Make sure the two return addresses actually match. */ -+ BUG_ON(stub_ra != ptr[2]); -+ -+ /* Move exception frame, updating SSP there. */ -+ wrss(ptr[1], &ptr[2]); /* %cs */ -+ wrss(ptr[0], &ptr[1]); /* %rip */ -+ wrss(ptr[-1] + 8, &ptr[0]); /* SSP */ -+ -+ /* Move all newer entries. */ -+ while ( --ptr != _p(ssp) ) -+ wrss(ptr[-1], &ptr[0]); -+ -+ /* Finally account for our own stack having shifted up. */ -+ asm volatile ( "incsspd %0" :: "r" (2) ); -+ - goto shstk_done; - } - } -@@ -901,7 +951,8 @@ static void fixup_exception_return(struct cpu_user_regs *regs, - - static bool extable_fixup(struct cpu_user_regs *regs, bool print) - { -- unsigned long fixup = search_exception_table(regs); -+ unsigned long stub_ra = 0; -+ unsigned long fixup = search_exception_table(regs, &stub_ra); - - if ( unlikely(fixup == 0) ) - return false; -@@ -915,7 +966,7 @@ static bool extable_fixup(struct cpu_user_regs *regs, bool print) - vector_name(regs->entry_vector), regs->error_code, - _p(regs->rip), _p(regs->rip), _p(fixup)); - -- fixup_exception_return(regs, fixup); -+ fixup_exception_return(regs, fixup, stub_ra); - this_cpu(last_extable_addr) = regs->rip; - - return true; -@@ -1183,7 +1234,8 @@ void do_invalid_op(struct cpu_user_regs *regs) - { - case BUGFRAME_run_fn: - case BUGFRAME_warn: -- fixup_exception_return(regs, (unsigned long)eip); -+ fixup_exception_return(regs, (unsigned long)eip, 0); -+ fallthrough; - case BUGFRAME_bug: - case BUGFRAME_assert: - return; --- -2.44.0 - - -From 498b3624d0ecc1267773e6482fd0b732e90c4511 Mon Sep 17 00:00:00 2001 -From: Michal Orzel <michal.orzel@amd.com> -Date: Thu, 8 Feb 2024 11:43:39 +0100 -Subject: [PATCH 42/70] xen/arm: Fix UBSAN failure in start_xen() - -When running Xen on arm32, in scenario where Xen is loaded at an address -such as boot_phys_offset >= 2GB, UBSAN reports the following: - -(XEN) UBSAN: Undefined behaviour in arch/arm/setup.c:739:58 -(XEN) pointer operation underflowed 00200000 to 86800000 -(XEN) Xen WARN at common/ubsan/ubsan.c:172 -(XEN) ----[ Xen-4.19-unstable arm32 debug=y ubsan=y Not tainted ]---- -... 
-(XEN) Xen call trace: -(XEN) [<0031b4c0>] ubsan.c#ubsan_epilogue+0x18/0xf0 (PC) -(XEN) [<0031d134>] __ubsan_handle_pointer_overflow+0xb8/0xd4 (LR) -(XEN) [<0031d134>] __ubsan_handle_pointer_overflow+0xb8/0xd4 -(XEN) [<004d15a8>] start_xen+0xe0/0xbe0 -(XEN) [<0020007c>] head.o#primary_switched+0x4/0x30 - -The failure is reported for the following line: -(paddr_t)(uintptr_t)(_start + boot_phys_offset) - -This occurs because the compiler treats (ptr + size) with size bigger than -PTRDIFF_MAX as undefined behavior. To address this, switch to macro -virt_to_maddr(), given the future plans to eliminate boot_phys_offset. - -Signed-off-by: Michal Orzel <michal.orzel@amd.com> -Reviewed-by: Luca Fancellu <luca.fancellu@arm.com> -Tested-by: Luca Fancellu <luca.fancellu@arm.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit e11f5766503c0ff074b4e0f888bbfc931518a169) ---- - xen/arch/arm/setup.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c -index db748839d3..2ccdde5277 100644 ---- a/xen/arch/arm/setup.c -+++ b/xen/arch/arm/setup.c -@@ -1109,7 +1109,7 @@ void __init start_xen(unsigned long boot_phys_offset, - - /* Register Xen's load address as a boot module. */ - xen_bootmodule = add_boot_module(BOOTMOD_XEN, -- (paddr_t)(uintptr_t)(_start + boot_phys_offset), -+ virt_to_maddr(_start), - (paddr_t)(uintptr_t)(_end - _start), false); - BUG_ON(!xen_bootmodule); - --- -2.44.0 - - -From 3e383bb4137c6ca3058cd55cb867ecc2b7414499 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 5 Mar 2024 11:48:39 +0100 -Subject: [PATCH 43/70] x86/HVM: hide SVM/VMX when their enabling is prohibited - by firmware -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -... or we fail to enable the functionality on the BSP for other reasons. -The only place where hardware announcing the feature is recorded is the -raw CPU policy/featureset. - -Inspired by https://lore.kernel.org/all/20230921114940.957141-1-pbonzini@redhat.com/. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: 0b5f149338e35a795bf609ce584640b0977f9e6c -master date: 2024-01-09 14:06:34 +0100 ---- - xen/arch/x86/hvm/svm/svm.c | 1 + - xen/arch/x86/hvm/vmx/vmcs.c | 17 +++++++++++++++++ - 2 files changed, 18 insertions(+) - -diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c -index 24c417ca71..ff991c82cf 100644 ---- a/xen/arch/x86/hvm/svm/svm.c -+++ b/xen/arch/x86/hvm/svm/svm.c -@@ -2543,6 +2543,7 @@ const struct hvm_function_table * __init start_svm(void) - - if ( _svm_cpu_up(true) ) - { -+ setup_clear_cpu_cap(X86_FEATURE_SVM); - printk("SVM: failed to initialise.\n"); - return NULL; - } -diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c -index 13719cc923..e382aa16c5 100644 ---- a/xen/arch/x86/hvm/vmx/vmcs.c -+++ b/xen/arch/x86/hvm/vmx/vmcs.c -@@ -2165,6 +2165,23 @@ int __init vmx_vmcs_init(void) - - if ( !ret ) - register_keyhandler('v', vmcs_dump, "dump VT-x VMCSs", 1); -+ else -+ { -+ setup_clear_cpu_cap(X86_FEATURE_VMX); -+ -+ /* -+ * _vmx_vcpu_up() may have made it past feature identification. -+ * Make sure all dependent features are off as well. 
-+ */ -+ vmx_basic_msr = 0; -+ vmx_pin_based_exec_control = 0; -+ vmx_cpu_based_exec_control = 0; -+ vmx_secondary_exec_control = 0; -+ vmx_vmexit_control = 0; -+ vmx_vmentry_control = 0; -+ vmx_ept_vpid_cap = 0; -+ vmx_vmfunc = 0; -+ } - - return ret; - } --- -2.44.0 - - -From 57f137053652d5a981ae21f3abe7becc507fe434 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 5 Mar 2024 11:49:22 +0100 -Subject: [PATCH 44/70] xen/sched: Fix UB shift in compat_set_timer_op() - -Tamas reported this UBSAN failure from fuzzing: - - (XEN) ================================================================================ - (XEN) UBSAN: Undefined behaviour in common/sched/compat.c:48:37 - (XEN) left shift of negative value -2147425536 - (XEN) ----[ Xen-4.19-unstable x86_64 debug=y ubsan=y Not tainted ]---- - ... - (XEN) Xen call trace: - (XEN) [<ffff82d040307c1c>] R ubsan.c#ubsan_epilogue+0xa/0xd9 - (XEN) [<ffff82d040308afb>] F __ubsan_handle_shift_out_of_bounds+0x11a/0x1c5 - (XEN) [<ffff82d040307758>] F compat_set_timer_op+0x41/0x43 - (XEN) [<ffff82d04040e4cc>] F hvm_do_multicall_call+0x77f/0xa75 - (XEN) [<ffff82d040519462>] F arch_do_multicall_call+0xec/0xf1 - (XEN) [<ffff82d040261567>] F do_multicall+0x1dc/0xde3 - (XEN) [<ffff82d04040d2b3>] F hvm_hypercall+0xa00/0x149a - (XEN) [<ffff82d0403cd072>] F vmx_vmexit_handler+0x1596/0x279c - (XEN) [<ffff82d0403d909b>] F vmx_asm_vmexit_handler+0xdb/0x200 - -Left-shifting any negative value is strictly undefined behaviour in C, and -the two parameters here come straight from the guest. - -The fuzzer happened to choose lo 0xf, hi 0x8000e300. - -Switch everything to be unsigned values, making the shift well defined. - -As GCC documents: - - As an extension to the C language, GCC does not use the latitude given in - C99 and C11 only to treat certain aspects of signed '<<' as undefined. - However, -fsanitize=shift (and -fsanitize=undefined) will diagnose such - cases. - -this was deemed not to need an XSA. - -Note: The unsigned -> signed conversion for do_set_timer_op()'s s_time_t -parameter is also well defined. C makes it implementation defined, and GCC -defines it as reduction modulo 2^N to be within range of the new type. 
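A self-contained illustration of the difference (compile with -fsanitize=shift to see the signed variant diagnosed; the reproducer values match the fuzzer's):

#include <stdint.h>
#include <stdio.h>

/* Old shape: hi is signed, so the shift is UB whenever hi < 0. */
static uint64_t combine_signed(uint32_t lo, int32_t hi)
{
    return ((int64_t)hi << 32) | lo;   /* flagged by -fsanitize=shift */
}

/* Fixed shape: all unsigned, well defined for every input. */
static uint64_t combine_unsigned(uint32_t lo, uint32_t hi)
{
    return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
    uint32_t lo = 0xf, hi = 0x8000e300;   /* the fuzzer's chosen inputs */

    printf("unsigned: %#llx\n", (unsigned long long)combine_unsigned(lo, hi));
    printf("signed:   %#llx\n",
           (unsigned long long)combine_signed(lo, (int32_t)hi));
    return 0;
}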
- -Fixes: 2942f45e09fb ("Enable compatibility mode operation for HYPERVISOR_sched_op and HYPERVISOR_set_timer_op.") -Reported-by: Tamas K Lengyel <tamas@tklengyel.com> -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: ae6d4fd876765e6d623eec67d14f5d0464be09cb -master date: 2024-02-01 19:52:44 +0000 ---- - xen/common/sched/compat.c | 4 ++-- - xen/include/hypercall-defs.c | 2 +- - 2 files changed, 3 insertions(+), 3 deletions(-) - -diff --git a/xen/common/sched/compat.c b/xen/common/sched/compat.c -index d718e450d4..dd97593630 100644 ---- a/xen/common/sched/compat.c -+++ b/xen/common/sched/compat.c -@@ -43,9 +43,9 @@ static int compat_poll(struct compat_sched_poll *compat) - - #include "core.c" - --int compat_set_timer_op(uint32_t lo, int32_t hi) -+int compat_set_timer_op(uint32_t lo, uint32_t hi) - { -- return do_set_timer_op(((s64)hi << 32) | lo); -+ return do_set_timer_op(((uint64_t)hi << 32) | lo); - } - - #endif /* __COMMON_SCHED_COMPAT_C__ */ -diff --git a/xen/include/hypercall-defs.c b/xen/include/hypercall-defs.c -index 6d361ddfce..47c093acc8 100644 ---- a/xen/include/hypercall-defs.c -+++ b/xen/include/hypercall-defs.c -@@ -134,7 +134,7 @@ xenoprof_op(int op, void *arg) - - #ifdef CONFIG_COMPAT - prefix: compat --set_timer_op(uint32_t lo, int32_t hi) -+set_timer_op(uint32_t lo, uint32_t hi) - multicall(multicall_entry_compat_t *call_list, uint32_t nr_calls) - memory_op(unsigned int cmd, void *arg) - #ifdef CONFIG_IOREQ_SERVER --- -2.44.0 - - -From b7f9168878155e2d29b9b4a3048b0a9a68ed82ed Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Mar 2024 11:50:16 +0100 -Subject: [PATCH 45/70] x86/spec: print the built-in SPECULATIVE_HARDEN_* - options -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Just like it's done for INDIRECT_THUNK and SHADOW_PAGING. - -Reported-by: Jan Beulich <jbeulich@suse.com> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 6e9507f7d51fe49df8bc70f83e49ce06c92e4e54 -master date: 2024-02-27 14:57:52 +0100 ---- - xen/arch/x86/spec_ctrl.c | 14 +++++++++++++- - 1 file changed, 13 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 01ba59cff7..04e508b622 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -476,13 +476,25 @@ static void __init print_details(enum ind_thunk thunk) - (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : ""); - - /* Compiled-in support which pertains to mitigations. 
*/ -- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) -+ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) || -+ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_ARRAY) || -+ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) || -+ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) ) - printk(" Compiled-in support:" - #ifdef CONFIG_INDIRECT_THUNK - " INDIRECT_THUNK" - #endif - #ifdef CONFIG_SHADOW_PAGING - " SHADOW_PAGING" -+#endif -+#ifdef CONFIG_SPECULATIVE_HARDEN_ARRAY -+ " HARDEN_ARRAY" -+#endif -+#ifdef CONFIG_SPECULATIVE_HARDEN_BRANCH -+ " HARDEN_BRANCH" -+#endif -+#ifdef CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS -+ " HARDEN_GUEST_ACCESS" - #endif - "\n"); - --- -2.44.0 - - -From 09b9db0413b1f31f27bece07b2bfa1723b89ace6 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Mar 2024 11:50:53 +0100 -Subject: [PATCH 46/70] x86/spec: fix INDIRECT_THUNK option to only be set when - build-enabled -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Attempt to provide a more helpful error message when the user attempts to set -spec-ctrl=bti-thunk option but the support is build-time disabled. - -While there also adjust the command line documentation to mention -CONFIG_INDIRECT_THUNK instead of INDIRECT_THUNK. - -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 8441fa806a3b778867867cd0159fa1722e90397e -master date: 2024-02-27 14:58:20 +0100 ---- - docs/misc/xen-command-line.pandoc | 10 +++++----- - xen/arch/x86/spec_ctrl.c | 7 ++++++- - 2 files changed, 11 insertions(+), 6 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 8e65f8bd18..582d6741d1 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2424,11 +2424,11 @@ guests to use. - performance reasons dom0 is unprotected by default. If it is necessary to - protect dom0 too, boot with `spec-ctrl=ibpb-entry`. - --If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to --select which of the thunks gets patched into the `__x86_indirect_thunk_%reg` --locations. The default thunk is `retpoline` (generally preferred), with the --alternatives being `jmp` (a `jmp *%reg` gadget, minimal overhead), and --`lfence` (an `lfence; jmp *%reg` gadget). -+If Xen was compiled with `CONFIG_INDIRECT_THUNK` support, `bti-thunk=` can be -+used to select which of the thunks gets patched into the -+`__x86_indirect_thunk_%reg` locations. The default thunk is `retpoline` -+(generally preferred), with the alternatives being `jmp` (a `jmp *%reg` gadget, -+minimal overhead), and `lfence` (an `lfence; jmp *%reg` gadget). - - On hardware supporting IBRS (Indirect Branch Restricted Speculation), the - `ibrs=` option can be used to force or prevent Xen using the feature itself. 
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 04e508b622..99ecfb3cba 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -241,7 +241,12 @@ static int __init cf_check parse_spec_ctrl(const char *s) - { - s += 10; - -- if ( !cmdline_strcmp(s, "retpoline") ) -+ if ( !IS_ENABLED(CONFIG_INDIRECT_THUNK) ) -+ { -+ no_config_param("INDIRECT_THUNK", "spec-ctrl", s - 10, ss); -+ rc = -EINVAL; -+ } -+ else if ( !cmdline_strcmp(s, "retpoline") ) - opt_thunk = THUNK_RETPOLINE; - else if ( !cmdline_strcmp(s, "lfence") ) - opt_thunk = THUNK_LFENCE; --- -2.44.0 - - -From 7404c25efdc70091817479b80dbbd945e6ab4861 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Mar 2024 11:51:56 +0100 -Subject: [PATCH 47/70] x86/spec: do not print thunk option selection if not - built-in -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Since the thunk built-in enable is printed as part of the "Compiled-in -support:" line, avoid printing anything in "Xen settings:" if the thunk is -disabled at build time. - -Note the BTI-Thunk option printing is also adjusted to print a colon in the -same way the other options on the line do. - -Requested-by: Jan Beulich <jbeulich@suse.com> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 576528a2a742069af203e90c613c5c93e23c9755 -master date: 2024-02-27 14:58:40 +0100 ---- - xen/arch/x86/spec_ctrl.c | 11 ++++++----- - 1 file changed, 6 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 99ecfb3cba..a965b6db28 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -504,11 +504,12 @@ static void __init print_details(enum ind_thunk thunk) - "\n"); - - /* Settings for Xen's protection, irrespective of guests. */ -- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", -- thunk == THUNK_NONE ? "N/A" : -- thunk == THUNK_RETPOLINE ? "RETPOLINE" : -- thunk == THUNK_LFENCE ? "LFENCE" : -- thunk == THUNK_JMP ? "JMP" : "?", -+ printk(" Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", -+ thunk != THUNK_NONE ? "BTI-Thunk: " : "", -+ thunk == THUNK_NONE ? "" : -+ thunk == THUNK_RETPOLINE ? "RETPOLINE, " : -+ thunk == THUNK_LFENCE ? "LFENCE, " : -+ thunk == THUNK_JMP ? "JMP, " : "?, ", - (!boot_cpu_has(X86_FEATURE_IBRSB) && - !boot_cpu_has(X86_FEATURE_IBRS)) ? "No" : - (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-", --- -2.44.0 - - -From 5382a6a79cb544f2eecc47330b531802f8c52977 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Mar 2024 11:52:57 +0100 -Subject: [PATCH 48/70] xen/livepatch: register livepatch regions when loaded -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Currently livepatch regions are registered as virtual regions only after the -livepatch has been applied. - -This can lead to issues when using the pre-apply or post-revert hooks, as at -that point the livepatch is not in the virtual regions list. If a livepatch -pre-apply hook contains a WARN() it would trigger an hypervisor crash, as the -code to handle the bug frame won't be able to find the instruction pointer that -triggered the #UD in any of the registered virtual regions, and hence crash. 
- -Fix this by adding the livepatch payloads as virtual regions as soon as loaded, -and only remove them once the payload is unloaded. This requires some changes -to the virtual regions code, as the removal of the virtual regions is no longer -done in stop machine context, and hence an RCU barrier is added in order to -make sure there are no users of the virtual region after it's been removed from -the list. - -Fixes: 8313c864fa95 ('livepatch: Implement pre-|post- apply|revert hooks') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com> -master commit: a57b4074ab39bee78b6c116277f0a9963bd8e687 -master date: 2024-02-28 16:57:25 +0000 ---- - xen/common/livepatch.c | 4 ++-- - xen/common/virtual_region.c | 44 ++++++++++++++----------------------- - 2 files changed, 19 insertions(+), 29 deletions(-) - -diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c -index e635606c10..e1964b841a 100644 ---- a/xen/common/livepatch.c -+++ b/xen/common/livepatch.c -@@ -1071,6 +1071,7 @@ static int build_symbol_table(struct payload *payload, - static void free_payload(struct payload *data) - { - ASSERT(spin_is_locked(&payload_lock)); -+ unregister_virtual_region(&data->region); - list_del(&data->list); - payload_cnt--; - payload_version++; -@@ -1170,6 +1171,7 @@ static int livepatch_upload(struct xen_sysctl_livepatch_upload *upload) - INIT_LIST_HEAD(&data->list); - INIT_LIST_HEAD(&data->applied_list); - -+ register_virtual_region(&data->region); - list_add_tail(&data->list, &payload_list); - payload_cnt++; - payload_version++; -@@ -1386,7 +1388,6 @@ static inline void apply_payload_tail(struct payload *data) - * The applied_list is iterated by the trap code. - */ - list_add_tail_rcu(&data->applied_list, &applied_list); -- register_virtual_region(&data->region); - - data->state = LIVEPATCH_STATE_APPLIED; - } -@@ -1432,7 +1433,6 @@ static inline void revert_payload_tail(struct payload *data) - * The applied_list is iterated by the trap code. - */ - list_del_rcu(&data->applied_list); -- unregister_virtual_region(&data->region); - - data->reverted = true; - data->state = LIVEPATCH_STATE_CHECKED; -diff --git a/xen/common/virtual_region.c b/xen/common/virtual_region.c -index 5f89703f51..9f12c30efe 100644 ---- a/xen/common/virtual_region.c -+++ b/xen/common/virtual_region.c -@@ -23,14 +23,8 @@ static struct virtual_region core_init __initdata = { - }; - - /* -- * RCU locking. Additions are done either at startup (when there is only -- * one CPU) or when all CPUs are running without IRQs. -- * -- * Deletions are bit tricky. We do it when Live Patch (all CPUs running -- * without IRQs) or during bootup (when clearing the init). -- * -- * Hence we use list_del_rcu (which sports an memory fence) and a spinlock -- * on deletion. -+ * RCU locking. Modifications to the list must be done in exclusive mode, and -+ * hence need to hold the spinlock. - * - * All readers of virtual_region_list MUST use list_for_each_entry_rcu. 
- */ -@@ -58,41 +52,36 @@ const struct virtual_region *find_text_region(unsigned long addr) - - void register_virtual_region(struct virtual_region *r) - { -- ASSERT(!local_irq_is_enabled()); -+ unsigned long flags; - -+ spin_lock_irqsave(&virtual_region_lock, flags); - list_add_tail_rcu(&r->list, &virtual_region_list); -+ spin_unlock_irqrestore(&virtual_region_lock, flags); - } - --static void remove_virtual_region(struct virtual_region *r) -+/* -+ * Suggest inline so when !CONFIG_LIVEPATCH the function is not left -+ * unreachable after init code is removed. -+ */ -+static void inline remove_virtual_region(struct virtual_region *r) - { - unsigned long flags; - - spin_lock_irqsave(&virtual_region_lock, flags); - list_del_rcu(&r->list); - spin_unlock_irqrestore(&virtual_region_lock, flags); -- /* -- * We do not need to invoke call_rcu. -- * -- * This is due to the fact that on the deletion we have made sure -- * to use spinlocks (to guard against somebody else calling -- * unregister_virtual_region) and list_deletion spiced with -- * memory barrier. -- * -- * That protects us from corrupting the list as the readers all -- * use list_for_each_entry_rcu which is safe against concurrent -- * deletions. -- */ - } - -+#ifdef CONFIG_LIVEPATCH - void unregister_virtual_region(struct virtual_region *r) - { -- /* Expected to be called from Live Patch - which has IRQs disabled. */ -- ASSERT(!local_irq_is_enabled()); -- - remove_virtual_region(r); -+ -+ /* Assert that no CPU might be using the removed region. */ -+ rcu_barrier(); - } - --#if defined(CONFIG_LIVEPATCH) && defined(CONFIG_X86) -+#ifdef CONFIG_X86 - void relax_virtual_region_perms(void) - { - const struct virtual_region *region; -@@ -116,7 +105,8 @@ void tighten_virtual_region_perms(void) - PAGE_HYPERVISOR_RX); - rcu_read_unlock(&rcu_virtual_region_lock); - } --#endif -+#endif /* CONFIG_X86 */ -+#endif /* CONFIG_LIVEPATCH */ - - void __init unregister_init_virtual_region(void) - { --- -2.44.0 - - -From 50a8f74df76b7ce7c35ad97a539f505eb0a9baa6 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Mar 2024 11:53:05 +0100 -Subject: [PATCH 49/70] xen/livepatch: search for symbols in all loaded - payloads -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -When checking if an address belongs to a patch, or when resolving a symbol, -take into account all loaded livepatch payloads, even if not applied. - -This is required in order for the pre-apply and post-revert hooks to work -properly, or else Xen won't detect the instruction pointer belonging to those -hooks as being part of the currently active text. - -Move the RCU handling to be used for payload_list instead of applied_list, as -now the calls from trap code will iterate over the payload_list. - -Fixes: 8313c864fa95 ('livepatch: Implement pre-|post- apply|revert hooks') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com> -master commit: d2daa40fb3ddb8f83e238e57854bd878924cde90 -master date: 2024-02-28 16:57:25 +0000 ---- - xen/common/livepatch.c | 49 +++++++++++++++--------------------------- - 1 file changed, 17 insertions(+), 32 deletions(-) - -diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c -index e1964b841a..135c47e9b8 100644 ---- a/xen/common/livepatch.c -+++ b/xen/common/livepatch.c -@@ -36,13 +36,14 @@ - * caller in schedule_work. 
- */ - static DEFINE_SPINLOCK(payload_lock); --static LIST_HEAD(payload_list); -- - /* -- * Patches which have been applied. Need RCU in case we crash (and then -- * traps code would iterate via applied_list) when adding entries on the list. -+ * Need RCU in case we crash (and then traps code would iterate via -+ * payload_list) when adding entries on the list. - */ --static DEFINE_RCU_READ_LOCK(rcu_applied_lock); -+static DEFINE_RCU_READ_LOCK(rcu_payload_lock); -+static LIST_HEAD(payload_list); -+ -+/* Patches which have been applied. Only modified from stop machine context. */ - static LIST_HEAD(applied_list); - - static unsigned int payload_cnt; -@@ -111,12 +112,8 @@ bool_t is_patch(const void *ptr) - const struct payload *data; - bool_t r = 0; - -- /* -- * Only RCU locking since this list is only ever changed during apply -- * or revert context. And in case it dies there we need an safe list. -- */ -- rcu_read_lock(&rcu_applied_lock); -- list_for_each_entry_rcu ( data, &applied_list, applied_list ) -+ rcu_read_lock(&rcu_payload_lock); -+ list_for_each_entry_rcu ( data, &payload_list, list ) - { - if ( (ptr >= data->rw_addr && - ptr < (data->rw_addr + data->rw_size)) || -@@ -130,7 +127,7 @@ bool_t is_patch(const void *ptr) - } - - } -- rcu_read_unlock(&rcu_applied_lock); -+ rcu_read_unlock(&rcu_payload_lock); - - return r; - } -@@ -166,12 +163,8 @@ static const char *cf_check livepatch_symbols_lookup( - const void *va = (const void *)addr; - const char *n = NULL; - -- /* -- * Only RCU locking since this list is only ever changed during apply -- * or revert context. And in case it dies there we need an safe list. -- */ -- rcu_read_lock(&rcu_applied_lock); -- list_for_each_entry_rcu ( data, &applied_list, applied_list ) -+ rcu_read_lock(&rcu_payload_lock); -+ list_for_each_entry_rcu ( data, &payload_list, list ) - { - if ( va < data->text_addr || - va >= (data->text_addr + data->text_size) ) -@@ -200,7 +193,7 @@ static const char *cf_check livepatch_symbols_lookup( - n = data->symtab[best].name; - break; - } -- rcu_read_unlock(&rcu_applied_lock); -+ rcu_read_unlock(&rcu_payload_lock); - - return n; - } -@@ -1072,7 +1065,8 @@ static void free_payload(struct payload *data) - { - ASSERT(spin_is_locked(&payload_lock)); - unregister_virtual_region(&data->region); -- list_del(&data->list); -+ list_del_rcu(&data->list); -+ rcu_barrier(); - payload_cnt--; - payload_version++; - free_payload_data(data); -@@ -1172,7 +1166,7 @@ static int livepatch_upload(struct xen_sysctl_livepatch_upload *upload) - INIT_LIST_HEAD(&data->applied_list); - - register_virtual_region(&data->region); -- list_add_tail(&data->list, &payload_list); -+ list_add_tail_rcu(&data->list, &payload_list); - payload_cnt++; - payload_version++; - } -@@ -1383,11 +1377,7 @@ static int apply_payload(struct payload *data) - - static inline void apply_payload_tail(struct payload *data) - { -- /* -- * We need RCU variant (which has barriers) in case we crash here. -- * The applied_list is iterated by the trap code. -- */ -- list_add_tail_rcu(&data->applied_list, &applied_list); -+ list_add_tail(&data->applied_list, &applied_list); - - data->state = LIVEPATCH_STATE_APPLIED; - } -@@ -1427,12 +1417,7 @@ static int revert_payload(struct payload *data) - - static inline void revert_payload_tail(struct payload *data) - { -- -- /* -- * We need RCU variant (which has barriers) in case we crash here. -- * The applied_list is iterated by the trap code. 
-- */ -- list_del_rcu(&data->applied_list); -+ list_del(&data->applied_list); - - data->reverted = true; - data->state = LIVEPATCH_STATE_CHECKED; --- -2.44.0 - - -From d81bfc7ff887426727504086fa363f91bf8c19f8 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Mar 2024 11:53:13 +0100 -Subject: [PATCH 50/70] xen/livepatch: fix norevert test attempt to open-code - revert -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The purpose of the norevert test is to install a dummy handler that replaces -the internal Xen revert code, and then perform the revert in the post-revert -hook. For that purpose the usage of the previous common_livepatch_revert() is -not enough, as that just reverts specific functions, but not the whole state of -the payload. - -Remove both common_livepatch_{apply,revert}() and instead expose -revert_payload{,_tail}() in order to perform the patch revert from the -post-revert hook. - -Fixes: 6047104c3ccc ('livepatch: Add per-function applied/reverted state tracking marker') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com> -master commit: cdae267ce10d04d71d1687b5701ff2911a96b6dc -master date: 2024-02-28 16:57:25 +0000 ---- - xen/common/livepatch.c | 41 +++++++++++++++++-- - xen/include/xen/livepatch.h | 32 ++------------- - .../livepatch/xen_action_hooks_norevert.c | 22 +++------- - 3 files changed, 46 insertions(+), 49 deletions(-) - -diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c -index 135c47e9b8..0cc048fd83 100644 ---- a/xen/common/livepatch.c -+++ b/xen/common/livepatch.c -@@ -1366,7 +1366,22 @@ static int apply_payload(struct payload *data) - ASSERT(!local_irq_is_enabled()); - - for ( i = 0; i < data->nfuncs; i++ ) -- common_livepatch_apply(&data->funcs[i], &data->fstate[i]); -+ { -+ const struct livepatch_func *func = &data->funcs[i]; -+ struct livepatch_fstate *state = &data->fstate[i]; -+ -+ /* If the action has been already executed on this function, do nothing. */ -+ if ( state->applied == LIVEPATCH_FUNC_APPLIED ) -+ { -+ printk(XENLOG_WARNING LIVEPATCH -+ "%s: %s has been already applied before\n", -+ __func__, func->name); -+ continue; -+ } -+ -+ arch_livepatch_apply(func, state); -+ state->applied = LIVEPATCH_FUNC_APPLIED; -+ } - - arch_livepatch_revive(); - -@@ -1382,7 +1397,7 @@ static inline void apply_payload_tail(struct payload *data) - data->state = LIVEPATCH_STATE_APPLIED; - } - --static int revert_payload(struct payload *data) -+int revert_payload(struct payload *data) - { - unsigned int i; - int rc; -@@ -1397,7 +1412,25 @@ static int revert_payload(struct payload *data) - } - - for ( i = 0; i < data->nfuncs; i++ ) -- common_livepatch_revert(&data->funcs[i], &data->fstate[i]); -+ { -+ const struct livepatch_func *func = &data->funcs[i]; -+ struct livepatch_fstate *state = &data->fstate[i]; -+ -+ /* -+ * If the apply action hasn't been executed on this function, do -+ * nothing. 
-+ */ -+ if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED ) -+ { -+ printk(XENLOG_WARNING LIVEPATCH -+ "%s: %s has not been applied before\n", -+ __func__, func->name); -+ continue; -+ } -+ -+ arch_livepatch_revert(func, state); -+ state->applied = LIVEPATCH_FUNC_NOT_APPLIED; -+ } - - /* - * Since we are running with IRQs disabled and the hooks may call common -@@ -1415,7 +1448,7 @@ static int revert_payload(struct payload *data) - return 0; - } - --static inline void revert_payload_tail(struct payload *data) -+void revert_payload_tail(struct payload *data) - { - list_del(&data->applied_list); - -diff --git a/xen/include/xen/livepatch.h b/xen/include/xen/livepatch.h -index 537d3d58b6..c9ee58fd37 100644 ---- a/xen/include/xen/livepatch.h -+++ b/xen/include/xen/livepatch.h -@@ -136,35 +136,11 @@ void arch_livepatch_post_action(void); - void arch_livepatch_mask(void); - void arch_livepatch_unmask(void); - --static inline void common_livepatch_apply(const struct livepatch_func *func, -- struct livepatch_fstate *state) --{ -- /* If the action has been already executed on this function, do nothing. */ -- if ( state->applied == LIVEPATCH_FUNC_APPLIED ) -- { -- printk(XENLOG_WARNING LIVEPATCH "%s: %s has been already applied before\n", -- __func__, func->name); -- return; -- } -- -- arch_livepatch_apply(func, state); -- state->applied = LIVEPATCH_FUNC_APPLIED; --} -+/* Only for testing purposes. */ -+struct payload; -+int revert_payload(struct payload *data); -+void revert_payload_tail(struct payload *data); - --static inline void common_livepatch_revert(const struct livepatch_func *func, -- struct livepatch_fstate *state) --{ -- /* If the apply action hasn't been executed on this function, do nothing. */ -- if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED ) -- { -- printk(XENLOG_WARNING LIVEPATCH "%s: %s has not been applied before\n", -- __func__, func->name); -- return; -- } -- -- arch_livepatch_revert(func, state); -- state->applied = LIVEPATCH_FUNC_NOT_APPLIED; --} - #else - - /* -diff --git a/xen/test/livepatch/xen_action_hooks_norevert.c b/xen/test/livepatch/xen_action_hooks_norevert.c -index c173855192..c5fbab1746 100644 ---- a/xen/test/livepatch/xen_action_hooks_norevert.c -+++ b/xen/test/livepatch/xen_action_hooks_norevert.c -@@ -96,26 +96,14 @@ static int revert_hook(livepatch_payload_t *payload) - - static void post_revert_hook(livepatch_payload_t *payload) - { -- int i; -+ unsigned long flags; - - printk(KERN_DEBUG "%s: Hook starting.\n", __func__); - -- for (i = 0; i < payload->nfuncs; i++) -- { -- const struct livepatch_func *func = &payload->funcs[i]; -- struct livepatch_fstate *fstate = &payload->fstate[i]; -- -- BUG_ON(revert_cnt != 1); -- BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); -- -- /* Outside of quiesce zone: MAY TRIGGER HOST CRASH/UNDEFINED BEHAVIOR */ -- arch_livepatch_quiesce(); -- common_livepatch_revert(payload); -- arch_livepatch_revive(); -- BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); -- -- printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name); -- } -+ local_irq_save(flags); -+ BUG_ON(revert_payload(payload)); -+ revert_payload_tail(payload); -+ local_irq_restore(flags); - - printk(KERN_DEBUG "%s: Hook done.\n", __func__); - } --- -2.44.0 - - -From e9516b73e7d499684092c1d345818585403cf190 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Mar 2024 11:53:22 +0100 -Subject: [PATCH 51/70] xen/livepatch: properly build the noapply and norevert - 
tests
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-It seems the build variables for those tests were copy-pasted from
-xen_action_hooks_marker-objs and not adjusted to use the correct source files.
-
-Fixes: 6047104c3ccc ('livepatch: Add per-function applied/reverted state tracking marker')
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com>
-master commit: e579677095782c7dec792597ba8b037b7d716b32
-master date: 2024-02-28 16:57:25 +0000
----
- xen/test/livepatch/Makefile | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/xen/test/livepatch/Makefile b/xen/test/livepatch/Makefile
-index c258ab0b59..d987a8367f 100644
---- a/xen/test/livepatch/Makefile
-+++ b/xen/test/livepatch/Makefile
-@@ -118,12 +118,12 @@ xen_action_hooks_marker-objs := xen_action_hooks_marker.o xen_hello_world_func.o
- $(obj)/xen_action_hooks_noapply.o: $(obj)/config.h
- 
- extra-y += xen_action_hooks_noapply.livepatch
--xen_action_hooks_noapply-objs := xen_action_hooks_marker.o xen_hello_world_func.o note.o xen_note.o
-+xen_action_hooks_noapply-objs := xen_action_hooks_noapply.o xen_hello_world_func.o note.o xen_note.o
- 
- $(obj)/xen_action_hooks_norevert.o: $(obj)/config.h
- 
- extra-y += xen_action_hooks_norevert.livepatch
--xen_action_hooks_norevert-objs := xen_action_hooks_marker.o xen_hello_world_func.o note.o xen_note.o
-+xen_action_hooks_norevert-objs := xen_action_hooks_norevert.o xen_hello_world_func.o note.o xen_note.o
- 
- EXPECT_BYTES_COUNT := 8
- CODE_GET_EXPECT=$(shell $(OBJDUMP) -d --insn-width=1 $(1) | sed -n -e '/<'$(2)'>:$$/,/^$$/ p' | tail -n +2 | head -n $(EXPECT_BYTES_COUNT) | awk '{$$0=$$2; printf "%s", substr($$0,length-1)}' | sed 's/.\{2\}/0x&,/g' | sed 's/^/{/;s/,$$/}/g')
---
-2.44.0
-
-
-From 267845a8389d5d34edb2b38a1972f32f51f70b4e Mon Sep 17 00:00:00 2001
-From: Jason Andryuk <jandryuk@gmail.com>
-Date: Tue, 5 Mar 2024 11:54:12 +0100
-Subject: [PATCH 52/70] libxl: Fix segfault in device_model_spawn_outcome
-
-libxl__spawn_qdisk_backend() explicitly sets guest_config to NULL when
-starting QEMU (the usual launch through libxl__spawn_local_dm() has a
-guest_config though).
-
-Bail early on a NULL guest_config/d_config. This skips the QMP queries
-for chardevs and VNC, but this xenpv QEMU instance isn't expected to
-provide those - only qdisk (or 9pfs backends after an upcoming change).
-
-Signed-off-by: Jason Andryuk <jandryuk@gmail.com>
-Acked-by: Anthony PERARD <anthony.perard@citrix.com>
-master commit: d4f3d35f043f6ef29393166b0dd131c8102cf255
-master date: 2024-02-29 08:18:38 +0100
----
- tools/libs/light/libxl_dm.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c
-index ed620a9d8e..29b43ed20a 100644
---- a/tools/libs/light/libxl_dm.c
-+++ b/tools/libs/light/libxl_dm.c
-@@ -3172,8 +3172,8 @@ static void device_model_spawn_outcome(libxl__egc *egc,
- 
- /* Check if spawn failed */
- if (rc) goto out;
--
-- if (d_config->b_info.device_model_version
-+ /* d_config is NULL for xl devd/libxl__spawn_qemu_xenpv_backend(). 
*/ -+ if (d_config && d_config->b_info.device_model_version - == LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN) { - rc = libxl__ev_time_register_rel(ao, &dmss->timeout, - devise_model_postconfig_timeout, --- -2.44.0 - - -From 75221fb0f87e4d7278b0a540bc28a6d0b74afeba Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Mar 2024 11:54:33 +0100 -Subject: [PATCH 53/70] x86/altcall: always use a temporary parameter stashing - variable -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The usage in ALT_CALL_ARG() on clang of: - -register union { - typeof(arg) e; - const unsigned long r; -} ... - -When `arg` is the first argument to alternative_{,v}call() and -const_vlapic_vcpu() is used results in clang 3.5.0 complaining with: - -arch/x86/hvm/vlapic.c:141:47: error: non-const static data member must be initialized out of line - alternative_call(hvm_funcs.test_pir, const_vlapic_vcpu(vlapic), vec) ) - -Workaround this by pulling `arg1` into a local variable, like it's done for -further arguments (arg2, arg3...) - -Originally arg1 wasn't pulled into a variable because for the a1_ register -local variable the possible clobbering as a result of operators on other -variables don't matter: - -https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables - -Note clang version 3.8.1 seems to already be fixed and don't require the -workaround, but since it's harmless do it uniformly everywhere. - -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> -Fixes: 2ce562b2a413 ('x86/altcall: use a union as register type for function parameters on clang') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -master commit: c20850540ad6a32f4fc17bde9b01c92b0df18bf0 -master date: 2024-02-29 08:21:49 +0100 ---- - xen/arch/x86/include/asm/alternative.h | 36 +++++++++++++++++--------- - 1 file changed, 24 insertions(+), 12 deletions(-) - -diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h -index 3c14db5078..0d3697f1de 100644 ---- a/xen/arch/x86/include/asm/alternative.h -+++ b/xen/arch/x86/include/asm/alternative.h -@@ -253,21 +253,24 @@ extern void alternative_branches(void); - }) - - #define alternative_vcall1(func, arg) ({ \ -- ALT_CALL_ARG(arg, 1); \ -+ typeof(arg) v1_ = (arg); \ -+ ALT_CALL_ARG(v1_, 1); \ - ALT_CALL_NO_ARG2; \ - (void)sizeof(func(arg)); \ - (void)alternative_callN(1, int, func); \ - }) - - #define alternative_call1(func, arg) ({ \ -- ALT_CALL_ARG(arg, 1); \ -+ typeof(arg) v1_ = (arg); \ -+ ALT_CALL_ARG(v1_, 1); \ - ALT_CALL_NO_ARG2; \ - alternative_callN(1, typeof(func(arg)), func); \ - }) - - #define alternative_vcall2(func, arg1, arg2) ({ \ -+ typeof(arg1) v1_ = (arg1); \ - typeof(arg2) v2_ = (arg2); \ -- ALT_CALL_ARG(arg1, 1); \ -+ ALT_CALL_ARG(v1_, 1); \ - ALT_CALL_ARG(v2_, 2); \ - ALT_CALL_NO_ARG3; \ - (void)sizeof(func(arg1, arg2)); \ -@@ -275,17 +278,19 @@ extern void alternative_branches(void); - }) - - #define alternative_call2(func, arg1, arg2) ({ \ -+ typeof(arg1) v1_ = (arg1); \ - typeof(arg2) v2_ = (arg2); \ -- ALT_CALL_ARG(arg1, 1); \ -+ ALT_CALL_ARG(v1_, 1); \ - ALT_CALL_ARG(v2_, 2); \ - ALT_CALL_NO_ARG3; \ - alternative_callN(2, typeof(func(arg1, arg2)), func); \ - }) - - #define alternative_vcall3(func, arg1, arg2, arg3) ({ \ -+ typeof(arg1) v1_ = (arg1); \ - typeof(arg2) v2_ = (arg2); \ - typeof(arg3) v3_ = (arg3); \ -- ALT_CALL_ARG(arg1, 1); \ -+ ALT_CALL_ARG(v1_, 1); \ - 
ALT_CALL_ARG(v2_, 2); \ - ALT_CALL_ARG(v3_, 3); \ - ALT_CALL_NO_ARG4; \ -@@ -294,9 +299,10 @@ extern void alternative_branches(void); - }) - - #define alternative_call3(func, arg1, arg2, arg3) ({ \ -+ typeof(arg1) v1_ = (arg1); \ - typeof(arg2) v2_ = (arg2); \ - typeof(arg3) v3_ = (arg3); \ -- ALT_CALL_ARG(arg1, 1); \ -+ ALT_CALL_ARG(v1_, 1); \ - ALT_CALL_ARG(v2_, 2); \ - ALT_CALL_ARG(v3_, 3); \ - ALT_CALL_NO_ARG4; \ -@@ -305,10 +311,11 @@ extern void alternative_branches(void); - }) - - #define alternative_vcall4(func, arg1, arg2, arg3, arg4) ({ \ -+ typeof(arg1) v1_ = (arg1); \ - typeof(arg2) v2_ = (arg2); \ - typeof(arg3) v3_ = (arg3); \ - typeof(arg4) v4_ = (arg4); \ -- ALT_CALL_ARG(arg1, 1); \ -+ ALT_CALL_ARG(v1_, 1); \ - ALT_CALL_ARG(v2_, 2); \ - ALT_CALL_ARG(v3_, 3); \ - ALT_CALL_ARG(v4_, 4); \ -@@ -318,10 +325,11 @@ extern void alternative_branches(void); - }) - - #define alternative_call4(func, arg1, arg2, arg3, arg4) ({ \ -+ typeof(arg1) v1_ = (arg1); \ - typeof(arg2) v2_ = (arg2); \ - typeof(arg3) v3_ = (arg3); \ - typeof(arg4) v4_ = (arg4); \ -- ALT_CALL_ARG(arg1, 1); \ -+ ALT_CALL_ARG(v1_, 1); \ - ALT_CALL_ARG(v2_, 2); \ - ALT_CALL_ARG(v3_, 3); \ - ALT_CALL_ARG(v4_, 4); \ -@@ -332,11 +340,12 @@ extern void alternative_branches(void); - }) - - #define alternative_vcall5(func, arg1, arg2, arg3, arg4, arg5) ({ \ -+ typeof(arg1) v1_ = (arg1); \ - typeof(arg2) v2_ = (arg2); \ - typeof(arg3) v3_ = (arg3); \ - typeof(arg4) v4_ = (arg4); \ - typeof(arg5) v5_ = (arg5); \ -- ALT_CALL_ARG(arg1, 1); \ -+ ALT_CALL_ARG(v1_, 1); \ - ALT_CALL_ARG(v2_, 2); \ - ALT_CALL_ARG(v3_, 3); \ - ALT_CALL_ARG(v4_, 4); \ -@@ -347,11 +356,12 @@ extern void alternative_branches(void); - }) - - #define alternative_call5(func, arg1, arg2, arg3, arg4, arg5) ({ \ -+ typeof(arg1) v1_ = (arg1); \ - typeof(arg2) v2_ = (arg2); \ - typeof(arg3) v3_ = (arg3); \ - typeof(arg4) v4_ = (arg4); \ - typeof(arg5) v5_ = (arg5); \ -- ALT_CALL_ARG(arg1, 1); \ -+ ALT_CALL_ARG(v1_, 1); \ - ALT_CALL_ARG(v2_, 2); \ - ALT_CALL_ARG(v3_, 3); \ - ALT_CALL_ARG(v4_, 4); \ -@@ -363,12 +373,13 @@ extern void alternative_branches(void); - }) - - #define alternative_vcall6(func, arg1, arg2, arg3, arg4, arg5, arg6) ({ \ -+ typeof(arg1) v1_ = (arg1); \ - typeof(arg2) v2_ = (arg2); \ - typeof(arg3) v3_ = (arg3); \ - typeof(arg4) v4_ = (arg4); \ - typeof(arg5) v5_ = (arg5); \ - typeof(arg6) v6_ = (arg6); \ -- ALT_CALL_ARG(arg1, 1); \ -+ ALT_CALL_ARG(v1_, 1); \ - ALT_CALL_ARG(v2_, 2); \ - ALT_CALL_ARG(v3_, 3); \ - ALT_CALL_ARG(v4_, 4); \ -@@ -379,12 +390,13 @@ extern void alternative_branches(void); - }) - - #define alternative_call6(func, arg1, arg2, arg3, arg4, arg5, arg6) ({ \ -+ typeof(arg1) v1_ = (arg1); \ - typeof(arg2) v2_ = (arg2); \ - typeof(arg3) v3_ = (arg3); \ - typeof(arg4) v4_ = (arg4); \ - typeof(arg5) v5_ = (arg5); \ - typeof(arg6) v6_ = (arg6); \ -- ALT_CALL_ARG(arg1, 1); \ -+ ALT_CALL_ARG(v1_, 1); \ - ALT_CALL_ARG(v2_, 2); \ - ALT_CALL_ARG(v3_, 3); \ - ALT_CALL_ARG(v4_, 4); \ --- -2.44.0 - - -From fd7cb7a1d0433049d8fc59444d0e91b71728763e Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 5 Mar 2024 11:55:17 +0100 -Subject: [PATCH 54/70] x86/cpu-policy: Allow for levelling of VERW side - effects -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -MD_CLEAR and FB_CLEAR need OR-ing across a migrate pool. Allow this, by -having them unconditinally set in max, with the host values reflected in -default. Annotate the bits as having special properies. 
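The max/default split works out as below, in a standalone sketch with invented names; Xen's real logic lives in guest_common_{max,default}_feature_adjustments(), shown in the hunks that follow:

    #include <stdbool.h>
    #include <stdio.h>

    struct feats { bool md_clear, fb_clear; };

    int main(void)
    {
        const struct feats host = { .md_clear = false, .fb_clear = true };
        struct feats max, def;

        /*
         * Max: claim both unconditionally. VERW exists on all hardware, so
         * advertising its scrubbing side effects where they do nothing is
         * harmless, and keeps migration to affected hosts possible.
         */
        max.md_clear = max.fb_clear = true;

        /* Default: levelled back to what this host actually provides. */
        def = host;

        printf("max: md=%d fb=%d; default: md=%d fb=%d\n",
               max.md_clear, max.fb_clear, def.md_clear, def.fb_clear);
        return 0;
    }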
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: de17162cafd27f2865a3102a2ec0f386a02ed03d -master date: 2024-03-01 20:14:19 +0000 ---- - xen/arch/x86/cpu-policy.c | 24 +++++++++++++++++++++ - xen/arch/x86/include/asm/cpufeature.h | 1 + - xen/include/public/arch-x86/cpufeatureset.h | 4 ++-- - 3 files changed, 27 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c -index bcb17b7ce3..c7c5e99b7b 100644 ---- a/xen/arch/x86/cpu-policy.c -+++ b/xen/arch/x86/cpu-policy.c -@@ -442,6 +442,16 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs) - __set_bit(X86_FEATURE_RSBA, fs); - __set_bit(X86_FEATURE_RRSBA, fs); - -+ /* -+ * These bits indicate that the VERW instruction may have gained -+ * scrubbing side effects. With pooling, they mean "you might migrate -+ * somewhere where scrubbing is necessary", and may need exposing on -+ * unaffected hardware. This is fine, because the VERW instruction -+ * has been around since the 286. -+ */ -+ __set_bit(X86_FEATURE_MD_CLEAR, fs); -+ __set_bit(X86_FEATURE_FB_CLEAR, fs); -+ - /* - * The Gather Data Sampling microcode mitigation (August 2023) has an - * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. -@@ -476,6 +486,20 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs) - cpu_has_rdrand && !is_forced_cpu_cap(X86_FEATURE_RDRAND) ) - __clear_bit(X86_FEATURE_RDRAND, fs); - -+ /* -+ * These bits indicate that the VERW instruction may have gained -+ * scrubbing side effects. The max policy has them set for migration -+ * reasons, so reset the default policy back to the host values in -+ * case we're unaffected. -+ */ -+ __clear_bit(X86_FEATURE_MD_CLEAR, fs); -+ if ( cpu_has_md_clear ) -+ __set_bit(X86_FEATURE_MD_CLEAR, fs); -+ -+ __clear_bit(X86_FEATURE_FB_CLEAR, fs); -+ if ( cpu_has_fb_clear ) -+ __set_bit(X86_FEATURE_FB_CLEAR, fs); -+ - /* - * The Gather Data Sampling microcode mitigation (August 2023) has an - * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. -diff --git a/xen/arch/x86/include/asm/cpufeature.h b/xen/arch/x86/include/asm/cpufeature.h -index 06e1dd7f33..76ef2aeb1d 100644 ---- a/xen/arch/x86/include/asm/cpufeature.h -+++ b/xen/arch/x86/include/asm/cpufeature.h -@@ -177,6 +177,7 @@ static inline bool boot_cpu_has(unsigned int feat) - #define cpu_has_avx512_4fmaps boot_cpu_has(X86_FEATURE_AVX512_4FMAPS) - #define cpu_has_avx512_vp2intersect boot_cpu_has(X86_FEATURE_AVX512_VP2INTERSECT) - #define cpu_has_srbds_ctrl boot_cpu_has(X86_FEATURE_SRBDS_CTRL) -+#define cpu_has_md_clear boot_cpu_has(X86_FEATURE_MD_CLEAR) - #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) - #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) - #define cpu_has_serialize boot_cpu_has(X86_FEATURE_SERIALIZE) -diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h -index 6b6ce2745c..337aaa9c77 100644 ---- a/xen/include/public/arch-x86/cpufeatureset.h -+++ b/xen/include/public/arch-x86/cpufeatureset.h -@@ -262,7 +262,7 @@ XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single - XEN_CPUFEATURE(FSRM, 9*32+ 4) /*A Fast Short REP MOVS */ - XEN_CPUFEATURE(AVX512_VP2INTERSECT, 9*32+8) /*a VP2INTERSECT{D,Q} insns */ - XEN_CPUFEATURE(SRBDS_CTRL, 9*32+ 9) /* MSR_MCU_OPT_CTRL and RNGDS_MITG_DIS. 
*/ --XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*A VERW clears microarchitectural buffers */ -+XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*!A VERW clears microarchitectural buffers */ - XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. */ - XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */ - XEN_CPUFEATURE(SERIALIZE, 9*32+14) /*A SERIALIZE insn */ -@@ -329,7 +329,7 @@ XEN_CPUFEATURE(DOITM, 16*32+12) /* Data Operand Invariant Timing - XEN_CPUFEATURE(SBDR_SSDP_NO, 16*32+13) /*A No Shared Buffer Data Read or Sideband Stale Data Propagation */ - XEN_CPUFEATURE(FBSDP_NO, 16*32+14) /*A No Fill Buffer Stale Data Propagation */ - XEN_CPUFEATURE(PSDP_NO, 16*32+15) /*A No Primary Stale Data Propagation */ --XEN_CPUFEATURE(FB_CLEAR, 16*32+17) /*A Fill Buffers cleared by VERW */ -+XEN_CPUFEATURE(FB_CLEAR, 16*32+17) /*!A Fill Buffers cleared by VERW */ - XEN_CPUFEATURE(FB_CLEAR_CTRL, 16*32+18) /* MSR_OPT_CPU_CTRL.FB_CLEAR_DIS */ - XEN_CPUFEATURE(RRSBA, 16*32+19) /*! Restricted RSB Alternative */ - XEN_CPUFEATURE(BHI_NO, 16*32+20) /*A No Branch History Injection */ --- -2.44.0 - - -From 4c84fa6cb66fe66f2c5dad65208c497558ab7d17 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 12 Mar 2024 12:06:57 +0100 -Subject: [PATCH 55/70] hvmloader/PCI: skip huge BARs in certain calculations -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -BARs of size 2Gb and up can't possibly fit below 4Gb: Both the bottom of -the lower 2Gb range and the top of the higher 2Gb range have special -purpose. Don't even have them influence whether to (perhaps) relocate -low RAM. - -Reported-by: Neowutran <xen@neowutran.ovh> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: 57acad12a09ffa490e870ebe17596aad858f0191 -master date: 2024-03-06 10:19:29 +0100 ---- - tools/firmware/hvmloader/pci.c | 28 ++++++++++++++++++++-------- - 1 file changed, 20 insertions(+), 8 deletions(-) - -diff --git a/tools/firmware/hvmloader/pci.c b/tools/firmware/hvmloader/pci.c -index 257a6feb61..c3c61ca060 100644 ---- a/tools/firmware/hvmloader/pci.c -+++ b/tools/firmware/hvmloader/pci.c -@@ -33,6 +33,13 @@ uint32_t pci_mem_start = HVM_BELOW_4G_MMIO_START; - const uint32_t pci_mem_end = RESERVED_MEMBASE; - uint64_t pci_hi_mem_start = 0, pci_hi_mem_end = 0; - -+/* -+ * BARs larger than this value are put in 64-bit space unconditionally. That -+ * is, such BARs also don't play into the determination of how big the lowmem -+ * MMIO hole needs to be. 
-+ */ -+#define BAR_RELOC_THRESH GB(1) -+ - enum virtual_vga virtual_vga = VGA_none; - unsigned long igd_opregion_pgbase = 0; - -@@ -286,9 +293,11 @@ void pci_setup(void) - bars[i].bar_reg = bar_reg; - bars[i].bar_sz = bar_sz; - -- if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) == -- PCI_BASE_ADDRESS_SPACE_MEMORY) || -- (bar_reg == PCI_ROM_ADDRESS) ) -+ if ( is_64bar && bar_sz > BAR_RELOC_THRESH ) -+ bar64_relocate = 1; -+ else if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) == -+ PCI_BASE_ADDRESS_SPACE_MEMORY) || -+ (bar_reg == PCI_ROM_ADDRESS) ) - mmio_total += bar_sz; - - nr_bars++; -@@ -367,7 +376,7 @@ void pci_setup(void) - pci_mem_start = hvm_info->low_mem_pgend << PAGE_SHIFT; - } - -- if ( mmio_total > (pci_mem_end - pci_mem_start) ) -+ if ( mmio_total > (pci_mem_end - pci_mem_start) || bar64_relocate ) - { - printf("Low MMIO hole not large enough for all devices," - " relocating some BARs to 64-bit\n"); -@@ -430,7 +439,8 @@ void pci_setup(void) - - /* - * Relocate to high memory if the total amount of MMIO needed -- * is more than the low MMIO available. Because devices are -+ * is more than the low MMIO available or BARs bigger than -+ * BAR_RELOC_THRESH are present. Because devices are - * processed in order of bar_sz, this will preferentially - * relocate larger devices to high memory first. - * -@@ -446,8 +456,9 @@ void pci_setup(void) - * the code here assumes it to be.) - * Should either of those two conditions change, this code will break. - */ -- using_64bar = bars[i].is_64bar && bar64_relocate -- && (mmio_total > (mem_resource.max - mem_resource.base)); -+ using_64bar = bars[i].is_64bar && bar64_relocate && -+ (mmio_total > (mem_resource.max - mem_resource.base) || -+ bar_sz > BAR_RELOC_THRESH); - bar_data = pci_readl(devfn, bar_reg); - - if ( (bar_data & PCI_BASE_ADDRESS_SPACE) == -@@ -467,7 +478,8 @@ void pci_setup(void) - resource = &mem_resource; - bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK; - } -- mmio_total -= bar_sz; -+ if ( bar_sz <= BAR_RELOC_THRESH ) -+ mmio_total -= bar_sz; - } - else - { --- -2.44.0 - - -From a96d2d4355d85fc82abd0a3799978db04ee8cff3 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 12 Mar 2024 12:07:07 +0100 -Subject: [PATCH 56/70] x86/mm: fix detection of last L1 entry in - modify_xen_mappings_lite() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The current logic to detect when to switch to the next L1 table is incorrectly -using l2_table_offset() in order to notice when the last entry on the current -L1 table has been reached. - -It should instead use l1_table_offset() to check whether the index has wrapped -to point to the first entry, and so the next L1 table should be used. 
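The difference between the two helpers is easy to see in isolation (constants as in Xen's x86-64 paging layout; this standalone sketch is illustrative only, and TABLE_INDEX_MASK is an invented shorthand for Xen's L1_PAGETABLE_ENTRIES - 1):

    #include <stdio.h>

    #define PAGETABLE_ORDER     9
    #define L1_PAGETABLE_SHIFT  12                      /* 4k pages */
    #define L2_PAGETABLE_SHIFT  (L1_PAGETABLE_SHIFT + PAGETABLE_ORDER)
    #define TABLE_INDEX_MASK    ((1UL << PAGETABLE_ORDER) - 1)

    #define l1_table_offset(v)  (((v) >> L1_PAGETABLE_SHIFT) & TABLE_INDEX_MASK)
    #define l2_table_offset(v)  (((v) >> L2_PAGETABLE_SHIFT) & TABLE_INDEX_MASK)

    int main(void)
    {
        /* Step off the last slot of an L1 table, as the loop in
         * modify_xen_mappings_lite() does below. */
        unsigned long v = 511UL << L1_PAGETABLE_SHIFT;

        v += 1UL << L1_PAGETABLE_SHIFT;

        /* Wraps to 0 exactly when the next L1 table is needed... */
        printf("l1_table_offset: %lu\n", l1_table_offset(v));   /* 0 */
        /* ...while the L2 index is 0 only near 1GiB boundaries. */
        printf("l2_table_offset: %lu\n", l2_table_offset(v));   /* 1 */
        return 0;
    }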
- -Fixes: 8676092a0f16 ('x86/livepatch: Fix livepatch application when CET is active') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 7c81558208de7858251b62f168a449be84305595 -master date: 2024-03-11 11:09:42 +0000 ---- - xen/arch/x86/mm.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 39544bd9f9..ab0acbfea6 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -5947,7 +5947,7 @@ void init_or_livepatch modify_xen_mappings_lite( - - v += 1UL << L1_PAGETABLE_SHIFT; - -- if ( l2_table_offset(v) == 0 ) -+ if ( l1_table_offset(v) == 0 ) - break; - } - --- -2.44.0 - - -From fe1869a569bab56e44c35d1522ee064bab6286da Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Sat, 27 Jan 2024 17:52:09 +0000 -Subject: [PATCH 57/70] x86/entry: Introduce EFRAME_* constants - -restore_all_guest() does a lot of manipulation of the stack after popping the -GPRs, and uses raw %rsp displacements to do so. Also, almost all entrypaths -use raw %rsp displacements prior to pushing GPRs. - -Provide better mnemonics, to aid readability and reduce the chance of errors -when editing. - -No functional change. The resulting binary is identical. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 37541208f119a9c552c6c6c3246ea61be0d44035) ---- - xen/arch/x86/x86_64/asm-offsets.c | 17 ++++++++ - xen/arch/x86/x86_64/compat/entry.S | 2 +- - xen/arch/x86/x86_64/entry.S | 70 +++++++++++++++--------------- - 3 files changed, 53 insertions(+), 36 deletions(-) - -diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c -index 57b73a4e62..2fc4d9130a 100644 ---- a/xen/arch/x86/x86_64/asm-offsets.c -+++ b/xen/arch/x86/x86_64/asm-offsets.c -@@ -51,6 +51,23 @@ void __dummy__(void) - OFFSET(UREGS_kernel_sizeof, struct cpu_user_regs, es); - BLANK(); - -+ /* -+ * EFRAME_* is for the entry/exit logic where %rsp is pointing at -+ * UREGS_error_code and GPRs are still/already guest values. -+ */ -+#define OFFSET_EF(sym, mem) \ -+ DEFINE(sym, offsetof(struct cpu_user_regs, mem) - \ -+ offsetof(struct cpu_user_regs, error_code)) -+ -+ OFFSET_EF(EFRAME_entry_vector, entry_vector); -+ OFFSET_EF(EFRAME_rip, rip); -+ OFFSET_EF(EFRAME_cs, cs); -+ OFFSET_EF(EFRAME_eflags, eflags); -+ OFFSET_EF(EFRAME_rsp, rsp); -+ BLANK(); -+ -+#undef OFFSET_EF -+ - OFFSET(VCPU_processor, struct vcpu, processor); - OFFSET(VCPU_domain, struct vcpu, domain); - OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map); -diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S -index fcc3a721f1..cb473f08ee 100644 ---- a/xen/arch/x86/x86_64/compat/entry.S -+++ b/xen/arch/x86/x86_64/compat/entry.S -@@ -15,7 +15,7 @@ ENTRY(entry_int82) - ENDBR64 - ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP - pushq $0 -- movl $HYPERCALL_VECTOR, 4(%rsp) -+ movl $HYPERCALL_VECTOR, EFRAME_entry_vector(%rsp) - SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. 
*/ - - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 9a7b129aa7..968da9d727 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -190,15 +190,15 @@ restore_all_guest: - SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ - - RESTORE_ALL -- testw $TRAP_syscall,4(%rsp) -+ testw $TRAP_syscall, EFRAME_entry_vector(%rsp) - jz iret_exit_to_guest - -- movq 24(%rsp),%r11 # RFLAGS -+ mov EFRAME_eflags(%rsp), %r11 - andq $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), %r11 - orq $X86_EFLAGS_IF,%r11 - - /* Don't use SYSRET path if the return address is not canonical. */ -- movq 8(%rsp),%rcx -+ mov EFRAME_rip(%rsp), %rcx - sarq $47,%rcx - incl %ecx - cmpl $1,%ecx -@@ -213,20 +213,20 @@ restore_all_guest: - ALTERNATIVE "", rag_clrssbsy, X86_FEATURE_XEN_SHSTK - #endif - -- movq 8(%rsp), %rcx # RIP -- cmpw $FLAT_USER_CS32,16(%rsp)# CS -- movq 32(%rsp),%rsp # RSP -+ mov EFRAME_rip(%rsp), %rcx -+ cmpw $FLAT_USER_CS32, EFRAME_cs(%rsp) -+ mov EFRAME_rsp(%rsp), %rsp - je 1f - sysretq - 1: sysretl - - ALIGN - .Lrestore_rcx_iret_exit_to_guest: -- movq 8(%rsp), %rcx # RIP -+ mov EFRAME_rip(%rsp), %rcx - /* No special register assumptions. */ - iret_exit_to_guest: -- andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), 24(%rsp) -- orl $X86_EFLAGS_IF,24(%rsp) -+ andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp) -+ orl $X86_EFLAGS_IF, EFRAME_eflags(%rsp) - addq $8,%rsp - .Lft0: iretq - _ASM_PRE_EXTABLE(.Lft0, handle_exception) -@@ -257,7 +257,7 @@ ENTRY(lstar_enter) - pushq $FLAT_KERNEL_CS64 - pushq %rcx - pushq $0 -- movl $TRAP_syscall, 4(%rsp) -+ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) - SAVE_ALL - - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ -@@ -294,7 +294,7 @@ ENTRY(cstar_enter) - pushq $FLAT_USER_CS32 - pushq %rcx - pushq $0 -- movl $TRAP_syscall, 4(%rsp) -+ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) - SAVE_ALL - - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ -@@ -335,7 +335,7 @@ GLOBAL(sysenter_eflags_saved) - pushq $3 /* ring 3 null cs */ - pushq $0 /* null rip */ - pushq $0 -- movl $TRAP_syscall, 4(%rsp) -+ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) - SAVE_ALL - - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ -@@ -389,7 +389,7 @@ ENTRY(int80_direct_trap) - ENDBR64 - ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP - pushq $0 -- movl $0x80, 4(%rsp) -+ movl $0x80, EFRAME_entry_vector(%rsp) - SAVE_ALL - - SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ -@@ -649,7 +649,7 @@ ret_from_intr: - .section .init.text, "ax", @progbits - ENTRY(early_page_fault) - ENDBR64 -- movl $X86_EXC_PF, 4(%rsp) -+ movl $X86_EXC_PF, EFRAME_entry_vector(%rsp) - SAVE_ALL - movq %rsp, %rdi - call do_early_page_fault -@@ -716,7 +716,7 @@ ENTRY(common_interrupt) - - ENTRY(entry_PF) - ENDBR64 -- movl $X86_EXC_PF, 4(%rsp) -+ movl $X86_EXC_PF, EFRAME_entry_vector(%rsp) - /* No special register assumptions. 
*/ - GLOBAL(handle_exception) - ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP -@@ -890,90 +890,90 @@ FATAL_exception_with_ints_disabled: - ENTRY(entry_DE) - ENDBR64 - pushq $0 -- movl $X86_EXC_DE, 4(%rsp) -+ movl $X86_EXC_DE, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_MF) - ENDBR64 - pushq $0 -- movl $X86_EXC_MF, 4(%rsp) -+ movl $X86_EXC_MF, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_XM) - ENDBR64 - pushq $0 -- movl $X86_EXC_XM, 4(%rsp) -+ movl $X86_EXC_XM, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_NM) - ENDBR64 - pushq $0 -- movl $X86_EXC_NM, 4(%rsp) -+ movl $X86_EXC_NM, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_DB) - ENDBR64 - pushq $0 -- movl $X86_EXC_DB, 4(%rsp) -+ movl $X86_EXC_DB, EFRAME_entry_vector(%rsp) - jmp handle_ist_exception - - ENTRY(entry_BP) - ENDBR64 - pushq $0 -- movl $X86_EXC_BP, 4(%rsp) -+ movl $X86_EXC_BP, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_OF) - ENDBR64 - pushq $0 -- movl $X86_EXC_OF, 4(%rsp) -+ movl $X86_EXC_OF, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_BR) - ENDBR64 - pushq $0 -- movl $X86_EXC_BR, 4(%rsp) -+ movl $X86_EXC_BR, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_UD) - ENDBR64 - pushq $0 -- movl $X86_EXC_UD, 4(%rsp) -+ movl $X86_EXC_UD, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_TS) - ENDBR64 -- movl $X86_EXC_TS, 4(%rsp) -+ movl $X86_EXC_TS, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_NP) - ENDBR64 -- movl $X86_EXC_NP, 4(%rsp) -+ movl $X86_EXC_NP, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_SS) - ENDBR64 -- movl $X86_EXC_SS, 4(%rsp) -+ movl $X86_EXC_SS, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_GP) - ENDBR64 -- movl $X86_EXC_GP, 4(%rsp) -+ movl $X86_EXC_GP, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_AC) - ENDBR64 -- movl $X86_EXC_AC, 4(%rsp) -+ movl $X86_EXC_AC, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_CP) - ENDBR64 -- movl $X86_EXC_CP, 4(%rsp) -+ movl $X86_EXC_CP, EFRAME_entry_vector(%rsp) - jmp handle_exception - - ENTRY(entry_DF) - ENDBR64 -- movl $X86_EXC_DF, 4(%rsp) -+ movl $X86_EXC_DF, EFRAME_entry_vector(%rsp) - /* Set AC to reduce chance of further SMAP faults */ - ALTERNATIVE "", stac, X86_FEATURE_XEN_SMAP - SAVE_ALL -@@ -998,7 +998,7 @@ ENTRY(entry_DF) - ENTRY(entry_NMI) - ENDBR64 - pushq $0 -- movl $X86_EXC_NMI, 4(%rsp) -+ movl $X86_EXC_NMI, EFRAME_entry_vector(%rsp) - handle_ist_exception: - ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP - SAVE_ALL -@@ -1130,7 +1130,7 @@ handle_ist_exception: - ENTRY(entry_MC) - ENDBR64 - pushq $0 -- movl $X86_EXC_MC, 4(%rsp) -+ movl $X86_EXC_MC, EFRAME_entry_vector(%rsp) - jmp handle_ist_exception - - /* No op trap handler. Required for kexec crash path. */ -@@ -1167,7 +1167,7 @@ autogen_stubs: /* Automatically generated stubs. */ - 1: - ENDBR64 - pushq $0 -- movb $vec,4(%rsp) -+ movb $vec, EFRAME_entry_vector(%rsp) - jmp common_interrupt - - entrypoint 1b -@@ -1181,7 +1181,7 @@ autogen_stubs: /* Automatically generated stubs. */ - test $8,%spl /* 64bit exception frames are 16 byte aligned, but the word */ - jz 2f /* size is 8 bytes. Check whether the processor gave us an */ - pushq $0 /* error code, and insert an empty one if not. 
*/ --2: movb $vec,4(%rsp) -+2: movb $vec, EFRAME_entry_vector(%rsp) - jmp handle_exception - - entrypoint 1b --- -2.44.0 - - -From b91c253e81db915f685b29e6947144ab9905388d Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 27 Feb 2024 16:07:39 +0000 -Subject: [PATCH 58/70] x86: Resync intel-family.h from Linux - -From v6.8-rc6 - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 195e75371b13c4f7ecdf7b5c50aed0d02f2d7ce8) ---- - xen/arch/x86/include/asm/intel-family.h | 38 ++++++++++++++++++++++--- - 1 file changed, 34 insertions(+), 4 deletions(-) - -diff --git a/xen/arch/x86/include/asm/intel-family.h b/xen/arch/x86/include/asm/intel-family.h -index ffc49151be..b65e9c46b9 100644 ---- a/xen/arch/x86/include/asm/intel-family.h -+++ b/xen/arch/x86/include/asm/intel-family.h -@@ -26,6 +26,9 @@ - * _G - parts with extra graphics on - * _X - regular server parts - * _D - micro server parts -+ * _N,_P - other mobile parts -+ * _H - premium mobile parts -+ * _S - other client parts - * - * Historical OPTDIFFs: - * -@@ -37,6 +40,9 @@ - * their own names :-( - */ - -+/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ -+#define INTEL_FAM6_ANY X86_MODEL_ANY -+ - #define INTEL_FAM6_CORE_YONAH 0x0E - - #define INTEL_FAM6_CORE2_MEROM 0x0F -@@ -93,8 +99,6 @@ - #define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ - #define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ - --#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ -- - #define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ - - #define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ -@@ -102,12 +106,31 @@ - - #define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ - -+#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF -+ -+#define INTEL_FAM6_GRANITERAPIDS_X 0xAD -+#define INTEL_FAM6_GRANITERAPIDS_D 0xAE -+ -+/* "Hybrid" Processors (P-Core/E-Core) */ -+ -+#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ -+ - #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ - #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ - --#define INTEL_FAM6_RAPTORLAKE 0xB7 -+#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */ -+#define INTEL_FAM6_RAPTORLAKE_P 0xBA -+#define INTEL_FAM6_RAPTORLAKE_S 0xBF -+ -+#define INTEL_FAM6_METEORLAKE 0xAC -+#define INTEL_FAM6_METEORLAKE_L 0xAA -+ -+#define INTEL_FAM6_ARROWLAKE_H 0xC5 -+#define INTEL_FAM6_ARROWLAKE 0xC6 -+ -+#define INTEL_FAM6_LUNARLAKE_M 0xBD - --/* "Small Core" Processors (Atom) */ -+/* "Small Core" Processors (Atom/E-Core) */ - - #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ - #define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ -@@ -134,6 +157,13 @@ - #define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ - #define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ - -+#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */ -+ -+#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ -+#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ -+ -+#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */ -+ - /* Xeon Phi */ - - #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ --- -2.44.0 - - -From 9f89ec65fbe49c3be32a456091097d7ef017d268 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Fri, 23 Jun 2023 11:32:00 +0100 -Subject: [PATCH 59/70] x86/vmx: Perform VERW flushing later in the VMExit path - -Broken out of the following patch because 
this change is subtle enough on its -own. See it for the rational of why we're moving VERW. - -As for how, extend the trick already used to hold one condition in -flags (RESUME vs LAUNCH) through the POPing of GPRs. - -Move the MOV CR earlier. Intel specify flags to be undefined across it. - -Encode the two conditions we want using SF and PF. See the code comment for -exactly how. - -Leave a comment to explain the lack of any content around -SPEC_CTRL_EXIT_TO_VMX, but leave the block in place. Sods law says if we -delete it, we'll need to reintroduce it. - -This is part of XSA-452 / CVE-2023-28746. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 475fa20b7384464210f42bad7195f87bd6f1c63f) ---- - xen/arch/x86/hvm/vmx/entry.S | 36 +++++++++++++++++++++--- - xen/arch/x86/include/asm/asm_defns.h | 8 ++++++ - xen/arch/x86/include/asm/spec_ctrl_asm.h | 7 +++++ - xen/arch/x86/x86_64/asm-offsets.c | 1 + - 4 files changed, 48 insertions(+), 4 deletions(-) - -diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S -index e3f60d5a82..1bead826ca 100644 ---- a/xen/arch/x86/hvm/vmx/entry.S -+++ b/xen/arch/x86/hvm/vmx/entry.S -@@ -87,17 +87,39 @@ UNLIKELY_END(realmode) - - /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ - /* SPEC_CTRL_EXIT_TO_VMX Req: %rsp=regs/cpuinfo Clob: */ -- DO_SPEC_CTRL_COND_VERW -+ /* -+ * All speculation safety work happens to be elsewhere. VERW is after -+ * popping the GPRs, while restoring the guest MSR_SPEC_CTRL is left -+ * to the MSR load list. -+ */ - - mov VCPU_hvm_guest_cr2(%rbx),%rax -+ mov %rax, %cr2 -+ -+ /* -+ * We need to perform two conditional actions (VERW, and Resume vs -+ * Launch) after popping GPRs. With some cunning, we can encode both -+ * of these in eflags together. -+ * -+ * Parity is only calculated over the bottom byte of the answer, while -+ * Sign is simply the top bit. -+ * -+ * Therefore, the final OR instruction ends up producing: -+ * SF = VCPU_vmx_launched -+ * PF = !SCF_verw -+ */ -+ BUILD_BUG_ON(SCF_verw & ~0xff) -+ movzbl VCPU_vmx_launched(%rbx), %ecx -+ shl $31, %ecx -+ movzbl CPUINFO_spec_ctrl_flags(%rsp), %eax -+ and $SCF_verw, %eax -+ or %eax, %ecx - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp -- mov %rax,%cr2 -- cmpb $0,VCPU_vmx_launched(%rbx) - pop %rbx - pop %r11 - pop %r10 -@@ -108,7 +130,13 @@ UNLIKELY_END(realmode) - pop %rdx - pop %rsi - pop %rdi -- je .Lvmx_launch -+ -+ jpe .L_skip_verw -+ /* VERW clobbers ZF, but preserves all others, including SF. 
*/ -+ verw STK_REL(CPUINFO_verw_sel, CPUINFO_error_code)(%rsp) -+.L_skip_verw: -+ -+ jns .Lvmx_launch - - /*.Lvmx_resume:*/ - VMRESUME -diff --git a/xen/arch/x86/include/asm/asm_defns.h b/xen/arch/x86/include/asm/asm_defns.h -index baaaccb26e..56ae26e542 100644 ---- a/xen/arch/x86/include/asm/asm_defns.h -+++ b/xen/arch/x86/include/asm/asm_defns.h -@@ -81,6 +81,14 @@ register unsigned long current_stack_pointer asm("rsp"); - - #ifdef __ASSEMBLY__ - -+.macro BUILD_BUG_ON condstr, cond:vararg -+ .if \cond -+ .error "Condition \"\condstr\" not satisfied" -+ .endif -+.endm -+/* preprocessor macro to make error message more user friendly */ -+#define BUILD_BUG_ON(cond) BUILD_BUG_ON #cond, cond -+ - #ifdef HAVE_AS_QUOTED_SYM - #define SUBSECTION_LBL(tag) \ - .ifndef .L.tag; \ -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index 6cb7c1b949..525745a066 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -152,6 +152,13 @@ - #endif - .endm - -+/* -+ * Helper to improve the readibility of stack dispacements with %rsp in -+ * unusual positions. Both @field and @top_of_stack should be constants from -+ * the same object. @top_of_stack should be where %rsp is currently pointing. -+ */ -+#define STK_REL(field, top_of_stk) ((field) - (top_of_stk)) -+ - .macro DO_SPEC_CTRL_COND_VERW - /* - * Requires %rsp=cpuinfo -diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c -index 2fc4d9130a..0d33678898 100644 ---- a/xen/arch/x86/x86_64/asm-offsets.c -+++ b/xen/arch/x86/x86_64/asm-offsets.c -@@ -135,6 +135,7 @@ void __dummy__(void) - #endif - - OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); -+ OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code); - OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel); - OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); - OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset); --- -2.44.0 - - -From 95dd34fdbea5408872d5c244fe268222a4f145d0 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Sat, 27 Jan 2024 18:20:56 +0000 -Subject: [PATCH 60/70] x86/spec-ctrl: Perform VERW flushing later in exit - paths - -On parts vulnerable to RFDS, VERW's side effects are extended to scrub all -non-architectural entries in various Physical Register Files. To remove all -of Xen's values, the VERW must be after popping the GPRs. - -Rework SPEC_CTRL_COND_VERW to default to an CPUINFO_error_code %rsp position, -but with overrides for other contexts. Identify that it clobbers eflags; this -is particularly relevant for the SYSRET path. - -For the IST exit return to Xen, have the main SPEC_CTRL_EXIT_TO_XEN put a -shadow copy of spec_ctrl_flags, as GPRs can't be used at the point we want to -issue the VERW. - -This is part of XSA-452 / CVE-2023-28746. 
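As a concrete illustration of the %rsp-relative displacements involved, the STK_REL() helper introduced in the previous patch boils down to a difference of two offsets. The struct below is invented for the example and is not Xen's real frame layout:

    #include <stddef.h>
    #include <stdio.h>

    /* %rsp points at error_code on the affected exit paths. */
    struct frame {
        unsigned long error_code;
        unsigned long rip;
        unsigned short verw_sel;
    };

    #define STK_REL(field, top_of_stk) \
        ((long)offsetof(struct frame, field) - \
         (long)offsetof(struct frame, top_of_stk))

    int main(void)
    {
        /* "verw STK_REL(verw_sel, error_code)(%rsp)" would use this
         * byte displacement from the current stack pointer. */
        printf("verw_sel at %%rsp%+ld\n", STK_REL(verw_sel, error_code));
        return 0;
    }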
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 0a666cf2cd99df6faf3eebc81a1fc286e4eca4c7) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 36 ++++++++++++++++-------- - xen/arch/x86/x86_64/asm-offsets.c | 13 +++++++-- - xen/arch/x86/x86_64/compat/entry.S | 6 ++++ - xen/arch/x86/x86_64/entry.S | 21 +++++++++++++- - 4 files changed, 61 insertions(+), 15 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index 525745a066..13acebc75d 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -159,16 +159,23 @@ - */ - #define STK_REL(field, top_of_stk) ((field) - (top_of_stk)) - --.macro DO_SPEC_CTRL_COND_VERW -+.macro SPEC_CTRL_COND_VERW \ -+ scf=STK_REL(CPUINFO_spec_ctrl_flags, CPUINFO_error_code), \ -+ sel=STK_REL(CPUINFO_verw_sel, CPUINFO_error_code) - /* -- * Requires %rsp=cpuinfo -+ * Requires \scf and \sel as %rsp-relative expressions -+ * Clobbers eflags -+ * -+ * VERW needs to run after guest GPRs have been restored, where only %rsp is -+ * good to use. Default to expecting %rsp pointing at CPUINFO_error_code. -+ * Contexts where this is not true must provide an alternative \scf and \sel. - * - * Issue a VERW for its flushing side effect, if indicated. This is a Spectre - * v1 gadget, but the IRET/VMEntry is serialising. - */ -- testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp) -+ testb $SCF_verw, \scf(%rsp) - jz .L\@_verw_skip -- verw CPUINFO_verw_sel(%rsp) -+ verw \sel(%rsp) - .L\@_verw_skip: - .endm - -@@ -286,8 +293,6 @@ - */ - ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV - -- DO_SPEC_CTRL_COND_VERW -- - ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV - .endm - -@@ -367,7 +372,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - */ - .macro SPEC_CTRL_EXIT_TO_XEN - /* -- * Requires %r12=ist_exit, %r14=stack_end -+ * Requires %r12=ist_exit, %r14=stack_end, %rsp=regs - * Clobbers %rax, %rbx, %rcx, %rdx - */ - movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx -@@ -395,11 +400,18 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - test %r12, %r12 - jz .L\@_skip_ist_exit - -- /* Logically DO_SPEC_CTRL_COND_VERW but without the %rsp=cpuinfo dependency */ -- testb $SCF_verw, %bl -- jz .L\@_skip_verw -- verw STACK_CPUINFO_FIELD(verw_sel)(%r14) --.L\@_skip_verw: -+ /* -+ * Stash SCF and verw_sel above eflags in the case of an IST_exit. The -+ * VERW logic needs to run after guest GPRs have been restored; i.e. where -+ * we cannot use %r12 or %r14 for the purposes they have here. -+ * -+ * When the CPU pushed this exception frame, it zero-extended eflags. -+ * Therefore it is safe for the VERW logic to look at the stashed SCF -+ * outside of the ist_exit condition. Also, this stashing won't influence -+ * any other restore_all_guest() paths. -+ */ -+ or $(__HYPERVISOR_DS32 << 16), %ebx -+ mov %ebx, UREGS_eflags + 4(%rsp) /* EFRAME_shadow_scf/sel */ - - ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV - -diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c -index 0d33678898..85c7d0c989 100644 ---- a/xen/arch/x86/x86_64/asm-offsets.c -+++ b/xen/arch/x86/x86_64/asm-offsets.c -@@ -55,14 +55,22 @@ void __dummy__(void) - * EFRAME_* is for the entry/exit logic where %rsp is pointing at - * UREGS_error_code and GPRs are still/already guest values. - */ --#define OFFSET_EF(sym, mem) \ -+#define OFFSET_EF(sym, mem, ...) 
\ - DEFINE(sym, offsetof(struct cpu_user_regs, mem) - \ -- offsetof(struct cpu_user_regs, error_code)) -+ offsetof(struct cpu_user_regs, error_code) __VA_ARGS__) - - OFFSET_EF(EFRAME_entry_vector, entry_vector); - OFFSET_EF(EFRAME_rip, rip); - OFFSET_EF(EFRAME_cs, cs); - OFFSET_EF(EFRAME_eflags, eflags); -+ -+ /* -+ * These aren't real fields. They're spare space, used by the IST -+ * exit-to-xen path. -+ */ -+ OFFSET_EF(EFRAME_shadow_scf, eflags, +4); -+ OFFSET_EF(EFRAME_shadow_sel, eflags, +6); -+ - OFFSET_EF(EFRAME_rsp, rsp); - BLANK(); - -@@ -136,6 +144,7 @@ void __dummy__(void) - - OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); - OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code); -+ OFFSET(CPUINFO_rip, struct cpu_info, guest_cpu_user_regs.rip); - OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel); - OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); - OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset); -diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S -index cb473f08ee..3bbe3a79a5 100644 ---- a/xen/arch/x86/x86_64/compat/entry.S -+++ b/xen/arch/x86/x86_64/compat/entry.S -@@ -161,6 +161,12 @@ ENTRY(compat_restore_all_guest) - SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ - - RESTORE_ALL adj=8 compat=1 -+ -+ /* Account for ev/ec having already been popped off the stack. */ -+ SPEC_CTRL_COND_VERW \ -+ scf=STK_REL(CPUINFO_spec_ctrl_flags, CPUINFO_rip), \ -+ sel=STK_REL(CPUINFO_verw_sel, CPUINFO_rip) -+ - .Lft0: iretq - _ASM_PRE_EXTABLE(.Lft0, handle_exception) - -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 968da9d727..2c7512130f 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -214,6 +214,9 @@ restore_all_guest: - #endif - - mov EFRAME_rip(%rsp), %rcx -+ -+ SPEC_CTRL_COND_VERW /* Req: %rsp=eframe Clob: efl */ -+ - cmpw $FLAT_USER_CS32, EFRAME_cs(%rsp) - mov EFRAME_rsp(%rsp), %rsp - je 1f -@@ -227,6 +230,9 @@ restore_all_guest: - iret_exit_to_guest: - andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp) - orl $X86_EFLAGS_IF, EFRAME_eflags(%rsp) -+ -+ SPEC_CTRL_COND_VERW /* Req: %rsp=eframe Clob: efl */ -+ - addq $8,%rsp - .Lft0: iretq - _ASM_PRE_EXTABLE(.Lft0, handle_exception) -@@ -679,9 +685,22 @@ UNLIKELY_START(ne, exit_cr3) - UNLIKELY_END(exit_cr3) - - /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ -- SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end, Clob: abcd */ -+ SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end %rsp=regs, Clob: abcd */ - - RESTORE_ALL adj=8 -+ -+ /* -+ * When the CPU pushed this exception frame, it zero-extended eflags. -+ * For an IST exit, SPEC_CTRL_EXIT_TO_XEN stashed shadow copies of -+ * spec_ctrl_flags and ver_sel above eflags, as we can't use any GPRs, -+ * and we're at a random place on the stack, not in a CPUFINFO block. -+ * -+ * Account for ev/ec having already been popped off the stack. -+ */ -+ SPEC_CTRL_COND_VERW \ -+ scf=STK_REL(EFRAME_shadow_scf, EFRAME_rip), \ -+ sel=STK_REL(EFRAME_shadow_sel, EFRAME_rip) -+ - iretq - - ENTRY(common_interrupt) --- -2.44.0 - - -From b7205fc1cbad0c633e92d2d019a02a507467507b Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Mon, 12 Feb 2024 17:50:43 +0000 -Subject: [PATCH 61/70] x86/spec-ctrl: Rename VERW related options - -VERW is going to be used for a 3rd purpose, and the existing nomenclature -didn't survive the Stale MMIO issues terribly well. 
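The renaming described below keeps the old spelling working as a deprecated alias, following the usual try-the-new-name-first pattern (a toy sketch; Xen's real parse_boolean() has a different signature and also handles "no-" prefixes and list separators):

    #include <stdio.h>
    #include <string.h>

    /* Toy stand-in: returns 1 on a match, -1 otherwise. */
    static int parse_boolean(const char *name, const char *s)
    {
        return strcmp(s, name) ? -1 : 1;
    }

    static void parse(const char *s)
    {
        int val;

        if ( (val = parse_boolean("verw", s)) != -1 ||
             (val = parse_boolean("md-clear", s)) != -1 )  /* deprecated alias */
            printf("%s -> verw=%d\n", s, val);
    }

    int main(void)
    {
        parse("verw");        /* new spelling */
        parse("md-clear");    /* old spelling, same effect */
        return 0;
    }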
- -Rename the command line option from `md-clear=` to `verw=`. This is more -consistent with other options which tend to be named based on what they're -doing, not which feature enumeration they use behind the scenes. Retain -`md-clear=` as a deprecated alias. - -Rename opt_md_clear_{pv,hvm} and opt_fb_clear_mmio to opt_verw_{pv,hvm,mmio}, -which has a side effect of making spec_ctrl_init_domain() rather clearer to -follow. - -No functional change. - -This is part of XSA-452 / CVE-2023-28746. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit f7603ca252e4226739eb3129a5290ee3da3f8ea4) ---- - docs/misc/xen-command-line.pandoc | 15 ++++---- - xen/arch/x86/spec_ctrl.c | 62 ++++++++++++++++--------------- - 2 files changed, 40 insertions(+), 37 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 582d6741d1..fbf1683924 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2370,7 +2370,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - - ### spec-ctrl (x86) - > `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>, --> {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>, -+> {msr-sc,rsb,verw,ibpb-entry}=<bool>|{pv,hvm}=<bool>, - > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, - > eager-fpu,l1d-flush,branch-harden,srb-lock, - > unpriv-mmio,gds-mit,div-scrub}=<bool> ]` -@@ -2395,7 +2395,7 @@ in place for guests to use. - - Use of a positive boolean value for either of these options is invalid. - --The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `md-clear=` and `ibpb-entry=` options -+The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `verw=` and `ibpb-entry=` options - offer fine grained control over the primitives by Xen. These impact Xen's - ability to protect itself, and/or Xen's ability to virtualise support for - guests to use. -@@ -2412,11 +2412,12 @@ guests to use. - guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc. - * `rsb=` offers control over whether to overwrite the Return Stack Buffer / - Return Address Stack on entry to Xen and on idle. --* `md-clear=` offers control over whether to use VERW to flush -- microarchitectural buffers on idle and exit from Xen. *Note: For -- compatibility with development versions of this fix, `mds=` is also accepted -- on Xen 4.12 and earlier as an alias. Consult vendor documentation in -- preference to here.* -+* `verw=` offers control over whether to use VERW for its scrubbing side -+ effects at appropriate privilege transitions. The exact side effects are -+ microarchitecture and microcode specific. *Note: `md-clear=` is accepted as -+ a deprecated alias. For compatibility with development versions of XSA-297, -+ `mds=` is also accepted on Xen 4.12 and earlier as an alias. Consult vendor -+ documentation in preference to here.* - * `ibpb-entry=` offers control over whether IBPB (Indirect Branch Prediction - Barrier) is used on entry to Xen. 
This is used by default on hardware - vulnerable to Branch Type Confusion, and hardware vulnerable to Speculative -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index a965b6db28..c42d8cdc22 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -25,8 +25,8 @@ static bool __initdata opt_msr_sc_pv = true; - static bool __initdata opt_msr_sc_hvm = true; - static int8_t __initdata opt_rsb_pv = -1; - static bool __initdata opt_rsb_hvm = true; --static int8_t __ro_after_init opt_md_clear_pv = -1; --static int8_t __ro_after_init opt_md_clear_hvm = -1; -+static int8_t __ro_after_init opt_verw_pv = -1; -+static int8_t __ro_after_init opt_verw_hvm = -1; - - static int8_t __ro_after_init opt_ibpb_entry_pv = -1; - static int8_t __ro_after_init opt_ibpb_entry_hvm = -1; -@@ -66,7 +66,7 @@ static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. - - static int8_t __initdata opt_srb_lock = -1; - static bool __initdata opt_unpriv_mmio; --static bool __ro_after_init opt_fb_clear_mmio; -+static bool __ro_after_init opt_verw_mmio; - static int8_t __initdata opt_gds_mit = -1; - static int8_t __initdata opt_div_scrub = -1; - -@@ -108,8 +108,8 @@ static int __init cf_check parse_spec_ctrl(const char *s) - disable_common: - opt_rsb_pv = false; - opt_rsb_hvm = false; -- opt_md_clear_pv = 0; -- opt_md_clear_hvm = 0; -+ opt_verw_pv = 0; -+ opt_verw_hvm = 0; - opt_ibpb_entry_pv = 0; - opt_ibpb_entry_hvm = 0; - opt_ibpb_entry_dom0 = false; -@@ -140,14 +140,14 @@ static int __init cf_check parse_spec_ctrl(const char *s) - { - opt_msr_sc_pv = val; - opt_rsb_pv = val; -- opt_md_clear_pv = val; -+ opt_verw_pv = val; - opt_ibpb_entry_pv = val; - } - else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) - { - opt_msr_sc_hvm = val; - opt_rsb_hvm = val; -- opt_md_clear_hvm = val; -+ opt_verw_hvm = val; - opt_ibpb_entry_hvm = val; - } - else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 ) -@@ -192,21 +192,22 @@ static int __init cf_check parse_spec_ctrl(const char *s) - break; - } - } -- else if ( (val = parse_boolean("md-clear", s, ss)) != -1 ) -+ else if ( (val = parse_boolean("verw", s, ss)) != -1 || -+ (val = parse_boolean("md-clear", s, ss)) != -1 ) - { - switch ( val ) - { - case 0: - case 1: -- opt_md_clear_pv = opt_md_clear_hvm = val; -+ opt_verw_pv = opt_verw_hvm = val; - break; - - case -2: -- s += strlen("md-clear="); -+ s += (*s == 'v') ? strlen("verw=") : strlen("md-clear="); - if ( (val = parse_boolean("pv", s, ss)) >= 0 ) -- opt_md_clear_pv = val; -+ opt_verw_pv = val; - else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) -- opt_md_clear_hvm = val; -+ opt_verw_hvm = val; - else - default: - rc = -EINVAL; -@@ -528,8 +529,8 @@ static void __init print_details(enum ind_thunk thunk) - opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-", - opt_ibpb_ctxt_switch ? " IBPB-ctxt" : "", - opt_l1d_flush ? " L1D_FLUSH" : "", -- opt_md_clear_pv || opt_md_clear_hvm || -- opt_fb_clear_mmio ? " VERW" : "", -+ opt_verw_pv || opt_verw_hvm || -+ opt_verw_mmio ? " VERW" : "", - opt_div_scrub ? " DIV" : "", - opt_branch_harden ? " BRANCH_HARDEN" : ""); - -@@ -550,13 +551,13 @@ static void __init print_details(enum ind_thunk thunk) - boot_cpu_has(X86_FEATURE_SC_RSB_HVM) || - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) || - amd_virt_spec_ctrl || -- opt_eager_fpu || opt_md_clear_hvm) ? "" : " None", -+ opt_eager_fpu || opt_verw_hvm) ? "" : " None", - boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "", - (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || - amd_virt_spec_ctrl) ? 
" MSR_VIRT_SPEC_CTRL" : "", - boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "", - opt_eager_fpu ? " EAGER_FPU" : "", -- opt_md_clear_hvm ? " MD_CLEAR" : "", -+ opt_verw_hvm ? " VERW" : "", - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? " IBPB-entry" : ""); - - #endif -@@ -565,11 +566,11 @@ static void __init print_details(enum ind_thunk thunk) - (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || - boot_cpu_has(X86_FEATURE_SC_RSB_PV) || - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) || -- opt_eager_fpu || opt_md_clear_pv) ? "" : " None", -+ opt_eager_fpu || opt_verw_pv) ? "" : " None", - boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "", - boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "", - opt_eager_fpu ? " EAGER_FPU" : "", -- opt_md_clear_pv ? " MD_CLEAR" : "", -+ opt_verw_pv ? " VERW" : "", - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? " IBPB-entry" : ""); - - printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n", -@@ -1502,8 +1503,8 @@ void spec_ctrl_init_domain(struct domain *d) - { - bool pv = is_pv_domain(d); - -- bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) || -- (opt_fb_clear_mmio && is_iommu_enabled(d))); -+ bool verw = ((pv ? opt_verw_pv : opt_verw_hvm) || -+ (opt_verw_mmio && is_iommu_enabled(d))); - - bool ibpb = ((pv ? opt_ibpb_entry_pv : opt_ibpb_entry_hvm) && - (d->domain_id != 0 || opt_ibpb_entry_dom0)); -@@ -1866,19 +1867,20 @@ void __init init_speculation_mitigations(void) - * the return-to-guest path. - */ - if ( opt_unpriv_mmio ) -- opt_fb_clear_mmio = cpu_has_fb_clear; -+ opt_verw_mmio = cpu_has_fb_clear; - - /* - * By default, enable PV and HVM mitigations on MDS-vulnerable hardware. - * This will only be a token effort for MLPDS/MFBDS when HT is enabled, - * but it is somewhat better than nothing. - */ -- if ( opt_md_clear_pv == -1 ) -- opt_md_clear_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && -- boot_cpu_has(X86_FEATURE_MD_CLEAR)); -- if ( opt_md_clear_hvm == -1 ) -- opt_md_clear_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && -- boot_cpu_has(X86_FEATURE_MD_CLEAR)); -+ if ( opt_verw_pv == -1 ) -+ opt_verw_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && -+ cpu_has_md_clear); -+ -+ if ( opt_verw_hvm == -1 ) -+ opt_verw_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && -+ cpu_has_md_clear); - - /* - * Enable MDS/MMIO defences as applicable. The Idle blocks need using if -@@ -1891,12 +1893,12 @@ void __init init_speculation_mitigations(void) - * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) - * - * After calculating the appropriate idle setting, simplify -- * opt_md_clear_hvm to mean just "should we VERW on the way into HVM -+ * opt_verw_hvm to mean just "should we VERW on the way into HVM - * guests", so spec_ctrl_init_domain() can calculate suitable settings. - */ -- if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio ) -+ if ( opt_verw_pv || opt_verw_hvm || opt_verw_mmio ) - setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); -- opt_md_clear_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; -+ opt_verw_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; - - /* - * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT --- -2.44.0 - - -From fb85a8fc91f8cfd61d7c7f9742502b223d4024b5 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 5 Mar 2024 19:33:37 +0000 -Subject: [PATCH 62/70] x86/spec-ctrl: VERW-handling adjustments - -... before we add yet more complexity to this logic. Mostly expanded -comments, but with three minor changes. 
- -1) Introduce cpu_has_useful_md_clear to simplify later logic in this patch and - future ones. - -2) We only ever need SC_VERW_IDLE when SMT is active. If SMT isn't active, - then there's no re-partition of pipeline resources based on thread-idleness - to worry about. - -3) The logic to adjust HVM VERW based on L1D_FLUSH is unmaintainable and, as - it turns out, wrong. SKIP_L1DFL is just a hint bit, whereas opt_l1d_flush - is the relevant decision of whether to use L1D_FLUSH based on - susceptibility and user preference. - - Rewrite the logic so it can be followed, and incorporate the fact that when - FB_CLEAR is visible, L1D_FLUSH isn't a safe substitution. - -This is part of XSA-452 / CVE-2023-28746. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 1eb91a8a06230b4b64228c9a380194f8cfe6c5e2) ---- - xen/arch/x86/spec_ctrl.c | 99 +++++++++++++++++++++++++++++----------- - 1 file changed, 73 insertions(+), 26 deletions(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index c42d8cdc22..a4afcd8570 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -1519,7 +1519,7 @@ void __init init_speculation_mitigations(void) - { - enum ind_thunk thunk = THUNK_DEFAULT; - bool has_spec_ctrl, ibrs = false, hw_smt_enabled; -- bool cpu_has_bug_taa, retpoline_safe; -+ bool cpu_has_bug_taa, cpu_has_useful_md_clear, retpoline_safe; - - hw_smt_enabled = check_smt_enabled(); - -@@ -1855,50 +1855,97 @@ void __init init_speculation_mitigations(void) - "enabled. Please assess your configuration and choose an\n" - "explicit 'smt=<bool>' setting. See XSA-273.\n"); - -+ /* -+ * A brief summary of VERW-related changes. -+ * -+ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html -+ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html -+ * -+ * Relevant ucodes: -+ * -+ * - May 2019, for MDS. Introduces the MD_CLEAR CPUID bit and VERW side -+ * effects to scrub Store/Load/Fill buffers as applicable. MD_CLEAR -+ * exists architecturally, even when the side effects have been removed. -+ * -+ * Use VERW to scrub on return-to-guest. Parts with L1D_FLUSH to -+ * mitigate L1TF have the same side effect, so no need to do both. -+ * -+ * Various Atoms suffer from Store-buffer sampling only. Store buffers -+ * are statically partitioned between non-idle threads, so scrubbing is -+ * wanted when going idle too. -+ * -+ * Load ports and Fill buffers are competitively shared between threads. -+ * SMT must be disabled for VERW scrubbing to be fully effective. -+ * -+ * - November 2019, for TAA. Extended VERW side effects to TSX-enabled -+ * MDS_NO parts. -+ * -+ * - February 2022, for Client TSX de-feature. Removed VERW side effects -+ * from Client CPUs only. -+ * -+ * - May 2022, for MMIO Stale Data. (Re)introduced Fill Buffer scrubbing -+ * on all MMIO-affected parts which didn't already have it for MDS -+ * reasons, enumerating FB_CLEAR on those parts only. -+ * -+ * If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing -+ * side effects as VERW and cannot be used in its place. 
-+ */
- mds_calculations();
-
- /*
-- * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have
-- * reintroduced the VERW fill buffer flushing side effect because of a
-- * susceptibility to FBSDP.
-+ * Parts which enumerate FB_CLEAR are those with now-updated microcode
-+ * which weren't susceptible to the original MFBDS (and therefore didn't
-+ * have Fill Buffer scrubbing side effects to begin with, or were Client
-+ * MDS_NO non-TAA_NO parts where the scrubbing was removed), but have had
-+ * the scrubbing reintroduced because of a susceptibility to FBSDP.
- *
- * If unprivileged guests have (or will have) MMIO mappings, we can
- * mitigate cross-domain leakage of fill buffer data by issuing VERW on
-- * the return-to-guest path.
-+ * the return-to-guest path. This is only a token effort if SMT is
-+ * active.
- */
- if ( opt_unpriv_mmio )
- opt_verw_mmio = cpu_has_fb_clear;
-
- /*
-- * By default, enable PV and HVM mitigations on MDS-vulnerable hardware.
-- * This will only be a token effort for MLPDS/MFBDS when HT is enabled,
-- * but it is somewhat better than nothing.
-+ * MD_CLEAR is enumerated architecturally forevermore, even after the
-+ * scrubbing side effects have been removed. Create ourselves a version
-+ * which expresses whether we think MD_CLEAR is having any useful side
-+ * effect.
-+ */
-+ cpu_has_useful_md_clear = (cpu_has_md_clear &&
-+ (cpu_has_bug_mds || cpu_has_bug_msbds_only));
-+
-+ /*
-+ * By default, use VERW scrubbing on applicable hardware, if we think it's
-+ * going to have an effect. This will only be a token effort for
-+ * MLPDS/MFBDS when SMT is enabled.
- */
- if ( opt_verw_pv == -1 )
-- opt_verw_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) &&
-- cpu_has_md_clear);
-+ opt_verw_pv = cpu_has_useful_md_clear;
-
- if ( opt_verw_hvm == -1 )
-- opt_verw_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) &&
-- cpu_has_md_clear);
-+ opt_verw_hvm = cpu_has_useful_md_clear;
-
- /*
-- * Enable MDS/MMIO defences as applicable. The Idle blocks need using if
-- * either the PV or HVM MDS defences are used, or if we may give MMIO
-- * access to untrusted guests.
-- *
-- * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with
-- * equivalent semantics to avoid needing to perform both flushes on the
-- * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH (for
-- * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.)
-- *
-- * After calculating the appropriate idle setting, simplify
-- * opt_verw_hvm to mean just "should we VERW on the way into HVM
-- * guests", so spec_ctrl_init_domain() can calculate suitable settings.
-+ * If SMT is active, and we're protecting against MDS or MMIO stale data,
-+ * we need to scrub before going idle as well as on return to guest.
-+ * Various pipeline resources are repartitioned amongst non-idle threads.
- */
-- if ( opt_verw_pv || opt_verw_hvm || opt_verw_mmio )
-+ if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) ||
-+ opt_verw_mmio) && hw_smt_enabled )
- setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE);
-- opt_verw_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush;
-+
-+ /*
-+ * After calculating the appropriate idle setting, simplify opt_verw_hvm
-+ * to mean just "should we VERW on the way into HVM guests", so
-+ * spec_ctrl_init_domain() can calculate suitable settings.
-+ *
-+ * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the
-+ * only *_CLEAR we can see. 
-+ */
-+ if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear )
-+ opt_verw_hvm = false;
-
- /*
- * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT
---
-2.44.0
-
-
-From 908cbd1893e80eb52b92b2c70c2bfd9ffdf6f77b Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Thu, 22 Jun 2023 23:32:19 +0100
-Subject: [PATCH 63/70] x86/spec-ctrl: Mitigation Register File Data Sampling
-
-RFDS affects Atom cores, also branded E-cores, between the Goldmont and
-Gracemont microarchitectures. This includes Alder Lake and Raptor Lake hybrid
-client systems which have a mix of Gracemont and other types of cores.
-
-Two new bits have been defined; RFDS_CLEAR to indicate VERW has more side
-effects, and RFDS_NO to indicate that the system is unaffected. Plenty of
-unaffected CPUs won't be getting RFDS_NO retrofitted in microcode, so we
-synthesise it. Alder Lake and Raptor Lake Xeon-E's are unaffected due to
-their platform configuration, and we must use the Hybrid CPUID bit to
-distinguish them from their non-Xeon counterparts.
-
-Like MD_CLEAR and FB_CLEAR, RFDS_CLEAR needs OR-ing across a resource pool, so
-set it in the max policies and reflect the host setting in default.
-
-This is part of XSA-452 / CVE-2023-28746.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit fb5b6f6744713410c74cfc12b7176c108e3c9a31)
----
- tools/misc/xen-cpuid.c | 5 +-
- xen/arch/x86/cpu-policy.c | 5 +
- xen/arch/x86/include/asm/cpufeature.h | 3 +
- xen/arch/x86/include/asm/msr-index.h | 2 +
- xen/arch/x86/spec_ctrl.c | 100 +++++++++++++++++++-
- xen/include/public/arch-x86/cpufeatureset.h | 3 +
- 6 files changed, 111 insertions(+), 7 deletions(-)
-
-diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c
-index 7370f1b56e..52e451a806 100644
---- a/tools/misc/xen-cpuid.c
-+++ b/tools/misc/xen-cpuid.c
-@@ -172,7 +172,7 @@ static const char *const str_7d0[32] =
- [ 8] = "avx512-vp2intersect", [ 9] = "srbds-ctrl",
- [10] = "md-clear", [11] = "rtm-always-abort",
- /* 12 */ [13] = "tsx-force-abort",
-- [14] = "serialize",
-+ [14] = "serialize", [15] = "hybrid",
- [16] = "tsxldtrk",
- [18] = "pconfig",
- [20] = "cet-ibt",
-@@ -245,7 +245,8 @@ static const char *const str_m10Al[32] =
- [20] = "bhi-no", [21] = "xapic-status",
- /* 22 */ [23] = "ovrclk-status",
- [24] = "pbrsb-no", [25] = "gds-ctrl",
-- [26] = "gds-no",
-+ [26] = "gds-no", [27] = "rfds-no",
-+ [28] = "rfds-clear",
- };
-
- static const char *const str_m10Ah[32] =
-diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c
-index c7c5e99b7b..12e621b97d 100644
---- a/xen/arch/x86/cpu-policy.c
-+++ b/xen/arch/x86/cpu-policy.c
-@@ -451,6 +451,7 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs)
- */
- __set_bit(X86_FEATURE_MD_CLEAR, fs);
- __set_bit(X86_FEATURE_FB_CLEAR, fs);
-+ __set_bit(X86_FEATURE_RFDS_CLEAR, fs);
-
- /*
- * The Gather Data Sampling microcode mitigation (August 2023) has an
-@@ -500,6 +501,10 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs)
- if ( cpu_has_fb_clear )
- __set_bit(X86_FEATURE_FB_CLEAR, fs);
-
-+ __clear_bit(X86_FEATURE_RFDS_CLEAR, fs);
-+ if ( cpu_has_rfds_clear )
-+ __set_bit(X86_FEATURE_RFDS_CLEAR, fs);
-+
- /*
- * The Gather Data Sampling microcode mitigation (August 2023) has an
- * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. 
-diff --git a/xen/arch/x86/include/asm/cpufeature.h b/xen/arch/x86/include/asm/cpufeature.h -index 76ef2aeb1d..3c57f55de0 100644 ---- a/xen/arch/x86/include/asm/cpufeature.h -+++ b/xen/arch/x86/include/asm/cpufeature.h -@@ -181,6 +181,7 @@ static inline bool boot_cpu_has(unsigned int feat) - #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) - #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) - #define cpu_has_serialize boot_cpu_has(X86_FEATURE_SERIALIZE) -+#define cpu_has_hybrid boot_cpu_has(X86_FEATURE_HYBRID) - #define cpu_has_avx512_fp16 boot_cpu_has(X86_FEATURE_AVX512_FP16) - #define cpu_has_arch_caps boot_cpu_has(X86_FEATURE_ARCH_CAPS) - -@@ -208,6 +209,8 @@ static inline bool boot_cpu_has(unsigned int feat) - #define cpu_has_rrsba boot_cpu_has(X86_FEATURE_RRSBA) - #define cpu_has_gds_ctrl boot_cpu_has(X86_FEATURE_GDS_CTRL) - #define cpu_has_gds_no boot_cpu_has(X86_FEATURE_GDS_NO) -+#define cpu_has_rfds_no boot_cpu_has(X86_FEATURE_RFDS_NO) -+#define cpu_has_rfds_clear boot_cpu_has(X86_FEATURE_RFDS_CLEAR) - - /* Synthesized. */ - #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) -diff --git a/xen/arch/x86/include/asm/msr-index.h b/xen/arch/x86/include/asm/msr-index.h -index 82a81bd0a2..85ef28a612 100644 ---- a/xen/arch/x86/include/asm/msr-index.h -+++ b/xen/arch/x86/include/asm/msr-index.h -@@ -89,6 +89,8 @@ - #define ARCH_CAPS_PBRSB_NO (_AC(1, ULL) << 24) - #define ARCH_CAPS_GDS_CTRL (_AC(1, ULL) << 25) - #define ARCH_CAPS_GDS_NO (_AC(1, ULL) << 26) -+#define ARCH_CAPS_RFDS_NO (_AC(1, ULL) << 27) -+#define ARCH_CAPS_RFDS_CLEAR (_AC(1, ULL) << 28) - - #define MSR_FLUSH_CMD 0x0000010b - #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index a4afcd8570..8165379fed 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -12,6 +12,7 @@ - - #include <asm/amd.h> - #include <asm/hvm/svm/svm.h> -+#include <asm/intel-family.h> - #include <asm/microcode.h> - #include <asm/msr.h> - #include <asm/pv/domain.h> -@@ -435,7 +436,7 @@ static void __init print_details(enum ind_thunk thunk) - * Hardware read-only information, stating immunity to certain issues, or - * suggestions of which mitigation to use. - */ -- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", - (caps & ARCH_CAPS_EIBRS) ? " EIBRS" : "", - (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", -@@ -451,6 +452,7 @@ static void __init print_details(enum ind_thunk thunk) - (caps & ARCH_CAPS_FB_CLEAR) ? " FB_CLEAR" : "", - (caps & ARCH_CAPS_PBRSB_NO) ? " PBRSB_NO" : "", - (caps & ARCH_CAPS_GDS_NO) ? " GDS_NO" : "", -+ (caps & ARCH_CAPS_RFDS_NO) ? " RFDS_NO" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", -@@ -461,7 +463,7 @@ static void __init print_details(enum ind_thunk thunk) - (e21a & cpufeat_mask(X86_FEATURE_SRSO_NO)) ? " SRSO_NO" : ""); - - /* Hardware features which need driving to mitigate issues. */ -- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (e8b & cpufeat_mask(X86_FEATURE_IBPB)) || - (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? 
" IBPB" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS)) || -@@ -479,6 +481,7 @@ static void __init print_details(enum ind_thunk thunk) - (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", - (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : "", - (caps & ARCH_CAPS_GDS_CTRL) ? " GDS_CTRL" : "", -+ (caps & ARCH_CAPS_RFDS_CLEAR) ? " RFDS_CLEAR" : "", - (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : ""); - - /* Compiled-in support which pertains to mitigations. */ -@@ -1347,6 +1350,83 @@ static __init void mds_calculations(void) - } - } - -+/* -+ * Register File Data Sampling affects Atom cores from the Goldmont to -+ * Gracemont microarchitectures. The March 2024 microcode adds RFDS_NO to -+ * some but not all unaffected parts, and RFDS_CLEAR to affected parts still -+ * in support. -+ * -+ * Alder Lake and Raptor Lake client CPUs have a mix of P cores -+ * (Golden/Raptor Cove, not vulnerable) and E cores (Gracemont, -+ * vulnerable), and both enumerate RFDS_CLEAR. -+ * -+ * Both exist in a Xeon SKU, which has the E cores (Gracemont) disabled by -+ * platform configuration, and enumerate RFDS_NO. -+ * -+ * With older parts, or with out-of-date microcode, synthesise RFDS_NO when -+ * safe to do so. -+ * -+ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html -+ */ -+static void __init rfds_calculations(void) -+{ -+ /* RFDS is only known to affect Intel Family 6 processors at this time. */ -+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || -+ boot_cpu_data.x86 != 6 ) -+ return; -+ -+ /* -+ * If RFDS_NO or RFDS_CLEAR are visible, we've either got suitable -+ * microcode, or an RFDS-aware hypervisor is levelling us in a pool. -+ */ -+ if ( cpu_has_rfds_no || cpu_has_rfds_clear ) -+ return; -+ -+ /* If we're virtualised, don't attempt to synthesise RFDS_NO. */ -+ if ( cpu_has_hypervisor ) -+ return; -+ -+ /* -+ * Not all CPUs are expected to get a microcode update enumerating one of -+ * RFDS_{NO,CLEAR}, or we might have out-of-date microcode. -+ */ -+ switch ( boot_cpu_data.x86_model ) -+ { -+ case INTEL_FAM6_ALDERLAKE: -+ case INTEL_FAM6_RAPTORLAKE: -+ /* -+ * Alder Lake and Raptor Lake might be a client SKU (with the -+ * Gracemont cores active, and therefore vulnerable) or might be a -+ * server SKU (with the Gracemont cores disabled, and therefore not -+ * vulnerable). -+ * -+ * See if the CPU identifies as hybrid to distinguish the two cases. -+ */ -+ if ( !cpu_has_hybrid ) -+ break; -+ fallthrough; -+ case INTEL_FAM6_ALDERLAKE_L: -+ case INTEL_FAM6_RAPTORLAKE_P: -+ case INTEL_FAM6_RAPTORLAKE_S: -+ -+ case INTEL_FAM6_ATOM_GOLDMONT: /* Apollo Lake */ -+ case INTEL_FAM6_ATOM_GOLDMONT_D: /* Denverton */ -+ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: /* Gemini Lake */ -+ case INTEL_FAM6_ATOM_TREMONT_D: /* Snow Ridge / Parker Ridge */ -+ case INTEL_FAM6_ATOM_TREMONT: /* Elkhart Lake */ -+ case INTEL_FAM6_ATOM_TREMONT_L: /* Jasper Lake */ -+ case INTEL_FAM6_ATOM_GRACEMONT: /* Alder Lake N */ -+ return; -+ } -+ -+ /* -+ * We appear to be on an unaffected CPU which didn't enumerate RFDS_NO, -+ * perhaps because of it's age or because of out-of-date microcode. -+ * Synthesise it. 
-+ */
-+ setup_force_cpu_cap(X86_FEATURE_RFDS_NO);
-+}
-+
- static bool __init cpu_has_gds(void)
- {
- /*
-@@ -1860,6 +1940,7 @@ void __init init_speculation_mitigations(void)
- *
- * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html
- * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html
-+ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html
- *
- * Relevant ucodes:
- *
-@@ -1889,8 +1970,12 @@ void __init init_speculation_mitigations(void)
- *
- * If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing
- * side effects as VERW and cannot be used in its place.
-+ *
-+ * - March 2024, for RFDS. Enumerate RFDS_CLEAR to mean that VERW now
-+ * scrubs non-architectural entries from certain register files.
- */
- mds_calculations();
-+ rfds_calculations();
-
- /*
- * Parts which enumerate FB_CLEAR are those with now-updated microcode
-@@ -1922,15 +2007,19 @@ void __init init_speculation_mitigations(void)
- * MLPDS/MFBDS when SMT is enabled.
- */
- if ( opt_verw_pv == -1 )
-- opt_verw_pv = cpu_has_useful_md_clear;
-+ opt_verw_pv = cpu_has_useful_md_clear || cpu_has_rfds_clear;
-
- if ( opt_verw_hvm == -1 )
-- opt_verw_hvm = cpu_has_useful_md_clear;
-+ opt_verw_hvm = cpu_has_useful_md_clear || cpu_has_rfds_clear;
-
- /*
- * If SMT is active, and we're protecting against MDS or MMIO stale data,
- * we need to scrub before going idle as well as on return to guest.
- * Various pipeline resources are repartitioned amongst non-idle threads.
-+ *
-+ * We don't need to scrub on idle for RFDS. There are no affected cores
-+ * which support SMT, despite there being affected cores in hybrid systems
-+ * which have SMT elsewhere in the platform.
- */
- if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) ||
- opt_verw_mmio) && hw_smt_enabled )
-@@ -1944,7 +2033,8 @@ void __init init_speculation_mitigations(void)
- * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the
- * only *_CLEAR we can see.
- */
-- if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear )
-+ if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear &&
-+ !cpu_has_rfds_clear )
- opt_verw_hvm = false;
-
- /*
-diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
-index 337aaa9c77..8e17ef670f 100644
---- a/xen/include/public/arch-x86/cpufeatureset.h
-+++ b/xen/include/public/arch-x86/cpufeatureset.h
-@@ -266,6 +266,7 @@ XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*!A VERW clears microarchitectural buffe
- XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. 
*/ - XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */ - XEN_CPUFEATURE(SERIALIZE, 9*32+14) /*A SERIALIZE insn */ -+XEN_CPUFEATURE(HYBRID, 9*32+15) /* Heterogeneous platform */ - XEN_CPUFEATURE(TSXLDTRK, 9*32+16) /*a TSX load tracking suspend/resume insns */ - XEN_CPUFEATURE(CET_IBT, 9*32+20) /* CET - Indirect Branch Tracking */ - XEN_CPUFEATURE(AVX512_FP16, 9*32+23) /*A AVX512 FP16 instructions */ -@@ -338,6 +339,8 @@ XEN_CPUFEATURE(OVRCLK_STATUS, 16*32+23) /* MSR_OVERCLOCKING_STATUS */ - XEN_CPUFEATURE(PBRSB_NO, 16*32+24) /*A No Post-Barrier RSB predictions */ - XEN_CPUFEATURE(GDS_CTRL, 16*32+25) /* MCU_OPT_CTRL.GDS_MIT_{DIS,LOCK} */ - XEN_CPUFEATURE(GDS_NO, 16*32+26) /*A No Gather Data Sampling */ -+XEN_CPUFEATURE(RFDS_NO, 16*32+27) /*A No Register File Data Sampling */ -+XEN_CPUFEATURE(RFDS_CLEAR, 16*32+28) /*!A Register File(s) cleared by VERW */ - - /* Intel-defined CPU features, MSR_ARCH_CAPS 0x10a.edx, word 17 */ - --- -2.44.0 - - -From bdda600406e5f5c35bcb17b2f9458e2138d7ad46 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Fri, 2 Feb 2024 00:39:42 +0000 -Subject: [PATCH 64/70] xen: Swap order of actions in the FREE*() macros - -Wherever possible, it is a good idea to NULL out the visible reference to an -object prior to freeing it. The FREE*() macros already collect together both -parts, making it easy to adjust. - -This has a marginal code generation improvement, as some of the calls to the -free() function can be tailcall optimised. - -No functional change. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit c4f427ec879e7c0df6d44d02561e8bee838a293e) ---- - xen/include/xen/mm.h | 3 ++- - xen/include/xen/xmalloc.h | 7 ++++--- - 2 files changed, 6 insertions(+), 4 deletions(-) - -diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h -index 8b9618609f..8bc5f4249d 100644 ---- a/xen/include/xen/mm.h -+++ b/xen/include/xen/mm.h -@@ -91,8 +91,9 @@ bool scrub_free_pages(void); - - /* Free an allocation, and zero the pointer to it. */ - #define FREE_XENHEAP_PAGES(p, o) do { \ -- free_xenheap_pages(p, o); \ -+ void *_ptr_ = (p); \ - (p) = NULL; \ -+ free_xenheap_pages(_ptr_, o); \ - } while ( false ) - #define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0) - -diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h -index 16979a117c..d857298011 100644 ---- a/xen/include/xen/xmalloc.h -+++ b/xen/include/xen/xmalloc.h -@@ -66,9 +66,10 @@ - extern void xfree(void *); - - /* Free an allocation, and zero the pointer to it. */ --#define XFREE(p) do { \ -- xfree(p); \ -- (p) = NULL; \ -+#define XFREE(p) do { \ -+ void *_ptr_ = (p); \ -+ (p) = NULL; \ -+ xfree(_ptr_); \ - } while ( false ) - - /* Underlying functions */ --- -2.44.0 - - -From 1932973ac9a8c28197ebb24749c73c18cf23f5f1 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 13 Feb 2024 13:08:05 +0100 -Subject: [PATCH 65/70] x86/spinlock: introduce support for blocking - speculation into critical regions -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Introduce a new Kconfig option to block speculation into lock protected -critical regions. The Kconfig option is enabled by default, but the mitigation -won't be engaged unless it's explicitly enabled in the command line using -`spec-ctrl=lock-harden`. 
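Reduced to a stand-alone sketch, the pattern the following paragraphs describe
looks roughly like this (illustrative only: pthread stands in for Xen's
spinlock, and a raw lfence for the alternative()-patched
arch_block_lock_speculation(); x86-only, not taken from the patch):

    #include <pthread.h>
    #include <stdio.h>

    /* Stand-in for Xen's block_lock_speculation(); x86-only sketch. */
    static inline __attribute__((always_inline)) void block_lock_speculation(void)
    {
        asm volatile ( "lfence" ::: "memory" );
    }

    static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
    static int demo_data;

    /*
     * The barrier lives in an always-inline wrapper rather than in the lock
     * implementation itself, so a speculatively-bypassed call into the lock
     * function cannot leave the barrier behind as well.
     */
    static inline __attribute__((always_inline)) void
    demo_spin_lock(pthread_mutex_t *l)
    {
        pthread_mutex_lock(l);
        block_lock_speculation();
    }

    int main(void)
    {
        demo_spin_lock(&demo_lock);
        demo_data = 1;                  /* lock-protected critical region */
        pthread_mutex_unlock(&demo_lock);
        printf("%d\n", demo_data);
        return 0;
    }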
-
-Convert the spinlock acquire macros into always-inline functions, and introduce
-a speculation barrier after the lock has been taken. Note the speculation
-barrier is not placed inside the implementation of the spin lock functions, so
-as to prevent speculation from falling through the call to the lock functions,
-resulting in the barrier also being skipped.
-
-trylock variants are protected using a construct akin to the existing
-evaluate_nospec().
-
-This patch only implements the speculation barrier for x86.
-
-Note spin locks are the only locking primitive taken care of in this change;
-further locking primitives will be adjusted by separate changes.
-
-This is part of XSA-453 / CVE-2024-2193
-
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit 7ef0084418e188d05f338c3e028fbbe8b6924afa)
---
- docs/misc/xen-command-line.pandoc | 7 ++++-
- xen/arch/x86/include/asm/cpufeatures.h | 2 +-
- xen/arch/x86/include/asm/nospec.h | 26 ++++++++++++++++++
- xen/arch/x86/spec_ctrl.c | 26 +++++++++++++++---
- xen/common/Kconfig | 17 ++++++++++++
- xen/include/xen/nospec.h | 15 +++++++++++
- xen/include/xen/spinlock.h | 37 +++++++++++++++++++++-----
- 7 files changed, 119 insertions(+), 11 deletions(-)
-
-diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
-index fbf1683924..3f9f916718 100644
---- a/docs/misc/xen-command-line.pandoc
-+++ b/docs/misc/xen-command-line.pandoc
-@@ -2373,7 +2373,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
- > {msr-sc,rsb,verw,ibpb-entry}=<bool>|{pv,hvm}=<bool>,
- > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd,
- > eager-fpu,l1d-flush,branch-harden,srb-lock,
--> unpriv-mmio,gds-mit,div-scrub}=<bool> ]`
-+> unpriv-mmio,gds-mit,div-scrub,lock-harden}=<bool> ]`
-
- Controls for speculative execution sidechannel mitigations. By default, Xen
- will pick the most appropriate mitigations based on compiled in support,
-@@ -2500,6 +2500,11 @@ On all hardware, the `div-scrub=` option can be used to force or prevent Xen
- from mitigating the DIV-leakage vulnerability. By default, Xen will mitigate
- DIV-leakage on hardware believed to be vulnerable.
-
-+If Xen is compiled with `CONFIG_SPECULATIVE_HARDEN_LOCK`, the `lock-harden=`
-+boolean can be used to force or prevent Xen from using speculation barriers to
-+protect lock critical regions. This mitigation won't be engaged by default,
-+and needs to be explicitly enabled on the command line.
-+
- ### sync_console
- > `= <boolean>`
-
-diff --git a/xen/arch/x86/include/asm/cpufeatures.h b/xen/arch/x86/include/asm/cpufeatures.h
-index c3aad21c3b..7e8221fd85 100644
---- a/xen/arch/x86/include/asm/cpufeatures.h
-+++ b/xen/arch/x86/include/asm/cpufeatures.h
-@@ -24,7 +24,7 @@ XEN_CPUFEATURE(APERFMPERF, X86_SYNTH( 8)) /* APERFMPERF */
- XEN_CPUFEATURE(MFENCE_RDTSC, X86_SYNTH( 9)) /* MFENCE synchronizes RDTSC */
- XEN_CPUFEATURE(XEN_SMEP, X86_SYNTH(10)) /* SMEP gets used by Xen itself */
- XEN_CPUFEATURE(XEN_SMAP, X86_SYNTH(11)) /* SMAP gets used by Xen itself */
---/* Bit 12 unused. 
*/ -+XEN_CPUFEATURE(SC_NO_LOCK_HARDEN, X86_SYNTH(12)) /* (Disable) Lock critical region hardening */ - XEN_CPUFEATURE(IND_THUNK_LFENCE, X86_SYNTH(13)) /* Use IND_THUNK_LFENCE */ - XEN_CPUFEATURE(IND_THUNK_JMP, X86_SYNTH(14)) /* Use IND_THUNK_JMP */ - XEN_CPUFEATURE(SC_NO_BRANCH_HARDEN, X86_SYNTH(15)) /* (Disable) Conditional branch hardening */ -diff --git a/xen/arch/x86/include/asm/nospec.h b/xen/arch/x86/include/asm/nospec.h -index 7150e76b87..0725839e19 100644 ---- a/xen/arch/x86/include/asm/nospec.h -+++ b/xen/arch/x86/include/asm/nospec.h -@@ -38,6 +38,32 @@ static always_inline void block_speculation(void) - barrier_nospec_true(); - } - -+static always_inline void arch_block_lock_speculation(void) -+{ -+ alternative("lfence", "", X86_FEATURE_SC_NO_LOCK_HARDEN); -+} -+ -+/* Allow to insert a read memory barrier into conditionals */ -+static always_inline bool barrier_lock_true(void) -+{ -+ alternative("lfence #nospec-true", "", X86_FEATURE_SC_NO_LOCK_HARDEN); -+ return true; -+} -+ -+static always_inline bool barrier_lock_false(void) -+{ -+ alternative("lfence #nospec-false", "", X86_FEATURE_SC_NO_LOCK_HARDEN); -+ return false; -+} -+ -+static always_inline bool arch_lock_evaluate_nospec(bool condition) -+{ -+ if ( condition ) -+ return barrier_lock_true(); -+ else -+ return barrier_lock_false(); -+} -+ - #endif /* _ASM_X86_NOSPEC_H */ - - /* -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 8165379fed..5dfc4ed69e 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -53,6 +53,7 @@ int8_t __read_mostly opt_eager_fpu = -1; - int8_t __read_mostly opt_l1d_flush = -1; - static bool __initdata opt_branch_harden = - IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH); -+static bool __initdata opt_lock_harden; - - bool __initdata bsp_delay_spec_ctrl; - uint8_t __read_mostly default_xen_spec_ctrl; -@@ -121,6 +122,7 @@ static int __init cf_check parse_spec_ctrl(const char *s) - opt_ssbd = false; - opt_l1d_flush = 0; - opt_branch_harden = false; -+ opt_lock_harden = false; - opt_srb_lock = 0; - opt_unpriv_mmio = false; - opt_gds_mit = 0; -@@ -286,6 +288,16 @@ static int __init cf_check parse_spec_ctrl(const char *s) - rc = -EINVAL; - } - } -+ else if ( (val = parse_boolean("lock-harden", s, ss)) >= 0 ) -+ { -+ if ( IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) ) -+ opt_lock_harden = val; -+ else -+ { -+ no_config_param("SPECULATIVE_HARDEN_LOCK", "spec-ctrl", s, ss); -+ rc = -EINVAL; -+ } -+ } - else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) - opt_srb_lock = val; - else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) -@@ -488,7 +500,8 @@ static void __init print_details(enum ind_thunk thunk) - if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) || - IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_ARRAY) || - IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) || -- IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) ) -+ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) || -+ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) ) - printk(" Compiled-in support:" - #ifdef CONFIG_INDIRECT_THUNK - " INDIRECT_THUNK" -@@ -504,11 +517,14 @@ static void __init print_details(enum ind_thunk thunk) - #endif - #ifdef CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS - " HARDEN_GUEST_ACCESS" -+#endif -+#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK -+ " HARDEN_LOCK" - #endif - "\n"); - - /* Settings for Xen's protection, irrespective of guests. 
*/ -- printk(" Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", -+ printk(" Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s%s\n", - thunk != THUNK_NONE ? "BTI-Thunk: " : "", - thunk == THUNK_NONE ? "" : - thunk == THUNK_RETPOLINE ? "RETPOLINE, " : -@@ -535,7 +551,8 @@ static void __init print_details(enum ind_thunk thunk) - opt_verw_pv || opt_verw_hvm || - opt_verw_mmio ? " VERW" : "", - opt_div_scrub ? " DIV" : "", -- opt_branch_harden ? " BRANCH_HARDEN" : ""); -+ opt_branch_harden ? " BRANCH_HARDEN" : "", -+ opt_lock_harden ? " LOCK_HARDEN" : ""); - - /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ - if ( cpu_has_bug_l1tf || opt_pv_l1tf_hwdom || opt_pv_l1tf_domu ) -@@ -1918,6 +1935,9 @@ void __init init_speculation_mitigations(void) - if ( !opt_branch_harden ) - setup_force_cpu_cap(X86_FEATURE_SC_NO_BRANCH_HARDEN); - -+ if ( !opt_lock_harden ) -+ setup_force_cpu_cap(X86_FEATURE_SC_NO_LOCK_HARDEN); -+ - /* - * We do not disable HT by default on affected hardware. - * -diff --git a/xen/common/Kconfig b/xen/common/Kconfig -index 4d6fe05164..3361a6d892 100644 ---- a/xen/common/Kconfig -+++ b/xen/common/Kconfig -@@ -188,6 +188,23 @@ config SPECULATIVE_HARDEN_GUEST_ACCESS - - If unsure, say Y. - -+config SPECULATIVE_HARDEN_LOCK -+ bool "Speculative lock context hardening" -+ default y -+ depends on X86 -+ help -+ Contemporary processors may use speculative execution as a -+ performance optimisation, but this can potentially be abused by an -+ attacker to leak data via speculative sidechannels. -+ -+ One source of data leakage is via speculative accesses to lock -+ critical regions. -+ -+ This option is disabled by default at run time, and needs to be -+ enabled on the command line. -+ -+ If unsure, say Y. 
-+ - endmenu - - config DIT_DEFAULT -diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h -index 76255bc46e..4552846403 100644 ---- a/xen/include/xen/nospec.h -+++ b/xen/include/xen/nospec.h -@@ -70,6 +70,21 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, - #define array_access_nospec(array, index) \ - (array)[array_index_nospec(index, ARRAY_SIZE(array))] - -+static always_inline void block_lock_speculation(void) -+{ -+#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK -+ arch_block_lock_speculation(); -+#endif -+} -+ -+static always_inline bool lock_evaluate_nospec(bool condition) -+{ -+#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK -+ return arch_lock_evaluate_nospec(condition); -+#endif -+ return condition; -+} -+ - #endif /* XEN_NOSPEC_H */ - - /* -diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h -index e7a1c1aa89..28fce5615e 100644 ---- a/xen/include/xen/spinlock.h -+++ b/xen/include/xen/spinlock.h -@@ -1,6 +1,7 @@ - #ifndef __SPINLOCK_H__ - #define __SPINLOCK_H__ - -+#include <xen/nospec.h> - #include <xen/time.h> - #include <xen/types.h> - -@@ -195,13 +196,30 @@ int _spin_trylock_recursive(spinlock_t *lock); - void _spin_lock_recursive(spinlock_t *lock); - void _spin_unlock_recursive(spinlock_t *lock); - --#define spin_lock(l) _spin_lock(l) --#define spin_lock_cb(l, c, d) _spin_lock_cb(l, c, d) --#define spin_lock_irq(l) _spin_lock_irq(l) -+static always_inline void spin_lock(spinlock_t *l) -+{ -+ _spin_lock(l); -+ block_lock_speculation(); -+} -+ -+static always_inline void spin_lock_cb(spinlock_t *l, void (*c)(void *data), -+ void *d) -+{ -+ _spin_lock_cb(l, c, d); -+ block_lock_speculation(); -+} -+ -+static always_inline void spin_lock_irq(spinlock_t *l) -+{ -+ _spin_lock_irq(l); -+ block_lock_speculation(); -+} -+ - #define spin_lock_irqsave(l, f) \ - ({ \ - BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ - ((f) = _spin_lock_irqsave(l)); \ -+ block_lock_speculation(); \ - }) - - #define spin_unlock(l) _spin_unlock(l) -@@ -209,7 +227,7 @@ void _spin_unlock_recursive(spinlock_t *lock); - #define spin_unlock_irqrestore(l, f) _spin_unlock_irqrestore(l, f) - - #define spin_is_locked(l) _spin_is_locked(l) --#define spin_trylock(l) _spin_trylock(l) -+#define spin_trylock(l) lock_evaluate_nospec(_spin_trylock(l)) - - #define spin_trylock_irqsave(lock, flags) \ - ({ \ -@@ -230,8 +248,15 @@ void _spin_unlock_recursive(spinlock_t *lock); - * are any critical regions that cannot form part of such a set, they can use - * standard spin_[un]lock(). - */ --#define spin_trylock_recursive(l) _spin_trylock_recursive(l) --#define spin_lock_recursive(l) _spin_lock_recursive(l) -+#define spin_trylock_recursive(l) \ -+ lock_evaluate_nospec(_spin_trylock_recursive(l)) -+ -+static always_inline void spin_lock_recursive(spinlock_t *l) -+{ -+ _spin_lock_recursive(l); -+ block_lock_speculation(); -+} -+ - #define spin_unlock_recursive(l) _spin_unlock_recursive(l) - - #endif /* __SPINLOCK_H__ */ --- -2.44.0 - - -From e7f0f11c888757e62940ded87b4ab5ebc992764f Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 13 Feb 2024 16:08:52 +0100 -Subject: [PATCH 66/70] rwlock: introduce support for blocking speculation into - critical regions -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Introduce inline wrappers as required and add direct calls to -block_lock_speculation() in order to prevent speculation into the rwlock -protected critical regions. 
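The trylock treatment referred to below can also be sketched stand-alone:
lock_evaluate_nospec() places a barrier on both outcomes, so neither branch of
the conditional can be entered speculatively with the wrong result (names and
the single-flag trylock are hypothetical; lfence again stands in for the
alternative()-patched barrier, x86-only):

    #include <stdbool.h>
    #include <stdio.h>

    /* Both outcomes of the trylock get a speculation barrier. */
    static inline __attribute__((always_inline)) bool barrier_lock_true(void)
    {
        asm volatile ( "lfence" ::: "memory" );
        return true;
    }

    static inline __attribute__((always_inline)) bool barrier_lock_false(void)
    {
        asm volatile ( "lfence" ::: "memory" );
        return false;
    }

    static inline __attribute__((always_inline)) bool
    lock_evaluate_nospec(bool condition)
    {
        return condition ? barrier_lock_true() : barrier_lock_false();
    }

    /* Hypothetical single-flag trylock, for demonstration only. */
    static bool _demo_trylock(int *lock)
    {
        if ( *lock )
            return false;
        *lock = 1;
        return true;
    }

    #define demo_trylock(l) lock_evaluate_nospec(_demo_trylock(l))

    int main(void)
    {
        int lock = 0;

        if ( demo_trylock(&lock) )
            printf("lock taken\n");
        return 0;
    }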
-
-Note the rwlock primitives are adjusted to use the non speculation safe variants
-of the spinlock handlers, as a speculation barrier is added in the rwlock
-calling wrappers.
-
-trylock variants are protected by using lock_evaluate_nospec().
-
-This is part of XSA-453 / CVE-2024-2193
-
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit a1fb15f61692b1fa9945fc51f55471ace49cdd59)
---
- xen/common/rwlock.c | 14 +++++++++++---
- xen/include/xen/rwlock.h | 34 ++++++++++++++++++++++++++++------
- 2 files changed, 39 insertions(+), 9 deletions(-)
-
-diff --git a/xen/common/rwlock.c b/xen/common/rwlock.c
-index 18224a4bb5..290602936d 100644
---- a/xen/common/rwlock.c
-+++ b/xen/common/rwlock.c
-@@ -34,8 +34,11 @@ void queue_read_lock_slowpath(rwlock_t *lock)
-
- /*
- * Put the reader into the wait queue.
-+ *
-+ * Use the speculation unsafe helper, as it's the caller's responsibility to
-+ * issue a speculation barrier if required.
- */
-- spin_lock(&lock->lock);
-+ _spin_lock(&lock->lock);
-
- /*
- * At the head of the wait queue now, wait until the writer state
-@@ -66,8 +69,13 @@ void queue_write_lock_slowpath(rwlock_t *lock)
- {
- u32 cnts;
-
-- /* Put the writer into the wait queue. */
-- spin_lock(&lock->lock);
-+ /*
-+ * Put the writer into the wait queue.
-+ *
-+ * Use the speculation unsafe helper, as it's the caller's responsibility to
-+ * issue a speculation barrier if required.
-+ */
-+ _spin_lock(&lock->lock);
-
- /* Try to acquire the lock directly if no reader is present. */
- if ( !atomic_read(&lock->cnts) &&
-diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h
-index e0d2b41c5c..9a0d3ec238 100644
---- a/xen/include/xen/rwlock.h
-+++ b/xen/include/xen/rwlock.h
-@@ -259,27 +259,49 @@ static inline int _rw_is_write_locked(const rwlock_t *lock)
- return (atomic_read(&lock->cnts) & _QW_WMASK) == _QW_LOCKED;
- }
-
--#define read_lock(l) _read_lock(l)
--#define read_lock_irq(l) _read_lock_irq(l)
-+static always_inline void read_lock(rwlock_t *l)
-+{
-+ _read_lock(l);
-+ block_lock_speculation();
-+}
-+
-+static always_inline void read_lock_irq(rwlock_t *l)
-+{
-+ _read_lock_irq(l);
-+ block_lock_speculation();
-+}
-+
- #define read_lock_irqsave(l, f) \
- ({ \
- BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \
- ((f) = _read_lock_irqsave(l)); \
-+ block_lock_speculation(); \
- })
-
- #define read_unlock(l) _read_unlock(l)
- #define read_unlock_irq(l) _read_unlock_irq(l)
- #define read_unlock_irqrestore(l, f) _read_unlock_irqrestore(l, f)
--#define read_trylock(l) _read_trylock(l)
-+#define read_trylock(l) lock_evaluate_nospec(_read_trylock(l))
-+
-+static always_inline void write_lock(rwlock_t *l)
-+{
-+ _write_lock(l);
-+ block_lock_speculation();
-+}
-+
-+static always_inline void write_lock_irq(rwlock_t *l)
-+{
-+ _write_lock_irq(l);
-+ block_lock_speculation();
-+}
-
--#define write_lock(l) _write_lock(l)
--#define write_lock_irq(l) _write_lock_irq(l)
- #define write_lock_irqsave(l, f) \
- ({ \
- BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \
- ((f) = _write_lock_irqsave(l)); \
-+ block_lock_speculation(); \
- })
--#define write_trylock(l) _write_trylock(l)
-+#define write_trylock(l) lock_evaluate_nospec(_write_trylock(l))
-
- #define write_unlock(l) _write_unlock(l)
- #define write_unlock_irq(l) _write_unlock_irq(l)
---
-2.44.0
-
-
-From 5a13c81542a163718d7cb9b150b0282b7855efde Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
-Date: Tue, 13 Feb 
2024 17:57:38 +0100
-Subject: [PATCH 67/70] percpu-rwlock: introduce support for blocking
- speculation into critical regions
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Add direct calls to block_lock_speculation() where required in order to prevent
-speculation into the lock protected critical regions. Also convert
-_percpu_read_lock() from inline to always_inline.
-
-Note that _percpu_write_lock() has been modified to use the non speculation
-safe variant of the locking primitives, as a speculation barrier is added
-unconditionally by the calling wrapper.
-
-This is part of XSA-453 / CVE-2024-2193
-
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit f218daf6d3a3b847736d37c6a6b76031a0d08441)
---
- xen/common/rwlock.c | 6 +++++-
- xen/include/xen/rwlock.h | 14 ++++++++++----
- 2 files changed, 15 insertions(+), 5 deletions(-)
-
-diff --git a/xen/common/rwlock.c b/xen/common/rwlock.c
-index 290602936d..f5a249bcc2 100644
---- a/xen/common/rwlock.c
-+++ b/xen/common/rwlock.c
-@@ -129,8 +129,12 @@ void _percpu_write_lock(percpu_rwlock_t **per_cpudata,
- /*
- * First take the write lock to protect against other writers or slow
- * path readers.
-+ *
-+ * Note we use the speculation unsafe variant of write_lock(), as the
-+ * calling wrapper already adds a speculation barrier after the lock has
-+ * been taken.
- */
-- write_lock(&percpu_rwlock->rwlock);
-+ _write_lock(&percpu_rwlock->rwlock);
-
- /* Now set the global variable so that readers start using read_lock. */
- percpu_rwlock->writer_activating = 1;
-diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h
-index 9a0d3ec238..9e35ee2edf 100644
---- a/xen/include/xen/rwlock.h
-+++ b/xen/include/xen/rwlock.h
-@@ -338,8 +338,8 @@ static inline void _percpu_rwlock_owner_check(percpu_rwlock_t **per_cpudata,
- #define percpu_rwlock_resource_init(l, owner) \
- (*(l) = (percpu_rwlock_t)PERCPU_RW_LOCK_UNLOCKED(&get_per_cpu_var(owner)))
-
--static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata,
-- percpu_rwlock_t *percpu_rwlock)
-+static always_inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata,
-+ percpu_rwlock_t *percpu_rwlock)
- {
- /* Validate the correct per_cpudata variable has been provided. */
- _percpu_rwlock_owner_check(per_cpudata, percpu_rwlock);
-@@ -374,6 +374,8 @@ static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata,
- }
- else
- {
-+ /* Other branch already has a speculation barrier in read_lock(). */
-+ block_lock_speculation();
- /* All other paths have implicit check_lock() calls via read_lock(). 
*/ - check_lock(&percpu_rwlock->rwlock.lock.debug, false); - } -@@ -430,8 +432,12 @@ static inline void _percpu_write_unlock(percpu_rwlock_t **per_cpudata, - _percpu_read_lock(&get_per_cpu_var(percpu), lock) - #define percpu_read_unlock(percpu, lock) \ - _percpu_read_unlock(&get_per_cpu_var(percpu), lock) --#define percpu_write_lock(percpu, lock) \ -- _percpu_write_lock(&get_per_cpu_var(percpu), lock) -+ -+#define percpu_write_lock(percpu, lock) \ -+({ \ -+ _percpu_write_lock(&get_per_cpu_var(percpu), lock); \ -+ block_lock_speculation(); \ -+}) - #define percpu_write_unlock(percpu, lock) \ - _percpu_write_unlock(&get_per_cpu_var(percpu), lock) - --- -2.44.0 - - -From 9de8a52b0e09a2491736abbd4a865a06ac2ced7a Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Mon, 4 Mar 2024 14:29:36 +0100 -Subject: [PATCH 68/70] locking: attempt to ensure lock wrappers are always - inline -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -In order to prevent the locking speculation barriers from being inside of -`call`ed functions that could be speculatively bypassed. - -While there also add an extra locking barrier to _mm_write_lock() in the branch -taken when the lock is already held. - -Note some functions are switched to use the unsafe variants (without speculation -barrier) of the locking primitives, but a speculation barrier is always added -to the exposed public lock wrapping helper. That's the case with -sched_spin_lock_double() or pcidevs_lock() for example. - -This is part of XSA-453 / CVE-2024-2193 - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 197ecd838a2aaf959a469df3696d4559c4f8b762) ---- - xen/arch/x86/hvm/vpt.c | 10 +++++++--- - xen/arch/x86/include/asm/irq.h | 1 + - xen/arch/x86/mm/mm-locks.h | 28 +++++++++++++++------------- - xen/arch/x86/mm/p2m-pod.c | 2 +- - xen/common/event_channel.c | 5 +++-- - xen/common/grant_table.c | 6 +++--- - xen/common/sched/core.c | 19 ++++++++++++------- - xen/common/sched/private.h | 26 ++++++++++++++++++++++++-- - xen/common/timer.c | 8 +++++--- - xen/drivers/passthrough/pci.c | 5 +++-- - xen/include/xen/event.h | 4 ++-- - xen/include/xen/pci.h | 8 ++++++-- - 12 files changed, 82 insertions(+), 40 deletions(-) - -diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c -index 8f53e88d67..e1d6845a28 100644 ---- a/xen/arch/x86/hvm/vpt.c -+++ b/xen/arch/x86/hvm/vpt.c -@@ -150,7 +150,7 @@ static int pt_irq_masked(struct periodic_time *pt) - * pt->vcpu field, because another thread holding the pt_migrate lock - * may already be spinning waiting for your vcpu lock. - */ --static void pt_vcpu_lock(struct vcpu *v) -+static always_inline void pt_vcpu_lock(struct vcpu *v) - { - spin_lock(&v->arch.hvm.tm_lock); - } -@@ -169,9 +169,13 @@ static void pt_vcpu_unlock(struct vcpu *v) - * need to take an additional lock that protects against pt->vcpu - * changing. - */ --static void pt_lock(struct periodic_time *pt) -+static always_inline void pt_lock(struct periodic_time *pt) - { -- read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); -+ /* -+ * Use the speculation unsafe variant for the first lock, as the following -+ * lock taking helper already includes a speculation barrier. 
-+ */ -+ _read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); - spin_lock(&pt->vcpu->arch.hvm.tm_lock); - } - -diff --git a/xen/arch/x86/include/asm/irq.h b/xen/arch/x86/include/asm/irq.h -index a87af47ece..465ab39bb0 100644 ---- a/xen/arch/x86/include/asm/irq.h -+++ b/xen/arch/x86/include/asm/irq.h -@@ -174,6 +174,7 @@ void cf_check irq_complete_move(struct irq_desc *desc); - - extern struct irq_desc *irq_desc; - -+/* Not speculation safe, only used for AP bringup. */ - void lock_vector_lock(void); - void unlock_vector_lock(void); - -diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h -index 5a3f96fbaa..5ec080c02f 100644 ---- a/xen/arch/x86/mm/mm-locks.h -+++ b/xen/arch/x86/mm/mm-locks.h -@@ -74,8 +74,8 @@ static inline void _set_lock_level(int l) - this_cpu(mm_lock_level) = l; - } - --static inline void _mm_lock(const struct domain *d, mm_lock_t *l, -- const char *func, int level, int rec) -+static always_inline void _mm_lock(const struct domain *d, mm_lock_t *l, -+ const char *func, int level, int rec) - { - if ( !((mm_locked_by_me(l)) && rec) ) - _check_lock_level(d, level); -@@ -125,8 +125,8 @@ static inline int mm_write_locked_by_me(mm_rwlock_t *l) - return (l->locker == get_processor_id()); - } - --static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, -- const char *func, int level) -+static always_inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, -+ const char *func, int level) - { - if ( !mm_write_locked_by_me(l) ) - { -@@ -137,6 +137,8 @@ static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, - l->unlock_level = _get_lock_level(); - _set_lock_level(_lock_level(d, level)); - } -+ else -+ block_speculation(); - l->recurse_count++; - } - -@@ -150,8 +152,8 @@ static inline void mm_write_unlock(mm_rwlock_t *l) - percpu_write_unlock(p2m_percpu_rwlock, &l->lock); - } - --static inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, -- int level) -+static always_inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, -+ int level) - { - _check_lock_level(d, level); - percpu_read_lock(p2m_percpu_rwlock, &l->lock); -@@ -166,15 +168,15 @@ static inline void mm_read_unlock(mm_rwlock_t *l) - - /* This wrapper uses the line number to express the locking order below */ - #define declare_mm_lock(name) \ -- static inline void mm_lock_##name(const struct domain *d, mm_lock_t *l, \ -- const char *func, int rec) \ -+ static always_inline void mm_lock_##name( \ -+ const struct domain *d, mm_lock_t *l, const char *func, int rec) \ - { _mm_lock(d, l, func, MM_LOCK_ORDER_##name, rec); } - #define declare_mm_rwlock(name) \ -- static inline void mm_write_lock_##name(const struct domain *d, \ -- mm_rwlock_t *l, const char *func) \ -+ static always_inline void mm_write_lock_##name( \ -+ const struct domain *d, mm_rwlock_t *l, const char *func) \ - { _mm_write_lock(d, l, func, MM_LOCK_ORDER_##name); } \ -- static inline void mm_read_lock_##name(const struct domain *d, \ -- mm_rwlock_t *l) \ -+ static always_inline void mm_read_lock_##name(const struct domain *d, \ -+ mm_rwlock_t *l) \ - { _mm_read_lock(d, l, MM_LOCK_ORDER_##name); } - /* These capture the name of the calling function */ - #define mm_lock(name, d, l) mm_lock_##name(d, l, __func__, 0) -@@ -309,7 +311,7 @@ declare_mm_lock(altp2mlist) - #define MM_LOCK_ORDER_altp2m 40 - declare_mm_rwlock(altp2m); - --static inline void p2m_lock(struct p2m_domain *p) -+static always_inline void p2m_lock(struct p2m_domain *p) - { - if ( p2m_is_altp2m(p) ) - 
-diff --git a/xen/arch/x86/include/asm/irq.h b/xen/arch/x86/include/asm/irq.h
-index a87af47ece..465ab39bb0 100644
---- a/xen/arch/x86/include/asm/irq.h
-+++ b/xen/arch/x86/include/asm/irq.h
-@@ -174,6 +174,7 @@ void cf_check irq_complete_move(struct irq_desc *desc);
- 
- extern struct irq_desc *irq_desc;
- 
-+/* Not speculation safe, only used for AP bringup. */
- void lock_vector_lock(void);
- void unlock_vector_lock(void);
- 
-diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h
-index 5a3f96fbaa..5ec080c02f 100644
---- a/xen/arch/x86/mm/mm-locks.h
-+++ b/xen/arch/x86/mm/mm-locks.h
-@@ -74,8 +74,8 @@ static inline void _set_lock_level(int l)
-     this_cpu(mm_lock_level) = l;
- }
- 
--static inline void _mm_lock(const struct domain *d, mm_lock_t *l,
--                            const char *func, int level, int rec)
-+static always_inline void _mm_lock(const struct domain *d, mm_lock_t *l,
-+                                   const char *func, int level, int rec)
- {
-     if ( !((mm_locked_by_me(l)) && rec) )
-         _check_lock_level(d, level);
-@@ -125,8 +125,8 @@ static inline int mm_write_locked_by_me(mm_rwlock_t *l)
-     return (l->locker == get_processor_id());
- }
- 
--static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l,
--                                  const char *func, int level)
-+static always_inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l,
-+                                         const char *func, int level)
- {
-     if ( !mm_write_locked_by_me(l) )
-     {
-@@ -137,6 +137,8 @@ static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l,
-         l->unlock_level = _get_lock_level();
-         _set_lock_level(_lock_level(d, level));
-     }
-+    else
-+        block_speculation();
-     l->recurse_count++;
- }
- 
-@@ -150,8 +152,8 @@ static inline void mm_write_unlock(mm_rwlock_t *l)
-     percpu_write_unlock(p2m_percpu_rwlock, &l->lock);
- }
- 
--static inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l,
--                                 int level)
-+static always_inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l,
-+                                        int level)
- {
-     _check_lock_level(d, level);
-     percpu_read_lock(p2m_percpu_rwlock, &l->lock);
-@@ -166,15 +168,15 @@ static inline void mm_read_unlock(mm_rwlock_t *l)
- 
- /* This wrapper uses the line number to express the locking order below */
- #define declare_mm_lock(name) \
--    static inline void mm_lock_##name(const struct domain *d, mm_lock_t *l, \
--                                      const char *func, int rec) \
-+    static always_inline void mm_lock_##name( \
-+        const struct domain *d, mm_lock_t *l, const char *func, int rec) \
-     { _mm_lock(d, l, func, MM_LOCK_ORDER_##name, rec); }
- #define declare_mm_rwlock(name) \
--    static inline void mm_write_lock_##name(const struct domain *d, \
--                                            mm_rwlock_t *l, const char *func) \
-+    static always_inline void mm_write_lock_##name( \
-+        const struct domain *d, mm_rwlock_t *l, const char *func) \
-     { _mm_write_lock(d, l, func, MM_LOCK_ORDER_##name); } \
--    static inline void mm_read_lock_##name(const struct domain *d, \
--                                           mm_rwlock_t *l) \
-+    static always_inline void mm_read_lock_##name(const struct domain *d, \
-+                                                  mm_rwlock_t *l) \
-     { _mm_read_lock(d, l, MM_LOCK_ORDER_##name); }
- /* These capture the name of the calling function */
- #define mm_lock(name, d, l) mm_lock_##name(d, l, __func__, 0)
-@@ -309,7 +311,7 @@ declare_mm_lock(altp2mlist)
- #define MM_LOCK_ORDER_altp2m 40
- declare_mm_rwlock(altp2m);
- 
--static inline void p2m_lock(struct p2m_domain *p)
-+static always_inline void p2m_lock(struct p2m_domain *p)
- {
-     if ( p2m_is_altp2m(p) )
-         mm_write_lock(altp2m, p->domain, &p->lock);
-diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
-index 9969eb45fa..9be67b63ce 100644
---- a/xen/arch/x86/mm/p2m-pod.c
-+++ b/xen/arch/x86/mm/p2m-pod.c
-@@ -24,7 +24,7 @@
- #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
- 
- /* Enforce lock ordering when grabbing the "external" page_alloc lock */
--static inline void lock_page_alloc(struct p2m_domain *p2m)
-+static always_inline void lock_page_alloc(struct p2m_domain *p2m)
- {
-     page_alloc_mm_pre_lock(p2m->domain);
-     spin_lock(&(p2m->domain->page_alloc_lock));
-diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
-index a7a004a084..66f924a7b0 100644
---- a/xen/common/event_channel.c
-+++ b/xen/common/event_channel.c
-@@ -45,7 +45,7 @@
-  * just assume the event channel is free or unbound at the moment when the
-  * evtchn_read_trylock() returns false.
-  */
--static inline void evtchn_write_lock(struct evtchn *evtchn)
-+static always_inline void evtchn_write_lock(struct evtchn *evtchn)
- {
-     write_lock(&evtchn->lock);
- 
-@@ -351,7 +351,8 @@ int evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc, evtchn_port_t port)
-     return rc;
- }
- 
--static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn)
-+static always_inline void double_evtchn_lock(struct evtchn *lchn,
-+                                             struct evtchn *rchn)
- {
-     ASSERT(lchn != rchn);
- 
-diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
-index 89b7811c51..934924cbda 100644
---- a/xen/common/grant_table.c
-+++ b/xen/common/grant_table.c
-@@ -403,7 +403,7 @@ static inline void act_set_gfn(struct active_grant_entry *act, gfn_t gfn)
- 
- static DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock);
- 
--static inline void grant_read_lock(struct grant_table *gt)
-+static always_inline void grant_read_lock(struct grant_table *gt)
- {
-     percpu_read_lock(grant_rwlock, &gt->lock);
- }
-@@ -413,7 +413,7 @@ static inline void grant_read_unlock(struct grant_table *gt)
-     percpu_read_unlock(grant_rwlock, &gt->lock);
- }
- 
--static inline void grant_write_lock(struct grant_table *gt)
-+static always_inline void grant_write_lock(struct grant_table *gt)
- {
-     percpu_write_lock(grant_rwlock, &gt->lock);
- }
-@@ -450,7 +450,7 @@ nr_active_grant_frames(struct grant_table *gt)
-     return num_act_frames_from_sha_frames(nr_grant_frames(gt));
- }
- 
--static inline struct active_grant_entry *
-+static always_inline struct active_grant_entry *
- active_entry_acquire(struct grant_table *t, grant_ref_t e)
- {
-     struct active_grant_entry *act;
-diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
-index 901782bbb4..34ad39b9ad 100644
---- a/xen/common/sched/core.c
-+++ b/xen/common/sched/core.c
-@@ -348,23 +348,28 @@ uint64_t get_cpu_idle_time(unsigned int cpu)
-  * This avoids dead- or live-locks when this code is running on both
-  * cpus at the same time.
-  */
--static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
--                                   unsigned long *flags)
-+static always_inline void sched_spin_lock_double(
-+    spinlock_t *lock1, spinlock_t *lock2, unsigned long *flags)
- {
-+    /*
-+     * In order to avoid extra overhead, use the locking primitives without the
-+     * speculation barrier, and introduce a single barrier here.
-+     */
-     if ( lock1 == lock2 )
-     {
--        spin_lock_irqsave(lock1, *flags);
-+        *flags = _spin_lock_irqsave(lock1);
-     }
-     else if ( lock1 < lock2 )
-     {
--        spin_lock_irqsave(lock1, *flags);
--        spin_lock(lock2);
-+        *flags = _spin_lock_irqsave(lock1);
-+        _spin_lock(lock2);
-     }
-     else
-     {
--        spin_lock_irqsave(lock2, *flags);
--        spin_lock(lock1);
-+        *flags = _spin_lock_irqsave(lock2);
-+        _spin_lock(lock1);
-     }
-+    block_lock_speculation();
- }
- 
- static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
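
Besides the single trailing barrier, sched_spin_lock_double() also shows the classic address-ordering discipline for taking two locks of the same class: callers on two CPUs may pass the same pair in opposite order, but both acquire in the same global order and therefore cannot deadlock. A standalone illustration with pthread mutexes (toy code, not the Xen implementation; comparing unrelated pointers is formally unspecified in ISO C but is the established practice the patch itself relies on):

#include <pthread.h>

static void toy_lock_double(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
    if ( l1 == l2 )
        pthread_mutex_lock(l1);      /* same lock: take it only once */
    else if ( l1 < l2 )
    {
        pthread_mutex_lock(l1);      /* lower address first ... */
        pthread_mutex_lock(l2);
    }
    else
    {
        pthread_mutex_lock(l2);      /* ... regardless of argument order */
        pthread_mutex_lock(l1);
    }
    /* the patch then issues its single block_lock_speculation() here */
}
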
-diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h
-index c516976c37..3b97f15767 100644
---- a/xen/common/sched/private.h
-+++ b/xen/common/sched/private.h
-@@ -207,8 +207,24 @@ DECLARE_PER_CPU(cpumask_t, cpumask_scratch);
- #define cpumask_scratch (&this_cpu(cpumask_scratch))
- #define cpumask_scratch_cpu(c) (&per_cpu(cpumask_scratch, c))
- 
-+/*
-+ * Deal with _spin_lock_irqsave() returning the flags value instead of storing
-+ * it in a passed parameter.
-+ */
-+#define _sched_spinlock0(lock, irq) _spin_lock##irq(lock)
-+#define _sched_spinlock1(lock, irq, arg) ({ \
-+    BUILD_BUG_ON(sizeof(arg) != sizeof(unsigned long)); \
-+    (arg) = _spin_lock##irq(lock); \
-+})
-+
-+#define _sched_spinlock__(nr) _sched_spinlock ## nr
-+#define _sched_spinlock_(nr) _sched_spinlock__(nr)
-+#define _sched_spinlock(lock, irq, args...) \
-+    _sched_spinlock_(count_args(args))(lock, irq, ## args)
-+
- #define sched_lock(kind, param, cpu, irq, arg...) \
--static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \
-+static always_inline spinlock_t \
-+*kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \
- { \
-     for ( ; ; ) \
-     { \
-@@ -220,10 +236,16 @@ static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \
-          * \
-          * It may also be the case that v->processor may change but the \
-          * lock may be the same; this will succeed in that case. \
-+         * \
-+         * Use the speculation unsafe locking helper, there's a speculation \
-+         * barrier before returning to the caller. \
-          */ \
--        spin_lock##irq(lock, ## arg); \
-+        _sched_spinlock(lock, irq, ## arg); \
-         if ( likely(lock == get_sched_res(cpu)->schedule_lock) ) \
-+        { \
-+            block_lock_speculation(); \
-             return lock; \
-+        } \
-         spin_unlock##irq(lock, ## arg); \
-     } \
- }
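
The _sched_spinlock helpers above dispatch on the number of trailing macro arguments, so that the flags-returning _spin_lock_irqsave() and the plain _spin_lock() can hide behind one name. A cut-down demonstration of that token-pasting trick (TOY_COUNT_ARGS is a simplified stand-in for Xen's count_args(); take_lock()/take_lock_irqsave() are hypothetical primitives; GNU C `args...` semantics assumed, as in the patch):

/* Simplified count_args(): distinguishes 0 versus 1 trailing argument. */
#define TOY_COUNT_ARGS_(dot, a1, x, ...) x
#define TOY_COUNT_ARGS(args...) TOY_COUNT_ARGS_(., ## args, 1, 0)

/* Hypothetical primitives standing in for _spin_lock()/_spin_lock_irqsave(). */
void take_lock(void *lock);
unsigned long take_lock_irqsave(void *lock);

#define toy_spinlock0(lock)      take_lock(lock)
#define toy_spinlock1(lock, arg) ((arg) = take_lock_irqsave(lock))

#define toy_spinlock__(nr) toy_spinlock ## nr
#define toy_spinlock_(nr)  toy_spinlock__(nr)
#define toy_spinlock(lock, args...) \
    toy_spinlock_(TOY_COUNT_ARGS(args))(lock, ## args)

/*
 * toy_spinlock(&l)        expands to take_lock(&l)
 * toy_spinlock(&l, flags) expands to ((flags) = take_lock_irqsave(&l))
 */
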
-diff --git a/xen/common/timer.c b/xen/common/timer.c
-index 0fddfa7487..38eb5fd20d 100644
---- a/xen/common/timer.c
-+++ b/xen/common/timer.c
-@@ -239,7 +239,7 @@ static inline void deactivate_timer(struct timer *timer)
-     list_add(&timer->inactive, &per_cpu(timers, timer->cpu).inactive);
- }
- 
--static inline bool_t timer_lock(struct timer *timer)
-+static inline bool_t timer_lock_unsafe(struct timer *timer)
- {
-     unsigned int cpu;
- 
-@@ -253,7 +253,8 @@ static inline bool_t timer_lock(struct timer *timer)
-             rcu_read_unlock(&timer_cpu_read_lock);
-             return 0;
-         }
--        spin_lock(&per_cpu(timers, cpu).lock);
-+        /* Use the speculation unsafe variant, the wrapper has the barrier. */
-+        _spin_lock(&per_cpu(timers, cpu).lock);
-         if ( likely(timer->cpu == cpu) )
-             break;
-         spin_unlock(&per_cpu(timers, cpu).lock);
-@@ -266,8 +267,9 @@ static inline bool_t timer_lock(struct timer *timer)
- #define timer_lock_irqsave(t, flags) ({ \
-     bool_t __x; \
-     local_irq_save(flags); \
--    if ( !(__x = timer_lock(t)) ) \
-+    if ( !(__x = timer_lock_unsafe(t)) ) \
-         local_irq_restore(flags); \
-+    block_lock_speculation(); \
-     __x; \
- })
- 
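
timer_lock_unsafe() has to find the right lock through the very field that lock protects, hence its lock/recheck/retry loop: a timer can migrate to another CPU while we block on the old CPU's lock. The same pattern in a self-contained toy (the toy_obj type, home field, and toy_locks array are hypothetical; C11 _Atomic is used where Xen relies on its own accessors):

#include <pthread.h>
#include <stdatomic.h>

/* Hypothetical object whose protecting lock is found via the object itself. */
struct toy_obj {
    _Atomic int home;                /* index of the lock owning this object */
};

extern pthread_mutex_t toy_locks[];

static void toy_obj_lock(struct toy_obj *obj)
{
    for ( ; ; )
    {
        int home = obj->home;

        pthread_mutex_lock(&toy_locks[home]);
        if ( obj->home == home )     /* still owned by the lock we took? */
            break;
        /* object migrated while we were blocking: drop and retry */
        pthread_mutex_unlock(&toy_locks[home]);
    }
}
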
-diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
-index e99837b6e1..2a1e7ee89a 100644
---- a/xen/drivers/passthrough/pci.c
-+++ b/xen/drivers/passthrough/pci.c
-@@ -52,9 +52,10 @@ struct pci_seg {
- 
- static spinlock_t _pcidevs_lock = SPIN_LOCK_UNLOCKED;
- 
--void pcidevs_lock(void)
-+/* Do not use, as it has no speculation barrier, use pcidevs_lock() instead. */
-+void pcidevs_lock_unsafe(void)
- {
--    spin_lock_recursive(&_pcidevs_lock);
-+    _spin_lock_recursive(&_pcidevs_lock);
- }
- 
- void pcidevs_unlock(void)
-diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h
-index 8e509e0784..f1472ea1eb 100644
---- a/xen/include/xen/event.h
-+++ b/xen/include/xen/event.h
-@@ -114,12 +114,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport);
- #define bucket_from_port(d, p) \
-     ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET])
- 
--static inline void evtchn_read_lock(struct evtchn *evtchn)
-+static always_inline void evtchn_read_lock(struct evtchn *evtchn)
- {
-     read_lock(&evtchn->lock);
- }
- 
--static inline bool evtchn_read_trylock(struct evtchn *evtchn)
-+static always_inline bool evtchn_read_trylock(struct evtchn *evtchn)
- {
-     return read_trylock(&evtchn->lock);
- }
-diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
-index 251b8761a8..a71bed36be 100644
---- a/xen/include/xen/pci.h
-+++ b/xen/include/xen/pci.h
-@@ -155,8 +155,12 @@ struct pci_dev {
-  * devices, it also sync the access to the msi capability that is not
-  * interrupt handling related (the mask bit register).
-  */
--
--void pcidevs_lock(void);
-+void pcidevs_lock_unsafe(void);
-+static always_inline void pcidevs_lock(void)
-+{
-+    pcidevs_lock_unsafe();
-+    block_lock_speculation();
-+}
- void pcidevs_unlock(void);
- bool __must_check pcidevs_locked(void);
- 
--- 
-2.44.0
-
-
-From e107a8ece71ec4e1bb0092d5beea6cb16a96f7ae Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
-Date: Mon, 4 Mar 2024 18:08:48 +0100
-Subject: [PATCH 69/70] x86/mm: add speculation barriers to open coded locks
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Add a speculation barrier to the clearly identified open-coded lock taking
-functions.
-
-Note that the memory sharing page_lock() replacement (_page_lock()) is left
-as-is, as the code is experimental and not security supported.
-
-This is part of XSA-453 / CVE-2024-2193
-
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit 42a572a38e22a97d86a4b648a22597628d5b42e4)
----
- xen/arch/x86/include/asm/mm.h | 4 +++-
- xen/arch/x86/mm.c             | 6 ++++--
- 2 files changed, 7 insertions(+), 3 deletions(-)
-
-diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h
-index 05dfe35502..d1b1fee99b 100644
---- a/xen/arch/x86/include/asm/mm.h
-+++ b/xen/arch/x86/include/asm/mm.h
-@@ -399,7 +399,9 @@ const struct platform_bad_page *get_platform_badpages(unsigned int *array_size);
-  * The use of PGT_locked in mem_sharing does not collide, since mem_sharing is
-  * only supported for hvm guests, which do not have PV PTEs updated.
-  */
--int page_lock(struct page_info *page);
-+int page_lock_unsafe(struct page_info *page);
-+#define page_lock(pg) lock_evaluate_nospec(page_lock_unsafe(pg))
-+
- void page_unlock(struct page_info *page);
- 
- void put_page_type(struct page_info *page);
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index ab0acbfea6..000fd0fb55 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -2017,7 +2017,7 @@ static inline bool current_locked_page_ne_check(struct page_info *page) {
- #define current_locked_page_ne_check(x) true
- #endif
- 
--int page_lock(struct page_info *page)
-+int page_lock_unsafe(struct page_info *page)
- {
-     unsigned long x, nx;
- 
-@@ -2078,7 +2078,7 @@ void page_unlock(struct page_info *page)
-  * l3t_lock(), so to avoid deadlock we must avoid grabbing them in
-  * reverse order.
-  */
--static void l3t_lock(struct page_info *page)
-+static always_inline void l3t_lock(struct page_info *page)
- {
-     unsigned long x, nx;
- 
-@@ -2087,6 +2087,8 @@ static void l3t_lock(struct page_info *page)
-             cpu_relax();
-         nx = x | PGT_locked;
-     } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
-+
-+    block_lock_speculation();
- }
- 
- static void l3t_unlock(struct page_info *page)
--- 
-2.44.0
-
-
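
l3t_lock() above is an open-coded bit-spinlock on the page's type_info word, which is why it needs an explicit barrier rather than inheriting one from a generic wrapper. Roughly the same loop with C11 atomics instead of Xen's cmpxchg() (TOY_LOCKED is an illustrative flag bit; the trailing fence merely marks where the patch places block_lock_speculation()):

#include <stdatomic.h>

#define TOY_LOCKED 0x1UL             /* illustrative stand-in for PGT_locked */

static void toy_bit_lock(_Atomic unsigned long *type_info)
{
    unsigned long x, nx;

    do {
        while ( (x = atomic_load(type_info)) & TOY_LOCKED )
            ;                        /* spin while someone else holds the bit */
        nx = x | TOY_LOCKED;
    } while ( !atomic_compare_exchange_weak(type_info, &x, nx) );

    /* the patch follows the successful cmpxchg with block_lock_speculation() */
    atomic_thread_fence(memory_order_acquire);
}
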
-From 4da8ca9cb9cfdb92c9dd09d5270ae16a3b2dbc89 Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
-Date: Mon, 4 Mar 2024 16:24:21 +0100
-Subject: [PATCH 70/70] x86: protect conditional lock taking from speculative
- execution
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Conditionally taken locks that use the pattern:
-
-if ( lock )
-    spin_lock(...);
-
-Need an else branch in order to issue an speculation barrier in the else case,
-just like it's done in case the lock needs to be acquired.
-
-eval_nospec() could be used on the condition itself, but that would result in a
-double barrier on the branch where the lock is taken.
-
-Introduce a new pair of helpers, {gfn,spin}_lock_if() that can be used to
-conditionally take a lock in a speculation safe way.
-
-This is part of XSA-453 / CVE-2024-2193
-
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit 03cf7ca23e0e876075954c558485b267b7d02406)
----
- xen/arch/x86/mm.c          | 35 +++++++++++++----------------------
- xen/arch/x86/mm/mm-locks.h |  9 +++++++++
- xen/arch/x86/mm/p2m.c      |  5 ++---
- xen/include/xen/spinlock.h |  8 ++++++++
- 4 files changed, 32 insertions(+), 25 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index 000fd0fb55..45bfbc2522 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -5007,8 +5007,7 @@ static l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
-         if ( !l3t )
-             return NULL;
-         UNMAP_DOMAIN_PAGE(l3t);
--        if ( locking )
--            spin_lock(&map_pgdir_lock);
-+        spin_lock_if(locking, &map_pgdir_lock);
-         if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
-         {
-             l4_pgentry_t l4e = l4e_from_mfn(l3mfn, __PAGE_HYPERVISOR);
-@@ -5045,8 +5044,7 @@ static l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
-             return NULL;
-         }
-         UNMAP_DOMAIN_PAGE(l2t);
--        if ( locking )
--            spin_lock(&map_pgdir_lock);
-+        spin_lock_if(locking, &map_pgdir_lock);
-         if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
-         {
-             l3e_write(pl3e, l3e_from_mfn(l2mfn, __PAGE_HYPERVISOR));
-@@ -5084,8 +5082,7 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
-             return NULL;
-         }
-         UNMAP_DOMAIN_PAGE(l1t);
--        if ( locking )
--            spin_lock(&map_pgdir_lock);
-+        spin_lock_if(locking, &map_pgdir_lock);
-         if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
-         {
-             l2e_write(pl2e, l2e_from_mfn(l1mfn, __PAGE_HYPERVISOR));
-@@ -5116,6 +5113,8 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
-     do { \
-         if ( locking ) \
-             l3t_lock(page); \
-+        else \
-+            block_lock_speculation(); \
-     } while ( false )
- 
- #define L3T_UNLOCK(page) \
-@@ -5331,8 +5330,7 @@ int map_pages_to_xen(
-                 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
-                     flush_flags |= FLUSH_TLB_GLOBAL;
- 
--                if ( locking )
--                    spin_lock(&map_pgdir_lock);
-+                spin_lock_if(locking, &map_pgdir_lock);
-                 if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) &&
-                      (l3e_get_flags(*pl3e) & _PAGE_PSE) )
-                 {
-@@ -5436,8 +5434,7 @@ int map_pages_to_xen(
-                 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
-                     flush_flags |= FLUSH_TLB_GLOBAL;
- 
--                if ( locking )
--                    spin_lock(&map_pgdir_lock);
-+                spin_lock_if(locking, &map_pgdir_lock);
-                 if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) &&
-                      (l2e_get_flags(*pl2e) & _PAGE_PSE) )
-                 {
-@@ -5478,8 +5475,7 @@ int map_pages_to_xen(
-             unsigned long base_mfn;
-             const l1_pgentry_t *l1t;
- 
--            if ( locking )
--                spin_lock(&map_pgdir_lock);
-+            spin_lock_if(locking, &map_pgdir_lock);
- 
-             ol2e = *pl2e;
-             /*
-@@ -5533,8 +5529,7 @@ int map_pages_to_xen(
-             unsigned long base_mfn;
-             const l2_pgentry_t *l2t;
- 
--            if ( locking )
--                spin_lock(&map_pgdir_lock);
-+            spin_lock_if(locking, &map_pgdir_lock);
- 
-             ol3e = *pl3e;
-             /*
-@@ -5678,8 +5673,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
-                                        l3e_get_flags(*pl3e)));
-             UNMAP_DOMAIN_PAGE(l2t);
- 
--            if ( locking )
--                spin_lock(&map_pgdir_lock);
-+            spin_lock_if(locking, &map_pgdir_lock);
-             if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) &&
-                  (l3e_get_flags(*pl3e) & _PAGE_PSE) )
-             {
-@@ -5738,8 +5732,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
-                                        l2e_get_flags(*pl2e) & ~_PAGE_PSE));
-             UNMAP_DOMAIN_PAGE(l1t);
- 
--            if ( locking )
--                spin_lock(&map_pgdir_lock);
-+            spin_lock_if(locking, &map_pgdir_lock);
-             if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) &&
-                  (l2e_get_flags(*pl2e) & _PAGE_PSE) )
-             {
-@@ -5783,8 +5776,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
-          */
-         if ( (nf & _PAGE_PRESENT) || ((v != e) && (l1_table_offset(v) != 0)) )
-             continue;
--        if ( locking )
--            spin_lock(&map_pgdir_lock);
-+        spin_lock_if(locking, &map_pgdir_lock);
- 
-         /*
-          * L2E may be already cleared, or set to a superpage, by
-@@ -5831,8 +5823,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
-         if ( (nf & _PAGE_PRESENT) ||
-              ((v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0)) )
-             continue;
--        if ( locking )
--            spin_lock(&map_pgdir_lock);
-+        spin_lock_if(locking, &map_pgdir_lock);
- 
-         /*
-          * L3E may be already cleared, or set to a superpage, by
-diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h
-index 5ec080c02f..b4960fb90e 100644
---- a/xen/arch/x86/mm/mm-locks.h
-+++ b/xen/arch/x86/mm/mm-locks.h
-@@ -335,6 +335,15 @@ static inline void p2m_unlock(struct p2m_domain *p)
- #define p2m_locked_by_me(p) mm_write_locked_by_me(&(p)->lock)
- #define gfn_locked_by_me(p,g) p2m_locked_by_me(p)
- 
-+static always_inline void gfn_lock_if(bool condition, struct p2m_domain *p2m,
-+                                      gfn_t gfn, unsigned int order)
-+{
-+    if ( condition )
-+        gfn_lock(p2m, gfn, order);
-+    else
-+        block_lock_speculation();
-+}
-+
- /* PoD lock (per-p2m-table)
-  *
-  * Protects private PoD data structs: entry and cache
-diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
-index 0983bd71d9..22ab1d606e 100644
---- a/xen/arch/x86/mm/p2m.c
-+++ b/xen/arch/x86/mm/p2m.c
-@@ -280,9 +280,8 @@ mfn_t p2m_get_gfn_type_access(struct p2m_domain *p2m, gfn_t gfn,
-     if ( q & P2M_UNSHARE )
-         q |= P2M_ALLOC;
- 
--    if ( locked )
--        /* Grab the lock here, don't release until put_gfn */
--        gfn_lock(p2m, gfn, 0);
-+    /* Grab the lock here, don't release until put_gfn */
-+    gfn_lock_if(locked, p2m, gfn, 0);
- 
-     mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
- 
-diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h
-index 28fce5615e..c830df3430 100644
---- a/xen/include/xen/spinlock.h
-+++ b/xen/include/xen/spinlock.h
-@@ -222,6 +222,14 @@ static always_inline void spin_lock_irq(spinlock_t *l)
-     block_lock_speculation(); \
- })
- 
-+/* Conditionally take a spinlock in a speculation safe way. */
-+static always_inline void spin_lock_if(bool condition, spinlock_t *l)
-+{
-+    if ( condition )
-+        _spin_lock(l);
-+    block_lock_speculation();
-+}
-+
- #define spin_unlock(l) _spin_unlock(l)
- #define spin_unlock_irq(l) _spin_unlock_irq(l)
- #define spin_unlock_irqrestore(l, f) _spin_unlock_irqrestore(l, f)
--- 
-2.44.0
-
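
Taken together, spin_lock_if() and gfn_lock_if() make both outcomes of a conditional acquisition end in exactly one barrier: the lock-skipped path gets an explicit one, while wrapping the condition in eval_nospec() would have doubled the barrier on the taken path (the safe lock wrapper already issues one). The idea in isolation, with the same hypothetical pthread/lfence stand-ins as the earlier sketches:

#include <stdbool.h>
#include <pthread.h>

static inline void toy_block_lock_speculation(void)
{
    asm volatile ( "lfence" ::: "memory" ); /* x86-only stand-in */
}

/* Both branches converge on exactly one speculation barrier. */
static inline void toy_lock_if(bool condition, pthread_mutex_t *l)
{
    if ( condition )
        pthread_mutex_lock(l);       /* barrier-less primitive, as _spin_lock() */
    toy_block_lock_speculation();    /* taken or not, speculation stops here */
}
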