From d0ce95087288b30e5e211bac8e9a0817f2effcf5 Mon Sep 17 00:00:00 2001 From: Tomáš Mózes Date: Fri, 5 Apr 2024 08:59:40 +0200 Subject: Xen 4.17.4-pre-patchset-1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Tomáš Mózes --- 0001-update-Xen-version-to-4.17.4-pre.patch | 4 +- ...vice-assignment-if-phantom-functions-cann.patch | 4 +- 0003-VT-d-Fix-else-vs-endif-misplacement.patch | 4 +- ...end-CPU-erratum-1474-fix-to-more-affected.patch | 4 +- 0005-CirrusCI-drop-FreeBSD-12.patch | 4 +- ...nsure-Global-Performance-Counter-Control-.patch | 4 +- ...vmx-Fix-IRQ-handling-for-EXIT_REASON_INIT.patch | 4 +- ...vmx-Disallow-the-use-of-inactivity-states.patch | 4 +- ...-move-lib-fdt-elf-temp.o-and-their-deps-t.patch | 4 +- ...m-pt-fix-off-by-one-in-entry-check-assert.patch | 4 +- ...s-xentop-fix-sorting-bug-for-some-columns.patch | 67 ++++ 0012-amd-vi-fix-IVMD-memory-type-checks.patch | 53 +++ ...hvm-Fix-fast-singlestep-state-persistence.patch | 86 +++++ ...y-state-on-hvmemul_map_linear_addr-s-erro.patch | 63 ++++ 0015-build-Replace-which-with-command-v.patch | 57 +++ ...le-relocating-memory-for-qemu-xen-in-stub.patch | 50 +++ ...sure-build-fails-when-running-kconfig-fai.patch | 58 +++ 0018-x86emul-add-missing-EVEX.R-checks.patch | 50 +++ ...vepatch-fix-norevert-test-hook-setup-typo.patch | 36 ++ ...-fix-printf-format-specifier-in-no_config.patch | 38 ++ ...-use-a-union-as-register-type-for-functio.patch | 141 +++++++ ...x-BRANCH_HARDEN-option-to-only-be-set-whe.patch | 57 +++ ...-for-shadow-stack-in-exception-from-stub-.patch | 212 +++++++++++ 0024-xen-arm-Fix-UBSAN-failure-in-start_xen.patch | 52 +++ ...e-SVM-VMX-when-their-enabling-is-prohibit.patch | 67 ++++ ...sched-Fix-UB-shift-in-compat_set_timer_op.patch | 86 +++++ ...int-the-built-in-SPECULATIVE_HARDEN_-opti.patch | 54 +++ ...x-INDIRECT_THUNK-option-to-only-be-set-wh.patch | 67 ++++ ...-not-print-thunk-option-selection-if-not-.patch | 50 +++ ...ch-register-livepatch-regions-when-loaded.patch | 159 ++++++++ ...ch-search-for-symbols-in-all-loaded-paylo.patch | 149 ++++++++ ...ch-fix-norevert-test-attempt-to-open-code.patch | 186 ++++++++++ ...ch-properly-build-the-noapply-and-norever.patch | 43 +++ ...ix-segfault-in-device_model_spawn_outcome.patch | 39 ++ ...-always-use-a-temporary-parameter-stashin.patch | 197 ++++++++++ ...icy-Allow-for-levelling-of-VERW-side-effe.patch | 102 ++++++ ...CI-skip-huge-BARs-in-certain-calculations.patch | 99 +++++ ...detection-of-last-L1-entry-in-modify_xen_.patch | 41 +++ 0039-x86-entry-Introduce-EFRAME_-constants.patch | 314 ++++++++++++++++ 0040-x86-Resync-intel-family.h-from-Linux.patch | 98 +++++ ...form-VERW-flushing-later-in-the-VMExit-pa.patch | 146 ++++++++ ...rl-Perform-VERW-flushing-later-in-exit-pa.patch | 209 +++++++++++ ...x86-spec-ctrl-Rename-VERW-related-options.patch | 248 +++++++++++++ 0044-x86-spec-ctrl-VERW-handling-adjustments.patch | 171 +++++++++ ...rl-Mitigation-Register-File-Data-Sampling.patch | 320 ++++++++++++++++ ...-Delete-update_cr3-s-do_locking-parameter.patch | 161 ++++++++ ...-Swap-order-of-actions-in-the-FREE-macros.patch | 58 +++ ...k-introduce-support-for-blocking-speculat.patch | 331 +++++++++++++++++ ...oduce-support-for-blocking-speculation-in.patch | 125 +++++++ ...ck-introduce-support-for-blocking-specula.patch | 87 +++++ ...empt-to-ensure-lock-wrappers-are-always-i.patch | 405 +++++++++++++++++++++ ...-speculation-barriers-to-open-coded-locks.patch | 73 ++++ ...-conditional-lock-taking-from-speculative.patch | 216 
+++++++++++ ...s-ipxe-update-for-fixing-build-with-GCC12.patch | 33 ++ ...-block_lock_speculation-in-_mm_write_lock.patch | 35 ++ ...x-setup_apic_nmi_watchdog-to-fail-more-cl.patch | 120 ++++++ ...-together-P2M-update-and-increment-of-ent.patch | 61 ++++ ...tored-Use-Map-instead-of-Hashtbl-for-quot.patch | 143 ++++++++ 0059-tools-oxenstored-Make-Quota.t-pure.patch | 121 ++++++ ...x86-cpu-policy-Hide-x2APIC-from-PV-guests.patch | 90 +++++ ...icy-Fix-visibility-of-HTT-CMP_LEGACY-in-m.patch | 85 +++++ ...irtual-region-Rename-the-start-end-fields.patch | 140 +++++++ ...en-virtual-region-Include-rodata-pointers.patch | 71 ++++ ...livepatch-Relax-permissions-on-rodata-too.patch | 85 +++++ ...prove-the-boot-watchdog-determination-of-.patch | 106 ++++++ ...Support-the-watchdog-on-newer-AMD-systems.patch | 48 +++ ...s-resource-Fix-HVM-guest-in-SHADOW-builds.patch | 110 ++++++ info.txt | 4 +- 68 files changed, 6591 insertions(+), 22 deletions(-) create mode 100644 0011-tools-xentop-fix-sorting-bug-for-some-columns.patch create mode 100644 0012-amd-vi-fix-IVMD-memory-type-checks.patch create mode 100644 0013-x86-hvm-Fix-fast-singlestep-state-persistence.patch create mode 100644 0014-x86-HVM-tidy-state-on-hvmemul_map_linear_addr-s-erro.patch create mode 100644 0015-build-Replace-which-with-command-v.patch create mode 100644 0016-libxl-Disable-relocating-memory-for-qemu-xen-in-stub.patch create mode 100644 0017-build-make-sure-build-fails-when-running-kconfig-fai.patch create mode 100644 0018-x86emul-add-missing-EVEX.R-checks.patch create mode 100644 0019-xen-livepatch-fix-norevert-test-hook-setup-typo.patch create mode 100644 0020-xen-cmdline-fix-printf-format-specifier-in-no_config.patch create mode 100644 0021-x86-altcall-use-a-union-as-register-type-for-functio.patch create mode 100644 0022-x86-spec-fix-BRANCH_HARDEN-option-to-only-be-set-whe.patch create mode 100644 0023-x86-account-for-shadow-stack-in-exception-from-stub-.patch create mode 100644 0024-xen-arm-Fix-UBSAN-failure-in-start_xen.patch create mode 100644 0025-x86-HVM-hide-SVM-VMX-when-their-enabling-is-prohibit.patch create mode 100644 0026-xen-sched-Fix-UB-shift-in-compat_set_timer_op.patch create mode 100644 0027-x86-spec-print-the-built-in-SPECULATIVE_HARDEN_-opti.patch create mode 100644 0028-x86-spec-fix-INDIRECT_THUNK-option-to-only-be-set-wh.patch create mode 100644 0029-x86-spec-do-not-print-thunk-option-selection-if-not-.patch create mode 100644 0030-xen-livepatch-register-livepatch-regions-when-loaded.patch create mode 100644 0031-xen-livepatch-search-for-symbols-in-all-loaded-paylo.patch create mode 100644 0032-xen-livepatch-fix-norevert-test-attempt-to-open-code.patch create mode 100644 0033-xen-livepatch-properly-build-the-noapply-and-norever.patch create mode 100644 0034-libxl-Fix-segfault-in-device_model_spawn_outcome.patch create mode 100644 0035-x86-altcall-always-use-a-temporary-parameter-stashin.patch create mode 100644 0036-x86-cpu-policy-Allow-for-levelling-of-VERW-side-effe.patch create mode 100644 0037-hvmloader-PCI-skip-huge-BARs-in-certain-calculations.patch create mode 100644 0038-x86-mm-fix-detection-of-last-L1-entry-in-modify_xen_.patch create mode 100644 0039-x86-entry-Introduce-EFRAME_-constants.patch create mode 100644 0040-x86-Resync-intel-family.h-from-Linux.patch create mode 100644 0041-x86-vmx-Perform-VERW-flushing-later-in-the-VMExit-pa.patch create mode 100644 0042-x86-spec-ctrl-Perform-VERW-flushing-later-in-exit-pa.patch create mode 100644 0043-x86-spec-ctrl-Rename-VERW-related-options.patch create 
mode 100644 0044-x86-spec-ctrl-VERW-handling-adjustments.patch create mode 100644 0045-x86-spec-ctrl-Mitigation-Register-File-Data-Sampling.patch create mode 100644 0046-x86-paging-Delete-update_cr3-s-do_locking-parameter.patch create mode 100644 0047-xen-Swap-order-of-actions-in-the-FREE-macros.patch create mode 100644 0048-x86-spinlock-introduce-support-for-blocking-speculat.patch create mode 100644 0049-rwlock-introduce-support-for-blocking-speculation-in.patch create mode 100644 0050-percpu-rwlock-introduce-support-for-blocking-specula.patch create mode 100644 0051-locking-attempt-to-ensure-lock-wrappers-are-always-i.patch create mode 100644 0052-x86-mm-add-speculation-barriers-to-open-coded-locks.patch create mode 100644 0053-x86-protect-conditional-lock-taking-from-speculative.patch create mode 100644 0054-tools-ipxe-update-for-fixing-build-with-GCC12.patch create mode 100644 0055-x86-mm-use-block_lock_speculation-in-_mm_write_lock.patch create mode 100644 0056-x86-boot-Fix-setup_apic_nmi_watchdog-to-fail-more-cl.patch create mode 100644 0057-x86-PoD-tie-together-P2M-update-and-increment-of-ent.patch create mode 100644 0058-tools-oxenstored-Use-Map-instead-of-Hashtbl-for-quot.patch create mode 100644 0059-tools-oxenstored-Make-Quota.t-pure.patch create mode 100644 0060-x86-cpu-policy-Hide-x2APIC-from-PV-guests.patch create mode 100644 0061-x86-cpu-policy-Fix-visibility-of-HTT-CMP_LEGACY-in-m.patch create mode 100644 0062-xen-virtual-region-Rename-the-start-end-fields.patch create mode 100644 0063-xen-virtual-region-Include-rodata-pointers.patch create mode 100644 0064-x86-livepatch-Relax-permissions-on-rodata-too.patch create mode 100644 0065-x86-boot-Improve-the-boot-watchdog-determination-of-.patch create mode 100644 0066-x86-boot-Support-the-watchdog-on-newer-AMD-systems.patch create mode 100644 0067-tests-resource-Fix-HVM-guest-in-SHADOW-builds.patch diff --git a/0001-update-Xen-version-to-4.17.4-pre.patch b/0001-update-Xen-version-to-4.17.4-pre.patch index b532743..e1070c9 100644 --- a/0001-update-Xen-version-to-4.17.4-pre.patch +++ b/0001-update-Xen-version-to-4.17.4-pre.patch @@ -1,7 +1,7 @@ From 4f6e9d4327eb5252f1e8cac97a095d8b8485dadb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 30 Jan 2024 14:36:44 +0100 -Subject: [PATCH 01/10] update Xen version to 4.17.4-pre +Subject: [PATCH 01/67] update Xen version to 4.17.4-pre --- xen/Makefile | 2 +- @@ -21,5 +21,5 @@ index a46e6330db..dd0b004e1c 100644 -include xen-version -- -2.43.0 +2.44.0 diff --git a/0002-pci-fail-device-assignment-if-phantom-functions-cann.patch b/0002-pci-fail-device-assignment-if-phantom-functions-cann.patch index d91802f..bafad55 100644 --- a/0002-pci-fail-device-assignment-if-phantom-functions-cann.patch +++ b/0002-pci-fail-device-assignment-if-phantom-functions-cann.patch @@ -1,7 +1,7 @@ From f9e1ed51bdba31017ea17e1819eb2ade6b5c8615 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= Date: Tue, 30 Jan 2024 14:37:39 +0100 -Subject: [PATCH 02/10] pci: fail device assignment if phantom functions cannot +Subject: [PATCH 02/67] pci: fail device assignment if phantom functions cannot be assigned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 @@ -87,5 +87,5 @@ index 07d1986d33..8c62b14d19 100644 else if ( d == dom_io ) pdev->quarantine = true; -- -2.43.0 +2.44.0 diff --git a/0003-VT-d-Fix-else-vs-endif-misplacement.patch b/0003-VT-d-Fix-else-vs-endif-misplacement.patch index 2e7f78d..622fa18 100644 --- a/0003-VT-d-Fix-else-vs-endif-misplacement.patch +++ 
b/0003-VT-d-Fix-else-vs-endif-misplacement.patch @@ -1,7 +1,7 @@ From 6b1864afc14d484cdbc9754ce3172ac3dc189846 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Tue, 30 Jan 2024 14:38:38 +0100 -Subject: [PATCH 03/10] VT-d: Fix "else" vs "#endif" misplacement +Subject: [PATCH 03/67] VT-d: Fix "else" vs "#endif" misplacement In domain_pgd_maddr() the "#endif" is misplaced with respect to "else". This generates incorrect logic when CONFIG_HVM is compiled out, as the "else" body @@ -66,5 +66,5 @@ index b4c11a6b48..908b3ba6ee 100644 if ( !hd->arch.vtd.pgd_maddr ) { -- -2.43.0 +2.44.0 diff --git a/0004-x86-amd-Extend-CPU-erratum-1474-fix-to-more-affected.patch b/0004-x86-amd-Extend-CPU-erratum-1474-fix-to-more-affected.patch index f1289aa..fa90a46 100644 --- a/0004-x86-amd-Extend-CPU-erratum-1474-fix-to-more-affected.patch +++ b/0004-x86-amd-Extend-CPU-erratum-1474-fix-to-more-affected.patch @@ -1,7 +1,7 @@ From abcc32f0634627fe21117a48bd10e792bfbdd6dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= Date: Fri, 2 Feb 2024 08:01:09 +0100 -Subject: [PATCH 04/10] x86/amd: Extend CPU erratum #1474 fix to more affected +Subject: [PATCH 04/67] x86/amd: Extend CPU erratum #1474 fix to more affected models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 @@ -119,5 +119,5 @@ index 29ae97e7c0..3d85e9797d 100644 -presmp_initcall(zen2_c6_errata_check); +presmp_initcall(amd_check_erratum_1474); -- -2.43.0 +2.44.0 diff --git a/0005-CirrusCI-drop-FreeBSD-12.patch b/0005-CirrusCI-drop-FreeBSD-12.patch index cca7bb0..dac712b 100644 --- a/0005-CirrusCI-drop-FreeBSD-12.patch +++ b/0005-CirrusCI-drop-FreeBSD-12.patch @@ -1,7 +1,7 @@ From 0ef1fb43ddd61b3c4c953e833e012ac21ad5ca0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= Date: Fri, 2 Feb 2024 08:01:50 +0100 -Subject: [PATCH 05/10] CirrusCI: drop FreeBSD 12 +Subject: [PATCH 05/67] CirrusCI: drop FreeBSD 12 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -35,5 +35,5 @@ index 7e0beb200d..63f3afb104 100644 name: 'FreeBSD 13' freebsd_instance: -- -2.43.0 +2.44.0 diff --git a/0006-x86-intel-ensure-Global-Performance-Counter-Control-.patch b/0006-x86-intel-ensure-Global-Performance-Counter-Control-.patch index dc64ad6..ce07803 100644 --- a/0006-x86-intel-ensure-Global-Performance-Counter-Control-.patch +++ b/0006-x86-intel-ensure-Global-Performance-Counter-Control-.patch @@ -1,7 +1,7 @@ From d0ad2cc5eac1b5d3cfd14204d377ce2384f52607 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= Date: Fri, 2 Feb 2024 08:02:20 +0100 -Subject: [PATCH 06/10] x86/intel: ensure Global Performance Counter Control is +Subject: [PATCH 06/67] x86/intel: ensure Global Performance Counter Control is setup correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 @@ -70,5 +70,5 @@ index b40ac696e6..96723b5d44 100644 if ( !cpu_has(c, X86_FEATURE_XTOPOLOGY) ) -- -2.43.0 +2.44.0 diff --git a/0007-x86-vmx-Fix-IRQ-handling-for-EXIT_REASON_INIT.patch b/0007-x86-vmx-Fix-IRQ-handling-for-EXIT_REASON_INIT.patch index a1937a7..2100acc 100644 --- a/0007-x86-vmx-Fix-IRQ-handling-for-EXIT_REASON_INIT.patch +++ b/0007-x86-vmx-Fix-IRQ-handling-for-EXIT_REASON_INIT.patch @@ -1,7 +1,7 @@ From eca5416f9b0e179de9553900de8de660ab09199d Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Fri, 2 Feb 2024 08:02:51 +0100 -Subject: [PATCH 07/10] x86/vmx: Fix IRQ handling for EXIT_REASON_INIT +Subject: [PATCH 07/67] x86/vmx: Fix IRQ handling for EXIT_REASON_INIT When receiving an INIT, a prior bugfix 
tried to ignore the INIT and continue onwards. @@ -61,5 +61,5 @@ index 072288a5ef..31f4a861c6 100644 break; case EXIT_REASON_TRIPLE_FAULT: -- -2.43.0 +2.44.0 diff --git a/0008-x86-vmx-Disallow-the-use-of-inactivity-states.patch b/0008-x86-vmx-Disallow-the-use-of-inactivity-states.patch index 12c2d59..3af45e8 100644 --- a/0008-x86-vmx-Disallow-the-use-of-inactivity-states.patch +++ b/0008-x86-vmx-Disallow-the-use-of-inactivity-states.patch @@ -1,7 +1,7 @@ From 7bd612727df792671e44152a8205f0cf821ad984 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Fri, 2 Feb 2024 08:03:26 +0100 -Subject: [PATCH 08/10] x86/vmx: Disallow the use of inactivity states +Subject: [PATCH 08/67] x86/vmx: Disallow the use of inactivity states Right now, vvmx will blindly copy L12's ACTIVITY_STATE into the L02 VMCS and enter the vCPU. Luckily for us, nested-virt is explicitly unsupported for @@ -122,5 +122,5 @@ index 78404e42b3..0af021d5f5 100644 #define VMX_MISC_CR3_TARGET 0x01ff0000 #define VMX_MISC_VMWRITE_ALL 0x20000000 -- -2.43.0 +2.44.0 diff --git a/0009-lib-fdt-elf-move-lib-fdt-elf-temp.o-and-their-deps-t.patch b/0009-lib-fdt-elf-move-lib-fdt-elf-temp.o-and-their-deps-t.patch index 9ee7104..f33d27d 100644 --- a/0009-lib-fdt-elf-move-lib-fdt-elf-temp.o-and-their-deps-t.patch +++ b/0009-lib-fdt-elf-move-lib-fdt-elf-temp.o-and-their-deps-t.patch @@ -1,7 +1,7 @@ From afb85cf1e8f165abf88de9d8a6df625692a753b1 Mon Sep 17 00:00:00 2001 From: Michal Orzel Date: Fri, 2 Feb 2024 08:04:07 +0100 -Subject: [PATCH 09/10] lib{fdt,elf}: move lib{fdt,elf}-temp.o and their deps +Subject: [PATCH 09/67] lib{fdt,elf}: move lib{fdt,elf}-temp.o and their deps to $(targets) At the moment, trying to run xencov read/reset (calling SYSCTL_coverage_op @@ -66,5 +66,5 @@ index 75aaefa2e3..4d14fd61ba 100644 -extra-y += libfdt-temp.o $(LIBFDT_OBJS) +targets += libfdt-temp.o $(LIBFDT_OBJS) -- -2.43.0 +2.44.0 diff --git a/0010-x86-p2m-pt-fix-off-by-one-in-entry-check-assert.patch b/0010-x86-p2m-pt-fix-off-by-one-in-entry-check-assert.patch index ba99063..9b3b9a0 100644 --- a/0010-x86-p2m-pt-fix-off-by-one-in-entry-check-assert.patch +++ b/0010-x86-p2m-pt-fix-off-by-one-in-entry-check-assert.patch @@ -1,7 +1,7 @@ From 091466ba55d1e2e75738f751818ace2e3ed08ccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= Date: Fri, 2 Feb 2024 08:04:33 +0100 -Subject: [PATCH 10/10] x86/p2m-pt: fix off by one in entry check assert +Subject: [PATCH 10/67] x86/p2m-pt: fix off by one in entry check assert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -32,5 +32,5 @@ index eaba2b0fb4..f02ebae372 100644 new == p2m_mmio_dm ) ASSERT(mfn_valid(mfn) || mfn_eq(mfn, INVALID_MFN)); -- -2.43.0 +2.44.0 diff --git a/0011-tools-xentop-fix-sorting-bug-for-some-columns.patch b/0011-tools-xentop-fix-sorting-bug-for-some-columns.patch new file mode 100644 index 0000000..6bf11d9 --- /dev/null +++ b/0011-tools-xentop-fix-sorting-bug-for-some-columns.patch @@ -0,0 +1,67 @@ +From 61da71968ea44964fd1dd2e449b053c77eb83139 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Cyril=20R=C3=A9bert=20=28zithro=29?= +Date: Tue, 27 Feb 2024 14:06:53 +0100 +Subject: [PATCH 11/67] tools/xentop: fix sorting bug for some columns +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Sort doesn't work on columns VBD_OO, VBD_RD, VBD_WR and VBD_RSECT. +Fix by adjusting variables names in compare functions. +Bug fix only. No functional change. 
+ +Fixes: 91c3e3dc91d6 ("tools/xentop: Display '-' when stats are not available.") +Signed-off-by: Cyril Rébert (zithro) +Reviewed-by: Anthony PERARD +master commit: 29f17d837421f13c0e0010802de1b2d51d2ded4a +master date: 2024-02-05 17:58:23 +0000 +--- + tools/xentop/xentop.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/tools/xentop/xentop.c b/tools/xentop/xentop.c +index 950e8935c4..545bd5e96d 100644 +--- a/tools/xentop/xentop.c ++++ b/tools/xentop/xentop.c +@@ -684,7 +684,7 @@ static int compare_vbd_oo(xenstat_domain *domain1, xenstat_domain *domain2) + unsigned long long dom1_vbd_oo = 0, dom2_vbd_oo = 0; + + tot_vbd_reqs(domain1, FIELD_VBD_OO, &dom1_vbd_oo); +- tot_vbd_reqs(domain1, FIELD_VBD_OO, &dom2_vbd_oo); ++ tot_vbd_reqs(domain2, FIELD_VBD_OO, &dom2_vbd_oo); + + return -compare(dom1_vbd_oo, dom2_vbd_oo); + } +@@ -711,9 +711,9 @@ static int compare_vbd_rd(xenstat_domain *domain1, xenstat_domain *domain2) + unsigned long long dom1_vbd_rd = 0, dom2_vbd_rd = 0; + + tot_vbd_reqs(domain1, FIELD_VBD_RD, &dom1_vbd_rd); +- tot_vbd_reqs(domain1, FIELD_VBD_RD, &dom2_vbd_rd); ++ tot_vbd_reqs(domain2, FIELD_VBD_RD, &dom2_vbd_rd); + +- return -compare(dom1_vbd_rd, dom1_vbd_rd); ++ return -compare(dom1_vbd_rd, dom2_vbd_rd); + } + + /* Prints number of total VBD READ requests statistic */ +@@ -738,7 +738,7 @@ static int compare_vbd_wr(xenstat_domain *domain1, xenstat_domain *domain2) + unsigned long long dom1_vbd_wr = 0, dom2_vbd_wr = 0; + + tot_vbd_reqs(domain1, FIELD_VBD_WR, &dom1_vbd_wr); +- tot_vbd_reqs(domain1, FIELD_VBD_WR, &dom2_vbd_wr); ++ tot_vbd_reqs(domain2, FIELD_VBD_WR, &dom2_vbd_wr); + + return -compare(dom1_vbd_wr, dom2_vbd_wr); + } +@@ -765,7 +765,7 @@ static int compare_vbd_rsect(xenstat_domain *domain1, xenstat_domain *domain2) + unsigned long long dom1_vbd_rsect = 0, dom2_vbd_rsect = 0; + + tot_vbd_reqs(domain1, FIELD_VBD_RSECT, &dom1_vbd_rsect); +- tot_vbd_reqs(domain1, FIELD_VBD_RSECT, &dom2_vbd_rsect); ++ tot_vbd_reqs(domain2, FIELD_VBD_RSECT, &dom2_vbd_rsect); + + return -compare(dom1_vbd_rsect, dom2_vbd_rsect); + } +-- +2.44.0 + diff --git a/0012-amd-vi-fix-IVMD-memory-type-checks.patch b/0012-amd-vi-fix-IVMD-memory-type-checks.patch new file mode 100644 index 0000000..f38e39e --- /dev/null +++ b/0012-amd-vi-fix-IVMD-memory-type-checks.patch @@ -0,0 +1,53 @@ +From 463aaf3fbf62d24e898ae0c2ba53d85ca0f94d3f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 27 Feb 2024 14:07:12 +0100 +Subject: [PATCH 12/67] amd-vi: fix IVMD memory type checks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The current code that parses the IVMD blocks is relaxed with regard to the +restriction that such unity regions should always fall into memory ranges +marked as reserved in the memory map. + +However the type checks for the IVMD addresses are inverted, and as a result +IVMD ranges falling into RAM areas are accepted. Note that having such ranges +in the first place is a firmware bug, as IVMD should always fall into reserved +ranges. 
+ +Fixes: ed6c77ebf0c1 ('AMD/IOMMU: check / convert IVMD ranges for being / to be reserved') +Reported-by: Ox +Signed-off-by: Roger Pau Monné +Tested-by: oxjo +Reviewed-by: Jan Beulich +master commit: 83afa313583019d9f159c122cecf867735d27ec5 +master date: 2024-02-06 11:56:13 +0100 +--- + xen/drivers/passthrough/amd/iommu_acpi.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/xen/drivers/passthrough/amd/iommu_acpi.c b/xen/drivers/passthrough/amd/iommu_acpi.c +index 3b577c9b39..3a7045c39b 100644 +--- a/xen/drivers/passthrough/amd/iommu_acpi.c ++++ b/xen/drivers/passthrough/amd/iommu_acpi.c +@@ -426,9 +426,14 @@ static int __init parse_ivmd_block(const struct acpi_ivrs_memory *ivmd_block) + return -EIO; + } + +- /* Types which won't be handed out are considered good enough. */ +- if ( !(type & (RAM_TYPE_RESERVED | RAM_TYPE_ACPI | +- RAM_TYPE_UNUSABLE)) ) ++ /* ++ * Types which aren't RAM are considered good enough. ++ * Note that a page being partially RESERVED, ACPI or UNUSABLE will ++ * force Xen into assuming the whole page as having that type in ++ * practice. ++ */ ++ if ( type & (RAM_TYPE_RESERVED | RAM_TYPE_ACPI | ++ RAM_TYPE_UNUSABLE) ) + continue; + + AMD_IOMMU_ERROR("IVMD: page at %lx can't be converted\n", addr); +-- +2.44.0 + diff --git a/0013-x86-hvm-Fix-fast-singlestep-state-persistence.patch b/0013-x86-hvm-Fix-fast-singlestep-state-persistence.patch new file mode 100644 index 0000000..2a14354 --- /dev/null +++ b/0013-x86-hvm-Fix-fast-singlestep-state-persistence.patch @@ -0,0 +1,86 @@ +From 415f770d23f9fcbc02436560fa6583dcd8e1343f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Petr=20Bene=C5=A1?= +Date: Tue, 27 Feb 2024 14:07:45 +0100 +Subject: [PATCH 13/67] x86/hvm: Fix fast singlestep state persistence +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This patch addresses an issue where the fast singlestep setting would persist +despite xc_domain_debug_control being called with XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF. +Specifically, if fast singlestep was enabled in a VMI session and that session +stopped before the MTF trap occurred, the fast singlestep setting remained +active even though MTF itself was disabled. This led to a situation where, upon +starting a new VMI session, the first event to trigger an EPT violation would +cause the corresponding EPT event callback to be skipped due to the lingering +fast singlestep setting. + +The fix ensures that the fast singlestep setting is properly reset when +disabling single step debugging operations. 
+ +Signed-off-by: Petr Beneš +Reviewed-by: Tamas K Lengyel +master commit: 897def94b56175ce569673a05909d2f223e1e749 +master date: 2024-02-12 09:37:58 +0100 +--- + xen/arch/x86/hvm/hvm.c | 34 ++++++++++++++++++++++++---------- + 1 file changed, 24 insertions(+), 10 deletions(-) + +diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c +index d6c6ab8897..558dc3eddc 100644 +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -5153,26 +5153,40 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg) + + int hvm_debug_op(struct vcpu *v, int32_t op) + { +- int rc; ++ int rc = 0; + + switch ( op ) + { + case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON: + case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF: +- rc = -EOPNOTSUPP; + if ( !cpu_has_monitor_trap_flag ) +- break; +- rc = 0; +- vcpu_pause(v); +- v->arch.hvm.single_step = +- (op == XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON); +- vcpu_unpause(v); /* guest will latch new state */ ++ return -EOPNOTSUPP; + break; + default: +- rc = -ENOSYS; +- break; ++ return -ENOSYS; ++ } ++ ++ vcpu_pause(v); ++ ++ switch ( op ) ++ { ++ case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON: ++ v->arch.hvm.single_step = true; ++ break; ++ ++ case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF: ++ v->arch.hvm.single_step = false; ++ v->arch.hvm.fast_single_step.enabled = false; ++ v->arch.hvm.fast_single_step.p2midx = 0; ++ break; ++ ++ default: /* Excluded above */ ++ ASSERT_UNREACHABLE(); ++ return -ENOSYS; + } + ++ vcpu_unpause(v); /* guest will latch new state */ ++ + return rc; + } + +-- +2.44.0 + diff --git a/0014-x86-HVM-tidy-state-on-hvmemul_map_linear_addr-s-erro.patch b/0014-x86-HVM-tidy-state-on-hvmemul_map_linear_addr-s-erro.patch new file mode 100644 index 0000000..6536674 --- /dev/null +++ b/0014-x86-HVM-tidy-state-on-hvmemul_map_linear_addr-s-erro.patch @@ -0,0 +1,63 @@ +From b3ae0e6201495216b12157bd8b2382b28fdd7dae Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Tue, 27 Feb 2024 14:08:20 +0100 +Subject: [PATCH 14/67] x86/HVM: tidy state on hvmemul_map_linear_addr()'s + error path + +While in the vast majority of cases failure of the function will not +be followed by re-invocation with the same emulation context, a few +very specific insns - involving multiple independent writes, e.g. ENTER +and PUSHA - exist where this can happen. Since failure of the function +only signals to the caller that it ought to try an MMIO write instead, +such failure also cannot be assumed to result in wholesale failure of +emulation of the current insn. Instead we have to maintain internal +state such that another invocation of the function with the same +emulation context remains possible. To achieve that we need to reset MFN +slots after putting page references on the error path. + +Note that all of this affects debugging code only, in causing an +assertion to trigger (higher up in the function). There's otherwise no +misbehavior - such a "leftover" slot would simply be overwritten by new +contents in a release build. + +Also extend the related unmap() assertion, to further check for MFN 0. 
+ +Fixes: 8cbd4fb0b7ea ("x86/hvm: implement hvmemul_write() using real mappings") +Reported-by: Manuel Andreas +Signed-off-by: Jan Beulich +Acked-by: Paul Durrant +master commit: e72f951df407bc3be82faac64d8733a270036ba1 +master date: 2024-02-13 09:36:14 +0100 +--- + xen/arch/x86/hvm/emulate.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c +index 275451dd36..27928dc3f3 100644 +--- a/xen/arch/x86/hvm/emulate.c ++++ b/xen/arch/x86/hvm/emulate.c +@@ -697,7 +697,12 @@ static void *hvmemul_map_linear_addr( + out: + /* Drop all held references. */ + while ( mfn-- > hvmemul_ctxt->mfn ) ++ { + put_page(mfn_to_page(*mfn)); ++#ifndef NDEBUG /* Clean slot for a subsequent map()'s error checking. */ ++ *mfn = _mfn(0); ++#endif ++ } + + return err; + } +@@ -719,7 +724,7 @@ static void hvmemul_unmap_linear_addr( + + for ( i = 0; i < nr_frames; i++ ) + { +- ASSERT(mfn_valid(*mfn)); ++ ASSERT(mfn_x(*mfn) && mfn_valid(*mfn)); + paging_mark_dirty(currd, *mfn); + put_page(mfn_to_page(*mfn)); + +-- +2.44.0 + diff --git a/0015-build-Replace-which-with-command-v.patch b/0015-build-Replace-which-with-command-v.patch new file mode 100644 index 0000000..57f21d4 --- /dev/null +++ b/0015-build-Replace-which-with-command-v.patch @@ -0,0 +1,57 @@ +From 1330a5fe44ca91f98857b53fe8bbe06522d9db27 Mon Sep 17 00:00:00 2001 +From: Anthony PERARD +Date: Tue, 27 Feb 2024 14:08:50 +0100 +Subject: [PATCH 15/67] build: Replace `which` with `command -v` +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The `which` command is not standard, may not exist on the build host, +or may not behave as expected by the build system. It is recommended +to use `command -v` to find out if a command exist and have its path, +and it's part of a POSIX shell standard (at least, it seems to be +mandatory since IEEE Std 1003.1-2008, but was optional before). + +Fixes: c8a8645f1efe ("xen/build: Automatically locate a suitable python interpreter") +Fixes: 3b47bcdb6d38 ("xen/build: Use a distro version of figlet") +Signed-off-by: Anthony PERARD +Tested-by: Marek Marczykowski-Górecki +Acked-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: f93629b18b528a5ab1b1092949c5420069c7226c +master date: 2024-02-19 12:45:48 +0100 +--- + xen/Makefile | 4 ++-- + xen/build.mk | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/xen/Makefile b/xen/Makefile +index dd0b004e1c..7ea13a6791 100644 +--- a/xen/Makefile ++++ b/xen/Makefile +@@ -25,8 +25,8 @@ export XEN_BUILD_HOST := $(shell hostname) + endif + + # Best effort attempt to find a python interpreter, defaulting to Python 3 if +-# available. Fall back to just `python` if `which` is nowhere to be found. +-PYTHON_INTERPRETER := $(word 1,$(shell which python3 python python2 2>/dev/null) python) ++# available. Fall back to just `python`. 
++PYTHON_INTERPRETER := $(word 1,$(shell command -v python3 || command -v python || command -v python2) python) + export PYTHON ?= $(PYTHON_INTERPRETER) + + export CHECKPOLICY ?= checkpolicy +diff --git a/xen/build.mk b/xen/build.mk +index 9ecb104f1e..b489f77b7c 100644 +--- a/xen/build.mk ++++ b/xen/build.mk +@@ -1,6 +1,6 @@ + quiet_cmd_banner = BANNER $@ + define cmd_banner +- if which figlet >/dev/null 2>&1 ; then \ ++ if command -v figlet >/dev/null 2>&1 ; then \ + echo " Xen $(XEN_FULLVERSION)" | figlet -f $< > $@.tmp; \ + else \ + echo " Xen $(XEN_FULLVERSION)" > $@.tmp; \ +-- +2.44.0 + diff --git a/0016-libxl-Disable-relocating-memory-for-qemu-xen-in-stub.patch b/0016-libxl-Disable-relocating-memory-for-qemu-xen-in-stub.patch new file mode 100644 index 0000000..f75e07c --- /dev/null +++ b/0016-libxl-Disable-relocating-memory-for-qemu-xen-in-stub.patch @@ -0,0 +1,50 @@ +From b9745280736ee526374873aa3c4142596e2ba10b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= + +Date: Tue, 27 Feb 2024 14:09:19 +0100 +Subject: [PATCH 16/67] libxl: Disable relocating memory for qemu-xen in + stubdomain too +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +According to comments (and experiments) qemu-xen cannot handle memory +reolcation done by hvmloader. The code was already disabled when running +qemu-xen in dom0 (see libxl__spawn_local_dm()), but it was missed when +adding qemu-xen support to stubdomain. Adjust libxl__spawn_stub_dm() to +be consistent in this regard. + +Reported-by: Neowutran +Signed-off-by: Marek Marczykowski-Górecki +Reviewed-by: Jason Andryuk +Acked-by: Anthony PERARD +master commit: 97883aa269f6745a6ded232be3a855abb1297e0d +master date: 2024-02-22 11:48:22 +0100 +--- + tools/libs/light/libxl_dm.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c +index 14b593110f..ed620a9d8e 100644 +--- a/tools/libs/light/libxl_dm.c ++++ b/tools/libs/light/libxl_dm.c +@@ -2432,6 +2432,16 @@ void libxl__spawn_stub_dm(libxl__egc *egc, libxl__stub_dm_spawn_state *sdss) + "%s", + libxl_bios_type_to_string(guest_config->b_info.u.hvm.bios)); + } ++ /* Disable relocating memory to make the MMIO hole larger ++ * unless we're running qemu-traditional and vNUMA is not ++ * configured. 
*/ ++ libxl__xs_printf(gc, XBT_NULL, ++ libxl__sprintf(gc, "%s/hvmloader/allow-memory-relocate", ++ libxl__xs_get_dompath(gc, guest_domid)), ++ "%d", ++ guest_config->b_info.device_model_version ++ == LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL && ++ !libxl__vnuma_configured(&guest_config->b_info)); + ret = xc_domain_set_target(ctx->xch, dm_domid, guest_domid); + if (ret<0) { + LOGED(ERROR, guest_domid, "setting target domain %d -> %d", +-- +2.44.0 + diff --git a/0017-build-make-sure-build-fails-when-running-kconfig-fai.patch b/0017-build-make-sure-build-fails-when-running-kconfig-fai.patch new file mode 100644 index 0000000..1bb3aa8 --- /dev/null +++ b/0017-build-make-sure-build-fails-when-running-kconfig-fai.patch @@ -0,0 +1,58 @@ +From ea869977271f93945451908be9b6117ffd1fb02d Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Tue, 27 Feb 2024 14:09:37 +0100 +Subject: [PATCH 17/67] build: make sure build fails when running kconfig fails +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Because of using "-include", failure to (re)build auto.conf (with +auto.conf.cmd produced as a secondary target) won't stop make from +continuing the build. Arrange for it being possible to drop the - from +Rules.mk, requiring that the include be skipped for tools-only targets. +Note that relying on the inclusion in those cases wouldn't be correct +anyway, as it might be a stale file (yet to be rebuilt) which would be +included, while during initial build, the file would be absent +altogether. + +Fixes: 8d4c17a90b0a ("xen/build: silence make warnings about missing auto.conf*") +Reported-by: Roger Pau Monné +Signed-off-by: Jan Beulich +Reviewed-by: Anthony PERARD +master commit: d34e5fa2e8db19f23081f46a3e710bb122130691 +master date: 2024-02-22 11:52:47 +0100 +--- + xen/Makefile | 1 + + xen/Rules.mk | 4 +++- + 2 files changed, 4 insertions(+), 1 deletion(-) + +diff --git a/xen/Makefile b/xen/Makefile +index 7ea13a6791..bac3684a36 100644 +--- a/xen/Makefile ++++ b/xen/Makefile +@@ -374,6 +374,7 @@ $(KCONFIG_CONFIG): tools_fixdep + # This exploits the 'multi-target pattern rule' trick. + # The syncconfig should be executed only once to make all the targets. + include/config/%.conf include/config/%.conf.cmd: $(KCONFIG_CONFIG) ++ $(Q)rm -f include/config/auto.conf + $(Q)$(MAKE) $(build)=tools/kconfig syncconfig + + ifeq ($(CONFIG_DEBUG),y) +diff --git a/xen/Rules.mk b/xen/Rules.mk +index 8af3dd7277..d759cccee3 100644 +--- a/xen/Rules.mk ++++ b/xen/Rules.mk +@@ -15,7 +15,9 @@ srcdir := $(srctree)/$(src) + PHONY := __build + __build: + +--include $(objtree)/include/config/auto.conf ++ifneq ($(firstword $(subst /, ,$(obj))),tools) ++include $(objtree)/include/config/auto.conf ++endif + + include $(XEN_ROOT)/Config.mk + include $(srctree)/scripts/Kbuild.include +-- +2.44.0 + diff --git a/0018-x86emul-add-missing-EVEX.R-checks.patch b/0018-x86emul-add-missing-EVEX.R-checks.patch new file mode 100644 index 0000000..12e7702 --- /dev/null +++ b/0018-x86emul-add-missing-EVEX.R-checks.patch @@ -0,0 +1,50 @@ +From 16f2e47eb1207d866f95cf694a60a7ceb8f96a36 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Tue, 27 Feb 2024 14:09:55 +0100 +Subject: [PATCH 18/67] x86emul: add missing EVEX.R' checks + +EVEX.R' is not ignored in 64-bit code when encoding a GPR or mask +register. While for mask registers suitable checks are in place (there +also covering EVEX.R), they were missing for the few cases where in +EVEX-encoded instructions ModR/M.reg encodes a GPR. 
While for VPEXTRW +the bit is replaced before an emulation stub is invoked, for +VCVT{,T}{S,D,H}2{,U}SI this actually would have led to #UD from inside +an emulation stub, in turn raising #UD to the guest, but accompanied by +log messages indicating something's wrong in Xen nevertheless. + +Fixes: 001bd91ad864 ("x86emul: support AVX512{F,BW,DQ} extract insns") +Fixes: baf4a376f550 ("x86emul: support AVX512F legacy-equivalent scalar int/FP conversion insns") +Signed-off-by: Jan Beulich +Acked-by: Andrew Cooper +master commit: cb319824bfa8d3c9ea0410cc71daaedc3e11aa2a +master date: 2024-02-22 11:54:07 +0100 +--- + xen/arch/x86/x86_emulate/x86_emulate.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c +index 0c0336f737..995670cbc8 100644 +--- a/xen/arch/x86/x86_emulate/x86_emulate.c ++++ b/xen/arch/x86/x86_emulate/x86_emulate.c +@@ -6829,7 +6829,8 @@ x86_emulate( + CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2d): /* vcvts{s,d}2si xmm/mem,reg */ + CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x78): /* vcvtts{s,d}2usi xmm/mem,reg */ + CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x79): /* vcvts{s,d}2usi xmm/mem,reg */ +- generate_exception_if((evex.reg != 0xf || !evex.RX || evex.opmsk || ++ generate_exception_if((evex.reg != 0xf || !evex.RX || !evex.R || ++ evex.opmsk || + (ea.type != OP_REG && evex.brs)), + EXC_UD); + host_and_vcpu_must_have(avx512f); +@@ -10705,7 +10706,7 @@ x86_emulate( + goto pextr; + + case X86EMUL_OPC_EVEX_66(0x0f, 0xc5): /* vpextrw $imm8,xmm,reg */ +- generate_exception_if(ea.type != OP_REG, EXC_UD); ++ generate_exception_if(ea.type != OP_REG || !evex.R, EXC_UD); + /* Convert to alternative encoding: We want to use a memory operand. */ + evex.opcx = ext_0f3a; + b = 0x15; +-- +2.44.0 + diff --git a/0019-xen-livepatch-fix-norevert-test-hook-setup-typo.patch b/0019-xen-livepatch-fix-norevert-test-hook-setup-typo.patch new file mode 100644 index 0000000..1676f7a --- /dev/null +++ b/0019-xen-livepatch-fix-norevert-test-hook-setup-typo.patch @@ -0,0 +1,36 @@ +From f6b12792542e372f36a71ea4c2563e6dd6e4fa57 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 27 Feb 2024 14:10:24 +0100 +Subject: [PATCH 19/67] xen/livepatch: fix norevert test hook setup typo +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The test code has a typo in using LIVEPATCH_APPLY_HOOK() instead of +LIVEPATCH_REVERT_HOOK(). 
+ +Fixes: 6047104c3ccc ('livepatch: Add per-function applied/reverted state tracking marker') +Signed-off-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: f0622dd4fd6ae6ddb523a45d89ed9b8f3a9a8f36 +master date: 2024-02-26 10:13:46 +0100 +--- + xen/test/livepatch/xen_action_hooks_norevert.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/test/livepatch/xen_action_hooks_norevert.c b/xen/test/livepatch/xen_action_hooks_norevert.c +index 3e21ade6ab..c173855192 100644 +--- a/xen/test/livepatch/xen_action_hooks_norevert.c ++++ b/xen/test/livepatch/xen_action_hooks_norevert.c +@@ -120,7 +120,7 @@ static void post_revert_hook(livepatch_payload_t *payload) + printk(KERN_DEBUG "%s: Hook done.\n", __func__); + } + +-LIVEPATCH_APPLY_HOOK(revert_hook); ++LIVEPATCH_REVERT_HOOK(revert_hook); + + LIVEPATCH_PREAPPLY_HOOK(pre_apply_hook); + LIVEPATCH_POSTAPPLY_HOOK(post_apply_hook); +-- +2.44.0 + diff --git a/0020-xen-cmdline-fix-printf-format-specifier-in-no_config.patch b/0020-xen-cmdline-fix-printf-format-specifier-in-no_config.patch new file mode 100644 index 0000000..b47d9ee --- /dev/null +++ b/0020-xen-cmdline-fix-printf-format-specifier-in-no_config.patch @@ -0,0 +1,38 @@ +From 229e8a72ee4cde5698aaf42cc59ae57446dce60f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 27 Feb 2024 14:10:39 +0100 +Subject: [PATCH 20/67] xen/cmdline: fix printf format specifier in + no_config_param() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +'*' sets the width field, which is the minimum number of characters to output, +but what we want in no_config_param() is the precision instead, which is '.*' +as it imposes a maximum limit on the output. + +Fixes: 68d757df8dd2 ('x86/pv: Options to disable and/or compile out 32bit PV support') +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: ef101f525173cf51dc70f4c77862f6f10a8ddccf +master date: 2024-02-26 10:17:40 +0100 +--- + xen/include/xen/param.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/include/xen/param.h b/xen/include/xen/param.h +index 93c3fe7cb7..e02e49635c 100644 +--- a/xen/include/xen/param.h ++++ b/xen/include/xen/param.h +@@ -191,7 +191,7 @@ static inline void no_config_param(const char *cfg, const char *param, + { + int len = e ? ({ ASSERT(e >= s); e - s; }) : strlen(s); + +- printk(XENLOG_INFO "CONFIG_%s disabled - ignoring '%s=%*s' setting\n", ++ printk(XENLOG_INFO "CONFIG_%s disabled - ignoring '%s=%.*s' setting\n", + cfg, param, len, s); + } + +-- +2.44.0 + diff --git a/0021-x86-altcall-use-a-union-as-register-type-for-functio.patch b/0021-x86-altcall-use-a-union-as-register-type-for-functio.patch new file mode 100644 index 0000000..ab050dd --- /dev/null +++ b/0021-x86-altcall-use-a-union-as-register-type-for-functio.patch @@ -0,0 +1,141 @@ +From 1aafe054e7d1efbf8e8482a9cdd4be5753b79e2f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 27 Feb 2024 14:11:04 +0100 +Subject: [PATCH 21/67] x86/altcall: use a union as register type for function + parameters on clang +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The current code for alternative calls uses the caller parameter types as the +types for the register variables that serve as function parameters: + +uint8_t foo; +[...] 
+alternative_call(myfunc, foo); + +Would expand roughly into: + +register unint8_t a1_ asm("rdi") = foo; +register unsigned long a2_ asm("rsi"); +[...] +asm volatile ("call *%c[addr](%%rip)"...); + +However with -O2 clang will generate incorrect code, given the following +example: + +unsigned int func(uint8_t t) +{ + return t; +} + +static void bar(uint8_t b) +{ + int ret_; + register uint8_t di asm("rdi") = b; + register unsigned long si asm("rsi"); + register unsigned long dx asm("rdx"); + register unsigned long cx asm("rcx"); + register unsigned long r8 asm("r8"); + register unsigned long r9 asm("r9"); + register unsigned long r10 asm("r10"); + register unsigned long r11 asm("r11"); + + asm volatile ( "call %c[addr]" + : "+r" (di), "=r" (si), "=r" (dx), + "=r" (cx), "=r" (r8), "=r" (r9), + "=r" (r10), "=r" (r11), "=a" (ret_) + : [addr] "i" (&(func)), "g" (func) + : "memory" ); +} + +void foo(unsigned int a) +{ + bar(a); +} + +Clang generates the following assembly code: + +func: # @func + movl %edi, %eax + retq +foo: # @foo + callq func + retq + +Note the truncation of the unsigned int parameter 'a' of foo() to uint8_t when +passed into bar() is lost. clang doesn't zero extend the parameters in the +callee when required, as the psABI mandates. + +The above can be worked around by using a union when defining the register +variables, so that `di` becomes: + +register union { + uint8_t e; + unsigned long r; +} di asm("rdi") = { .e = b }; + +Which results in following code generated for `foo()`: + +foo: # @foo + movzbl %dil, %edi + callq func + retq + +So the truncation is not longer lost. Apply such workaround only when built +with clang. + +Reported-by: Matthew Grooms +Link: https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=277200 +Link: https://github.com/llvm/llvm-project/issues/12579 +Link: https://github.com/llvm/llvm-project/issues/82598 +Signed-off-by: Roger Pau Monné +Acked-by: Jan Beulich +master commit: 2ce562b2a413cbdb2e1128989ed1722290a27c4e +master date: 2024-02-26 10:18:01 +0100 +--- + xen/arch/x86/include/asm/alternative.h | 25 +++++++++++++++++++++++++ + 1 file changed, 25 insertions(+) + +diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h +index a7a82c2c03..bcb1dc94f4 100644 +--- a/xen/arch/x86/include/asm/alternative.h ++++ b/xen/arch/x86/include/asm/alternative.h +@@ -167,9 +167,34 @@ extern void alternative_branches(void); + #define ALT_CALL_arg5 "r8" + #define ALT_CALL_arg6 "r9" + ++#ifdef CONFIG_CC_IS_CLANG ++/* ++ * Use a union with an unsigned long in order to prevent clang from ++ * skipping a possible truncation of the value. By using the union any ++ * truncation is carried before the call instruction, in turn covering ++ * for ABI-non-compliance in that the necessary clipping / extension of ++ * the value is supposed to be carried out in the callee. ++ * ++ * Note this behavior is not mandated by the standard, and hence could ++ * stop being a viable workaround, or worse, could cause a different set ++ * of code-generation issues in future clang versions. 
++ * ++ * This has been reported upstream: ++ * https://github.com/llvm/llvm-project/issues/12579 ++ * https://github.com/llvm/llvm-project/issues/82598 ++ */ ++#define ALT_CALL_ARG(arg, n) \ ++ register union { \ ++ typeof(arg) e; \ ++ unsigned long r; \ ++ } a ## n ## _ asm ( ALT_CALL_arg ## n ) = { \ ++ .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); }) \ ++ } ++#else + #define ALT_CALL_ARG(arg, n) \ + register typeof(arg) a ## n ## _ asm ( ALT_CALL_arg ## n ) = \ + ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); }) ++#endif + #define ALT_CALL_NO_ARG(n) \ + register unsigned long a ## n ## _ asm ( ALT_CALL_arg ## n ) + +-- +2.44.0 + diff --git a/0022-x86-spec-fix-BRANCH_HARDEN-option-to-only-be-set-whe.patch b/0022-x86-spec-fix-BRANCH_HARDEN-option-to-only-be-set-whe.patch new file mode 100644 index 0000000..ce01c1a --- /dev/null +++ b/0022-x86-spec-fix-BRANCH_HARDEN-option-to-only-be-set-whe.patch @@ -0,0 +1,57 @@ +From 91650010815f3da0834bc9781c4359350d1162a5 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 27 Feb 2024 14:11:40 +0100 +Subject: [PATCH 22/67] x86/spec: fix BRANCH_HARDEN option to only be set when + build-enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The current logic to handle the BRANCH_HARDEN option will report it as enabled +even when build-time disabled. Fix this by only allowing the option to be set +when support for it is built into Xen. + +Fixes: 2d6f36daa086 ('x86/nospec: Introduce CONFIG_SPECULATIVE_HARDEN_BRANCH') +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: 60e00f77a5cc671d30c5ef3318f5b8e9b74e4aa3 +master date: 2024-02-26 16:06:42 +0100 +--- + xen/arch/x86/spec_ctrl.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 56e07d7536..661716d695 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -62,7 +62,8 @@ int8_t __initdata opt_psfd = -1; + int8_t __ro_after_init opt_ibpb_ctxt_switch = -1; + int8_t __read_mostly opt_eager_fpu = -1; + int8_t __read_mostly opt_l1d_flush = -1; +-static bool __initdata opt_branch_harden = true; ++static bool __initdata opt_branch_harden = ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH); + + bool __initdata bsp_delay_spec_ctrl; + uint8_t __read_mostly default_xen_spec_ctrl; +@@ -280,7 +281,16 @@ static int __init cf_check parse_spec_ctrl(const char *s) + else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) + opt_l1d_flush = val; + else if ( (val = parse_boolean("branch-harden", s, ss)) >= 0 ) +- opt_branch_harden = val; ++ { ++ if ( IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) ) ++ opt_branch_harden = val; ++ else ++ { ++ no_config_param("SPECULATIVE_HARDEN_BRANCH", "spec-ctrl", s, ++ ss); ++ rc = -EINVAL; ++ } ++ } + else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) + opt_srb_lock = val; + else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) +-- +2.44.0 + diff --git a/0023-x86-account-for-shadow-stack-in-exception-from-stub-.patch b/0023-x86-account-for-shadow-stack-in-exception-from-stub-.patch new file mode 100644 index 0000000..e23a764 --- /dev/null +++ b/0023-x86-account-for-shadow-stack-in-exception-from-stub-.patch @@ -0,0 +1,212 @@ +From 49f77602373b58b7bbdb40cea2b49d2f88d4003d Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Tue, 27 Feb 2024 14:12:11 +0100 +Subject: [PATCH 23/67] x86: account for shadow stack in exception-from-stub + recovery + 
+Dealing with exceptions raised from within emulation stubs involves +discarding return address (replaced by exception related information). +Such discarding of course also requires removing the corresponding entry +from the shadow stack. + +Also amend the comment in fixup_exception_return(), to further clarify +why use of ptr[1] can't be an out-of-bounds access. + +This is CVE-2023-46841 / XSA-451. + +Fixes: 209fb9919b50 ("x86/extable: Adjust extable handling to be shadow stack compatible") +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +master commit: 91f5f7a9154919a765c3933521760acffeddbf28 +master date: 2024-02-27 13:49:22 +0100 +--- + xen/arch/x86/extable.c | 20 ++++++---- + xen/arch/x86/include/asm/uaccess.h | 3 +- + xen/arch/x86/traps.c | 63 +++++++++++++++++++++++++++--- + 3 files changed, 71 insertions(+), 15 deletions(-) + +diff --git a/xen/arch/x86/extable.c b/xen/arch/x86/extable.c +index 6758ba1dca..dd9583f2a5 100644 +--- a/xen/arch/x86/extable.c ++++ b/xen/arch/x86/extable.c +@@ -86,26 +86,29 @@ search_one_extable(const struct exception_table_entry *first, + } + + unsigned long +-search_exception_table(const struct cpu_user_regs *regs) ++search_exception_table(const struct cpu_user_regs *regs, unsigned long *stub_ra) + { + const struct virtual_region *region = find_text_region(regs->rip); + unsigned long stub = this_cpu(stubs.addr); + + if ( region && region->ex ) ++ { ++ *stub_ra = 0; + return search_one_extable(region->ex, region->ex_end, regs->rip); ++ } + + if ( regs->rip >= stub + STUB_BUF_SIZE / 2 && + regs->rip < stub + STUB_BUF_SIZE && + regs->rsp > (unsigned long)regs && + regs->rsp < (unsigned long)get_cpu_info() ) + { +- unsigned long retptr = *(unsigned long *)regs->rsp; ++ unsigned long retaddr = *(unsigned long *)regs->rsp, fixup; + +- region = find_text_region(retptr); +- retptr = region && region->ex +- ? search_one_extable(region->ex, region->ex_end, retptr) +- : 0; +- if ( retptr ) ++ region = find_text_region(retaddr); ++ fixup = region && region->ex ++ ? search_one_extable(region->ex, region->ex_end, retaddr) ++ : 0; ++ if ( fixup ) + { + /* + * Put trap number and error code on the stack (in place of the +@@ -117,7 +120,8 @@ search_exception_table(const struct cpu_user_regs *regs) + }; + + *(unsigned long *)regs->rsp = token.raw; +- return retptr; ++ *stub_ra = retaddr; ++ return fixup; + } + } + +diff --git a/xen/arch/x86/include/asm/uaccess.h b/xen/arch/x86/include/asm/uaccess.h +index 684fccd95c..74bb222c03 100644 +--- a/xen/arch/x86/include/asm/uaccess.h ++++ b/xen/arch/x86/include/asm/uaccess.h +@@ -421,7 +421,8 @@ union stub_exception_token { + unsigned long raw; + }; + +-extern unsigned long search_exception_table(const struct cpu_user_regs *regs); ++extern unsigned long search_exception_table(const struct cpu_user_regs *regs, ++ unsigned long *stub_ra); + extern void sort_exception_tables(void); + extern void sort_exception_table(struct exception_table_entry *start, + const struct exception_table_entry *stop); +diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c +index 06c4f3868b..7599bee361 100644 +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -856,7 +856,7 @@ void do_unhandled_trap(struct cpu_user_regs *regs) + } + + static void fixup_exception_return(struct cpu_user_regs *regs, +- unsigned long fixup) ++ unsigned long fixup, unsigned long stub_ra) + { + if ( IS_ENABLED(CONFIG_XEN_SHSTK) ) + { +@@ -873,7 +873,8 @@ static void fixup_exception_return(struct cpu_user_regs *regs, + /* + * Search for %rip. 
The shstk currently looks like this: + * +- * ... [Likely pointed to by SSP] ++ * tok [Supervisor token, == &tok | BUSY, only with FRED inactive] ++ * ... [Pointed to by SSP for most exceptions, empty in IST cases] + * %cs [== regs->cs] + * %rip [== regs->rip] + * SSP [Likely points to 3 slots higher, above %cs] +@@ -891,7 +892,56 @@ static void fixup_exception_return(struct cpu_user_regs *regs, + */ + if ( ptr[0] == regs->rip && ptr[1] == regs->cs ) + { ++ unsigned long primary_shstk = ++ (ssp & ~(STACK_SIZE - 1)) + ++ (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8; ++ + wrss(fixup, ptr); ++ ++ if ( !stub_ra ) ++ goto shstk_done; ++ ++ /* ++ * Stub recovery ought to happen only when the outer context ++ * was on the main shadow stack. We need to also "pop" the ++ * stub's return address from the interrupted context's shadow ++ * stack. That is, ++ * - if we're still on the main stack, we need to move the ++ * entire stack (up to and including the exception frame) ++ * up by one slot, incrementing the original SSP in the ++ * exception frame, ++ * - if we're on an IST stack, we need to increment the ++ * original SSP. ++ */ ++ BUG_ON((ptr[-1] ^ primary_shstk) >> PAGE_SHIFT); ++ ++ if ( (ssp ^ primary_shstk) >> PAGE_SHIFT ) ++ { ++ /* ++ * We're on an IST stack. First make sure the two return ++ * addresses actually match. Then increment the interrupted ++ * context's SSP. ++ */ ++ BUG_ON(stub_ra != *(unsigned long*)ptr[-1]); ++ wrss(ptr[-1] + 8, &ptr[-1]); ++ goto shstk_done; ++ } ++ ++ /* Make sure the two return addresses actually match. */ ++ BUG_ON(stub_ra != ptr[2]); ++ ++ /* Move exception frame, updating SSP there. */ ++ wrss(ptr[1], &ptr[2]); /* %cs */ ++ wrss(ptr[0], &ptr[1]); /* %rip */ ++ wrss(ptr[-1] + 8, &ptr[0]); /* SSP */ ++ ++ /* Move all newer entries. */ ++ while ( --ptr != _p(ssp) ) ++ wrss(ptr[-1], &ptr[0]); ++ ++ /* Finally account for our own stack having shifted up. 
*/ ++ asm volatile ( "incsspd %0" :: "r" (2) ); ++ + goto shstk_done; + } + } +@@ -912,7 +962,8 @@ static void fixup_exception_return(struct cpu_user_regs *regs, + + static bool extable_fixup(struct cpu_user_regs *regs, bool print) + { +- unsigned long fixup = search_exception_table(regs); ++ unsigned long stub_ra = 0; ++ unsigned long fixup = search_exception_table(regs, &stub_ra); + + if ( unlikely(fixup == 0) ) + return false; +@@ -926,7 +977,7 @@ static bool extable_fixup(struct cpu_user_regs *regs, bool print) + vector_name(regs->entry_vector), regs->error_code, + _p(regs->rip), _p(regs->rip), _p(fixup)); + +- fixup_exception_return(regs, fixup); ++ fixup_exception_return(regs, fixup, stub_ra); + this_cpu(last_extable_addr) = regs->rip; + + return true; +@@ -1214,7 +1265,7 @@ void do_invalid_op(struct cpu_user_regs *regs) + void (*fn)(struct cpu_user_regs *) = bug_ptr(bug); + + fn(regs); +- fixup_exception_return(regs, (unsigned long)eip); ++ fixup_exception_return(regs, (unsigned long)eip, 0); + return; + } + +@@ -1235,7 +1286,7 @@ void do_invalid_op(struct cpu_user_regs *regs) + case BUGFRAME_warn: + printk("Xen WARN at %s%s:%d\n", prefix, filename, lineno); + show_execution_state(regs); +- fixup_exception_return(regs, (unsigned long)eip); ++ fixup_exception_return(regs, (unsigned long)eip, 0); + return; + + case BUGFRAME_bug: +-- +2.44.0 + diff --git a/0024-xen-arm-Fix-UBSAN-failure-in-start_xen.patch b/0024-xen-arm-Fix-UBSAN-failure-in-start_xen.patch new file mode 100644 index 0000000..7bdd651 --- /dev/null +++ b/0024-xen-arm-Fix-UBSAN-failure-in-start_xen.patch @@ -0,0 +1,52 @@ +From 6cbccc4071ef49a8c591ecaddfdcb1cc26d28411 Mon Sep 17 00:00:00 2001 +From: Michal Orzel +Date: Thu, 8 Feb 2024 11:43:39 +0100 +Subject: [PATCH 24/67] xen/arm: Fix UBSAN failure in start_xen() + +When running Xen on arm32, in scenario where Xen is loaded at an address +such as boot_phys_offset >= 2GB, UBSAN reports the following: + +(XEN) UBSAN: Undefined behaviour in arch/arm/setup.c:739:58 +(XEN) pointer operation underflowed 00200000 to 86800000 +(XEN) Xen WARN at common/ubsan/ubsan.c:172 +(XEN) ----[ Xen-4.19-unstable arm32 debug=y ubsan=y Not tainted ]---- +... +(XEN) Xen call trace: +(XEN) [<0031b4c0>] ubsan.c#ubsan_epilogue+0x18/0xf0 (PC) +(XEN) [<0031d134>] __ubsan_handle_pointer_overflow+0xb8/0xd4 (LR) +(XEN) [<0031d134>] __ubsan_handle_pointer_overflow+0xb8/0xd4 +(XEN) [<004d15a8>] start_xen+0xe0/0xbe0 +(XEN) [<0020007c>] head.o#primary_switched+0x4/0x30 + +The failure is reported for the following line: +(paddr_t)(uintptr_t)(_start + boot_phys_offset) + +This occurs because the compiler treats (ptr + size) with size bigger than +PTRDIFF_MAX as undefined behavior. To address this, switch to macro +virt_to_maddr(), given the future plans to eliminate boot_phys_offset. + +Signed-off-by: Michal Orzel +Reviewed-by: Luca Fancellu +Tested-by: Luca Fancellu +Acked-by: Julien Grall +(cherry picked from commit e11f5766503c0ff074b4e0f888bbfc931518a169) +--- + xen/arch/arm/setup.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c +index 4395640019..9ee19c2bc1 100644 +--- a/xen/arch/arm/setup.c ++++ b/xen/arch/arm/setup.c +@@ -1025,7 +1025,7 @@ void __init start_xen(unsigned long boot_phys_offset, + + /* Register Xen's load address as a boot module. 
*/ + xen_bootmodule = add_boot_module(BOOTMOD_XEN, +- (paddr_t)(uintptr_t)(_start + boot_phys_offset), ++ virt_to_maddr(_start), + (paddr_t)(uintptr_t)(_end - _start), false); + BUG_ON(!xen_bootmodule); + +-- +2.44.0 + diff --git a/0025-x86-HVM-hide-SVM-VMX-when-their-enabling-is-prohibit.patch b/0025-x86-HVM-hide-SVM-VMX-when-their-enabling-is-prohibit.patch new file mode 100644 index 0000000..28e489b --- /dev/null +++ b/0025-x86-HVM-hide-SVM-VMX-when-their-enabling-is-prohibit.patch @@ -0,0 +1,67 @@ +From 9c0d518eb8dc69430e6a8d767bd101dad19b846a Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Tue, 5 Mar 2024 11:56:31 +0100 +Subject: [PATCH 25/67] x86/HVM: hide SVM/VMX when their enabling is prohibited + by firmware +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +... or we fail to enable the functionality on the BSP for other reasons. +The only place where hardware announcing the feature is recorded is the +raw CPU policy/featureset. + +Inspired by https://lore.kernel.org/all/20230921114940.957141-1-pbonzini@redhat.com/. + +Signed-off-by: Jan Beulich +Acked-by: Roger Pau Monné +master commit: 0b5f149338e35a795bf609ce584640b0977f9e6c +master date: 2024-01-09 14:06:34 +0100 +--- + xen/arch/x86/hvm/svm/svm.c | 1 + + xen/arch/x86/hvm/vmx/vmcs.c | 17 +++++++++++++++++ + 2 files changed, 18 insertions(+) + +diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c +index fd32600ae3..3c17464550 100644 +--- a/xen/arch/x86/hvm/svm/svm.c ++++ b/xen/arch/x86/hvm/svm/svm.c +@@ -1669,6 +1669,7 @@ const struct hvm_function_table * __init start_svm(void) + + if ( _svm_cpu_up(true) ) + { ++ setup_clear_cpu_cap(X86_FEATURE_SVM); + printk("SVM: failed to initialise.\n"); + return NULL; + } +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index bcbecc6945..b5ecc51b43 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -2163,6 +2163,23 @@ int __init vmx_vmcs_init(void) + + if ( !ret ) + register_keyhandler('v', vmcs_dump, "dump VT-x VMCSs", 1); ++ else ++ { ++ setup_clear_cpu_cap(X86_FEATURE_VMX); ++ ++ /* ++ * _vmx_vcpu_up() may have made it past feature identification. ++ * Make sure all dependent features are off as well. ++ */ ++ vmx_basic_msr = 0; ++ vmx_pin_based_exec_control = 0; ++ vmx_cpu_based_exec_control = 0; ++ vmx_secondary_exec_control = 0; ++ vmx_vmexit_control = 0; ++ vmx_vmentry_control = 0; ++ vmx_ept_vpid_cap = 0; ++ vmx_vmfunc = 0; ++ } + + return ret; + } +-- +2.44.0 + diff --git a/0026-xen-sched-Fix-UB-shift-in-compat_set_timer_op.patch b/0026-xen-sched-Fix-UB-shift-in-compat_set_timer_op.patch new file mode 100644 index 0000000..4b051ea --- /dev/null +++ b/0026-xen-sched-Fix-UB-shift-in-compat_set_timer_op.patch @@ -0,0 +1,86 @@ +From b75bee183210318150e678e14b35224d7c73edb6 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 5 Mar 2024 11:57:02 +0100 +Subject: [PATCH 26/67] xen/sched: Fix UB shift in compat_set_timer_op() + +Tamas reported this UBSAN failure from fuzzing: + + (XEN) ================================================================================ + (XEN) UBSAN: Undefined behaviour in common/sched/compat.c:48:37 + (XEN) left shift of negative value -2147425536 + (XEN) ----[ Xen-4.19-unstable x86_64 debug=y ubsan=y Not tainted ]---- + ... 
+ (XEN) Xen call trace: + (XEN) [] R ubsan.c#ubsan_epilogue+0xa/0xd9 + (XEN) [] F __ubsan_handle_shift_out_of_bounds+0x11a/0x1c5 + (XEN) [] F compat_set_timer_op+0x41/0x43 + (XEN) [] F hvm_do_multicall_call+0x77f/0xa75 + (XEN) [] F arch_do_multicall_call+0xec/0xf1 + (XEN) [] F do_multicall+0x1dc/0xde3 + (XEN) [] F hvm_hypercall+0xa00/0x149a + (XEN) [] F vmx_vmexit_handler+0x1596/0x279c + (XEN) [] F vmx_asm_vmexit_handler+0xdb/0x200 + +Left-shifting any negative value is strictly undefined behaviour in C, and +the two parameters here come straight from the guest. + +The fuzzer happened to choose lo 0xf, hi 0x8000e300. + +Switch everything to be unsigned values, making the shift well defined. + +As GCC documents: + + As an extension to the C language, GCC does not use the latitude given in + C99 and C11 only to treat certain aspects of signed '<<' as undefined. + However, -fsanitize=shift (and -fsanitize=undefined) will diagnose such + cases. + +this was deemed not to need an XSA. + +Note: The unsigned -> signed conversion for do_set_timer_op()'s s_time_t +parameter is also well defined. C makes it implementation defined, and GCC +defines it as reduction modulo 2^N to be within range of the new type. + +Fixes: 2942f45e09fb ("Enable compatibility mode operation for HYPERVISOR_sched_op and HYPERVISOR_set_timer_op.") +Reported-by: Tamas K Lengyel +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: ae6d4fd876765e6d623eec67d14f5d0464be09cb +master date: 2024-02-01 19:52:44 +0000 +--- + xen/common/sched/compat.c | 4 ++-- + xen/include/hypercall-defs.c | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/xen/common/sched/compat.c b/xen/common/sched/compat.c +index 040b4caca2..b827fdecb8 100644 +--- a/xen/common/sched/compat.c ++++ b/xen/common/sched/compat.c +@@ -39,9 +39,9 @@ static int compat_poll(struct compat_sched_poll *compat) + + #include "core.c" + +-int compat_set_timer_op(u32 lo, s32 hi) ++int compat_set_timer_op(uint32_t lo, uint32_t hi) + { +- return do_set_timer_op(((s64)hi << 32) | lo); ++ return do_set_timer_op(((uint64_t)hi << 32) | lo); + } + + /* +diff --git a/xen/include/hypercall-defs.c b/xen/include/hypercall-defs.c +index 1896121074..c442dee284 100644 +--- a/xen/include/hypercall-defs.c ++++ b/xen/include/hypercall-defs.c +@@ -127,7 +127,7 @@ xenoprof_op(int op, void *arg) + + #ifdef CONFIG_COMPAT + prefix: compat +-set_timer_op(uint32_t lo, int32_t hi) ++set_timer_op(uint32_t lo, uint32_t hi) + multicall(multicall_entry_compat_t *call_list, uint32_t nr_calls) + memory_op(unsigned int cmd, void *arg) + #ifdef CONFIG_IOREQ_SERVER +-- +2.44.0 + diff --git a/0027-x86-spec-print-the-built-in-SPECULATIVE_HARDEN_-opti.patch b/0027-x86-spec-print-the-built-in-SPECULATIVE_HARDEN_-opti.patch new file mode 100644 index 0000000..845247a --- /dev/null +++ b/0027-x86-spec-print-the-built-in-SPECULATIVE_HARDEN_-opti.patch @@ -0,0 +1,54 @@ +From 76ea2aab3652cc34e474de0905f0a9cd4df7d087 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:57:41 +0100 +Subject: [PATCH 27/67] x86/spec: print the built-in SPECULATIVE_HARDEN_* + options +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Just like it's done for INDIRECT_THUNK and SHADOW_PAGING. 
+ +Reported-by: Jan Beulich +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: 6e9507f7d51fe49df8bc70f83e49ce06c92e4e54 +master date: 2024-02-27 14:57:52 +0100 +--- + xen/arch/x86/spec_ctrl.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 661716d695..93f1cf3bb5 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -488,13 +488,25 @@ static void __init print_details(enum ind_thunk thunk) + (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : ""); + + /* Compiled-in support which pertains to mitigations. */ +- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) ++ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) || ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_ARRAY) || ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) || ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) ) + printk(" Compiled-in support:" + #ifdef CONFIG_INDIRECT_THUNK + " INDIRECT_THUNK" + #endif + #ifdef CONFIG_SHADOW_PAGING + " SHADOW_PAGING" ++#endif ++#ifdef CONFIG_SPECULATIVE_HARDEN_ARRAY ++ " HARDEN_ARRAY" ++#endif ++#ifdef CONFIG_SPECULATIVE_HARDEN_BRANCH ++ " HARDEN_BRANCH" ++#endif ++#ifdef CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS ++ " HARDEN_GUEST_ACCESS" + #endif + "\n"); + +-- +2.44.0 + diff --git a/0028-x86-spec-fix-INDIRECT_THUNK-option-to-only-be-set-wh.patch b/0028-x86-spec-fix-INDIRECT_THUNK-option-to-only-be-set-wh.patch new file mode 100644 index 0000000..dfbf516 --- /dev/null +++ b/0028-x86-spec-fix-INDIRECT_THUNK-option-to-only-be-set-wh.patch @@ -0,0 +1,67 @@ +From 693455c3c370e535eb6cd065800ff91e147815fa Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:58:04 +0100 +Subject: [PATCH 28/67] x86/spec: fix INDIRECT_THUNK option to only be set when + build-enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Attempt to provide a more helpful error message when the user attempts to set +spec-ctrl=bti-thunk option but the support is build-time disabled. + +While there also adjust the command line documentation to mention +CONFIG_INDIRECT_THUNK instead of INDIRECT_THUNK. + +Reported-by: Andrew Cooper +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: 8441fa806a3b778867867cd0159fa1722e90397e +master date: 2024-02-27 14:58:20 +0100 +--- + docs/misc/xen-command-line.pandoc | 10 +++++----- + xen/arch/x86/spec_ctrl.c | 7 ++++++- + 2 files changed, 11 insertions(+), 6 deletions(-) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index 05f613c71c..2006697226 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2378,11 +2378,11 @@ guests to use. + performance reasons dom0 is unprotected by default. If it is necessary to + protect dom0 too, boot with `spec-ctrl=ibpb-entry`. + +-If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to +-select which of the thunks gets patched into the `__x86_indirect_thunk_%reg` +-locations. The default thunk is `retpoline` (generally preferred), with the +-alternatives being `jmp` (a `jmp *%reg` gadget, minimal overhead), and +-`lfence` (an `lfence; jmp *%reg` gadget). ++If Xen was compiled with `CONFIG_INDIRECT_THUNK` support, `bti-thunk=` can be ++used to select which of the thunks gets patched into the ++`__x86_indirect_thunk_%reg` locations. 
The default thunk is `retpoline` ++(generally preferred), with the alternatives being `jmp` (a `jmp *%reg` gadget, ++minimal overhead), and `lfence` (an `lfence; jmp *%reg` gadget). + + On hardware supporting IBRS (Indirect Branch Restricted Speculation), the + `ibrs=` option can be used to force or prevent Xen using the feature itself. +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 93f1cf3bb5..098fa3184d 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -253,7 +253,12 @@ static int __init cf_check parse_spec_ctrl(const char *s) + { + s += 10; + +- if ( !cmdline_strcmp(s, "retpoline") ) ++ if ( !IS_ENABLED(CONFIG_INDIRECT_THUNK) ) ++ { ++ no_config_param("INDIRECT_THUNK", "spec-ctrl", s - 10, ss); ++ rc = -EINVAL; ++ } ++ else if ( !cmdline_strcmp(s, "retpoline") ) + opt_thunk = THUNK_RETPOLINE; + else if ( !cmdline_strcmp(s, "lfence") ) + opt_thunk = THUNK_LFENCE; +-- +2.44.0 + diff --git a/0029-x86-spec-do-not-print-thunk-option-selection-if-not-.patch b/0029-x86-spec-do-not-print-thunk-option-selection-if-not-.patch new file mode 100644 index 0000000..71e6633 --- /dev/null +++ b/0029-x86-spec-do-not-print-thunk-option-selection-if-not-.patch @@ -0,0 +1,50 @@ +From 0ce25b46ab2fb53a1b58f7682ca14971453f4f2c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:58:36 +0100 +Subject: [PATCH 29/67] x86/spec: do not print thunk option selection if not + built-in +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Since the thunk built-in enable is printed as part of the "Compiled-in +support:" line, avoid printing anything in "Xen settings:" if the thunk is +disabled at build time. + +Note the BTI-Thunk option printing is also adjusted to print a colon in the +same way the other options on the line do. + +Requested-by: Jan Beulich +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: 576528a2a742069af203e90c613c5c93e23c9755 +master date: 2024-02-27 14:58:40 +0100 +--- + xen/arch/x86/spec_ctrl.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 098fa3184d..25a18ac598 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -516,11 +516,12 @@ static void __init print_details(enum ind_thunk thunk) + "\n"); + + /* Settings for Xen's protection, irrespective of guests. */ +- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", +- thunk == THUNK_NONE ? "N/A" : +- thunk == THUNK_RETPOLINE ? "RETPOLINE" : +- thunk == THUNK_LFENCE ? "LFENCE" : +- thunk == THUNK_JMP ? "JMP" : "?", ++ printk(" Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", ++ thunk != THUNK_NONE ? "BTI-Thunk: " : "", ++ thunk == THUNK_NONE ? "" : ++ thunk == THUNK_RETPOLINE ? "RETPOLINE, " : ++ thunk == THUNK_LFENCE ? "LFENCE, " : ++ thunk == THUNK_JMP ? "JMP, " : "?, ", + (!boot_cpu_has(X86_FEATURE_IBRSB) && + !boot_cpu_has(X86_FEATURE_IBRS)) ? "No" : + (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? 
"IBRS+" : "IBRS-", +-- +2.44.0 + diff --git a/0030-xen-livepatch-register-livepatch-regions-when-loaded.patch b/0030-xen-livepatch-register-livepatch-regions-when-loaded.patch new file mode 100644 index 0000000..f521ecc --- /dev/null +++ b/0030-xen-livepatch-register-livepatch-regions-when-loaded.patch @@ -0,0 +1,159 @@ +From b11917de0cd261a878beaf50c18a689bde0b2f50 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:59:26 +0100 +Subject: [PATCH 30/67] xen/livepatch: register livepatch regions when loaded +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Currently livepatch regions are registered as virtual regions only after the +livepatch has been applied. + +This can lead to issues when using the pre-apply or post-revert hooks, as at +that point the livepatch is not in the virtual regions list. If a livepatch +pre-apply hook contains a WARN() it would trigger an hypervisor crash, as the +code to handle the bug frame won't be able to find the instruction pointer that +triggered the #UD in any of the registered virtual regions, and hence crash. + +Fix this by adding the livepatch payloads as virtual regions as soon as loaded, +and only remove them once the payload is unloaded. This requires some changes +to the virtual regions code, as the removal of the virtual regions is no longer +done in stop machine context, and hence an RCU barrier is added in order to +make sure there are no users of the virtual region after it's been removed from +the list. + +Fixes: 8313c864fa95 ('livepatch: Implement pre-|post- apply|revert hooks') +Signed-off-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: a57b4074ab39bee78b6c116277f0a9963bd8e687 +master date: 2024-02-28 16:57:25 +0000 +--- + xen/common/livepatch.c | 4 ++-- + xen/common/virtual_region.c | 44 ++++++++++++++----------------------- + 2 files changed, 19 insertions(+), 29 deletions(-) + +diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c +index c2ae84d18b..537e9f33e4 100644 +--- a/xen/common/livepatch.c ++++ b/xen/common/livepatch.c +@@ -1015,6 +1015,7 @@ static int build_symbol_table(struct payload *payload, + static void free_payload(struct payload *data) + { + ASSERT(spin_is_locked(&payload_lock)); ++ unregister_virtual_region(&data->region); + list_del(&data->list); + payload_cnt--; + payload_version++; +@@ -1114,6 +1115,7 @@ static int livepatch_upload(struct xen_sysctl_livepatch_upload *upload) + INIT_LIST_HEAD(&data->list); + INIT_LIST_HEAD(&data->applied_list); + ++ register_virtual_region(&data->region); + list_add_tail(&data->list, &payload_list); + payload_cnt++; + payload_version++; +@@ -1330,7 +1332,6 @@ static inline void apply_payload_tail(struct payload *data) + * The applied_list is iterated by the trap code. + */ + list_add_tail_rcu(&data->applied_list, &applied_list); +- register_virtual_region(&data->region); + + data->state = LIVEPATCH_STATE_APPLIED; + } +@@ -1376,7 +1377,6 @@ static inline void revert_payload_tail(struct payload *data) + * The applied_list is iterated by the trap code. 
+ */ + list_del_rcu(&data->applied_list); +- unregister_virtual_region(&data->region); + + data->reverted = true; + data->state = LIVEPATCH_STATE_CHECKED; +diff --git a/xen/common/virtual_region.c b/xen/common/virtual_region.c +index 5f89703f51..9f12c30efe 100644 +--- a/xen/common/virtual_region.c ++++ b/xen/common/virtual_region.c +@@ -23,14 +23,8 @@ static struct virtual_region core_init __initdata = { + }; + + /* +- * RCU locking. Additions are done either at startup (when there is only +- * one CPU) or when all CPUs are running without IRQs. +- * +- * Deletions are bit tricky. We do it when Live Patch (all CPUs running +- * without IRQs) or during bootup (when clearing the init). +- * +- * Hence we use list_del_rcu (which sports an memory fence) and a spinlock +- * on deletion. ++ * RCU locking. Modifications to the list must be done in exclusive mode, and ++ * hence need to hold the spinlock. + * + * All readers of virtual_region_list MUST use list_for_each_entry_rcu. + */ +@@ -58,41 +52,36 @@ const struct virtual_region *find_text_region(unsigned long addr) + + void register_virtual_region(struct virtual_region *r) + { +- ASSERT(!local_irq_is_enabled()); ++ unsigned long flags; + ++ spin_lock_irqsave(&virtual_region_lock, flags); + list_add_tail_rcu(&r->list, &virtual_region_list); ++ spin_unlock_irqrestore(&virtual_region_lock, flags); + } + +-static void remove_virtual_region(struct virtual_region *r) ++/* ++ * Suggest inline so when !CONFIG_LIVEPATCH the function is not left ++ * unreachable after init code is removed. ++ */ ++static void inline remove_virtual_region(struct virtual_region *r) + { + unsigned long flags; + + spin_lock_irqsave(&virtual_region_lock, flags); + list_del_rcu(&r->list); + spin_unlock_irqrestore(&virtual_region_lock, flags); +- /* +- * We do not need to invoke call_rcu. +- * +- * This is due to the fact that on the deletion we have made sure +- * to use spinlocks (to guard against somebody else calling +- * unregister_virtual_region) and list_deletion spiced with +- * memory barrier. +- * +- * That protects us from corrupting the list as the readers all +- * use list_for_each_entry_rcu which is safe against concurrent +- * deletions. +- */ + } + ++#ifdef CONFIG_LIVEPATCH + void unregister_virtual_region(struct virtual_region *r) + { +- /* Expected to be called from Live Patch - which has IRQs disabled. */ +- ASSERT(!local_irq_is_enabled()); +- + remove_virtual_region(r); ++ ++ /* Assert that no CPU might be using the removed region. 
*/ ++ rcu_barrier(); + } + +-#if defined(CONFIG_LIVEPATCH) && defined(CONFIG_X86) ++#ifdef CONFIG_X86 + void relax_virtual_region_perms(void) + { + const struct virtual_region *region; +@@ -116,7 +105,8 @@ void tighten_virtual_region_perms(void) + PAGE_HYPERVISOR_RX); + rcu_read_unlock(&rcu_virtual_region_lock); + } +-#endif ++#endif /* CONFIG_X86 */ ++#endif /* CONFIG_LIVEPATCH */ + + void __init unregister_init_virtual_region(void) + { +-- +2.44.0 + diff --git a/0031-xen-livepatch-search-for-symbols-in-all-loaded-paylo.patch b/0031-xen-livepatch-search-for-symbols-in-all-loaded-paylo.patch new file mode 100644 index 0000000..c778639 --- /dev/null +++ b/0031-xen-livepatch-search-for-symbols-in-all-loaded-paylo.patch @@ -0,0 +1,149 @@ +From c54cf903b06fb1933fad053cc547580c92c856ea Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:59:35 +0100 +Subject: [PATCH 31/67] xen/livepatch: search for symbols in all loaded + payloads +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When checking if an address belongs to a patch, or when resolving a symbol, +take into account all loaded livepatch payloads, even if not applied. + +This is required in order for the pre-apply and post-revert hooks to work +properly, or else Xen won't detect the instruction pointer belonging to those +hooks as being part of the currently active text. + +Move the RCU handling to be used for payload_list instead of applied_list, as +now the calls from trap code will iterate over the payload_list. + +Fixes: 8313c864fa95 ('livepatch: Implement pre-|post- apply|revert hooks') +Signed-off-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: d2daa40fb3ddb8f83e238e57854bd878924cde90 +master date: 2024-02-28 16:57:25 +0000 +--- + xen/common/livepatch.c | 49 +++++++++++++++--------------------------- + 1 file changed, 17 insertions(+), 32 deletions(-) + +diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c +index 537e9f33e4..a129ab9973 100644 +--- a/xen/common/livepatch.c ++++ b/xen/common/livepatch.c +@@ -36,13 +36,14 @@ + * caller in schedule_work. + */ + static DEFINE_SPINLOCK(payload_lock); +-static LIST_HEAD(payload_list); +- + /* +- * Patches which have been applied. Need RCU in case we crash (and then +- * traps code would iterate via applied_list) when adding entries on the list. ++ * Need RCU in case we crash (and then traps code would iterate via ++ * payload_list) when adding entries on the list. + */ +-static DEFINE_RCU_READ_LOCK(rcu_applied_lock); ++static DEFINE_RCU_READ_LOCK(rcu_payload_lock); ++static LIST_HEAD(payload_list); ++ ++/* Patches which have been applied. Only modified from stop machine context. */ + static LIST_HEAD(applied_list); + + static unsigned int payload_cnt; +@@ -111,12 +112,8 @@ bool_t is_patch(const void *ptr) + const struct payload *data; + bool_t r = 0; + +- /* +- * Only RCU locking since this list is only ever changed during apply +- * or revert context. And in case it dies there we need an safe list. 
+- */ +- rcu_read_lock(&rcu_applied_lock); +- list_for_each_entry_rcu ( data, &applied_list, applied_list ) ++ rcu_read_lock(&rcu_payload_lock); ++ list_for_each_entry_rcu ( data, &payload_list, list ) + { + if ( (ptr >= data->rw_addr && + ptr < (data->rw_addr + data->rw_size)) || +@@ -130,7 +127,7 @@ bool_t is_patch(const void *ptr) + } + + } +- rcu_read_unlock(&rcu_applied_lock); ++ rcu_read_unlock(&rcu_payload_lock); + + return r; + } +@@ -166,12 +163,8 @@ static const char *cf_check livepatch_symbols_lookup( + const void *va = (const void *)addr; + const char *n = NULL; + +- /* +- * Only RCU locking since this list is only ever changed during apply +- * or revert context. And in case it dies there we need an safe list. +- */ +- rcu_read_lock(&rcu_applied_lock); +- list_for_each_entry_rcu ( data, &applied_list, applied_list ) ++ rcu_read_lock(&rcu_payload_lock); ++ list_for_each_entry_rcu ( data, &payload_list, list ) + { + if ( va < data->text_addr || + va >= (data->text_addr + data->text_size) ) +@@ -200,7 +193,7 @@ static const char *cf_check livepatch_symbols_lookup( + n = data->symtab[best].name; + break; + } +- rcu_read_unlock(&rcu_applied_lock); ++ rcu_read_unlock(&rcu_payload_lock); + + return n; + } +@@ -1016,7 +1009,8 @@ static void free_payload(struct payload *data) + { + ASSERT(spin_is_locked(&payload_lock)); + unregister_virtual_region(&data->region); +- list_del(&data->list); ++ list_del_rcu(&data->list); ++ rcu_barrier(); + payload_cnt--; + payload_version++; + free_payload_data(data); +@@ -1116,7 +1110,7 @@ static int livepatch_upload(struct xen_sysctl_livepatch_upload *upload) + INIT_LIST_HEAD(&data->applied_list); + + register_virtual_region(&data->region); +- list_add_tail(&data->list, &payload_list); ++ list_add_tail_rcu(&data->list, &payload_list); + payload_cnt++; + payload_version++; + } +@@ -1327,11 +1321,7 @@ static int apply_payload(struct payload *data) + + static inline void apply_payload_tail(struct payload *data) + { +- /* +- * We need RCU variant (which has barriers) in case we crash here. +- * The applied_list is iterated by the trap code. +- */ +- list_add_tail_rcu(&data->applied_list, &applied_list); ++ list_add_tail(&data->applied_list, &applied_list); + + data->state = LIVEPATCH_STATE_APPLIED; + } +@@ -1371,12 +1361,7 @@ static int revert_payload(struct payload *data) + + static inline void revert_payload_tail(struct payload *data) + { +- +- /* +- * We need RCU variant (which has barriers) in case we crash here. +- * The applied_list is iterated by the trap code. +- */ +- list_del_rcu(&data->applied_list); ++ list_del(&data->applied_list); + + data->reverted = true; + data->state = LIVEPATCH_STATE_CHECKED; +-- +2.44.0 + diff --git a/0032-xen-livepatch-fix-norevert-test-attempt-to-open-code.patch b/0032-xen-livepatch-fix-norevert-test-attempt-to-open-code.patch new file mode 100644 index 0000000..76af9ef --- /dev/null +++ b/0032-xen-livepatch-fix-norevert-test-attempt-to-open-code.patch @@ -0,0 +1,186 @@ +From 5564323f643715f9d364df88e0eb9c7d6fd2c22b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:59:43 +0100 +Subject: [PATCH 32/67] xen/livepatch: fix norevert test attempt to open-code + revert +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The purpose of the norevert test is to install a dummy handler that replaces +the internal Xen revert code, and then perform the revert in the post-revert +hook. 
For that purpose the usage of the previous common_livepatch_revert() is +not enough, as that just reverts specific functions, but not the whole state of +the payload. + +Remove both common_livepatch_{apply,revert}() and instead expose +revert_payload{,_tail}() in order to perform the patch revert from the +post-revert hook. + +Fixes: 6047104c3ccc ('livepatch: Add per-function applied/reverted state tracking marker') +Signed-off-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: cdae267ce10d04d71d1687b5701ff2911a96b6dc +master date: 2024-02-28 16:57:25 +0000 +--- + xen/common/livepatch.c | 41 +++++++++++++++++-- + xen/include/xen/livepatch.h | 32 ++------------- + .../livepatch/xen_action_hooks_norevert.c | 22 +++------- + 3 files changed, 46 insertions(+), 49 deletions(-) + +diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c +index a129ab9973..a5068a2217 100644 +--- a/xen/common/livepatch.c ++++ b/xen/common/livepatch.c +@@ -1310,7 +1310,22 @@ static int apply_payload(struct payload *data) + ASSERT(!local_irq_is_enabled()); + + for ( i = 0; i < data->nfuncs; i++ ) +- common_livepatch_apply(&data->funcs[i], &data->fstate[i]); ++ { ++ const struct livepatch_func *func = &data->funcs[i]; ++ struct livepatch_fstate *state = &data->fstate[i]; ++ ++ /* If the action has been already executed on this function, do nothing. */ ++ if ( state->applied == LIVEPATCH_FUNC_APPLIED ) ++ { ++ printk(XENLOG_WARNING LIVEPATCH ++ "%s: %s has been already applied before\n", ++ __func__, func->name); ++ continue; ++ } ++ ++ arch_livepatch_apply(func, state); ++ state->applied = LIVEPATCH_FUNC_APPLIED; ++ } + + arch_livepatch_revive(); + +@@ -1326,7 +1341,7 @@ static inline void apply_payload_tail(struct payload *data) + data->state = LIVEPATCH_STATE_APPLIED; + } + +-static int revert_payload(struct payload *data) ++int revert_payload(struct payload *data) + { + unsigned int i; + int rc; +@@ -1341,7 +1356,25 @@ static int revert_payload(struct payload *data) + } + + for ( i = 0; i < data->nfuncs; i++ ) +- common_livepatch_revert(&data->funcs[i], &data->fstate[i]); ++ { ++ const struct livepatch_func *func = &data->funcs[i]; ++ struct livepatch_fstate *state = &data->fstate[i]; ++ ++ /* ++ * If the apply action hasn't been executed on this function, do ++ * nothing. ++ */ ++ if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED ) ++ { ++ printk(XENLOG_WARNING LIVEPATCH ++ "%s: %s has not been applied before\n", ++ __func__, func->name); ++ continue; ++ } ++ ++ arch_livepatch_revert(func, state); ++ state->applied = LIVEPATCH_FUNC_NOT_APPLIED; ++ } + + /* + * Since we are running with IRQs disabled and the hooks may call common +@@ -1359,7 +1392,7 @@ static int revert_payload(struct payload *data) + return 0; + } + +-static inline void revert_payload_tail(struct payload *data) ++void revert_payload_tail(struct payload *data) + { + list_del(&data->applied_list); + +diff --git a/xen/include/xen/livepatch.h b/xen/include/xen/livepatch.h +index 537d3d58b6..c9ee58fd37 100644 +--- a/xen/include/xen/livepatch.h ++++ b/xen/include/xen/livepatch.h +@@ -136,35 +136,11 @@ void arch_livepatch_post_action(void); + void arch_livepatch_mask(void); + void arch_livepatch_unmask(void); + +-static inline void common_livepatch_apply(const struct livepatch_func *func, +- struct livepatch_fstate *state) +-{ +- /* If the action has been already executed on this function, do nothing. 
*/ +- if ( state->applied == LIVEPATCH_FUNC_APPLIED ) +- { +- printk(XENLOG_WARNING LIVEPATCH "%s: %s has been already applied before\n", +- __func__, func->name); +- return; +- } +- +- arch_livepatch_apply(func, state); +- state->applied = LIVEPATCH_FUNC_APPLIED; +-} ++/* Only for testing purposes. */ ++struct payload; ++int revert_payload(struct payload *data); ++void revert_payload_tail(struct payload *data); + +-static inline void common_livepatch_revert(const struct livepatch_func *func, +- struct livepatch_fstate *state) +-{ +- /* If the apply action hasn't been executed on this function, do nothing. */ +- if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED ) +- { +- printk(XENLOG_WARNING LIVEPATCH "%s: %s has not been applied before\n", +- __func__, func->name); +- return; +- } +- +- arch_livepatch_revert(func, state); +- state->applied = LIVEPATCH_FUNC_NOT_APPLIED; +-} + #else + + /* +diff --git a/xen/test/livepatch/xen_action_hooks_norevert.c b/xen/test/livepatch/xen_action_hooks_norevert.c +index c173855192..c5fbab1746 100644 +--- a/xen/test/livepatch/xen_action_hooks_norevert.c ++++ b/xen/test/livepatch/xen_action_hooks_norevert.c +@@ -96,26 +96,14 @@ static int revert_hook(livepatch_payload_t *payload) + + static void post_revert_hook(livepatch_payload_t *payload) + { +- int i; ++ unsigned long flags; + + printk(KERN_DEBUG "%s: Hook starting.\n", __func__); + +- for (i = 0; i < payload->nfuncs; i++) +- { +- const struct livepatch_func *func = &payload->funcs[i]; +- struct livepatch_fstate *fstate = &payload->fstate[i]; +- +- BUG_ON(revert_cnt != 1); +- BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); +- +- /* Outside of quiesce zone: MAY TRIGGER HOST CRASH/UNDEFINED BEHAVIOR */ +- arch_livepatch_quiesce(); +- common_livepatch_revert(payload); +- arch_livepatch_revive(); +- BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); +- +- printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name); +- } ++ local_irq_save(flags); ++ BUG_ON(revert_payload(payload)); ++ revert_payload_tail(payload); ++ local_irq_restore(flags); + + printk(KERN_DEBUG "%s: Hook done.\n", __func__); + } +-- +2.44.0 + diff --git a/0033-xen-livepatch-properly-build-the-noapply-and-norever.patch b/0033-xen-livepatch-properly-build-the-noapply-and-norever.patch new file mode 100644 index 0000000..76803c6 --- /dev/null +++ b/0033-xen-livepatch-properly-build-the-noapply-and-norever.patch @@ -0,0 +1,43 @@ +From a59106b27609b6ae2873bd6755949b1258290872 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:59:51 +0100 +Subject: [PATCH 33/67] xen/livepatch: properly build the noapply and norevert + tests +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +It seems the build variables for those tests where copy-pasted from +xen_action_hooks_marker-objs and not adjusted to use the correct source files. 
+ +Fixes: 6047104c3ccc ('livepatch: Add per-function applied/reverted state tracking marker') +Signed-off-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: e579677095782c7dec792597ba8b037b7d716b32 +master date: 2024-02-28 16:57:25 +0000 +--- + xen/test/livepatch/Makefile | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xen/test/livepatch/Makefile b/xen/test/livepatch/Makefile +index c258ab0b59..d987a8367f 100644 +--- a/xen/test/livepatch/Makefile ++++ b/xen/test/livepatch/Makefile +@@ -118,12 +118,12 @@ xen_action_hooks_marker-objs := xen_action_hooks_marker.o xen_hello_world_func.o + $(obj)/xen_action_hooks_noapply.o: $(obj)/config.h + + extra-y += xen_action_hooks_noapply.livepatch +-xen_action_hooks_noapply-objs := xen_action_hooks_marker.o xen_hello_world_func.o note.o xen_note.o ++xen_action_hooks_noapply-objs := xen_action_hooks_noapply.o xen_hello_world_func.o note.o xen_note.o + + $(obj)/xen_action_hooks_norevert.o: $(obj)/config.h + + extra-y += xen_action_hooks_norevert.livepatch +-xen_action_hooks_norevert-objs := xen_action_hooks_marker.o xen_hello_world_func.o note.o xen_note.o ++xen_action_hooks_norevert-objs := xen_action_hooks_norevert.o xen_hello_world_func.o note.o xen_note.o + + EXPECT_BYTES_COUNT := 8 + CODE_GET_EXPECT=$(shell $(OBJDUMP) -d --insn-width=1 $(1) | sed -n -e '/<'$(2)'>:$$/,/^$$/ p' | tail -n +2 | head -n $(EXPECT_BYTES_COUNT) | awk '{$$0=$$2; printf "%s", substr($$0,length-1)}' | sed 's/.\{2\}/0x&,/g' | sed 's/^/{/;s/,$$/}/g') +-- +2.44.0 + diff --git a/0034-libxl-Fix-segfault-in-device_model_spawn_outcome.patch b/0034-libxl-Fix-segfault-in-device_model_spawn_outcome.patch new file mode 100644 index 0000000..7f23a73 --- /dev/null +++ b/0034-libxl-Fix-segfault-in-device_model_spawn_outcome.patch @@ -0,0 +1,39 @@ +From c4ee68eda9937743527fff41f4ede0f6a3228080 Mon Sep 17 00:00:00 2001 +From: Jason Andryuk +Date: Tue, 5 Mar 2024 12:00:30 +0100 +Subject: [PATCH 34/67] libxl: Fix segfault in device_model_spawn_outcome + +libxl__spawn_qdisk_backend() explicitly sets guest_config to NULL when +starting QEMU (the usual launch through libxl__spawn_local_dm() has a +guest_config though). + +Bail early on a NULL guest_config/d_config. This skips the QMP queries +for chardevs and VNC, but this xenpv QEMU instance isn't expected to +provide those - only qdisk (or 9pfs backends after an upcoming change). + +Signed-off-by: Jason Andryuk +Acked-by: Anthony PERARD +master commit: d4f3d35f043f6ef29393166b0dd131c8102cf255 +master date: 2024-02-29 08:18:38 +0100 +--- + tools/libs/light/libxl_dm.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c +index ed620a9d8e..29b43ed20a 100644 +--- a/tools/libs/light/libxl_dm.c ++++ b/tools/libs/light/libxl_dm.c +@@ -3172,8 +3172,8 @@ static void device_model_spawn_outcome(libxl__egc *egc, + + /* Check if spawn failed */ + if (rc) goto out; +- +- if (d_config->b_info.device_model_version ++ /* d_config is NULL for xl devd/libxl__spawn_qemu_xenpv_backend(). 
*/ ++ if (d_config && d_config->b_info.device_model_version + == LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN) { + rc = libxl__ev_time_register_rel(ao, &dmss->timeout, + devise_model_postconfig_timeout, +-- +2.44.0 + diff --git a/0035-x86-altcall-always-use-a-temporary-parameter-stashin.patch b/0035-x86-altcall-always-use-a-temporary-parameter-stashin.patch new file mode 100644 index 0000000..177c73b --- /dev/null +++ b/0035-x86-altcall-always-use-a-temporary-parameter-stashin.patch @@ -0,0 +1,197 @@ +From 2f49d9f89c14519d4cb1e06ab8370cf4ba50fab7 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 12:00:47 +0100 +Subject: [PATCH 35/67] x86/altcall: always use a temporary parameter stashing + variable +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The usage in ALT_CALL_ARG() on clang of: + +register union { + typeof(arg) e; + const unsigned long r; +} ... + +When `arg` is the first argument to alternative_{,v}call() and +const_vlapic_vcpu() is used results in clang 3.5.0 complaining with: + +arch/x86/hvm/vlapic.c:141:47: error: non-const static data member must be initialized out of line + alternative_call(hvm_funcs.test_pir, const_vlapic_vcpu(vlapic), vec) ) + +Workaround this by pulling `arg1` into a local variable, like it's done for +further arguments (arg2, arg3...) + +Originally arg1 wasn't pulled into a variable because for the a1_ register +local variable the possible clobbering as a result of operators on other +variables don't matter: + +https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables + +Note clang version 3.8.1 seems to already be fixed and don't require the +workaround, but since it's harmless do it uniformly everywhere. 
+ +Reported-by: Andrew Cooper +Fixes: 2ce562b2a413 ('x86/altcall: use a union as register type for function parameters on clang') +Signed-off-by: Roger Pau Monné +Acked-by: Jan Beulich +master commit: c20850540ad6a32f4fc17bde9b01c92b0df18bf0 +master date: 2024-02-29 08:21:49 +0100 +--- + xen/arch/x86/include/asm/alternative.h | 36 +++++++++++++++++--------- + 1 file changed, 24 insertions(+), 12 deletions(-) + +diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h +index bcb1dc94f4..fa04481316 100644 +--- a/xen/arch/x86/include/asm/alternative.h ++++ b/xen/arch/x86/include/asm/alternative.h +@@ -253,21 +253,24 @@ extern void alternative_branches(void); + }) + + #define alternative_vcall1(func, arg) ({ \ +- ALT_CALL_ARG(arg, 1); \ ++ typeof(arg) v1_ = (arg); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_NO_ARG2; \ + (void)sizeof(func(arg)); \ + (void)alternative_callN(1, int, func); \ + }) + + #define alternative_call1(func, arg) ({ \ +- ALT_CALL_ARG(arg, 1); \ ++ typeof(arg) v1_ = (arg); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_NO_ARG2; \ + alternative_callN(1, typeof(func(arg)), func); \ + }) + + #define alternative_vcall2(func, arg1, arg2) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_NO_ARG3; \ + (void)sizeof(func(arg1, arg2)); \ +@@ -275,17 +278,19 @@ extern void alternative_branches(void); + }) + + #define alternative_call2(func, arg1, arg2) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_NO_ARG3; \ + alternative_callN(2, typeof(func(arg1, arg2)), func); \ + }) + + #define alternative_vcall3(func, arg1, arg2, arg3) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_NO_ARG4; \ +@@ -294,9 +299,10 @@ extern void alternative_branches(void); + }) + + #define alternative_call3(func, arg1, arg2, arg3) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_NO_ARG4; \ +@@ -305,10 +311,11 @@ extern void alternative_branches(void); + }) + + #define alternative_vcall4(func, arg1, arg2, arg3, arg4) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ + typeof(arg4) v4_ = (arg4); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_ARG(v4_, 4); \ +@@ -318,10 +325,11 @@ extern void alternative_branches(void); + }) + + #define alternative_call4(func, arg1, arg2, arg3, arg4) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ + typeof(arg4) v4_ = (arg4); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_ARG(v4_, 4); \ +@@ -332,11 +340,12 @@ extern void alternative_branches(void); + }) + + #define alternative_vcall5(func, arg1, arg2, arg3, arg4, arg5) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ + typeof(arg4) v4_ = (arg4); \ + typeof(arg5) v5_ = (arg5); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + 
ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_ARG(v4_, 4); \ +@@ -347,11 +356,12 @@ extern void alternative_branches(void); + }) + + #define alternative_call5(func, arg1, arg2, arg3, arg4, arg5) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ + typeof(arg4) v4_ = (arg4); \ + typeof(arg5) v5_ = (arg5); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_ARG(v4_, 4); \ +@@ -363,12 +373,13 @@ extern void alternative_branches(void); + }) + + #define alternative_vcall6(func, arg1, arg2, arg3, arg4, arg5, arg6) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ + typeof(arg4) v4_ = (arg4); \ + typeof(arg5) v5_ = (arg5); \ + typeof(arg6) v6_ = (arg6); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_ARG(v4_, 4); \ +@@ -379,12 +390,13 @@ extern void alternative_branches(void); + }) + + #define alternative_call6(func, arg1, arg2, arg3, arg4, arg5, arg6) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ + typeof(arg4) v4_ = (arg4); \ + typeof(arg5) v5_ = (arg5); \ + typeof(arg6) v6_ = (arg6); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_ARG(v4_, 4); \ +-- +2.44.0 + diff --git a/0036-x86-cpu-policy-Allow-for-levelling-of-VERW-side-effe.patch b/0036-x86-cpu-policy-Allow-for-levelling-of-VERW-side-effe.patch new file mode 100644 index 0000000..b91ff52 --- /dev/null +++ b/0036-x86-cpu-policy-Allow-for-levelling-of-VERW-side-effe.patch @@ -0,0 +1,102 @@ +From 54dacb5c02cba4676879ed077765734326b78e39 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 5 Mar 2024 12:01:22 +0100 +Subject: [PATCH 36/67] x86/cpu-policy: Allow for levelling of VERW side + effects +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +MD_CLEAR and FB_CLEAR need OR-ing across a migrate pool. Allow this, by +having them unconditinally set in max, with the host values reflected in +default. Annotate the bits as having special properies. + +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +master commit: de17162cafd27f2865a3102a2ec0f386a02ed03d +master date: 2024-03-01 20:14:19 +0000 +--- + xen/arch/x86/cpu-policy.c | 24 +++++++++++++++++++++ + xen/arch/x86/include/asm/cpufeature.h | 1 + + xen/include/public/arch-x86/cpufeatureset.h | 4 ++-- + 3 files changed, 27 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c +index f0f2c8a1c0..7b875a7221 100644 +--- a/xen/arch/x86/cpu-policy.c ++++ b/xen/arch/x86/cpu-policy.c +@@ -435,6 +435,16 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs) + __set_bit(X86_FEATURE_RSBA, fs); + __set_bit(X86_FEATURE_RRSBA, fs); + ++ /* ++ * These bits indicate that the VERW instruction may have gained ++ * scrubbing side effects. With pooling, they mean "you might migrate ++ * somewhere where scrubbing is necessary", and may need exposing on ++ * unaffected hardware. This is fine, because the VERW instruction ++ * has been around since the 286. ++ */ ++ __set_bit(X86_FEATURE_MD_CLEAR, fs); ++ __set_bit(X86_FEATURE_FB_CLEAR, fs); ++ + /* + * The Gather Data Sampling microcode mitigation (August 2023) has an + * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. 
+@@ -469,6 +479,20 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs) + cpu_has_rdrand && !is_forced_cpu_cap(X86_FEATURE_RDRAND) ) + __clear_bit(X86_FEATURE_RDRAND, fs); + ++ /* ++ * These bits indicate that the VERW instruction may have gained ++ * scrubbing side effects. The max policy has them set for migration ++ * reasons, so reset the default policy back to the host values in ++ * case we're unaffected. ++ */ ++ __clear_bit(X86_FEATURE_MD_CLEAR, fs); ++ if ( cpu_has_md_clear ) ++ __set_bit(X86_FEATURE_MD_CLEAR, fs); ++ ++ __clear_bit(X86_FEATURE_FB_CLEAR, fs); ++ if ( cpu_has_fb_clear ) ++ __set_bit(X86_FEATURE_FB_CLEAR, fs); ++ + /* + * The Gather Data Sampling microcode mitigation (August 2023) has an + * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. +diff --git a/xen/arch/x86/include/asm/cpufeature.h b/xen/arch/x86/include/asm/cpufeature.h +index 9ef7756593..ec824e8954 100644 +--- a/xen/arch/x86/include/asm/cpufeature.h ++++ b/xen/arch/x86/include/asm/cpufeature.h +@@ -136,6 +136,7 @@ + #define cpu_has_avx512_4fmaps boot_cpu_has(X86_FEATURE_AVX512_4FMAPS) + #define cpu_has_avx512_vp2intersect boot_cpu_has(X86_FEATURE_AVX512_VP2INTERSECT) + #define cpu_has_srbds_ctrl boot_cpu_has(X86_FEATURE_SRBDS_CTRL) ++#define cpu_has_md_clear boot_cpu_has(X86_FEATURE_MD_CLEAR) + #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) + #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) + #define cpu_has_serialize boot_cpu_has(X86_FEATURE_SERIALIZE) +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index 94d211df2f..aec1407613 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -260,7 +260,7 @@ XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single + XEN_CPUFEATURE(FSRM, 9*32+ 4) /*A Fast Short REP MOVS */ + XEN_CPUFEATURE(AVX512_VP2INTERSECT, 9*32+8) /*a VP2INTERSECT{D,Q} insns */ + XEN_CPUFEATURE(SRBDS_CTRL, 9*32+ 9) /* MSR_MCU_OPT_CTRL and RNGDS_MITG_DIS. */ +-XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*A VERW clears microarchitectural buffers */ ++XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*!A VERW clears microarchitectural buffers */ + XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. */ + XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */ + XEN_CPUFEATURE(SERIALIZE, 9*32+14) /*A SERIALIZE insn */ +@@ -321,7 +321,7 @@ XEN_CPUFEATURE(DOITM, 16*32+12) /* Data Operand Invariant Timing + XEN_CPUFEATURE(SBDR_SSDP_NO, 16*32+13) /*A No Shared Buffer Data Read or Sideband Stale Data Propagation */ + XEN_CPUFEATURE(FBSDP_NO, 16*32+14) /*A No Fill Buffer Stale Data Propagation */ + XEN_CPUFEATURE(PSDP_NO, 16*32+15) /*A No Primary Stale Data Propagation */ +-XEN_CPUFEATURE(FB_CLEAR, 16*32+17) /*A Fill Buffers cleared by VERW */ ++XEN_CPUFEATURE(FB_CLEAR, 16*32+17) /*!A Fill Buffers cleared by VERW */ + XEN_CPUFEATURE(FB_CLEAR_CTRL, 16*32+18) /* MSR_OPT_CPU_CTRL.FB_CLEAR_DIS */ + XEN_CPUFEATURE(RRSBA, 16*32+19) /*! 
Restricted RSB Alternative */ + XEN_CPUFEATURE(BHI_NO, 16*32+20) /*A No Branch History Injection */ +-- +2.44.0 + diff --git a/0037-hvmloader-PCI-skip-huge-BARs-in-certain-calculations.patch b/0037-hvmloader-PCI-skip-huge-BARs-in-certain-calculations.patch new file mode 100644 index 0000000..a46f913 --- /dev/null +++ b/0037-hvmloader-PCI-skip-huge-BARs-in-certain-calculations.patch @@ -0,0 +1,99 @@ +From 1e9808227c10717228969e924cab49cad4af6265 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Tue, 12 Mar 2024 12:08:48 +0100 +Subject: [PATCH 37/67] hvmloader/PCI: skip huge BARs in certain calculations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +BARs of size 2Gb and up can't possibly fit below 4Gb: Both the bottom of +the lower 2Gb range and the top of the higher 2Gb range have special +purpose. Don't even have them influence whether to (perhaps) relocate +low RAM. + +Reported-by: Neowutran +Signed-off-by: Jan Beulich +Acked-by: Roger Pau Monné +master commit: 57acad12a09ffa490e870ebe17596aad858f0191 +master date: 2024-03-06 10:19:29 +0100 +--- + tools/firmware/hvmloader/pci.c | 28 ++++++++++++++++++++-------- + 1 file changed, 20 insertions(+), 8 deletions(-) + +diff --git a/tools/firmware/hvmloader/pci.c b/tools/firmware/hvmloader/pci.c +index 257a6feb61..c3c61ca060 100644 +--- a/tools/firmware/hvmloader/pci.c ++++ b/tools/firmware/hvmloader/pci.c +@@ -33,6 +33,13 @@ uint32_t pci_mem_start = HVM_BELOW_4G_MMIO_START; + const uint32_t pci_mem_end = RESERVED_MEMBASE; + uint64_t pci_hi_mem_start = 0, pci_hi_mem_end = 0; + ++/* ++ * BARs larger than this value are put in 64-bit space unconditionally. That ++ * is, such BARs also don't play into the determination of how big the lowmem ++ * MMIO hole needs to be. ++ */ ++#define BAR_RELOC_THRESH GB(1) ++ + enum virtual_vga virtual_vga = VGA_none; + unsigned long igd_opregion_pgbase = 0; + +@@ -286,9 +293,11 @@ void pci_setup(void) + bars[i].bar_reg = bar_reg; + bars[i].bar_sz = bar_sz; + +- if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) == +- PCI_BASE_ADDRESS_SPACE_MEMORY) || +- (bar_reg == PCI_ROM_ADDRESS) ) ++ if ( is_64bar && bar_sz > BAR_RELOC_THRESH ) ++ bar64_relocate = 1; ++ else if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) == ++ PCI_BASE_ADDRESS_SPACE_MEMORY) || ++ (bar_reg == PCI_ROM_ADDRESS) ) + mmio_total += bar_sz; + + nr_bars++; +@@ -367,7 +376,7 @@ void pci_setup(void) + pci_mem_start = hvm_info->low_mem_pgend << PAGE_SHIFT; + } + +- if ( mmio_total > (pci_mem_end - pci_mem_start) ) ++ if ( mmio_total > (pci_mem_end - pci_mem_start) || bar64_relocate ) + { + printf("Low MMIO hole not large enough for all devices," + " relocating some BARs to 64-bit\n"); +@@ -430,7 +439,8 @@ void pci_setup(void) + + /* + * Relocate to high memory if the total amount of MMIO needed +- * is more than the low MMIO available. Because devices are ++ * is more than the low MMIO available or BARs bigger than ++ * BAR_RELOC_THRESH are present. Because devices are + * processed in order of bar_sz, this will preferentially + * relocate larger devices to high memory first. + * +@@ -446,8 +456,9 @@ void pci_setup(void) + * the code here assumes it to be.) + * Should either of those two conditions change, this code will break. 
+ */ +- using_64bar = bars[i].is_64bar && bar64_relocate +- && (mmio_total > (mem_resource.max - mem_resource.base)); ++ using_64bar = bars[i].is_64bar && bar64_relocate && ++ (mmio_total > (mem_resource.max - mem_resource.base) || ++ bar_sz > BAR_RELOC_THRESH); + bar_data = pci_readl(devfn, bar_reg); + + if ( (bar_data & PCI_BASE_ADDRESS_SPACE) == +@@ -467,7 +478,8 @@ void pci_setup(void) + resource = &mem_resource; + bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK; + } +- mmio_total -= bar_sz; ++ if ( bar_sz <= BAR_RELOC_THRESH ) ++ mmio_total -= bar_sz; + } + else + { +-- +2.44.0 + diff --git a/0038-x86-mm-fix-detection-of-last-L1-entry-in-modify_xen_.patch b/0038-x86-mm-fix-detection-of-last-L1-entry-in-modify_xen_.patch new file mode 100644 index 0000000..66b4db3 --- /dev/null +++ b/0038-x86-mm-fix-detection-of-last-L1-entry-in-modify_xen_.patch @@ -0,0 +1,41 @@ +From 1f94117bec55a7b934fed3dfd3529db624eb441f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 12 Mar 2024 12:08:59 +0100 +Subject: [PATCH 38/67] x86/mm: fix detection of last L1 entry in + modify_xen_mappings_lite() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The current logic to detect when to switch to the next L1 table is incorrectly +using l2_table_offset() in order to notice when the last entry on the current +L1 table has been reached. + +It should instead use l1_table_offset() to check whether the index has wrapped +to point to the first entry, and so the next L1 table should be used. + +Fixes: 8676092a0f16 ('x86/livepatch: Fix livepatch application when CET is active') +Signed-off-by: Roger Pau Monné +Reviewed-by: Andrew Cooper +master commit: 7c81558208de7858251b62f168a449be84305595 +master date: 2024-03-11 11:09:42 +0000 +--- + xen/arch/x86/mm.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index e884a6fdbd..330c4abcd1 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -5963,7 +5963,7 @@ void init_or_livepatch modify_xen_mappings_lite( + + v += 1UL << L1_PAGETABLE_SHIFT; + +- if ( l2_table_offset(v) == 0 ) ++ if ( l1_table_offset(v) == 0 ) + break; + } + +-- +2.44.0 + diff --git a/0039-x86-entry-Introduce-EFRAME_-constants.patch b/0039-x86-entry-Introduce-EFRAME_-constants.patch new file mode 100644 index 0000000..c280286 --- /dev/null +++ b/0039-x86-entry-Introduce-EFRAME_-constants.patch @@ -0,0 +1,314 @@ +From e691f99f17198906f813b85dcabafe5addb9a57a Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Sat, 27 Jan 2024 17:52:09 +0000 +Subject: [PATCH 39/67] x86/entry: Introduce EFRAME_* constants + +restore_all_guest() does a lot of manipulation of the stack after popping the +GPRs, and uses raw %rsp displacements to do so. Also, almost all entrypaths +use raw %rsp displacements prior to pushing GPRs. + +Provide better mnemonics, to aid readability and reduce the chance of errors +when editing. + +No functional change. The resulting binary is identical. 
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit 37541208f119a9c552c6c6c3246ea61be0d44035) +--- + xen/arch/x86/x86_64/asm-offsets.c | 17 ++++++++ + xen/arch/x86/x86_64/compat/entry.S | 2 +- + xen/arch/x86/x86_64/entry.S | 70 +++++++++++++++--------------- + 3 files changed, 53 insertions(+), 36 deletions(-) + +diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c +index 287dac101a..31fa63b77f 100644 +--- a/xen/arch/x86/x86_64/asm-offsets.c ++++ b/xen/arch/x86/x86_64/asm-offsets.c +@@ -51,6 +51,23 @@ void __dummy__(void) + OFFSET(UREGS_kernel_sizeof, struct cpu_user_regs, es); + BLANK(); + ++ /* ++ * EFRAME_* is for the entry/exit logic where %rsp is pointing at ++ * UREGS_error_code and GPRs are still/already guest values. ++ */ ++#define OFFSET_EF(sym, mem) \ ++ DEFINE(sym, offsetof(struct cpu_user_regs, mem) - \ ++ offsetof(struct cpu_user_regs, error_code)) ++ ++ OFFSET_EF(EFRAME_entry_vector, entry_vector); ++ OFFSET_EF(EFRAME_rip, rip); ++ OFFSET_EF(EFRAME_cs, cs); ++ OFFSET_EF(EFRAME_eflags, eflags); ++ OFFSET_EF(EFRAME_rsp, rsp); ++ BLANK(); ++ ++#undef OFFSET_EF ++ + OFFSET(VCPU_processor, struct vcpu, processor); + OFFSET(VCPU_domain, struct vcpu, domain); + OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info); +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index 253bb1688c..7c211314d8 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -15,7 +15,7 @@ ENTRY(entry_int82) + ENDBR64 + ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP + pushq $0 +- movl $HYPERCALL_VECTOR, 4(%rsp) ++ movl $HYPERCALL_VECTOR, EFRAME_entry_vector(%rsp) + SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. */ + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index 585b0c9551..412cbeb3ec 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -190,15 +190,15 @@ restore_all_guest: + SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ + + RESTORE_ALL +- testw $TRAP_syscall,4(%rsp) ++ testw $TRAP_syscall, EFRAME_entry_vector(%rsp) + jz iret_exit_to_guest + +- movq 24(%rsp),%r11 # RFLAGS ++ mov EFRAME_eflags(%rsp), %r11 + andq $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), %r11 + orq $X86_EFLAGS_IF,%r11 + + /* Don't use SYSRET path if the return address is not canonical. */ +- movq 8(%rsp),%rcx ++ mov EFRAME_rip(%rsp), %rcx + sarq $47,%rcx + incl %ecx + cmpl $1,%ecx +@@ -213,20 +213,20 @@ restore_all_guest: + ALTERNATIVE "", rag_clrssbsy, X86_FEATURE_XEN_SHSTK + #endif + +- movq 8(%rsp), %rcx # RIP +- cmpw $FLAT_USER_CS32,16(%rsp)# CS +- movq 32(%rsp),%rsp # RSP ++ mov EFRAME_rip(%rsp), %rcx ++ cmpw $FLAT_USER_CS32, EFRAME_cs(%rsp) ++ mov EFRAME_rsp(%rsp), %rsp + je 1f + sysretq + 1: sysretl + + ALIGN + .Lrestore_rcx_iret_exit_to_guest: +- movq 8(%rsp), %rcx # RIP ++ mov EFRAME_rip(%rsp), %rcx + /* No special register assumptions. 
*/ + iret_exit_to_guest: +- andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), 24(%rsp) +- orl $X86_EFLAGS_IF,24(%rsp) ++ andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp) ++ orl $X86_EFLAGS_IF, EFRAME_eflags(%rsp) + addq $8,%rsp + .Lft0: iretq + _ASM_PRE_EXTABLE(.Lft0, handle_exception) +@@ -257,7 +257,7 @@ ENTRY(lstar_enter) + pushq $FLAT_KERNEL_CS64 + pushq %rcx + pushq $0 +- movl $TRAP_syscall, 4(%rsp) ++ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) + SAVE_ALL + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +@@ -294,7 +294,7 @@ ENTRY(cstar_enter) + pushq $FLAT_USER_CS32 + pushq %rcx + pushq $0 +- movl $TRAP_syscall, 4(%rsp) ++ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) + SAVE_ALL + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +@@ -335,7 +335,7 @@ GLOBAL(sysenter_eflags_saved) + pushq $3 /* ring 3 null cs */ + pushq $0 /* null rip */ + pushq $0 +- movl $TRAP_syscall, 4(%rsp) ++ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) + SAVE_ALL + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +@@ -389,7 +389,7 @@ ENTRY(int80_direct_trap) + ENDBR64 + ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP + pushq $0 +- movl $0x80, 4(%rsp) ++ movl $0x80, EFRAME_entry_vector(%rsp) + SAVE_ALL + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +@@ -649,7 +649,7 @@ ret_from_intr: + .section .init.text, "ax", @progbits + ENTRY(early_page_fault) + ENDBR64 +- movl $TRAP_page_fault, 4(%rsp) ++ movl $TRAP_page_fault, EFRAME_entry_vector(%rsp) + SAVE_ALL + movq %rsp, %rdi + call do_early_page_fault +@@ -716,7 +716,7 @@ ENTRY(common_interrupt) + + ENTRY(page_fault) + ENDBR64 +- movl $TRAP_page_fault,4(%rsp) ++ movl $TRAP_page_fault, EFRAME_entry_vector(%rsp) + /* No special register assumptions. 
*/ + GLOBAL(handle_exception) + ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP +@@ -892,90 +892,90 @@ FATAL_exception_with_ints_disabled: + ENTRY(divide_error) + ENDBR64 + pushq $0 +- movl $TRAP_divide_error,4(%rsp) ++ movl $TRAP_divide_error, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(coprocessor_error) + ENDBR64 + pushq $0 +- movl $TRAP_copro_error,4(%rsp) ++ movl $TRAP_copro_error, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(simd_coprocessor_error) + ENDBR64 + pushq $0 +- movl $TRAP_simd_error,4(%rsp) ++ movl $TRAP_simd_error, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(device_not_available) + ENDBR64 + pushq $0 +- movl $TRAP_no_device,4(%rsp) ++ movl $TRAP_no_device, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(debug) + ENDBR64 + pushq $0 +- movl $TRAP_debug,4(%rsp) ++ movl $TRAP_debug, EFRAME_entry_vector(%rsp) + jmp handle_ist_exception + + ENTRY(int3) + ENDBR64 + pushq $0 +- movl $TRAP_int3,4(%rsp) ++ movl $TRAP_int3, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(overflow) + ENDBR64 + pushq $0 +- movl $TRAP_overflow,4(%rsp) ++ movl $TRAP_overflow, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(bounds) + ENDBR64 + pushq $0 +- movl $TRAP_bounds,4(%rsp) ++ movl $TRAP_bounds, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(invalid_op) + ENDBR64 + pushq $0 +- movl $TRAP_invalid_op,4(%rsp) ++ movl $TRAP_invalid_op, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(invalid_TSS) + ENDBR64 +- movl $TRAP_invalid_tss,4(%rsp) ++ movl $TRAP_invalid_tss, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(segment_not_present) + ENDBR64 +- movl $TRAP_no_segment,4(%rsp) ++ movl $TRAP_no_segment, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(stack_segment) + ENDBR64 +- movl $TRAP_stack_error,4(%rsp) ++ movl $TRAP_stack_error, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(general_protection) + ENDBR64 +- movl $TRAP_gp_fault,4(%rsp) ++ movl $TRAP_gp_fault, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(alignment_check) + ENDBR64 +- movl $TRAP_alignment_check,4(%rsp) ++ movl $TRAP_alignment_check, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_CP) + ENDBR64 +- movl $X86_EXC_CP, 4(%rsp) ++ movl $X86_EXC_CP, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(double_fault) + ENDBR64 +- movl $TRAP_double_fault,4(%rsp) ++ movl $TRAP_double_fault, EFRAME_entry_vector(%rsp) + /* Set AC to reduce chance of further SMAP faults */ + ALTERNATIVE "", stac, X86_FEATURE_XEN_SMAP + SAVE_ALL +@@ -1001,7 +1001,7 @@ ENTRY(double_fault) + ENTRY(nmi) + ENDBR64 + pushq $0 +- movl $TRAP_nmi,4(%rsp) ++ movl $TRAP_nmi, EFRAME_entry_vector(%rsp) + handle_ist_exception: + ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP + SAVE_ALL +@@ -1134,7 +1134,7 @@ handle_ist_exception: + ENTRY(machine_check) + ENDBR64 + pushq $0 +- movl $TRAP_machine_check,4(%rsp) ++ movl $TRAP_machine_check, EFRAME_entry_vector(%rsp) + jmp handle_ist_exception + + /* No op trap handler. Required for kexec crash path. */ +@@ -1171,7 +1171,7 @@ autogen_stubs: /* Automatically generated stubs. */ + 1: + ENDBR64 + pushq $0 +- movb $vec,4(%rsp) ++ movb $vec, EFRAME_entry_vector(%rsp) + jmp common_interrupt + + entrypoint 1b +@@ -1185,7 +1185,7 @@ autogen_stubs: /* Automatically generated stubs. */ + test $8,%spl /* 64bit exception frames are 16 byte aligned, but the word */ + jz 2f /* size is 8 bytes. 
Check whether the processor gave us an */ + pushq $0 /* error code, and insert an empty one if not. */ +-2: movb $vec,4(%rsp) ++2: movb $vec, EFRAME_entry_vector(%rsp) + jmp handle_exception + + entrypoint 1b +-- +2.44.0 + diff --git a/0040-x86-Resync-intel-family.h-from-Linux.patch b/0040-x86-Resync-intel-family.h-from-Linux.patch new file mode 100644 index 0000000..84e0304 --- /dev/null +++ b/0040-x86-Resync-intel-family.h-from-Linux.patch @@ -0,0 +1,98 @@ +From abc43cf5a6579f1aa0decf0a2349cdd2d2473117 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 27 Feb 2024 16:07:39 +0000 +Subject: [PATCH 40/67] x86: Resync intel-family.h from Linux + +From v6.8-rc6 + +Signed-off-by: Andrew Cooper +Acked-by: Jan Beulich +(cherry picked from commit 195e75371b13c4f7ecdf7b5c50aed0d02f2d7ce8) +--- + xen/arch/x86/include/asm/intel-family.h | 38 ++++++++++++++++++++++--- + 1 file changed, 34 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/x86/include/asm/intel-family.h b/xen/arch/x86/include/asm/intel-family.h +index ffc49151be..b65e9c46b9 100644 +--- a/xen/arch/x86/include/asm/intel-family.h ++++ b/xen/arch/x86/include/asm/intel-family.h +@@ -26,6 +26,9 @@ + * _G - parts with extra graphics on + * _X - regular server parts + * _D - micro server parts ++ * _N,_P - other mobile parts ++ * _H - premium mobile parts ++ * _S - other client parts + * + * Historical OPTDIFFs: + * +@@ -37,6 +40,9 @@ + * their own names :-( + */ + ++/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ ++#define INTEL_FAM6_ANY X86_MODEL_ANY ++ + #define INTEL_FAM6_CORE_YONAH 0x0E + + #define INTEL_FAM6_CORE2_MEROM 0x0F +@@ -93,8 +99,6 @@ + #define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ + #define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ + +-#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ +- + #define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ + + #define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ +@@ -102,12 +106,31 @@ + + #define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ + ++#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF ++ ++#define INTEL_FAM6_GRANITERAPIDS_X 0xAD ++#define INTEL_FAM6_GRANITERAPIDS_D 0xAE ++ ++/* "Hybrid" Processors (P-Core/E-Core) */ ++ ++#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ ++ + #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ + #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ + +-#define INTEL_FAM6_RAPTORLAKE 0xB7 ++#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */ ++#define INTEL_FAM6_RAPTORLAKE_P 0xBA ++#define INTEL_FAM6_RAPTORLAKE_S 0xBF ++ ++#define INTEL_FAM6_METEORLAKE 0xAC ++#define INTEL_FAM6_METEORLAKE_L 0xAA ++ ++#define INTEL_FAM6_ARROWLAKE_H 0xC5 ++#define INTEL_FAM6_ARROWLAKE 0xC6 ++ ++#define INTEL_FAM6_LUNARLAKE_M 0xBD + +-/* "Small Core" Processors (Atom) */ ++/* "Small Core" Processors (Atom/E-Core) */ + + #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ + #define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ +@@ -134,6 +157,13 @@ + #define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ + #define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ + ++#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */ ++ ++#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ ++#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ ++ ++#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */ ++ + /* Xeon Phi */ + + #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ +-- +2.44.0 + diff --git 
a/0041-x86-vmx-Perform-VERW-flushing-later-in-the-VMExit-pa.patch b/0041-x86-vmx-Perform-VERW-flushing-later-in-the-VMExit-pa.patch new file mode 100644 index 0000000..871f10f --- /dev/null +++ b/0041-x86-vmx-Perform-VERW-flushing-later-in-the-VMExit-pa.patch @@ -0,0 +1,146 @@ +From 77f2bec134049aba29b9b459f955022722d10847 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Fri, 23 Jun 2023 11:32:00 +0100 +Subject: [PATCH 41/67] x86/vmx: Perform VERW flushing later in the VMExit path + +Broken out of the following patch because this change is subtle enough on its +own. See it for the rational of why we're moving VERW. + +As for how, extend the trick already used to hold one condition in +flags (RESUME vs LAUNCH) through the POPing of GPRs. + +Move the MOV CR earlier. Intel specify flags to be undefined across it. + +Encode the two conditions we want using SF and PF. See the code comment for +exactly how. + +Leave a comment to explain the lack of any content around +SPEC_CTRL_EXIT_TO_VMX, but leave the block in place. Sods law says if we +delete it, we'll need to reintroduce it. + +This is part of XSA-452 / CVE-2023-28746. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit 475fa20b7384464210f42bad7195f87bd6f1c63f) +--- + xen/arch/x86/hvm/vmx/entry.S | 36 +++++++++++++++++++++--- + xen/arch/x86/include/asm/asm_defns.h | 8 ++++++ + xen/arch/x86/include/asm/spec_ctrl_asm.h | 7 +++++ + xen/arch/x86/x86_64/asm-offsets.c | 1 + + 4 files changed, 48 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S +index 5f5de45a13..cdde76e138 100644 +--- a/xen/arch/x86/hvm/vmx/entry.S ++++ b/xen/arch/x86/hvm/vmx/entry.S +@@ -87,17 +87,39 @@ UNLIKELY_END(realmode) + + /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ + /* SPEC_CTRL_EXIT_TO_VMX Req: %rsp=regs/cpuinfo Clob: */ +- DO_SPEC_CTRL_COND_VERW ++ /* ++ * All speculation safety work happens to be elsewhere. VERW is after ++ * popping the GPRs, while restoring the guest MSR_SPEC_CTRL is left ++ * to the MSR load list. ++ */ + + mov VCPU_hvm_guest_cr2(%rbx),%rax ++ mov %rax, %cr2 ++ ++ /* ++ * We need to perform two conditional actions (VERW, and Resume vs ++ * Launch) after popping GPRs. With some cunning, we can encode both ++ * of these in eflags together. ++ * ++ * Parity is only calculated over the bottom byte of the answer, while ++ * Sign is simply the top bit. ++ * ++ * Therefore, the final OR instruction ends up producing: ++ * SF = VCPU_vmx_launched ++ * PF = !SCF_verw ++ */ ++ BUILD_BUG_ON(SCF_verw & ~0xff) ++ movzbl VCPU_vmx_launched(%rbx), %ecx ++ shl $31, %ecx ++ movzbl CPUINFO_spec_ctrl_flags(%rsp), %eax ++ and $SCF_verw, %eax ++ or %eax, %ecx + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp +- mov %rax,%cr2 +- cmpb $0,VCPU_vmx_launched(%rbx) + pop %rbx + pop %r11 + pop %r10 +@@ -108,7 +130,13 @@ UNLIKELY_END(realmode) + pop %rdx + pop %rsi + pop %rdi +- je .Lvmx_launch ++ ++ jpe .L_skip_verw ++ /* VERW clobbers ZF, but preserves all others, including SF. 
*/ ++ verw STK_REL(CPUINFO_verw_sel, CPUINFO_error_code)(%rsp) ++.L_skip_verw: ++ ++ jns .Lvmx_launch + + /*.Lvmx_resume:*/ + VMRESUME +diff --git a/xen/arch/x86/include/asm/asm_defns.h b/xen/arch/x86/include/asm/asm_defns.h +index d9431180cf..abc6822b08 100644 +--- a/xen/arch/x86/include/asm/asm_defns.h ++++ b/xen/arch/x86/include/asm/asm_defns.h +@@ -81,6 +81,14 @@ register unsigned long current_stack_pointer asm("rsp"); + + #ifdef __ASSEMBLY__ + ++.macro BUILD_BUG_ON condstr, cond:vararg ++ .if \cond ++ .error "Condition \"\condstr\" not satisfied" ++ .endif ++.endm ++/* preprocessor macro to make error message more user friendly */ ++#define BUILD_BUG_ON(cond) BUILD_BUG_ON #cond, cond ++ + #ifdef HAVE_AS_QUOTED_SYM + #define SUBSECTION_LBL(tag) \ + .ifndef .L.tag; \ +diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h +index f4b8b9d956..ca9cb0f5dd 100644 +--- a/xen/arch/x86/include/asm/spec_ctrl_asm.h ++++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h +@@ -164,6 +164,13 @@ + #endif + .endm + ++/* ++ * Helper to improve the readibility of stack dispacements with %rsp in ++ * unusual positions. Both @field and @top_of_stack should be constants from ++ * the same object. @top_of_stack should be where %rsp is currently pointing. ++ */ ++#define STK_REL(field, top_of_stk) ((field) - (top_of_stk)) ++ + .macro DO_SPEC_CTRL_COND_VERW + /* + * Requires %rsp=cpuinfo +diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c +index 31fa63b77f..a4e94d6930 100644 +--- a/xen/arch/x86/x86_64/asm-offsets.c ++++ b/xen/arch/x86/x86_64/asm-offsets.c +@@ -135,6 +135,7 @@ void __dummy__(void) + #endif + + OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); ++ OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code); + OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel); + OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); + OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset); +-- +2.44.0 + diff --git a/0042-x86-spec-ctrl-Perform-VERW-flushing-later-in-exit-pa.patch b/0042-x86-spec-ctrl-Perform-VERW-flushing-later-in-exit-pa.patch new file mode 100644 index 0000000..ac78acd --- /dev/null +++ b/0042-x86-spec-ctrl-Perform-VERW-flushing-later-in-exit-pa.patch @@ -0,0 +1,209 @@ +From 76af773de5d3e68b7140cc9c5343be6746c9101c Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Sat, 27 Jan 2024 18:20:56 +0000 +Subject: [PATCH 42/67] x86/spec-ctrl: Perform VERW flushing later in exit + paths + +On parts vulnerable to RFDS, VERW's side effects are extended to scrub all +non-architectural entries in various Physical Register Files. To remove all +of Xen's values, the VERW must be after popping the GPRs. + +Rework SPEC_CTRL_COND_VERW to default to an CPUINFO_error_code %rsp position, +but with overrides for other contexts. Identify that it clobbers eflags; this +is particularly relevant for the SYSRET path. + +For the IST exit return to Xen, have the main SPEC_CTRL_EXIT_TO_XEN put a +shadow copy of spec_ctrl_flags, as GPRs can't be used at the point we want to +issue the VERW. + +This is part of XSA-452 / CVE-2023-28746. 
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit 0a666cf2cd99df6faf3eebc81a1fc286e4eca4c7) +--- + xen/arch/x86/include/asm/spec_ctrl_asm.h | 36 ++++++++++++++++-------- + xen/arch/x86/x86_64/asm-offsets.c | 13 +++++++-- + xen/arch/x86/x86_64/compat/entry.S | 6 ++++ + xen/arch/x86/x86_64/entry.S | 21 +++++++++++++- + 4 files changed, 61 insertions(+), 15 deletions(-) + +diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h +index ca9cb0f5dd..97a97b2b82 100644 +--- a/xen/arch/x86/include/asm/spec_ctrl_asm.h ++++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h +@@ -171,16 +171,23 @@ + */ + #define STK_REL(field, top_of_stk) ((field) - (top_of_stk)) + +-.macro DO_SPEC_CTRL_COND_VERW ++.macro SPEC_CTRL_COND_VERW \ ++ scf=STK_REL(CPUINFO_spec_ctrl_flags, CPUINFO_error_code), \ ++ sel=STK_REL(CPUINFO_verw_sel, CPUINFO_error_code) + /* +- * Requires %rsp=cpuinfo ++ * Requires \scf and \sel as %rsp-relative expressions ++ * Clobbers eflags ++ * ++ * VERW needs to run after guest GPRs have been restored, where only %rsp is ++ * good to use. Default to expecting %rsp pointing at CPUINFO_error_code. ++ * Contexts where this is not true must provide an alternative \scf and \sel. + * + * Issue a VERW for its flushing side effect, if indicated. This is a Spectre + * v1 gadget, but the IRET/VMEntry is serialising. + */ +- testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp) ++ testb $SCF_verw, \scf(%rsp) + jz .L\@_verw_skip +- verw CPUINFO_verw_sel(%rsp) ++ verw \sel(%rsp) + .L\@_verw_skip: + .endm + +@@ -298,8 +305,6 @@ + */ + ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV + +- DO_SPEC_CTRL_COND_VERW +- + ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV + .endm + +@@ -379,7 +384,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): + */ + .macro SPEC_CTRL_EXIT_TO_XEN + /* +- * Requires %r12=ist_exit, %r14=stack_end ++ * Requires %r12=ist_exit, %r14=stack_end, %rsp=regs + * Clobbers %rax, %rbx, %rcx, %rdx + */ + movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx +@@ -407,11 +412,18 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): + test %r12, %r12 + jz .L\@_skip_ist_exit + +- /* Logically DO_SPEC_CTRL_COND_VERW but without the %rsp=cpuinfo dependency */ +- testb $SCF_verw, %bl +- jz .L\@_skip_verw +- verw STACK_CPUINFO_FIELD(verw_sel)(%r14) +-.L\@_skip_verw: ++ /* ++ * Stash SCF and verw_sel above eflags in the case of an IST_exit. The ++ * VERW logic needs to run after guest GPRs have been restored; i.e. where ++ * we cannot use %r12 or %r14 for the purposes they have here. ++ * ++ * When the CPU pushed this exception frame, it zero-extended eflags. ++ * Therefore it is safe for the VERW logic to look at the stashed SCF ++ * outside of the ist_exit condition. Also, this stashing won't influence ++ * any other restore_all_guest() paths. ++ */ ++ or $(__HYPERVISOR_DS32 << 16), %ebx ++ mov %ebx, UREGS_eflags + 4(%rsp) /* EFRAME_shadow_scf/sel */ + + ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV + +diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c +index a4e94d6930..4cd5938d7b 100644 +--- a/xen/arch/x86/x86_64/asm-offsets.c ++++ b/xen/arch/x86/x86_64/asm-offsets.c +@@ -55,14 +55,22 @@ void __dummy__(void) + * EFRAME_* is for the entry/exit logic where %rsp is pointing at + * UREGS_error_code and GPRs are still/already guest values. + */ +-#define OFFSET_EF(sym, mem) \ ++#define OFFSET_EF(sym, mem, ...) 
\ + DEFINE(sym, offsetof(struct cpu_user_regs, mem) - \ +- offsetof(struct cpu_user_regs, error_code)) ++ offsetof(struct cpu_user_regs, error_code) __VA_ARGS__) + + OFFSET_EF(EFRAME_entry_vector, entry_vector); + OFFSET_EF(EFRAME_rip, rip); + OFFSET_EF(EFRAME_cs, cs); + OFFSET_EF(EFRAME_eflags, eflags); ++ ++ /* ++ * These aren't real fields. They're spare space, used by the IST ++ * exit-to-xen path. ++ */ ++ OFFSET_EF(EFRAME_shadow_scf, eflags, +4); ++ OFFSET_EF(EFRAME_shadow_sel, eflags, +6); ++ + OFFSET_EF(EFRAME_rsp, rsp); + BLANK(); + +@@ -136,6 +144,7 @@ void __dummy__(void) + + OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); + OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code); ++ OFFSET(CPUINFO_rip, struct cpu_info, guest_cpu_user_regs.rip); + OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel); + OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); + OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset); +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index 7c211314d8..3b2fbcd873 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -161,6 +161,12 @@ ENTRY(compat_restore_all_guest) + SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ + + RESTORE_ALL adj=8 compat=1 ++ ++ /* Account for ev/ec having already been popped off the stack. */ ++ SPEC_CTRL_COND_VERW \ ++ scf=STK_REL(CPUINFO_spec_ctrl_flags, CPUINFO_rip), \ ++ sel=STK_REL(CPUINFO_verw_sel, CPUINFO_rip) ++ + .Lft0: iretq + _ASM_PRE_EXTABLE(.Lft0, handle_exception) + +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index 412cbeb3ec..ef517e2945 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -214,6 +214,9 @@ restore_all_guest: + #endif + + mov EFRAME_rip(%rsp), %rcx ++ ++ SPEC_CTRL_COND_VERW /* Req: %rsp=eframe Clob: efl */ ++ + cmpw $FLAT_USER_CS32, EFRAME_cs(%rsp) + mov EFRAME_rsp(%rsp), %rsp + je 1f +@@ -227,6 +230,9 @@ restore_all_guest: + iret_exit_to_guest: + andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp) + orl $X86_EFLAGS_IF, EFRAME_eflags(%rsp) ++ ++ SPEC_CTRL_COND_VERW /* Req: %rsp=eframe Clob: efl */ ++ + addq $8,%rsp + .Lft0: iretq + _ASM_PRE_EXTABLE(.Lft0, handle_exception) +@@ -679,9 +685,22 @@ UNLIKELY_START(ne, exit_cr3) + UNLIKELY_END(exit_cr3) + + /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ +- SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end, Clob: abcd */ ++ SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end %rsp=regs, Clob: abcd */ + + RESTORE_ALL adj=8 ++ ++ /* ++ * When the CPU pushed this exception frame, it zero-extended eflags. ++ * For an IST exit, SPEC_CTRL_EXIT_TO_XEN stashed shadow copies of ++ * spec_ctrl_flags and ver_sel above eflags, as we can't use any GPRs, ++ * and we're at a random place on the stack, not in a CPUFINFO block. ++ * ++ * Account for ev/ec having already been popped off the stack. 
++ */ ++ SPEC_CTRL_COND_VERW \ ++ scf=STK_REL(EFRAME_shadow_scf, EFRAME_rip), \ ++ sel=STK_REL(EFRAME_shadow_sel, EFRAME_rip) ++ + iretq + + ENTRY(common_interrupt) +-- +2.44.0 + diff --git a/0043-x86-spec-ctrl-Rename-VERW-related-options.patch b/0043-x86-spec-ctrl-Rename-VERW-related-options.patch new file mode 100644 index 0000000..38edc15 --- /dev/null +++ b/0043-x86-spec-ctrl-Rename-VERW-related-options.patch @@ -0,0 +1,248 @@ +From d55d52961d13d4fcd1441fcfca98f690e687b941 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 12 Feb 2024 17:50:43 +0000 +Subject: [PATCH 43/67] x86/spec-ctrl: Rename VERW related options + +VERW is going to be used for a 3rd purpose, and the existing nomenclature +didn't survive the Stale MMIO issues terribly well. + +Rename the command line option from `md-clear=` to `verw=`. This is more +consistent with other options which tend to be named based on what they're +doing, not which feature enumeration they use behind the scenes. Retain +`md-clear=` as a deprecated alias. + +Rename opt_md_clear_{pv,hvm} and opt_fb_clear_mmio to opt_verw_{pv,hvm,mmio}, +which has a side effect of making spec_ctrl_init_domain() rather clearer to +follow. + +No functional change. + +This is part of XSA-452 / CVE-2023-28746. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit f7603ca252e4226739eb3129a5290ee3da3f8ea4) +--- + docs/misc/xen-command-line.pandoc | 15 ++++---- + xen/arch/x86/spec_ctrl.c | 62 ++++++++++++++++--------------- + 2 files changed, 40 insertions(+), 37 deletions(-) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index 2006697226..d909ec94fe 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2324,7 +2324,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). + + ### spec-ctrl (x86) + > `= List of [ , xen=, {pv,hvm}=, +-> {msr-sc,rsb,md-clear,ibpb-entry}=|{pv,hvm}=, ++> {msr-sc,rsb,verw,ibpb-entry}=|{pv,hvm}=, + > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, + > eager-fpu,l1d-flush,branch-harden,srb-lock, + > unpriv-mmio,gds-mit,div-scrub}= ]` +@@ -2349,7 +2349,7 @@ in place for guests to use. + + Use of a positive boolean value for either of these options is invalid. + +-The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `md-clear=` and `ibpb-entry=` options ++The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `verw=` and `ibpb-entry=` options + offer fine grained control over the primitives by Xen. These impact Xen's + ability to protect itself, and/or Xen's ability to virtualise support for + guests to use. +@@ -2366,11 +2366,12 @@ guests to use. + guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc. + * `rsb=` offers control over whether to overwrite the Return Stack Buffer / + Return Address Stack on entry to Xen and on idle. +-* `md-clear=` offers control over whether to use VERW to flush +- microarchitectural buffers on idle and exit from Xen. *Note: For +- compatibility with development versions of this fix, `mds=` is also accepted +- on Xen 4.12 and earlier as an alias. Consult vendor documentation in +- preference to here.* ++* `verw=` offers control over whether to use VERW for its scrubbing side ++ effects at appropriate privilege transitions. The exact side effects are ++ microarchitecture and microcode specific. *Note: `md-clear=` is accepted as ++ a deprecated alias. For compatibility with development versions of XSA-297, ++ `mds=` is also accepted on Xen 4.12 and earlier as an alias. 
Consult vendor ++ documentation in preference to here.* + * `ibpb-entry=` offers control over whether IBPB (Indirect Branch Prediction + Barrier) is used on entry to Xen. This is used by default on hardware + vulnerable to Branch Type Confusion, and hardware vulnerable to Speculative +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 25a18ac598..e12ec9930c 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -37,8 +37,8 @@ static bool __initdata opt_msr_sc_pv = true; + static bool __initdata opt_msr_sc_hvm = true; + static int8_t __initdata opt_rsb_pv = -1; + static bool __initdata opt_rsb_hvm = true; +-static int8_t __ro_after_init opt_md_clear_pv = -1; +-static int8_t __ro_after_init opt_md_clear_hvm = -1; ++static int8_t __ro_after_init opt_verw_pv = -1; ++static int8_t __ro_after_init opt_verw_hvm = -1; + + static int8_t __ro_after_init opt_ibpb_entry_pv = -1; + static int8_t __ro_after_init opt_ibpb_entry_hvm = -1; +@@ -78,7 +78,7 @@ static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. + + static int8_t __initdata opt_srb_lock = -1; + static bool __initdata opt_unpriv_mmio; +-static bool __ro_after_init opt_fb_clear_mmio; ++static bool __ro_after_init opt_verw_mmio; + static int8_t __initdata opt_gds_mit = -1; + static int8_t __initdata opt_div_scrub = -1; + +@@ -120,8 +120,8 @@ static int __init cf_check parse_spec_ctrl(const char *s) + disable_common: + opt_rsb_pv = false; + opt_rsb_hvm = false; +- opt_md_clear_pv = 0; +- opt_md_clear_hvm = 0; ++ opt_verw_pv = 0; ++ opt_verw_hvm = 0; + opt_ibpb_entry_pv = 0; + opt_ibpb_entry_hvm = 0; + opt_ibpb_entry_dom0 = false; +@@ -152,14 +152,14 @@ static int __init cf_check parse_spec_ctrl(const char *s) + { + opt_msr_sc_pv = val; + opt_rsb_pv = val; +- opt_md_clear_pv = val; ++ opt_verw_pv = val; + opt_ibpb_entry_pv = val; + } + else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) + { + opt_msr_sc_hvm = val; + opt_rsb_hvm = val; +- opt_md_clear_hvm = val; ++ opt_verw_hvm = val; + opt_ibpb_entry_hvm = val; + } + else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 ) +@@ -204,21 +204,22 @@ static int __init cf_check parse_spec_ctrl(const char *s) + break; + } + } +- else if ( (val = parse_boolean("md-clear", s, ss)) != -1 ) ++ else if ( (val = parse_boolean("verw", s, ss)) != -1 || ++ (val = parse_boolean("md-clear", s, ss)) != -1 ) + { + switch ( val ) + { + case 0: + case 1: +- opt_md_clear_pv = opt_md_clear_hvm = val; ++ opt_verw_pv = opt_verw_hvm = val; + break; + + case -2: +- s += strlen("md-clear="); ++ s += (*s == 'v') ? strlen("verw=") : strlen("md-clear="); + if ( (val = parse_boolean("pv", s, ss)) >= 0 ) +- opt_md_clear_pv = val; ++ opt_verw_pv = val; + else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) +- opt_md_clear_hvm = val; ++ opt_verw_hvm = val; + else + default: + rc = -EINVAL; +@@ -540,8 +541,8 @@ static void __init print_details(enum ind_thunk thunk) + opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-", + opt_ibpb_ctxt_switch ? " IBPB-ctxt" : "", + opt_l1d_flush ? " L1D_FLUSH" : "", +- opt_md_clear_pv || opt_md_clear_hvm || +- opt_fb_clear_mmio ? " VERW" : "", ++ opt_verw_pv || opt_verw_hvm || ++ opt_verw_mmio ? " VERW" : "", + opt_div_scrub ? " DIV" : "", + opt_branch_harden ? " BRANCH_HARDEN" : ""); + +@@ -562,13 +563,13 @@ static void __init print_details(enum ind_thunk thunk) + boot_cpu_has(X86_FEATURE_SC_RSB_HVM) || + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) || + amd_virt_spec_ctrl || +- opt_eager_fpu || opt_md_clear_hvm) ? 
"" : " None", ++ opt_eager_fpu || opt_verw_hvm) ? "" : " None", + boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "", + (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || + amd_virt_spec_ctrl) ? " MSR_VIRT_SPEC_CTRL" : "", + boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "", + opt_eager_fpu ? " EAGER_FPU" : "", +- opt_md_clear_hvm ? " MD_CLEAR" : "", ++ opt_verw_hvm ? " VERW" : "", + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? " IBPB-entry" : ""); + + #endif +@@ -577,11 +578,11 @@ static void __init print_details(enum ind_thunk thunk) + (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || + boot_cpu_has(X86_FEATURE_SC_RSB_PV) || + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) || +- opt_eager_fpu || opt_md_clear_pv) ? "" : " None", ++ opt_eager_fpu || opt_verw_pv) ? "" : " None", + boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "", + boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "", + opt_eager_fpu ? " EAGER_FPU" : "", +- opt_md_clear_pv ? " MD_CLEAR" : "", ++ opt_verw_pv ? " VERW" : "", + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? " IBPB-entry" : ""); + + printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n", +@@ -1514,8 +1515,8 @@ void spec_ctrl_init_domain(struct domain *d) + { + bool pv = is_pv_domain(d); + +- bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) || +- (opt_fb_clear_mmio && is_iommu_enabled(d))); ++ bool verw = ((pv ? opt_verw_pv : opt_verw_hvm) || ++ (opt_verw_mmio && is_iommu_enabled(d))); + + bool ibpb = ((pv ? opt_ibpb_entry_pv : opt_ibpb_entry_hvm) && + (d->domain_id != 0 || opt_ibpb_entry_dom0)); +@@ -1878,19 +1879,20 @@ void __init init_speculation_mitigations(void) + * the return-to-guest path. + */ + if ( opt_unpriv_mmio ) +- opt_fb_clear_mmio = cpu_has_fb_clear; ++ opt_verw_mmio = cpu_has_fb_clear; + + /* + * By default, enable PV and HVM mitigations on MDS-vulnerable hardware. + * This will only be a token effort for MLPDS/MFBDS when HT is enabled, + * but it is somewhat better than nothing. + */ +- if ( opt_md_clear_pv == -1 ) +- opt_md_clear_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && +- boot_cpu_has(X86_FEATURE_MD_CLEAR)); +- if ( opt_md_clear_hvm == -1 ) +- opt_md_clear_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && +- boot_cpu_has(X86_FEATURE_MD_CLEAR)); ++ if ( opt_verw_pv == -1 ) ++ opt_verw_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && ++ cpu_has_md_clear); ++ ++ if ( opt_verw_hvm == -1 ) ++ opt_verw_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && ++ cpu_has_md_clear); + + /* + * Enable MDS/MMIO defences as applicable. The Idle blocks need using if +@@ -1903,12 +1905,12 @@ void __init init_speculation_mitigations(void) + * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) + * + * After calculating the appropriate idle setting, simplify +- * opt_md_clear_hvm to mean just "should we VERW on the way into HVM ++ * opt_verw_hvm to mean just "should we VERW on the way into HVM + * guests", so spec_ctrl_init_domain() can calculate suitable settings. 
+ */ +- if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio ) ++ if ( opt_verw_pv || opt_verw_hvm || opt_verw_mmio ) + setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); +- opt_md_clear_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; ++ opt_verw_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; + + /* + * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT +-- +2.44.0 + diff --git a/0044-x86-spec-ctrl-VERW-handling-adjustments.patch b/0044-x86-spec-ctrl-VERW-handling-adjustments.patch new file mode 100644 index 0000000..e2458c9 --- /dev/null +++ b/0044-x86-spec-ctrl-VERW-handling-adjustments.patch @@ -0,0 +1,171 @@ +From 6663430b442fdf9698bd8e03f701a4547309ad71 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 5 Mar 2024 19:33:37 +0000 +Subject: [PATCH 44/67] x86/spec-ctrl: VERW-handling adjustments + +... before we add yet more complexity to this logic. Mostly expanded +comments, but with three minor changes. + +1) Introduce cpu_has_useful_md_clear to simplify later logic in this patch and + future ones. + +2) We only ever need SC_VERW_IDLE when SMT is active. If SMT isn't active, + then there's no re-partition of pipeline resources based on thread-idleness + to worry about. + +3) The logic to adjust HVM VERW based on L1D_FLUSH is unmaintainable and, as + it turns out, wrong. SKIP_L1DFL is just a hint bit, whereas opt_l1d_flush + is the relevant decision of whether to use L1D_FLUSH based on + susceptibility and user preference. + + Rewrite the logic so it can be followed, and incorporate the fact that when + FB_CLEAR is visible, L1D_FLUSH isn't a safe substitution. + +This is part of XSA-452 / CVE-2023-28746. + +Signed-off-by: Andrew Cooper +Acked-by: Jan Beulich +(cherry picked from commit 1eb91a8a06230b4b64228c9a380194f8cfe6c5e2) +--- + xen/arch/x86/spec_ctrl.c | 99 +++++++++++++++++++++++++++++----------- + 1 file changed, 73 insertions(+), 26 deletions(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index e12ec9930c..adb6bc74e8 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -1531,7 +1531,7 @@ void __init init_speculation_mitigations(void) + { + enum ind_thunk thunk = THUNK_DEFAULT; + bool has_spec_ctrl, ibrs = false, hw_smt_enabled; +- bool cpu_has_bug_taa, retpoline_safe; ++ bool cpu_has_bug_taa, cpu_has_useful_md_clear, retpoline_safe; + + hw_smt_enabled = check_smt_enabled(); + +@@ -1867,50 +1867,97 @@ void __init init_speculation_mitigations(void) + "enabled. Please assess your configuration and choose an\n" + "explicit 'smt=' setting. See XSA-273.\n"); + ++ /* ++ * A brief summary of VERW-related changes. ++ * ++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html ++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html ++ * ++ * Relevant ucodes: ++ * ++ * - May 2019, for MDS. Introduces the MD_CLEAR CPUID bit and VERW side ++ * effects to scrub Store/Load/Fill buffers as applicable. MD_CLEAR ++ * exists architecturally, even when the side effects have been removed. ++ * ++ * Use VERW to scrub on return-to-guest. Parts with L1D_FLUSH to ++ * mitigate L1TF have the same side effect, so no need to do both. ++ * ++ * Various Atoms suffer from Store-buffer sampling only. 
Store buffers ++ * are statically partitioned between non-idle threads, so scrubbing is ++ * wanted when going idle too. ++ * ++ * Load ports and Fill buffers are competitively shared between threads. ++ * SMT must be disabled for VERW scrubbing to be fully effective. ++ * ++ * - November 2019, for TAA. Extended VERW side effects to TSX-enabled ++ * MDS_NO parts. ++ * ++ * - February 2022, for Client TSX de-feature. Removed VERW side effects ++ * from Client CPUs only. ++ * ++ * - May 2022, for MMIO Stale Data. (Re)introduced Fill Buffer scrubbing ++ * on all MMIO-affected parts which didn't already have it for MDS ++ * reasons, enumerating FB_CLEAR on those parts only. ++ * ++ * If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing ++ * side effects as VERW and cannot be used in its place. ++ */ + mds_calculations(); + + /* +- * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have +- * reintroduced the VERW fill buffer flushing side effect because of a +- * susceptibility to FBSDP. ++ * Parts which enumerate FB_CLEAR are those with now-updated microcode ++ * which weren't susceptible to the original MFBDS (and therefore didn't ++ * have Fill Buffer scrubbing side effects to begin with, or were Client ++ * MDS_NO non-TAA_NO parts where the scrubbing was removed), but have had ++ * the scrubbing reintroduced because of a susceptibility to FBSDP. + * + * If unprivileged guests have (or will have) MMIO mappings, we can + * mitigate cross-domain leakage of fill buffer data by issuing VERW on +- * the return-to-guest path. ++ * the return-to-guest path. This is only a token effort if SMT is ++ * active. + */ + if ( opt_unpriv_mmio ) + opt_verw_mmio = cpu_has_fb_clear; + + /* +- * By default, enable PV and HVM mitigations on MDS-vulnerable hardware. +- * This will only be a token effort for MLPDS/MFBDS when HT is enabled, +- * but it is somewhat better than nothing. ++ * MD_CLEAR is enumerated architecturally forevermore, even after the ++ * scrubbing side effects have been removed. Create ourselves an version ++ * which expressed whether we think MD_CLEAR is having any useful side ++ * effect. ++ */ ++ cpu_has_useful_md_clear = (cpu_has_md_clear && ++ (cpu_has_bug_mds || cpu_has_bug_msbds_only)); ++ ++ /* ++ * By default, use VERW scrubbing on applicable hardware, if we think it's ++ * going to have an effect. This will only be a token effort for ++ * MLPDS/MFBDS when SMT is enabled. + */ + if ( opt_verw_pv == -1 ) +- opt_verw_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && +- cpu_has_md_clear); ++ opt_verw_pv = cpu_has_useful_md_clear; + + if ( opt_verw_hvm == -1 ) +- opt_verw_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && +- cpu_has_md_clear); ++ opt_verw_hvm = cpu_has_useful_md_clear; + + /* +- * Enable MDS/MMIO defences as applicable. The Idle blocks need using if +- * either the PV or HVM MDS defences are used, or if we may give MMIO +- * access to untrusted guests. +- * +- * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with +- * equivalent semantics to avoid needing to perform both flushes on the +- * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH (for +- * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) +- * +- * After calculating the appropriate idle setting, simplify +- * opt_verw_hvm to mean just "should we VERW on the way into HVM +- * guests", so spec_ctrl_init_domain() can calculate suitable settings. 
++ * If SMT is active, and we're protecting against MDS or MMIO stale data, ++ * we need to scrub before going idle as well as on return to guest. ++ * Various pipeline resources are repartitioned amongst non-idle threads. + */ +- if ( opt_verw_pv || opt_verw_hvm || opt_verw_mmio ) ++ if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) || ++ opt_verw_mmio) && hw_smt_enabled ) + setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); +- opt_verw_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; ++ ++ /* ++ * After calculating the appropriate idle setting, simplify opt_verw_hvm ++ * to mean just "should we VERW on the way into HVM guests", so ++ * spec_ctrl_init_domain() can calculate suitable settings. ++ * ++ * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the ++ * only *_CLEAR we can see. ++ */ ++ if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear ) ++ opt_verw_hvm = false; + + /* + * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT +-- +2.44.0 + diff --git a/0045-x86-spec-ctrl-Mitigation-Register-File-Data-Sampling.patch b/0045-x86-spec-ctrl-Mitigation-Register-File-Data-Sampling.patch new file mode 100644 index 0000000..4a10524 --- /dev/null +++ b/0045-x86-spec-ctrl-Mitigation-Register-File-Data-Sampling.patch @@ -0,0 +1,320 @@ +From d85481135d87abbbf1feab18b749288fa08b65f2 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 22 Jun 2023 23:32:19 +0100 +Subject: [PATCH 45/67] x86/spec-ctrl: Mitigation Register File Data Sampling + +RFDS affects Atom cores, also branded E-cores, between the Goldmont and +Gracemont microarchitectures. This includes Alder Lake and Raptor Lake hybrid +client systems which have a mix of Gracemont and other types of cores. + +Two new bits have been defined; RFDS_CLEAR to indicate VERW has more side +effects, and RFDS_NO to indicate that the system is unaffected. Plenty of +unaffected CPUs won't be getting RFDS_NO retrofitted in microcode, so we +synthesise it. Alder Lake and Raptor Lake Xeon-E's are unaffected due to +their platform configuration, and we must use the Hybrid CPUID bit to +distinguish them from their non-Xeon counterparts. + +Like MD_CLEAR and FB_CLEAR, RFDS_CLEAR needs OR-ing across a resource pool, so +set it in the max policies and reflect the host setting in default. + +This is part of XSA-452 / CVE-2023-28746.
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit fb5b6f6744713410c74cfc12b7176c108e3c9a31) +--- + tools/misc/xen-cpuid.c | 5 +- + xen/arch/x86/cpu-policy.c | 5 + + xen/arch/x86/include/asm/cpufeature.h | 3 + + xen/arch/x86/include/asm/msr-index.h | 2 + + xen/arch/x86/spec_ctrl.c | 100 +++++++++++++++++++- + xen/include/public/arch-x86/cpufeatureset.h | 3 + + 6 files changed, 111 insertions(+), 7 deletions(-) + +diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c +index aefc140d66..5ceea8be07 100644 +--- a/tools/misc/xen-cpuid.c ++++ b/tools/misc/xen-cpuid.c +@@ -172,7 +172,7 @@ static const char *const str_7d0[32] = + [ 8] = "avx512-vp2intersect", [ 9] = "srbds-ctrl", + [10] = "md-clear", [11] = "rtm-always-abort", + /* 12 */ [13] = "tsx-force-abort", +- [14] = "serialize", ++ [14] = "serialize", [15] = "hybrid", + [16] = "tsxldtrk", + [18] = "pconfig", + [20] = "cet-ibt", +@@ -237,7 +237,8 @@ static const char *const str_m10Al[32] = + [20] = "bhi-no", [21] = "xapic-status", + /* 22 */ [23] = "ovrclk-status", + [24] = "pbrsb-no", [25] = "gds-ctrl", +- [26] = "gds-no", ++ [26] = "gds-no", [27] = "rfds-no", ++ [28] = "rfds-clear", + }; + + static const char *const str_m10Ah[32] = +diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c +index 7b875a7221..96c2cee1a8 100644 +--- a/xen/arch/x86/cpu-policy.c ++++ b/xen/arch/x86/cpu-policy.c +@@ -444,6 +444,7 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs) + */ + __set_bit(X86_FEATURE_MD_CLEAR, fs); + __set_bit(X86_FEATURE_FB_CLEAR, fs); ++ __set_bit(X86_FEATURE_RFDS_CLEAR, fs); + + /* + * The Gather Data Sampling microcode mitigation (August 2023) has an +@@ -493,6 +494,10 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs) + if ( cpu_has_fb_clear ) + __set_bit(X86_FEATURE_FB_CLEAR, fs); + ++ __clear_bit(X86_FEATURE_RFDS_CLEAR, fs); ++ if ( cpu_has_rfds_clear ) ++ __set_bit(X86_FEATURE_RFDS_CLEAR, fs); ++ + /* + * The Gather Data Sampling microcode mitigation (August 2023) has an + * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. +diff --git a/xen/arch/x86/include/asm/cpufeature.h b/xen/arch/x86/include/asm/cpufeature.h +index ec824e8954..a6b8af1296 100644 +--- a/xen/arch/x86/include/asm/cpufeature.h ++++ b/xen/arch/x86/include/asm/cpufeature.h +@@ -140,6 +140,7 @@ + #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) + #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) + #define cpu_has_serialize boot_cpu_has(X86_FEATURE_SERIALIZE) ++#define cpu_has_hybrid boot_cpu_has(X86_FEATURE_HYBRID) + #define cpu_has_avx512_fp16 boot_cpu_has(X86_FEATURE_AVX512_FP16) + #define cpu_has_arch_caps boot_cpu_has(X86_FEATURE_ARCH_CAPS) + +@@ -161,6 +162,8 @@ + #define cpu_has_rrsba boot_cpu_has(X86_FEATURE_RRSBA) + #define cpu_has_gds_ctrl boot_cpu_has(X86_FEATURE_GDS_CTRL) + #define cpu_has_gds_no boot_cpu_has(X86_FEATURE_GDS_NO) ++#define cpu_has_rfds_no boot_cpu_has(X86_FEATURE_RFDS_NO) ++#define cpu_has_rfds_clear boot_cpu_has(X86_FEATURE_RFDS_CLEAR) + + /* Synthesized. 
*/ + #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) +diff --git a/xen/arch/x86/include/asm/msr-index.h b/xen/arch/x86/include/asm/msr-index.h +index 6abf7bc34a..9b5f67711f 100644 +--- a/xen/arch/x86/include/asm/msr-index.h ++++ b/xen/arch/x86/include/asm/msr-index.h +@@ -88,6 +88,8 @@ + #define ARCH_CAPS_PBRSB_NO (_AC(1, ULL) << 24) + #define ARCH_CAPS_GDS_CTRL (_AC(1, ULL) << 25) + #define ARCH_CAPS_GDS_NO (_AC(1, ULL) << 26) ++#define ARCH_CAPS_RFDS_NO (_AC(1, ULL) << 27) ++#define ARCH_CAPS_RFDS_CLEAR (_AC(1, ULL) << 28) + + #define MSR_FLUSH_CMD 0x0000010b + #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index adb6bc74e8..1ee81e2dfe 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -24,6 +24,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -447,7 +448,7 @@ static void __init print_details(enum ind_thunk thunk) + * Hardware read-only information, stating immunity to certain issues, or + * suggestions of which mitigation to use. + */ +- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", + (caps & ARCH_CAPS_EIBRS) ? " EIBRS" : "", + (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", +@@ -463,6 +464,7 @@ static void __init print_details(enum ind_thunk thunk) + (caps & ARCH_CAPS_FB_CLEAR) ? " FB_CLEAR" : "", + (caps & ARCH_CAPS_PBRSB_NO) ? " PBRSB_NO" : "", + (caps & ARCH_CAPS_GDS_NO) ? " GDS_NO" : "", ++ (caps & ARCH_CAPS_RFDS_NO) ? " RFDS_NO" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", + (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", +@@ -473,7 +475,7 @@ static void __init print_details(enum ind_thunk thunk) + (e21a & cpufeat_mask(X86_FEATURE_SRSO_NO)) ? " SRSO_NO" : ""); + + /* Hardware features which need driving to mitigate issues. */ +- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + (e8b & cpufeat_mask(X86_FEATURE_IBPB)) || + (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBPB" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS)) || +@@ -491,6 +493,7 @@ static void __init print_details(enum ind_thunk thunk) + (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", + (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : "", + (caps & ARCH_CAPS_GDS_CTRL) ? " GDS_CTRL" : "", ++ (caps & ARCH_CAPS_RFDS_CLEAR) ? " RFDS_CLEAR" : "", + (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : ""); + + /* Compiled-in support which pertains to mitigations. */ +@@ -1359,6 +1362,83 @@ static __init void mds_calculations(void) + } + } + ++/* ++ * Register File Data Sampling affects Atom cores from the Goldmont to ++ * Gracemont microarchitectures. The March 2024 microcode adds RFDS_NO to ++ * some but not all unaffected parts, and RFDS_CLEAR to affected parts still ++ * in support. ++ * ++ * Alder Lake and Raptor Lake client CPUs have a mix of P cores ++ * (Golden/Raptor Cove, not vulnerable) and E cores (Gracemont, ++ * vulnerable), and both enumerate RFDS_CLEAR. ++ * ++ * Both exist in a Xeon SKU, which has the E cores (Gracemont) disabled by ++ * platform configuration, and enumerate RFDS_NO. ++ * ++ * With older parts, or with out-of-date microcode, synthesise RFDS_NO when ++ * safe to do so. 
++ * ++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html ++ */ ++static void __init rfds_calculations(void) ++{ ++ /* RFDS is only known to affect Intel Family 6 processors at this time. */ ++ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || ++ boot_cpu_data.x86 != 6 ) ++ return; ++ ++ /* ++ * If RFDS_NO or RFDS_CLEAR are visible, we've either got suitable ++ * microcode, or an RFDS-aware hypervisor is levelling us in a pool. ++ */ ++ if ( cpu_has_rfds_no || cpu_has_rfds_clear ) ++ return; ++ ++ /* If we're virtualised, don't attempt to synthesise RFDS_NO. */ ++ if ( cpu_has_hypervisor ) ++ return; ++ ++ /* ++ * Not all CPUs are expected to get a microcode update enumerating one of ++ * RFDS_{NO,CLEAR}, or we might have out-of-date microcode. ++ */ ++ switch ( boot_cpu_data.x86_model ) ++ { ++ case INTEL_FAM6_ALDERLAKE: ++ case INTEL_FAM6_RAPTORLAKE: ++ /* ++ * Alder Lake and Raptor Lake might be a client SKU (with the ++ * Gracemont cores active, and therefore vulnerable) or might be a ++ * server SKU (with the Gracemont cores disabled, and therefore not ++ * vulnerable). ++ * ++ * See if the CPU identifies as hybrid to distinguish the two cases. ++ */ ++ if ( !cpu_has_hybrid ) ++ break; ++ fallthrough; ++ case INTEL_FAM6_ALDERLAKE_L: ++ case INTEL_FAM6_RAPTORLAKE_P: ++ case INTEL_FAM6_RAPTORLAKE_S: ++ ++ case INTEL_FAM6_ATOM_GOLDMONT: /* Apollo Lake */ ++ case INTEL_FAM6_ATOM_GOLDMONT_D: /* Denverton */ ++ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: /* Gemini Lake */ ++ case INTEL_FAM6_ATOM_TREMONT_D: /* Snow Ridge / Parker Ridge */ ++ case INTEL_FAM6_ATOM_TREMONT: /* Elkhart Lake */ ++ case INTEL_FAM6_ATOM_TREMONT_L: /* Jasper Lake */ ++ case INTEL_FAM6_ATOM_GRACEMONT: /* Alder Lake N */ ++ return; ++ } ++ ++ /* ++ * We appear to be on an unaffected CPU which didn't enumerate RFDS_NO, ++ * perhaps because of it's age or because of out-of-date microcode. ++ * Synthesise it. ++ */ ++ setup_force_cpu_cap(X86_FEATURE_RFDS_NO); ++} ++ + static bool __init cpu_has_gds(void) + { + /* +@@ -1872,6 +1952,7 @@ void __init init_speculation_mitigations(void) + * + * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html + * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html ++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html + * + * Relevant ucodes: + * +@@ -1901,8 +1982,12 @@ void __init init_speculation_mitigations(void) + * + * If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing + * side effects as VERW and cannot be used in its place. ++ * ++ * - March 2023, for RFDS. Enumerate RFDS_CLEAR to mean that VERW now ++ * scrubs non-architectural entries from certain register files. + */ + mds_calculations(); ++ rfds_calculations(); + + /* + * Parts which enumerate FB_CLEAR are those with now-updated microcode +@@ -1934,15 +2019,19 @@ void __init init_speculation_mitigations(void) + * MLPDS/MFBDS when SMT is enabled. 
+ */ + if ( opt_verw_pv == -1 ) +- opt_verw_pv = cpu_has_useful_md_clear; ++ opt_verw_pv = cpu_has_useful_md_clear || cpu_has_rfds_clear; + + if ( opt_verw_hvm == -1 ) +- opt_verw_hvm = cpu_has_useful_md_clear; ++ opt_verw_hvm = cpu_has_useful_md_clear || cpu_has_rfds_clear; + + /* + * If SMT is active, and we're protecting against MDS or MMIO stale data, + * we need to scrub before going idle as well as on return to guest. + * Various pipeline resources are repartitioned amongst non-idle threads. ++ * ++ * We don't need to scrub on idle for RFDS. There are no affected cores ++ * which support SMT, despite there being affected cores in hybrid systems ++ * which have SMT elsewhere in the platform. + */ + if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) || + opt_verw_mmio) && hw_smt_enabled ) +@@ -1956,7 +2045,8 @@ void __init init_speculation_mitigations(void) + * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the + * only *_CLEAR we can see. + */ +- if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear ) ++ if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear && ++ !cpu_has_rfds_clear ) + opt_verw_hvm = false; + + /* +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index aec1407613..113e6cadc1 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -264,6 +264,7 @@ XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*!A VERW clears microarchitectural buffe + XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. */ + XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */ + XEN_CPUFEATURE(SERIALIZE, 9*32+14) /*A SERIALIZE insn */ ++XEN_CPUFEATURE(HYBRID, 9*32+15) /* Heterogeneous platform */ + XEN_CPUFEATURE(TSXLDTRK, 9*32+16) /*a TSX load tracking suspend/resume insns */ + XEN_CPUFEATURE(CET_IBT, 9*32+20) /* CET - Indirect Branch Tracking */ + XEN_CPUFEATURE(AVX512_FP16, 9*32+23) /* AVX512 FP16 instructions */ +@@ -330,6 +331,8 @@ XEN_CPUFEATURE(OVRCLK_STATUS, 16*32+23) /* MSR_OVERCLOCKING_STATUS */ + XEN_CPUFEATURE(PBRSB_NO, 16*32+24) /*A No Post-Barrier RSB predictions */ + XEN_CPUFEATURE(GDS_CTRL, 16*32+25) /* MCU_OPT_CTRL.GDS_MIT_{DIS,LOCK} */ + XEN_CPUFEATURE(GDS_NO, 16*32+26) /*A No Gather Data Sampling */ ++XEN_CPUFEATURE(RFDS_NO, 16*32+27) /*A No Register File Data Sampling */ ++XEN_CPUFEATURE(RFDS_CLEAR, 16*32+28) /*!A Register File(s) cleared by VERW */ + + /* Intel-defined CPU features, MSR_ARCH_CAPS 0x10a.edx, word 17 */ + +-- +2.44.0 + diff --git a/0046-x86-paging-Delete-update_cr3-s-do_locking-parameter.patch b/0046-x86-paging-Delete-update_cr3-s-do_locking-parameter.patch new file mode 100644 index 0000000..ce397a1 --- /dev/null +++ b/0046-x86-paging-Delete-update_cr3-s-do_locking-parameter.patch @@ -0,0 +1,161 @@ +From bf70ce8b3449c49eb828d5b1f4934a49b00fef35 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Wed, 20 Sep 2023 20:06:53 +0100 +Subject: [PATCH 46/67] x86/paging: Delete update_cr3()'s do_locking parameter + +Nicola reports that the XSA-438 fix introduced new MISRA violations because of +some incidental tidying it tried to do. The parameter is useless, so resolve +the MISRA regression by removing it. + +hap_update_cr3() discards the parameter entirely, while sh_update_cr3() uses +it to distinguish internal and external callers and therefore whether the +paging lock should be taken. 
+ +However, we have paging_lock_recursive() for this purpose, which also avoids +the ability for the shadow internal callers to accidentally not hold the lock. + +Fixes: fb0ff49fe9f7 ("x86/shadow: defer releasing of PV's top-level shadow reference") +Reported-by: Nicola Vetrini +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +Release-acked-by: Henry Wang +(cherry picked from commit e71157d1ac2a7fbf413130663cf0a93ff9fbcf7e) +--- + xen/arch/x86/include/asm/paging.h | 5 ++--- + xen/arch/x86/mm/hap/hap.c | 5 ++--- + xen/arch/x86/mm/shadow/common.c | 2 +- + xen/arch/x86/mm/shadow/multi.c | 17 ++++++++--------- + xen/arch/x86/mm/shadow/none.c | 3 +-- + 5 files changed, 14 insertions(+), 18 deletions(-) + +diff --git a/xen/arch/x86/include/asm/paging.h b/xen/arch/x86/include/asm/paging.h +index 94c590f31a..809ff35d9a 100644 +--- a/xen/arch/x86/include/asm/paging.h ++++ b/xen/arch/x86/include/asm/paging.h +@@ -138,8 +138,7 @@ struct paging_mode { + paddr_t ga, uint32_t *pfec, + unsigned int *page_order); + #endif +- pagetable_t (*update_cr3 )(struct vcpu *v, bool do_locking, +- bool noflush); ++ pagetable_t (*update_cr3 )(struct vcpu *v, bool noflush); + void (*update_paging_modes )(struct vcpu *v); + bool (*flush_tlb )(const unsigned long *vcpu_bitmap); + +@@ -312,7 +311,7 @@ static inline unsigned long paging_ga_to_gfn_cr3(struct vcpu *v, + * as the value to load into the host CR3 to schedule this vcpu */ + static inline pagetable_t paging_update_cr3(struct vcpu *v, bool noflush) + { +- return paging_get_hostmode(v)->update_cr3(v, 1, noflush); ++ return paging_get_hostmode(v)->update_cr3(v, noflush); + } + + /* Update all the things that are derived from the guest's CR0/CR3/CR4. +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index 57a19c3d59..3ad39a7dd7 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -739,8 +739,7 @@ static bool cf_check hap_invlpg(struct vcpu *v, unsigned long linear) + return 1; + } + +-static pagetable_t cf_check hap_update_cr3( +- struct vcpu *v, bool do_locking, bool noflush) ++static pagetable_t cf_check hap_update_cr3(struct vcpu *v, bool noflush) + { + v->arch.hvm.hw_cr[3] = v->arch.hvm.guest_cr[3]; + hvm_update_guest_cr3(v, noflush); +@@ -826,7 +825,7 @@ static void cf_check hap_update_paging_modes(struct vcpu *v) + } + + /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */ +- hap_update_cr3(v, 0, false); ++ hap_update_cr3(v, false); + + unlock: + paging_unlock(d); +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index c0940f939e..18714dbd02 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -2579,7 +2579,7 @@ static void sh_update_paging_modes(struct vcpu *v) + } + #endif /* OOS */ + +- v->arch.paging.mode->update_cr3(v, 0, false); ++ v->arch.paging.mode->update_cr3(v, false); + } + + void cf_check shadow_update_paging_modes(struct vcpu *v) +diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c +index c92b354a78..e54a507b54 100644 +--- a/xen/arch/x86/mm/shadow/multi.c ++++ b/xen/arch/x86/mm/shadow/multi.c +@@ -2506,7 +2506,7 @@ static int cf_check sh_page_fault( + * In any case, in the PAE case, the ASSERT is not true; it can + * happen because of actions the guest is taking. 
*/ + #if GUEST_PAGING_LEVELS == 3 +- v->arch.paging.mode->update_cr3(v, 0, false); ++ v->arch.paging.mode->update_cr3(v, false); + #else + ASSERT(d->is_shutting_down); + #endif +@@ -3224,17 +3224,13 @@ static void cf_check sh_detach_old_tables(struct vcpu *v) + } + } + +-static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool do_locking, +- bool noflush) ++static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool noflush) + /* Updates vcpu->arch.cr3 after the guest has changed CR3. + * Paravirtual guests should set v->arch.guest_table (and guest_table_user, + * if appropriate). + * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works; + * this function will call hvm_update_guest_cr(v, 3) to tell them where the + * shadow tables are. +- * If do_locking != 0, assume we are being called from outside the +- * shadow code, and must take and release the paging lock; otherwise +- * that is the caller's responsibility. + */ + { + struct domain *d = v->domain; +@@ -3252,7 +3248,11 @@ static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool do_locking, + return old_entry; + } + +- if ( do_locking ) paging_lock(v->domain); ++ /* ++ * This is used externally (with the paging lock not taken) and internally ++ * by the shadow code (with the lock already taken). ++ */ ++ paging_lock_recursive(v->domain); + + #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Need to resync all the shadow entries on a TLB flush. Resync +@@ -3480,8 +3480,7 @@ static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool do_locking, + shadow_sync_other_vcpus(v); + #endif + +- /* Release the lock, if we took it (otherwise it's the caller's problem) */ +- if ( do_locking ) paging_unlock(v->domain); ++ paging_unlock(v->domain); + + return old_entry; + } +diff --git a/xen/arch/x86/mm/shadow/none.c b/xen/arch/x86/mm/shadow/none.c +index 743c0ffb85..7e4e386cd0 100644 +--- a/xen/arch/x86/mm/shadow/none.c ++++ b/xen/arch/x86/mm/shadow/none.c +@@ -52,8 +52,7 @@ static unsigned long cf_check _gva_to_gfn( + } + #endif + +-static pagetable_t cf_check _update_cr3(struct vcpu *v, bool do_locking, +- bool noflush) ++static pagetable_t cf_check _update_cr3(struct vcpu *v, bool noflush) + { + ASSERT_UNREACHABLE(); + return pagetable_null(); +-- +2.44.0 + diff --git a/0047-xen-Swap-order-of-actions-in-the-FREE-macros.patch b/0047-xen-Swap-order-of-actions-in-the-FREE-macros.patch new file mode 100644 index 0000000..3e58906 --- /dev/null +++ b/0047-xen-Swap-order-of-actions-in-the-FREE-macros.patch @@ -0,0 +1,58 @@ +From 0a53565f1886201cc8a8afe9b2619ee297c20955 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Fri, 2 Feb 2024 00:39:42 +0000 +Subject: [PATCH 47/67] xen: Swap order of actions in the FREE*() macros + +Wherever possible, it is a good idea to NULL out the visible reference to an +object prior to freeing it. The FREE*() macros already collect together both +parts, making it easy to adjust. + +This has a marginal code generation improvement, as some of the calls to the +free() function can be tailcall optimised. + +No functional change. 
+ +Signed-off-by: Andrew Cooper +Acked-by: Jan Beulich +(cherry picked from commit c4f427ec879e7c0df6d44d02561e8bee838a293e) +--- + xen/include/xen/mm.h | 3 ++- + xen/include/xen/xmalloc.h | 7 ++++--- + 2 files changed, 6 insertions(+), 4 deletions(-) + +diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h +index 3dc61bcc3c..211685a5d2 100644 +--- a/xen/include/xen/mm.h ++++ b/xen/include/xen/mm.h +@@ -80,8 +80,9 @@ bool scrub_free_pages(void); + + /* Free an allocation, and zero the pointer to it. */ + #define FREE_XENHEAP_PAGES(p, o) do { \ +- free_xenheap_pages(p, o); \ ++ void *_ptr_ = (p); \ + (p) = NULL; \ ++ free_xenheap_pages(_ptr_, o); \ + } while ( false ) + #define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0) + +diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h +index 16979a117c..d857298011 100644 +--- a/xen/include/xen/xmalloc.h ++++ b/xen/include/xen/xmalloc.h +@@ -66,9 +66,10 @@ + extern void xfree(void *); + + /* Free an allocation, and zero the pointer to it. */ +-#define XFREE(p) do { \ +- xfree(p); \ +- (p) = NULL; \ ++#define XFREE(p) do { \ ++ void *_ptr_ = (p); \ ++ (p) = NULL; \ ++ xfree(_ptr_); \ + } while ( false ) + + /* Underlying functions */ +-- +2.44.0 + diff --git a/0048-x86-spinlock-introduce-support-for-blocking-speculat.patch b/0048-x86-spinlock-introduce-support-for-blocking-speculat.patch new file mode 100644 index 0000000..ecf0830 --- /dev/null +++ b/0048-x86-spinlock-introduce-support-for-blocking-speculat.patch @@ -0,0 +1,331 @@ +From 9d2f136328aab5537b7180a1b23e171893ebe455 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 13 Feb 2024 13:08:05 +0100 +Subject: [PATCH 48/67] x86/spinlock: introduce support for blocking + speculation into critical regions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Introduce a new Kconfig option to block speculation into lock protected +critical regions. The Kconfig option is enabled by default, but the mitigation +won't be engaged unless it's explicitly enabled in the command line using +`spec-ctrl=lock-harden`. + +Convert the spinlock acquire macros into always-inline functions, and introduce +a speculation barrier after the lock has been taken. Note the speculation +barrier is not placed inside the implementation of the spin lock functions, as +to prevent speculation from falling through the call to the lock functions +resulting in the barrier also being skipped. + +trylock variants are protected using a construct akin to the existing +evaluate_nospec(). + +This patch only implements the speculation barrier for x86. + +Note spin locks are the only locking primitive taken care in this change, +further locking primitives will be adjusted by separate changes. 
+ +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +(cherry picked from commit 7ef0084418e188d05f338c3e028fbbe8b6924afa) +--- + docs/misc/xen-command-line.pandoc | 7 ++++- + xen/arch/x86/include/asm/cpufeatures.h | 2 +- + xen/arch/x86/include/asm/nospec.h | 26 ++++++++++++++++++ + xen/arch/x86/spec_ctrl.c | 26 +++++++++++++++--- + xen/common/Kconfig | 17 ++++++++++++ + xen/include/xen/nospec.h | 15 +++++++++++ + xen/include/xen/spinlock.h | 37 +++++++++++++++++++++----- + 7 files changed, 119 insertions(+), 11 deletions(-) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index d909ec94fe..e1d56407dd 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2327,7 +2327,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). + > {msr-sc,rsb,verw,ibpb-entry}=|{pv,hvm}=, + > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, + > eager-fpu,l1d-flush,branch-harden,srb-lock, +-> unpriv-mmio,gds-mit,div-scrub}= ]` ++> unpriv-mmio,gds-mit,div-scrub,lock-harden}= ]` + + Controls for speculative execution sidechannel mitigations. By default, Xen + will pick the most appropriate mitigations based on compiled in support, +@@ -2454,6 +2454,11 @@ On all hardware, the `div-scrub=` option can be used to force or prevent Xen + from mitigating the DIV-leakage vulnerability. By default, Xen will mitigate + DIV-leakage on hardware believed to be vulnerable. + ++If Xen is compiled with `CONFIG_SPECULATIVE_HARDEN_LOCK`, the `lock-harden=` ++boolean can be used to force or prevent Xen from using speculation barriers to ++protect lock critical regions. This mitigation won't be engaged by default, ++and needs to be explicitly enabled on the command line. ++ + ### sync_console + > `= ` + +diff --git a/xen/arch/x86/include/asm/cpufeatures.h b/xen/arch/x86/include/asm/cpufeatures.h +index c3aad21c3b..7e8221fd85 100644 +--- a/xen/arch/x86/include/asm/cpufeatures.h ++++ b/xen/arch/x86/include/asm/cpufeatures.h +@@ -24,7 +24,7 @@ XEN_CPUFEATURE(APERFMPERF, X86_SYNTH( 8)) /* APERFMPERF */ + XEN_CPUFEATURE(MFENCE_RDTSC, X86_SYNTH( 9)) /* MFENCE synchronizes RDTSC */ + XEN_CPUFEATURE(XEN_SMEP, X86_SYNTH(10)) /* SMEP gets used by Xen itself */ + XEN_CPUFEATURE(XEN_SMAP, X86_SYNTH(11)) /* SMAP gets used by Xen itself */ +-/* Bit 12 unused. 
*/ ++XEN_CPUFEATURE(SC_NO_LOCK_HARDEN, X86_SYNTH(12)) /* (Disable) Lock critical region hardening */ + XEN_CPUFEATURE(IND_THUNK_LFENCE, X86_SYNTH(13)) /* Use IND_THUNK_LFENCE */ + XEN_CPUFEATURE(IND_THUNK_JMP, X86_SYNTH(14)) /* Use IND_THUNK_JMP */ + XEN_CPUFEATURE(SC_NO_BRANCH_HARDEN, X86_SYNTH(15)) /* (Disable) Conditional branch hardening */ +diff --git a/xen/arch/x86/include/asm/nospec.h b/xen/arch/x86/include/asm/nospec.h +index 7150e76b87..0725839e19 100644 +--- a/xen/arch/x86/include/asm/nospec.h ++++ b/xen/arch/x86/include/asm/nospec.h +@@ -38,6 +38,32 @@ static always_inline void block_speculation(void) + barrier_nospec_true(); + } + ++static always_inline void arch_block_lock_speculation(void) ++{ ++ alternative("lfence", "", X86_FEATURE_SC_NO_LOCK_HARDEN); ++} ++ ++/* Allow to insert a read memory barrier into conditionals */ ++static always_inline bool barrier_lock_true(void) ++{ ++ alternative("lfence #nospec-true", "", X86_FEATURE_SC_NO_LOCK_HARDEN); ++ return true; ++} ++ ++static always_inline bool barrier_lock_false(void) ++{ ++ alternative("lfence #nospec-false", "", X86_FEATURE_SC_NO_LOCK_HARDEN); ++ return false; ++} ++ ++static always_inline bool arch_lock_evaluate_nospec(bool condition) ++{ ++ if ( condition ) ++ return barrier_lock_true(); ++ else ++ return barrier_lock_false(); ++} ++ + #endif /* _ASM_X86_NOSPEC_H */ + + /* +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 1ee81e2dfe..ac21af2c5c 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -65,6 +65,7 @@ int8_t __read_mostly opt_eager_fpu = -1; + int8_t __read_mostly opt_l1d_flush = -1; + static bool __initdata opt_branch_harden = + IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH); ++static bool __initdata opt_lock_harden; + + bool __initdata bsp_delay_spec_ctrl; + uint8_t __read_mostly default_xen_spec_ctrl; +@@ -133,6 +134,7 @@ static int __init cf_check parse_spec_ctrl(const char *s) + opt_ssbd = false; + opt_l1d_flush = 0; + opt_branch_harden = false; ++ opt_lock_harden = false; + opt_srb_lock = 0; + opt_unpriv_mmio = false; + opt_gds_mit = 0; +@@ -298,6 +300,16 @@ static int __init cf_check parse_spec_ctrl(const char *s) + rc = -EINVAL; + } + } ++ else if ( (val = parse_boolean("lock-harden", s, ss)) >= 0 ) ++ { ++ if ( IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) ) ++ opt_lock_harden = val; ++ else ++ { ++ no_config_param("SPECULATIVE_HARDEN_LOCK", "spec-ctrl", s, ss); ++ rc = -EINVAL; ++ } ++ } + else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) + opt_srb_lock = val; + else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) +@@ -500,7 +512,8 @@ static void __init print_details(enum ind_thunk thunk) + if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) || + IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_ARRAY) || + IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) || +- IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) ) ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) || ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) ) + printk(" Compiled-in support:" + #ifdef CONFIG_INDIRECT_THUNK + " INDIRECT_THUNK" +@@ -516,11 +529,14 @@ static void __init print_details(enum ind_thunk thunk) + #endif + #ifdef CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS + " HARDEN_GUEST_ACCESS" ++#endif ++#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK ++ " HARDEN_LOCK" + #endif + "\n"); + + /* Settings for Xen's protection, irrespective of guests. 
*/ +- printk(" Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", ++ printk(" Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s%s\n", + thunk != THUNK_NONE ? "BTI-Thunk: " : "", + thunk == THUNK_NONE ? "" : + thunk == THUNK_RETPOLINE ? "RETPOLINE, " : +@@ -547,7 +563,8 @@ static void __init print_details(enum ind_thunk thunk) + opt_verw_pv || opt_verw_hvm || + opt_verw_mmio ? " VERW" : "", + opt_div_scrub ? " DIV" : "", +- opt_branch_harden ? " BRANCH_HARDEN" : ""); ++ opt_branch_harden ? " BRANCH_HARDEN" : "", ++ opt_lock_harden ? " LOCK_HARDEN" : ""); + + /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ + if ( cpu_has_bug_l1tf || opt_pv_l1tf_hwdom || opt_pv_l1tf_domu ) +@@ -1930,6 +1947,9 @@ void __init init_speculation_mitigations(void) + if ( !opt_branch_harden ) + setup_force_cpu_cap(X86_FEATURE_SC_NO_BRANCH_HARDEN); + ++ if ( !opt_lock_harden ) ++ setup_force_cpu_cap(X86_FEATURE_SC_NO_LOCK_HARDEN); ++ + /* + * We do not disable HT by default on affected hardware. + * +diff --git a/xen/common/Kconfig b/xen/common/Kconfig +index e7794cb7f6..cd73851538 100644 +--- a/xen/common/Kconfig ++++ b/xen/common/Kconfig +@@ -173,6 +173,23 @@ config SPECULATIVE_HARDEN_GUEST_ACCESS + + If unsure, say Y. + ++config SPECULATIVE_HARDEN_LOCK ++ bool "Speculative lock context hardening" ++ default y ++ depends on X86 ++ help ++ Contemporary processors may use speculative execution as a ++ performance optimisation, but this can potentially be abused by an ++ attacker to leak data via speculative sidechannels. ++ ++ One source of data leakage is via speculative accesses to lock ++ critical regions. ++ ++ This option is disabled by default at run time, and needs to be ++ enabled on the command line. ++ ++ If unsure, say Y. 
++ + endmenu + + config DIT_DEFAULT +diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h +index 76255bc46e..4552846403 100644 +--- a/xen/include/xen/nospec.h ++++ b/xen/include/xen/nospec.h +@@ -70,6 +70,21 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, + #define array_access_nospec(array, index) \ + (array)[array_index_nospec(index, ARRAY_SIZE(array))] + ++static always_inline void block_lock_speculation(void) ++{ ++#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK ++ arch_block_lock_speculation(); ++#endif ++} ++ ++static always_inline bool lock_evaluate_nospec(bool condition) ++{ ++#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK ++ return arch_lock_evaluate_nospec(condition); ++#endif ++ return condition; ++} ++ + #endif /* XEN_NOSPEC_H */ + + /* +diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h +index 961891bea4..daf48fdea7 100644 +--- a/xen/include/xen/spinlock.h ++++ b/xen/include/xen/spinlock.h +@@ -1,6 +1,7 @@ + #ifndef __SPINLOCK_H__ + #define __SPINLOCK_H__ + ++#include + #include + #include + #include +@@ -189,13 +190,30 @@ int _spin_trylock_recursive(spinlock_t *lock); + void _spin_lock_recursive(spinlock_t *lock); + void _spin_unlock_recursive(spinlock_t *lock); + +-#define spin_lock(l) _spin_lock(l) +-#define spin_lock_cb(l, c, d) _spin_lock_cb(l, c, d) +-#define spin_lock_irq(l) _spin_lock_irq(l) ++static always_inline void spin_lock(spinlock_t *l) ++{ ++ _spin_lock(l); ++ block_lock_speculation(); ++} ++ ++static always_inline void spin_lock_cb(spinlock_t *l, void (*c)(void *data), ++ void *d) ++{ ++ _spin_lock_cb(l, c, d); ++ block_lock_speculation(); ++} ++ ++static always_inline void spin_lock_irq(spinlock_t *l) ++{ ++ _spin_lock_irq(l); ++ block_lock_speculation(); ++} ++ + #define spin_lock_irqsave(l, f) \ + ({ \ + BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ + ((f) = _spin_lock_irqsave(l)); \ ++ block_lock_speculation(); \ + }) + + #define spin_unlock(l) _spin_unlock(l) +@@ -203,7 +221,7 @@ void _spin_unlock_recursive(spinlock_t *lock); + #define spin_unlock_irqrestore(l, f) _spin_unlock_irqrestore(l, f) + + #define spin_is_locked(l) _spin_is_locked(l) +-#define spin_trylock(l) _spin_trylock(l) ++#define spin_trylock(l) lock_evaluate_nospec(_spin_trylock(l)) + + #define spin_trylock_irqsave(lock, flags) \ + ({ \ +@@ -224,8 +242,15 @@ void _spin_unlock_recursive(spinlock_t *lock); + * are any critical regions that cannot form part of such a set, they can use + * standard spin_[un]lock(). 
+ */ +-#define spin_trylock_recursive(l) _spin_trylock_recursive(l) +-#define spin_lock_recursive(l) _spin_lock_recursive(l) ++#define spin_trylock_recursive(l) \ ++ lock_evaluate_nospec(_spin_trylock_recursive(l)) ++ ++static always_inline void spin_lock_recursive(spinlock_t *l) ++{ ++ _spin_lock_recursive(l); ++ block_lock_speculation(); ++} ++ + #define spin_unlock_recursive(l) _spin_unlock_recursive(l) + + #endif /* __SPINLOCK_H__ */ +-- +2.44.0 + diff --git a/0049-rwlock-introduce-support-for-blocking-speculation-in.patch b/0049-rwlock-introduce-support-for-blocking-speculation-in.patch new file mode 100644 index 0000000..593b588 --- /dev/null +++ b/0049-rwlock-introduce-support-for-blocking-speculation-in.patch @@ -0,0 +1,125 @@ +From 7454dad6ee15f9fa6d84fc285d366b86f3d47494 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 13 Feb 2024 16:08:52 +0100 +Subject: [PATCH 49/67] rwlock: introduce support for blocking speculation into + critical regions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Introduce inline wrappers as required and add direct calls to +block_lock_speculation() in order to prevent speculation into the rwlock +protected critical regions. + +Note the rwlock primitives are adjusted to use the non speculation safe variants +of the spinlock handlers, as a speculation barrier is added in the rwlock +calling wrappers. + +trylock variants are protected by using lock_evaluate_nospec(). + +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +(cherry picked from commit a1fb15f61692b1fa9945fc51f55471ace49cdd59) +--- + xen/common/rwlock.c | 14 +++++++++++--- + xen/include/xen/rwlock.h | 34 ++++++++++++++++++++++++++++------ + 2 files changed, 39 insertions(+), 9 deletions(-) + +diff --git a/xen/common/rwlock.c b/xen/common/rwlock.c +index aa15529bbe..cda06b9d6e 100644 +--- a/xen/common/rwlock.c ++++ b/xen/common/rwlock.c +@@ -34,8 +34,11 @@ void queue_read_lock_slowpath(rwlock_t *lock) + + /* + * Put the reader into the wait queue. ++ * ++ * Use the speculation unsafe helper, as it's the caller responsibility to ++ * issue a speculation barrier if required. + */ +- spin_lock(&lock->lock); ++ _spin_lock(&lock->lock); + + /* + * At the head of the wait queue now, wait until the writer state +@@ -64,8 +67,13 @@ void queue_write_lock_slowpath(rwlock_t *lock) + { + u32 cnts; + +- /* Put the writer into the wait queue. */ +- spin_lock(&lock->lock); ++ /* ++ * Put the writer into the wait queue. ++ * ++ * Use the speculation unsafe helper, as it's the caller responsibility to ++ * issue a speculation barrier if required. ++ */ ++ _spin_lock(&lock->lock); + + /* Try to acquire the lock directly if no reader is present. 
*/ + if ( !atomic_read(&lock->cnts) && +diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h +index 0cc9167715..fd0458be94 100644 +--- a/xen/include/xen/rwlock.h ++++ b/xen/include/xen/rwlock.h +@@ -247,27 +247,49 @@ static inline int _rw_is_write_locked(rwlock_t *lock) + return (atomic_read(&lock->cnts) & _QW_WMASK) == _QW_LOCKED; + } + +-#define read_lock(l) _read_lock(l) +-#define read_lock_irq(l) _read_lock_irq(l) ++static always_inline void read_lock(rwlock_t *l) ++{ ++ _read_lock(l); ++ block_lock_speculation(); ++} ++ ++static always_inline void read_lock_irq(rwlock_t *l) ++{ ++ _read_lock_irq(l); ++ block_lock_speculation(); ++} ++ + #define read_lock_irqsave(l, f) \ + ({ \ + BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ + ((f) = _read_lock_irqsave(l)); \ ++ block_lock_speculation(); \ + }) + + #define read_unlock(l) _read_unlock(l) + #define read_unlock_irq(l) _read_unlock_irq(l) + #define read_unlock_irqrestore(l, f) _read_unlock_irqrestore(l, f) +-#define read_trylock(l) _read_trylock(l) ++#define read_trylock(l) lock_evaluate_nospec(_read_trylock(l)) ++ ++static always_inline void write_lock(rwlock_t *l) ++{ ++ _write_lock(l); ++ block_lock_speculation(); ++} ++ ++static always_inline void write_lock_irq(rwlock_t *l) ++{ ++ _write_lock_irq(l); ++ block_lock_speculation(); ++} + +-#define write_lock(l) _write_lock(l) +-#define write_lock_irq(l) _write_lock_irq(l) + #define write_lock_irqsave(l, f) \ + ({ \ + BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ + ((f) = _write_lock_irqsave(l)); \ ++ block_lock_speculation(); \ + }) +-#define write_trylock(l) _write_trylock(l) ++#define write_trylock(l) lock_evaluate_nospec(_write_trylock(l)) + + #define write_unlock(l) _write_unlock(l) + #define write_unlock_irq(l) _write_unlock_irq(l) +-- +2.44.0 + diff --git a/0050-percpu-rwlock-introduce-support-for-blocking-specula.patch b/0050-percpu-rwlock-introduce-support-for-blocking-specula.patch new file mode 100644 index 0000000..1da2128 --- /dev/null +++ b/0050-percpu-rwlock-introduce-support-for-blocking-specula.patch @@ -0,0 +1,87 @@ +From 468a368b2e5a38fc0be8e9e5f475820f7e4a6b4f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 13 Feb 2024 17:57:38 +0100 +Subject: [PATCH 50/67] percpu-rwlock: introduce support for blocking + speculation into critical regions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add direct calls to block_lock_speculation() where required in order to prevent +speculation into the lock protected critical regions. Also convert +_percpu_read_lock() from inline to always_inline. + +Note that _percpu_write_lock() has been modified the use the non speculation +safe of the locking primites, as a speculation is added unconditionally by the +calling wrapper. + +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +(cherry picked from commit f218daf6d3a3b847736d37c6a6b76031a0d08441) +--- + xen/common/rwlock.c | 6 +++++- + xen/include/xen/rwlock.h | 14 ++++++++++---- + 2 files changed, 15 insertions(+), 5 deletions(-) + +diff --git a/xen/common/rwlock.c b/xen/common/rwlock.c +index cda06b9d6e..4da0ed8fad 100644 +--- a/xen/common/rwlock.c ++++ b/xen/common/rwlock.c +@@ -125,8 +125,12 @@ void _percpu_write_lock(percpu_rwlock_t **per_cpudata, + /* + * First take the write lock to protect against other writers or slow + * path readers. 
++ * ++ * Note we use the speculation unsafe variant of write_lock(), as the ++ * calling wrapper already adds a speculation barrier after the lock has ++ * been taken. + */ +- write_lock(&percpu_rwlock->rwlock); ++ _write_lock(&percpu_rwlock->rwlock); + + /* Now set the global variable so that readers start using read_lock. */ + percpu_rwlock->writer_activating = 1; +diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h +index fd0458be94..abe0804bf7 100644 +--- a/xen/include/xen/rwlock.h ++++ b/xen/include/xen/rwlock.h +@@ -326,8 +326,8 @@ static inline void _percpu_rwlock_owner_check(percpu_rwlock_t **per_cpudata, + #define percpu_rwlock_resource_init(l, owner) \ + (*(l) = (percpu_rwlock_t)PERCPU_RW_LOCK_UNLOCKED(&get_per_cpu_var(owner))) + +-static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, +- percpu_rwlock_t *percpu_rwlock) ++static always_inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, ++ percpu_rwlock_t *percpu_rwlock) + { + /* Validate the correct per_cpudata variable has been provided. */ + _percpu_rwlock_owner_check(per_cpudata, percpu_rwlock); +@@ -362,6 +362,8 @@ static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, + } + else + { ++ /* Other branch already has a speculation barrier in read_lock(). */ ++ block_lock_speculation(); + /* All other paths have implicit check_lock() calls via read_lock(). */ + check_lock(&percpu_rwlock->rwlock.lock.debug, false); + } +@@ -410,8 +412,12 @@ static inline void _percpu_write_unlock(percpu_rwlock_t **per_cpudata, + _percpu_read_lock(&get_per_cpu_var(percpu), lock) + #define percpu_read_unlock(percpu, lock) \ + _percpu_read_unlock(&get_per_cpu_var(percpu), lock) +-#define percpu_write_lock(percpu, lock) \ +- _percpu_write_lock(&get_per_cpu_var(percpu), lock) ++ ++#define percpu_write_lock(percpu, lock) \ ++({ \ ++ _percpu_write_lock(&get_per_cpu_var(percpu), lock); \ ++ block_lock_speculation(); \ ++}) + #define percpu_write_unlock(percpu, lock) \ + _percpu_write_unlock(&get_per_cpu_var(percpu), lock) + +-- +2.44.0 + diff --git a/0051-locking-attempt-to-ensure-lock-wrappers-are-always-i.patch b/0051-locking-attempt-to-ensure-lock-wrappers-are-always-i.patch new file mode 100644 index 0000000..822836d --- /dev/null +++ b/0051-locking-attempt-to-ensure-lock-wrappers-are-always-i.patch @@ -0,0 +1,405 @@ +From 2cc5e57be680a516aa5cdef4281856d09b9d0ea6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Mon, 4 Mar 2024 14:29:36 +0100 +Subject: [PATCH 51/67] locking: attempt to ensure lock wrappers are always + inline +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +In order to prevent the locking speculation barriers from being inside of +`call`ed functions that could be speculatively bypassed. + +While there also add an extra locking barrier to _mm_write_lock() in the branch +taken when the lock is already held. + +Note some functions are switched to use the unsafe variants (without speculation +barrier) of the locking primitives, but a speculation barrier is always added +to the exposed public lock wrapping helper. That's the case with +sched_spin_lock_double() or pcidevs_lock() for example. 
+ +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +(cherry picked from commit 197ecd838a2aaf959a469df3696d4559c4f8b762) +--- + xen/arch/x86/hvm/vpt.c | 10 +++++++--- + xen/arch/x86/include/asm/irq.h | 1 + + xen/arch/x86/mm/mm-locks.h | 28 +++++++++++++++------------- + xen/arch/x86/mm/p2m-pod.c | 2 +- + xen/common/event_channel.c | 5 +++-- + xen/common/grant_table.c | 6 +++--- + xen/common/sched/core.c | 19 ++++++++++++------- + xen/common/sched/private.h | 26 ++++++++++++++++++++++++-- + xen/common/timer.c | 8 +++++--- + xen/drivers/passthrough/pci.c | 5 +++-- + xen/include/xen/event.h | 4 ++-- + xen/include/xen/pci.h | 8 ++++++-- + 12 files changed, 82 insertions(+), 40 deletions(-) + +diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c +index cb1d81bf9e..66f1095245 100644 +--- a/xen/arch/x86/hvm/vpt.c ++++ b/xen/arch/x86/hvm/vpt.c +@@ -161,7 +161,7 @@ static int pt_irq_masked(struct periodic_time *pt) + * pt->vcpu field, because another thread holding the pt_migrate lock + * may already be spinning waiting for your vcpu lock. + */ +-static void pt_vcpu_lock(struct vcpu *v) ++static always_inline void pt_vcpu_lock(struct vcpu *v) + { + spin_lock(&v->arch.hvm.tm_lock); + } +@@ -180,9 +180,13 @@ static void pt_vcpu_unlock(struct vcpu *v) + * need to take an additional lock that protects against pt->vcpu + * changing. + */ +-static void pt_lock(struct periodic_time *pt) ++static always_inline void pt_lock(struct periodic_time *pt) + { +- read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); ++ /* ++ * Use the speculation unsafe variant for the first lock, as the following ++ * lock taking helper already includes a speculation barrier. ++ */ ++ _read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); + spin_lock(&pt->vcpu->arch.hvm.tm_lock); + } + +diff --git a/xen/arch/x86/include/asm/irq.h b/xen/arch/x86/include/asm/irq.h +index f6a0207a80..823d627fd0 100644 +--- a/xen/arch/x86/include/asm/irq.h ++++ b/xen/arch/x86/include/asm/irq.h +@@ -178,6 +178,7 @@ void cf_check irq_complete_move(struct irq_desc *); + + extern struct irq_desc *irq_desc; + ++/* Not speculation safe, only used for AP bringup. 
*/ + void lock_vector_lock(void); + void unlock_vector_lock(void); + +diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h +index c1523aeccf..265239c49f 100644 +--- a/xen/arch/x86/mm/mm-locks.h ++++ b/xen/arch/x86/mm/mm-locks.h +@@ -86,8 +86,8 @@ static inline void _set_lock_level(int l) + this_cpu(mm_lock_level) = l; + } + +-static inline void _mm_lock(const struct domain *d, mm_lock_t *l, +- const char *func, int level, int rec) ++static always_inline void _mm_lock(const struct domain *d, mm_lock_t *l, ++ const char *func, int level, int rec) + { + if ( !((mm_locked_by_me(l)) && rec) ) + _check_lock_level(d, level); +@@ -137,8 +137,8 @@ static inline int mm_write_locked_by_me(mm_rwlock_t *l) + return (l->locker == get_processor_id()); + } + +-static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, +- const char *func, int level) ++static always_inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, ++ const char *func, int level) + { + if ( !mm_write_locked_by_me(l) ) + { +@@ -149,6 +149,8 @@ static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, + l->unlock_level = _get_lock_level(); + _set_lock_level(_lock_level(d, level)); + } ++ else ++ block_speculation(); + l->recurse_count++; + } + +@@ -162,8 +164,8 @@ static inline void mm_write_unlock(mm_rwlock_t *l) + percpu_write_unlock(p2m_percpu_rwlock, &l->lock); + } + +-static inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, +- int level) ++static always_inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, ++ int level) + { + _check_lock_level(d, level); + percpu_read_lock(p2m_percpu_rwlock, &l->lock); +@@ -178,15 +180,15 @@ static inline void mm_read_unlock(mm_rwlock_t *l) + + /* This wrapper uses the line number to express the locking order below */ + #define declare_mm_lock(name) \ +- static inline void mm_lock_##name(const struct domain *d, mm_lock_t *l, \ +- const char *func, int rec) \ ++ static always_inline void mm_lock_##name( \ ++ const struct domain *d, mm_lock_t *l, const char *func, int rec) \ + { _mm_lock(d, l, func, MM_LOCK_ORDER_##name, rec); } + #define declare_mm_rwlock(name) \ +- static inline void mm_write_lock_##name(const struct domain *d, \ +- mm_rwlock_t *l, const char *func) \ ++ static always_inline void mm_write_lock_##name( \ ++ const struct domain *d, mm_rwlock_t *l, const char *func) \ + { _mm_write_lock(d, l, func, MM_LOCK_ORDER_##name); } \ +- static inline void mm_read_lock_##name(const struct domain *d, \ +- mm_rwlock_t *l) \ ++ static always_inline void mm_read_lock_##name(const struct domain *d, \ ++ mm_rwlock_t *l) \ + { _mm_read_lock(d, l, MM_LOCK_ORDER_##name); } + /* These capture the name of the calling function */ + #define mm_lock(name, d, l) mm_lock_##name(d, l, __func__, 0) +@@ -321,7 +323,7 @@ declare_mm_lock(altp2mlist) + #define MM_LOCK_ORDER_altp2m 40 + declare_mm_rwlock(altp2m); + +-static inline void p2m_lock(struct p2m_domain *p) ++static always_inline void p2m_lock(struct p2m_domain *p) + { + if ( p2m_is_altp2m(p) ) + mm_write_lock(altp2m, p->domain, &p->lock); +diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c +index fc110506dc..99dbcb3101 100644 +--- a/xen/arch/x86/mm/p2m-pod.c ++++ b/xen/arch/x86/mm/p2m-pod.c +@@ -36,7 +36,7 @@ + #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0) + + /* Enforce lock ordering when grabbing the "external" page_alloc lock */ +-static inline void lock_page_alloc(struct p2m_domain *p2m) ++static always_inline void 
lock_page_alloc(struct p2m_domain *p2m) + { + page_alloc_mm_pre_lock(p2m->domain); + spin_lock(&(p2m->domain->page_alloc_lock)); +diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c +index f5e0b12d15..dada9f15f5 100644 +--- a/xen/common/event_channel.c ++++ b/xen/common/event_channel.c +@@ -62,7 +62,7 @@ + * just assume the event channel is free or unbound at the moment when the + * evtchn_read_trylock() returns false. + */ +-static inline void evtchn_write_lock(struct evtchn *evtchn) ++static always_inline void evtchn_write_lock(struct evtchn *evtchn) + { + write_lock(&evtchn->lock); + +@@ -364,7 +364,8 @@ int evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc, evtchn_port_t port) + return rc; + } + +-static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn) ++static always_inline void double_evtchn_lock(struct evtchn *lchn, ++ struct evtchn *rchn) + { + ASSERT(lchn != rchn); + +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index ee7cc496b8..62a8685cd5 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -410,7 +410,7 @@ static inline void act_set_gfn(struct active_grant_entry *act, gfn_t gfn) + + static DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock); + +-static inline void grant_read_lock(struct grant_table *gt) ++static always_inline void grant_read_lock(struct grant_table *gt) + { + percpu_read_lock(grant_rwlock, >->lock); + } +@@ -420,7 +420,7 @@ static inline void grant_read_unlock(struct grant_table *gt) + percpu_read_unlock(grant_rwlock, >->lock); + } + +-static inline void grant_write_lock(struct grant_table *gt) ++static always_inline void grant_write_lock(struct grant_table *gt) + { + percpu_write_lock(grant_rwlock, >->lock); + } +@@ -457,7 +457,7 @@ nr_active_grant_frames(struct grant_table *gt) + return num_act_frames_from_sha_frames(nr_grant_frames(gt)); + } + +-static inline struct active_grant_entry * ++static always_inline struct active_grant_entry * + active_entry_acquire(struct grant_table *t, grant_ref_t e) + { + struct active_grant_entry *act; +diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c +index 078beb1adb..29bbab5ac6 100644 +--- a/xen/common/sched/core.c ++++ b/xen/common/sched/core.c +@@ -348,23 +348,28 @@ uint64_t get_cpu_idle_time(unsigned int cpu) + * This avoids dead- or live-locks when this code is running on both + * cpus at the same time. + */ +-static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2, +- unsigned long *flags) ++static always_inline void sched_spin_lock_double( ++ spinlock_t *lock1, spinlock_t *lock2, unsigned long *flags) + { ++ /* ++ * In order to avoid extra overhead, use the locking primitives without the ++ * speculation barrier, and introduce a single barrier here. 
++ */ + if ( lock1 == lock2 ) + { +- spin_lock_irqsave(lock1, *flags); ++ *flags = _spin_lock_irqsave(lock1); + } + else if ( lock1 < lock2 ) + { +- spin_lock_irqsave(lock1, *flags); +- spin_lock(lock2); ++ *flags = _spin_lock_irqsave(lock1); ++ _spin_lock(lock2); + } + else + { +- spin_lock_irqsave(lock2, *flags); +- spin_lock(lock1); ++ *flags = _spin_lock_irqsave(lock2); ++ _spin_lock(lock1); + } ++ block_lock_speculation(); + } + + static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2, +diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h +index 0527a8c70d..24a93dd0c1 100644 +--- a/xen/common/sched/private.h ++++ b/xen/common/sched/private.h +@@ -207,8 +207,24 @@ DECLARE_PER_CPU(cpumask_t, cpumask_scratch); + #define cpumask_scratch (&this_cpu(cpumask_scratch)) + #define cpumask_scratch_cpu(c) (&per_cpu(cpumask_scratch, c)) + ++/* ++ * Deal with _spin_lock_irqsave() returning the flags value instead of storing ++ * it in a passed parameter. ++ */ ++#define _sched_spinlock0(lock, irq) _spin_lock##irq(lock) ++#define _sched_spinlock1(lock, irq, arg) ({ \ ++ BUILD_BUG_ON(sizeof(arg) != sizeof(unsigned long)); \ ++ (arg) = _spin_lock##irq(lock); \ ++}) ++ ++#define _sched_spinlock__(nr) _sched_spinlock ## nr ++#define _sched_spinlock_(nr) _sched_spinlock__(nr) ++#define _sched_spinlock(lock, irq, args...) \ ++ _sched_spinlock_(count_args(args))(lock, irq, ## args) ++ + #define sched_lock(kind, param, cpu, irq, arg...) \ +-static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ ++static always_inline spinlock_t \ ++*kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ + { \ + for ( ; ; ) \ + { \ +@@ -220,10 +236,16 @@ static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ + * \ + * It may also be the case that v->processor may change but the \ + * lock may be the same; this will succeed in that case. \ ++ * \ ++ * Use the speculation unsafe locking helper, there's a speculation \ ++ * barrier before returning to the caller. \ + */ \ +- spin_lock##irq(lock, ## arg); \ ++ _sched_spinlock(lock, irq, ## arg); \ + if ( likely(lock == get_sched_res(cpu)->schedule_lock) ) \ ++ { \ ++ block_lock_speculation(); \ + return lock; \ ++ } \ + spin_unlock##irq(lock, ## arg); \ + } \ + } +diff --git a/xen/common/timer.c b/xen/common/timer.c +index 9b5016d5ed..459668d417 100644 +--- a/xen/common/timer.c ++++ b/xen/common/timer.c +@@ -240,7 +240,7 @@ static inline void deactivate_timer(struct timer *timer) + list_add(&timer->inactive, &per_cpu(timers, timer->cpu).inactive); + } + +-static inline bool_t timer_lock(struct timer *timer) ++static inline bool_t timer_lock_unsafe(struct timer *timer) + { + unsigned int cpu; + +@@ -254,7 +254,8 @@ static inline bool_t timer_lock(struct timer *timer) + rcu_read_unlock(&timer_cpu_read_lock); + return 0; + } +- spin_lock(&per_cpu(timers, cpu).lock); ++ /* Use the speculation unsafe variant, the wrapper has the barrier. 
*/ ++ _spin_lock(&per_cpu(timers, cpu).lock); + if ( likely(timer->cpu == cpu) ) + break; + spin_unlock(&per_cpu(timers, cpu).lock); +@@ -267,8 +268,9 @@ static inline bool_t timer_lock(struct timer *timer) + #define timer_lock_irqsave(t, flags) ({ \ + bool_t __x; \ + local_irq_save(flags); \ +- if ( !(__x = timer_lock(t)) ) \ ++ if ( !(__x = timer_lock_unsafe(t)) ) \ + local_irq_restore(flags); \ ++ block_lock_speculation(); \ + __x; \ + }) + +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index 8c62b14d19..1b3d285166 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -52,9 +52,10 @@ struct pci_seg { + + static spinlock_t _pcidevs_lock = SPIN_LOCK_UNLOCKED; + +-void pcidevs_lock(void) ++/* Do not use, as it has no speculation barrier, use pcidevs_lock() instead. */ ++void pcidevs_lock_unsafe(void) + { +- spin_lock_recursive(&_pcidevs_lock); ++ _spin_lock_recursive(&_pcidevs_lock); + } + + void pcidevs_unlock(void) +diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h +index 8eae9984a9..dd96e84c69 100644 +--- a/xen/include/xen/event.h ++++ b/xen/include/xen/event.h +@@ -114,12 +114,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport); + #define bucket_from_port(d, p) \ + ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET]) + +-static inline void evtchn_read_lock(struct evtchn *evtchn) ++static always_inline void evtchn_read_lock(struct evtchn *evtchn) + { + read_lock(&evtchn->lock); + } + +-static inline bool evtchn_read_trylock(struct evtchn *evtchn) ++static always_inline bool evtchn_read_trylock(struct evtchn *evtchn) + { + return read_trylock(&evtchn->lock); + } +diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h +index 5975ca2f30..b373f139d1 100644 +--- a/xen/include/xen/pci.h ++++ b/xen/include/xen/pci.h +@@ -155,8 +155,12 @@ struct pci_dev { + * devices, it also sync the access to the msi capability that is not + * interrupt handling related (the mask bit register). + */ +- +-void pcidevs_lock(void); ++void pcidevs_lock_unsafe(void); ++static always_inline void pcidevs_lock(void) ++{ ++ pcidevs_lock_unsafe(); ++ block_lock_speculation(); ++} + void pcidevs_unlock(void); + bool_t __must_check pcidevs_locked(void); + +-- +2.44.0 + diff --git a/0052-x86-mm-add-speculation-barriers-to-open-coded-locks.patch b/0052-x86-mm-add-speculation-barriers-to-open-coded-locks.patch new file mode 100644 index 0000000..9e20f78 --- /dev/null +++ b/0052-x86-mm-add-speculation-barriers-to-open-coded-locks.patch @@ -0,0 +1,73 @@ +From 074b4c8987db235a0b86798810c045f68e4775b6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Mon, 4 Mar 2024 18:08:48 +0100 +Subject: [PATCH 52/67] x86/mm: add speculation barriers to open coded locks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add a speculation barrier to the clearly identified open-coded lock taking +functions. + +Note that the memory sharing page_lock() replacement (_page_lock()) is left +as-is, as the code is experimental and not security supported. 
+ +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +(cherry picked from commit 42a572a38e22a97d86a4b648a22597628d5b42e4) +--- + xen/arch/x86/include/asm/mm.h | 4 +++- + xen/arch/x86/mm.c | 6 ++++-- + 2 files changed, 7 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h +index a5d7fdd32e..5845b729c3 100644 +--- a/xen/arch/x86/include/asm/mm.h ++++ b/xen/arch/x86/include/asm/mm.h +@@ -393,7 +393,9 @@ const struct platform_bad_page *get_platform_badpages(unsigned int *array_size); + * The use of PGT_locked in mem_sharing does not collide, since mem_sharing is + * only supported for hvm guests, which do not have PV PTEs updated. + */ +-int page_lock(struct page_info *page); ++int page_lock_unsafe(struct page_info *page); ++#define page_lock(pg) lock_evaluate_nospec(page_lock_unsafe(pg)) ++ + void page_unlock(struct page_info *page); + + void put_page_type(struct page_info *page); +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 330c4abcd1..8d19d719bd 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -2033,7 +2033,7 @@ static inline bool current_locked_page_ne_check(struct page_info *page) { + #define current_locked_page_ne_check(x) true + #endif + +-int page_lock(struct page_info *page) ++int page_lock_unsafe(struct page_info *page) + { + unsigned long x, nx; + +@@ -2094,7 +2094,7 @@ void page_unlock(struct page_info *page) + * l3t_lock(), so to avoid deadlock we must avoid grabbing them in + * reverse order. + */ +-static void l3t_lock(struct page_info *page) ++static always_inline void l3t_lock(struct page_info *page) + { + unsigned long x, nx; + +@@ -2103,6 +2103,8 @@ static void l3t_lock(struct page_info *page) + cpu_relax(); + nx = x | PGT_locked; + } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); ++ ++ block_lock_speculation(); + } + + static void l3t_unlock(struct page_info *page) +-- +2.44.0 + diff --git a/0053-x86-protect-conditional-lock-taking-from-speculative.patch b/0053-x86-protect-conditional-lock-taking-from-speculative.patch new file mode 100644 index 0000000..f0caa24 --- /dev/null +++ b/0053-x86-protect-conditional-lock-taking-from-speculative.patch @@ -0,0 +1,216 @@ +From 0ebd2e49bcd0f566ba6b9158555942aab8e41332 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Mon, 4 Mar 2024 16:24:21 +0100 +Subject: [PATCH 53/67] x86: protect conditional lock taking from speculative + execution +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Conditionally taken locks that use the pattern: + +if ( lock ) + spin_lock(...); + +Need an else branch in order to issue an speculation barrier in the else case, +just like it's done in case the lock needs to be acquired. + +eval_nospec() could be used on the condition itself, but that would result in a +double barrier on the branch where the lock is taken. + +Introduce a new pair of helpers, {gfn,spin}_lock_if() that can be used to +conditionally take a lock in a speculation safe way. 
+ +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +(cherry picked from commit 03cf7ca23e0e876075954c558485b267b7d02406) +--- + xen/arch/x86/mm.c | 35 +++++++++++++---------------------- + xen/arch/x86/mm/mm-locks.h | 9 +++++++++ + xen/arch/x86/mm/p2m.c | 5 ++--- + xen/include/xen/spinlock.h | 8 ++++++++ + 4 files changed, 32 insertions(+), 25 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 8d19d719bd..d31b8d56ff 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -5023,8 +5023,7 @@ static l3_pgentry_t *virt_to_xen_l3e(unsigned long v) + if ( !l3t ) + return NULL; + UNMAP_DOMAIN_PAGE(l3t); +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) ) + { + l4_pgentry_t l4e = l4e_from_mfn(l3mfn, __PAGE_HYPERVISOR); +@@ -5061,8 +5060,7 @@ static l2_pgentry_t *virt_to_xen_l2e(unsigned long v) + return NULL; + } + UNMAP_DOMAIN_PAGE(l2t); +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) + { + l3e_write(pl3e, l3e_from_mfn(l2mfn, __PAGE_HYPERVISOR)); +@@ -5100,8 +5098,7 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v) + return NULL; + } + UNMAP_DOMAIN_PAGE(l1t); +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) + { + l2e_write(pl2e, l2e_from_mfn(l1mfn, __PAGE_HYPERVISOR)); +@@ -5132,6 +5129,8 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v) + do { \ + if ( locking ) \ + l3t_lock(page); \ ++ else \ ++ block_lock_speculation(); \ + } while ( false ) + + #define L3T_UNLOCK(page) \ +@@ -5347,8 +5346,7 @@ int map_pages_to_xen( + if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL ) + flush_flags |= FLUSH_TLB_GLOBAL; + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && + (l3e_get_flags(*pl3e) & _PAGE_PSE) ) + { +@@ -5452,8 +5450,7 @@ int map_pages_to_xen( + if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL ) + flush_flags |= FLUSH_TLB_GLOBAL; + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && + (l2e_get_flags(*pl2e) & _PAGE_PSE) ) + { +@@ -5494,8 +5491,7 @@ int map_pages_to_xen( + unsigned long base_mfn; + const l1_pgentry_t *l1t; + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + + ol2e = *pl2e; + /* +@@ -5549,8 +5545,7 @@ int map_pages_to_xen( + unsigned long base_mfn; + const l2_pgentry_t *l2t; + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + + ol3e = *pl3e; + /* +@@ -5694,8 +5689,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + l3e_get_flags(*pl3e))); + UNMAP_DOMAIN_PAGE(l2t); + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && + (l3e_get_flags(*pl3e) & _PAGE_PSE) ) + { +@@ -5754,8 +5748,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + l2e_get_flags(*pl2e) & ~_PAGE_PSE)); + UNMAP_DOMAIN_PAGE(l1t); + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && + (l2e_get_flags(*pl2e) & _PAGE_PSE) ) + { +@@ -5799,8 +5792,7 @@ int modify_xen_mappings(unsigned long s, unsigned 
long e, unsigned int nf) + */ + if ( (nf & _PAGE_PRESENT) || ((v != e) && (l1_table_offset(v) != 0)) ) + continue; +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + + /* + * L2E may be already cleared, or set to a superpage, by +@@ -5847,8 +5839,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + if ( (nf & _PAGE_PRESENT) || + ((v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0)) ) + continue; +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + + /* + * L3E may be already cleared, or set to a superpage, by +diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h +index 265239c49f..3ea2d8eb03 100644 +--- a/xen/arch/x86/mm/mm-locks.h ++++ b/xen/arch/x86/mm/mm-locks.h +@@ -347,6 +347,15 @@ static inline void p2m_unlock(struct p2m_domain *p) + #define p2m_locked_by_me(p) mm_write_locked_by_me(&(p)->lock) + #define gfn_locked_by_me(p,g) p2m_locked_by_me(p) + ++static always_inline void gfn_lock_if(bool condition, struct p2m_domain *p2m, ++ gfn_t gfn, unsigned int order) ++{ ++ if ( condition ) ++ gfn_lock(p2m, gfn, order); ++ else ++ block_lock_speculation(); ++} ++ + /* PoD lock (per-p2m-table) + * + * Protects private PoD data structs: entry and cache +diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c +index b28c899b5e..1fa9e01012 100644 +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -292,9 +292,8 @@ mfn_t p2m_get_gfn_type_access(struct p2m_domain *p2m, gfn_t gfn, + if ( q & P2M_UNSHARE ) + q |= P2M_ALLOC; + +- if ( locked ) +- /* Grab the lock here, don't release until put_gfn */ +- gfn_lock(p2m, gfn, 0); ++ /* Grab the lock here, don't release until put_gfn */ ++ gfn_lock_if(locked, p2m, gfn, 0); + + mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL); + +diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h +index daf48fdea7..7e75d0e2e7 100644 +--- a/xen/include/xen/spinlock.h ++++ b/xen/include/xen/spinlock.h +@@ -216,6 +216,14 @@ static always_inline void spin_lock_irq(spinlock_t *l) + block_lock_speculation(); \ + }) + ++/* Conditionally take a spinlock in a speculation safe way. */ ++static always_inline void spin_lock_if(bool condition, spinlock_t *l) ++{ ++ if ( condition ) ++ _spin_lock(l); ++ block_lock_speculation(); ++} ++ + #define spin_unlock(l) _spin_unlock(l) + #define spin_unlock_irq(l) _spin_unlock_irq(l) + #define spin_unlock_irqrestore(l, f) _spin_unlock_irqrestore(l, f) +-- +2.44.0 + diff --git a/0054-tools-ipxe-update-for-fixing-build-with-GCC12.patch b/0054-tools-ipxe-update-for-fixing-build-with-GCC12.patch new file mode 100644 index 0000000..90efaf8 --- /dev/null +++ b/0054-tools-ipxe-update-for-fixing-build-with-GCC12.patch @@ -0,0 +1,33 @@ +From a01c0b0f9691a8350e74938329892f949669119e Mon Sep 17 00:00:00 2001 +From: Olaf Hering +Date: Wed, 27 Mar 2024 12:27:03 +0100 +Subject: [PATCH 54/67] tools: ipxe: update for fixing build with GCC12 + +Use a snapshot which includes commit +b0ded89e917b48b73097d3b8b88dfa3afb264ed0 ("[build] Disable dangling +pointer checking for GCC"), which fixes build with gcc12. 
+ +Signed-off-by: Olaf Hering +Acked-by: Andrew Cooper +master commit: 18a36b4a9b088875486cfe33a2d4a8ae7eb4ab47 +master date: 2023-04-25 23:47:45 +0100 +--- + tools/firmware/etherboot/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/firmware/etherboot/Makefile b/tools/firmware/etherboot/Makefile +index 4bc3633ba3..7a56fe8014 100644 +--- a/tools/firmware/etherboot/Makefile ++++ b/tools/firmware/etherboot/Makefile +@@ -11,7 +11,7 @@ IPXE_GIT_URL ?= git://git.ipxe.org/ipxe.git + endif + + # put an updated tar.gz on xenbits after changes to this variable +-IPXE_GIT_TAG := 3c040ad387099483102708bb1839110bc788cefb ++IPXE_GIT_TAG := 1d1cf74a5e58811822bee4b3da3cff7282fcdfca + + IPXE_TARBALL_URL ?= $(XEN_EXTFILES_URL)/ipxe-git-$(IPXE_GIT_TAG).tar.gz + +-- +2.44.0 + diff --git a/0055-x86-mm-use-block_lock_speculation-in-_mm_write_lock.patch b/0055-x86-mm-use-block_lock_speculation-in-_mm_write_lock.patch new file mode 100644 index 0000000..719234c --- /dev/null +++ b/0055-x86-mm-use-block_lock_speculation-in-_mm_write_lock.patch @@ -0,0 +1,35 @@ +From a153b8b42e9027ba3057bc7c8bf55e4d71e86ec3 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Wed, 27 Mar 2024 12:28:24 +0100 +Subject: [PATCH 55/67] x86/mm: use block_lock_speculation() in + _mm_write_lock() + +I can only guess that using block_speculation() there was a leftover +from, earlier on, SPECULATIVE_HARDEN_LOCK depending on +SPECULATIVE_HARDEN_BRANCH. + +Fixes: 197ecd838a2a ("locking: attempt to ensure lock wrappers are always inline") +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +master commit: 62018f08708a5ff6ef8fc8ff2aaaac46e5a60430 +master date: 2024-03-18 13:53:37 +0100 +--- + xen/arch/x86/mm/mm-locks.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h +index 3ea2d8eb03..7d6e4d2a7c 100644 +--- a/xen/arch/x86/mm/mm-locks.h ++++ b/xen/arch/x86/mm/mm-locks.h +@@ -150,7 +150,7 @@ static always_inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, + _set_lock_level(_lock_level(d, level)); + } + else +- block_speculation(); ++ block_lock_speculation(); + l->recurse_count++; + } + +-- +2.44.0 + diff --git a/0056-x86-boot-Fix-setup_apic_nmi_watchdog-to-fail-more-cl.patch b/0056-x86-boot-Fix-setup_apic_nmi_watchdog-to-fail-more-cl.patch new file mode 100644 index 0000000..5d549c1 --- /dev/null +++ b/0056-x86-boot-Fix-setup_apic_nmi_watchdog-to-fail-more-cl.patch @@ -0,0 +1,120 @@ +From 471b53c6a092940f3629990d9ca946aa22bd8535 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Wed, 27 Mar 2024 12:29:11 +0100 +Subject: [PATCH 56/67] x86/boot: Fix setup_apic_nmi_watchdog() to fail more + cleanly + +Right now, if the user requests the watchdog on the command line, +setup_apic_nmi_watchdog() will blindly assume that setting up the watchdog +worked. Reuse nmi_perfctr_msr to identify when the watchdog has been +configured. + +Rearrange setup_p6_watchdog() to not set nmi_perfctr_msr until the sanity +checks are complete. Turn setup_p4_watchdog() into a void function, matching +the others. + +If the watchdog isn't set up, inform the user and override to NMI_NONE, which +will prevent check_nmi_watchdog() from claiming that all CPUs are stuck. 
+ +e.g.: + + (XEN) alt table ffff82d040697c38 -> ffff82d0406a97f0 + (XEN) Failed to configure NMI watchdog + (XEN) Brought up 512 CPUs + (XEN) Scheduling granularity: cpu, 1 CPU per sched-resource + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: f658321374687c7339235e1ac643e0427acff717 +master date: 2024-03-19 18:29:37 +0000 +--- + xen/arch/x86/nmi.c | 25 ++++++++++++------------- + 1 file changed, 12 insertions(+), 13 deletions(-) + +diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c +index 7656023748..7c9591b65e 100644 +--- a/xen/arch/x86/nmi.c ++++ b/xen/arch/x86/nmi.c +@@ -323,8 +323,6 @@ static void setup_p6_watchdog(unsigned counter) + { + unsigned int evntsel; + +- nmi_perfctr_msr = MSR_P6_PERFCTR(0); +- + if ( !nmi_p6_event_width && current_cpu_data.cpuid_level >= 0xa ) + nmi_p6_event_width = MASK_EXTR(cpuid_eax(0xa), P6_EVENT_WIDTH_MASK); + if ( !nmi_p6_event_width ) +@@ -334,6 +332,8 @@ static void setup_p6_watchdog(unsigned counter) + nmi_p6_event_width > BITS_PER_LONG ) + return; + ++ nmi_perfctr_msr = MSR_P6_PERFCTR(0); ++ + clear_msr_range(MSR_P6_EVNTSEL(0), 2); + clear_msr_range(MSR_P6_PERFCTR(0), 2); + +@@ -349,13 +349,13 @@ static void setup_p6_watchdog(unsigned counter) + wrmsr(MSR_P6_EVNTSEL(0), evntsel, 0); + } + +-static int setup_p4_watchdog(void) ++static void setup_p4_watchdog(void) + { + uint64_t misc_enable; + + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (!(misc_enable & MSR_IA32_MISC_ENABLE_PERF_AVAIL)) +- return 0; ++ return; + + nmi_perfctr_msr = MSR_P4_IQ_PERFCTR0; + nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; +@@ -378,13 +378,12 @@ static int setup_p4_watchdog(void) + clear_msr_range(0x3E0, 2); + clear_msr_range(MSR_P4_BPU_CCCR0, 18); + clear_msr_range(MSR_P4_BPU_PERFCTR0, 18); +- ++ + wrmsrl(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0); + wrmsrl(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE); + write_watchdog_counter("P4_IQ_COUNTER0"); + apic_write(APIC_LVTPC, APIC_DM_NMI); + wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val); +- return 1; + } + + void setup_apic_nmi_watchdog(void) +@@ -399,8 +398,6 @@ void setup_apic_nmi_watchdog(void) + case 0xf ... 0x19: + setup_k7_watchdog(); + break; +- default: +- return; + } + break; + case X86_VENDOR_INTEL: +@@ -411,14 +408,16 @@ void setup_apic_nmi_watchdog(void) + : CORE_EVENT_CPU_CLOCKS_NOT_HALTED); + break; + case 15: +- if (!setup_p4_watchdog()) +- return; ++ setup_p4_watchdog(); + break; +- default: +- return; + } + break; +- default: ++ } ++ ++ if ( nmi_perfctr_msr == 0 ) ++ { ++ printk(XENLOG_WARNING "Failed to configure NMI watchdog\n"); ++ nmi_watchdog = NMI_NONE; + return; + } + +-- +2.44.0 + diff --git a/0057-x86-PoD-tie-together-P2M-update-and-increment-of-ent.patch b/0057-x86-PoD-tie-together-P2M-update-and-increment-of-ent.patch new file mode 100644 index 0000000..dedc1c2 --- /dev/null +++ b/0057-x86-PoD-tie-together-P2M-update-and-increment-of-ent.patch @@ -0,0 +1,61 @@ +From bfb69205376d94ff91b09a337c47fb665ee12da3 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Wed, 27 Mar 2024 12:29:33 +0100 +Subject: [PATCH 57/67] x86/PoD: tie together P2M update and increment of entry + count + +When not holding the PoD lock across the entire region covering P2M +update and stats update, the entry count - if to be incorrect at all - +should indicate too large a value in preference to a too small one, to +avoid functions bailing early when they find the count is zero. However, +instead of moving the increment ahead (and adjust back upon failure), +extend the PoD-locked region. 
+ +Fixes: 99af3cd40b6e ("x86/mm: Rework locking in the PoD layer") +Signed-off-by: Jan Beulich +Reviewed-by: George Dunlap +master commit: cc950c49ae6a6690f7fc3041a1f43122c250d250 +master date: 2024-03-21 09:48:10 +0100 +--- + xen/arch/x86/mm/p2m-pod.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c +index 99dbcb3101..e903db9d93 100644 +--- a/xen/arch/x86/mm/p2m-pod.c ++++ b/xen/arch/x86/mm/p2m-pod.c +@@ -1370,19 +1370,28 @@ mark_populate_on_demand(struct domain *d, unsigned long gfn_l, + } + } + ++ /* ++ * P2M update and stats increment need to collectively be under PoD lock, ++ * to prevent code elsewhere observing PoD entry count being zero despite ++ * there actually still being PoD entries (created by the p2m_set_entry() ++ * invocation below). ++ */ ++ pod_lock(p2m); ++ + /* Now, actually do the two-way mapping */ + rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order, + p2m_populate_on_demand, p2m->default_access); + if ( rc == 0 ) + { +- pod_lock(p2m); + p2m->pod.entry_count += 1UL << order; + p2m->pod.entry_count -= pod_count; + BUG_ON(p2m->pod.entry_count < 0); +- pod_unlock(p2m); ++ } ++ ++ pod_unlock(p2m); + ++ if ( rc == 0 ) + ioreq_request_mapcache_invalidate(d); +- } + else if ( order ) + { + /* +-- +2.44.0 + diff --git a/0058-tools-oxenstored-Use-Map-instead-of-Hashtbl-for-quot.patch b/0058-tools-oxenstored-Use-Map-instead-of-Hashtbl-for-quot.patch new file mode 100644 index 0000000..dfc7f5a --- /dev/null +++ b/0058-tools-oxenstored-Use-Map-instead-of-Hashtbl-for-quot.patch @@ -0,0 +1,143 @@ +From 7abd305607938b846da1a37dd1bda7bf7d47dba5 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= +Date: Wed, 31 Jan 2024 10:52:55 +0000 +Subject: [PATCH 58/67] tools/oxenstored: Use Map instead of Hashtbl for quotas +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +On a stress test running 1000 VMs flamegraphs have shown that +`oxenstored` spends a large amount of time in `Hashtbl.copy` and the GC. + +Hashtable complexity: + * read/write: O(1) average + * copy: O(domains) -- copying the entire table + +Map complexity: + * read/write: O(log n) worst case + * copy: O(1) -- a word copy + +We always perform at least one 'copy' when processing each xenstore +packet (regardless whether it is a readonly operation or inside a +transaction or not), so the actual complexity per packet is: + * Hashtbl: O(domains) + * Map: O(log domains) + +Maps are the clear winner, and a better fit for the immutable xenstore +tree. 
+ +Signed-off-by: Edwin Török +Acked-by: Christian Lindig +(cherry picked from commit b6cf604207fd0a04451a48f2ce6d05fb66c612ab) +--- + tools/ocaml/xenstored/quota.ml | 65 ++++++++++++++++++---------------- + 1 file changed, 34 insertions(+), 31 deletions(-) + +diff --git a/tools/ocaml/xenstored/quota.ml b/tools/ocaml/xenstored/quota.ml +index 6e3d6401ae..ee8dd22581 100644 +--- a/tools/ocaml/xenstored/quota.ml ++++ b/tools/ocaml/xenstored/quota.ml +@@ -23,66 +23,69 @@ let activate = ref true + let maxent = ref (1000) + let maxsize = ref (2048) + ++module Domid = struct ++ type t = Xenctrl.domid ++ let compare (a:t) (b:t) = compare a b ++end ++ ++module DomidMap = Map.Make(Domid) ++ + type t = { + maxent: int; (* max entities per domU *) + maxsize: int; (* max size of data store in one node *) +- cur: (Xenctrl.domid, int) Hashtbl.t; (* current domains quota *) ++ mutable cur: int DomidMap.t; (* current domains quota *) + } + + let to_string quota domid = +- if Hashtbl.mem quota.cur domid +- then Printf.sprintf "dom%i quota: %i/%i" domid (Hashtbl.find quota.cur domid) quota.maxent +- else Printf.sprintf "dom%i quota: not set" domid ++ try ++ Printf.sprintf "dom%i quota: %i/%i" domid (DomidMap.find domid quota.cur) quota.maxent ++ with Not_found -> ++ Printf.sprintf "dom%i quota: not set" domid + + let create () = +- { maxent = !maxent; maxsize = !maxsize; cur = Hashtbl.create 100; } ++ { maxent = !maxent; maxsize = !maxsize; cur = DomidMap.empty; } + +-let copy quota = { quota with cur = (Hashtbl.copy quota.cur) } ++let copy quota = { quota with cur = quota.cur } + +-let del quota id = Hashtbl.remove quota.cur id ++let del quota id = { quota with cur = DomidMap.remove id quota.cur } + + let _check quota id size = + if size > quota.maxsize then ( + warn "domain %u err create entry: data too big %d" id size; + raise Data_too_big + ); +- if id > 0 && Hashtbl.mem quota.cur id then +- let entry = Hashtbl.find quota.cur id in ++ if id > 0 then ++ try ++ let entry = DomidMap.find id quota.cur in + if entry >= quota.maxent then ( + warn "domain %u cannot create entry: quota reached" id; + raise Limit_reached + ) ++ with Not_found -> () + + let check quota id size = + if !activate then + _check quota id size + +-let get_entry quota id = Hashtbl.find quota.cur id ++let find_or_zero quota_cur id = ++ try DomidMap.find id quota_cur with Not_found -> 0 + +-let set_entry quota id nb = +- if nb = 0 +- then Hashtbl.remove quota.cur id +- else begin +- if Hashtbl.mem quota.cur id then +- Hashtbl.replace quota.cur id nb +- else +- Hashtbl.add quota.cur id nb +- end ++let update_entry quota_cur id diff = ++ let nb = diff + find_or_zero quota_cur id in ++ if nb = 0 then DomidMap.remove id quota_cur ++ else DomidMap.add id nb quota_cur + + let del_entry quota id = +- try +- let nb = get_entry quota id in +- set_entry quota id (nb - 1) +- with Not_found -> () ++ quota.cur <- update_entry quota.cur id (-1) + + let add_entry quota id = +- let nb = try get_entry quota id with Not_found -> 0 in +- set_entry quota id (nb + 1) +- +-let add quota diff = +- Hashtbl.iter (fun id nb -> set_entry quota id (get_entry quota id + nb)) diff.cur ++ quota.cur <- update_entry quota.cur id (+1) + + let merge orig_quota mod_quota dest_quota = +- Hashtbl.iter (fun id nb -> let diff = nb - (try get_entry orig_quota id with Not_found -> 0) in +- if diff <> 0 then +- set_entry dest_quota id ((try get_entry dest_quota id with Not_found -> 0) + diff)) mod_quota.cur ++ let fold_merge id nb dest = ++ match nb - find_or_zero orig_quota.cur id 
with ++ | 0 -> dest (* not modified *) ++ | diff -> update_entry dest id diff (* update with [x=x+diff] *) ++ in ++ dest_quota.cur <- DomidMap.fold fold_merge mod_quota.cur dest_quota.cur ++ (* dest_quota = dest_quota + (mod_quota - orig_quota) *) +-- +2.44.0 + diff --git a/0059-tools-oxenstored-Make-Quota.t-pure.patch b/0059-tools-oxenstored-Make-Quota.t-pure.patch new file mode 100644 index 0000000..7616b90 --- /dev/null +++ b/0059-tools-oxenstored-Make-Quota.t-pure.patch @@ -0,0 +1,121 @@ +From f38a815a54000ca51ff5165b2863d60b6bbea49c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= +Date: Wed, 31 Jan 2024 10:52:56 +0000 +Subject: [PATCH 59/67] tools/oxenstored: Make Quota.t pure +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Now that we no longer have a hashtable inside we can make Quota.t pure, and +push the mutable update to its callers. Store.t already had a mutable Quota.t +field. + +No functional change. + +Signed-off-by: Edwin Török +Acked-by: Christian Lindig +(cherry picked from commit 098d868e52ac0165b7f36e22b767ea70cef70054) +--- + tools/ocaml/xenstored/quota.ml | 8 ++++---- + tools/ocaml/xenstored/store.ml | 17 ++++++++++------- + 2 files changed, 14 insertions(+), 11 deletions(-) + +diff --git a/tools/ocaml/xenstored/quota.ml b/tools/ocaml/xenstored/quota.ml +index ee8dd22581..b3ab678c72 100644 +--- a/tools/ocaml/xenstored/quota.ml ++++ b/tools/ocaml/xenstored/quota.ml +@@ -33,7 +33,7 @@ module DomidMap = Map.Make(Domid) + type t = { + maxent: int; (* max entities per domU *) + maxsize: int; (* max size of data store in one node *) +- mutable cur: int DomidMap.t; (* current domains quota *) ++ cur: int DomidMap.t; (* current domains quota *) + } + + let to_string quota domid = +@@ -76,10 +76,10 @@ let update_entry quota_cur id diff = + else DomidMap.add id nb quota_cur + + let del_entry quota id = +- quota.cur <- update_entry quota.cur id (-1) ++ {quota with cur = update_entry quota.cur id (-1)} + + let add_entry quota id = +- quota.cur <- update_entry quota.cur id (+1) ++ {quota with cur = update_entry quota.cur id (+1)} + + let merge orig_quota mod_quota dest_quota = + let fold_merge id nb dest = +@@ -87,5 +87,5 @@ let merge orig_quota mod_quota dest_quota = + | 0 -> dest (* not modified *) + | diff -> update_entry dest id diff (* update with [x=x+diff] *) + in +- dest_quota.cur <- DomidMap.fold fold_merge mod_quota.cur dest_quota.cur ++ {dest_quota with cur = DomidMap.fold fold_merge mod_quota.cur dest_quota.cur} + (* dest_quota = dest_quota + (mod_quota - orig_quota) *) +diff --git a/tools/ocaml/xenstored/store.ml b/tools/ocaml/xenstored/store.ml +index c94dbf3a62..5dd965db15 100644 +--- a/tools/ocaml/xenstored/store.ml ++++ b/tools/ocaml/xenstored/store.ml +@@ -85,7 +85,9 @@ let check_owner node connection = + raise Define.Permission_denied; + end + +-let rec recurse fct node = fct node; SymbolMap.iter (fun _ -> recurse fct) node.children ++let rec recurse fct node acc = ++ let acc = fct node acc in ++ SymbolMap.fold (fun _ -> recurse fct) node.children acc + + (** [recurse_filter_map f tree] applies [f] on each node in the tree recursively, + possibly removing some nodes. 
+@@ -408,7 +410,7 @@ let dump_buffer store = dump_store_buf store.root + let set_node store path node orig_quota mod_quota = + let root = Path.set_node store.root path node in + store.root <- root; +- Quota.merge orig_quota mod_quota store.quota ++ store.quota <- Quota.merge orig_quota mod_quota store.quota + + let write store perm path value = + let node, existing = get_deepest_existing_node store path in +@@ -422,7 +424,7 @@ let write store perm path value = + let root, node_created = path_write store perm path value in + store.root <- root; + if node_created +- then Quota.add_entry store.quota owner ++ then store.quota <- Quota.add_entry store.quota owner + + let mkdir store perm path = + let node, existing = get_deepest_existing_node store path in +@@ -431,7 +433,7 @@ let mkdir store perm path = + if not (existing || (Perms.Connection.is_dom0 perm)) then Quota.check store.quota owner 0; + store.root <- path_mkdir store perm path; + if not existing then +- Quota.add_entry store.quota owner ++ store.quota <- Quota.add_entry store.quota owner + + let rm store perm path = + let rmed_node = Path.get_node store.root path in +@@ -439,7 +441,7 @@ let rm store perm path = + | None -> raise Define.Doesnt_exist + | Some rmed_node -> + store.root <- path_rm store perm path; +- Node.recurse (fun node -> Quota.del_entry store.quota (Node.get_owner node)) rmed_node ++ store.quota <- Node.recurse (fun node quota -> Quota.del_entry quota (Node.get_owner node)) rmed_node store.quota + + let setperms store perm path nperms = + match Path.get_node store.root path with +@@ -450,8 +452,9 @@ let setperms store perm path nperms = + if not ((old_owner = new_owner) || (Perms.Connection.is_dom0 perm)) then + raise Define.Permission_denied; + store.root <- path_setperms store perm path nperms; +- Quota.del_entry store.quota old_owner; +- Quota.add_entry store.quota new_owner ++ store.quota <- ++ let quota = Quota.del_entry store.quota old_owner in ++ Quota.add_entry quota new_owner + + let reset_permissions store domid = + Logging.info "store|node" "Cleaning up xenstore ACLs for domid %d" domid; +-- +2.44.0 + diff --git a/0060-x86-cpu-policy-Hide-x2APIC-from-PV-guests.patch b/0060-x86-cpu-policy-Hide-x2APIC-from-PV-guests.patch new file mode 100644 index 0000000..ce2b89d --- /dev/null +++ b/0060-x86-cpu-policy-Hide-x2APIC-from-PV-guests.patch @@ -0,0 +1,90 @@ +From bb27e11c56963e170d1f6d2fbddbc956f7164121 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 2 Apr 2024 16:17:25 +0200 +Subject: [PATCH 60/67] x86/cpu-policy: Hide x2APIC from PV guests +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +PV guests can't write to MSR_APIC_BASE (in order to set EXTD), nor can they +access any of the x2APIC MSR range. Therefore they mustn't see the x2APIC +CPUID bit saying that they can. + +Right now, the host x2APIC flag filters into PV guests, meaning that PV guests +generally see x2APIC except on Zen1-and-older AMD systems. + +Linux works around this by explicitly hiding the bit itself, and filtering +EXTD out of MSR_APIC_BASE reads. NetBSD behaves more in the spirit of PV +guests, and entirely ignores the APIC when built as a PV guest. + +Change the annotation from !A to !S. This has a consequence of stripping it +out of both PV featuremasks. However, as existing guests may have seen the +bit, set it back into the PV Max policy; a VM which saw the bit and is alive +enough to migrate will have ignored it one way or another. 
+ +Hiding x2APIC does change the contents of leaf 0xb, but as the information is +nonsense to begin with, this is likely an improvement on the status quo. + +Xen's blind assumption that APIC_ID = vCPU_ID * 2 isn't interlinked with the +host's topology structure, where a PV guest may see real host values, and the +APIC_IDs are useless without an MADT to start with. Dom0 is the only PV VM to +get an MADT but it's the host one, meaning the two sets of APIC_IDs are from +different address spaces. + +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +master commit: 5420aa165dfa5fe95dd84bb71cb96c15459935b1 +master date: 2024-03-01 20:14:19 +0000 +--- + xen/arch/x86/cpu-policy.c | 11 +++++++++-- + xen/include/public/arch-x86/cpufeatureset.h | 2 +- + 2 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c +index 96c2cee1a8..ed64d56294 100644 +--- a/xen/arch/x86/cpu-policy.c ++++ b/xen/arch/x86/cpu-policy.c +@@ -559,6 +559,14 @@ static void __init calculate_pv_max_policy(void) + for ( i = 0; i < ARRAY_SIZE(fs); ++i ) + fs[i] &= pv_max_featuremask[i]; + ++ /* ++ * Xen at the time of writing (Feb 2024, 4.19 dev cycle) used to leak the ++ * host x2APIC capability into PV guests, but never supported the guest ++ * trying to turn x2APIC mode on. Tolerate an incoming VM which saw the ++ * x2APIC CPUID bit and is alive enough to migrate. ++ */ ++ __set_bit(X86_FEATURE_X2APIC, fs); ++ + /* + * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests (functional + * availability, or admin choice), hide the feature. +@@ -837,11 +845,10 @@ void recalculate_cpuid_policy(struct domain *d) + } + + /* +- * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY. These bits ++ * Allow the toolstack to set HTT and CMP_LEGACY. These bits + * affect how to interpret topology information in other cpuid leaves. 
+ */ + __set_bit(X86_FEATURE_HTT, max_fs); +- __set_bit(X86_FEATURE_X2APIC, max_fs); + __set_bit(X86_FEATURE_CMP_LEGACY, max_fs); + + /* +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index 113e6cadc1..bc971f3c6f 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -123,7 +123,7 @@ XEN_CPUFEATURE(PCID, 1*32+17) /*H Process Context ID */ + XEN_CPUFEATURE(DCA, 1*32+18) /* Direct Cache Access */ + XEN_CPUFEATURE(SSE4_1, 1*32+19) /*A Streaming SIMD Extensions 4.1 */ + XEN_CPUFEATURE(SSE4_2, 1*32+20) /*A Streaming SIMD Extensions 4.2 */ +-XEN_CPUFEATURE(X2APIC, 1*32+21) /*!A Extended xAPIC */ ++XEN_CPUFEATURE(X2APIC, 1*32+21) /*!S Extended xAPIC */ + XEN_CPUFEATURE(MOVBE, 1*32+22) /*A movbe instruction */ + XEN_CPUFEATURE(POPCNT, 1*32+23) /*A POPCNT instruction */ + XEN_CPUFEATURE(TSC_DEADLINE, 1*32+24) /*S TSC Deadline Timer */ +-- +2.44.0 + diff --git a/0061-x86-cpu-policy-Fix-visibility-of-HTT-CMP_LEGACY-in-m.patch b/0061-x86-cpu-policy-Fix-visibility-of-HTT-CMP_LEGACY-in-m.patch new file mode 100644 index 0000000..d1b8786 --- /dev/null +++ b/0061-x86-cpu-policy-Fix-visibility-of-HTT-CMP_LEGACY-in-m.patch @@ -0,0 +1,85 @@ +From 70ad9c5fdeac4814050080c87e06d44292ecf868 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 2 Apr 2024 16:18:05 +0200 +Subject: [PATCH 61/67] x86/cpu-policy: Fix visibility of HTT/CMP_LEGACY in max + policies +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The block in recalculate_cpuid_policy() predates the proper split between +default and max policies, and was a "slightly max for a toolstack which knows +about it" capability. It didn't get transformed properly in Xen 4.14. + +Because Xen will accept a VM with HTT/CMP_LEGACY seen, they should be visible +in the max polices. Keep the default policy matching host settings. + +This manifested as an incorrectly-rejected migration across XenServer's Xen +4.13 -> 4.17 upgrade, as Xapi is slowly growing the logic to check a VM +against the target max policy. + +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +master commit: e2d8a652251660c3252d92b442e1a9c5d6e6a1e9 +master date: 2024-03-01 20:14:19 +0000 +--- + xen/arch/x86/cpu-policy.c | 29 ++++++++++++++++++++++------- + 1 file changed, 22 insertions(+), 7 deletions(-) + +diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c +index ed64d56294..24acd12ce2 100644 +--- a/xen/arch/x86/cpu-policy.c ++++ b/xen/arch/x86/cpu-policy.c +@@ -458,6 +458,16 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs) + raw_cpu_policy.feat.clwb ) + __set_bit(X86_FEATURE_CLWB, fs); + } ++ ++ /* ++ * Topology information inside the guest is entirely at the toolstack's ++ * discretion, and bears no relationship to the host we're running on. ++ * ++ * HTT identifies p->basic.lppp as valid ++ * CMP_LEGACY identifies p->extd.nc as valid ++ */ ++ __set_bit(X86_FEATURE_HTT, fs); ++ __set_bit(X86_FEATURE_CMP_LEGACY, fs); + } + + static void __init guest_common_default_feature_adjustments(uint32_t *fs) +@@ -512,6 +522,18 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs) + __clear_bit(X86_FEATURE_CLWB, fs); + } + ++ /* ++ * Topology information is at the toolstack's discretion so these are ++ * unconditionally set in max, but pick a default which matches the host. 
++ */ ++ __clear_bit(X86_FEATURE_HTT, fs); ++ if ( cpu_has_htt ) ++ __set_bit(X86_FEATURE_HTT, fs); ++ ++ __clear_bit(X86_FEATURE_CMP_LEGACY, fs); ++ if ( cpu_has_cmp_legacy ) ++ __set_bit(X86_FEATURE_CMP_LEGACY, fs); ++ + /* + * On certain hardware, speculative or errata workarounds can result in + * TSX being placed in "force-abort" mode, where it doesn't actually +@@ -844,13 +866,6 @@ void recalculate_cpuid_policy(struct domain *d) + } + } + +- /* +- * Allow the toolstack to set HTT and CMP_LEGACY. These bits +- * affect how to interpret topology information in other cpuid leaves. +- */ +- __set_bit(X86_FEATURE_HTT, max_fs); +- __set_bit(X86_FEATURE_CMP_LEGACY, max_fs); +- + /* + * 32bit PV domains can't use any Long Mode features, and cannot use + * SYSCALL on non-AMD hardware. +-- +2.44.0 + diff --git a/0062-xen-virtual-region-Rename-the-start-end-fields.patch b/0062-xen-virtual-region-Rename-the-start-end-fields.patch new file mode 100644 index 0000000..9dbd5c9 --- /dev/null +++ b/0062-xen-virtual-region-Rename-the-start-end-fields.patch @@ -0,0 +1,140 @@ +From 2392e958ec6fd2e48e011781344cf94dee6d6142 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 2 Apr 2024 16:18:51 +0200 +Subject: [PATCH 62/67] xen/virtual-region: Rename the start/end fields +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +... to text_{start,end}. We're about to introduce another start/end pair. + +Despite it's name, struct virtual_region has always been a module-ish +description. Call this out specifically. + +As minor cleanup, replace ROUNDUP(x, PAGE_SIZE) with the more concise +PAGE_ALIGN() ahead of duplicating the example. + +No functional change. + +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: 989556c6f8ca080f5f202417af97d1188b9ba52a +master date: 2024-03-07 14:24:42 +0000 +--- + xen/common/livepatch.c | 9 +++++---- + xen/common/virtual_region.c | 19 ++++++++++--------- + xen/include/xen/virtual_region.h | 11 +++++++++-- + 3 files changed, 24 insertions(+), 15 deletions(-) + +diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c +index a5068a2217..29395f286f 100644 +--- a/xen/common/livepatch.c ++++ b/xen/common/livepatch.c +@@ -785,8 +785,8 @@ static int prepare_payload(struct payload *payload, + region = &payload->region; + + region->symbols_lookup = livepatch_symbols_lookup; +- region->start = payload->text_addr; +- region->end = payload->text_addr + payload->text_size; ++ region->text_start = payload->text_addr; ++ region->text_end = payload->text_addr + payload->text_size; + + /* Optional sections. 
*/ + for ( i = 0; i < BUGFRAME_NR; i++ ) +@@ -823,8 +823,9 @@ static int prepare_payload(struct payload *payload, + const void *instr = ALT_ORIG_PTR(a); + const void *replacement = ALT_REPL_PTR(a); + +- if ( (instr < region->start && instr >= region->end) || +- (replacement < region->start && replacement >= region->end) ) ++ if ( (instr < region->text_start && instr >= region->text_end) || ++ (replacement < region->text_start && ++ replacement >= region->text_end) ) + { + printk(XENLOG_ERR LIVEPATCH "%s Alt patching outside payload: %p\n", + elf->name, instr); +diff --git a/xen/common/virtual_region.c b/xen/common/virtual_region.c +index 9f12c30efe..b22ffb75c4 100644 +--- a/xen/common/virtual_region.c ++++ b/xen/common/virtual_region.c +@@ -11,15 +11,15 @@ + + static struct virtual_region core = { + .list = LIST_HEAD_INIT(core.list), +- .start = _stext, +- .end = _etext, ++ .text_start = _stext, ++ .text_end = _etext, + }; + + /* Becomes irrelevant when __init sections are cleared. */ + static struct virtual_region core_init __initdata = { + .list = LIST_HEAD_INIT(core_init.list), +- .start = _sinittext, +- .end = _einittext, ++ .text_start = _sinittext, ++ .text_end = _einittext, + }; + + /* +@@ -39,7 +39,8 @@ const struct virtual_region *find_text_region(unsigned long addr) + rcu_read_lock(&rcu_virtual_region_lock); + list_for_each_entry_rcu( region, &virtual_region_list, list ) + { +- if ( (void *)addr >= region->start && (void *)addr < region->end ) ++ if ( (void *)addr >= region->text_start && ++ (void *)addr < region->text_end ) + { + rcu_read_unlock(&rcu_virtual_region_lock); + return region; +@@ -88,8 +89,8 @@ void relax_virtual_region_perms(void) + + rcu_read_lock(&rcu_virtual_region_lock); + list_for_each_entry_rcu( region, &virtual_region_list, list ) +- modify_xen_mappings_lite((unsigned long)region->start, +- ROUNDUP((unsigned long)region->end, PAGE_SIZE), ++ modify_xen_mappings_lite((unsigned long)region->text_start, ++ PAGE_ALIGN((unsigned long)region->text_end), + PAGE_HYPERVISOR_RWX); + rcu_read_unlock(&rcu_virtual_region_lock); + } +@@ -100,8 +101,8 @@ void tighten_virtual_region_perms(void) + + rcu_read_lock(&rcu_virtual_region_lock); + list_for_each_entry_rcu( region, &virtual_region_list, list ) +- modify_xen_mappings_lite((unsigned long)region->start, +- ROUNDUP((unsigned long)region->end, PAGE_SIZE), ++ modify_xen_mappings_lite((unsigned long)region->text_start, ++ PAGE_ALIGN((unsigned long)region->text_end), + PAGE_HYPERVISOR_RX); + rcu_read_unlock(&rcu_virtual_region_lock); + } +diff --git a/xen/include/xen/virtual_region.h b/xen/include/xen/virtual_region.h +index d053620711..442a45bf1f 100644 +--- a/xen/include/xen/virtual_region.h ++++ b/xen/include/xen/virtual_region.h +@@ -9,11 +9,18 @@ + #include + #include + ++/* ++ * Despite it's name, this is a module(ish) description. ++ * ++ * There's one region for the runtime .text/etc, one region for .init during ++ * boot only, and one region per livepatch. ++ */ + struct virtual_region + { + struct list_head list; +- const void *start; /* Virtual address start. */ +- const void *end; /* Virtual address end. */ ++ ++ const void *text_start; /* .text virtual address start. */ ++ const void *text_end; /* .text virtual address end. */ + + /* If this is NULL the default lookup mechanism is used. 
*/ + symbols_lookup_t *symbols_lookup; +-- +2.44.0 + diff --git a/0063-xen-virtual-region-Include-rodata-pointers.patch b/0063-xen-virtual-region-Include-rodata-pointers.patch new file mode 100644 index 0000000..9f51d4d --- /dev/null +++ b/0063-xen-virtual-region-Include-rodata-pointers.patch @@ -0,0 +1,71 @@ +From 335cbb55567b20df8e8bd2d1b340609e272ddab6 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 2 Apr 2024 16:19:11 +0200 +Subject: [PATCH 63/67] xen/virtual-region: Include rodata pointers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +These are optional. .init doesn't distinguish types of data like this, and +livepatches don't necesserily have any .rodata either. + +No functional change. + +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: ef969144a425e39f5b214a875b5713d0ea8575fb +master date: 2024-03-07 14:24:42 +0000 +--- + xen/common/livepatch.c | 6 ++++++ + xen/common/virtual_region.c | 2 ++ + xen/include/xen/virtual_region.h | 3 +++ + 3 files changed, 11 insertions(+) + +diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c +index 29395f286f..28c09ddf58 100644 +--- a/xen/common/livepatch.c ++++ b/xen/common/livepatch.c +@@ -788,6 +788,12 @@ static int prepare_payload(struct payload *payload, + region->text_start = payload->text_addr; + region->text_end = payload->text_addr + payload->text_size; + ++ if ( payload->ro_size ) ++ { ++ region->rodata_start = payload->ro_addr; ++ region->rodata_end = payload->ro_addr + payload->ro_size; ++ } ++ + /* Optional sections. */ + for ( i = 0; i < BUGFRAME_NR; i++ ) + { +diff --git a/xen/common/virtual_region.c b/xen/common/virtual_region.c +index b22ffb75c4..9c566f8ec9 100644 +--- a/xen/common/virtual_region.c ++++ b/xen/common/virtual_region.c +@@ -13,6 +13,8 @@ static struct virtual_region core = { + .list = LIST_HEAD_INIT(core.list), + .text_start = _stext, + .text_end = _etext, ++ .rodata_start = _srodata, ++ .rodata_end = _erodata, + }; + + /* Becomes irrelevant when __init sections are cleared. */ +diff --git a/xen/include/xen/virtual_region.h b/xen/include/xen/virtual_region.h +index 442a45bf1f..dcdc95ba49 100644 +--- a/xen/include/xen/virtual_region.h ++++ b/xen/include/xen/virtual_region.h +@@ -22,6 +22,9 @@ struct virtual_region + const void *text_start; /* .text virtual address start. */ + const void *text_end; /* .text virtual address end. */ + ++ const void *rodata_start; /* .rodata virtual address start (optional). */ ++ const void *rodata_end; /* .rodata virtual address end. */ ++ + /* If this is NULL the default lookup mechanism is used. */ + symbols_lookup_t *symbols_lookup; + +-- +2.44.0 + diff --git a/0064-x86-livepatch-Relax-permissions-on-rodata-too.patch b/0064-x86-livepatch-Relax-permissions-on-rodata-too.patch new file mode 100644 index 0000000..bc80769 --- /dev/null +++ b/0064-x86-livepatch-Relax-permissions-on-rodata-too.patch @@ -0,0 +1,85 @@ +From c3ff11b11c21777a9b1c616607705f3a7340b391 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 2 Apr 2024 16:19:36 +0200 +Subject: [PATCH 64/67] x86/livepatch: Relax permissions on rodata too +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This reinstates the capability to patch .rodata in load/unload hooks, which +was lost when we stopped using CR0.WP=0 to patch. + +This turns out to be rather less of a large TODO than I thought at the time. 
+ +Fixes: 8676092a0f16 ("x86/livepatch: Fix livepatch application when CET is active") +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: b083b1c393dc8961acf0959b1d2e0ad459985ae3 +master date: 2024-03-07 14:24:42 +0000 +--- + xen/arch/x86/livepatch.c | 4 ++-- + xen/common/virtual_region.c | 12 ++++++++++++ + 2 files changed, 14 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/livepatch.c b/xen/arch/x86/livepatch.c +index ee539f001b..4f76127e1f 100644 +--- a/xen/arch/x86/livepatch.c ++++ b/xen/arch/x86/livepatch.c +@@ -62,7 +62,7 @@ int arch_livepatch_safety_check(void) + int noinline arch_livepatch_quiesce(void) + { + /* +- * Relax perms on .text to be RWX, so we can modify them. ++ * Relax perms on .text/.rodata, so we can modify them. + * + * This relaxes perms globally, but all other CPUs are waiting on us. + */ +@@ -75,7 +75,7 @@ int noinline arch_livepatch_quiesce(void) + void noinline arch_livepatch_revive(void) + { + /* +- * Reinstate perms on .text to be RX. This also cleans out the dirty ++ * Reinstate perms on .text/.rodata. This also cleans out the dirty + * bits, which matters when CET Shstk is active. + * + * The other CPUs waiting for us could in principle have re-walked while +diff --git a/xen/common/virtual_region.c b/xen/common/virtual_region.c +index 9c566f8ec9..aefc08e75f 100644 +--- a/xen/common/virtual_region.c ++++ b/xen/common/virtual_region.c +@@ -91,9 +91,15 @@ void relax_virtual_region_perms(void) + + rcu_read_lock(&rcu_virtual_region_lock); + list_for_each_entry_rcu( region, &virtual_region_list, list ) ++ { + modify_xen_mappings_lite((unsigned long)region->text_start, + PAGE_ALIGN((unsigned long)region->text_end), + PAGE_HYPERVISOR_RWX); ++ if ( region->rodata_start ) ++ modify_xen_mappings_lite((unsigned long)region->rodata_start, ++ PAGE_ALIGN((unsigned long)region->rodata_end), ++ PAGE_HYPERVISOR_RW); ++ } + rcu_read_unlock(&rcu_virtual_region_lock); + } + +@@ -103,9 +109,15 @@ void tighten_virtual_region_perms(void) + + rcu_read_lock(&rcu_virtual_region_lock); + list_for_each_entry_rcu( region, &virtual_region_list, list ) ++ { + modify_xen_mappings_lite((unsigned long)region->text_start, + PAGE_ALIGN((unsigned long)region->text_end), + PAGE_HYPERVISOR_RX); ++ if ( region->rodata_start ) ++ modify_xen_mappings_lite((unsigned long)region->rodata_start, ++ PAGE_ALIGN((unsigned long)region->rodata_end), ++ PAGE_HYPERVISOR_RO); ++ } + rcu_read_unlock(&rcu_virtual_region_lock); + } + #endif /* CONFIG_X86 */ +-- +2.44.0 + diff --git a/0065-x86-boot-Improve-the-boot-watchdog-determination-of-.patch b/0065-x86-boot-Improve-the-boot-watchdog-determination-of-.patch new file mode 100644 index 0000000..4a46326 --- /dev/null +++ b/0065-x86-boot-Improve-the-boot-watchdog-determination-of-.patch @@ -0,0 +1,106 @@ +From 846fb984b506135917c2862d2e4607005d6afdeb Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 2 Apr 2024 16:20:09 +0200 +Subject: [PATCH 65/67] x86/boot: Improve the boot watchdog determination of + stuck cpus +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Right now, check_nmi_watchdog() has two processing loops over all online CPUs +using prev_nmi_count as storage. + +Use a cpumask_t instead (1/32th as much initdata) and have wait_for_nmis() +make the determination of whether it is stuck, rather than having both +functions needing to agree on how many ticks mean stuck. 
+ +More importantly though, it means we can use the standard cpumask +infrastructure, including turning this: + + (XEN) Brought up 512 CPUs + (XEN) Testing NMI watchdog on all CPUs: {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511} stuck + +into the rather more manageable: + + (XEN) Brought up 512 CPUs + (XEN) Testing NMI watchdog on all CPUs: {0-511} stuck + +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +master commit: 9e18f339830c828798aef465556d4029d83476a0 +master date: 2024-03-19 18:29:37 +0000 +--- + xen/arch/x86/nmi.c | 33 ++++++++++++++------------------- + 1 file changed, 14 insertions(+), 19 deletions(-) + +diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c +index 7c9591b65e..dd31034ac8 100644 +--- a/xen/arch/x86/nmi.c ++++ b/xen/arch/x86/nmi.c +@@ -150,6 +150,8 @@ int nmi_active; + + static void __init cf_check wait_for_nmis(void *p) + { ++ cpumask_t *stuck_cpus = p; ++ unsigned int cpu = smp_processor_id(); + unsigned int start_count = this_cpu(nmi_count); + unsigned long ticks = 10 * 1000 * cpu_khz / nmi_hz; + unsigned long s, e; +@@ -158,42 +160,35 @@ static void __init cf_check wait_for_nmis(void *p) + do { + cpu_relax(); + if ( this_cpu(nmi_count) >= start_count + 2 ) +- break; ++ return; ++ + e = rdtsc(); +- } while( e - s < ticks ); ++ } while ( e - s < ticks ); ++ ++ /* Timeout. Mark ourselves as stuck. 
*/ ++ cpumask_set_cpu(cpu, stuck_cpus); + } + + void __init check_nmi_watchdog(void) + { +- static unsigned int __initdata prev_nmi_count[NR_CPUS]; +- int cpu; +- bool ok = true; ++ static cpumask_t __initdata stuck_cpus; + + if ( nmi_watchdog == NMI_NONE ) + return; + + printk("Testing NMI watchdog on all CPUs:"); + +- for_each_online_cpu ( cpu ) +- prev_nmi_count[cpu] = per_cpu(nmi_count, cpu); +- + /* + * Wait at most 10 ticks for 2 watchdog NMIs on each CPU. + * Busy-wait on all CPUs: the LAPIC counter that the NMI watchdog + * uses only runs while the core's not halted + */ +- on_selected_cpus(&cpu_online_map, wait_for_nmis, NULL, 1); +- +- for_each_online_cpu ( cpu ) +- { +- if ( per_cpu(nmi_count, cpu) - prev_nmi_count[cpu] < 2 ) +- { +- printk(" %d", cpu); +- ok = false; +- } +- } ++ on_selected_cpus(&cpu_online_map, wait_for_nmis, &stuck_cpus, 1); + +- printk(" %s\n", ok ? "ok" : "stuck"); ++ if ( cpumask_empty(&stuck_cpus) ) ++ printk("ok\n"); ++ else ++ printk("{%*pbl} stuck\n", CPUMASK_PR(&stuck_cpus)); + + /* + * Now that we know it works we can reduce NMI frequency to +-- +2.44.0 + diff --git a/0066-x86-boot-Support-the-watchdog-on-newer-AMD-systems.patch b/0066-x86-boot-Support-the-watchdog-on-newer-AMD-systems.patch new file mode 100644 index 0000000..e501861 --- /dev/null +++ b/0066-x86-boot-Support-the-watchdog-on-newer-AMD-systems.patch @@ -0,0 +1,48 @@ +From 2777b499f1f6d5cea68f9479f82d055542b822ad Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 2 Apr 2024 16:20:30 +0200 +Subject: [PATCH 66/67] x86/boot: Support the watchdog on newer AMD systems + +The MSRs used by setup_k7_watchdog() are architectural in 64bit. The Unit +Select (0x76, cycles not in halt state) isn't, but it hasn't changed in 25 +years, making this a trend likely to continue. + +Drop the family check. If the Unit Select does happen to change meaning in +the future, check_nmi_watchdog() will still notice the watchdog not operating +as expected. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: 131892e0dcc1265b621c2b7d844cb9e7c3a4404f +master date: 2024-03-19 18:29:37 +0000 +--- + xen/arch/x86/nmi.c | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c +index dd31034ac8..c7c51614a6 100644 +--- a/xen/arch/x86/nmi.c ++++ b/xen/arch/x86/nmi.c +@@ -386,15 +386,12 @@ void setup_apic_nmi_watchdog(void) + if ( nmi_watchdog == NMI_NONE ) + return; + +- switch (boot_cpu_data.x86_vendor) { ++ switch ( boot_cpu_data.x86_vendor ) ++ { + case X86_VENDOR_AMD: +- switch (boot_cpu_data.x86) { +- case 6: +- case 0xf ... 0x19: +- setup_k7_watchdog(); +- break; +- } ++ setup_k7_watchdog(); + break; ++ + case X86_VENDOR_INTEL: + switch (boot_cpu_data.x86) { + case 6: +-- +2.44.0 + diff --git a/0067-tests-resource-Fix-HVM-guest-in-SHADOW-builds.patch b/0067-tests-resource-Fix-HVM-guest-in-SHADOW-builds.patch new file mode 100644 index 0000000..5ce4e17 --- /dev/null +++ b/0067-tests-resource-Fix-HVM-guest-in-SHADOW-builds.patch @@ -0,0 +1,110 @@ +From 9bc40dbcf9eafccc1923b2555286bf6a2af03b7a Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 2 Apr 2024 16:24:07 +0200 +Subject: [PATCH 67/67] tests/resource: Fix HVM guest in !SHADOW builds +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Right now, test-resource always creates HVM Shadow guests. 
But if Xen has +SHADOW compiled out, running the test yields: + + $./test-resource + XENMEM_acquire_resource tests + Test x86 PV + Created d1 + Test grant table + Test x86 PVH + Skip: 95 - Operation not supported + +and doesn't really test HVM guests, but doesn't fail either. + +There's nothing paging-mode-specific about this test, so default to HAP if +possible and provide a more specific message if neither HAP or Shadow are +available. + +As we've got physinfo to hand, also provide more specific message about the +absence of PV or HVM support. + +Signed-off-by: Andrew Cooper +Acked-by: Roger Pau Monné +master commit: 0263dc9069ddb66335c72a159e09050b1600e56a +master date: 2024-03-01 20:14:19 +0000 +--- + tools/tests/resource/test-resource.c | 39 ++++++++++++++++++++++++++++ + 1 file changed, 39 insertions(+) + +diff --git a/tools/tests/resource/test-resource.c b/tools/tests/resource/test-resource.c +index 0a950072f9..e2c4ba3478 100644 +--- a/tools/tests/resource/test-resource.c ++++ b/tools/tests/resource/test-resource.c +@@ -20,6 +20,8 @@ static xc_interface *xch; + static xenforeignmemory_handle *fh; + static xengnttab_handle *gh; + ++static xc_physinfo_t physinfo; ++ + static void test_gnttab(uint32_t domid, unsigned int nr_frames, + unsigned long gfn) + { +@@ -172,6 +174,37 @@ static void test_domain_configurations(void) + + printf("Test %s\n", t->name); + ++#if defined(__x86_64__) || defined(__i386__) ++ if ( t->create.flags & XEN_DOMCTL_CDF_hvm ) ++ { ++ if ( !(physinfo.capabilities & XEN_SYSCTL_PHYSCAP_hvm) ) ++ { ++ printf(" Skip: HVM not available\n"); ++ continue; ++ } ++ ++ /* ++ * On x86, use HAP guests if possible, but skip if neither HAP nor ++ * SHADOW is available. ++ */ ++ if ( physinfo.capabilities & XEN_SYSCTL_PHYSCAP_hap ) ++ t->create.flags |= XEN_DOMCTL_CDF_hap; ++ else if ( !(physinfo.capabilities & XEN_SYSCTL_PHYSCAP_shadow) ) ++ { ++ printf(" Skip: Neither HAP or SHADOW available\n"); ++ continue; ++ } ++ } ++ else ++ { ++ if ( !(physinfo.capabilities & XEN_SYSCTL_PHYSCAP_pv) ) ++ { ++ printf(" Skip: PV not available\n"); ++ continue; ++ } ++ } ++#endif ++ + rc = xc_domain_create(xch, &domid, &t->create); + if ( rc ) + { +@@ -214,6 +247,8 @@ static void test_domain_configurations(void) + + int main(int argc, char **argv) + { ++ int rc; ++ + printf("XENMEM_acquire_resource tests\n"); + + xch = xc_interface_open(NULL, NULL, 0); +@@ -227,6 +262,10 @@ int main(int argc, char **argv) + if ( !gh ) + err(1, "xengnttab_open"); + ++ rc = xc_physinfo(xch, &physinfo); ++ if ( rc ) ++ err(1, "Failed to obtain physinfo"); ++ + test_domain_configurations(); + + return !!nr_failures; +-- +2.44.0 + diff --git a/info.txt b/info.txt index 0a99509..fa9f510 100644 --- a/info.txt +++ b/info.txt @@ -1,6 +1,6 @@ -Xen upstream patchset #0 for 4.17.4-pre +Xen upstream patchset #1 for 4.17.4-pre Containing patches from RELEASE-4.17.3 (07f413d7ffb06eab36045bd19f53555de1cacf62) to -staging-4.17 (091466ba55d1e2e75738f751818ace2e3ed08ccf) +staging-4.17 (9bc40dbcf9eafccc1923b2555286bf6a2af03b7a) -- cgit v1.2.3-65-gdbad