diff options
author | Florian Schmaus <flow@gentoo.org> | 2022-11-09 09:54:09 +0100 |
---|---|---|
committer | Florian Schmaus <flow@gentoo.org> | 2022-11-09 09:54:09 +0100 |
commit | 364cc6703e42a167e223662998592c26a315fd36 (patch) | |
tree | 642939ac0d9e401dd9d1cea0e34f32b1968ce9ab | |
parent | Xen 4.15.4-pre-patchset-1 (diff) | |
download | xen-upstream-patches-364cc6703e42a167e223662998592c26a315fd36.tar.gz xen-upstream-patches-364cc6703e42a167e223662998592c26a315fd36.tar.bz2 xen-upstream-patches-364cc6703e42a167e223662998592c26a315fd36.zip |
Xen 4.15.4-pre-patchset-24.15.4-pre-patchset-2
Signed-off-by: Florian Schmaus <flow@gentoo.org>
127 files changed, 9714 insertions, 142 deletions
diff --git a/0001-build-fix-exported-variable-name-CFLAGS_stack_bounda.patch b/0001-build-fix-exported-variable-name-CFLAGS_stack_bounda.patch index 32ff417..4b643e1 100644 --- a/0001-build-fix-exported-variable-name-CFLAGS_stack_bounda.patch +++ b/0001-build-fix-exported-variable-name-CFLAGS_stack_bounda.patch @@ -1,7 +1,8 @@ From f6e26ce7d9317abc41130ead6dc2443a7e2dde00 Mon Sep 17 00:00:00 2001 From: Anthony PERARD <anthony.perard@citrix.com> Date: Tue, 12 Jul 2022 11:20:46 +0200 -Subject: [PATCH 01/67] build: fix exported variable name CFLAGS_stack_boundary +Subject: [PATCH 001/126] build: fix exported variable name + CFLAGS_stack_boundary Exporting a variable with a dash doesn't work reliably, they may be striped from the environment when calling a sub-make or sub-shell. @@ -63,5 +64,5 @@ index e857c0f2cc2c..a5b2041f9b96 100644 obj-y := stub.o obj-$(XEN_BUILD_EFI) := $(filter-out %.init.o,$(EFIOBJ)) -- -2.37.3 +2.37.4 diff --git a/0002-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch b/0002-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch index 9f2f8e4..edc6857 100644 --- a/0002-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch +++ b/0002-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch @@ -1,7 +1,7 @@ From b89b932cfe86556c5de4ad56702aed83142e22a3 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Tue, 12 Jul 2022 11:21:14 +0200 -Subject: [PATCH 02/67] IOMMU/x86: work around bogus gcc12 warning in +Subject: [PATCH 002/126] IOMMU/x86: work around bogus gcc12 warning in hvm_gsi_eoi() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 @@ -48,5 +48,5 @@ index 9544f3234e65..50865eec2c04 100644 /* -- -2.37.3 +2.37.4 diff --git a/0003-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch b/0003-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch index 777ef8a..fd460e0 100644 --- a/0003-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch +++ b/0003-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch @@ -2,7 +2,7 
@@ From b53df5b4341fa97614ad064a7c8e781c88b6ed71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= <marmarek@invisiblethingslab.com> Date: Tue, 12 Jul 2022 11:22:09 +0200 -Subject: [PATCH 03/67] ehci-dbgp: fix selecting n-th ehci controller +Subject: [PATCH 003/126] ehci-dbgp: fix selecting n-th ehci controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -32,5 +32,5 @@ index c893d246defa..66b4811af24a 100644 dbgp->cap = find_dbgp(dbgp, num); if ( !dbgp->cap ) -- -2.37.3 +2.37.4 diff --git a/0004-tools-xenstored-Harden-corrupt.patch b/0004-tools-xenstored-Harden-corrupt.patch index 62b7ec9..c9e6852 100644 --- a/0004-tools-xenstored-Harden-corrupt.patch +++ b/0004-tools-xenstored-Harden-corrupt.patch @@ -1,7 +1,7 @@ From 7fe638c28fa693d8bb8f9419de1220d4359a1b2d Mon Sep 17 00:00:00 2001 From: Julien Grall <jgrall@amazon.com> Date: Tue, 12 Jul 2022 11:23:01 +0200 -Subject: [PATCH 04/67] tools/xenstored: Harden corrupt() +Subject: [PATCH 004/126] tools/xenstored: Harden corrupt() At the moment, corrupt() is neither checking for allocation failure nor freeing the allocated memory. 
@@ -40,5 +40,5 @@ index 8033c1e0eb28..9172dd767140 100644 check_store(); } -- -2.37.3 +2.37.4 diff --git a/0005-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch b/0005-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch index 7d79c2e..dcfc447 100644 --- a/0005-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch +++ b/0005-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch @@ -1,8 +1,8 @@ From 799a8d49237a62ea0d33c3756a6a7f665b8389b2 Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Tue, 12 Jul 2022 11:23:32 +0200 -Subject: [PATCH 05/67] x86/spec-ctrl: Only adjust MSR_SPEC_CTRL for idle with - legacy IBRS +Subject: [PATCH 005/126] x86/spec-ctrl: Only adjust MSR_SPEC_CTRL for idle + with legacy IBRS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -89,5 +89,5 @@ index 68f6c46c470c..12283573cdd5 100644 * Disable shadowing before updating the MSR. There are no SMP issues * here; only local processor ordering concerns. 
-- -2.37.3 +2.37.4 diff --git a/0006-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch b/0006-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch index 965c965..177d677 100644 --- a/0006-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch +++ b/0006-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch @@ -1,7 +1,7 @@ From cd5081e8c31651e623d86532306b4c56bbcb6e6d Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Tue, 12 Jul 2022 11:24:11 +0200 -Subject: [PATCH 06/67] x86/spec-ctrl: Knobs for STIBP and PSFD, and follow +Subject: [PATCH 006/126] x86/spec-ctrl: Knobs for STIBP and PSFD, and follow hardware STIBP hint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 @@ -230,5 +230,5 @@ index eb7fb70e86f9..8212227ee02a 100644 /* * PV guests can poison the RSB to any virtual address from which -- -2.37.3 +2.37.4 diff --git a/0007-libxc-fix-compilation-error-with-gcc13.patch b/0007-libxc-fix-compilation-error-with-gcc13.patch index 9a1ca92..388111e 100644 --- a/0007-libxc-fix-compilation-error-with-gcc13.patch +++ b/0007-libxc-fix-compilation-error-with-gcc13.patch @@ -1,7 +1,7 @@ From 77deab4233b5d9ec5cf214fdc1652424fd4fc9d6 Mon Sep 17 00:00:00 2001 From: Charles Arnold <carnold@suse.com> Date: Tue, 12 Jul 2022 11:24:39 +0200 -Subject: [PATCH 07/67] libxc: fix compilation error with gcc13 +Subject: [PATCH 007/126] libxc: fix compilation error with gcc13 xc_psr.c:161:5: error: conflicting types for 'xc_psr_cmt_get_data' due to enum/integer mismatch; @@ -29,5 +29,5 @@ index 318920166c5e..2013200b9eff 100644 int xc_psr_cmt_enabled(xc_interface *xch); -- -2.37.3 +2.37.4 diff --git a/0008-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch b/0008-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch index 22a1ebe..18ec7de 100644 --- a/0008-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch +++ b/0008-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch @@ -1,7 +1,7 @@ From 
5be1f46f435f8b05608b1eae029cb17d8bd3a560 Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Tue, 12 Jul 2022 11:25:05 +0200 -Subject: [PATCH 08/67] x86/spec-ctrl: Honour spec-ctrl=0 for unpriv-mmio +Subject: [PATCH 008/126] x86/spec-ctrl: Honour spec-ctrl=0 for unpriv-mmio sub-option This was an oversight from when unpriv-mmio was introduced. @@ -28,5 +28,5 @@ index 8212227ee02a..06790897e496 100644 else if ( val > 0 ) rc = -EINVAL; -- -2.37.3 +2.37.4 diff --git a/0009-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch b/0009-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch index 53a8b70..bfae8e2 100644 --- a/0009-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch +++ b/0009-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch @@ -1,7 +1,7 @@ From ae417706870333bb52ebcf33c527809cdd2d7265 Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Tue, 12 Jul 2022 11:25:40 +0200 -Subject: [PATCH 09/67] xen/cmdline: Extend parse_boolean() to signal a name +Subject: [PATCH 009/126] xen/cmdline: Extend parse_boolean() to signal a name match This will help parsing a sub-option which has boolean and non-boolean options @@ -83,5 +83,5 @@ index 1198c7c0b207..be7498135170 100644 int parse_boolean(const char *name, const char *s, const char *e); -- -2.37.3 +2.37.4 diff --git a/0010-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch b/0010-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch index 36577d6..621d372 100644 --- a/0010-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch +++ b/0010-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch @@ -1,8 +1,8 @@ From 08bfd4d01185e94fda1be9dd79a981d890a9085e Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Tue, 12 Jul 2022 11:26:14 +0200 -Subject: [PATCH 10/67] x86/spec-ctrl: Add fine-grained cmdline suboptions for - primitives +Subject: [PATCH 010/126] x86/spec-ctrl: Add fine-grained cmdline 
suboptions + for primitives Support controling the PV/HVM suboption of msr-sc/rsb/md-clear, which previously wasn't possible. @@ -133,5 +133,5 @@ index 06790897e496..225fe08259b3 100644 /* Xen's speculative sidechannel mitigation settings. */ -- -2.37.3 +2.37.4 diff --git a/0011-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch b/0011-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch index dc468c8..34acad9 100644 --- a/0011-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch +++ b/0011-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch @@ -1,7 +1,8 @@ From f241cc48dabeef6cb0b381db62f2562b0a3970eb Mon Sep 17 00:00:00 2001 From: Anthony PERARD <anthony.perard@citrix.com> Date: Tue, 12 Jul 2022 11:26:47 +0200 -Subject: [PATCH 11/67] tools/helpers: fix build of xen-init-dom0 with -Werror +Subject: [PATCH 011/126] tools/helpers: fix build of xen-init-dom0 with + -Werror Missing prototype of asprintf() without _GNU_SOURCE. @@ -24,5 +25,5 @@ index c99224a4b607..b4861c9e8041 100644 #include <stdint.h> #include <string.h> -- -2.37.3 +2.37.4 diff --git a/0012-libxl-check-return-value-of-libxl__xs_directory-in-n.patch b/0012-libxl-check-return-value-of-libxl__xs_directory-in-n.patch index 74fee03..1ca34af 100644 --- a/0012-libxl-check-return-value-of-libxl__xs_directory-in-n.patch +++ b/0012-libxl-check-return-value-of-libxl__xs_directory-in-n.patch @@ -1,7 +1,7 @@ From d470a54087e0fbd813dae4d773ad0b830eeec4a1 Mon Sep 17 00:00:00 2001 From: Anthony PERARD <anthony.perard@citrix.com> Date: Tue, 12 Jul 2022 11:26:58 +0200 -Subject: [PATCH 12/67] libxl: check return value of libxl__xs_directory in +Subject: [PATCH 012/126] libxl: check return value of libxl__xs_directory in name2bdf libxl__xs_directory() can potentially return NULL without setting `n`. 
@@ -34,5 +34,5 @@ index 92bf86b2bebd..a5f5cdf62b80 100644 for (i = 0; i < n; i++) { -- -2.37.3 +2.37.4 diff --git a/0013-update-Xen-version-to-4.15.4-pre.patch b/0013-update-Xen-version-to-4.15.4-pre.patch index 8626fdd..6e8c05b 100644 --- a/0013-update-Xen-version-to-4.15.4-pre.patch +++ b/0013-update-Xen-version-to-4.15.4-pre.patch @@ -1,7 +1,7 @@ From 505771bb1dffdf6f763fad18ee49a913b98abfea Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Tue, 12 Jul 2022 11:28:33 +0200 -Subject: [PATCH 13/67] update Xen version to 4.15.4-pre +Subject: [PATCH 013/126] update Xen version to 4.15.4-pre --- xen/Makefile | 2 +- @@ -21,5 +21,5 @@ index e9a88325c467..cd66bb3b1c84 100644 -include xen-version -- -2.37.3 +2.37.4 diff --git a/0014-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch b/0014-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch index a21b4d8..1c237f2 100644 --- a/0014-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch +++ b/0014-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch @@ -1,7 +1,8 @@ From 156ab775769d39b2dfb048ccd34dee7e86ba83a2 Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Fri, 1 Jul 2022 15:59:40 +0100 -Subject: [PATCH 14/67] x86/spec-ctrl: Rework spec_ctrl_flags context switching +Subject: [PATCH 014/126] x86/spec-ctrl: Rework spec_ctrl_flags context + switching We are shortly going to need to context switch new bits in both the vcpu and S3 paths. 
Introduce SCF_IST_MASK and SCF_DOM_MASK, and rework d->arch.verw @@ -163,5 +164,5 @@ index 5a590bac44aa..66b00d511fc6 100644 .macro SPEC_CTRL_ENTRY_FROM_INTR_IST /* -- -2.37.3 +2.37.4 diff --git a/0015-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch b/0015-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch index 49351ae..a9cc63f 100644 --- a/0015-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch +++ b/0015-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch @@ -1,7 +1,7 @@ From 2cfbca32b9dc3a8d6520549ff468a7f550daf1b1 Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Tue, 28 Jun 2022 14:36:56 +0100 -Subject: [PATCH 15/67] x86/spec-ctrl: Rename SCF_ist_wrmsr to SCF_ist_sc_msr +Subject: [PATCH 015/126] x86/spec-ctrl: Rename SCF_ist_wrmsr to SCF_ist_sc_msr We are about to introduce SCF_ist_ibpb, at which point SCF_ist_wrmsr becomes ambiguous. @@ -106,5 +106,5 @@ index 66b00d511fc6..0ff1b118f882 100644 DO_SPEC_CTRL_EXIT_TO_XEN -- -2.37.3 +2.37.4 diff --git a/0016-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch b/0016-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch index f114f6d..cfe270c 100644 --- a/0016-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch +++ b/0016-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch @@ -1,7 +1,8 @@ From c707015bf118df2c43e3a48b3774916322fca50a Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Mon, 4 Jul 2022 21:32:17 +0100 -Subject: [PATCH 16/67] x86/spec-ctrl: Rename opt_ibpb to opt_ibpb_ctxt_switch +Subject: [PATCH 016/126] x86/spec-ctrl: Rename opt_ibpb to + opt_ibpb_ctxt_switch We are about to introduce the use of IBPB at different points in Xen, making opt_ibpb ambiguous. Rename it to opt_ibpb_ctxt_switch. 
@@ -93,5 +94,5 @@ index 6f8b0e09348e..fd8162ca9ab9 100644 extern int8_t opt_eager_fpu; extern int8_t opt_l1d_flush; -- -2.37.3 +2.37.4 diff --git a/0017-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch b/0017-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch index e162148..5a6bfa5 100644 --- a/0017-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch +++ b/0017-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch @@ -1,7 +1,7 @@ From d7f5fb1e2abd0d56cada9bfcf96ab530d214d9aa Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Fri, 1 Jul 2022 15:59:40 +0100 -Subject: [PATCH 17/67] x86/spec-ctrl: Rework SPEC_CTRL_ENTRY_FROM_INTR_IST +Subject: [PATCH 017/126] x86/spec-ctrl: Rework SPEC_CTRL_ENTRY_FROM_INTR_IST We are shortly going to add a conditional IBPB in this path. @@ -102,5 +102,5 @@ index 0ff1b118f882..15e24cde00d1 100644 /* Opencoded UNLIKELY_START() with no condition. */ -- -2.37.3 +2.37.4 diff --git a/0018-x86-spec-ctrl-Support-IBPB-on-entry.patch b/0018-x86-spec-ctrl-Support-IBPB-on-entry.patch index 1de9d4c..43b2d76 100644 --- a/0018-x86-spec-ctrl-Support-IBPB-on-entry.patch +++ b/0018-x86-spec-ctrl-Support-IBPB-on-entry.patch @@ -1,7 +1,7 @@ From f0d78e0c11d3984c74f34a7325f862dee93a5835 Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Thu, 24 Feb 2022 13:44:33 +0000 -Subject: [PATCH 18/67] x86/spec-ctrl: Support IBPB-on-entry +Subject: [PATCH 018/126] x86/spec-ctrl: Support IBPB-on-entry We are going to need this to mitigate Branch Type Confusion on AMD/Hygon CPUs, but as we've talked about using it in other cases too, arrange to support it @@ -296,5 +296,5 @@ index 15e24cde00d1..9eb4ad9ab71d 100644 jz .L\@_skip_rsb -- -2.37.3 +2.37.4 diff --git a/0019-x86-cpuid-Enumeration-for-BTC_NO.patch b/0019-x86-cpuid-Enumeration-for-BTC_NO.patch index a4444f4..626bfd8 100644 --- a/0019-x86-cpuid-Enumeration-for-BTC_NO.patch +++ b/0019-x86-cpuid-Enumeration-for-BTC_NO.patch @@ -1,7 +1,7 
@@ From 2b29ac476fa0c91655906fac3512202e514ecbed Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Mon, 16 May 2022 15:48:24 +0100 -Subject: [PATCH 19/67] x86/cpuid: Enumeration for BTC_NO +Subject: [PATCH 019/126] x86/cpuid: Enumeration for BTC_NO BTC_NO indicates that hardware is not succeptable to Branch Type Confusion. @@ -102,5 +102,5 @@ index 9686c82ed75c..1bbc7da4b53c 100644 /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions */ -- -2.37.3 +2.37.4 diff --git a/0020-x86-spec-ctrl-Enable-Zen2-chickenbit.patch b/0020-x86-spec-ctrl-Enable-Zen2-chickenbit.patch index 4d12421..933660d 100644 --- a/0020-x86-spec-ctrl-Enable-Zen2-chickenbit.patch +++ b/0020-x86-spec-ctrl-Enable-Zen2-chickenbit.patch @@ -1,7 +1,7 @@ From 409976bed91f61fb7b053d536d2fc87cf3ad7018 Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Tue, 15 Mar 2022 18:30:25 +0000 -Subject: [PATCH 20/67] x86/spec-ctrl: Enable Zen2 chickenbit +Subject: [PATCH 020/126] x86/spec-ctrl: Enable Zen2 chickenbit ... as instructed in the Branch Type Confusion whitepaper. 
@@ -101,5 +101,5 @@ index 1e743461e91d..b4a360723b14 100644 #define MSR_AMD64_DR0_ADDRESS_MASK 0xc0011027 #define MSR_AMD64_DR1_ADDRESS_MASK 0xc0011019 -- -2.37.3 +2.37.4 diff --git a/0021-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch b/0021-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch index b676ba3..01be575 100644 --- a/0021-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch +++ b/0021-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch @@ -1,7 +1,7 @@ From 35bf91d30f1a480dcf5bfd99b79384b2b283da7f Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Mon, 27 Jun 2022 19:29:40 +0100 -Subject: [PATCH 21/67] x86/spec-ctrl: Mitigate Branch Type Confusion when +Subject: [PATCH 021/126] x86/spec-ctrl: Mitigate Branch Type Confusion when possible Branch Type Confusion affects AMD/Hygon CPUs on Zen2 and earlier. To @@ -301,5 +301,5 @@ index 10cd0cd2518f..33e845991b0a 100644 extern int8_t opt_eager_fpu; extern int8_t opt_l1d_flush; -- -2.37.3 +2.37.4 diff --git a/0022-x86-mm-correct-TLB-flush-condition-in-_get_page_type.patch b/0022-x86-mm-correct-TLB-flush-condition-in-_get_page_type.patch index 81f5b9a..5b038c4 100644 --- a/0022-x86-mm-correct-TLB-flush-condition-in-_get_page_type.patch +++ b/0022-x86-mm-correct-TLB-flush-condition-in-_get_page_type.patch @@ -1,7 +1,8 @@ From 3859f3ee7e37323ae5e0014c07ba8d3a4d7890b2 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Tue, 26 Jul 2022 15:03:14 +0200 -Subject: [PATCH 22/67] x86/mm: correct TLB flush condition in _get_page_type() +Subject: [PATCH 022/126] x86/mm: correct TLB flush condition in + _get_page_type() When this logic was moved, it was moved across the point where nx is updated to hold the new type for the page. 
IOW originally it was @@ -41,5 +42,5 @@ index 7d0747017db5..c88dc749d431 100644 perfc_incr(need_flush_tlb_flush); /* -- -2.37.3 +2.37.4 diff --git a/0023-xl-relax-freemem-s-retry-calculation.patch b/0023-xl-relax-freemem-s-retry-calculation.patch index d7dda30..1879884 100644 --- a/0023-xl-relax-freemem-s-retry-calculation.patch +++ b/0023-xl-relax-freemem-s-retry-calculation.patch @@ -1,7 +1,7 @@ From 2173d9c8be28d5f33c0e299a363ac994867d111b Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Wed, 27 Jul 2022 09:28:46 +0200 -Subject: [PATCH 23/67] xl: relax freemem()'s retry calculation +Subject: [PATCH 023/126] xl: relax freemem()'s retry calculation While in principle possible also under other conditions as long as other parallel operations potentially consuming memory aren't "locked out", in @@ -76,5 +76,5 @@ index 435155a03396..5dee7730ca76 100644 return false; } -- -2.37.3 +2.37.4 diff --git a/0024-tools-init-xenstore-domain-fix-memory-map-for-PVH-st.patch b/0024-tools-init-xenstore-domain-fix-memory-map-for-PVH-st.patch index fbb1448..ccde751 100644 --- a/0024-tools-init-xenstore-domain-fix-memory-map-for-PVH-st.patch +++ b/0024-tools-init-xenstore-domain-fix-memory-map-for-PVH-st.patch @@ -1,7 +1,7 @@ From a2684d9cbbfb02b268be7e551674f709db0617a4 Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Wed, 27 Jul 2022 09:29:08 +0200 -Subject: [PATCH 24/67] tools/init-xenstore-domain: fix memory map for PVH +Subject: [PATCH 024/126] tools/init-xenstore-domain: fix memory map for PVH stubdom In case of maxmem != memsize the E820 map of the PVH stubdom is wrong, @@ -55,5 +55,5 @@ index 6836002f0bad..32689abd7479 100644 } -- -2.37.3 +2.37.4 diff --git a/0025-xl-move-freemem-s-credit-expired-loop-exit.patch b/0025-xl-move-freemem-s-credit-expired-loop-exit.patch index c3a1965..a3b2e2b 100644 --- a/0025-xl-move-freemem-s-credit-expired-loop-exit.patch +++ b/0025-xl-move-freemem-s-credit-expired-loop-exit.patch @@ -1,7 +1,7 @@ From 
c37099426ea678c1d5b6c99ae5ad6834f4edd2e6 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Wed, 27 Jul 2022 09:29:31 +0200 -Subject: [PATCH 25/67] xl: move freemem()'s "credit expired" loop exit +Subject: [PATCH 025/126] xl: move freemem()'s "credit expired" loop exit Move the "credit expired" loop exit to the middle of the loop, immediately after "return true". This way having reached the goal on the @@ -51,5 +51,5 @@ index 5dee7730ca76..d1c6f8aae67a 100644 static void reload_domain_config(uint32_t domid, -- -2.37.3 +2.37.4 diff --git a/0026-x86-spec-ctrl-correct-per-guest-type-reporting-of-MD.patch b/0026-x86-spec-ctrl-correct-per-guest-type-reporting-of-MD.patch index fbf3f41..fbbf450 100644 --- a/0026-x86-spec-ctrl-correct-per-guest-type-reporting-of-MD.patch +++ b/0026-x86-spec-ctrl-correct-per-guest-type-reporting-of-MD.patch @@ -1,7 +1,7 @@ From 5f1d0179e15d726622a49044a825894d5010df15 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Wed, 27 Jul 2022 09:29:54 +0200 -Subject: [PATCH 26/67] x86/spec-ctrl: correct per-guest-type reporting of +Subject: [PATCH 026/126] x86/spec-ctrl: correct per-guest-type reporting of MD_CLEAR There are command line controls for this and the default also isn't "always @@ -52,5 +52,5 @@ index 563519ce0e31..f7b0251c42bc 100644 printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n", -- -2.37.3 +2.37.4 diff --git a/0027-x86-deal-with-gcc12-release-build-issues.patch b/0027-x86-deal-with-gcc12-release-build-issues.patch index d26f6d3..b30c65b 100644 --- a/0027-x86-deal-with-gcc12-release-build-issues.patch +++ b/0027-x86-deal-with-gcc12-release-build-issues.patch @@ -1,7 +1,7 @@ From a095c6cde8a717325cc31bb393c547cad5e16e35 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Wed, 27 Jul 2022 09:30:24 +0200 -Subject: [PATCH 27/67] x86: deal with gcc12 release build issues +Subject: [PATCH 027/126] x86: deal with gcc12 release build issues While a number of issues we 
previously had with pre-release gcc12 were fixed in the final release, we continue to have one issue (with multiple @@ -61,5 +61,5 @@ index 5c19b71eca70..71dd28f126c3 100644 #define PRtype_info "016lx"/* should only be used for printk's */ -- -2.37.3 +2.37.4 diff --git a/0028-x86emul-add-memory-operand-low-bits-checks-for-ENQCM.patch b/0028-x86emul-add-memory-operand-low-bits-checks-for-ENQCM.patch index 26b959e..1a63be4 100644 --- a/0028-x86emul-add-memory-operand-low-bits-checks-for-ENQCM.patch +++ b/0028-x86emul-add-memory-operand-low-bits-checks-for-ENQCM.patch @@ -1,7 +1,7 @@ From 4799a202a9017360708c18aa8cd699bd8d6be08b Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Wed, 27 Jul 2022 09:31:01 +0200 -Subject: [PATCH 28/67] x86emul: add memory operand low bits checks for +Subject: [PATCH 028/126] x86emul: add memory operand low bits checks for ENQCMD{,S} Already ISE rev 044 added text to this effect; rev 045 further dropped @@ -41,5 +41,5 @@ index 5e297f797187..247c14dc4e68 100644 if ( (rc = ops->blk(x86_seg_es, src.val, mmvalp, 64, &_regs.eflags, state, ctxt)) != X86EMUL_OKAY ) -- -2.37.3 +2.37.4 diff --git a/0029-x86-also-suppress-use-of-MMX-insns.patch b/0029-x86-also-suppress-use-of-MMX-insns.patch index 1298a47..d954cdd 100644 --- a/0029-x86-also-suppress-use-of-MMX-insns.patch +++ b/0029-x86-also-suppress-use-of-MMX-insns.patch @@ -1,7 +1,7 @@ From 30d3de4c61c297e12662df1fdb89af335947e59d Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Wed, 27 Jul 2022 09:31:31 +0200 -Subject: [PATCH 29/67] x86: also suppress use of MMX insns +Subject: [PATCH 029/126] x86: also suppress use of MMX insns Passing -mno-sse alone is not enough: The compiler may still find (questionable) reasons to use MMX insns. In particular with gcc12 use @@ -35,5 +35,5 @@ index 456e5d5c1ad7..c4337a1a118c 100644 # Compile with thunk-extern, indirect-branch-register if avaiable. 
CFLAGS-$(CONFIG_INDIRECT_THUNK) += -mindirect-branch=thunk-extern -- -2.37.3 +2.37.4 diff --git a/0030-common-memory-Fix-ifdefs-for-ptdom_max_order.patch b/0030-common-memory-Fix-ifdefs-for-ptdom_max_order.patch index a9bf845..b4f6881 100644 --- a/0030-common-memory-Fix-ifdefs-for-ptdom_max_order.patch +++ b/0030-common-memory-Fix-ifdefs-for-ptdom_max_order.patch @@ -1,7 +1,7 @@ From b64f1c9e3e3a2a416c7bb5aab77ba5d2cba98638 Mon Sep 17 00:00:00 2001 From: Luca Fancellu <luca.fancellu@arm.com> Date: Wed, 27 Jul 2022 09:31:49 +0200 -Subject: [PATCH 30/67] common/memory: Fix ifdefs for ptdom_max_order +Subject: [PATCH 030/126] common/memory: Fix ifdefs for ptdom_max_order In common/memory.c the ifdef code surrounding ptdom_max_order is using HAS_PASSTHROUGH instead of CONFIG_HAS_PASSTHROUGH, fix the @@ -48,5 +48,5 @@ index 297b98a562b2..95b2b934e4a2 100644 order = ptdom_max_order; #endif -- -2.37.3 +2.37.4 diff --git a/0031-tools-libxl-env-variable-to-signal-whether-disk-nic-.patch b/0031-tools-libxl-env-variable-to-signal-whether-disk-nic-.patch index a52055a..65fe05b 100644 --- a/0031-tools-libxl-env-variable-to-signal-whether-disk-nic-.patch +++ b/0031-tools-libxl-env-variable-to-signal-whether-disk-nic-.patch @@ -1,7 +1,7 @@ From 1b9845dcf959421db3a071a6bc0aa9d8edbffb50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> Date: Wed, 3 Aug 2022 12:41:18 +0200 -Subject: [PATCH 31/67] tools/libxl: env variable to signal whether disk/nic +Subject: [PATCH 031/126] tools/libxl: env variable to signal whether disk/nic backend is trusted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 @@ -103,5 +103,5 @@ index 0b9e70c9d13d..f87890d1d65f 100644 } -- -2.37.3 +2.37.4 diff --git a/0032-x86-msr-fix-X2APIC_LAST.patch b/0032-x86-msr-fix-X2APIC_LAST.patch index ac42842..4046822 100644 --- a/0032-x86-msr-fix-X2APIC_LAST.patch +++ b/0032-x86-msr-fix-X2APIC_LAST.patch @@ -1,7 +1,7 @@ From df3395f6b2d759aba39fb67a7bc0fe49147c8b39 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> Date: Wed, 3 Aug 2022 12:41:49 +0200 -Subject: [PATCH 32/67] x86/msr: fix X2APIC_LAST +Subject: [PATCH 032/126] x86/msr: fix X2APIC_LAST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -62,5 +62,5 @@ index b4a360723b14..f1b2cf5460c1 100644 #define MSR_X2APIC_TPR 0x00000808 #define MSR_X2APIC_PPR 0x0000080a -- -2.37.3 +2.37.4 diff --git a/0033-x86-spec-ctrl-Use-IST-RSB-protection-for-SVM-systems.patch b/0033-x86-spec-ctrl-Use-IST-RSB-protection-for-SVM-systems.patch index 46780c4..f1400b8 100644 --- a/0033-x86-spec-ctrl-Use-IST-RSB-protection-for-SVM-systems.patch +++ b/0033-x86-spec-ctrl-Use-IST-RSB-protection-for-SVM-systems.patch @@ -1,7 +1,8 @@ From 8ae0b4d1331c14fb9e30a42987c0152c9b00f530 Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Mon, 15 Aug 2022 15:40:05 +0200 -Subject: [PATCH 33/67] x86/spec-ctrl: Use IST RSB protection for !SVM systems +Subject: [PATCH 033/126] x86/spec-ctrl: Use IST RSB protection for !SVM + systems There is a corner case where a VT-x guest which manages to reliably trigger non-fatal #MC's could evade the rogue RSB speculation protections that were @@ -50,5 +51,5 @@ index f7b0251c42bc..ac73806eacd8 100644 /* Check whether Eager FPU should be enabled by default. 
*/ -- -2.37.3 +2.37.4 diff --git a/0034-x86-Expose-more-MSR_ARCH_CAPS-to-hwdom.patch b/0034-x86-Expose-more-MSR_ARCH_CAPS-to-hwdom.patch index 6a73c21..5433ddb 100644 --- a/0034-x86-Expose-more-MSR_ARCH_CAPS-to-hwdom.patch +++ b/0034-x86-Expose-more-MSR_ARCH_CAPS-to-hwdom.patch @@ -1,7 +1,7 @@ From 5efcae1eb30ff24e100954e00889a568c1745ea1 Mon Sep 17 00:00:00 2001 From: Jason Andryuk <jandryuk@gmail.com> Date: Mon, 15 Aug 2022 15:40:47 +0200 -Subject: [PATCH 34/67] x86: Expose more MSR_ARCH_CAPS to hwdom +Subject: [PATCH 034/126] x86: Expose more MSR_ARCH_CAPS to hwdom commit e46474278a0e ("x86/intel: Expose MSR_ARCH_CAPS to dom0") started exposing MSR_ARCH_CAPS to dom0. More bits in MSR_ARCH_CAPS have since @@ -64,5 +64,5 @@ index f1b2cf5460c1..49ca1f1845e6 100644 #define MSR_FLUSH_CMD 0x0000010b #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) -- -2.37.3 +2.37.4 diff --git a/0035-xen-sched-setup-dom0-vCPUs-affinity-only-once.patch b/0035-xen-sched-setup-dom0-vCPUs-affinity-only-once.patch index 0dfb3b4..150de40 100644 --- a/0035-xen-sched-setup-dom0-vCPUs-affinity-only-once.patch +++ b/0035-xen-sched-setup-dom0-vCPUs-affinity-only-once.patch @@ -1,7 +1,7 @@ From 1e31848cdd8d2ff3cb76f364f04f9771f9b3a8b1 Mon Sep 17 00:00:00 2001 From: Dario Faggioli <dfaggioli@suse.com> Date: Mon, 15 Aug 2022 15:41:25 +0200 -Subject: [PATCH 35/67] xen/sched: setup dom0 vCPUs affinity only once +Subject: [PATCH 035/126] xen/sched: setup dom0 vCPUs affinity only once Right now, affinity for dom0 vCPUs is setup in two steps. 
This is a problem as, at least in Credit2, unit_insert() sees and uses the @@ -119,5 +119,5 @@ index 8f4b1ca10d1c..f07bd2681fcb 100644 } #endif -- -2.37.3 +2.37.4 diff --git a/0036-tools-libxl-Replace-deprecated-sdl-option-on-QEMU-co.patch b/0036-tools-libxl-Replace-deprecated-sdl-option-on-QEMU-co.patch index 1637236..bd1b1cb 100644 --- a/0036-tools-libxl-Replace-deprecated-sdl-option-on-QEMU-co.patch +++ b/0036-tools-libxl-Replace-deprecated-sdl-option-on-QEMU-co.patch @@ -1,7 +1,7 @@ From c373ad3d084614a93c55e25dc20e70ffc7574971 Mon Sep 17 00:00:00 2001 From: Anthony PERARD <anthony.perard@citrix.com> Date: Mon, 15 Aug 2022 15:42:09 +0200 -Subject: [PATCH 36/67] tools/libxl: Replace deprecated -sdl option on QEMU +Subject: [PATCH 036/126] tools/libxl: Replace deprecated -sdl option on QEMU command line "-sdl" is deprecated upstream since 6695e4c0fd9e ("softmmu/vl: @@ -34,5 +34,5 @@ index 24f6e73b0a77..ae5f35e0c3fd 100644 flexarray_append_pair(dm_envs, "DISPLAY", sdl->display); if (sdl->xauthority) -- -2.37.3 +2.37.4 diff --git a/0037-x86-spec-ctrl-Enumeration-for-PBRSB_NO.patch b/0037-x86-spec-ctrl-Enumeration-for-PBRSB_NO.patch index d27766b..bfd812b 100644 --- a/0037-x86-spec-ctrl-Enumeration-for-PBRSB_NO.patch +++ b/0037-x86-spec-ctrl-Enumeration-for-PBRSB_NO.patch @@ -1,7 +1,7 @@ From fba0c22e79922085c46527eb1391123aadfb24d1 Mon Sep 17 00:00:00 2001 From: Andrew Cooper <andrew.cooper3@citrix.com> Date: Mon, 15 Aug 2022 15:42:31 +0200 -Subject: [PATCH 37/67] x86/spec-ctrl: Enumeration for PBRSB_NO +Subject: [PATCH 037/126] x86/spec-ctrl: Enumeration for PBRSB_NO The PBRSB_NO bit indicates that the CPU is not vulnerable to the Post-Barrier RSB speculative vulnerability. 
@@ -63,5 +63,5 @@ index 49ca1f1845e6..5a830f76a8d4 100644 #define MSR_FLUSH_CMD 0x0000010b #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) -- -2.37.3 +2.37.4 diff --git a/0038-x86-amd-only-call-setup_force_cpu_cap-for-boot-CPU.patch b/0038-x86-amd-only-call-setup_force_cpu_cap-for-boot-CPU.patch index e0e0f87..e3d159b 100644 --- a/0038-x86-amd-only-call-setup_force_cpu_cap-for-boot-CPU.patch +++ b/0038-x86-amd-only-call-setup_force_cpu_cap-for-boot-CPU.patch @@ -1,7 +1,7 @@ From 104a54a307b08945365faf6d285cd5a02f94a80f Mon Sep 17 00:00:00 2001 From: Ross Lagerwall <ross.lagerwall@citrix.com> Date: Mon, 15 Aug 2022 15:43:08 +0200 -Subject: [PATCH 38/67] x86/amd: only call setup_force_cpu_cap for boot CPU +Subject: [PATCH 038/126] x86/amd: only call setup_force_cpu_cap for boot CPU This should only be called for the boot CPU to avoid calling _init code after it has been unloaded. @@ -29,5 +29,5 @@ index 60dbe61a61ca..a8d2fb8a1590 100644 switch(c->x86) -- -2.37.3 +2.37.4 diff --git a/0039-build-x86-suppress-GNU-ld-2.39-warning-about-RWX-loa.patch b/0039-build-x86-suppress-GNU-ld-2.39-warning-about-RWX-loa.patch index 50d83b6..f6e62b7 100644 --- a/0039-build-x86-suppress-GNU-ld-2.39-warning-about-RWX-loa.patch +++ b/0039-build-x86-suppress-GNU-ld-2.39-warning-about-RWX-loa.patch @@ -1,8 +1,8 @@ From a075900cf768fe45f270b6f1d09c4e99281da142 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Mon, 15 Aug 2022 15:43:56 +0200 -Subject: [PATCH 39/67] build/x86: suppress GNU ld 2.39 warning about RWX load - segments +Subject: [PATCH 039/126] build/x86: suppress GNU ld 2.39 warning about RWX + load segments Commit 68f5aac012b9 ("build: suppress future GNU ld warning about RWX load segments") didn't quite cover all the cases: Apparently I missed @@ -34,5 +34,5 @@ index e90680cd9f52..d2fae5cf9eee 100644 %.S: %.bin (od -v -t x $< | tr -s ' ' | awk 'NR > 1 {print s} {s=$$0}' | \ -- -2.37.3 +2.37.4 diff --git 
a/0040-PCI-simplify-and-thus-correct-pci_get_pdev-_by_domai.patch b/0040-PCI-simplify-and-thus-correct-pci_get_pdev-_by_domai.patch index c29e5ac..1de5d0d 100644 --- a/0040-PCI-simplify-and-thus-correct-pci_get_pdev-_by_domai.patch +++ b/0040-PCI-simplify-and-thus-correct-pci_get_pdev-_by_domai.patch @@ -1,7 +1,7 @@ From 9acedc3c58c31930737edbe212f2ccf437a0b757 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Mon, 15 Aug 2022 15:44:23 +0200 -Subject: [PATCH 40/67] PCI: simplify (and thus correct) +Subject: [PATCH 040/126] PCI: simplify (and thus correct) pci_get_pdev{,_by_domain}() The last "wildcard" use of either function went away with f591755823a7 @@ -149,5 +149,5 @@ index 8e3d4d94543a..cd238ae852b0 100644 uint8_t pci_conf_read8(pci_sbdf_t sbdf, unsigned int reg); -- -2.37.3 +2.37.4 diff --git a/0041-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch b/0041-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch index 3fa0e43..e695f96 100644 --- a/0041-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch +++ b/0041-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch @@ -1,7 +1,7 @@ From 09fc590c15773c2471946a78740c6b02e8c34a45 Mon Sep 17 00:00:00 2001 From: Julien Grall <jgrall@amazon.com> Date: Tue, 11 Oct 2022 15:05:53 +0200 -Subject: [PATCH 41/67] xen/arm: p2m: Prevent adding mapping when domain is +Subject: [PATCH 041/126] xen/arm: p2m: Prevent adding mapping when domain is dying During the domain destroy process, the domain will still be accessible @@ -58,5 +58,5 @@ index 2ddd06801a82..8398251c518b 100644 start = p2m->lowest_mapped_gfn; -- -2.37.3 +2.37.4 diff --git a/0042-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch b/0042-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch index 8217a06..96b8528 100644 --- a/0042-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch +++ b/0042-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch @@ -1,7 +1,7 @@ From 
0d805f9fba4bc155d15047685024f7d842e925e4 Mon Sep 17 00:00:00 2001 From: Julien Grall <jgrall@amazon.com> Date: Tue, 11 Oct 2022 15:06:36 +0200 -Subject: [PATCH 42/67] xen/arm: p2m: Handle preemption when freeing +Subject: [PATCH 042/126] xen/arm: p2m: Handle preemption when freeing intermediate page tables At the moment the P2M page tables will be freed when the domain structure @@ -163,5 +163,5 @@ index 6a2108398fd7..3a2d51b35d71 100644 /* * Remove mapping refcount on each mapping page in the p2m -- -2.37.3 +2.37.4 diff --git a/0043-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch b/0043-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch index f3f7e3a..f8d61bb 100644 --- a/0043-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch +++ b/0043-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch @@ -1,7 +1,7 @@ From 0f3eab90f327210d91e8e31a769376f286e8819a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> Date: Tue, 11 Oct 2022 15:07:25 +0200 -Subject: [PATCH 43/67] x86/p2m: add option to skip root pagetable removal in +Subject: [PATCH 043/126] x86/p2m: add option to skip root pagetable removal in p2m_teardown() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 @@ -134,5 +134,5 @@ index 46e8b94a49df..46eb51d44cf5 100644 /* Add a page to a domain's p2m table */ -- -2.37.3 +2.37.4 diff --git a/0044-x86-HAP-adjust-monitor-table-related-error-handling.patch b/0044-x86-HAP-adjust-monitor-table-related-error-handling.patch index 39db626..97a55a5 100644 --- a/0044-x86-HAP-adjust-monitor-table-related-error-handling.patch +++ b/0044-x86-HAP-adjust-monitor-table-related-error-handling.patch @@ -1,7 +1,7 @@ From d24a10a91d46a56e1d406239643ec651a31033d4 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Tue, 11 Oct 2022 15:07:42 +0200 -Subject: [PATCH 44/67] x86/HAP: adjust monitor table related error handling +Subject: [PATCH 044/126] x86/HAP: adjust monitor table related error 
handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -73,5 +73,5 @@ index a8f5a19da917..d75dc2b9ed3d 100644 put_gfn(d, cr3_gfn); } -- -2.37.3 +2.37.4 diff --git a/0045-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch b/0045-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch index 7cf356d..08ff309 100644 --- a/0045-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch +++ b/0045-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch @@ -1,7 +1,7 @@ From 95f6d555ec84383f7daaf3374f65bec5ff4351f5 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Tue, 11 Oct 2022 15:07:57 +0200 -Subject: [PATCH 45/67] x86/shadow: tolerate failure of +Subject: [PATCH 045/126] x86/shadow: tolerate failure of sh_set_toplevel_shadow() Subsequently sh_set_toplevel_shadow() will be adjusted to install a @@ -72,5 +72,5 @@ index 9b43cb116c47..7e0494cf7faa 100644 #error This should never happen #endif -- -2.37.3 +2.37.4 diff --git a/0046-x86-shadow-tolerate-failure-in-shadow_prealloc.patch b/0046-x86-shadow-tolerate-failure-in-shadow_prealloc.patch index 62be72a..4773eef 100644 --- a/0046-x86-shadow-tolerate-failure-in-shadow_prealloc.patch +++ b/0046-x86-shadow-tolerate-failure-in-shadow_prealloc.patch @@ -1,7 +1,7 @@ From 1e26afa846fb9a00b9155280eeae3b8cb8375dd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> Date: Tue, 11 Oct 2022 15:08:14 +0200 -Subject: [PATCH 46/67] x86/shadow: tolerate failure in shadow_prealloc() +Subject: [PATCH 046/126] x86/shadow: tolerate failure in shadow_prealloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -275,5 +275,5 @@ index 911db46e7399..3fe0388e7c4f 100644 u32 shadow_type, unsigned long backpointer); -- -2.37.3 +2.37.4 diff --git a/0047-x86-p2m-refuse-new-allocations-for-dying-domains.patch b/0047-x86-p2m-refuse-new-allocations-for-dying-domains.patch index c81cfab..880b68d 
100644 --- a/0047-x86-p2m-refuse-new-allocations-for-dying-domains.patch +++ b/0047-x86-p2m-refuse-new-allocations-for-dying-domains.patch @@ -1,7 +1,7 @@ From 4f9b535194f70582863f2a78f113547d8822b2b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> Date: Tue, 11 Oct 2022 15:08:28 +0200 -Subject: [PATCH 47/67] x86/p2m: refuse new allocations for dying domains +Subject: [PATCH 047/126] x86/p2m: refuse new allocations for dying domains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -96,5 +96,5 @@ index fc4f7f78ce43..9ad7e5a88650 100644 * paging lock) and the log-dirty code (which always does). */ paging_lock_recursive(d); -- -2.37.3 +2.37.4 diff --git a/0048-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch b/0048-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch index c3d5a2c..280b6d8 100644 --- a/0048-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch +++ b/0048-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch @@ -1,7 +1,7 @@ From 7f055b011a657f8f16b0df242301efb312058eea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> Date: Tue, 11 Oct 2022 15:08:42 +0200 -Subject: [PATCH 48/67] x86/p2m: truly free paging pool memory for dying +Subject: [PATCH 048/126] x86/p2m: truly free paging pool memory for dying domains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 @@ -111,5 +111,5 @@ index 9ad7e5a88650..366956c146aa 100644 paging_unlock(d); } -- -2.37.3 +2.37.4 diff --git a/0049-x86-p2m-free-the-paging-memory-pool-preemptively.patch b/0049-x86-p2m-free-the-paging-memory-pool-preemptively.patch index 83502a6..aef6a24 100644 --- a/0049-x86-p2m-free-the-paging-memory-pool-preemptively.patch +++ b/0049-x86-p2m-free-the-paging-memory-pool-preemptively.patch @@ -1,7 +1,7 @@ From 686c920fa9389fe2b6b619643024ed98b4b7d51f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= 
<roger.pau@citrix.com> Date: Tue, 11 Oct 2022 15:08:58 +0200 -Subject: [PATCH 49/67] x86/p2m: free the paging memory pool preemptively +Subject: [PATCH 049/126] x86/p2m: free the paging memory pool preemptively MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -177,5 +177,5 @@ index 366956c146aa..680766fd5170 100644 } -- -2.37.3 +2.37.4 diff --git a/0050-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch b/0050-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch index 23e10ba..8ab565d 100644 --- a/0050-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch +++ b/0050-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch @@ -1,7 +1,7 @@ From b03074bb47d10c9373688b3661c7c31da01c21a3 Mon Sep 17 00:00:00 2001 From: Julien Grall <jgrall@amazon.com> Date: Tue, 11 Oct 2022 15:09:12 +0200 -Subject: [PATCH 50/67] xen/x86: p2m: Add preemption in p2m_teardown() +Subject: [PATCH 050/126] xen/x86: p2m: Add preemption in p2m_teardown() The list p2m->pages contain all the pages used by the P2M. 
On large instance this can be quite large and the time spent to call @@ -193,5 +193,5 @@ index 46eb51d44cf5..edbe4cee2717 100644 /* Add a page to a domain's p2m table */ -- -2.37.3 +2.37.4 diff --git a/0051-libxl-docs-Use-arch-specific-default-paging-memory.patch b/0051-libxl-docs-Use-arch-specific-default-paging-memory.patch index f3bded4..4ec35bf 100644 --- a/0051-libxl-docs-Use-arch-specific-default-paging-memory.patch +++ b/0051-libxl-docs-Use-arch-specific-default-paging-memory.patch @@ -1,7 +1,7 @@ From 0c0680d6e7953ca4c91699e60060c732f9ead5c1 Mon Sep 17 00:00:00 2001 From: Henry Wang <Henry.Wang@arm.com> Date: Tue, 11 Oct 2022 15:09:32 +0200 -Subject: [PATCH 51/67] libxl, docs: Use arch-specific default paging memory +Subject: [PATCH 051/126] libxl, docs: Use arch-specific default paging memory The default paging memory (descibed in `shadow_memory` entry in xl config) in libxl is used to determine the memory pool size for xl @@ -143,5 +143,5 @@ index 18c3c77ccde3..4d66478fe9dd 100644 * Local variables: * mode: C -- -2.37.3 +2.37.4 diff --git a/0052-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch b/0052-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch index 77093a7..a17ad53 100644 --- a/0052-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch +++ b/0052-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch @@ -1,7 +1,7 @@ From 45336d8f88725aec65ee177b1b09abf6eef1dc8d Mon Sep 17 00:00:00 2001 From: Henry Wang <Henry.Wang@arm.com> Date: Tue, 11 Oct 2022 15:09:58 +0200 -Subject: [PATCH 52/67] xen/arm: Construct the P2M pages pool for guests +Subject: [PATCH 052/126] xen/arm: Construct the P2M pages pool for guests This commit constructs the p2m pages pool for guests from the data structure and helper perspective. 
@@ -185,5 +185,5 @@ index 3a2d51b35d71..18675b234570 100644 { write_lock(&p2m->lock); -- -2.37.3 +2.37.4 diff --git a/0053-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch b/0053-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch index 52ce67c..c4e543d 100644 --- a/0053-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch +++ b/0053-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch @@ -1,7 +1,8 @@ From c5215044578e88b401a1296ed6302df05c113c5f Mon Sep 17 00:00:00 2001 From: Henry Wang <Henry.Wang@arm.com> Date: Tue, 11 Oct 2022 15:10:16 +0200 -Subject: [PATCH 53/67] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm +Subject: [PATCH 053/126] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for + Arm This commit implements the `XEN_DOMCTL_shadow_op` support in Xen for Arm. The p2m pages pool size for xl guests is supposed to be @@ -104,5 +105,5 @@ index a8c48b0beaab..a049bc7f3e52 100644 { gfn_t s = _gfn(domctl->u.cacheflush.start_pfn); -- -2.37.3 +2.37.4 diff --git a/0054-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch b/0054-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch index 3ef7019..78ce712 100644 --- a/0054-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch +++ b/0054-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch @@ -1,7 +1,8 @@ From 7ad38a39f08aadc1578bdb46ccabaad79ed0faee Mon Sep 17 00:00:00 2001 From: Henry Wang <Henry.Wang@arm.com> Date: Tue, 11 Oct 2022 15:10:34 +0200 -Subject: [PATCH 54/67] xen/arm: Allocate and free P2M pages from the P2M pool +Subject: [PATCH 054/126] xen/arm: Allocate and free P2M pages from the P2M + pool This commit sets/tearsdown of p2m pages pool for non-privileged Arm guests by calling `p2m_set_allocation` and `p2m_teardown_allocation`. 
@@ -285,5 +286,5 @@ index 6883d8627702..c1055ff2a745 100644 if ( p2m->root ) free_domheap_pages(p2m->root, P2M_ROOT_ORDER); -- -2.37.3 +2.37.4 diff --git a/0055-gnttab-correct-locking-on-transitive-grant-copy-erro.patch b/0055-gnttab-correct-locking-on-transitive-grant-copy-erro.patch index be83ce5..5b8a7ea 100644 --- a/0055-gnttab-correct-locking-on-transitive-grant-copy-erro.patch +++ b/0055-gnttab-correct-locking-on-transitive-grant-copy-erro.patch @@ -1,8 +1,8 @@ From bb43a10fefe494ab747b020fef3e823b63fc566d Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Tue, 11 Oct 2022 15:11:01 +0200 -Subject: [PATCH 55/67] gnttab: correct locking on transitive grant copy error - path +Subject: [PATCH 055/126] gnttab: correct locking on transitive grant copy + error path While the comment next to the lock dropping in preparation of recursively calling acquire_grant_for_copy() mistakenly talks about the @@ -62,5 +62,5 @@ index 77bba9806937..0523beb9b734 100644 *page = NULL; return ERESTART; -- -2.37.3 +2.37.4 diff --git a/0056-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch b/0056-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch index c5d2c9c..80a1923 100644 --- a/0056-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch +++ b/0056-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch @@ -1,7 +1,7 @@ From d65ebacb78901b695bc5e8a075ad1ad865a78928 Mon Sep 17 00:00:00 2001 From: Anthony PERARD <anthony.perard@citrix.com> Date: Tue, 11 Oct 2022 15:13:15 +0200 -Subject: [PATCH 56/67] tools/libxl: Replace deprecated -soundhw on QEMU +Subject: [PATCH 056/126] tools/libxl: Replace deprecated -soundhw on QEMU command line -soundhw is deprecated since 825ff02911c9 ("audio: add soundhw @@ -108,5 +108,5 @@ index 3593e21dbb64..caa08d3229cd 100644 + (7, "sb16"), + ]) -- -2.37.3 +2.37.4 diff --git a/0057-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch b/0057-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch index 
9b1cce8..2949fb0 100644 --- a/0057-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch +++ b/0057-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch @@ -1,7 +1,7 @@ From 7923ea47e578bca30a6e45951a9da09e827ff028 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Tue, 11 Oct 2022 15:14:05 +0200 -Subject: [PATCH 57/67] x86/CPUID: surface suitable value in EBX of XSTATE +Subject: [PATCH 057/126] x86/CPUID: surface suitable value in EBX of XSTATE subleaf 1 While the SDM isn't very clear about this, our present behavior make @@ -40,5 +40,5 @@ index ee2c4ea03a89..11c95178f110 100644 /* * TODO: Figure out what to do for XSS state. VT-x manages -- -2.37.3 +2.37.4 diff --git a/0058-xen-sched-introduce-cpupool_update_node_affinity.patch b/0058-xen-sched-introduce-cpupool_update_node_affinity.patch index c15edb8..c2cf0b8 100644 --- a/0058-xen-sched-introduce-cpupool_update_node_affinity.patch +++ b/0058-xen-sched-introduce-cpupool_update_node_affinity.patch @@ -1,7 +1,7 @@ From 735b10844489babf52d3193193285a7311cf2c39 Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Tue, 11 Oct 2022 15:14:22 +0200 -Subject: [PATCH 58/67] xen/sched: introduce cpupool_update_node_affinity() +Subject: [PATCH 058/126] xen/sched: introduce cpupool_update_node_affinity() For updating the node affinities of all domains in a cpupool add a new function cpupool_update_node_affinity(). 
@@ -253,5 +253,5 @@ index 701963f84cb8..4e25627d9685 100644 /* * To be implemented by each architecture, sanity checking the configuration -- -2.37.3 +2.37.4 diff --git a/0059-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch b/0059-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch index 587eef7..7e81f53 100644 --- a/0059-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch +++ b/0059-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch @@ -1,8 +1,8 @@ From d638c2085f71f694344b34e70eb1b371c86b00f0 Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Tue, 11 Oct 2022 15:15:14 +0200 -Subject: [PATCH 59/67] xen/sched: carve out memory allocation and freeing from - schedule_cpu_rm() +Subject: [PATCH 059/126] xen/sched: carve out memory allocation and freeing + from schedule_cpu_rm() In order to prepare not allocating or freeing memory from schedule_cpu_rm(), move this functionality to dedicated functions. @@ -259,5 +259,5 @@ index 6e036f8c8077..ff3185425219 100644 int sched_move_domain(struct domain *d, struct cpupool *c); struct cpupool *cpupool_get_by_id(unsigned int poolid); -- -2.37.3 +2.37.4 diff --git a/0060-xen-sched-fix-cpu-hotplug.patch b/0060-xen-sched-fix-cpu-hotplug.patch index 3e158f4..264c8ef 100644 --- a/0060-xen-sched-fix-cpu-hotplug.patch +++ b/0060-xen-sched-fix-cpu-hotplug.patch @@ -1,7 +1,7 @@ From d17680808b4c8015e31070c971e1ee548170ae34 Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Tue, 11 Oct 2022 15:15:41 +0200 -Subject: [PATCH 60/67] xen/sched: fix cpu hotplug +Subject: [PATCH 060/126] xen/sched: fix cpu hotplug Cpu unplugging is calling schedule_cpu_rm() via stop_machine_run() with interrupts disabled, thus any memory allocation or freeing must be @@ -303,5 +303,5 @@ index ff3185425219..3bab78ccb240 100644 struct cpupool *cpupool_get_by_id(unsigned int poolid); void cpupool_put(struct cpupool *pool); -- -2.37.3 +2.37.4 diff --git 
a/0061-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch b/0061-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch index 0f044b2..64144fe 100644 --- a/0061-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch +++ b/0061-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch @@ -1,7 +1,7 @@ From 19cf28b515f21da02df80e68f901ad7650daaa37 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Tue, 11 Oct 2022 15:15:55 +0200 -Subject: [PATCH 61/67] Config.mk: correct PIE-related option(s) in +Subject: [PATCH 061/126] Config.mk: correct PIE-related option(s) in EMBEDDED_EXTRA_CFLAGS I haven't been able to find evidence of "-nopie" ever having been a @@ -54,5 +54,5 @@ index 96d89b2f7dfc..9f87608f6602 100644 XEN_EXTFILES_URL ?= http://xenbits.xen.org/xen-extfiles -- -2.37.3 +2.37.4 diff --git a/0062-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch b/0062-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch index 65882a9..c2299bf 100644 --- a/0062-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch +++ b/0062-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch @@ -1,7 +1,7 @@ From 182f8bb503b9dd3db5dd9118dc763d241787c6fc Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Tue, 11 Oct 2022 15:16:09 +0200 -Subject: [PATCH 62/67] tools/xenstore: minor fix of the migration stream doc +Subject: [PATCH 062/126] tools/xenstore: minor fix of the migration stream doc Drop mentioning the non-existent read-only socket in the migration stream description document. 
@@ -37,5 +37,5 @@ index 5f1155273ec3..78530bbb0ef4 100644 \pagebreak -- -2.37.3 +2.37.4 diff --git a/0063-xen-gnttab-fix-gnttab_acquire_resource.patch b/0063-xen-gnttab-fix-gnttab_acquire_resource.patch index 0d58157..9087ddb 100644 --- a/0063-xen-gnttab-fix-gnttab_acquire_resource.patch +++ b/0063-xen-gnttab-fix-gnttab_acquire_resource.patch @@ -1,7 +1,7 @@ From 3ac64b3751837a117ee3dfb3e2cc27057a83d0f7 Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Tue, 11 Oct 2022 15:16:53 +0200 -Subject: [PATCH 63/67] xen/gnttab: fix gnttab_acquire_resource() +Subject: [PATCH 063/126] xen/gnttab: fix gnttab_acquire_resource() Commit 9dc46386d89d ("gnttab: work around "may be used uninitialized" warning") was wrong, as vaddrs can legitimately be NULL in case @@ -65,5 +65,5 @@ index 0523beb9b734..01e426c67fb6 100644 ASSERT_UNREACHABLE(); rc = -ENODATA; -- -2.37.3 +2.37.4 diff --git a/0064-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch b/0064-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch index 4246b01..738df82 100644 --- a/0064-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch +++ b/0064-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch @@ -1,8 +1,8 @@ From 62e534d17cdd838828bfd75d3d845e31524dd336 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Tue, 11 Oct 2022 15:17:12 +0200 -Subject: [PATCH 64/67] x86: wire up VCPUOP_register_vcpu_time_memory_area for - 32-bit guests +Subject: [PATCH 064/126] x86: wire up VCPUOP_register_vcpu_time_memory_area + for 32-bit guests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -55,5 +55,5 @@ index c46dccc25a54..d51d99344796 100644 rc = arch_do_vcpu_op(cmd, v, arg); break; -- -2.37.3 +2.37.4 diff --git a/0065-x86-vpmu-Fix-race-condition-in-vpmu_load.patch b/0065-x86-vpmu-Fix-race-condition-in-vpmu_load.patch index df4fb38..84edf5d 100644 --- a/0065-x86-vpmu-Fix-race-condition-in-vpmu_load.patch +++ 
b/0065-x86-vpmu-Fix-race-condition-in-vpmu_load.patch @@ -1,7 +1,7 @@ From 9690bb261d5fa09cb281e1fa124d93db7b84fda5 Mon Sep 17 00:00:00 2001 From: Tamas K Lengyel <tamas.lengyel@intel.com> Date: Tue, 11 Oct 2022 15:17:42 +0200 -Subject: [PATCH 65/67] x86/vpmu: Fix race-condition in vpmu_load +Subject: [PATCH 065/126] x86/vpmu: Fix race-condition in vpmu_load The vPMU code-bases attempts to perform an optimization on saving/reloading the PMU context by keeping track of what vCPU ran on each pCPU. When a pCPU is @@ -93,5 +93,5 @@ index fb1b296a6cc1..800eff87dc03 100644 if ( !vpmu_is_set(vpmu, VPMU_RUNNING) || (!has_vlapic(vpmu_vcpu(vpmu)->domain) && -- -2.37.3 +2.37.4 diff --git a/0066-tools-tests-fix-wrong-backport-of-upstream-commit-52.patch b/0066-tools-tests-fix-wrong-backport-of-upstream-commit-52.patch index 24b9576..8578e02 100644 --- a/0066-tools-tests-fix-wrong-backport-of-upstream-commit-52.patch +++ b/0066-tools-tests-fix-wrong-backport-of-upstream-commit-52.patch @@ -1,7 +1,7 @@ From 0d233924d4b0f676056856096e8761205add3ee8 Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Wed, 12 Oct 2022 17:31:44 +0200 -Subject: [PATCH 66/67] tools/tests: fix wrong backport of upstream commit +Subject: [PATCH 066/126] tools/tests: fix wrong backport of upstream commit 52daa6a8483e4 The backport of upstream commit 52daa6a8483e4 had a bug, correct it. 
@@ -27,5 +27,5 @@ index bf485baff2b4..51a8f4a000f6 100644 if ( res ) { -- -2.37.3 +2.37.4 diff --git a/0067-libxl-Arm-correct-xc_shadow_control-invocation-to-fi.patch b/0067-libxl-Arm-correct-xc_shadow_control-invocation-to-fi.patch index 309d486..6e75a84 100644 --- a/0067-libxl-Arm-correct-xc_shadow_control-invocation-to-fi.patch +++ b/0067-libxl-Arm-correct-xc_shadow_control-invocation-to-fi.patch @@ -1,7 +1,7 @@ From 816580afdd1730d4f85f64477a242a439af1cdf8 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> Date: Wed, 12 Oct 2022 17:33:40 +0200 -Subject: [PATCH 67/67] libxl/Arm: correct xc_shadow_control() invocation to +Subject: [PATCH 067/126] libxl/Arm: correct xc_shadow_control() invocation to fix build The backport didn't adapt to the earlier function prototype taking more @@ -38,5 +38,5 @@ index d21f614ed788..ba548befdd25 100644 } -- -2.37.3 +2.37.4 diff --git a/0068-arm-p2m-Rework-p2m_init.patch b/0068-arm-p2m-Rework-p2m_init.patch new file mode 100644 index 0000000..cc80d52 --- /dev/null +++ b/0068-arm-p2m-Rework-p2m_init.patch @@ -0,0 +1,88 @@ +From 6f948fd1929c01b82a119f03670cab38ffebb47e Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Tue, 25 Oct 2022 09:21:11 +0000 +Subject: [PATCH 068/126] arm/p2m: Rework p2m_init() + +p2m_init() is mostly trivial initialisation, but has two fallible operations +which are on either side of the backpointer trigger for teardown to take +actions. + +p2m_free_vmid() is idempotent with a failed p2m_alloc_vmid(), so rearrange +p2m_init() to perform all trivial setup, then set the backpointer, then +perform all fallible setup. + +This will simplify a future bugfix which needs to add a third fallible +operation. + +No practical change. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +Reviewed-by: Bertrand Marquis <bertrand.marquis@arm.com> +(cherry picked from commit: 3783e583319fa1ce75e414d851f0fde191a14753) +--- + xen/arch/arm/p2m.c | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index c1055ff2a745..25eb1d84cbc1 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -1733,7 +1733,7 @@ void p2m_final_teardown(struct domain *d) + int p2m_init(struct domain *d) + { + struct p2m_domain *p2m = p2m_get_hostp2m(d); +- int rc = 0; ++ int rc; + unsigned int cpu; + + rwlock_init(&p2m->lock); +@@ -1742,11 +1742,6 @@ int p2m_init(struct domain *d) + INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist); + + p2m->vmid = INVALID_VMID; +- +- rc = p2m_alloc_vmid(d); +- if ( rc != 0 ) +- return rc; +- + p2m->max_mapped_gfn = _gfn(0); + p2m->lowest_mapped_gfn = _gfn(ULONG_MAX); + +@@ -1762,8 +1757,6 @@ int p2m_init(struct domain *d) + p2m->clean_pte = is_iommu_enabled(d) && + !iommu_has_feature(d, IOMMU_FEAT_COHERENT_WALK); + +- rc = p2m_alloc_table(d); +- + /* + * Make sure that the type chosen to is able to store the an vCPU ID + * between 0 and the maximum of virtual CPUS supported as long as +@@ -1776,13 +1769,20 @@ int p2m_init(struct domain *d) + p2m->last_vcpu_ran[cpu] = INVALID_VCPU_ID; + + /* +- * Besides getting a domain when we only have the p2m in hand, +- * the back pointer to domain is also used in p2m_teardown() +- * as an end-of-initialization indicator. ++ * "Trivial" initialisation is now complete. Set the backpointer so ++ * p2m_teardown() and friends know to do something. 
+ */ + p2m->domain = d; + +- return rc; ++ rc = p2m_alloc_vmid(d); ++ if ( rc ) ++ return rc; ++ ++ rc = p2m_alloc_table(d); ++ if ( rc ) ++ return rc; ++ ++ return 0; + } + + /* +-- +2.37.4 + diff --git a/0069-xen-arm-p2m-Populate-pages-for-GICv2-mapping-in-p2m_.patch b/0069-xen-arm-p2m-Populate-pages-for-GICv2-mapping-in-p2m_.patch new file mode 100644 index 0000000..67cdb7a --- /dev/null +++ b/0069-xen-arm-p2m-Populate-pages-for-GICv2-mapping-in-p2m_.patch @@ -0,0 +1,169 @@ +From f8915cd5dbe0f51e9bb31a54fe40600b839dd707 Mon Sep 17 00:00:00 2001 +From: Henry Wang <Henry.Wang@arm.com> +Date: Tue, 25 Oct 2022 09:21:12 +0000 +Subject: [PATCH 069/126] xen/arm: p2m: Populate pages for GICv2 mapping in + p2m_init() + +Hardware using GICv2 needs to create a P2M mapping of 8KB GICv2 area +when the domain is created. Considering the worst case of page tables +which requires 6 P2M pages as the two pages will be consecutive but not +necessarily in the same L3 page table and keep a buffer, populate 16 +pages as the default value to the P2M pages pool in p2m_init() at the +domain creation stage to satisfy the GICv2 requirement. For GICv3, the +above-mentioned P2M mapping is not necessary, but since the allocated +16 pages here would not be lost, hence populate these pages +unconditionally. + +With the default 16 P2M pages populated, there would be a case that +failures would happen in the domain creation with P2M pages already in +use. To properly free the P2M for this case, firstly support the +optionally preemption of p2m_teardown(), then call p2m_teardown() and +p2m_set_allocation(d, 0, NULL) non-preemptively in p2m_final_teardown(). +As non-preemptive p2m_teardown() should only return 0, use a +BUG_ON to confirm that. 
+ +Since p2m_final_teardown() is called either after +domain_relinquish_resources() where relinquish_p2m_mapping() has been +called, or from failure path of domain_create()/arch_domain_create() +where mappings that require p2m_put_l3_page() should never be created, +relinquish_p2m_mapping() is not added in p2m_final_teardown(), add +in-code comments to refer this. + +Fixes: cbea5a1149ca ("xen/arm: Allocate and free P2M pages from the P2M pool") +Suggested-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Henry Wang <Henry.Wang@arm.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +Reviewed-by: Bertrand Marquis <bertrand.marquis@arm.com> +(cherry picked from commit: c7cff1188802646eaa38e918e5738da0e84949be) +--- + xen/arch/arm/domain.c | 2 +- + xen/arch/arm/p2m.c | 34 ++++++++++++++++++++++++++++++++-- + xen/include/asm-arm/p2m.h | 14 ++++++++++---- + 3 files changed, 43 insertions(+), 7 deletions(-) + +diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c +index a5ffd952ecd0..b11359b8cca3 100644 +--- a/xen/arch/arm/domain.c ++++ b/xen/arch/arm/domain.c +@@ -1041,7 +1041,7 @@ int domain_relinquish_resources(struct domain *d) + return ret; + + PROGRESS(p2m): +- ret = p2m_teardown(d); ++ ret = p2m_teardown(d, true); + if ( ret ) + return ret; + +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index 25eb1d84cbc1..f6012f2a538f 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -1664,7 +1664,7 @@ static void p2m_free_vmid(struct domain *d) + spin_unlock(&vmid_alloc_lock); + } + +-int p2m_teardown(struct domain *d) ++int p2m_teardown(struct domain *d, bool allow_preemption) + { + struct p2m_domain *p2m = p2m_get_hostp2m(d); + unsigned long count = 0; +@@ -1672,6 +1672,9 @@ int p2m_teardown(struct domain *d) + unsigned int i; + int rc = 0; + ++ if ( page_list_empty(&p2m->pages) ) ++ return 0; ++ + p2m_write_lock(p2m); + + /* +@@ -1695,7 +1698,7 @@ int p2m_teardown(struct domain *d) + p2m_free_page(p2m->domain, pg); + count++; + /* Arbitrarily 
preempt every 512 iterations */ +- if ( !(count % 512) && hypercall_preempt_check() ) ++ if ( allow_preemption && !(count % 512) && hypercall_preempt_check() ) + { + rc = -ERESTART; + break; +@@ -1715,7 +1718,20 @@ void p2m_final_teardown(struct domain *d) + if ( !p2m->domain ) + return; + ++ /* ++ * No need to call relinquish_p2m_mapping() here because ++ * p2m_final_teardown() is called either after domain_relinquish_resources() ++ * where relinquish_p2m_mapping() has been called, or from failure path of ++ * domain_create()/arch_domain_create() where mappings that require ++ * p2m_put_l3_page() should never be created. For the latter case, also see ++ * comment on top of the p2m_set_entry() for more info. ++ */ ++ ++ BUG_ON(p2m_teardown(d, false)); + ASSERT(page_list_empty(&p2m->pages)); ++ ++ while ( p2m_teardown_allocation(d) == -ERESTART ) ++ continue; /* No preemption support here */ + ASSERT(page_list_empty(&d->arch.paging.p2m_freelist)); + + if ( p2m->root ) +@@ -1782,6 +1798,20 @@ int p2m_init(struct domain *d) + if ( rc ) + return rc; + ++ /* ++ * Hardware using GICv2 needs to create a P2M mapping of 8KB GICv2 area ++ * when the domain is created. Considering the worst case for page ++ * tables and keep a buffer, populate 16 pages to the P2M pages pool here. ++ * For GICv3, the above-mentioned P2M mapping is not necessary, but since ++ * the allocated 16 pages here would not be lost, hence populate these ++ * pages unconditionally. ++ */ ++ spin_lock(&d->arch.paging.lock); ++ rc = p2m_set_allocation(d, 16, NULL); ++ spin_unlock(&d->arch.paging.lock); ++ if ( rc ) ++ return rc; ++ + return 0; + } + +diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h +index 18675b234570..ea7ca41d82b2 100644 +--- a/xen/include/asm-arm/p2m.h ++++ b/xen/include/asm-arm/p2m.h +@@ -194,14 +194,18 @@ int p2m_init(struct domain *d); + + /* + * The P2M resources are freed in two parts: +- * - p2m_teardown() will be called when relinquish the resources. 
It +- * will free large resources (e.g. intermediate page-tables) that +- * requires preemption. ++ * - p2m_teardown() will be called preemptively when relinquish the ++ * resources, in which case it will free large resources (e.g. intermediate ++ * page-tables) that requires preemption. + * - p2m_final_teardown() will be called when domain struct is been + * freed. This *cannot* be preempted and therefore one small + * resources should be freed here. ++ * Note that p2m_final_teardown() will also call p2m_teardown(), to properly ++ * free the P2M when failures happen in the domain creation with P2M pages ++ * already in use. In this case p2m_teardown() is called non-preemptively and ++ * p2m_teardown() will always return 0. + */ +-int p2m_teardown(struct domain *d); ++int p2m_teardown(struct domain *d, bool allow_preemption); + void p2m_final_teardown(struct domain *d); + + /* +@@ -266,6 +270,8 @@ mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn, + /* + * Direct set a p2m entry: only for use by the P2M code. + * The P2M write lock should be taken. ++ * TODO: Add a check in __p2m_set_entry() to avoid creating a mapping in ++ * arch_domain_create() that requires p2m_put_l3_page() to be called. 
+ */ + int p2m_set_entry(struct p2m_domain *p2m, + gfn_t sgfn, +-- +2.37.4 + diff --git a/0070-VMX-correct-error-handling-in-vmx_create_vmcs.patch b/0070-VMX-correct-error-handling-in-vmx_create_vmcs.patch new file mode 100644 index 0000000..4823c64 --- /dev/null +++ b/0070-VMX-correct-error-handling-in-vmx_create_vmcs.patch @@ -0,0 +1,38 @@ +From 3885fa42349c3c6f31f0e0eec3b4605dca7fdda9 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 31 Oct 2022 13:31:26 +0100 +Subject: [PATCH 070/126] VMX: correct error handling in vmx_create_vmcs() + +With the addition of vmx_add_msr() calls to construct_vmcs() there are +now cases where simply freeing the VMCS isn't enough: The MSR bitmap +page as well as one of the MSR area ones (if it's the 2nd vmx_add_msr() +which fails) may also need freeing. Switch to using vmx_destroy_vmcs() +instead. + +Fixes: 3bd36952dab6 ("x86/spec-ctrl: Introduce an option to control L1D_FLUSH for HVM HAP guests") +Fixes: 53a570b28569 ("x86/spec-ctrl: Support IBPB-on-entry") +Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: 448d28309f1a966bdc850aff1a637e0b79a03e43 +master date: 2022-10-12 17:57:56 +0200 +--- + xen/arch/x86/hvm/vmx/vmcs.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index dd817cee4e69..237b13459d4f 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -1831,7 +1831,7 @@ int vmx_create_vmcs(struct vcpu *v) + + if ( (rc = construct_vmcs(v)) != 0 ) + { +- vmx_free_vmcs(vmx->vmcs_pa); ++ vmx_destroy_vmcs(v); + return rc; + } + +-- +2.37.4 + diff --git a/0071-argo-Remove-reachable-ASSERT_UNREACHABLE.patch b/0071-argo-Remove-reachable-ASSERT_UNREACHABLE.patch new file mode 100644 index 0000000..d1563bd --- /dev/null +++ b/0071-argo-Remove-reachable-ASSERT_UNREACHABLE.patch @@ -0,0 +1,41 
@@ +From 916668baf9252ac30260e3394278a098712c5d34 Mon Sep 17 00:00:00 2001 +From: Jason Andryuk <jandryuk@gmail.com> +Date: Mon, 31 Oct 2022 13:32:59 +0100 +Subject: [PATCH 071/126] argo: Remove reachable ASSERT_UNREACHABLE + +I observed this ASSERT_UNREACHABLE in partner_rings_remove consistently +trip. It was in OpenXT with the viptables patch applied. + +dom10 shuts down. +dom7 is REJECTED sending to dom10. +dom7 shuts down and this ASSERT trips for dom10. + +The argo_send_info has a domid, but there is no refcount taken on +the domain. Therefore it's not appropriate to ASSERT that the domain +can be looked up via domid. Replace with a debug message. + +Signed-off-by: Jason Andryuk <jandryuk@gmail.com> +Reviewed-by: Christopher Clark <christopher.w.clark@gmail.com> +master commit: 197f612b77c5afe04e60df2100a855370d720ad7 +master date: 2022-10-14 14:45:41 +0100 +--- + xen/common/argo.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/common/argo.c b/xen/common/argo.c +index 49be715f638e..2b0d980d4bba 100644 +--- a/xen/common/argo.c ++++ b/xen/common/argo.c +@@ -1299,7 +1299,8 @@ partner_rings_remove(struct domain *src_d) + ASSERT_UNREACHABLE(); + } + else +- ASSERT_UNREACHABLE(); ++ argo_dprintk("%pd has entry for stale partner d%u\n", ++ src_d, send_info->id.domain_id); + + if ( dst_d ) + rcu_unlock_domain(dst_d); +-- +2.37.4 + diff --git a/0072-EFI-don-t-convert-memory-marked-for-runtime-use-to-o.patch b/0072-EFI-don-t-convert-memory-marked-for-runtime-use-to-o.patch new file mode 100644 index 0000000..7993482 --- /dev/null +++ b/0072-EFI-don-t-convert-memory-marked-for-runtime-use-to-o.patch @@ -0,0 +1,64 @@ +From b833014293f3fa5a7c48756ce0c8c9f3e4a666ff Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 31 Oct 2022 13:33:29 +0100 +Subject: [PATCH 072/126] EFI: don't convert memory marked for runtime use to + ordinary RAM +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 
8bit + +efi_init_memory() in both relevant places is treating EFI_MEMORY_RUNTIME +higher priority than the type of the range. To avoid accessing memory at +runtime which was re-used for other purposes, make +efi_arch_process_memory_map() follow suit. While in theory the same would +apply to EfiACPIReclaimMemory, we don't actually "reclaim" or clobber +that memory (converted to E820_ACPI on x86) there (and it would be a bug +if the Dom0 kernel tried to reclaim the range, bypassing Xen's memory +management, plus it would be at least bogus if it clobbered that space), +hence that type's handling can be left alone. + +Fixes: bf6501a62e80 ("x86-64: EFI boot code") +Fixes: facac0af87ef ("x86-64: EFI runtime code") +Fixes: 6d70ea10d49f ("Add ARM EFI boot support") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +master commit: f324300c8347b6aa6f9c0b18e0a90bbf44011a9a +master date: 2022-10-21 12:30:24 +0200 +--- + xen/arch/arm/efi/efi-boot.h | 3 ++- + xen/arch/x86/efi/efi-boot.h | 4 +++- + 2 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/arm/efi/efi-boot.h b/xen/arch/arm/efi/efi-boot.h +index cf9c37153fea..37d7ebd59ae2 100644 +--- a/xen/arch/arm/efi/efi-boot.h ++++ b/xen/arch/arm/efi/efi-boot.h +@@ -149,7 +149,8 @@ static EFI_STATUS __init efi_process_memory_map_bootinfo(EFI_MEMORY_DESCRIPTOR * + + for ( Index = 0; Index < (mmap_size / desc_size); Index++ ) + { +- if ( desc_ptr->Attribute & EFI_MEMORY_WB && ++ if ( !(desc_ptr->Attribute & EFI_MEMORY_RUNTIME) && ++ (desc_ptr->Attribute & EFI_MEMORY_WB) && + (desc_ptr->Type == EfiConventionalMemory || + desc_ptr->Type == EfiLoaderCode || + desc_ptr->Type == EfiLoaderData || +diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h +index 84fd77931456..3c3b3ab936f4 100644 +--- a/xen/arch/x86/efi/efi-boot.h ++++ b/xen/arch/x86/efi/efi-boot.h +@@ -183,7 +183,9 @@ static void __init 
efi_arch_process_memory_map(EFI_SYSTEM_TABLE *SystemTable, + /* fall through */ + case EfiLoaderCode: + case EfiLoaderData: +- if ( desc->Attribute & EFI_MEMORY_WB ) ++ if ( desc->Attribute & EFI_MEMORY_RUNTIME ) ++ type = E820_RESERVED; ++ else if ( desc->Attribute & EFI_MEMORY_WB ) + type = E820_RAM; + else + case EfiUnusableMemory: +-- +2.37.4 + diff --git a/0073-xen-sched-fix-race-in-RTDS-scheduler.patch b/0073-xen-sched-fix-race-in-RTDS-scheduler.patch new file mode 100644 index 0000000..bb456ca --- /dev/null +++ b/0073-xen-sched-fix-race-in-RTDS-scheduler.patch @@ -0,0 +1,42 @@ +From 1f679f084fef76810762ee69a584fc1b524be0b6 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Mon, 31 Oct 2022 13:33:59 +0100 +Subject: [PATCH 073/126] xen/sched: fix race in RTDS scheduler + +When a domain gets paused the unit runnable state can change to "not +runnable" without the scheduling lock being involved. This means that +a specific scheduler isn't involved in this change of runnable state. + +In the RTDS scheduler this can result in an inconsistency in case a +unit is losing its "runnable" capability while the RTDS scheduler's +scheduling function is active. RTDS will remove the unit from the run +queue, but doesn't do so for the replenish queue, leading to hitting +an ASSERT() in replq_insert() later when the domain is unpaused again. + +Fix that by removing the unit from the replenish queue as well in this +case. 
+ +Fixes: 7c7b407e7772 ("xen/sched: introduce unit_runnable_state()") +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Dario Faggioli <dfaggioli@suse.com> +master commit: 73c62927f64ecb48f27d06176befdf76b879f340 +master date: 2022-10-21 12:32:23 +0200 +--- + xen/common/sched/rt.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/xen/common/sched/rt.c b/xen/common/sched/rt.c +index c24cd2ac3200..ec2ca1bebc26 100644 +--- a/xen/common/sched/rt.c ++++ b/xen/common/sched/rt.c +@@ -1087,6 +1087,7 @@ rt_schedule(const struct scheduler *ops, struct sched_unit *currunit, + else if ( !unit_runnable_state(snext->unit) ) + { + q_remove(snext); ++ replq_remove(ops, snext); + snext = rt_unit(sched_idle_unit(sched_cpu)); + } + +-- +2.37.4 + diff --git a/0074-xen-sched-fix-restore_vcpu_affinity-by-removing-it.patch b/0074-xen-sched-fix-restore_vcpu_affinity-by-removing-it.patch new file mode 100644 index 0000000..9085f67 --- /dev/null +++ b/0074-xen-sched-fix-restore_vcpu_affinity-by-removing-it.patch @@ -0,0 +1,158 @@ +From 9c5114696c6f7773b7f3691f27aaa7a0636c916d Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Mon, 31 Oct 2022 13:34:28 +0100 +Subject: [PATCH 074/126] xen/sched: fix restore_vcpu_affinity() by removing it +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When the system is coming up after having been suspended, +restore_vcpu_affinity() is called for each domain in order to adjust +the vcpu's affinity settings in case a cpu didn't come to life again. + +The way restore_vcpu_affinity() is doing that is wrong, because the +specific scheduler isn't being informed about a possible migration of +the vcpu to another cpu. Additionally the migration is often even +happening if all cpus are running again, as it is done without check +whether it is really needed. 
+ +As cpupool management is already calling cpu_disable_scheduler() for +cpus not having come up again, and cpu_disable_scheduler() is taking +care of eventually needed vcpu migration in the proper way, there is +simply no need for restore_vcpu_affinity(). + +So just remove restore_vcpu_affinity() completely, together with the +no longer used sched_reset_affinity_broken(). + +Fixes: 8a04eaa8ea83 ("xen/sched: move some per-vcpu items to struct sched_unit") +Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Dario Faggioli <dfaggioli@suse.com> +Tested-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> +master commit: fce1f381f7388daaa3e96dbb0d67d7a3e4bb2d2d +master date: 2022-10-24 11:16:27 +0100 +--- + xen/arch/x86/acpi/power.c | 3 -- + xen/common/sched/core.c | 78 --------------------------------------- + xen/include/xen/sched.h | 1 - + 3 files changed, 82 deletions(-) + +diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c +index dd397f713067..1a7baeebe6d0 100644 +--- a/xen/arch/x86/acpi/power.c ++++ b/xen/arch/x86/acpi/power.c +@@ -159,10 +159,7 @@ static void thaw_domains(void) + + rcu_read_lock(&domlist_read_lock); + for_each_domain ( d ) +- { +- restore_vcpu_affinity(d); + domain_unpause(d); +- } + rcu_read_unlock(&domlist_read_lock); + } + +diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c +index 900aab8f66a7..9173cf690c72 100644 +--- a/xen/common/sched/core.c ++++ b/xen/common/sched/core.c +@@ -1188,84 +1188,6 @@ static bool sched_check_affinity_broken(const struct sched_unit *unit) + return false; + } + +-static void sched_reset_affinity_broken(const struct sched_unit *unit) +-{ +- struct vcpu *v; +- +- for_each_sched_unit_vcpu ( unit, v ) +- v->affinity_broken = false; +-} +- +-void restore_vcpu_affinity(struct domain *d) +-{ +- unsigned int cpu = smp_processor_id(); +- struct sched_unit *unit; +- +- ASSERT(system_state == 
SYS_STATE_resume); +- +- rcu_read_lock(&sched_res_rculock); +- +- for_each_sched_unit ( d, unit ) +- { +- spinlock_t *lock; +- unsigned int old_cpu = sched_unit_master(unit); +- struct sched_resource *res; +- +- ASSERT(!unit_runnable(unit)); +- +- /* +- * Re-assign the initial processor as after resume we have no +- * guarantee the old processor has come back to life again. +- * +- * Therefore, here, before actually unpausing the domains, we should +- * set v->processor of each of their vCPUs to something that will +- * make sense for the scheduler of the cpupool in which they are in. +- */ +- lock = unit_schedule_lock_irq(unit); +- +- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, +- cpupool_domain_master_cpumask(d)); +- if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) +- { +- if ( sched_check_affinity_broken(unit) ) +- { +- sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL); +- sched_reset_affinity_broken(unit); +- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, +- cpupool_domain_master_cpumask(d)); +- } +- +- if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) +- { +- /* Affinity settings of one vcpu are for the complete unit. */ +- printk(XENLOG_DEBUG "Breaking affinity for %pv\n", +- unit->vcpu_list); +- sched_set_affinity(unit, &cpumask_all, NULL); +- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, +- cpupool_domain_master_cpumask(d)); +- } +- } +- +- res = get_sched_res(cpumask_any(cpumask_scratch_cpu(cpu))); +- sched_set_res(unit, res); +- +- spin_unlock_irq(lock); +- +- /* v->processor might have changed, so reacquire the lock. 
*/ +- lock = unit_schedule_lock_irq(unit); +- res = sched_pick_resource(unit_scheduler(unit), unit); +- sched_set_res(unit, res); +- spin_unlock_irq(lock); +- +- if ( old_cpu != sched_unit_master(unit) ) +- sched_move_irqs(unit); +- } +- +- rcu_read_unlock(&sched_res_rculock); +- +- domain_update_node_affinity(d); +-} +- + /* + * This function is used by cpu_hotplug code via cpu notifier chain + * and from cpupools to switch schedulers on a cpu. +diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h +index 4e25627d9685..bb05d167ae0f 100644 +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -993,7 +993,6 @@ void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value); + void sched_setup_dom0_vcpus(struct domain *d); + int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason); + int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity); +-void restore_vcpu_affinity(struct domain *d); + int vcpu_affinity_domctl(struct domain *d, uint32_t cmd, + struct xen_domctl_vcpuaffinity *vcpuaff); + +-- +2.37.4 + diff --git a/0075-x86-shadow-drop-replace-bogus-assertions.patch b/0075-x86-shadow-drop-replace-bogus-assertions.patch new file mode 100644 index 0000000..183dc68 --- /dev/null +++ b/0075-x86-shadow-drop-replace-bogus-assertions.patch @@ -0,0 +1,71 @@ +From 08bc78b4eecaef33250038f7e484bdf01ea1017c Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 31 Oct 2022 13:35:06 +0100 +Subject: [PATCH 075/126] x86/shadow: drop (replace) bogus assertions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The addition of a call to shadow_blow_tables() from shadow_teardown() +has resulted in the "no vcpus" related assertion becoming triggerable: +If domain_create() fails with at least one page successfully allocated +in the course of shadow_enable(), or if domain_create() succeeds and +the domain is then killed without ever invoking XEN_DOMCTL_max_vcpus. 
+Note that in-tree tests (test-resource and test-tsx) do exactly the +latter of these two. + +The assertion's comment was bogus anyway: Shadow mode has been getting +enabled before allocation of vCPU-s for quite some time. Convert the +assertion to a conditional: As long as there are no vCPU-s, there's +nothing to blow away. + +Fixes: e7aa55c0aab3 ("x86/p2m: free the paging memory pool preemptively") +Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> + +A similar assertion/comment pair exists in _shadow_prealloc(); the +comment is similarly bogus, and the assertion could in principle trigger +e.g. when shadow_alloc_p2m_page() is called early enough. Replace those +at the same time by a similar early return, here indicating failure to +the caller (which will generally lead to the domain being crashed in +shadow_prealloc()). + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Roger Pau Monné <roger.pau@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: a92dc2bb30ba65ae25d2f417677eb7ef9a6a0fef +master date: 2022-10-24 15:46:11 +0200 +--- + xen/arch/x86/mm/shadow/common.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 8f7fddcee1e5..e36d49d1fcba 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -942,8 +942,9 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) + /* No reclaim when the domain is dying, teardown will take care of it. */ + return false; + +- /* Shouldn't have enabled shadows if we've no vcpus. */ +- ASSERT(d->vcpu && d->vcpu[0]); ++ /* Nothing to reclaim when there are no vcpus yet. 
*/ ++ if ( !d->vcpu[0] ) ++ return false; + + /* Stage one: walk the list of pinned pages, unpinning them */ + perfc_incr(shadow_prealloc_1); +@@ -1033,8 +1034,9 @@ void shadow_blow_tables(struct domain *d) + mfn_t smfn; + int i; + +- /* Shouldn't have enabled shadows if we've no vcpus. */ +- ASSERT(d->vcpu && d->vcpu[0]); ++ /* Nothing to do when there are no vcpus yet. */ ++ if ( !d->vcpu[0] ) ++ return; + + /* Pass one: unpin all pinned pages */ + foreach_pinned_shadow(d, sp, t) +-- +2.37.4 + diff --git a/0076-vpci-don-t-assume-that-vpci-per-device-data-exists-u.patch b/0076-vpci-don-t-assume-that-vpci-per-device-data-exists-u.patch new file mode 100644 index 0000000..0350771 --- /dev/null +++ b/0076-vpci-don-t-assume-that-vpci-per-device-data-exists-u.patch @@ -0,0 +1,61 @@ +From 6b035f4f5829eb213cb9fcbe83b5dfae05c857a6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Mon, 31 Oct 2022 13:35:33 +0100 +Subject: [PATCH 076/126] vpci: don't assume that vpci per-device data exists + unconditionally +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +It's possible for a device to be assigned to a domain but have no +vpci structure if vpci_process_pending() failed and called +vpci_remove_device() as a result. The unconditional accesses done by +vpci_{read,write}() and vpci_remove_device() to pdev->vpci would +then trigger a NULL pointer dereference. + +Add checks for pdev->vpci presence in the affected functions. 
+ +Fixes: 9c244fdef7 ('vpci: add header handlers') +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 6ccb5e308ceeb895fbccd87a528a8bd24325aa39 +master date: 2022-10-26 14:55:30 +0200 +--- + xen/drivers/vpci/vpci.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c +index a27c9e600df1..6b90e4fa32dc 100644 +--- a/xen/drivers/vpci/vpci.c ++++ b/xen/drivers/vpci/vpci.c +@@ -37,6 +37,9 @@ extern vpci_register_init_t *const __end_vpci_array[]; + + void vpci_remove_device(struct pci_dev *pdev) + { ++ if ( !pdev->vpci ) ++ return; ++ + spin_lock(&pdev->vpci->lock); + while ( !list_empty(&pdev->vpci->handlers) ) + { +@@ -320,7 +323,7 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size) + + /* Find the PCI dev matching the address. */ + pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn); +- if ( !pdev ) ++ if ( !pdev || !pdev->vpci ) + return vpci_read_hw(sbdf, reg, size); + + spin_lock(&pdev->vpci->lock); +@@ -430,7 +433,7 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, + * Passthrough everything that's not trapped. 
+ */ + pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn); +- if ( !pdev ) ++ if ( !pdev || !pdev->vpci ) + { + vpci_write_hw(sbdf, reg, size, data); + return; +-- +2.37.4 + diff --git a/0077-vpci-msix-remove-from-table-list-on-detach.patch b/0077-vpci-msix-remove-from-table-list-on-detach.patch new file mode 100644 index 0000000..2e60831 --- /dev/null +++ b/0077-vpci-msix-remove-from-table-list-on-detach.patch @@ -0,0 +1,47 @@ +From bff4c4457950abb498270d921d728f654876f944 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Mon, 31 Oct 2022 13:35:59 +0100 +Subject: [PATCH 077/126] vpci/msix: remove from table list on detach +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Teardown of MSIX vPCI related data doesn't currently remove the MSIX +device data from the list of MSIX tables handled by the domain, +leading to a use-after-free of the data in the msix structure. + +Remove the structure from the list before freeing in order to solve +it. 
+ +Reported-by: Jan Beulich <jbeulich@suse.com> +Fixes: d6281be9d0 ('vpci/msix: add MSI-X handlers') +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: c14aea137eab29eb9c30bfad745a00c65ad21066 +master date: 2022-10-26 14:56:58 +0200 +--- + xen/drivers/vpci/vpci.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c +index 6b90e4fa32dc..75edbbee4025 100644 +--- a/xen/drivers/vpci/vpci.c ++++ b/xen/drivers/vpci/vpci.c +@@ -51,8 +51,12 @@ void vpci_remove_device(struct pci_dev *pdev) + xfree(r); + } + spin_unlock(&pdev->vpci->lock); +- if ( pdev->vpci->msix && pdev->vpci->msix->pba ) +- iounmap(pdev->vpci->msix->pba); ++ if ( pdev->vpci->msix ) ++ { ++ list_del(&pdev->vpci->msix->next); ++ if ( pdev->vpci->msix->pba ) ++ iounmap(pdev->vpci->msix->pba); ++ } + xfree(pdev->vpci->msix); + xfree(pdev->vpci->msi); + xfree(pdev->vpci); +-- +2.37.4 + diff --git a/0078-x86-also-zap-secondary-time-area-handles-during-soft.patch b/0078-x86-also-zap-secondary-time-area-handles-during-soft.patch new file mode 100644 index 0000000..e3db6ad --- /dev/null +++ b/0078-x86-also-zap-secondary-time-area-handles-during-soft.patch @@ -0,0 +1,49 @@ +From 9b8b65c827169eca2d0e500150009ac0f857d455 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 31 Oct 2022 13:36:25 +0100 +Subject: [PATCH 078/126] x86: also zap secondary time area handles during soft + reset +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Just like domain_soft_reset() properly zaps runstate area handles, the +secondary time area ones also need discarding to prevent guest memory +corruption once the guest is re-started. 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: b80d4f8d2ea6418e32fb4f20d1304ace6d6566e3 +master date: 2022-10-27 11:49:09 +0200 +--- + xen/arch/x86/domain.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index ce6ddcf31397..e9b8ed4c96c2 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -927,6 +927,7 @@ int arch_domain_soft_reset(struct domain *d) + struct page_info *page = virt_to_page(d->shared_info), *new_page; + int ret = 0; + struct domain *owner; ++ struct vcpu *v; + mfn_t mfn; + gfn_t gfn; + p2m_type_t p2mt; +@@ -1006,7 +1007,12 @@ int arch_domain_soft_reset(struct domain *d) + "Failed to add a page to replace %pd's shared_info frame %"PRI_gfn"\n", + d, gfn_x(gfn)); + free_domheap_page(new_page); ++ goto exit_put_gfn; + } ++ ++ for_each_vcpu ( d, v ) ++ set_xen_guest_handle(v->arch.time_info_guest, NULL); ++ + exit_put_gfn: + put_gfn(d, gfn_x(gfn)); + exit_put_page: +-- +2.37.4 + diff --git a/0079-common-map_vcpu_info-wants-to-unshare-the-underlying.patch b/0079-common-map_vcpu_info-wants-to-unshare-the-underlying.patch new file mode 100644 index 0000000..2944a80 --- /dev/null +++ b/0079-common-map_vcpu_info-wants-to-unshare-the-underlying.patch @@ -0,0 +1,41 @@ +From 317894fa6a067a7903199bc5c1e3e06a0436caf8 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 31 Oct 2022 13:36:50 +0100 +Subject: [PATCH 079/126] common: map_vcpu_info() wants to unshare the + underlying page +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Not passing P2M_UNSHARE to get_page_from_gfn() means there won't even be +an attempt to unshare the referenced page, without any indication to the +caller (e.g. -EAGAIN). 
Note that guests have no direct control over +which of their pages are shared (or paged out), and hence they have no +way to make sure all on their own that the subsequent obtaining of a +writable type reference can actually succeed. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +Acked-by: Julien Grall <jgrall@amazon.com> +master commit: 48980cf24d5cf41fd644600f99c753419505e735 +master date: 2022-10-28 11:38:32 +0200 +--- + xen/common/domain.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/common/domain.c b/xen/common/domain.c +index 17cc32fde373..0fb7f9a6225c 100644 +--- a/xen/common/domain.c ++++ b/xen/common/domain.c +@@ -1454,7 +1454,7 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset) + if ( (v != current) && !(v->pause_flags & VPF_down) ) + return -EINVAL; + +- page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC); ++ page = get_page_from_gfn(d, gfn, NULL, P2M_UNSHARE); + if ( !page ) + return -EINVAL; + +-- +2.37.4 + diff --git a/0080-x86-pv-shim-correctly-ignore-empty-onlining-requests.patch b/0080-x86-pv-shim-correctly-ignore-empty-onlining-requests.patch new file mode 100644 index 0000000..31aa812 --- /dev/null +++ b/0080-x86-pv-shim-correctly-ignore-empty-onlining-requests.patch @@ -0,0 +1,43 @@ +From a46f01fad17173afe3809ac1980cbe4b67a9a8b5 Mon Sep 17 00:00:00 2001 +From: Igor Druzhinin <igor.druzhinin@citrix.com> +Date: Mon, 31 Oct 2022 13:37:17 +0100 +Subject: [PATCH 080/126] x86/pv-shim: correctly ignore empty onlining requests +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Mem-op requests may have zero extents. Such requests need treating as +no-ops. pv_shim_online_memory(), however, would have tried to take 2³²-1 +order-sized pages from its balloon list (to then populate them), +typically ending when the entire set of ballooned pages of this order +was consumed. 
+ +Note that pv_shim_offline_memory() does not have such an issue. + +Fixes: b2245acc60c3 ("xen/pvshim: memory hotplug") +Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 9272225ca72801fd9fa5b268a2d1c5adebd19cd9 +master date: 2022-10-28 15:47:59 +0200 +--- + xen/arch/x86/pv/shim.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c +index b4e83e077891..104357e2c398 100644 +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -922,6 +922,9 @@ void pv_shim_online_memory(unsigned int nr, unsigned int order) + struct page_info *page, *tmp; + PAGE_LIST_HEAD(list); + ++ if ( !nr ) ++ return; ++ + spin_lock(&balloon_lock); + page_list_for_each_safe ( page, tmp, &balloon ) + { +-- +2.37.4 + diff --git a/0081-x86-pv-shim-correct-ballooning-up-for-compat-guests.patch b/0081-x86-pv-shim-correct-ballooning-up-for-compat-guests.patch new file mode 100644 index 0000000..cd97334 --- /dev/null +++ b/0081-x86-pv-shim-correct-ballooning-up-for-compat-guests.patch @@ -0,0 +1,55 @@ +From b68e3fda8a76fb3ab582b5633727ac5545e4e8b9 Mon Sep 17 00:00:00 2001 +From: Igor Druzhinin <igor.druzhinin@citrix.com> +Date: Mon, 31 Oct 2022 13:37:42 +0100 +Subject: [PATCH 081/126] x86/pv-shim: correct ballooning up for compat guests + +The compat layer for multi-extent memory ops may need to split incoming +requests. Since the guest handles in the interface structures may not be +altered, it does so by leveraging do_memory_op()'s continuation +handling: It hands on non-initial requests with a non-zero start extent, +with the (native) handle suitably adjusted down. As a result +do_memory_op() sees only the first of potentially several requests with +start extent being zero. 
It's only that case when the function would +issue a call to pv_shim_online_memory(), yet the range then covers only +the first sub-range that results from the split. + +Address that breakage by making a complementary call to +pv_shim_online_memory() in compat layer. + +Fixes: b2245acc60c3 ("xen/pvshim: memory hotplug") +Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: a0bfdd201ea12aa5679bb8944d63a4e0d3c23160 +master date: 2022-10-28 15:48:50 +0200 +--- + xen/common/compat/memory.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/xen/common/compat/memory.c b/xen/common/compat/memory.c +index c43fa97cf15f..a0e0562a4033 100644 +--- a/xen/common/compat/memory.c ++++ b/xen/common/compat/memory.c +@@ -7,6 +7,7 @@ EMIT_FILE; + #include <xen/event.h> + #include <xen/mem_access.h> + #include <asm/current.h> ++#include <asm/guest.h> + #include <compat/memory.h> + + #define xen_domid_t domid_t +@@ -146,7 +147,10 @@ int compat_memory_op(unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) compat) + nat.rsrv->nr_extents = end_extent; + ++split; + } +- ++ /* Avoid calling pv_shim_online_memory() when in a continuation. 
*/ ++ if ( pv_shim && op != XENMEM_decrease_reservation && !start_extent ) ++ pv_shim_online_memory(cmp.rsrv.nr_extents - nat.rsrv->nr_extents, ++ cmp.rsrv.extent_order); + break; + + case XENMEM_exchange: +-- +2.37.4 + diff --git a/0082-x86-pv-shim-correct-ballooning-down-for-compat-guest.patch b/0082-x86-pv-shim-correct-ballooning-down-for-compat-guest.patch new file mode 100644 index 0000000..a6d895f --- /dev/null +++ b/0082-x86-pv-shim-correct-ballooning-down-for-compat-guest.patch @@ -0,0 +1,73 @@ +From ddab5b1e001366258c0bfc7d5995b9d548e6042b Mon Sep 17 00:00:00 2001 +From: Igor Druzhinin <igor.druzhinin@citrix.com> +Date: Mon, 31 Oct 2022 13:38:05 +0100 +Subject: [PATCH 082/126] x86/pv-shim: correct ballooning down for compat + guests + +The compat layer for multi-extent memory ops may need to split incoming +requests. Since the guest handles in the interface structures may not be +altered, it does so by leveraging do_memory_op()'s continuation +handling: It hands on non-initial requests with a non-zero start extent, +with the (native) handle suitably adjusted down. As a result +do_memory_op() sees only the first of potentially several requests with +start extent being zero. In order to be usable as overall result, the +function accumulates args.nr_done, i.e. it initialized the field with +the start extent. Therefore non-initial requests resulting from the +split would pass too large a number into pv_shim_offline_memory(). + +Address that breakage by always calling pv_shim_offline_memory() +regardless of current hypercall preemption status, with a suitably +adjusted first argument. Note that this is correct also for the native +guest case: We now simply "commit" what was completed right away, rather +than at the end of a series of preemption/re-start cycles. In fact this +improves overall preemption behavior: There's no longer a potentially +big chunk of work done non-preemptively at the end of the last +"iteration". 
+ +Fixes: b2245acc60c3 ("xen/pvshim: memory hotplug") +Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 1d7fbc535d1d37bdc2cc53ede360b0f6651f7de1 +master date: 2022-10-28 15:49:33 +0200 +--- + xen/common/memory.c | 19 +++++++------------ + 1 file changed, 7 insertions(+), 12 deletions(-) + +diff --git a/xen/common/memory.c b/xen/common/memory.c +index 95b2b934e4a2..a958d94ac3cd 100644 +--- a/xen/common/memory.c ++++ b/xen/common/memory.c +@@ -1407,22 +1407,17 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) + + rc = args.nr_done; + +- if ( args.preempted ) +- return hypercall_create_continuation( +- __HYPERVISOR_memory_op, "lh", +- op | (rc << MEMOP_EXTENT_SHIFT), arg); +- + #ifdef CONFIG_X86 + if ( pv_shim && op == XENMEM_decrease_reservation ) +- /* +- * Only call pv_shim_offline_memory when the hypercall has +- * finished. Note that nr_done is used to cope in case the +- * hypercall has failed and only part of the extents where +- * processed. 
+- */ +- pv_shim_offline_memory(args.nr_done, args.extent_order); ++ pv_shim_offline_memory(args.nr_done - start_extent, ++ args.extent_order); + #endif + ++ if ( args.preempted ) ++ return hypercall_create_continuation( ++ __HYPERVISOR_memory_op, "lh", ++ op | (rc << MEMOP_EXTENT_SHIFT), arg); ++ + break; + + case XENMEM_exchange: +-- +2.37.4 + diff --git a/0083-tools-xenstore-create_node-Don-t-defer-work-to-undo-.patch b/0083-tools-xenstore-create_node-Don-t-defer-work-to-undo-.patch new file mode 100644 index 0000000..5204b3f --- /dev/null +++ b/0083-tools-xenstore-create_node-Don-t-defer-work-to-undo-.patch @@ -0,0 +1,120 @@ +From ee03d9b56e6141422b4ef2444f93cf2e88e6a26c Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Tue, 13 Sep 2022 07:35:06 +0200 +Subject: [PATCH 083/126] tools/xenstore: create_node: Don't defer work to undo + any changes on failure + +XSA-115 extended destroy_node() to update the node accounting for the +connection. The implementation is assuming the connection is the parent +of the node, however all the nodes are allocated using a separate context +(see process_message()). This will crash (or corrupt) xenstored +as the pointer is wrongly used. + +In case of an error, any changes to the database or update to the +accounting will now be reverted in create_node() by directly calling +destroy_node(). This has the nice advantage of removing the loop to unset +the destructors in case of success. + +Take the opportunity to free the nodes right now as they are not +going to be reachable (the function returns NULL) and are just wasting +resources. + +This is XSA-414 / CVE-2022-42309. 
+ +Fixes: 0bfb2101f243 ("tools/xenstore: fix node accounting after failed node creation") +Signed-off-by: Julien Grall <jgrall@amazon.com> +Reviewed-by: Juergen Gross <jgross@suse.com> +(cherry picked from commit 1cd3cc7ea27cda7640a8d895e09617b61c265697) +--- + tools/xenstore/xenstored_core.c | 47 ++++++++++++++++++++++----------- + 1 file changed, 32 insertions(+), 15 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 9172dd767140..a00c49e404a1 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -1054,9 +1054,8 @@ nomem: + return NULL; + } + +-static int destroy_node(void *_node) ++static int destroy_node(struct connection *conn, struct node *node) + { +- struct node *node = _node; + TDB_DATA key; + + if (streq(node->name, "/")) +@@ -1065,7 +1064,7 @@ static int destroy_node(void *_node) + set_tdb_key(node->name, &key); + tdb_delete(tdb_ctx, key); + +- domain_entry_dec(talloc_parent(node), node); ++ domain_entry_dec(conn, node); + + return 0; + } +@@ -1074,7 +1073,8 @@ static struct node *create_node(struct connection *conn, const void *ctx, + const char *name, + void *data, unsigned int datalen) + { +- struct node *node, *i; ++ struct node *node, *i, *j; ++ int ret; + + node = construct_node(conn, ctx, name); + if (!node) +@@ -1096,23 +1096,40 @@ static struct node *create_node(struct connection *conn, const void *ctx, + /* i->parent is set for each new node, so check quota. */ + if (i->parent && + domain_entry(conn) >= quota_nb_entry_per_domain) { +- errno = ENOSPC; +- return NULL; ++ ret = ENOSPC; ++ goto err; + } +- if (write_node(conn, i, false)) +- return NULL; + +- /* Account for new node, set destructor for error case. 
*/ +- if (i->parent) { ++ ret = write_node(conn, i, false); ++ if (ret) ++ goto err; ++ ++ /* Account for new node */ ++ if (i->parent) + domain_entry_inc(conn, i); +- talloc_set_destructor(i, destroy_node); +- } + } + +- /* OK, now remove destructors so they stay around */ +- for (i = node; i->parent; i = i->parent) +- talloc_set_destructor(i, NULL); + return node; ++ ++err: ++ /* ++ * We failed to update TDB for some of the nodes. Undo any work that ++ * have already been done. ++ */ ++ for (j = node; j != i; j = j->parent) ++ destroy_node(conn, j); ++ ++ /* We don't need to keep the nodes around, so free them. */ ++ i = node; ++ while (i) { ++ j = i; ++ i = i->parent; ++ talloc_free(j); ++ } ++ ++ errno = ret; ++ ++ return NULL; + } + + /* path, data... */ +-- +2.37.4 + diff --git a/0084-tools-xenstore-Fail-a-transaction-if-it-is-not-possi.patch b/0084-tools-xenstore-Fail-a-transaction-if-it-is-not-possi.patch new file mode 100644 index 0000000..05936ea --- /dev/null +++ b/0084-tools-xenstore-Fail-a-transaction-if-it-is-not-possi.patch @@ -0,0 +1,145 @@ +From 579e7334b909c22efc65c5df22e8afe414882154 Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Tue, 13 Sep 2022 07:35:06 +0200 +Subject: [PATCH 084/126] tools/xenstore: Fail a transaction if it is not + possible to create a node + +Commit f2bebf72c4d5 "xenstore: rework of transaction handling" moved +out from copying the entire database everytime a new transaction is +opened to track the list of nodes changed. + +The content of all the nodes accessed during a transaction will be +temporarily stored in TDB using a different key. + +The function create_node() may write/update multiple nodes if the child +doesn't exist. In case of a failure, the function will revert any +changes (this include any update to TDB). Unfortunately, the function +which reverts the changes (i.e. destroy_node()) will not use the correct +key to delete any update or even request the transaction to fail. 
+ +This means that if a client decides to go ahead with committing the +transaction, orphan nodes will be created because they were not linked +to an existing node (create_node() will write the nodes backwards). + +Once some nodes have been partially updated in a transaction, it is not +easily possible to undo any changes. So rather than continuing and hitting +weird issues while committing, it is much saner to fail the transaction. + +This will have an impact on any client that decides to commit even if it +can't write a node. Although, it is not clear why a normal client would +want to do that... + +Lastly, update destroy_node() to use the correct key for deleting the +node. Rather than recreating it (this will allocate memory and +therefore fail), stash the key in the structure node. + +This is XSA-415 / CVE-2022-42310. + +Signed-off-by: Julien Grall <jgrall@amazon.com> +Reviewed-by: Juergen Gross <jgross@suse.com> +(cherry picked from commit 5d71766bd1a4a3a8b2fe952ca2be80e02fe48f34) +--- + tools/xenstore/xenstored_core.c | 23 +++++++++++++++-------- + tools/xenstore/xenstored_core.h | 2 ++ + tools/xenstore/xenstored_transaction.c | 5 +++++ + tools/xenstore/xenstored_transaction.h | 3 +++ + 4 files changed, 25 insertions(+), 8 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index a00c49e404a1..b28c2c66b53b 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -531,15 +531,17 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, + return 0; + } + ++/* ++ * Write the node. If the node is written, caller can find the key used in ++ * node->key. This can later be used if the change needs to be reverted. 
++ */ + static int write_node(struct connection *conn, struct node *node, + bool no_quota_check) + { +- TDB_DATA key; +- +- if (access_node(conn, node, NODE_ACCESS_WRITE, &key)) ++ if (access_node(conn, node, NODE_ACCESS_WRITE, &node->key)) + return errno; + +- return write_node_raw(conn, &key, node, no_quota_check); ++ return write_node_raw(conn, &node->key, node, no_quota_check); + } + + enum xs_perm_type perm_for_conn(struct connection *conn, +@@ -1056,16 +1058,21 @@ nomem: + + static int destroy_node(struct connection *conn, struct node *node) + { +- TDB_DATA key; +- + if (streq(node->name, "/")) + corrupt(NULL, "Destroying root node!"); + +- set_tdb_key(node->name, &key); +- tdb_delete(tdb_ctx, key); ++ tdb_delete(tdb_ctx, node->key); + + domain_entry_dec(conn, node); + ++ /* ++ * It is not possible to easily revert the changes in a transaction. ++ * So if the failure happens in a transaction, mark it as fail to ++ * prevent any commit. ++ */ ++ if ( conn->transaction ) ++ fail_transaction(conn->transaction); ++ + return 0; + } + +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index 0c9a0961b57e..900336afa426 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -148,6 +148,8 @@ struct node_perms { + + struct node { + const char *name; ++ /* Key used to update TDB */ ++ TDB_DATA key; + + /* Parent (optional) */ + struct node *parent; +diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c +index cd07fb0f218b..faf6c930e42a 100644 +--- a/tools/xenstore/xenstored_transaction.c ++++ b/tools/xenstore/xenstored_transaction.c +@@ -580,6 +580,11 @@ void transaction_entry_dec(struct transaction *trans, unsigned int domid) + list_add_tail(&d->list, &trans->changed_domains); + } + ++void fail_transaction(struct transaction *trans) ++{ ++ trans->fail = true; ++} ++ + void conn_delete_all_transactions(struct connection *conn) + { + struct transaction *trans; +diff 
--git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h +index 43a162bea3f3..14062730e3c9 100644 +--- a/tools/xenstore/xenstored_transaction.h ++++ b/tools/xenstore/xenstored_transaction.h +@@ -46,6 +46,9 @@ int access_node(struct connection *conn, struct node *node, + int transaction_prepend(struct connection *conn, const char *name, + TDB_DATA *key); + ++/* Mark the transaction as failed. This will prevent it to be committed. */ ++void fail_transaction(struct transaction *trans); ++ + void conn_delete_all_transactions(struct connection *conn); + int check_transactions(struct hashtable *hash); + +-- +2.37.4 + diff --git a/0085-tools-xenstore-split-up-send_reply.patch b/0085-tools-xenstore-split-up-send_reply.patch new file mode 100644 index 0000000..7420f93 --- /dev/null +++ b/0085-tools-xenstore-split-up-send_reply.patch @@ -0,0 +1,213 @@ +From 0d8bea403d4d1763dddb0c1c81d30efebafb6962 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:07 +0200 +Subject: [PATCH 085/126] tools/xenstore: split up send_reply() + +Today send_reply() is used for both, normal request replies and watch +events. + +Split it up into send_reply() and send_event(). This will be used to +add some event specific handling. + +add_event() can be merged into send_event(), removing the need for an +intermediate memory allocation. + +This is part of XSA-326. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 9bfde319dbac2a1321898d2f75a3f075c3eb7b32) +--- + tools/xenstore/xenstored_core.c | 74 +++++++++++++++++++------------- + tools/xenstore/xenstored_core.h | 1 + + tools/xenstore/xenstored_watch.c | 39 +++-------------- + 3 files changed, 52 insertions(+), 62 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index b28c2c66b53b..01d4a2e440ec 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -733,49 +733,32 @@ static void send_error(struct connection *conn, int error) + void send_reply(struct connection *conn, enum xsd_sockmsg_type type, + const void *data, unsigned int len) + { +- struct buffered_data *bdata; ++ struct buffered_data *bdata = conn->in; ++ ++ assert(type != XS_WATCH_EVENT); + + if ( len > XENSTORE_PAYLOAD_MAX ) { + send_error(conn, E2BIG); + return; + } + +- /* Replies reuse the request buffer, events need a new one. */ +- if (type != XS_WATCH_EVENT) { +- bdata = conn->in; +- /* Drop asynchronous responses, e.g. errors for watch events. */ +- if (!bdata) +- return; +- bdata->inhdr = true; +- bdata->used = 0; +- conn->in = NULL; +- } else { +- /* Message is a child of the connection for auto-cleanup. */ +- bdata = new_buffer(conn); ++ if (!bdata) ++ return; ++ bdata->inhdr = true; ++ bdata->used = 0; + +- /* +- * Allocation failure here is unfortunate: we have no way to +- * tell anybody about it. +- */ +- if (!bdata) +- return; +- } + if (len <= DEFAULT_BUFFER_SIZE) + bdata->buffer = bdata->default_buffer; +- else ++ else { + bdata->buffer = talloc_array(bdata, char, len); +- if (!bdata->buffer) { +- if (type == XS_WATCH_EVENT) { +- /* Same as above: no way to tell someone. */ +- talloc_free(bdata); ++ if (!bdata->buffer) { ++ send_error(conn, ENOMEM); + return; + } +- /* re-establish request buffer for sending ENOMEM. 
*/ +- conn->in = bdata; +- send_error(conn, ENOMEM); +- return; + } + ++ conn->in = NULL; ++ + /* Update relevant header fields and fill in the message body. */ + bdata->hdr.msg.type = type; + bdata->hdr.msg.len = len; +@@ -783,8 +766,39 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, + + /* Queue for later transmission. */ + list_add_tail(&bdata->list, &conn->out_list); ++} + +- return; ++/* ++ * Send a watch event. ++ * As this is not directly related to the current command, errors can't be ++ * reported. ++ */ ++void send_event(struct connection *conn, const char *path, const char *token) ++{ ++ struct buffered_data *bdata; ++ unsigned int len; ++ ++ len = strlen(path) + 1 + strlen(token) + 1; ++ /* Don't try to send over-long events. */ ++ if (len > XENSTORE_PAYLOAD_MAX) ++ return; ++ ++ bdata = new_buffer(conn); ++ if (!bdata) ++ return; ++ ++ bdata->buffer = talloc_array(bdata, char, len); ++ if (!bdata->buffer) { ++ talloc_free(bdata); ++ return; ++ } ++ strcpy(bdata->buffer, path); ++ strcpy(bdata->buffer + strlen(path) + 1, token); ++ bdata->hdr.msg.type = XS_WATCH_EVENT; ++ bdata->hdr.msg.len = len; ++ ++ /* Queue for later transmission. 
*/ ++ list_add_tail(&bdata->list, &conn->out_list); + } + + /* Some routines (write, mkdir, etc) just need a non-error return */ +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index 900336afa426..38d97fa081a6 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -180,6 +180,7 @@ unsigned int get_string(const struct buffered_data *data, unsigned int offset); + + void send_reply(struct connection *conn, enum xsd_sockmsg_type type, + const void *data, unsigned int len); ++void send_event(struct connection *conn, const char *path, const char *token); + + /* Some routines (write, mkdir, etc) just need a non-error return */ + void send_ack(struct connection *conn, enum xsd_sockmsg_type type); +diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c +index db89e0141fce..a116f967dc66 100644 +--- a/tools/xenstore/xenstored_watch.c ++++ b/tools/xenstore/xenstored_watch.c +@@ -85,35 +85,6 @@ static const char *get_watch_path(const struct watch *watch, const char *name) + return path; + } + +-/* +- * Send a watch event. +- * Temporary memory allocations are done with ctx. +- */ +-static void add_event(struct connection *conn, +- const void *ctx, +- struct watch *watch, +- const char *name) +-{ +- /* Data to send (node\0token\0). */ +- unsigned int len; +- char *data; +- +- name = get_watch_path(watch, name); +- +- len = strlen(name) + 1 + strlen(watch->token) + 1; +- /* Don't try to send over-long events. 
*/ +- if (len > XENSTORE_PAYLOAD_MAX) +- return; +- +- data = talloc_array(ctx, char, len); +- if (!data) +- return; +- strcpy(data, name); +- strcpy(data + strlen(name) + 1, watch->token); +- send_reply(conn, XS_WATCH_EVENT, data, len); +- talloc_free(data); +-} +- + /* + * Check permissions of a specific watch to fire: + * Either the node itself or its parent have to be readable by the connection +@@ -190,10 +161,14 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, + list_for_each_entry(watch, &i->watches, list) { + if (exact) { + if (streq(name, watch->node)) +- add_event(i, ctx, watch, name); ++ send_event(i, ++ get_watch_path(watch, name), ++ watch->token); + } else { + if (is_child(name, watch->node)) +- add_event(i, ctx, watch, name); ++ send_event(i, ++ get_watch_path(watch, name), ++ watch->token); + } + } + } +@@ -292,7 +267,7 @@ int do_watch(struct connection *conn, struct buffered_data *in) + send_ack(conn, XS_WATCH); + + /* We fire once up front: simplifies clients and restart. */ +- add_event(conn, in, watch, watch->node); ++ send_event(conn, get_watch_path(watch, watch->node), watch->token); + + return 0; + } +-- +2.37.4 + diff --git a/0086-tools-xenstore-add-helpers-to-free-struct-buffered_d.patch b/0086-tools-xenstore-add-helpers-to-free-struct-buffered_d.patch new file mode 100644 index 0000000..46ae2d3 --- /dev/null +++ b/0086-tools-xenstore-add-helpers-to-free-struct-buffered_d.patch @@ -0,0 +1,117 @@ +From b322923894ea23f397efc58a938cb9213d7dc617 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:07 +0200 +Subject: [PATCH 086/126] tools/xenstore: add helpers to free struct + buffered_data + +Add two helpers for freeing struct buffered_data: free_buffered_data() +for freeing one instance and conn_free_buffered_data() for freeing all +instances for a connection. 
+ +This is avoiding duplicated code and will help later when more actions +are needed when freeing a struct buffered_data. + +This is part of XSA-326. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit ead062a68a9c201a95488e84750a70a107f7b317) +--- + tools/xenstore/xenstored_core.c | 26 +++++++++++++++++--------- + tools/xenstore/xenstored_core.h | 2 ++ + tools/xenstore/xenstored_domain.c | 7 +------ + 3 files changed, 20 insertions(+), 15 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 01d4a2e440ec..6498bf603666 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -211,6 +211,21 @@ void reopen_log(void) + } + } + ++static void free_buffered_data(struct buffered_data *out, ++ struct connection *conn) ++{ ++ list_del(&out->list); ++ talloc_free(out); ++} ++ ++void conn_free_buffered_data(struct connection *conn) ++{ ++ struct buffered_data *out; ++ ++ while ((out = list_top(&conn->out_list, struct buffered_data, list))) ++ free_buffered_data(out, conn); ++} ++ + static bool write_messages(struct connection *conn) + { + int ret; +@@ -254,8 +269,7 @@ static bool write_messages(struct connection *conn) + + trace_io(conn, out, 1); + +- list_del(&out->list); +- talloc_free(out); ++ free_buffered_data(out, conn); + + return true; + } +@@ -1472,18 +1486,12 @@ static struct { + */ + static void ignore_connection(struct connection *conn) + { +- struct buffered_data *out, *tmp; +- + trace("CONN %p ignored\n", conn); + + conn->is_ignored = true; + conn_delete_all_watches(conn); + conn_delete_all_transactions(conn); +- +- list_for_each_entry_safe(out, tmp, &conn->out_list, list) { +- list_del(&out->list); +- talloc_free(out); +- } ++ conn_free_buffered_data(conn); + + talloc_free(conn->in); + conn->in = NULL; +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index 
38d97fa081a6..0ba5b783d4d1 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -270,6 +270,8 @@ int remember_string(struct hashtable *hash, const char *str); + + void set_tdb_key(const char *name, TDB_DATA *key); + ++void conn_free_buffered_data(struct connection *conn); ++ + const char *dump_state_global(FILE *fp); + const char *dump_state_buffered_data(FILE *fp, const struct connection *c, + const struct connection *conn, +diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c +index 3d4d0649a243..72a5cd3b9aaf 100644 +--- a/tools/xenstore/xenstored_domain.c ++++ b/tools/xenstore/xenstored_domain.c +@@ -417,15 +417,10 @@ static struct domain *find_domain_by_domid(unsigned int domid) + static void domain_conn_reset(struct domain *domain) + { + struct connection *conn = domain->conn; +- struct buffered_data *out; + + conn_delete_all_watches(conn); + conn_delete_all_transactions(conn); +- +- while ((out = list_top(&conn->out_list, struct buffered_data, list))) { +- list_del(&out->list); +- talloc_free(out); +- } ++ conn_free_buffered_data(conn); + + talloc_free(conn->in); + +-- +2.37.4 + diff --git a/0087-tools-xenstore-reduce-number-of-watch-events.patch b/0087-tools-xenstore-reduce-number-of-watch-events.patch new file mode 100644 index 0000000..ab6cc92 --- /dev/null +++ b/0087-tools-xenstore-reduce-number-of-watch-events.patch @@ -0,0 +1,201 @@ +From 8999db805e5ef55172a85d67695429edc3d78771 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:07 +0200 +Subject: [PATCH 087/126] tools/xenstore: reduce number of watch events + +When removing a watched node outside of a transaction, two watch events +are being produced instead of just a single one. + +When finalizing a transaction watch events can be generated for each +node which is being modified, even if outside a transaction such +modifications might not have resulted in a watch event. 
+ +This happens e.g.: + +- for nodes which are only modified due to added/removed child entries +- for nodes being removed or created implicitly (e.g. creation of a/b/c + is implicitly creating a/b, resulting in watch events for a, a/b and + a/b/c instead of a/b/c only) + +Avoid these additional watch events, in order to reduce the needed +memory inside Xenstore for queueing them. + +This is being achieved by adding event flags to struct accessed_node +specifying whether an event should be triggered, and whether it should +be an exact match of the modified path. Both flags can be set from +fire_watches() instead of implying them only. + +This is part of XSA-326. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 3a96013a3e17baa07410b1b9776225d1d9a74297) +--- + tools/xenstore/xenstored_core.c | 19 ++++++------ + tools/xenstore/xenstored_transaction.c | 41 +++++++++++++++++++++----- + tools/xenstore/xenstored_transaction.h | 3 ++ + tools/xenstore/xenstored_watch.c | 7 +++-- + 4 files changed, 51 insertions(+), 19 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 6498bf603666..5157a7527f58 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -1261,7 +1261,7 @@ static void delete_child(struct connection *conn, + } + + static int delete_node(struct connection *conn, const void *ctx, +- struct node *parent, struct node *node) ++ struct node *parent, struct node *node, bool watch_exact) + { + char *name; + +@@ -1273,7 +1273,7 @@ static int delete_node(struct connection *conn, const void *ctx, + node->children); + child = name ? 
read_node(conn, node, name) : NULL; + if (child) { +- if (delete_node(conn, ctx, node, child)) ++ if (delete_node(conn, ctx, node, child, true)) + return errno; + } else { + trace("delete_node: Error deleting child '%s/%s'!\n", +@@ -1285,7 +1285,12 @@ static int delete_node(struct connection *conn, const void *ctx, + talloc_free(name); + } + +- fire_watches(conn, ctx, node->name, node, true, NULL); ++ /* ++ * Fire the watches now, when we can still see the node permissions. ++ * This fine as we are single threaded and the next possible read will ++ * be handled only after the node has been really removed. ++ */ ++ fire_watches(conn, ctx, node->name, node, watch_exact, NULL); + delete_node_single(conn, node); + delete_child(conn, parent, basename(node->name)); + talloc_free(node); +@@ -1311,13 +1316,7 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, + return (errno == ENOMEM) ? ENOMEM : EINVAL; + node->parent = parent; + +- /* +- * Fire the watches now, when we can still see the node permissions. +- * This fine as we are single threaded and the next possible read will +- * be handled only after the node has been really removed. +- */ +- fire_watches(conn, ctx, name, node, false, NULL); +- return delete_node(conn, ctx, parent, node); ++ return delete_node(conn, ctx, parent, node, false); + } + + +diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c +index faf6c930e42a..54432907fc76 100644 +--- a/tools/xenstore/xenstored_transaction.c ++++ b/tools/xenstore/xenstored_transaction.c +@@ -130,6 +130,10 @@ struct accessed_node + + /* Transaction node in data base? */ + bool ta_node; ++ ++ /* Watch event flags. */ ++ bool fire_watch; ++ bool watch_exact; + }; + + struct changed_domain +@@ -323,6 +327,29 @@ err: + return ret; + } + ++/* ++ * A watch event should be fired for a node modified inside a transaction. ++ * Set the corresponding information. 
A non-exact event is replacing an exact ++ * one, but not the other way round. ++ */ ++void queue_watches(struct connection *conn, const char *name, bool watch_exact) ++{ ++ struct accessed_node *i; ++ ++ i = find_accessed_node(conn->transaction, name); ++ if (!i) { ++ conn->transaction->fail = true; ++ return; ++ } ++ ++ if (!i->fire_watch) { ++ i->fire_watch = true; ++ i->watch_exact = watch_exact; ++ } else if (!watch_exact) { ++ i->watch_exact = false; ++ } ++} ++ + /* + * Finalize transaction: + * Walk through accessed nodes and check generation against global data. +@@ -377,15 +404,15 @@ static int finalize_transaction(struct connection *conn, + ret = tdb_store(tdb_ctx, key, data, + TDB_REPLACE); + talloc_free(data.dptr); +- if (ret) +- goto err; +- fire_watches(conn, trans, i->node, NULL, false, +- i->perms.p ? &i->perms : NULL); + } else { +- fire_watches(conn, trans, i->node, NULL, false, ++ ret = tdb_delete(tdb_ctx, key); ++ } ++ if (ret) ++ goto err; ++ if (i->fire_watch) { ++ fire_watches(conn, trans, i->node, NULL, ++ i->watch_exact, + i->perms.p ? &i->perms : NULL); +- if (tdb_delete(tdb_ctx, key)) +- goto err; + } + } + +diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h +index 14062730e3c9..0093cac807e3 100644 +--- a/tools/xenstore/xenstored_transaction.h ++++ b/tools/xenstore/xenstored_transaction.h +@@ -42,6 +42,9 @@ void transaction_entry_dec(struct transaction *trans, unsigned int domid); + int access_node(struct connection *conn, struct node *node, + enum node_access_type type, TDB_DATA *key); + ++/* Queue watches for a modified node. */ ++void queue_watches(struct connection *conn, const char *name, bool watch_exact); ++ + /* Prepend the transaction to name if appropriate. 
*/ + int transaction_prepend(struct connection *conn, const char *name, + TDB_DATA *key); +diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c +index a116f967dc66..bc6d833028a3 100644 +--- a/tools/xenstore/xenstored_watch.c ++++ b/tools/xenstore/xenstored_watch.c +@@ -29,6 +29,7 @@ + #include "xenstore_lib.h" + #include "utils.h" + #include "xenstored_domain.h" ++#include "xenstored_transaction.h" + + extern int quota_nb_watch_per_domain; + +@@ -143,9 +144,11 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, + struct connection *i; + struct watch *watch; + +- /* During transactions, don't fire watches. */ +- if (conn && conn->transaction) ++ /* During transactions, don't fire watches, but queue them. */ ++ if (conn && conn->transaction) { ++ queue_watches(conn, name, exact); + return; ++ } + + /* Create an event for each watch. */ + list_for_each_entry(i, &connections, list) { +-- +2.37.4 + diff --git a/0088-tools-xenstore-let-unread-watch-events-time-out.patch b/0088-tools-xenstore-let-unread-watch-events-time-out.patch new file mode 100644 index 0000000..03419c6 --- /dev/null +++ b/0088-tools-xenstore-let-unread-watch-events-time-out.patch @@ -0,0 +1,309 @@ +From 53a77b82717530d836300f1de0ad037de85477dd Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:07 +0200 +Subject: [PATCH 088/126] tools/xenstore: let unread watch events time out + +A future modification will limit the number of outstanding requests +for a domain, where "outstanding" means that the response of the +request or any resulting watch event hasn't been consumed yet. + +In order to avoid a malicious guest being capable to block other guests +by not reading watch events, add a timeout for watch events. In case a +watch event hasn't been consumed after this timeout, it is being +deleted. Set the default timeout to 20 seconds (a random value being +not too high). 
+ +In order to support to specify other timeout values in future, use a +generic command line option for that purpose: + +--timeout|-w watch-event=<seconds> + +This is part of XSA-326 / CVE-2022-42311. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 5285dcb1a5c01695c11e6397c95d906b5e765c98) +--- + tools/xenstore/xenstored_core.c | 133 +++++++++++++++++++++++++++++++- + tools/xenstore/xenstored_core.h | 6 ++ + 2 files changed, 138 insertions(+), 1 deletion(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 5157a7527f58..ee3396fefa94 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -108,6 +108,8 @@ int quota_max_transaction = 10; + int quota_nb_perms_per_node = 5; + int quota_max_path_len = XENSTORE_REL_PATH_MAX; + ++unsigned int timeout_watch_event_msec = 20000; ++ + void trace(const char *fmt, ...) + { + va_list arglist; +@@ -211,19 +213,92 @@ void reopen_log(void) + } + } + ++static uint64_t get_now_msec(void) ++{ ++ struct timespec now_ts; ++ ++ if (clock_gettime(CLOCK_MONOTONIC, &now_ts)) ++ barf_perror("Could not find time (clock_gettime failed)"); ++ ++ return now_ts.tv_sec * 1000 + now_ts.tv_nsec / 1000000; ++} ++ + static void free_buffered_data(struct buffered_data *out, + struct connection *conn) + { ++ struct buffered_data *req; ++ + list_del(&out->list); ++ ++ /* ++ * Update conn->timeout_msec with the next found timeout value in the ++ * queued pending requests. 
++ */ ++ if (out->timeout_msec) { ++ conn->timeout_msec = 0; ++ list_for_each_entry(req, &conn->out_list, list) { ++ if (req->timeout_msec) { ++ conn->timeout_msec = req->timeout_msec; ++ break; ++ } ++ } ++ } ++ + talloc_free(out); + } + ++static void check_event_timeout(struct connection *conn, uint64_t msecs, ++ int *ptimeout) ++{ ++ uint64_t delta; ++ struct buffered_data *out, *tmp; ++ ++ if (!conn->timeout_msec) ++ return; ++ ++ delta = conn->timeout_msec - msecs; ++ if (conn->timeout_msec <= msecs) { ++ delta = 0; ++ list_for_each_entry_safe(out, tmp, &conn->out_list, list) { ++ /* ++ * Only look at buffers with timeout and no data ++ * already written to the ring. ++ */ ++ if (out->timeout_msec && out->inhdr && !out->used) { ++ if (out->timeout_msec > msecs) { ++ conn->timeout_msec = out->timeout_msec; ++ delta = conn->timeout_msec - msecs; ++ break; ++ } ++ ++ /* ++ * Free out without updating conn->timeout_msec, ++ * as the update is done in this loop already. ++ */ ++ out->timeout_msec = 0; ++ trace("watch event path %s for domain %u timed out\n", ++ out->buffer, conn->id); ++ free_buffered_data(out, conn); ++ } ++ } ++ if (!delta) { ++ conn->timeout_msec = 0; ++ return; ++ } ++ } ++ ++ if (*ptimeout == -1 || *ptimeout > delta) ++ *ptimeout = delta; ++} ++ + void conn_free_buffered_data(struct connection *conn) + { + struct buffered_data *out; + + while ((out = list_top(&conn->out_list, struct buffered_data, list))) + free_buffered_data(out, conn); ++ ++ conn->timeout_msec = 0; + } + + static bool write_messages(struct connection *conn) +@@ -382,6 +457,7 @@ static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout) + { + struct connection *conn; + struct wrl_timestampt now; ++ uint64_t msecs; + + if (fds) + memset(fds, 0, sizeof(struct pollfd) * current_array_size); +@@ -402,10 +478,12 @@ static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout) + + wrl_gettime_now(&now); + wrl_log_periodic(now); ++ msecs = get_now_msec(); + + 
list_for_each_entry(conn, &connections, list) { + if (conn->domain) { + wrl_check_timeout(conn->domain, now, ptimeout); ++ check_event_timeout(conn, msecs, ptimeout); + if (domain_can_read(conn) || + (domain_can_write(conn) && + !list_empty(&conn->out_list))) +@@ -760,6 +838,7 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, + return; + bdata->inhdr = true; + bdata->used = 0; ++ bdata->timeout_msec = 0; + + if (len <= DEFAULT_BUFFER_SIZE) + bdata->buffer = bdata->default_buffer; +@@ -811,6 +890,12 @@ void send_event(struct connection *conn, const char *path, const char *token) + bdata->hdr.msg.type = XS_WATCH_EVENT; + bdata->hdr.msg.len = len; + ++ if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { ++ bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; ++ if (!conn->timeout_msec) ++ conn->timeout_msec = bdata->timeout_msec; ++ } ++ + /* Queue for later transmission. */ + list_add_tail(&bdata->list, &conn->out_list); + } +@@ -2099,6 +2184,9 @@ static void usage(void) + " -t, --transaction <nb> limit the number of transaction allowed per domain,\n" + " -A, --perm-nb <nb> limit the number of permissions per node,\n" + " -M, --path-max <chars> limit the allowed Xenstore node path length,\n" ++" -w, --timeout <what>=<seconds> set the timeout in seconds for <what>,\n" ++" allowed timeout candidates are:\n" ++" watch-event: time a watch-event is kept pending\n" + " -R, --no-recovery to request that no recovery should be attempted when\n" + " the store is corrupted (debug only),\n" + " -I, --internal-db store database in memory, not on disk\n" +@@ -2121,6 +2209,7 @@ static struct option options[] = { + { "transaction", 1, NULL, 't' }, + { "perm-nb", 1, NULL, 'A' }, + { "path-max", 1, NULL, 'M' }, ++ { "timeout", 1, NULL, 'w' }, + { "no-recovery", 0, NULL, 'R' }, + { "internal-db", 0, NULL, 'I' }, + { "verbose", 0, NULL, 'V' }, +@@ -2135,6 +2224,39 @@ int dom0_domid = 0; + int dom0_event = 0; + int priv_domid = 0; + ++static 
int get_optval_int(const char *arg) ++{ ++ char *end; ++ long val; ++ ++ val = strtol(arg, &end, 10); ++ if (!*arg || *end || val < 0 || val > INT_MAX) ++ barf("invalid parameter value \"%s\"\n", arg); ++ ++ return val; ++} ++ ++static bool what_matches(const char *arg, const char *what) ++{ ++ unsigned int what_len = strlen(what); ++ ++ return !strncmp(arg, what, what_len) && arg[what_len] == '='; ++} ++ ++static void set_timeout(const char *arg) ++{ ++ const char *eq = strchr(arg, '='); ++ int val; ++ ++ if (!eq) ++ barf("quotas must be specified via <what>=<seconds>\n"); ++ val = get_optval_int(eq + 1); ++ if (what_matches(arg, "watch-event")) ++ timeout_watch_event_msec = val * 1000; ++ else ++ barf("unknown timeout \"%s\"\n", arg); ++} ++ + int main(int argc, char *argv[]) + { + int opt; +@@ -2149,7 +2271,7 @@ int main(int argc, char *argv[]) + orig_argc = argc; + orig_argv = argv; + +- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:T:RVW:U", options, ++ while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:T:RVW:w:U", options, + NULL)) != -1) { + switch (opt) { + case 'D': +@@ -2198,6 +2320,9 @@ int main(int argc, char *argv[]) + quota_max_path_len = min(XENSTORE_REL_PATH_MAX, + quota_max_path_len); + break; ++ case 'w': ++ set_timeout(optarg); ++ break; + case 'e': + dom0_event = strtol(optarg, NULL, 10); + break; +@@ -2642,6 +2767,12 @@ static void add_buffered_data(struct buffered_data *bdata, + barf("error restoring buffered data"); + + memcpy(bdata->buffer, data, len); ++ if (bdata->hdr.msg.type == XS_WATCH_EVENT && timeout_watch_event_msec && ++ domain_is_unprivileged(conn)) { ++ bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; ++ if (!conn->timeout_msec) ++ conn->timeout_msec = bdata->timeout_msec; ++ } + + /* Queue for later transmission. 
*/ + list_add_tail(&bdata->list, &conn->out_list); +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index 0ba5b783d4d1..2db577928fc6 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -27,6 +27,7 @@ + #include <dirent.h> + #include <stdbool.h> + #include <stdint.h> ++#include <time.h> + #include <errno.h> + + #include "xenstore_lib.h" +@@ -67,6 +68,8 @@ struct buffered_data + char raw[sizeof(struct xsd_sockmsg)]; + } hdr; + ++ uint64_t timeout_msec; ++ + /* The actual data. */ + char *buffer; + char default_buffer[DEFAULT_BUFFER_SIZE]; +@@ -110,6 +113,7 @@ struct connection + + /* Buffered output data */ + struct list_head out_list; ++ uint64_t timeout_msec; + + /* Transaction context for current request (NULL if none). */ + struct transaction *transaction; +@@ -237,6 +241,8 @@ extern int dom0_event; + extern int priv_domid; + extern int quota_nb_entry_per_domain; + ++extern unsigned int timeout_watch_event_msec; ++ + /* Map the kernel's xenstore page. */ + void *xenbus_map(void); + void unmap_xenbus(void *interface); +-- +2.37.4 + diff --git a/0089-tools-xenstore-limit-outstanding-requests.patch b/0089-tools-xenstore-limit-outstanding-requests.patch new file mode 100644 index 0000000..2e110b0 --- /dev/null +++ b/0089-tools-xenstore-limit-outstanding-requests.patch @@ -0,0 +1,453 @@ +From 56300e8e1781cee1b6a514e5f2bea234a7885d55 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:08 +0200 +Subject: [PATCH 089/126] tools/xenstore: limit outstanding requests + +Add another quota for limiting the number of outstanding requests of a +guest. As the way to specify quotas on the command line is becoming +rather nasty, switch to a new scheme using [--quota|-Q] <what>=<val> +allowing to add more quotas in future easily. + +Set the default value to 20 (basically a random value not seeming to +be too high or too low). 
+ +A request is said to be outstanding if any message generated by this +request (the direct response plus potential watch events) is not yet +completely stored into a ring buffer. The initial watch event sent as +a result of registering a watch is an exception. + +Note that across a live update the relation to buffered watch events +for other domains is lost. + +Use talloc_zero() for allocating the domain structure in order to have +all per-domain quota zeroed initially. + +This is part of XSA-326 / CVE-2022-42312. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 36de433a273f55d614c83b89c9a8972287a1e475) +--- + tools/xenstore/xenstored_core.c | 88 +++++++++++++++++++++++++++++-- + tools/xenstore/xenstored_core.h | 20 ++++++- + tools/xenstore/xenstored_domain.c | 38 ++++++++++--- + tools/xenstore/xenstored_domain.h | 3 ++ + tools/xenstore/xenstored_watch.c | 15 ++++-- + 5 files changed, 150 insertions(+), 14 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index ee3396fefa94..d871f217af9c 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -107,6 +107,7 @@ int quota_max_entry_size = 2048; /* 2K */ + int quota_max_transaction = 10; + int quota_nb_perms_per_node = 5; + int quota_max_path_len = XENSTORE_REL_PATH_MAX; ++int quota_req_outstanding = 20; + + unsigned int timeout_watch_event_msec = 20000; + +@@ -223,12 +224,24 @@ static uint64_t get_now_msec(void) + return now_ts.tv_sec * 1000 + now_ts.tv_nsec / 1000000; + } + ++/* ++ * Remove a struct buffered_data from the list of outgoing data. ++ * A struct buffered_data related to a request having caused watch events to be ++ * sent is kept until all those events have been written out. 
++ * Each watch event is referencing the related request via pend.req, while the ++ * number of watch events caused by a request is kept in pend.ref.event_cnt ++ * (those two cases are mutually exclusive, so the two fields can share memory ++ * via a union). ++ * The struct buffered_data is freed only if no related watch event is ++ * referencing it. The related return data can be freed right away. ++ */ + static void free_buffered_data(struct buffered_data *out, + struct connection *conn) + { + struct buffered_data *req; + + list_del(&out->list); ++ out->on_out_list = false; + + /* + * Update conn->timeout_msec with the next found timeout value in the +@@ -244,6 +257,30 @@ static void free_buffered_data(struct buffered_data *out, + } + } + ++ if (out->hdr.msg.type == XS_WATCH_EVENT) { ++ req = out->pend.req; ++ if (req) { ++ req->pend.ref.event_cnt--; ++ if (!req->pend.ref.event_cnt && !req->on_out_list) { ++ if (req->on_ref_list) { ++ domain_outstanding_domid_dec( ++ req->pend.ref.domid); ++ list_del(&req->list); ++ } ++ talloc_free(req); ++ } ++ } ++ } else if (out->pend.ref.event_cnt) { ++ /* Hang out off from conn. */ ++ talloc_steal(NULL, out); ++ if (out->buffer != out->default_buffer) ++ talloc_free(out->buffer); ++ list_add(&out->list, &conn->ref_list); ++ out->on_ref_list = true; ++ return; ++ } else ++ domain_outstanding_dec(conn); ++ + talloc_free(out); + } + +@@ -399,6 +436,7 @@ int delay_request(struct connection *conn, struct buffered_data *in, + static int destroy_conn(void *_conn) + { + struct connection *conn = _conn; ++ struct buffered_data *req; + + /* Flush outgoing if possible, but don't block. 
*/ + if (!conn->domain) { +@@ -412,6 +450,11 @@ static int destroy_conn(void *_conn) + break; + close(conn->fd); + } ++ ++ conn_free_buffered_data(conn); ++ list_for_each_entry(req, &conn->ref_list, list) ++ req->on_ref_list = false; ++ + if (conn->target) + talloc_unlink(conn, conn->target); + list_del(&conn->list); +@@ -859,6 +902,8 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, + + /* Queue for later transmission. */ + list_add_tail(&bdata->list, &conn->out_list); ++ bdata->on_out_list = true; ++ domain_outstanding_inc(conn); + } + + /* +@@ -866,7 +911,8 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, + * As this is not directly related to the current command, errors can't be + * reported. + */ +-void send_event(struct connection *conn, const char *path, const char *token) ++void send_event(struct buffered_data *req, struct connection *conn, ++ const char *path, const char *token) + { + struct buffered_data *bdata; + unsigned int len; +@@ -896,8 +942,13 @@ void send_event(struct connection *conn, const char *path, const char *token) + conn->timeout_msec = bdata->timeout_msec; + } + ++ bdata->pend.req = req; ++ if (req) ++ req->pend.ref.event_cnt++; ++ + /* Queue for later transmission. */ + list_add_tail(&bdata->list, &conn->out_list); ++ bdata->on_out_list = true; + } + + /* Some routines (write, mkdir, etc) just need a non-error return */ +@@ -1658,6 +1709,7 @@ static void handle_input(struct connection *conn) + return; + } + in = conn->in; ++ in->pend.ref.domid = conn->id; + + /* Not finished header yet? 
*/ + if (in->inhdr) { +@@ -1727,6 +1779,7 @@ struct connection *new_connection(connwritefn_t *write, connreadfn_t *read) + new->is_ignored = false; + new->transaction_started = 0; + INIT_LIST_HEAD(&new->out_list); ++ INIT_LIST_HEAD(&new->ref_list); + INIT_LIST_HEAD(&new->watches); + INIT_LIST_HEAD(&new->transaction_list); + INIT_LIST_HEAD(&new->delayed); +@@ -2184,6 +2237,9 @@ static void usage(void) + " -t, --transaction <nb> limit the number of transaction allowed per domain,\n" + " -A, --perm-nb <nb> limit the number of permissions per node,\n" + " -M, --path-max <chars> limit the allowed Xenstore node path length,\n" ++" -Q, --quota <what>=<nb> set the quota <what> to the value <nb>, allowed\n" ++" quotas are:\n" ++" outstanding: number of outstanding requests\n" + " -w, --timeout <what>=<seconds> set the timeout in seconds for <what>,\n" + " allowed timeout candidates are:\n" + " watch-event: time a watch-event is kept pending\n" +@@ -2209,6 +2265,7 @@ static struct option options[] = { + { "transaction", 1, NULL, 't' }, + { "perm-nb", 1, NULL, 'A' }, + { "path-max", 1, NULL, 'M' }, ++ { "quota", 1, NULL, 'Q' }, + { "timeout", 1, NULL, 'w' }, + { "no-recovery", 0, NULL, 'R' }, + { "internal-db", 0, NULL, 'I' }, +@@ -2257,6 +2314,20 @@ static void set_timeout(const char *arg) + barf("unknown timeout \"%s\"\n", arg); + } + ++static void set_quota(const char *arg) ++{ ++ const char *eq = strchr(arg, '='); ++ int val; ++ ++ if (!eq) ++ barf("quotas must be specified via <what>=<nb>\n"); ++ val = get_optval_int(eq + 1); ++ if (what_matches(arg, "outstanding")) ++ quota_req_outstanding = val; ++ else ++ barf("unknown quota \"%s\"\n", arg); ++} ++ + int main(int argc, char *argv[]) + { + int opt; +@@ -2271,8 +2342,8 @@ int main(int argc, char *argv[]) + orig_argc = argc; + orig_argv = argv; + +- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:T:RVW:w:U", options, +- NULL)) != -1) { ++ while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:T:RVW:w:U", ++ 
options, NULL)) != -1) { + switch (opt) { + case 'D': + no_domain_init = true; +@@ -2320,6 +2391,9 @@ int main(int argc, char *argv[]) + quota_max_path_len = min(XENSTORE_REL_PATH_MAX, + quota_max_path_len); + break; ++ case 'Q': ++ set_quota(optarg); ++ break; + case 'w': + set_timeout(optarg); + break; +@@ -2776,6 +2850,14 @@ static void add_buffered_data(struct buffered_data *bdata, + + /* Queue for later transmission. */ + list_add_tail(&bdata->list, &conn->out_list); ++ bdata->on_out_list = true; ++ /* ++ * Watch events are never "outstanding", but the request causing them ++ * are instead kept "outstanding" until all watch events caused by that ++ * request have been delivered. ++ */ ++ if (bdata->hdr.msg.type != XS_WATCH_EVENT) ++ domain_outstanding_inc(conn); + } + + void read_state_buffered_data(const void *ctx, struct connection *conn, +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index 2db577928fc6..fcb27399f116 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -56,6 +56,8 @@ struct xs_state_connection; + struct buffered_data + { + struct list_head list; ++ bool on_out_list; ++ bool on_ref_list; + + /* Are we still doing the header? */ + bool inhdr; +@@ -63,6 +65,17 @@ struct buffered_data + /* How far are we? */ + unsigned int used; + ++ /* Outstanding request accounting. */ ++ union { ++ /* ref is being used for requests. */ ++ struct { ++ unsigned int event_cnt; /* # of outstanding events. */ ++ unsigned int domid; /* domid of request. */ ++ } ref; ++ /* req is being used for watch events. */ ++ struct buffered_data *req; /* request causing event. */ ++ } pend; ++ + union { + struct xsd_sockmsg msg; + char raw[sizeof(struct xsd_sockmsg)]; +@@ -115,6 +128,9 @@ struct connection + struct list_head out_list; + uint64_t timeout_msec; + ++ /* Referenced requests no longer pending. */ ++ struct list_head ref_list; ++ + /* Transaction context for current request (NULL if none). 
*/ + struct transaction *transaction; + +@@ -184,7 +200,8 @@ unsigned int get_string(const struct buffered_data *data, unsigned int offset); + + void send_reply(struct connection *conn, enum xsd_sockmsg_type type, + const void *data, unsigned int len); +-void send_event(struct connection *conn, const char *path, const char *token); ++void send_event(struct buffered_data *req, struct connection *conn, ++ const char *path, const char *token); + + /* Some routines (write, mkdir, etc) just need a non-error return */ + void send_ack(struct connection *conn, enum xsd_sockmsg_type type); +@@ -240,6 +257,7 @@ extern int dom0_domid; + extern int dom0_event; + extern int priv_domid; + extern int quota_nb_entry_per_domain; ++extern int quota_req_outstanding; + + extern unsigned int timeout_watch_event_msec; + +diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c +index 72a5cd3b9aaf..979f8c629835 100644 +--- a/tools/xenstore/xenstored_domain.c ++++ b/tools/xenstore/xenstored_domain.c +@@ -78,6 +78,9 @@ struct domain + /* number of watch for this domain */ + int nbwatch; + ++ /* Number of outstanding requests. 
*/ ++ int nboutstanding; ++ + /* write rate limit */ + wrl_creditt wrl_credit; /* [ -wrl_config_writecost, +_dburst ] */ + struct wrl_timestampt wrl_timestamp; +@@ -287,8 +290,12 @@ bool domain_can_read(struct connection *conn) + { + struct xenstore_domain_interface *intf = conn->domain->interface; + +- if (domain_is_unprivileged(conn) && conn->domain->wrl_credit < 0) +- return false; ++ if (domain_is_unprivileged(conn)) { ++ if (conn->domain->wrl_credit < 0) ++ return false; ++ if (conn->domain->nboutstanding >= quota_req_outstanding) ++ return false; ++ } + + if (conn->is_ignored) + return false; +@@ -337,7 +344,7 @@ static struct domain *alloc_domain(const void *context, unsigned int domid) + { + struct domain *domain; + +- domain = talloc(context, struct domain); ++ domain = talloc_zero(context, struct domain); + if (!domain) { + errno = ENOMEM; + return NULL; +@@ -398,9 +405,6 @@ static int new_domain(struct domain *domain, int port, bool restore) + domain->conn->domain = domain; + domain->conn->id = domain->domid; + +- domain->nbentry = 0; +- domain->nbwatch = 0; +- + return 0; + } + +@@ -944,6 +948,28 @@ int domain_watch(struct connection *conn) + : 0; + } + ++void domain_outstanding_inc(struct connection *conn) ++{ ++ if (!conn || !conn->domain) ++ return; ++ conn->domain->nboutstanding++; ++} ++ ++void domain_outstanding_dec(struct connection *conn) ++{ ++ if (!conn || !conn->domain) ++ return; ++ conn->domain->nboutstanding--; ++} ++ ++void domain_outstanding_domid_dec(unsigned int domid) ++{ ++ struct domain *d = find_domain_by_domid(domid); ++ ++ if (d) ++ d->nboutstanding--; ++} ++ + static wrl_creditt wrl_config_writecost = WRL_FACTOR; + static wrl_creditt wrl_config_rate = WRL_RATE * WRL_FACTOR; + static wrl_creditt wrl_config_dburst = WRL_DBURST * WRL_FACTOR; +diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h +index dc9759171317..5757a6557146 100644 +--- a/tools/xenstore/xenstored_domain.h ++++ 
b/tools/xenstore/xenstored_domain.h +@@ -68,6 +68,9 @@ int domain_entry(struct connection *conn); + void domain_watch_inc(struct connection *conn); + void domain_watch_dec(struct connection *conn); + int domain_watch(struct connection *conn); ++void domain_outstanding_inc(struct connection *conn); ++void domain_outstanding_dec(struct connection *conn); ++void domain_outstanding_domid_dec(unsigned int domid); + + /* Special node permission handling. */ + int set_perms_special(struct connection *conn, const char *name, +diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c +index bc6d833028a3..1d664e3d6b72 100644 +--- a/tools/xenstore/xenstored_watch.c ++++ b/tools/xenstore/xenstored_watch.c +@@ -142,6 +142,7 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, + struct node *node, bool exact, struct node_perms *perms) + { + struct connection *i; ++ struct buffered_data *req; + struct watch *watch; + + /* During transactions, don't fire watches, but queue them. */ +@@ -150,6 +151,8 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, + return; + } + ++ req = domain_is_unprivileged(conn) ? conn->in : NULL; ++ + /* Create an event for each watch. */ + list_for_each_entry(i, &connections, list) { + /* introduce/release domain watches */ +@@ -164,12 +167,12 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, + list_for_each_entry(watch, &i->watches, list) { + if (exact) { + if (streq(name, watch->node)) +- send_event(i, ++ send_event(req, i, + get_watch_path(watch, name), + watch->token); + } else { + if (is_child(name, watch->node)) +- send_event(i, ++ send_event(req, i, + get_watch_path(watch, name), + watch->token); + } +@@ -269,8 +272,12 @@ int do_watch(struct connection *conn, struct buffered_data *in) + trace_create(watch, "watch"); + send_ack(conn, XS_WATCH); + +- /* We fire once up front: simplifies clients and restart. 
*/ +- send_event(conn, get_watch_path(watch, watch->node), watch->token); ++ /* ++ * We fire once up front: simplifies clients and restart. ++ * This event will not be linked to the XS_WATCH request. ++ */ ++ send_event(NULL, conn, get_watch_path(watch, watch->node), ++ watch->token); + + return 0; + } +-- +2.37.4 + diff --git a/0090-tools-xenstore-don-t-buffer-multiple-identical-watch.patch b/0090-tools-xenstore-don-t-buffer-multiple-identical-watch.patch new file mode 100644 index 0000000..305d8ac --- /dev/null +++ b/0090-tools-xenstore-don-t-buffer-multiple-identical-watch.patch @@ -0,0 +1,93 @@ +From 97c251f953c58aec7620499ac12924054b7cd758 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:08 +0200 +Subject: [PATCH 090/126] tools/xenstore: don't buffer multiple identical watch + events + +A guest not reading its Xenstore response buffer fast enough might +pile up lots of Xenstore watch events buffered. Reduce the generated +load by dropping new events which already have an identical copy +pending. + +The special events "@..." are excluded from that handling as there are +known use cases where the handler is relying on each event to be sent +individually. + +This is part of XSA-326. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit b5c0bdb96d33e18c324c13d8e33c08732d77eaa2) +--- + tools/xenstore/xenstored_core.c | 20 +++++++++++++++++++- + tools/xenstore/xenstored_core.h | 3 +++ + 2 files changed, 22 insertions(+), 1 deletion(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index d871f217af9c..6ea06e20df91 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -882,6 +882,7 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, + bdata->inhdr = true; + bdata->used = 0; + bdata->timeout_msec = 0; ++ bdata->watch_event = false; + + if (len <= DEFAULT_BUFFER_SIZE) + bdata->buffer = bdata->default_buffer; +@@ -914,7 +915,7 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, + void send_event(struct buffered_data *req, struct connection *conn, + const char *path, const char *token) + { +- struct buffered_data *bdata; ++ struct buffered_data *bdata, *bd; + unsigned int len; + + len = strlen(path) + 1 + strlen(token) + 1; +@@ -936,12 +937,29 @@ void send_event(struct buffered_data *req, struct connection *conn, + bdata->hdr.msg.type = XS_WATCH_EVENT; + bdata->hdr.msg.len = len; + ++ /* ++ * Check whether an identical event is pending already. ++ * Special events are excluded from that check. 
++ */ ++ if (path[0] != '@') { ++ list_for_each_entry(bd, &conn->out_list, list) { ++ if (bd->watch_event && bd->hdr.msg.len == len && ++ !memcmp(bdata->buffer, bd->buffer, len)) { ++ trace("dropping duplicate watch %s %s for domain %u\n", ++ path, token, conn->id); ++ talloc_free(bdata); ++ return; ++ } ++ } ++ } ++ + if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { + bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; + if (!conn->timeout_msec) + conn->timeout_msec = bdata->timeout_msec; + } + ++ bdata->watch_event = true; + bdata->pend.req = req; + if (req) + req->pend.ref.event_cnt++; +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index fcb27399f116..afbd982c2654 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -62,6 +62,9 @@ struct buffered_data + /* Are we still doing the header? */ + bool inhdr; + ++ /* Is this a watch event? */ ++ bool watch_event; ++ + /* How far are we? */ + unsigned int used; + +-- +2.37.4 + diff --git a/0091-tools-xenstore-fix-connection-id-usage.patch b/0091-tools-xenstore-fix-connection-id-usage.patch new file mode 100644 index 0000000..dd7f382 --- /dev/null +++ b/0091-tools-xenstore-fix-connection-id-usage.patch @@ -0,0 +1,61 @@ +From 3e51699fcc578c7c005fd4add70cf7c8117d0af9 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:08 +0200 +Subject: [PATCH 091/126] tools/xenstore: fix connection->id usage + +Don't use conn->id for privilege checks, but domain_is_unprivileged(). + +This is part of XSA-326. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 3047df38e1991510bc295e3e1bb6b6b6c4a97831) +--- + tools/xenstore/xenstored_control.c | 2 +- + tools/xenstore/xenstored_core.h | 2 +- + tools/xenstore/xenstored_transaction.c | 3 ++- + 3 files changed, 4 insertions(+), 3 deletions(-) + +diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c +index 8e470f2b2056..211fe1fd9b37 100644 +--- a/tools/xenstore/xenstored_control.c ++++ b/tools/xenstore/xenstored_control.c +@@ -821,7 +821,7 @@ int do_control(struct connection *conn, struct buffered_data *in) + unsigned int cmd, num, off; + char **vec = NULL; + +- if (conn->id != 0) ++ if (domain_is_unprivileged(conn)) + return EACCES; + + off = get_string(in, 0); +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index afbd982c2654..c0a056ce13fe 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -118,7 +118,7 @@ struct connection + /* The index of pollfd in global pollfd array */ + int pollfd_idx; + +- /* Who am I? 0 for socket connections. */ ++ /* Who am I? Domid of connection. */ + unsigned int id; + + /* Is this connection ignored? 
*/ +diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c +index 54432907fc76..ee1b09031a3b 100644 +--- a/tools/xenstore/xenstored_transaction.c ++++ b/tools/xenstore/xenstored_transaction.c +@@ -477,7 +477,8 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in) + if (conn->transaction) + return EBUSY; + +- if (conn->id && conn->transaction_started > quota_max_transaction) ++ if (domain_is_unprivileged(conn) && ++ conn->transaction_started > quota_max_transaction) + return ENOSPC; + + /* Attach transaction to input for autofree until it's complete */ +-- +2.37.4 + diff --git a/0092-tools-xenstore-simplify-and-fix-per-domain-node-acco.patch b/0092-tools-xenstore-simplify-and-fix-per-domain-node-acco.patch new file mode 100644 index 0000000..01f29b1 --- /dev/null +++ b/0092-tools-xenstore-simplify-and-fix-per-domain-node-acco.patch @@ -0,0 +1,336 @@ +From 8ee7ed7c1ef435f43edc08be07c036d81642d8e1 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:08 +0200 +Subject: [PATCH 092/126] tools/xenstore: simplify and fix per domain node + accounting + +The accounting of nodes can be simplified now that each connection +holds the associated domid. + +Fix the node accounting to cover nodes created for a domain before it +has been introduced. This requires to react properly to an allocation +failure inside domain_entry_inc() by returning an error code. + +Especially in error paths the node accounting has to be fixed in some +cases. + +This is part of XSA-326 / CVE-2022-42313. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit dbef1f7482894c572d90cd73d99ed689c891e863) +--- + tools/xenstore/xenstored_core.c | 43 ++++++++-- + tools/xenstore/xenstored_domain.c | 105 ++++++++++++++++--------- + tools/xenstore/xenstored_domain.h | 4 +- + tools/xenstore/xenstored_transaction.c | 8 +- + 4 files changed, 109 insertions(+), 51 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 6ea06e20df91..85c0d2f38fac 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -603,7 +603,7 @@ struct node *read_node(struct connection *conn, const void *ctx, + + /* Permissions are struct xs_permissions. */ + node->perms.p = hdr->perms; +- if (domain_adjust_node_perms(node)) { ++ if (domain_adjust_node_perms(conn, node)) { + talloc_free(node); + return NULL; + } +@@ -625,7 +625,7 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, + void *p; + struct xs_tdb_record_hdr *hdr; + +- if (domain_adjust_node_perms(node)) ++ if (domain_adjust_node_perms(conn, node)) + return errno; + + data.dsize = sizeof(*hdr) +@@ -1238,13 +1238,17 @@ nomem: + return NULL; + } + +-static int destroy_node(struct connection *conn, struct node *node) ++static void destroy_node_rm(struct node *node) + { + if (streq(node->name, "/")) + corrupt(NULL, "Destroying root node!"); + + tdb_delete(tdb_ctx, node->key); ++} + ++static int destroy_node(struct connection *conn, struct node *node) ++{ ++ destroy_node_rm(node); + domain_entry_dec(conn, node); + + /* +@@ -1294,8 +1298,12 @@ static struct node *create_node(struct connection *conn, const void *ctx, + goto err; + + /* Account for new node */ +- if (i->parent) +- domain_entry_inc(conn, i); ++ if (i->parent) { ++ if (domain_entry_inc(conn, i)) { ++ destroy_node_rm(i); ++ return NULL; ++ } ++ } + } + + return node; +@@ -1580,10 +1588,27 @@ static int 
do_set_perms(struct connection *conn, struct buffered_data *in) + old_perms = node->perms; + domain_entry_dec(conn, node); + node->perms = perms; +- domain_entry_inc(conn, node); ++ if (domain_entry_inc(conn, node)) { ++ node->perms = old_perms; ++ /* ++ * This should never fail because we had a reference on the ++ * domain before and Xenstored is single-threaded. ++ */ ++ domain_entry_inc(conn, node); ++ return ENOMEM; ++ } + +- if (write_node(conn, node, false)) ++ if (write_node(conn, node, false)) { ++ int saved_errno = errno; ++ ++ domain_entry_dec(conn, node); ++ node->perms = old_perms; ++ /* No failure possible as above. */ ++ domain_entry_inc(conn, node); ++ ++ errno = saved_errno; + return errno; ++ } + + fire_watches(conn, in, name, node, false, &old_perms); + send_ack(conn, XS_SET_PERMS); +@@ -3003,7 +3028,9 @@ void read_state_node(const void *ctx, const void *state) + set_tdb_key(name, &key); + if (write_node_raw(NULL, &key, node, true)) + barf("write node error restoring node"); +- domain_entry_inc(&conn, node); ++ ++ if (domain_entry_inc(&conn, node)) ++ barf("node accounting error restoring node"); + + talloc_free(node); + } +diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c +index 979f8c629835..3c27973fb836 100644 +--- a/tools/xenstore/xenstored_domain.c ++++ b/tools/xenstore/xenstored_domain.c +@@ -16,6 +16,7 @@ + along with this program; If not, see <http://www.gnu.org/licenses/>. + */ + ++#include <assert.h> + #include <stdio.h> + #include <sys/mman.h> + #include <unistd.h> +@@ -369,6 +370,18 @@ static struct domain *find_or_alloc_domain(const void *ctx, unsigned int domid) + return domain ? 
: alloc_domain(ctx, domid); + } + ++static struct domain *find_or_alloc_existing_domain(unsigned int domid) ++{ ++ struct domain *domain; ++ xc_dominfo_t dominfo; ++ ++ domain = find_domain_struct(domid); ++ if (!domain && get_domain_info(domid, &dominfo)) ++ domain = alloc_domain(NULL, domid); ++ ++ return domain; ++} ++ + static int new_domain(struct domain *domain, int port, bool restore) + { + int rc; +@@ -788,30 +801,28 @@ void domain_deinit(void) + xenevtchn_unbind(xce_handle, virq_port); + } + +-void domain_entry_inc(struct connection *conn, struct node *node) ++int domain_entry_inc(struct connection *conn, struct node *node) + { + struct domain *d; ++ unsigned int domid; + + if (!conn) +- return; ++ return 0; + +- if (node->perms.p && node->perms.p[0].id != conn->id) { +- if (conn->transaction) { +- transaction_entry_inc(conn->transaction, +- node->perms.p[0].id); +- } else { +- d = find_domain_by_domid(node->perms.p[0].id); +- if (d) +- d->nbentry++; +- } +- } else if (conn->domain) { +- if (conn->transaction) { +- transaction_entry_inc(conn->transaction, +- conn->domain->domid); +- } else { +- conn->domain->nbentry++; +- } ++ domid = node->perms.p ? node->perms.p[0].id : conn->id; ++ ++ if (conn->transaction) { ++ transaction_entry_inc(conn->transaction, domid); ++ } else { ++ d = (domid == conn->id && conn->domain) ? conn->domain ++ : find_or_alloc_existing_domain(domid); ++ if (d) ++ d->nbentry++; ++ else ++ return ENOMEM; + } ++ ++ return 0; + } + + /* +@@ -847,7 +858,7 @@ static int chk_domain_generation(unsigned int domid, uint64_t gen) + * Remove permissions for no longer existing domains in order to avoid a new + * domain with the same domid inheriting the permissions. 
+ */ +-int domain_adjust_node_perms(struct node *node) ++int domain_adjust_node_perms(struct connection *conn, struct node *node) + { + unsigned int i; + int ret; +@@ -857,8 +868,14 @@ int domain_adjust_node_perms(struct node *node) + return errno; + + /* If the owner doesn't exist any longer give it to priv domain. */ +- if (!ret) ++ if (!ret) { ++ /* ++ * In theory we'd need to update the number of dom0 nodes here, ++ * but we could be called for a read of the node. So better ++ * avoid the risk to overflow the node count of dom0. ++ */ + node->perms.p[0].id = priv_domid; ++ } + + for (i = 1; i < node->perms.num; i++) { + if (node->perms.p[i].perms & XS_PERM_IGNORE) +@@ -877,25 +894,25 @@ int domain_adjust_node_perms(struct node *node) + void domain_entry_dec(struct connection *conn, struct node *node) + { + struct domain *d; ++ unsigned int domid; + + if (!conn) + return; + +- if (node->perms.p && node->perms.p[0].id != conn->id) { +- if (conn->transaction) { +- transaction_entry_dec(conn->transaction, +- node->perms.p[0].id); ++ domid = node->perms.p ? node->perms.p[0].id : conn->id; ++ ++ if (conn->transaction) { ++ transaction_entry_dec(conn->transaction, domid); ++ } else { ++ d = (domid == conn->id && conn->domain) ? 
conn->domain ++ : find_domain_struct(domid); ++ if (d) { ++ d->nbentry--; + } else { +- d = find_domain_by_domid(node->perms.p[0].id); +- if (d && d->nbentry) +- d->nbentry--; +- } +- } else if (conn->domain && conn->domain->nbentry) { +- if (conn->transaction) { +- transaction_entry_dec(conn->transaction, +- conn->domain->domid); +- } else { +- conn->domain->nbentry--; ++ errno = ENOENT; ++ corrupt(conn, ++ "Node \"%s\" owned by non-existing domain %u\n", ++ node->name, domid); + } + } + } +@@ -905,13 +922,23 @@ int domain_entry_fix(unsigned int domid, int num, bool update) + struct domain *d; + int cnt; + +- d = find_domain_by_domid(domid); +- if (!d) +- return 0; ++ if (update) { ++ d = find_domain_struct(domid); ++ assert(d); ++ } else { ++ /* ++ * We are called first with update == false in order to catch ++ * any error. So do a possible allocation and check for error ++ * only in this case, as in the case of update == true nothing ++ * can go wrong anymore as the allocation already happened. ++ */ ++ d = find_or_alloc_existing_domain(domid); ++ if (!d) ++ return -1; ++ } + + cnt = d->nbentry + num; +- if (cnt < 0) +- cnt = 0; ++ assert(cnt >= 0); + + if (update) + d->nbentry = cnt; +diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h +index 5757a6557146..cce13d14f016 100644 +--- a/tools/xenstore/xenstored_domain.h ++++ b/tools/xenstore/xenstored_domain.h +@@ -58,10 +58,10 @@ bool domain_can_write(struct connection *conn); + bool domain_is_unprivileged(struct connection *conn); + + /* Remove node permissions for no longer existing domains. 
*/ +-int domain_adjust_node_perms(struct node *node); ++int domain_adjust_node_perms(struct connection *conn, struct node *node); + + /* Quota manipulation */ +-void domain_entry_inc(struct connection *conn, struct node *); ++int domain_entry_inc(struct connection *conn, struct node *); + void domain_entry_dec(struct connection *conn, struct node *); + int domain_entry_fix(unsigned int domid, int num, bool update); + int domain_entry(struct connection *conn); +diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c +index ee1b09031a3b..86caf6c398be 100644 +--- a/tools/xenstore/xenstored_transaction.c ++++ b/tools/xenstore/xenstored_transaction.c +@@ -519,8 +519,12 @@ static int transaction_fix_domains(struct transaction *trans, bool update) + + list_for_each_entry(d, &trans->changed_domains, list) { + cnt = domain_entry_fix(d->domid, d->nbentry, update); +- if (!update && cnt >= quota_nb_entry_per_domain) +- return ENOSPC; ++ if (!update) { ++ if (cnt >= quota_nb_entry_per_domain) ++ return ENOSPC; ++ if (cnt < 0) ++ return ENOMEM; ++ } + } + + return 0; +-- +2.37.4 + diff --git a/0093-tools-xenstore-limit-max-number-of-nodes-accessed-in.patch b/0093-tools-xenstore-limit-max-number-of-nodes-accessed-in.patch new file mode 100644 index 0000000..f064355 --- /dev/null +++ b/0093-tools-xenstore-limit-max-number-of-nodes-accessed-in.patch @@ -0,0 +1,255 @@ +From 1035371fee5552b8cfe9819c4058a4c9e695ba5e Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:09 +0200 +Subject: [PATCH 093/126] tools/xenstore: limit max number of nodes accessed in + a transaction + +Today a guest is free to access as many nodes in a single transaction +as it wants. This can lead to unbounded memory consumption in Xenstore +as there is the need to keep track of all nodes having been accessed +during a transaction. 
+ +In oxenstored the number of requests in a transaction is being limited +via a quota maxrequests (default is 1024). As multiple accesses of a +node are not problematic in C Xenstore, limit the number of accessed +nodes. + +In order to let read_node() detect a quota error in case too many nodes +are being accessed, check the return value of access_node() and return +NULL in case an error has been seen. Introduce __must_check and add it +to the access_node() prototype. + +This is part of XSA-326 / CVE-2022-42314. + +Suggested-by: Julien Grall <julien@xen.org> +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 268369d8e322d227a74a899009c5748d7b0ea142) +--- + tools/include/xen-tools/libs.h | 4 +++ + tools/xenstore/xenstored_core.c | 50 ++++++++++++++++++-------- + tools/xenstore/xenstored_core.h | 1 + + tools/xenstore/xenstored_transaction.c | 9 +++++ + tools/xenstore/xenstored_transaction.h | 4 +-- + 5 files changed, 52 insertions(+), 16 deletions(-) + +diff --git a/tools/include/xen-tools/libs.h b/tools/include/xen-tools/libs.h +index a16e0c380709..bafc90e2f603 100644 +--- a/tools/include/xen-tools/libs.h ++++ b/tools/include/xen-tools/libs.h +@@ -63,4 +63,8 @@ + #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1)) + #endif + ++#ifndef __must_check ++#define __must_check __attribute__((__warn_unused_result__)) ++#endif ++ + #endif /* __XEN_TOOLS_LIBS__ */ +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 85c0d2f38fac..050d6f651ae9 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -106,6 +106,7 @@ int quota_nb_watch_per_domain = 128; + int quota_max_entry_size = 2048; /* 2K */ + int quota_max_transaction = 10; + int quota_nb_perms_per_node = 5; ++int quota_trans_nodes = 1024; + int quota_max_path_len = XENSTORE_REL_PATH_MAX; + int quota_req_outstanding = 20; + +@@ -560,6 +561,7 @@ struct node 
*read_node(struct connection *conn, const void *ctx, + TDB_DATA key, data; + struct xs_tdb_record_hdr *hdr; + struct node *node; ++ int err; + + node = talloc(ctx, struct node); + if (!node) { +@@ -581,14 +583,13 @@ struct node *read_node(struct connection *conn, const void *ctx, + if (data.dptr == NULL) { + if (tdb_error(tdb_ctx) == TDB_ERR_NOEXIST) { + node->generation = NO_GENERATION; +- access_node(conn, node, NODE_ACCESS_READ, NULL); +- errno = ENOENT; ++ err = access_node(conn, node, NODE_ACCESS_READ, NULL); ++ errno = err ? : ENOENT; + } else { + log("TDB error on read: %s", tdb_errorstr(tdb_ctx)); + errno = EIO; + } +- talloc_free(node); +- return NULL; ++ goto error; + } + + node->parent = NULL; +@@ -603,19 +604,36 @@ struct node *read_node(struct connection *conn, const void *ctx, + + /* Permissions are struct xs_permissions. */ + node->perms.p = hdr->perms; +- if (domain_adjust_node_perms(conn, node)) { +- talloc_free(node); +- return NULL; +- } ++ if (domain_adjust_node_perms(conn, node)) ++ goto error; + + /* Data is binary blob (usually ascii, no nul). */ + node->data = node->perms.p + hdr->num_perms; + /* Children is strings, nul separated. */ + node->children = node->data + node->datalen; + +- access_node(conn, node, NODE_ACCESS_READ, NULL); ++ if (access_node(conn, node, NODE_ACCESS_READ, NULL)) ++ goto error; + + return node; ++ ++ error: ++ err = errno; ++ talloc_free(node); ++ errno = err; ++ return NULL; ++} ++ ++static bool read_node_can_propagate_errno(void) ++{ ++ /* ++ * 2 error cases for read_node() can always be propagated up: ++ * ENOMEM, because this has nothing to do with the node being in the ++ * data base or not, but is caused by a general lack of memory. ++ * ENOSPC, because this is related to hitting quota limits which need ++ * to be respected. 
++ */ ++ return errno == ENOMEM || errno == ENOSPC; + } + + int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, +@@ -732,7 +750,7 @@ static int ask_parents(struct connection *conn, const void *ctx, + node = read_node(conn, ctx, name); + if (node) + break; +- if (errno == ENOMEM) ++ if (read_node_can_propagate_errno()) + return errno; + } while (!streq(name, "/")); + +@@ -795,7 +813,7 @@ static struct node *get_node(struct connection *conn, + } + } + /* Clean up errno if they weren't supposed to know. */ +- if (!node && errno != ENOMEM) ++ if (!node && !read_node_can_propagate_errno()) + errno = errno_from_parents(conn, ctx, name, errno, perm); + return node; + } +@@ -1201,7 +1219,7 @@ static struct node *construct_node(struct connection *conn, const void *ctx, + + /* If parent doesn't exist, create it. */ + parent = read_node(conn, parentname, parentname); +- if (!parent) ++ if (!parent && errno == ENOENT) + parent = construct_node(conn, ctx, parentname); + if (!parent) + return NULL; +@@ -1475,7 +1493,7 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, + + parent = read_node(conn, ctx, parentname); + if (!parent) +- return (errno == ENOMEM) ? ENOMEM : EINVAL; ++ return read_node_can_propagate_errno() ? errno : EINVAL; + node->parent = parent; + + return delete_node(conn, ctx, parent, node, false); +@@ -1505,7 +1523,7 @@ static int do_rm(struct connection *conn, struct buffered_data *in) + return 0; + } + /* Restore errno, just in case. 
*/ +- if (errno != ENOMEM) ++ if (!read_node_can_propagate_errno()) + errno = ENOENT; + } + return errno; +@@ -2282,6 +2300,8 @@ static void usage(void) + " -M, --path-max <chars> limit the allowed Xenstore node path length,\n" + " -Q, --quota <what>=<nb> set the quota <what> to the value <nb>, allowed\n" + " quotas are:\n" ++" transaction-nodes: number of accessed node per\n" ++" transaction\n" + " outstanding: number of outstanding requests\n" + " -w, --timeout <what>=<seconds> set the timeout in seconds for <what>,\n" + " allowed timeout candidates are:\n" +@@ -2367,6 +2387,8 @@ static void set_quota(const char *arg) + val = get_optval_int(eq + 1); + if (what_matches(arg, "outstanding")) + quota_req_outstanding = val; ++ else if (what_matches(arg, "transaction-nodes")) ++ quota_trans_nodes = val; + else + barf("unknown quota \"%s\"\n", arg); + } +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index c0a056ce13fe..1b3bd5ca563a 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -261,6 +261,7 @@ extern int dom0_event; + extern int priv_domid; + extern int quota_nb_entry_per_domain; + extern int quota_req_outstanding; ++extern int quota_trans_nodes; + + extern unsigned int timeout_watch_event_msec; + +diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c +index 86caf6c398be..7bd41eb475e3 100644 +--- a/tools/xenstore/xenstored_transaction.c ++++ b/tools/xenstore/xenstored_transaction.c +@@ -156,6 +156,9 @@ struct transaction + /* Connection-local identifier for this transaction. */ + uint32_t id; + ++ /* Node counter. */ ++ unsigned int nodes; ++ + /* Generation when transaction started. 
*/ + uint64_t generation; + +@@ -260,6 +263,11 @@ int access_node(struct connection *conn, struct node *node, + + i = find_accessed_node(trans, node->name); + if (!i) { ++ if (trans->nodes >= quota_trans_nodes && ++ domain_is_unprivileged(conn)) { ++ ret = ENOSPC; ++ goto err; ++ } + i = talloc_zero(trans, struct accessed_node); + if (!i) + goto nomem; +@@ -297,6 +305,7 @@ int access_node(struct connection *conn, struct node *node, + i->ta_node = true; + } + } ++ trans->nodes++; + list_add_tail(&i->list, &trans->accessed); + } + +diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h +index 0093cac807e3..e3cbd6b23095 100644 +--- a/tools/xenstore/xenstored_transaction.h ++++ b/tools/xenstore/xenstored_transaction.h +@@ -39,8 +39,8 @@ void transaction_entry_inc(struct transaction *trans, unsigned int domid); + void transaction_entry_dec(struct transaction *trans, unsigned int domid); + + /* This node was accessed. */ +-int access_node(struct connection *conn, struct node *node, +- enum node_access_type type, TDB_DATA *key); ++int __must_check access_node(struct connection *conn, struct node *node, ++ enum node_access_type type, TDB_DATA *key); + + /* Queue watches for a modified node. 
*/ + void queue_watches(struct connection *conn, const char *name, bool watch_exact); +-- +2.37.4 + diff --git a/0094-tools-xenstore-move-the-call-of-setup_structure-to-d.patch b/0094-tools-xenstore-move-the-call-of-setup_structure-to-d.patch new file mode 100644 index 0000000..4cebe89 --- /dev/null +++ b/0094-tools-xenstore-move-the-call-of-setup_structure-to-d.patch @@ -0,0 +1,96 @@ +From ccef72b6a885714dae0b6f1accb33042ee40e108 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:09 +0200 +Subject: [PATCH 094/126] tools/xenstore: move the call of setup_structure() to + dom0 introduction + +Setting up the basic structure when introducing dom0 has the advantage +to be able to add proper node memory accounting for the added nodes +later. + +This makes it possible to do proper node accounting, too. + +An additional requirement to make that work fine is to correct the +owner of the created nodes to be dom0_domid instead of domid 0. + +This is part of XSA-326. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 60e2f6020dea7f616857b8fc1141b1c085d88761) +--- + tools/xenstore/xenstored_core.c | 9 ++++----- + tools/xenstore/xenstored_core.h | 1 + + tools/xenstore/xenstored_domain.c | 3 +++ + 3 files changed, 8 insertions(+), 5 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 050d6f651ae9..51af74390cbe 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -1940,7 +1940,8 @@ static int tdb_flags; + static void manual_node(const char *name, const char *child) + { + struct node *node; +- struct xs_permissions perms = { .id = 0, .perms = XS_PERM_NONE }; ++ struct xs_permissions perms = { .id = dom0_domid, ++ .perms = XS_PERM_NONE }; + + node = talloc_zero(NULL, struct node); + if (!node) +@@ -1979,7 +1980,7 @@ static void tdb_logger(TDB_CONTEXT *tdb, int level, const char * fmt, ...) + } + } + +-static void setup_structure(bool live_update) ++void setup_structure(bool live_update) + { + char *tdbname; + +@@ -2002,6 +2003,7 @@ static void setup_structure(bool live_update) + manual_node("/", "tool"); + manual_node("/tool", "xenstored"); + manual_node("/tool/xenstored", NULL); ++ domain_entry_fix(dom0_domid, 3, true); + } + + check_store(); +@@ -2512,9 +2514,6 @@ int main(int argc, char *argv[]) + + init_pipe(reopen_log_pipe); + +- /* Setup the database */ +- setup_structure(live_update); +- + /* Listen to hypervisor. 
*/ + if (!no_domain_init && !live_update) { + domain_init(-1); +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index 1b3bd5ca563a..459698d8407a 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -224,6 +224,7 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, + struct node *read_node(struct connection *conn, const void *ctx, + const char *name); + ++void setup_structure(bool live_update); + struct connection *new_connection(connwritefn_t *write, connreadfn_t *read); + struct connection *get_connection_by_id(unsigned int conn_id); + void check_store(void); +diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c +index 3c27973fb836..0dd75a6a2194 100644 +--- a/tools/xenstore/xenstored_domain.c ++++ b/tools/xenstore/xenstored_domain.c +@@ -476,6 +476,9 @@ static struct domain *introduce_domain(const void *ctx, + } + domain->interface = interface; + ++ if (is_master_domain) ++ setup_structure(restore); ++ + /* Now domain belongs to its connection. */ + talloc_steal(domain->conn, domain); + +-- +2.37.4 + diff --git a/0095-tools-xenstore-add-infrastructure-to-keep-track-of-p.patch b/0095-tools-xenstore-add-infrastructure-to-keep-track-of-p.patch new file mode 100644 index 0000000..f826f80 --- /dev/null +++ b/0095-tools-xenstore-add-infrastructure-to-keep-track-of-p.patch @@ -0,0 +1,289 @@ +From aa29eb624797fb6825e4a23071c88417672868a4 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:09 +0200 +Subject: [PATCH 095/126] tools/xenstore: add infrastructure to keep track of + per domain memory usage + +The amount of memory a domain can consume in Xenstore is limited by +various quota today, but even with sane quota a domain can still +consume rather large memory quantities. + +Add the infrastructure for keeping track of the amount of memory a +domain is consuming in Xenstore. 
Note that this is only the memory a +domain has direct control over, so any internal administration data +needed by Xenstore only is not being accounted for. + +There are two quotas defined: a soft quota which will result in a +warning issued via syslog() when it is exceeded, and a hard quota +resulting in a stop of accepting further requests or watch events as +long as the hard quota would be violated by accepting those. + +Setting any of those quotas to 0 will disable it. + +As default values use 2MB per domain for the soft limit (this basically +covers the allowed case to create 1000 nodes needing 2kB each), and +2.5MB for the hard limit. + +This is part of XSA-326. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 0d4a8ec7a93faedbe54fd197db146de628459e77) +--- + tools/xenstore/xenstored_core.c | 30 ++++++++-- + tools/xenstore/xenstored_core.h | 2 + + tools/xenstore/xenstored_domain.c | 93 +++++++++++++++++++++++++++++++ + tools/xenstore/xenstored_domain.h | 20 +++++++ + 4 files changed, 139 insertions(+), 6 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 51af74390cbe..eeb0d893e8c3 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -109,6 +109,8 @@ int quota_nb_perms_per_node = 5; + int quota_trans_nodes = 1024; + int quota_max_path_len = XENSTORE_REL_PATH_MAX; + int quota_req_outstanding = 20; ++int quota_memory_per_domain_soft = 2 * 1024 * 1024; /* 2 MB */ ++int quota_memory_per_domain_hard = 2 * 1024 * 1024 + 512 * 1024; /* 2.5 MB */ + + unsigned int timeout_watch_event_msec = 20000; + +@@ -2304,7 +2306,14 @@ static void usage(void) + " quotas are:\n" + " transaction-nodes: number of accessed node per\n" + " transaction\n" ++" memory: total used memory per domain for nodes,\n" ++" transactions, watches and requests, above\n" ++" which Xenstore will stop talking to domain\n" + " outstanding: 
number of outstanding requests\n" ++" -q, --quota-soft <what>=<nb> set a soft quota <what> to the value <nb>,\n" ++" causing a warning to be issued via syslog() if the\n" ++" limit is violated, allowed quotas are:\n" ++" memory: see above\n" + " -w, --timeout <what>=<seconds> set the timeout in seconds for <what>,\n" + " allowed timeout candidates are:\n" + " watch-event: time a watch-event is kept pending\n" +@@ -2331,6 +2340,7 @@ static struct option options[] = { + { "perm-nb", 1, NULL, 'A' }, + { "path-max", 1, NULL, 'M' }, + { "quota", 1, NULL, 'Q' }, ++ { "quota-soft", 1, NULL, 'q' }, + { "timeout", 1, NULL, 'w' }, + { "no-recovery", 0, NULL, 'R' }, + { "internal-db", 0, NULL, 'I' }, +@@ -2379,7 +2389,7 @@ static void set_timeout(const char *arg) + barf("unknown timeout \"%s\"\n", arg); + } + +-static void set_quota(const char *arg) ++static void set_quota(const char *arg, bool soft) + { + const char *eq = strchr(arg, '='); + int val; +@@ -2387,11 +2397,16 @@ static void set_quota(const char *arg) + if (!eq) + barf("quotas must be specified via <what>=<nb>\n"); + val = get_optval_int(eq + 1); +- if (what_matches(arg, "outstanding")) ++ if (what_matches(arg, "outstanding") && !soft) + quota_req_outstanding = val; +- else if (what_matches(arg, "transaction-nodes")) ++ else if (what_matches(arg, "transaction-nodes") && !soft) + quota_trans_nodes = val; +- else ++ else if (what_matches(arg, "memory")) { ++ if (soft) ++ quota_memory_per_domain_soft = val; ++ else ++ quota_memory_per_domain_hard = val; ++ } else + barf("unknown quota \"%s\"\n", arg); + } + +@@ -2409,7 +2424,7 @@ int main(int argc, char *argv[]) + orig_argc = argc; + orig_argv = argv; + +- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:T:RVW:w:U", ++ while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:q:T:RVW:w:U", + options, NULL)) != -1) { + switch (opt) { + case 'D': +@@ -2459,7 +2474,10 @@ int main(int argc, char *argv[]) + quota_max_path_len); + break; + case 'Q': +- 
set_quota(optarg); ++ set_quota(optarg, false); ++ break; ++ case 'q': ++ set_quota(optarg, true); + break; + case 'w': + set_timeout(optarg); +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index 459698d8407a..2fb37dbfe847 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -263,6 +263,8 @@ extern int priv_domid; + extern int quota_nb_entry_per_domain; + extern int quota_req_outstanding; + extern int quota_trans_nodes; ++extern int quota_memory_per_domain_soft; ++extern int quota_memory_per_domain_hard; + + extern unsigned int timeout_watch_event_msec; + +diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c +index 0dd75a6a2194..ec542df6a67e 100644 +--- a/tools/xenstore/xenstored_domain.c ++++ b/tools/xenstore/xenstored_domain.c +@@ -76,6 +76,13 @@ struct domain + /* number of entry from this domain in the store */ + int nbentry; + ++ /* Amount of memory allocated for this domain. */ ++ int memory; ++ bool soft_quota_reported; ++ bool hard_quota_reported; ++ time_t mem_last_msg; ++#define MEM_WARN_MINTIME_SEC 10 ++ + /* number of watch for this domain */ + int nbwatch; + +@@ -296,6 +303,9 @@ bool domain_can_read(struct connection *conn) + return false; + if (conn->domain->nboutstanding >= quota_req_outstanding) + return false; ++ if (conn->domain->memory >= quota_memory_per_domain_hard && ++ quota_memory_per_domain_hard) ++ return false; + } + + if (conn->is_ignored) +@@ -956,6 +966,89 @@ int domain_entry(struct connection *conn) + : 0; + } + ++static bool domain_chk_quota(struct domain *domain, int mem) ++{ ++ time_t now; ++ ++ if (!domain || !domid_is_unprivileged(domain->domid) || ++ (domain->conn && domain->conn->is_ignored)) ++ return false; ++ ++ now = time(NULL); ++ ++ if (mem >= quota_memory_per_domain_hard && ++ quota_memory_per_domain_hard) { ++ if (domain->hard_quota_reported) ++ return true; ++ syslog(LOG_ERR, "Domain %u exceeds hard memory quota, 
Xenstore interface to domain stalled\n", ++ domain->domid); ++ domain->mem_last_msg = now; ++ domain->hard_quota_reported = true; ++ return true; ++ } ++ ++ if (now - domain->mem_last_msg >= MEM_WARN_MINTIME_SEC) { ++ if (domain->hard_quota_reported) { ++ domain->mem_last_msg = now; ++ domain->hard_quota_reported = false; ++ syslog(LOG_INFO, "Domain %u below hard memory quota again\n", ++ domain->domid); ++ } ++ if (mem >= quota_memory_per_domain_soft && ++ quota_memory_per_domain_soft && ++ !domain->soft_quota_reported) { ++ domain->mem_last_msg = now; ++ domain->soft_quota_reported = true; ++ syslog(LOG_WARNING, "Domain %u exceeds soft memory quota\n", ++ domain->domid); ++ } ++ if (mem < quota_memory_per_domain_soft && ++ domain->soft_quota_reported) { ++ domain->mem_last_msg = now; ++ domain->soft_quota_reported = false; ++ syslog(LOG_INFO, "Domain %u below soft memory quota again\n", ++ domain->domid); ++ } ++ ++ } ++ ++ return false; ++} ++ ++int domain_memory_add(unsigned int domid, int mem, bool no_quota_check) ++{ ++ struct domain *domain; ++ ++ domain = find_domain_struct(domid); ++ if (domain) { ++ /* ++ * domain_chk_quota() will print warning and also store whether ++ * the soft/hard quota has been hit. So check no_quota_check ++ * *after*. ++ */ ++ if (domain_chk_quota(domain, domain->memory + mem) && ++ !no_quota_check) ++ return ENOMEM; ++ domain->memory += mem; ++ } else { ++ /* ++ * The domain the memory is to be accounted for should always ++ * exist, as accounting is done either for a domain related to ++ * the current connection, or for the domain owning a node ++ * (which is always existing, as the owner of the node is ++ * tested to exist and replaced by domid 0 if not). ++ * So not finding the related domain MUST be an error in the ++ * data base. 
++ */ ++ errno = ENOENT; ++ corrupt(NULL, "Accounting called for non-existing domain %u\n", ++ domid); ++ return ENOENT; ++ } ++ ++ return 0; ++} ++ + void domain_watch_inc(struct connection *conn) + { + if (!conn || !conn->domain) +diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h +index cce13d14f016..571aa46d158e 100644 +--- a/tools/xenstore/xenstored_domain.h ++++ b/tools/xenstore/xenstored_domain.h +@@ -65,6 +65,26 @@ int domain_entry_inc(struct connection *conn, struct node *); + void domain_entry_dec(struct connection *conn, struct node *); + int domain_entry_fix(unsigned int domid, int num, bool update); + int domain_entry(struct connection *conn); ++int domain_memory_add(unsigned int domid, int mem, bool no_quota_check); ++ ++/* ++ * domain_memory_add_chk(): to be used when memory quota should be checked. ++ * Not to be used when specifying a negative mem value, as lowering the used ++ * memory should always be allowed. ++ */ ++static inline int domain_memory_add_chk(unsigned int domid, int mem) ++{ ++ return domain_memory_add(domid, mem, false); ++} ++/* ++ * domain_memory_add_nochk(): to be used when memory quota should not be ++ * checked, e.g. when lowering memory usage, or in an error case for undoing ++ * a previous memory adjustment. 
++ */ ++static inline void domain_memory_add_nochk(unsigned int domid, int mem) ++{ ++ domain_memory_add(domid, mem, true); ++} + void domain_watch_inc(struct connection *conn); + void domain_watch_dec(struct connection *conn); + int domain_watch(struct connection *conn); +-- +2.37.4 + diff --git a/0096-tools-xenstore-add-memory-accounting-for-responses.patch b/0096-tools-xenstore-add-memory-accounting-for-responses.patch new file mode 100644 index 0000000..6174433 --- /dev/null +++ b/0096-tools-xenstore-add-memory-accounting-for-responses.patch @@ -0,0 +1,82 @@ +From 0113aacb3d791600668cd7703f6f12ed94fc6d03 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:09 +0200 +Subject: [PATCH 096/126] tools/xenstore: add memory accounting for responses + +Add the memory accounting for queued responses. + +In case adding a watch event for a guest is causing the hard memory +quota of that guest to be violated, the event is dropped. This will +ensure that it is impossible to drive another guest past its memory +quota by generating insane amounts of events for that guest. This is +especially important for protecting driver domains from that attack +vector. + +This is part of XSA-326 / CVE-2022-42315. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit f6d00133643a524d2138c9e3f192bbde719050ba) +--- + tools/xenstore/xenstored_core.c | 22 +++++++++++++++++++--- + 1 file changed, 19 insertions(+), 3 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index eeb0d893e8c3..2e02b577c912 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -260,6 +260,8 @@ static void free_buffered_data(struct buffered_data *out, + } + } + ++ domain_memory_add_nochk(conn->id, -out->hdr.msg.len - sizeof(out->hdr)); ++ + if (out->hdr.msg.type == XS_WATCH_EVENT) { + req = out->pend.req; + if (req) { +@@ -904,11 +906,14 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, + bdata->timeout_msec = 0; + bdata->watch_event = false; + +- if (len <= DEFAULT_BUFFER_SIZE) ++ if (len <= DEFAULT_BUFFER_SIZE) { + bdata->buffer = bdata->default_buffer; +- else { ++ /* Don't check quota, path might be used for returning error. */ ++ domain_memory_add_nochk(conn->id, len + sizeof(bdata->hdr)); ++ } else { + bdata->buffer = talloc_array(bdata, char, len); +- if (!bdata->buffer) { ++ if (!bdata->buffer || ++ domain_memory_add_chk(conn->id, len + sizeof(bdata->hdr))) { + send_error(conn, ENOMEM); + return; + } +@@ -973,6 +978,11 @@ void send_event(struct buffered_data *req, struct connection *conn, + } + } + ++ if (domain_memory_add_chk(conn->id, len + sizeof(bdata->hdr))) { ++ talloc_free(bdata); ++ return; ++ } ++ + if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { + bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; + if (!conn->timeout_msec) +@@ -2940,6 +2950,12 @@ static void add_buffered_data(struct buffered_data *bdata, + */ + if (bdata->hdr.msg.type != XS_WATCH_EVENT) + domain_outstanding_inc(conn); ++ /* ++ * We are restoring the state after Live-Update and the new quota may ++ * be smaller. 
So ignore it. The limit will be applied for any resource ++ * after the state has been fully restored. ++ */ ++ domain_memory_add_nochk(conn->id, len + sizeof(bdata->hdr)); + } + + void read_state_buffered_data(const void *ctx, struct connection *conn, +-- +2.37.4 + diff --git a/0097-tools-xenstore-add-memory-accounting-for-watches.patch b/0097-tools-xenstore-add-memory-accounting-for-watches.patch new file mode 100644 index 0000000..dd2ed61 --- /dev/null +++ b/0097-tools-xenstore-add-memory-accounting-for-watches.patch @@ -0,0 +1,96 @@ +From 9c2e71fe0611da9ed2ebbf2362a9bb05d42bf0c3 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:10 +0200 +Subject: [PATCH 097/126] tools/xenstore: add memory accounting for watches + +Add the memory accounting for registered watches. + +When a socket connection is destroyed, the associated watches are +removed, too. In order to keep memory accounting correct the watches +must be removed explicitly via a call of conn_delete_all_watches() from +destroy_conn(). + +This is part of XSA-326 / CVE-2022-42315. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 7f9978a2cc37aaffab2fb09593bc598c0712a69b) +--- + tools/xenstore/xenstored_core.c | 1 + + tools/xenstore/xenstored_watch.c | 13 ++++++++++--- + 2 files changed, 11 insertions(+), 3 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 2e02b577c912..b1a4575929bd 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -457,6 +457,7 @@ static int destroy_conn(void *_conn) + } + + conn_free_buffered_data(conn); ++ conn_delete_all_watches(conn); + list_for_each_entry(req, &conn->ref_list, list) + req->on_ref_list = false; + +diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c +index 1d664e3d6b72..0d5858df5bdd 100644 +--- a/tools/xenstore/xenstored_watch.c ++++ b/tools/xenstore/xenstored_watch.c +@@ -211,7 +211,7 @@ static int check_watch_path(struct connection *conn, const void *ctx, + } + + static struct watch *add_watch(struct connection *conn, char *path, char *token, +- bool relative) ++ bool relative, bool no_quota_check) + { + struct watch *watch; + +@@ -222,6 +222,9 @@ static struct watch *add_watch(struct connection *conn, char *path, char *token, + watch->token = talloc_strdup(watch, token); + if (!watch->node || !watch->token) + goto nomem; ++ if (domain_memory_add(conn->id, strlen(path) + strlen(token), ++ no_quota_check)) ++ goto nomem; + + if (relative) + watch->relative_path = get_implicit_path(conn); +@@ -265,7 +268,7 @@ int do_watch(struct connection *conn, struct buffered_data *in) + if (domain_watch(conn) > quota_nb_watch_per_domain) + return E2BIG; + +- watch = add_watch(conn, vec[0], vec[1], relative); ++ watch = add_watch(conn, vec[0], vec[1], relative, false); + if (!watch) + return errno; + +@@ -296,6 +299,8 @@ int do_unwatch(struct connection *conn, struct buffered_data *in) + list_for_each_entry(watch, 
&conn->watches, list) { + if (streq(watch->node, node) && streq(watch->token, vec[1])) { + list_del(&watch->list); ++ domain_memory_add_nochk(conn->id, -strlen(watch->node) - ++ strlen(watch->token)); + talloc_free(watch); + domain_watch_dec(conn); + send_ack(conn, XS_UNWATCH); +@@ -311,6 +316,8 @@ void conn_delete_all_watches(struct connection *conn) + + while ((watch = list_top(&conn->watches, struct watch, list))) { + list_del(&watch->list); ++ domain_memory_add_nochk(conn->id, -strlen(watch->node) - ++ strlen(watch->token)); + talloc_free(watch); + domain_watch_dec(conn); + } +@@ -373,7 +380,7 @@ void read_state_watch(const void *ctx, const void *state) + if (!path) + barf("allocation error for read watch"); + +- if (!add_watch(conn, path, token, relative)) ++ if (!add_watch(conn, path, token, relative, true)) + barf("error adding watch"); + } + +-- +2.37.4 + diff --git a/0098-tools-xenstore-add-memory-accounting-for-nodes.patch b/0098-tools-xenstore-add-memory-accounting-for-nodes.patch new file mode 100644 index 0000000..f2f8e4f --- /dev/null +++ b/0098-tools-xenstore-add-memory-accounting-for-nodes.patch @@ -0,0 +1,342 @@ +From 32efe29a00efab2896cc973e966a35ecad556495 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:10 +0200 +Subject: [PATCH 098/126] tools/xenstore: add memory accounting for nodes + +Add the memory accounting for Xenstore nodes. In order to make this +not too complicated allow for some sloppiness when writing nodes. Any +hard quota violation will result in no further requests to be accepted. + +This is part of XSA-326 / CVE-2022-42315. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 00e9e32d022be1afc144b75acdaeba8393e63315) +--- + tools/xenstore/xenstored_core.c | 140 ++++++++++++++++++++++--- + tools/xenstore/xenstored_core.h | 12 +++ + tools/xenstore/xenstored_transaction.c | 16 ++- + 3 files changed, 151 insertions(+), 17 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index b1a4575929bd..f27d5c0101bc 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -556,6 +556,117 @@ void set_tdb_key(const char *name, TDB_DATA *key) + key->dsize = strlen(name); + } + ++static void get_acc_data(TDB_DATA *key, struct node_account_data *acc) ++{ ++ TDB_DATA old_data; ++ struct xs_tdb_record_hdr *hdr; ++ ++ if (acc->memory < 0) { ++ old_data = tdb_fetch(tdb_ctx, *key); ++ /* No check for error, as the node might not exist. */ ++ if (old_data.dptr == NULL) { ++ acc->memory = 0; ++ } else { ++ hdr = (void *)old_data.dptr; ++ acc->memory = old_data.dsize; ++ acc->domid = hdr->perms[0].id; ++ } ++ talloc_free(old_data.dptr); ++ } ++} ++ ++/* ++ * Per-transaction nodes need to be accounted for the transaction owner. ++ * Those nodes are stored in the data base with the transaction generation ++ * count prepended (e.g. 123/local/domain/...). So testing for the node's ++ * key not to start with "/" is sufficient. ++ */ ++static unsigned int get_acc_domid(struct connection *conn, TDB_DATA *key, ++ unsigned int domid) ++{ ++ return (!conn || key->dptr[0] == '/') ? 
domid : conn->id; ++} ++ ++int do_tdb_write(struct connection *conn, TDB_DATA *key, TDB_DATA *data, ++ struct node_account_data *acc, bool no_quota_check) ++{ ++ struct xs_tdb_record_hdr *hdr = (void *)data->dptr; ++ struct node_account_data old_acc = {}; ++ unsigned int old_domid, new_domid; ++ int ret; ++ ++ if (!acc) ++ old_acc.memory = -1; ++ else ++ old_acc = *acc; ++ ++ get_acc_data(key, &old_acc); ++ old_domid = get_acc_domid(conn, key, old_acc.domid); ++ new_domid = get_acc_domid(conn, key, hdr->perms[0].id); ++ ++ /* ++ * Don't check for ENOENT, as we want to be able to switch orphaned ++ * nodes to new owners. ++ */ ++ if (old_acc.memory) ++ domain_memory_add_nochk(old_domid, ++ -old_acc.memory - key->dsize); ++ ret = domain_memory_add(new_domid, data->dsize + key->dsize, ++ no_quota_check); ++ if (ret) { ++ /* Error path, so no quota check. */ ++ if (old_acc.memory) ++ domain_memory_add_nochk(old_domid, ++ old_acc.memory + key->dsize); ++ return ret; ++ } ++ ++ /* TDB should set errno, but doesn't even set ecode AFAICT. */ ++ if (tdb_store(tdb_ctx, *key, *data, TDB_REPLACE) != 0) { ++ domain_memory_add_nochk(new_domid, -data->dsize - key->dsize); ++ /* Error path, so no quota check. */ ++ if (old_acc.memory) ++ domain_memory_add_nochk(old_domid, ++ old_acc.memory + key->dsize); ++ errno = EIO; ++ return errno; ++ } ++ ++ if (acc) { ++ /* Don't use new_domid, as it might be a transaction node. 
*/ ++ acc->domid = hdr->perms[0].id; ++ acc->memory = data->dsize; ++ } ++ ++ return 0; ++} ++ ++int do_tdb_delete(struct connection *conn, TDB_DATA *key, ++ struct node_account_data *acc) ++{ ++ struct node_account_data tmp_acc; ++ unsigned int domid; ++ ++ if (!acc) { ++ acc = &tmp_acc; ++ acc->memory = -1; ++ } ++ ++ get_acc_data(key, acc); ++ ++ if (tdb_delete(tdb_ctx, *key)) { ++ errno = EIO; ++ return errno; ++ } ++ ++ if (acc->memory) { ++ domid = get_acc_domid(conn, key, acc->domid); ++ domain_memory_add_nochk(domid, -acc->memory - key->dsize); ++ } ++ ++ return 0; ++} ++ + /* + * If it fails, returns NULL and sets errno. + * Temporary memory allocations will be done with ctx. +@@ -609,9 +720,15 @@ struct node *read_node(struct connection *conn, const void *ctx, + + /* Permissions are struct xs_permissions. */ + node->perms.p = hdr->perms; ++ node->acc.domid = node->perms.p[0].id; ++ node->acc.memory = data.dsize; + if (domain_adjust_node_perms(conn, node)) + goto error; + ++ /* If owner is gone reset currently accounted memory size. */ ++ if (node->acc.domid != node->perms.p[0].id) ++ node->acc.memory = 0; ++ + /* Data is binary blob (usually ascii, no nul). */ + node->data = node->perms.p + hdr->num_perms; + /* Children is strings, nul separated. */ +@@ -680,12 +797,9 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, + p += node->datalen; + memcpy(p, node->children, node->childlen); + +- /* TDB should set errno, but doesn't even set ecode AFAICT. 
*/ +- if (tdb_store(tdb_ctx, *key, data, TDB_REPLACE) != 0) { +- corrupt(conn, "Write of %s failed", key->dptr); +- errno = EIO; +- return errno; +- } ++ if (do_tdb_write(conn, key, &data, &node->acc, no_quota_check)) ++ return EIO; ++ + return 0; + } + +@@ -1188,7 +1302,7 @@ static void delete_node_single(struct connection *conn, struct node *node) + if (access_node(conn, node, NODE_ACCESS_DELETE, &key)) + return; + +- if (tdb_delete(tdb_ctx, key) != 0) { ++ if (do_tdb_delete(conn, &key, &node->acc) != 0) { + corrupt(conn, "Could not delete '%s'", node->name); + return; + } +@@ -1261,6 +1375,7 @@ static struct node *construct_node(struct connection *conn, const void *ctx, + /* No children, no data */ + node->children = node->data = NULL; + node->childlen = node->datalen = 0; ++ node->acc.memory = 0; + node->parent = parent; + return node; + +@@ -1269,17 +1384,17 @@ nomem: + return NULL; + } + +-static void destroy_node_rm(struct node *node) ++static void destroy_node_rm(struct connection *conn, struct node *node) + { + if (streq(node->name, "/")) + corrupt(NULL, "Destroying root node!"); + +- tdb_delete(tdb_ctx, node->key); ++ do_tdb_delete(conn, &node->key, &node->acc); + } + + static int destroy_node(struct connection *conn, struct node *node) + { +- destroy_node_rm(node); ++ destroy_node_rm(conn, node); + domain_entry_dec(conn, node); + + /* +@@ -1331,7 +1446,7 @@ static struct node *create_node(struct connection *conn, const void *ctx, + /* Account for new node */ + if (i->parent) { + if (domain_entry_inc(conn, i)) { +- destroy_node_rm(i); ++ destroy_node_rm(conn, i); + return NULL; + } + } +@@ -2192,7 +2307,7 @@ static int clean_store_(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA val, + if (!hashtable_search(reachable, name)) { + log("clean_store: '%s' is orphaned!", name); + if (recovery) { +- tdb_delete(tdb, key); ++ do_tdb_delete(NULL, &key, NULL); + } + } + +@@ -3030,6 +3145,7 @@ void read_state_node(const void *ctx, const void *state) + if (!node) + 
barf("allocation error restoring node"); + ++ node->acc.memory = 0; + node->name = name; + node->generation = ++generation; + node->datalen = sn->data_len; +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index 2fb37dbfe847..5c1b574bffe6 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -169,6 +169,11 @@ struct node_perms { + struct xs_permissions *p; + }; + ++struct node_account_data { ++ unsigned int domid; ++ int memory; /* -1 if unknown */ ++}; ++ + struct node { + const char *name; + /* Key used to update TDB */ +@@ -191,6 +196,9 @@ struct node { + /* Children, each nul-terminated. */ + unsigned int childlen; + char *children; ++ ++ /* Allocation information for node currently in store. */ ++ struct node_account_data acc; + }; + + /* Return the only argument in the input. */ +@@ -300,6 +308,10 @@ extern xengnttab_handle **xgt_handle; + int remember_string(struct hashtable *hash, const char *str); + + void set_tdb_key(const char *name, TDB_DATA *key); ++int do_tdb_write(struct connection *conn, TDB_DATA *key, TDB_DATA *data, ++ struct node_account_data *acc, bool no_quota_check); ++int do_tdb_delete(struct connection *conn, TDB_DATA *key, ++ struct node_account_data *acc); + + void conn_free_buffered_data(struct connection *conn); + +diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c +index 7bd41eb475e3..ace9a11d77bb 100644 +--- a/tools/xenstore/xenstored_transaction.c ++++ b/tools/xenstore/xenstored_transaction.c +@@ -153,6 +153,9 @@ struct transaction + /* List of all transactions active on this connection. */ + struct list_head list; + ++ /* Connection this transaction is associated with. */ ++ struct connection *conn; ++ + /* Connection-local identifier for this transaction. 
*/ + uint32_t id; + +@@ -286,6 +289,8 @@ int access_node(struct connection *conn, struct node *node, + + introduce = true; + i->ta_node = false; ++ /* acc.memory < 0 means "unknown, get size from TDB". */ ++ node->acc.memory = -1; + + /* + * Additional transaction-specific node for read type. We only +@@ -410,11 +415,11 @@ static int finalize_transaction(struct connection *conn, + goto err; + hdr = (void *)data.dptr; + hdr->generation = ++generation; +- ret = tdb_store(tdb_ctx, key, data, +- TDB_REPLACE); ++ ret = do_tdb_write(conn, &key, &data, NULL, ++ true); + talloc_free(data.dptr); + } else { +- ret = tdb_delete(tdb_ctx, key); ++ ret = do_tdb_delete(conn, &key, NULL); + } + if (ret) + goto err; +@@ -425,7 +430,7 @@ static int finalize_transaction(struct connection *conn, + } + } + +- if (i->ta_node && tdb_delete(tdb_ctx, ta_key)) ++ if (i->ta_node && do_tdb_delete(conn, &ta_key, NULL)) + goto err; + list_del(&i->list); + talloc_free(i); +@@ -453,7 +458,7 @@ static int destroy_transaction(void *_transaction) + i->node); + if (trans_name) { + set_tdb_key(trans_name, &key); +- tdb_delete(tdb_ctx, key); ++ do_tdb_delete(trans->conn, &key, NULL); + } + } + list_del(&i->list); +@@ -497,6 +502,7 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in) + + INIT_LIST_HEAD(&trans->accessed); + INIT_LIST_HEAD(&trans->changed_domains); ++ trans->conn = conn; + trans->fail = false; + trans->generation = ++generation; + +-- +2.37.4 + diff --git a/0099-tools-xenstore-add-exports-for-quota-variables.patch b/0099-tools-xenstore-add-exports-for-quota-variables.patch new file mode 100644 index 0000000..98f341f --- /dev/null +++ b/0099-tools-xenstore-add-exports-for-quota-variables.patch @@ -0,0 +1,62 @@ +From 1fc3ecc9bfead0a50d8e05de983ed2a8f02fa03c Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:10 +0200 +Subject: [PATCH 099/126] tools/xenstore: add exports for quota variables + +Some quota variables are not 
exported via header files. + +This is part of XSA-326. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 1da16d5990b5f7752657fca3e948f735177ea9ad) +--- + tools/xenstore/xenstored_core.h | 5 +++++ + tools/xenstore/xenstored_transaction.c | 1 - + tools/xenstore/xenstored_watch.c | 2 -- + 3 files changed, 5 insertions(+), 3 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index 5c1b574bffe6..1eb3708f82dd 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -268,6 +268,11 @@ extern TDB_CONTEXT *tdb_ctx; + extern int dom0_domid; + extern int dom0_event; + extern int priv_domid; ++extern int quota_nb_watch_per_domain; ++extern int quota_max_transaction; ++extern int quota_max_entry_size; ++extern int quota_nb_perms_per_node; ++extern int quota_max_path_len; + extern int quota_nb_entry_per_domain; + extern int quota_req_outstanding; + extern int quota_trans_nodes; +diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c +index ace9a11d77bb..28774813de83 100644 +--- a/tools/xenstore/xenstored_transaction.c ++++ b/tools/xenstore/xenstored_transaction.c +@@ -175,7 +175,6 @@ struct transaction + bool fail; + }; + +-extern int quota_max_transaction; + uint64_t generation; + + static struct accessed_node *find_accessed_node(struct transaction *trans, +diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c +index 0d5858df5bdd..4970e9f1a1b9 100644 +--- a/tools/xenstore/xenstored_watch.c ++++ b/tools/xenstore/xenstored_watch.c +@@ -31,8 +31,6 @@ + #include "xenstored_domain.h" + #include "xenstored_transaction.h" + +-extern int quota_nb_watch_per_domain; +- + struct watch + { + /* Watches on this connection */ +-- +2.37.4 + diff --git a/0100-tools-xenstore-add-control-command-for-setting-and-s.patch b/0100-tools-xenstore-add-control-command-for-setting-and-s.patch 
new file mode 100644 index 0000000..e721645 --- /dev/null +++ b/0100-tools-xenstore-add-control-command-for-setting-and-s.patch @@ -0,0 +1,248 @@ +From 4d30175fdadb75c55acb8abb186727eda7cd5585 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:10 +0200 +Subject: [PATCH 100/126] tools/xenstore: add control command for setting and + showing quota + +Add a xenstore-control command "quota" to: +- show current quota settings +- change quota settings +- show current quota related values of a domain + +Note that in the case the new quota is lower than existing one, +Xenstored may continue to handle requests from a domain exceeding the +new limit (depends on which one has been broken) and the amount of +resource used will not change. However the domain will not be able to +create more resource (associated to the quota) until it is back to below +the limit. + +This is part of XSA-326. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 9c484bef83496b683b0087e3bd2a560da4aa37af) +--- + docs/misc/xenstore.txt | 11 +++ + tools/xenstore/xenstored_control.c | 111 +++++++++++++++++++++++++++++ + tools/xenstore/xenstored_domain.c | 33 +++++++++ + tools/xenstore/xenstored_domain.h | 2 + + 4 files changed, 157 insertions(+) + +diff --git a/docs/misc/xenstore.txt b/docs/misc/xenstore.txt +index 334dc8b6fdf5..a7d006519ae8 100644 +--- a/docs/misc/xenstore.txt ++++ b/docs/misc/xenstore.txt +@@ -366,6 +366,17 @@ CONTROL <command>|[<parameters>|] + print|<string> + print <string> to syslog (xenstore runs as daemon) or + to console (xenstore runs as stubdom) ++ quota|[set <name> <val>|<domid>] ++ without parameters: print the current quota settings ++ with "set <name> <val>": set the quota <name> to new value ++ <val> (The admin should make sure all the domain usage is ++ below the quota. 
If it is not, then Xenstored may continue to ++ handle requests from the domain as long as the resource ++ violating the new quota setting isn't increased further) ++ with "<domid>": print quota related accounting data for ++ the domain <domid> ++ quota-soft|[set <name> <val>] ++ like the "quota" command, but for soft-quota. + help <supported-commands> + return list of supported commands for CONTROL + +diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c +index 211fe1fd9b37..980279fa53ff 100644 +--- a/tools/xenstore/xenstored_control.c ++++ b/tools/xenstore/xenstored_control.c +@@ -148,6 +148,115 @@ static int do_control_log(void *ctx, struct connection *conn, + return 0; + } + ++struct quota { ++ const char *name; ++ int *quota; ++ const char *descr; ++}; ++ ++static const struct quota hard_quotas[] = { ++ { "nodes", &quota_nb_entry_per_domain, "Nodes per domain" }, ++ { "watches", &quota_nb_watch_per_domain, "Watches per domain" }, ++ { "transactions", &quota_max_transaction, "Transactions per domain" }, ++ { "outstanding", &quota_req_outstanding, ++ "Outstanding requests per domain" }, ++ { "transaction-nodes", &quota_trans_nodes, ++ "Max. number of accessed nodes per transaction" }, ++ { "memory", &quota_memory_per_domain_hard, ++ "Total Xenstore memory per domain (error level)" }, ++ { "node-size", &quota_max_entry_size, "Max. size of a node" }, ++ { "path-max", &quota_max_path_len, "Max. length of a node path" }, ++ { "permissions", &quota_nb_perms_per_node, ++ "Max.
number of permissions per node" }, ++ { NULL, NULL, NULL } ++}; ++ ++static const struct quota soft_quotas[] = { ++ { "memory", &quota_memory_per_domain_soft, ++ "Total Xenstore memory per domain (warning level)" }, ++ { NULL, NULL, NULL } ++}; ++ ++static int quota_show_current(const void *ctx, struct connection *conn, ++ const struct quota *quotas) ++{ ++ char *resp; ++ unsigned int i; ++ ++ resp = talloc_strdup(ctx, "Quota settings:\n"); ++ if (!resp) ++ return ENOMEM; ++ ++ for (i = 0; quotas[i].quota; i++) { ++ resp = talloc_asprintf_append(resp, "%-17s: %8d %s\n", ++ quotas[i].name, *quotas[i].quota, ++ quotas[i].descr); ++ if (!resp) ++ return ENOMEM; ++ } ++ ++ send_reply(conn, XS_CONTROL, resp, strlen(resp) + 1); ++ ++ return 0; ++} ++ ++static int quota_set(const void *ctx, struct connection *conn, ++ char **vec, int num, const struct quota *quotas) ++{ ++ unsigned int i; ++ int val; ++ ++ if (num != 2) ++ return EINVAL; ++ ++ val = atoi(vec[1]); ++ if (val < 1) ++ return EINVAL; ++ ++ for (i = 0; quotas[i].quota; i++) { ++ if (!strcmp(vec[0], quotas[i].name)) { ++ *quotas[i].quota = val; ++ send_ack(conn, XS_CONTROL); ++ return 0; ++ } ++ } ++ ++ return EINVAL; ++} ++ ++static int quota_get(const void *ctx, struct connection *conn, ++ char **vec, int num) ++{ ++ if (num != 1) ++ return EINVAL; ++ ++ return domain_get_quota(ctx, conn, atoi(vec[0])); ++} ++ ++static int do_control_quota(void *ctx, struct connection *conn, ++ char **vec, int num) ++{ ++ if (num == 0) ++ return quota_show_current(ctx, conn, hard_quotas); ++ ++ if (!strcmp(vec[0], "set")) ++ return quota_set(ctx, conn, vec + 1, num - 1, hard_quotas); ++ ++ return quota_get(ctx, conn, vec, num); ++} ++ ++static int do_control_quota_s(void *ctx, struct connection *conn, ++ char **vec, int num) ++{ ++ if (num == 0) ++ return quota_show_current(ctx, conn, soft_quotas); ++ ++ if (!strcmp(vec[0], "set")) ++ return quota_set(ctx, conn, vec + 1, num - 1, soft_quotas); ++ ++ return EINVAL; ++} ++ + #ifdef
__MINIOS__ + static int do_control_memreport(void *ctx, struct connection *conn, + char **vec, int num) +@@ -777,6 +886,8 @@ static struct cmd_s cmds[] = { + { "memreport", do_control_memreport, "[<file>]" }, + #endif + { "print", do_control_print, "<string>" }, ++ { "quota", do_control_quota, "[set <name> <val>|<domid>]" }, ++ { "quota-soft", do_control_quota_s, "[set <name> <val>]" }, + { "help", do_control_help, "" }, + }; + +diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c +index ec542df6a67e..3d5142581332 100644 +--- a/tools/xenstore/xenstored_domain.c ++++ b/tools/xenstore/xenstored_domain.c +@@ -31,6 +31,7 @@ + #include "xenstored_domain.h" + #include "xenstored_transaction.h" + #include "xenstored_watch.h" ++#include "xenstored_control.h" + + #include <xenevtchn.h> + #include <xenctrl.h> +@@ -351,6 +352,38 @@ static struct domain *find_domain_struct(unsigned int domid) + return NULL; + } + ++int domain_get_quota(const void *ctx, struct connection *conn, ++ unsigned int domid) ++{ ++ struct domain *d = find_domain_struct(domid); ++ char *resp; ++ int ta; ++ ++ if (!d) ++ return ENOENT; ++ ++ ta = d->conn ? 
d->conn->transaction_started : 0; ++ resp = talloc_asprintf(ctx, "Domain %u:\n", domid); ++ if (!resp) ++ return ENOMEM; ++ ++#define ent(t, e) \ ++ resp = talloc_asprintf_append(resp, "%-16s: %8d\n", #t, e); \ ++ if (!resp) return ENOMEM ++ ++ ent(nodes, d->nbentry); ++ ent(watches, d->nbwatch); ++ ent(transactions, ta); ++ ent(outstanding, d->nboutstanding); ++ ent(memory, d->memory); ++ ++#undef ent ++ ++ send_reply(conn, XS_CONTROL, resp, strlen(resp) + 1); ++ ++ return 0; ++} ++ + static struct domain *alloc_domain(const void *context, unsigned int domid) + { + struct domain *domain; +diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h +index 571aa46d158e..0f883936f413 100644 +--- a/tools/xenstore/xenstored_domain.h ++++ b/tools/xenstore/xenstored_domain.h +@@ -91,6 +91,8 @@ int domain_watch(struct connection *conn); + void domain_outstanding_inc(struct connection *conn); + void domain_outstanding_dec(struct connection *conn); + void domain_outstanding_domid_dec(unsigned int domid); ++int domain_get_quota(const void *ctx, struct connection *conn, ++ unsigned int domid); + + /* Special node permission handling. 
*/ + int set_perms_special(struct connection *conn, const char *name, +-- +2.37.4 + diff --git a/0101-tools-ocaml-xenstored-Synchronise-defaults-with-oxen.patch b/0101-tools-ocaml-xenstored-Synchronise-defaults-with-oxen.patch new file mode 100644 index 0000000..7df76b1 --- /dev/null +++ b/0101-tools-ocaml-xenstored-Synchronise-defaults-with-oxen.patch @@ -0,0 +1,63 @@ +From 8fabb963e662a544a397cb2afefb2b15af07ace9 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Wed, 12 Oct 2022 19:13:01 +0100 +Subject: [PATCH 101/126] tools/ocaml/xenstored: Synchronise defaults with + oxenstore.conf.in +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +We currently have 2 different set of defaults in upstream Xen git tree: +* defined in the source code, only used if there is no config file +* defined in the oxenstored.conf.in upstream Xen + +An oxenstored.conf file is not mandatory, and if missing, maxrequests in +particular has an unsafe default. + +Resync the defaults from oxenstored.conf.in into the source code. + +This is part of XSA-326 / CVE-2022-42316. 
+ +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit 84734955d4bf629ba459a74773afcde50a52236f) +--- + tools/ocaml/xenstored/define.ml | 6 +++--- + tools/ocaml/xenstored/quota.ml | 4 ++-- + 2 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml +index ebe18b8e312c..6b06f808595b 100644 +--- a/tools/ocaml/xenstored/define.ml ++++ b/tools/ocaml/xenstored/define.ml +@@ -21,9 +21,9 @@ let xs_daemon_socket = Paths.xen_run_stored ^ "/socket" + + let default_config_dir = Paths.xen_config_dir + +-let maxwatch = ref (50) +-let maxtransaction = ref (20) +-let maxrequests = ref (-1) (* maximum requests per transaction *) ++let maxwatch = ref (100) ++let maxtransaction = ref (10) ++let maxrequests = ref (1024) (* maximum requests per transaction *) + + let conflict_burst_limit = ref 5.0 + let conflict_max_history_seconds = ref 0.05 +diff --git a/tools/ocaml/xenstored/quota.ml b/tools/ocaml/xenstored/quota.ml +index abcac912805a..6e3d6401ae89 100644 +--- a/tools/ocaml/xenstored/quota.ml ++++ b/tools/ocaml/xenstored/quota.ml +@@ -20,8 +20,8 @@ exception Transaction_opened + + let warn fmt = Logging.warn "quota" fmt + let activate = ref true +-let maxent = ref (10000) +-let maxsize = ref (4096) ++let maxent = ref (1000) ++let maxsize = ref (2048) + + type t = { + maxent: int; (* max entities per domU *) +-- +2.37.4 + diff --git a/0102-tools-ocaml-xenstored-Check-for-maxrequests-before-p.patch b/0102-tools-ocaml-xenstored-Check-for-maxrequests-before-p.patch new file mode 100644 index 0000000..bc741ae --- /dev/null +++ b/0102-tools-ocaml-xenstored-Check-for-maxrequests-before-p.patch @@ -0,0 +1,101 @@ +From 45816222bb3da04f4cd3388efc46d127d48b8906 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Thu, 28 Jul 2022 17:08:15 +0100 +Subject: [PATCH 102/126] 
tools/ocaml/xenstored: Check for maxrequests before + performing operations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Previously we'd perform the operation, record the updated tree in the +transaction record, then try to insert a watchop path and the reply packet. + +If we exceeded max requests we would've returned EQUOTA, but still: +* have performed the operation on the transaction's tree +* have recorded the watchop, making this queue effectively unbounded + +It is better if we check whether we'd have room to store the operation before +performing the transaction, and raise EQUOTA there. Then the transaction +record won't grow. + +This is part of XSA-326 / CVE-2022-42317. + +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit 329f4d1a6535c6c5a34025ca0d03fc5c7228fcff) +--- + tools/ocaml/xenstored/process.ml | 4 +++- + tools/ocaml/xenstored/transaction.ml | 16 ++++++++++++---- + 2 files changed, 15 insertions(+), 5 deletions(-) + +diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml +index 27790d4a5c41..dd58e6979cf9 100644 +--- a/tools/ocaml/xenstored/process.ml ++++ b/tools/ocaml/xenstored/process.ml +@@ -389,6 +389,7 @@ let input_handle_error ~cons ~doms ~fct ~con ~t ~req = + let reply_error e = + Packet.Error e in + try ++ Transaction.check_quota_exn ~perm:(Connection.get_perm con) t; + fct con t doms cons req.Packet.data + with + | Define.Invalid_path -> reply_error "EINVAL" +@@ -681,9 +682,10 @@ let process_packet ~store ~cons ~doms ~con ~req = + in + + let response = try ++ Transaction.check_quota_exn ~perm:(Connection.get_perm con) t; + if tid <> Transaction.none then + (* Remember the request and response for this operation in case we need to replay the transaction *) +- Transaction.add_operation ~perm:(Connection.get_perm con) t req response; ++ Transaction.add_operation t req response; + 
response + with Quota.Limit_reached -> + Packet.Error "EQUOTA" +diff --git a/tools/ocaml/xenstored/transaction.ml b/tools/ocaml/xenstored/transaction.ml +index 17b1bdf2eaf9..294143e2335b 100644 +--- a/tools/ocaml/xenstored/transaction.ml ++++ b/tools/ocaml/xenstored/transaction.ml +@@ -85,6 +85,7 @@ type t = { + oldroot: Store.Node.t; + mutable paths: (Xenbus.Xb.Op.operation * Store.Path.t) list; + mutable operations: (Packet.request * Packet.response) list; ++ mutable quota_reached: bool; + mutable read_lowpath: Store.Path.t option; + mutable write_lowpath: Store.Path.t option; + } +@@ -127,6 +128,7 @@ let make ?(internal=false) id store = + oldroot = Store.get_root store; + paths = []; + operations = []; ++ quota_reached = false; + read_lowpath = None; + write_lowpath = None; + } in +@@ -143,13 +145,19 @@ let get_root t = Store.get_root t.store + + let is_read_only t = t.paths = [] + let add_wop t ty path = t.paths <- (ty, path) :: t.paths +-let add_operation ~perm t request response = ++let get_operations t = List.rev t.operations ++ ++let check_quota_exn ~perm t = + if !Define.maxrequests >= 0 + && not (Perms.Connection.is_dom0 perm) +- && List.length t.operations >= !Define.maxrequests +- then raise Quota.Limit_reached; ++ && (t.quota_reached || List.length t.operations >= !Define.maxrequests) ++ then begin ++ t.quota_reached <- true; ++ raise Quota.Limit_reached; ++ end ++ ++let add_operation t request response = + t.operations <- (request, response) :: t.operations +-let get_operations t = List.rev t.operations + let set_read_lowpath t path = t.read_lowpath <- get_lowest path t.read_lowpath + let set_write_lowpath t path = t.write_lowpath <- get_lowest path t.write_lowpath + +-- +2.37.4 + diff --git a/0103-tools-ocaml-GC-parameter-tuning.patch b/0103-tools-ocaml-GC-parameter-tuning.patch new file mode 100644 index 0000000..d1473df --- /dev/null +++ b/0103-tools-ocaml-GC-parameter-tuning.patch @@ -0,0 +1,126 @@ +From 9f89883fabd53cb7873cc31778887ba2a1228dd8 
Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Wed, 12 Oct 2022 19:13:07 +0100 +Subject: [PATCH 103/126] tools/ocaml: GC parameter tuning +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +By default the OCaml garbage collector would return memory to the OS only +after unused memory is 5x live memory. Tweak this to 120% instead, which +would match the major GC speed. + +This is part of XSA-326. + +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit 4a8bacff20b857ca0d628ef5525877ade11f2a42) +--- + tools/ocaml/xenstored/define.ml | 1 + + tools/ocaml/xenstored/xenstored.ml | 64 ++++++++++++++++++++++++++++++ + 2 files changed, 65 insertions(+) + +diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml +index 6b06f808595b..ba63a8147e09 100644 +--- a/tools/ocaml/xenstored/define.ml ++++ b/tools/ocaml/xenstored/define.ml +@@ -25,6 +25,7 @@ let maxwatch = ref (100) + let maxtransaction = ref (10) + let maxrequests = ref (1024) (* maximum requests per transaction *) + ++let gc_max_overhead = ref 120 (* 120% see comment in xenstored.ml *) + let conflict_burst_limit = ref 5.0 + let conflict_max_history_seconds = ref 0.05 + let conflict_rate_limit_is_aggregate = ref true +diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml +index d44ae673c42a..3b57ad016dfb 100644 +--- a/tools/ocaml/xenstored/xenstored.ml ++++ b/tools/ocaml/xenstored/xenstored.ml +@@ -104,6 +104,7 @@ let parse_config filename = + ("quota-maxsize", Config.Set_int Quota.maxsize); + ("quota-maxrequests", Config.Set_int Define.maxrequests); + ("quota-path-max", Config.Set_int Define.path_max); ++ ("gc-max-overhead", Config.Set_int Define.gc_max_overhead); + ("test-eagain", Config.Set_bool Transaction.test_eagain); + ("persistent", Config.Set_bool Disk.enable); + 
("xenstored-log-file", Config.String Logging.set_xenstored_log_destination); +@@ -265,6 +266,67 @@ let to_file store cons fds file = + (fun () -> close_out channel) + end + ++(* ++ By default OCaml's GC only returns memory to the OS when it exceeds a ++ configurable 'max overhead' setting. ++ The default is 500%, that is 5/6th of the OCaml heap needs to be free ++ and only 1/6th live for a compaction to be triggerred that would ++ release memory back to the OS. ++ If the limit is not hit then the OCaml process can reuse that memory ++ for its own purposes, but other processes won't be able to use it. ++ ++ There is also a 'space overhead' setting that controls how much work ++ each major GC slice does, and by default aims at having no more than ++ 80% or 120% (depending on version) garbage values compared to live ++ values. ++ This doesn't have as much relevance to memory returned to the OS as ++ long as space_overhead <= max_overhead, because compaction is only ++ triggerred at the end of major GC cycles. ++ ++ The defaults are too large once the program starts using ~100MiB of ++ memory, at which point ~500MiB would be unavailable to other processes ++ (which would be fine if this was the main process in this VM, but it is ++ not). ++ ++ Max overhead can also be set to 0, however this is for testing purposes ++ only (setting it lower than 'space overhead' wouldn't help because the ++ major GC wouldn't run fast enough, and compaction does have a ++ performance cost: we can only compact contiguous regions, so memory has ++ to be moved around). ++ ++ Max overhead controls how often the heap is compacted, which is useful ++ if there are burst of activity followed by long periods of idle state, ++ or if a domain quits, etc. Compaction returns memory to the OS. 
++ ++ wasted = live * space_overhead / 100 ++ ++ For globally overriding the GC settings one can use OCAMLRUNPARAM, ++ however we provide a config file override to be consistent with other ++ oxenstored settings. ++ ++ One might want to dynamically adjust the overhead setting based on used ++ memory, i.e. to use a fixed upper bound in bytes, not percentage. However ++ measurements show that such adjustments increase GC overhead massively, ++ while still not guaranteeing that memory is returned any more quickly ++ than with a percentage based setting. ++ ++ The allocation policy could also be tweaked, e.g. first fit would reduce ++ fragmentation and thus memory usage, but the documentation warns that it ++ can be sensibly slower, and indeed one of our own testcases can trigger ++ such a corner case where it is multiple times slower, so it is best to keep ++ the default allocation policy (next-fit/best-fit depending on version). ++ ++ There are other tweaks that can be attempted in the future, e.g. setting ++ 'ulimit -v' to 75% of RAM, however getting the kernel to actually return ++ NULL from allocations is difficult even with that setting, and without a ++ NULL the emergency GC won't be triggerred. ++ Perhaps cgroup limits could help, but for now tweak the safest only. 
++*) ++ ++let tweak_gc () = ++ Gc.set { (Gc.get ()) with Gc.max_overhead = !Define.gc_max_overhead } ++ ++ + let _ = + let cf = do_argv in + let pidfile = +@@ -274,6 +336,8 @@ let _ = + default_pidfile + in + ++ tweak_gc (); ++ + (try + Unixext.mkdir_rec (Filename.dirname pidfile) 0o755 + with _ -> +-- +2.37.4 + diff --git a/0104-tools-ocaml-libs-xb-hide-type-of-Xb.t.patch b/0104-tools-ocaml-libs-xb-hide-type-of-Xb.t.patch new file mode 100644 index 0000000..15f69b0 --- /dev/null +++ b/0104-tools-ocaml-libs-xb-hide-type-of-Xb.t.patch @@ -0,0 +1,92 @@ +From bbb4ceab25124646fa845855f3cb95ae15d0c3f2 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Fri, 29 Jul 2022 18:53:29 +0100 +Subject: [PATCH 104/126] tools/ocaml/libs/xb: hide type of Xb.t +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Hiding the type will make it easier to change the implementation +in the future without breaking code that relies on it. + +No functional change. 
+ +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit 7ade30a1451734d041363c750a65d322e25b47ba) +--- + tools/ocaml/libs/xb/xb.ml | 3 +++ + tools/ocaml/libs/xb/xb.mli | 9 ++------- + tools/ocaml/xenstored/connection.ml | 8 ++------ + 3 files changed, 7 insertions(+), 13 deletions(-) + +diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml +index 104d319d7747..8404ddd8a682 100644 +--- a/tools/ocaml/libs/xb/xb.ml ++++ b/tools/ocaml/libs/xb/xb.ml +@@ -196,6 +196,9 @@ let peek_output con = Queue.peek con.pkt_out + let input_len con = Queue.length con.pkt_in + let has_in_packet con = Queue.length con.pkt_in > 0 + let get_in_packet con = Queue.pop con.pkt_in ++let has_partial_input con = match con.partial_in with ++ | HaveHdr _ -> true ++ | NoHdr (n, _) -> n < Partial.header_size () + let has_more_input con = + match con.backend with + | Fd _ -> false +diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli +index 3a00da6cddc1..794e35bb343e 100644 +--- a/tools/ocaml/libs/xb/xb.mli ++++ b/tools/ocaml/libs/xb/xb.mli +@@ -66,13 +66,7 @@ type backend_mmap = { + type backend_fd = { fd : Unix.file_descr; } + type backend = Fd of backend_fd | Xenmmap of backend_mmap + type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes +-type t = { +- backend : backend; +- pkt_in : Packet.t Queue.t; +- pkt_out : Packet.t Queue.t; +- mutable partial_in : partial_buf; +- mutable partial_out : string; +-} ++type t + val init_partial_in : unit -> partial_buf + val reconnect : t -> unit + val queue : t -> Packet.t -> unit +@@ -97,6 +91,7 @@ val has_output : t -> bool + val peek_output : t -> Packet.t + val input_len : t -> int + val has_in_packet : t -> bool ++val has_partial_input : t -> bool + val get_in_packet : t -> Packet.t + val has_more_input : t -> bool + val is_selectable : t -> bool +diff --git a/tools/ocaml/xenstored/connection.ml 
b/tools/ocaml/xenstored/connection.ml +index 65f99ea6f28a..38b47363a173 100644 +--- a/tools/ocaml/xenstored/connection.ml ++++ b/tools/ocaml/xenstored/connection.ml +@@ -125,9 +125,7 @@ let get_perm con = + let set_target con target_domid = + con.perm <- Perms.Connection.set_target (get_perm con) ~perms:[Perms.READ; Perms.WRITE] target_domid + +-let is_backend_mmap con = match con.xb.Xenbus.Xb.backend with +- | Xenbus.Xb.Xenmmap _ -> true +- | _ -> false ++let is_backend_mmap con = Xenbus.Xb.is_mmap con.xb + + let send_reply con tid rid ty data = + if (String.length data) > xenstore_payload_max && (is_backend_mmap con) then +@@ -280,9 +278,7 @@ let get_transaction con tid = + + let do_input con = Xenbus.Xb.input con.xb + let has_input con = Xenbus.Xb.has_in_packet con.xb +-let has_partial_input con = match con.xb.Xenbus.Xb.partial_in with +- | HaveHdr _ -> true +- | NoHdr (n, _) -> n < Xenbus.Partial.header_size () ++let has_partial_input con = Xenbus.Xb.has_partial_input con.xb + let pop_in con = Xenbus.Xb.get_in_packet con.xb + let has_more_input con = Xenbus.Xb.has_more_input con.xb + +-- +2.37.4 + diff --git a/0105-tools-ocaml-Change-Xb.input-to-return-Packet.t-optio.patch b/0105-tools-ocaml-Change-Xb.input-to-return-Packet.t-optio.patch new file mode 100644 index 0000000..2691ae4 --- /dev/null +++ b/0105-tools-ocaml-Change-Xb.input-to-return-Packet.t-optio.patch @@ -0,0 +1,225 @@ +From fccdca83a4425b0e30ec9e29e9a5909e1a55b80d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Wed, 12 Oct 2022 19:13:02 +0100 +Subject: [PATCH 105/126] tools/ocaml: Change Xb.input to return Packet.t + option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The queue here would only ever hold at most one element. This will simplify +follow-up patches. + +This is part of XSA-326. 
+ +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit c0a86a462721008eca5ff733660de094d3c34bc7) +--- + tools/ocaml/libs/xb/xb.ml | 18 +++++------------- + tools/ocaml/libs/xb/xb.mli | 5 +---- + tools/ocaml/libs/xs/xsraw.ml | 20 ++++++-------------- + tools/ocaml/xenstored/connection.ml | 4 +--- + tools/ocaml/xenstored/process.ml | 15 +++++++-------- + 5 files changed, 20 insertions(+), 42 deletions(-) + +diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml +index 8404ddd8a682..165fd4a1edf4 100644 +--- a/tools/ocaml/libs/xb/xb.ml ++++ b/tools/ocaml/libs/xb/xb.ml +@@ -45,7 +45,6 @@ type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes + type t = + { + backend: backend; +- pkt_in: Packet.t Queue.t; + pkt_out: Packet.t Queue.t; + mutable partial_in: partial_buf; + mutable partial_out: string; +@@ -62,7 +61,6 @@ let reconnect t = match t.backend with + Xs_ring.close backend.mmap; + backend.eventchn_notify (); + (* Clear our old connection state *) +- Queue.clear t.pkt_in; + Queue.clear t.pkt_out; + t.partial_in <- init_partial_in (); + t.partial_out <- "" +@@ -124,7 +122,6 @@ let output con = + + (* NB: can throw Reconnect *) + let input con = +- let newpacket = ref false in + let to_read = + match con.partial_in with + | HaveHdr partial_pkt -> Partial.to_complete partial_pkt +@@ -143,21 +140,19 @@ let input con = + if Partial.to_complete partial_pkt = 0 then ( + let pkt = Packet.of_partialpkt partial_pkt in + con.partial_in <- init_partial_in (); +- Queue.push pkt con.pkt_in; +- newpacket := true +- ) ++ Some pkt ++ ) else None + | NoHdr (i, buf) -> + (* we complete the partial header *) + if sz > 0 then + Bytes.blit b 0 buf (Partial.header_size () - i) sz; + con.partial_in <- if sz = i then +- HaveHdr (Partial.of_string (Bytes.to_string buf)) else NoHdr (i - sz, buf) +- ); +- !newpacket ++ HaveHdr (Partial.of_string (Bytes.to_string buf)) else NoHdr (i - sz, 
buf); ++ None ++ ) + + let newcon backend = { + backend = backend; +- pkt_in = Queue.create (); + pkt_out = Queue.create (); + partial_in = init_partial_in (); + partial_out = ""; +@@ -193,9 +188,6 @@ let has_output con = has_new_output con || has_old_output con + + let peek_output con = Queue.peek con.pkt_out + +-let input_len con = Queue.length con.pkt_in +-let has_in_packet con = Queue.length con.pkt_in > 0 +-let get_in_packet con = Queue.pop con.pkt_in + let has_partial_input con = match con.partial_in with + | HaveHdr _ -> true + | NoHdr (n, _) -> n < Partial.header_size () +diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli +index 794e35bb343e..91c682162cea 100644 +--- a/tools/ocaml/libs/xb/xb.mli ++++ b/tools/ocaml/libs/xb/xb.mli +@@ -77,7 +77,7 @@ val write_fd : backend_fd -> 'a -> string -> int -> int + val write_mmap : backend_mmap -> 'a -> string -> int -> int + val write : t -> string -> int -> int + val output : t -> bool +-val input : t -> bool ++val input : t -> Packet.t option + val newcon : backend -> t + val open_fd : Unix.file_descr -> t + val open_mmap : Xenmmap.mmap_interface -> (unit -> unit) -> t +@@ -89,10 +89,7 @@ val has_new_output : t -> bool + val has_old_output : t -> bool + val has_output : t -> bool + val peek_output : t -> Packet.t +-val input_len : t -> int +-val has_in_packet : t -> bool + val has_partial_input : t -> bool +-val get_in_packet : t -> Packet.t + val has_more_input : t -> bool + val is_selectable : t -> bool + val get_fd : t -> Unix.file_descr +diff --git a/tools/ocaml/libs/xs/xsraw.ml b/tools/ocaml/libs/xs/xsraw.ml +index d982fb24dbb1..451f8b38dbcc 100644 +--- a/tools/ocaml/libs/xs/xsraw.ml ++++ b/tools/ocaml/libs/xs/xsraw.ml +@@ -94,26 +94,18 @@ let pkt_send con = + done + + (* receive one packet - can sleep *) +-let pkt_recv con = +- let workdone = ref false in +- while not !workdone +- do +- workdone := Xb.input con.xb +- done; +- Xb.get_in_packet con.xb ++let rec pkt_recv con = ++ match Xb.input 
con.xb with ++ | Some packet -> packet ++ | None -> pkt_recv con + + let pkt_recv_timeout con timeout = + let fd = Xb.get_fd con.xb in + let r, _, _ = Unix.select [ fd ] [] [] timeout in + if r = [] then + true, None +- else ( +- let workdone = Xb.input con.xb in +- if workdone then +- false, (Some (Xb.get_in_packet con.xb)) +- else +- false, None +- ) ++ else ++ false, Xb.input con.xb + + let queue_watchevent con data = + let ls = split_string ~limit:2 '\000' data in +diff --git a/tools/ocaml/xenstored/connection.ml b/tools/ocaml/xenstored/connection.ml +index 38b47363a173..cc20e047d2b9 100644 +--- a/tools/ocaml/xenstored/connection.ml ++++ b/tools/ocaml/xenstored/connection.ml +@@ -277,9 +277,7 @@ let get_transaction con tid = + Hashtbl.find con.transactions tid + + let do_input con = Xenbus.Xb.input con.xb +-let has_input con = Xenbus.Xb.has_in_packet con.xb + let has_partial_input con = Xenbus.Xb.has_partial_input con.xb +-let pop_in con = Xenbus.Xb.get_in_packet con.xb + let has_more_input con = Xenbus.Xb.has_more_input con.xb + + let has_output con = Xenbus.Xb.has_output con.xb +@@ -307,7 +305,7 @@ let is_bad con = match con.dom with None -> false | Some dom -> Domain.is_bad_do + Restrictions below can be relaxed once xenstored learns to dump more + of its live state in a safe way *) + let has_extra_connection_data con = +- let has_in = has_input con || has_partial_input con in ++ let has_in = has_partial_input con in + let has_out = has_output con in + let has_socket = con.dom = None in + let has_nondefault_perms = make_perm con.dom <> con.perm in +diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml +index dd58e6979cf9..cbf708213796 100644 +--- a/tools/ocaml/xenstored/process.ml ++++ b/tools/ocaml/xenstored/process.ml +@@ -195,10 +195,9 @@ let parse_live_update args = + | _ when Unix.gettimeofday () < t.deadline -> false + | l -> + warn "timeout reached: have to wait, migrate or shutdown %d domains:" (List.length l); +- let msgs 
= List.rev_map (fun con -> Printf.sprintf "%s: %d tx, in: %b, out: %b, perm: %s" ++ let msgs = List.rev_map (fun con -> Printf.sprintf "%s: %d tx, out: %b, perm: %s" + (Connection.get_domstr con) + (Connection.number_of_transactions con) +- (Connection.has_input con) + (Connection.has_output con) + (Connection.get_perm con |> Perms.Connection.to_string) + ) l in +@@ -706,16 +705,17 @@ let do_input store cons doms con = + info "%s requests a reconnect" (Connection.get_domstr con); + History.reconnect con; + info "%s reconnection complete" (Connection.get_domstr con); +- false ++ None + | Failure exp -> + error "caught exception %s" exp; + error "got a bad client %s" (sprintf "%-8s" (Connection.get_domstr con)); + Connection.mark_as_bad con; +- false ++ None + in + +- if newpacket then ( +- let packet = Connection.pop_in con in ++ match newpacket with ++ | None -> () ++ | Some packet -> + let tid, rid, ty, data = Xenbus.Xb.Packet.unpack packet in + let req = {Packet.tid=tid; Packet.rid=rid; Packet.ty=ty; Packet.data=data} in + +@@ -725,8 +725,7 @@ let do_input store cons doms con = + (Xenbus.Xb.Op.to_string ty) (sanitize_data data); *) + process_packet ~store ~cons ~doms ~con ~req; + write_access_log ~ty ~tid ~con:(Connection.get_domstr con) ~data; +- Connection.incr_ops con; +- ) ++ Connection.incr_ops con + + let do_output _store _cons _doms con = + if Connection.has_output con then ( +-- +2.37.4 + diff --git a/0106-tools-ocaml-xb-Add-BoundedQueue.patch b/0106-tools-ocaml-xb-Add-BoundedQueue.patch new file mode 100644 index 0000000..c1f0385 --- /dev/null +++ b/0106-tools-ocaml-xb-Add-BoundedQueue.patch @@ -0,0 +1,133 @@ +From 9e5290daf923e84ca56a6f3d9fc6a333175ef0f9 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Wed, 12 Oct 2022 19:13:03 +0100 +Subject: [PATCH 106/126] tools/ocaml/xb: Add BoundedQueue +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Ensures we cannot 
store more than [capacity] elements in a [Queue]. Replacing +all Queue with this module will then ensure at compile time that all Queues +are correctly bound checked. + +Each element in the queue has a class with its own limits. This, in a +subsequent change, will ensure that command responses can proceed during a +flood of watch events. + +No functional change. + +This is part of XSA-326. + +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit 19171fb5d888b4467a7073e8febc5e05540956e9) +--- + tools/ocaml/libs/xb/xb.ml | 92 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 92 insertions(+) + +diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml +index 165fd4a1edf4..4197a3888a68 100644 +--- a/tools/ocaml/libs/xb/xb.ml ++++ b/tools/ocaml/libs/xb/xb.ml +@@ -17,6 +17,98 @@ + module Op = struct include Op end + module Packet = struct include Packet end + ++module BoundedQueue : sig ++ type ('a, 'b) t ++ ++ (** [create ~capacity ~classify ~limit] creates a queue with maximum [capacity] elements. ++ This is burst capacity, each element is further classified according to [classify], ++ and each class can have its own [limit]. ++ [capacity] is enforced as an overall limit. ++ The [limit] can be dynamic, and can be smaller than the number of elements already queued of that class, ++ in which case those elements are considered to use "burst capacity". ++ *) ++ val create: capacity:int -> classify:('a -> 'b) -> limit:('b -> int) -> ('a, 'b) t ++ ++ (** [clear q] discards all elements from [q] *) ++ val clear: ('a, 'b) t -> unit ++ ++ (** [can_push q] when [length q < capacity]. *) ++ val can_push: ('a, 'b) t -> 'b -> bool ++ ++ (** [push e q] adds [e] at the end of queue [q] if [can_push q], or returns [None]. *) ++ val push: 'a -> ('a, 'b) t -> unit option ++ ++ (** [pop q] removes and returns first element in [q], or raises [Queue.Empty]. 
*) ++ val pop: ('a, 'b) t -> 'a ++ ++ (** [peek q] returns the first element in [q], or raises [Queue.Empty]. *) ++ val peek : ('a, 'b) t -> 'a ++ ++ (** [length q] returns the current number of elements in [q] *) ++ val length: ('a, 'b) t -> int ++ ++ (** [debug string_of_class q] prints queue usage statistics in an unspecified internal format. *) ++ val debug: ('b -> string) -> (_, 'b) t -> string ++end = struct ++ type ('a, 'b) t = ++ { q: 'a Queue.t ++ ; capacity: int ++ ; classify: 'a -> 'b ++ ; limit: 'b -> int ++ ; class_count: ('b, int) Hashtbl.t ++ } ++ ++ let create ~capacity ~classify ~limit = ++ { capacity; q = Queue.create (); classify; limit; class_count = Hashtbl.create 3 } ++ ++ let get_count t classification = try Hashtbl.find t.class_count classification with Not_found -> 0 ++ ++ let can_push_internal t classification class_count = ++ Queue.length t.q < t.capacity && class_count < t.limit classification ++ ++ let ok = Some () ++ ++ let push e t = ++ let classification = t.classify e in ++ let class_count = get_count t classification in ++ if can_push_internal t classification class_count then begin ++ Queue.push e t.q; ++ Hashtbl.replace t.class_count classification (class_count + 1); ++ ok ++ end ++ else ++ None ++ ++ let can_push t classification = ++ can_push_internal t classification @@ get_count t classification ++ ++ let clear t = ++ Queue.clear t.q; ++ Hashtbl.reset t.class_count ++ ++ let pop t = ++ let e = Queue.pop t.q in ++ let classification = t.classify e in ++ let () = match get_count t classification - 1 with ++ | 0 -> Hashtbl.remove t.class_count classification (* reduces memusage *) ++ | n -> Hashtbl.replace t.class_count classification n ++ in ++ e ++ ++ let peek t = Queue.peek t.q ++ let length t = Queue.length t.q ++ ++ let debug string_of_class t = ++ let b = Buffer.create 128 in ++ Printf.bprintf b "BoundedQueue capacity: %d, used: {" t.capacity; ++ Hashtbl.iter (fun packet_class count -> ++ Printf.bprintf b " %s: %d" 
(string_of_class packet_class) count ++ ) t.class_count; ++ Printf.bprintf b "}"; ++ Buffer.contents b ++end ++ ++ + exception End_of_file + exception Eagain + exception Noent +-- +2.37.4 + diff --git a/0107-tools-ocaml-Limit-maximum-in-flight-requests-outstan.patch b/0107-tools-ocaml-Limit-maximum-in-flight-requests-outstan.patch new file mode 100644 index 0000000..5f5c4b6 --- /dev/null +++ b/0107-tools-ocaml-Limit-maximum-in-flight-requests-outstan.patch @@ -0,0 +1,888 @@ +From 64048b4c218099b6adcf46cd7b4d1dc9c658009e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Wed, 12 Oct 2022 19:13:04 +0100 +Subject: [PATCH 107/126] tools/ocaml: Limit maximum in-flight requests / + outstanding replies +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Introduce a limit on the number of outstanding reply packets in the xenbus +queue. This limits the number of in-flight requests: when the output queue is +full we'll stop processing inputs until the output queue has room again. + +To avoid a busy loop on the Unix socket we only add it to the watched input +file descriptor set if we'd be able to call `input` on it. Even though Dom0 +is trusted and exempt from quotas a flood of events might cause a backlog +where events are produced faster than daemons in Dom0 can consume them, which +could lead to an unbounded queue size and OOM. + +Therefore the xenbus queue limit must apply to all connections, Dom0 is not +exempt from it, although if everything works correctly it will eventually +catch up. + +This prevents a malicious guest from sending more commands while it has +outstanding watch events or command replies in its input ring. However if it +can cause the generation of watch events by other means (e.g. by Dom0, or +another cooperative guest) and stop reading its own ring then watch events +would've queued up without limit. 
+ +The xenstore protocol doesn't have a back-pressure mechanism, and doesn't +allow dropping watch events. In fact, dropping watch events is known to break +some pieces of normal functionality. This leaves little choice to safely +implement the xenstore protocol without exposing the xenstore daemon to +out-of-memory attacks. + +Implement the fix as pipes with bounded buffers: +* Use a bounded buffer for watch events +* The watch structure will have a bounded receiving pipe of watch events +* The source will have an "overflow" pipe of pending watch events it couldn't + deliver + +Items are queued up on one end and are sent as far along the pipe as possible: + + source domain -> watch -> xenbus of target -> xenstore ring/socket of target + +If the pipe is "full" at any point then back-pressure is applied and we prevent +more items from being queued up. For the source domain this means that we'll +stop accepting new commands as long as its pipe buffer is not empty. + +Before we try to enqueue an item we first check whether it is possible to send +it further down the pipe, by attempting to recursively flush the pipes. This +ensures that we retain the order of events as much as possible. + +We might break causality of watch events if the target domain's queue is full +and we need to start using the watch's queue. This is a breaking change in +the xenstore protocol, but only for domains which are not processing their +incoming ring as expected. + +When a watch is deleted its entire pending queue is dropped (no code is needed +for that, because it is part of the 'watch' type). + +There is a cache of watches that have pending events that we attempt to flush +at every cycle if possible. + +Introduce 3 limits here: +* quota-maxwatchevents on watch event destination: when this is hit the + source will not be allowed to queue up more watch events. 
+* quota-maxoustanding which is the number of responses not read from the ring: + once exceeded, no more inputs are processed until all outstanding replies + are consumed by the client. +* overflow queue on the watch event source: all watches that cannot be stored + on destination are queued up here, a single command can trigger multiple + watches (e.g. due to recursion). + +The overflow queue currently doesn't have an upper bound, it is difficult to +accurately calculate one as it depends on whether you are Dom0 and how many +watches each path has registered and how many watch events you can trigger +with a single command (e.g. a commit). However these events were already +using memory, this just moves them elsewhere, and as long as we correctly +block a domain it shouldn't result in unbounded memory usage. + +Note that Dom0 is not excluded from these checks, it is important that Dom0 is +especially not excluded when it is the source, since there are many ways in +which a guest could trigger Dom0 to send it watch events. + +This should protect against malicious frontends as long as the backend follows +the PV xenstore protocol and only exposes paths needed by the frontend, and +changes those paths at most once as a reaction to guest events, or protocol +state. + +The queue limits are per watch, and per domain-pair, so even if one +communication channel would be "blocked", others would keep working, and the +domain itself won't get blocked as long as it doesn't overflow the queue of +watch events. + +Similarly a malicious backend could cause the frontend to get blocked, but +this watch queue protects the frontend as well as long as it follows the PV +protocol. (Although note that protection against malicious backends is only a +best effort at the moment) + +This is part of XSA-326 / CVE-2022-42318. 
+ +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit 9284ae0c40fb5b9606947eaaec23dc71d0540e96) +--- + tools/ocaml/libs/xb/xb.ml | 61 +++++++-- + tools/ocaml/libs/xb/xb.mli | 11 +- + tools/ocaml/libs/xs/queueop.ml | 25 ++-- + tools/ocaml/libs/xs/xsraw.ml | 4 +- + tools/ocaml/xenstored/connection.ml | 155 +++++++++++++++++++++-- + tools/ocaml/xenstored/connections.ml | 57 +++++++-- + tools/ocaml/xenstored/define.ml | 7 + + tools/ocaml/xenstored/oxenstored.conf.in | 2 + + tools/ocaml/xenstored/process.ml | 31 ++++- + tools/ocaml/xenstored/xenstored.ml | 2 + + 10 files changed, 296 insertions(+), 59 deletions(-) + +diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml +index 4197a3888a68..b292ed7a874d 100644 +--- a/tools/ocaml/libs/xb/xb.ml ++++ b/tools/ocaml/libs/xb/xb.ml +@@ -134,14 +134,44 @@ type backend = Fd of backend_fd | Xenmmap of backend_mmap + + type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes + ++(* ++ separate capacity reservation for replies and watch events: ++ this allows a domain to keep working even when under a constant flood of ++ watch events ++*) ++type capacity = { maxoutstanding: int; maxwatchevents: int } ++ ++module Queue = BoundedQueue ++ ++type packet_class = ++ | CommandReply ++ | Watchevent ++ ++let string_of_packet_class = function ++ | CommandReply -> "command_reply" ++ | Watchevent -> "watch_event" ++ + type t = + { + backend: backend; +- pkt_out: Packet.t Queue.t; ++ pkt_out: (Packet.t, packet_class) Queue.t; + mutable partial_in: partial_buf; + mutable partial_out: string; ++ capacity: capacity + } + ++let to_read con = ++ match con.partial_in with ++ | HaveHdr partial_pkt -> Partial.to_complete partial_pkt ++ | NoHdr (i, _) -> i ++ ++let debug t = ++ Printf.sprintf "XenBus state: partial_in: %d needed, partial_out: %d bytes, pkt_out: %d packets, %s" ++ (to_read t) ++ (String.length t.partial_out) ++ (Queue.length 
t.pkt_out) ++ (BoundedQueue.debug string_of_packet_class t.pkt_out) ++ + let init_partial_in () = NoHdr + (Partial.header_size (), Bytes.make (Partial.header_size()) '\000') + +@@ -199,7 +229,8 @@ let output con = + let s = if String.length con.partial_out > 0 then + con.partial_out + else if Queue.length con.pkt_out > 0 then +- Packet.to_string (Queue.pop con.pkt_out) ++ let pkt = Queue.pop con.pkt_out in ++ Packet.to_string pkt + else + "" in + (* send data from s, and save the unsent data to partial_out *) +@@ -212,12 +243,15 @@ let output con = + (* after sending one packet, partial is empty *) + con.partial_out = "" + ++(* we can only process an input packet if we're guaranteed to have room ++ to store the response packet *) ++let can_input con = Queue.can_push con.pkt_out CommandReply ++ + (* NB: can throw Reconnect *) + let input con = +- let to_read = +- match con.partial_in with +- | HaveHdr partial_pkt -> Partial.to_complete partial_pkt +- | NoHdr (i, _) -> i in ++ if not (can_input con) then None ++ else ++ let to_read = to_read con in + + (* try to get more data from input stream *) + let b = Bytes.make to_read '\000' in +@@ -243,11 +277,22 @@ let input con = + None + ) + +-let newcon backend = { ++let classify t = ++ match t.Packet.ty with ++ | Op.Watchevent -> Watchevent ++ | _ -> CommandReply ++ ++let newcon ~capacity backend = ++ let limit = function ++ | CommandReply -> capacity.maxoutstanding ++ | Watchevent -> capacity.maxwatchevents ++ in ++ { + backend = backend; +- pkt_out = Queue.create (); ++ pkt_out = Queue.create ~capacity:(capacity.maxoutstanding + capacity.maxwatchevents) ~classify ~limit; + partial_in = init_partial_in (); + partial_out = ""; ++ capacity = capacity; + } + + let open_fd fd = newcon (Fd { fd = fd; }) +diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli +index 91c682162cea..71b2754ca788 100644 +--- a/tools/ocaml/libs/xb/xb.mli ++++ b/tools/ocaml/libs/xb/xb.mli +@@ -66,10 +66,11 @@ type backend_mmap = { + 
type backend_fd = { fd : Unix.file_descr; } + type backend = Fd of backend_fd | Xenmmap of backend_mmap + type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes ++type capacity = { maxoutstanding: int; maxwatchevents: int } + type t + val init_partial_in : unit -> partial_buf + val reconnect : t -> unit +-val queue : t -> Packet.t -> unit ++val queue : t -> Packet.t -> unit option + val read_fd : backend_fd -> 'a -> bytes -> int -> int + val read_mmap : backend_mmap -> 'a -> bytes -> int -> int + val read : t -> bytes -> int -> int +@@ -78,13 +79,14 @@ val write_mmap : backend_mmap -> 'a -> string -> int -> int + val write : t -> string -> int -> int + val output : t -> bool + val input : t -> Packet.t option +-val newcon : backend -> t +-val open_fd : Unix.file_descr -> t +-val open_mmap : Xenmmap.mmap_interface -> (unit -> unit) -> t ++val newcon : capacity:capacity -> backend -> t ++val open_fd : Unix.file_descr -> capacity:capacity -> t ++val open_mmap : Xenmmap.mmap_interface -> (unit -> unit) -> capacity:capacity -> t + val close : t -> unit + val is_fd : t -> bool + val is_mmap : t -> bool + val output_len : t -> int ++val can_input: t -> bool + val has_new_output : t -> bool + val has_old_output : t -> bool + val has_output : t -> bool +@@ -93,3 +95,4 @@ val has_partial_input : t -> bool + val has_more_input : t -> bool + val is_selectable : t -> bool + val get_fd : t -> Unix.file_descr ++val debug: t -> string +diff --git a/tools/ocaml/libs/xs/queueop.ml b/tools/ocaml/libs/xs/queueop.ml +index 9ff5bbd529ce..4e532cdaeacb 100644 +--- a/tools/ocaml/libs/xs/queueop.ml ++++ b/tools/ocaml/libs/xs/queueop.ml +@@ -16,9 +16,10 @@ + open Xenbus + + let data_concat ls = (String.concat "\000" ls) ^ "\000" ++let queue con pkt = let r = Xb.queue con pkt in assert (r <> None) + let queue_path ty (tid: int) (path: string) con = + let data = data_concat [ path; ] in +- Xb.queue con (Xb.Packet.create tid 0 ty data) ++ queue con (Xb.Packet.create tid 0 ty data) + + 
(* operations *) + let directory tid path con = queue_path Xb.Op.Directory tid path con +@@ -27,48 +28,48 @@ let read tid path con = queue_path Xb.Op.Read tid path con + let getperms tid path con = queue_path Xb.Op.Getperms tid path con + + let debug commands con = +- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Debug (data_concat commands)) ++ queue con (Xb.Packet.create 0 0 Xb.Op.Debug (data_concat commands)) + + let watch path data con = + let data = data_concat [ path; data; ] in +- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Watch data) ++ queue con (Xb.Packet.create 0 0 Xb.Op.Watch data) + + let unwatch path data con = + let data = data_concat [ path; data; ] in +- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Unwatch data) ++ queue con (Xb.Packet.create 0 0 Xb.Op.Unwatch data) + + let transaction_start con = +- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Transaction_start (data_concat [])) ++ queue con (Xb.Packet.create 0 0 Xb.Op.Transaction_start (data_concat [])) + + let transaction_end tid commit con = + let data = data_concat [ (if commit then "T" else "F"); ] in +- Xb.queue con (Xb.Packet.create tid 0 Xb.Op.Transaction_end data) ++ queue con (Xb.Packet.create tid 0 Xb.Op.Transaction_end data) + + let introduce domid mfn port con = + let data = data_concat [ Printf.sprintf "%u" domid; + Printf.sprintf "%nu" mfn; + string_of_int port; ] in +- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Introduce data) ++ queue con (Xb.Packet.create 0 0 Xb.Op.Introduce data) + + let release domid con = + let data = data_concat [ Printf.sprintf "%u" domid; ] in +- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Release data) ++ queue con (Xb.Packet.create 0 0 Xb.Op.Release data) + + let resume domid con = + let data = data_concat [ Printf.sprintf "%u" domid; ] in +- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Resume data) ++ queue con (Xb.Packet.create 0 0 Xb.Op.Resume data) + + let getdomainpath domid con = + let data = data_concat [ Printf.sprintf "%u" domid; ] in +- Xb.queue con (Xb.Packet.create 0 
0 Xb.Op.Getdomainpath data) ++ queue con (Xb.Packet.create 0 0 Xb.Op.Getdomainpath data) + + let write tid path value con = + let data = path ^ "\000" ^ value (* no NULL at the end *) in +- Xb.queue con (Xb.Packet.create tid 0 Xb.Op.Write data) ++ queue con (Xb.Packet.create tid 0 Xb.Op.Write data) + + let mkdir tid path con = queue_path Xb.Op.Mkdir tid path con + let rm tid path con = queue_path Xb.Op.Rm tid path con + + let setperms tid path perms con = + let data = data_concat [ path; perms ] in +- Xb.queue con (Xb.Packet.create tid 0 Xb.Op.Setperms data) ++ queue con (Xb.Packet.create tid 0 Xb.Op.Setperms data) +diff --git a/tools/ocaml/libs/xs/xsraw.ml b/tools/ocaml/libs/xs/xsraw.ml +index 451f8b38dbcc..cbd17280600c 100644 +--- a/tools/ocaml/libs/xs/xsraw.ml ++++ b/tools/ocaml/libs/xs/xsraw.ml +@@ -36,8 +36,10 @@ type con = { + let close con = + Xb.close con.xb + ++let capacity = { Xb.maxoutstanding = 1; maxwatchevents = 0; } ++ + let open_fd fd = { +- xb = Xb.open_fd fd; ++ xb = Xb.open_fd ~capacity fd; + watchevents = Queue.create (); + } + +diff --git a/tools/ocaml/xenstored/connection.ml b/tools/ocaml/xenstored/connection.ml +index cc20e047d2b9..9624a5f9da2c 100644 +--- a/tools/ocaml/xenstored/connection.ml ++++ b/tools/ocaml/xenstored/connection.ml +@@ -20,12 +20,84 @@ open Stdext + + let xenstore_payload_max = 4096 (* xen/include/public/io/xs_wire.h *) + ++type 'a bounded_sender = 'a -> unit option ++(** a bounded sender accepts an ['a] item and returns: ++ None - if there is no room to accept the item ++ Some () - if it has successfully accepted/sent the item ++ *) ++ ++module BoundedPipe : sig ++ type 'a t ++ ++ (** [create ~capacity ~destination] creates a bounded pipe with a ++ local buffer holding at most [capacity] items. Once the buffer is ++ full it will not accept further items. items from the pipe are ++ flushed into [destination] as long as it accepts items. The ++ destination could be another pipe. 
++ *) ++ val create: capacity:int -> destination:'a bounded_sender -> 'a t ++ ++ (** [is_empty t] returns whether the local buffer of [t] is empty. *) ++ val is_empty : _ t -> bool ++ ++ (** [length t] the number of items in the internal buffer *) ++ val length: _ t -> int ++ ++ (** [flush_pipe t] sends as many items from the local buffer as possible, ++ which could be none. *) ++ val flush_pipe: _ t -> unit ++ ++ (** [push t item] tries to [flush_pipe] and then push [item] ++ into the pipe if its [capacity] allows. ++ Returns [None] if there is no more room ++ *) ++ val push : 'a t -> 'a bounded_sender ++end = struct ++ (* items are enqueued in [q], and then flushed to [connect_to] *) ++ type 'a t = ++ { q: 'a Queue.t ++ ; destination: 'a bounded_sender ++ ; capacity: int ++ } ++ ++ let create ~capacity ~destination = ++ { q = Queue.create (); capacity; destination } ++ ++ let rec flush_pipe t = ++ if not Queue.(is_empty t.q) then ++ let item = Queue.peek t.q in ++ match t.destination item with ++ | None -> () (* no room *) ++ | Some () -> ++ (* successfully sent item to next stage *) ++ let _ = Queue.pop t.q in ++ (* continue trying to send more items *) ++ flush_pipe t ++ ++ let push t item = ++ (* first try to flush as many items from this pipe as possible to make room, ++ it is important to do this first to preserve the order of the items ++ *) ++ flush_pipe t; ++ if Queue.length t.q < t.capacity then begin ++ (* enqueue, instead of sending directly. 
++ this ensures that [out] sees the items in the same order as we receive them ++ *) ++ Queue.push item t.q; ++ Some (flush_pipe t) ++ end else None ++ ++ let is_empty t = Queue.is_empty t.q ++ let length t = Queue.length t.q ++end ++ + type watch = { + con: t; + token: string; + path: string; + base: string; + is_relative: bool; ++ pending_watchevents: Xenbus.Xb.Packet.t BoundedPipe.t; + } + + and t = { +@@ -38,8 +110,36 @@ and t = { + anonid: int; + mutable stat_nb_ops: int; + mutable perm: Perms.Connection.t; ++ pending_source_watchevents: (watch * Xenbus.Xb.Packet.t) BoundedPipe.t + } + ++module Watch = struct ++ module T = struct ++ type t = watch ++ ++ let compare w1 w2 = ++ (* cannot compare watches from different connections *) ++ assert (w1.con == w2.con); ++ match String.compare w1.token w2.token with ++ | 0 -> String.compare w1.path w2.path ++ | n -> n ++ end ++ module Set = Set.Make(T) ++ ++ let flush_events t = ++ BoundedPipe.flush_pipe t.pending_watchevents; ++ not (BoundedPipe.is_empty t.pending_watchevents) ++ ++ let pending_watchevents t = ++ BoundedPipe.length t.pending_watchevents ++end ++ ++let source_flush_watchevents t = ++ BoundedPipe.flush_pipe t.pending_source_watchevents ++ ++let source_pending_watchevents t = ++ BoundedPipe.length t.pending_source_watchevents ++ + let mark_as_bad con = + match con.dom with + |None -> () +@@ -67,7 +167,8 @@ let watch_create ~con ~path ~token = { + token = token; + path = path; + base = get_path con; +- is_relative = path.[0] <> '/' && path.[0] <> '@' ++ is_relative = path.[0] <> '/' && path.[0] <> '@'; ++ pending_watchevents = BoundedPipe.create ~capacity:!Define.maxwatchevents ~destination:(Xenbus.Xb.queue con.xb) + } + + let get_con w = w.con +@@ -93,6 +194,9 @@ let make_perm dom = + Perms.Connection.create ~perms:[Perms.READ; Perms.WRITE] domid + + let create xbcon dom = ++ let destination (watch, pkt) = ++ BoundedPipe.push watch.pending_watchevents pkt ++ in + let id = + match dom with + | None -> let 
old = !anon_id_next in incr anon_id_next; old +@@ -109,6 +213,16 @@ let create xbcon dom = + anonid = id; + stat_nb_ops = 0; + perm = make_perm dom; ++ ++ (* the actual capacity will be lower, this is used as an overflow ++ buffer: anything that doesn't fit elsewhere gets put here, only ++ limited by the amount of watches that you can generate with a ++ single xenstore command (which is finite, although possibly very ++ large in theory for Dom0). Once the pipe here has any contents the ++ domain is blocked from sending more commands until it is empty ++ again though. ++ *) ++ pending_source_watchevents = BoundedPipe.create ~capacity:Sys.max_array_length ~destination + } + in + Logging.new_connection ~tid:Transaction.none ~con:(get_domstr con); +@@ -127,11 +241,17 @@ let set_target con target_domid = + + let is_backend_mmap con = Xenbus.Xb.is_mmap con.xb + +-let send_reply con tid rid ty data = ++let packet_of con tid rid ty data = + if (String.length data) > xenstore_payload_max && (is_backend_mmap con) then +- Xenbus.Xb.queue con.xb (Xenbus.Xb.Packet.create tid rid Xenbus.Xb.Op.Error "E2BIG\000") ++ Xenbus.Xb.Packet.create tid rid Xenbus.Xb.Op.Error "E2BIG\000" + else +- Xenbus.Xb.queue con.xb (Xenbus.Xb.Packet.create tid rid ty data) ++ Xenbus.Xb.Packet.create tid rid ty data ++ ++let send_reply con tid rid ty data = ++ let result = Xenbus.Xb.queue con.xb (packet_of con tid rid ty data) in ++ (* should never happen: we only process an input packet when there is room for an output packet *) ++ (* and the limit for replies is different from the limit for watch events *) ++ assert (result <> None) + + let send_error con tid rid err = send_reply con tid rid Xenbus.Xb.Op.Error (err ^ "\000") + let send_ack con tid rid ty = send_reply con tid rid ty "OK\000" +@@ -181,11 +301,11 @@ let del_watch con path token = + apath, w + + let del_watches con = +- Hashtbl.clear con.watches; ++ Hashtbl.reset con.watches; + con.nb_watches <- 0 + + let del_transactions con = +- 
Hashtbl.clear con.transactions ++ Hashtbl.reset con.transactions + + let list_watches con = + let ll = Hashtbl.fold +@@ -208,21 +328,29 @@ let lookup_watch_perm path = function + let lookup_watch_perms oldroot root path = + lookup_watch_perm path oldroot @ lookup_watch_perm path (Some root) + +-let fire_single_watch_unchecked watch = ++let fire_single_watch_unchecked source watch = + let data = Utils.join_by_null [watch.path; watch.token; ""] in +- send_reply watch.con Transaction.none 0 Xenbus.Xb.Op.Watchevent data ++ let pkt = packet_of watch.con Transaction.none 0 Xenbus.Xb.Op.Watchevent data in + +-let fire_single_watch (oldroot, root) watch = ++ match BoundedPipe.push source.pending_source_watchevents (watch, pkt) with ++ | Some () -> () (* packet queued *) ++ | None -> ++ (* a well behaved Dom0 shouldn't be able to trigger this, ++ if it happens it is likely a Dom0 bug causing runaway memory usage ++ *) ++ failwith "watch event overflow, cannot happen" ++ ++let fire_single_watch source (oldroot, root) watch = + let abspath = get_watch_path watch.con watch.path |> Store.Path.of_string in + let perms = lookup_watch_perms oldroot root abspath in + if Perms.can_fire_watch watch.con.perm perms then +- fire_single_watch_unchecked watch ++ fire_single_watch_unchecked source watch + else + let perms = perms |> List.map (Perms.Node.to_string ~sep:" ") |> String.concat ", " in + let con = get_domstr watch.con in + Logging.watch_not_fired ~con perms (Store.Path.to_string abspath) + +-let fire_watch roots watch path = ++let fire_watch source roots watch path = + let new_path = + if watch.is_relative && path.[0] = '/' + then begin +@@ -232,7 +360,7 @@ let fire_watch roots watch path = + end else + path + in +- fire_single_watch roots { watch with path = new_path } ++ fire_single_watch source roots { watch with path = new_path } + + (* Search for a valid unused transaction id. 
*) + let rec valid_transaction_id con proposed_id = +@@ -280,6 +408,7 @@ let do_input con = Xenbus.Xb.input con.xb + let has_partial_input con = Xenbus.Xb.has_partial_input con.xb + let has_more_input con = Xenbus.Xb.has_more_input con.xb + ++let can_input con = Xenbus.Xb.can_input con.xb && BoundedPipe.is_empty con.pending_source_watchevents + let has_output con = Xenbus.Xb.has_output con.xb + let has_old_output con = Xenbus.Xb.has_old_output con.xb + let has_new_output con = Xenbus.Xb.has_new_output con.xb +@@ -323,7 +452,7 @@ let prevents_live_update con = not (is_bad con) + && (has_extra_connection_data con || has_transaction_data con) + + let has_more_work con = +- has_more_input con || not (has_old_output con) && has_new_output con ++ (has_more_input con && can_input con) || not (has_old_output con) && has_new_output con + + let incr_ops con = con.stat_nb_ops <- con.stat_nb_ops + 1 + +diff --git a/tools/ocaml/xenstored/connections.ml b/tools/ocaml/xenstored/connections.ml +index 3c7429fe7f61..7d68c583b43a 100644 +--- a/tools/ocaml/xenstored/connections.ml ++++ b/tools/ocaml/xenstored/connections.ml +@@ -22,22 +22,30 @@ type t = { + domains: (int, Connection.t) Hashtbl.t; + ports: (Xeneventchn.t, Connection.t) Hashtbl.t; + mutable watches: Connection.watch list Trie.t; ++ mutable has_pending_watchevents: Connection.Watch.Set.t + } + + let create () = { + anonymous = Hashtbl.create 37; + domains = Hashtbl.create 37; + ports = Hashtbl.create 37; +- watches = Trie.create () ++ watches = Trie.create (); ++ has_pending_watchevents = Connection.Watch.Set.empty; + } + ++let get_capacity () = ++ (* not multiplied by maxwatch on purpose: 2nd queue in watch itself! 
*) ++ { Xenbus.Xb.maxoutstanding = !Define.maxoutstanding; maxwatchevents = !Define.maxwatchevents } ++ + let add_anonymous cons fd = +- let xbcon = Xenbus.Xb.open_fd fd in ++ let capacity = get_capacity () in ++ let xbcon = Xenbus.Xb.open_fd fd ~capacity in + let con = Connection.create xbcon None in + Hashtbl.add cons.anonymous (Xenbus.Xb.get_fd xbcon) con + + let add_domain cons dom = +- let xbcon = Xenbus.Xb.open_mmap (Domain.get_interface dom) (fun () -> Domain.notify dom) in ++ let capacity = get_capacity () in ++ let xbcon = Xenbus.Xb.open_mmap ~capacity (Domain.get_interface dom) (fun () -> Domain.notify dom) in + let con = Connection.create xbcon (Some dom) in + Hashtbl.add cons.domains (Domain.get_id dom) con; + match Domain.get_port dom with +@@ -48,7 +56,9 @@ let select ?(only_if = (fun _ -> true)) cons = + Hashtbl.fold (fun _ con (ins, outs) -> + if (only_if con) then ( + let fd = Connection.get_fd con in +- (fd :: ins, if Connection.has_output con then fd :: outs else outs) ++ let in_fds = if Connection.can_input con then fd :: ins else ins in ++ let out_fds = if Connection.has_output con then fd :: outs else outs in ++ in_fds, out_fds + ) else (ins, outs) + ) + cons.anonymous ([], []) +@@ -67,10 +77,17 @@ let del_watches_of_con con watches = + | [] -> None + | ws -> Some ws + ++let del_watches cons con = ++ Connection.del_watches con; ++ cons.watches <- Trie.map (del_watches_of_con con) cons.watches; ++ cons.has_pending_watchevents <- ++ cons.has_pending_watchevents |> Connection.Watch.Set.filter @@ fun w -> ++ Connection.get_con w != con ++ + let del_anonymous cons con = + try + Hashtbl.remove cons.anonymous (Connection.get_fd con); +- cons.watches <- Trie.map (del_watches_of_con con) cons.watches; ++ del_watches cons con; + Connection.close con + with exn -> + debug "del anonymous %s" (Printexc.to_string exn) +@@ -85,7 +102,7 @@ let del_domain cons id = + | Some p -> Hashtbl.remove cons.ports p + | None -> ()) + | None -> ()); +- cons.watches <- 
Trie.map (del_watches_of_con con) cons.watches; ++ del_watches cons con; + Connection.close con + with exn -> + debug "del domain %u: %s" id (Printexc.to_string exn) +@@ -136,31 +153,33 @@ let del_watch cons con path token = + cons.watches <- Trie.set cons.watches key watches; + watch + +-let del_watches cons con = +- Connection.del_watches con; +- cons.watches <- Trie.map (del_watches_of_con con) cons.watches +- + (* path is absolute *) +-let fire_watches ?oldroot root cons path recurse = ++let fire_watches ?oldroot source root cons path recurse = + let key = key_of_path path in + let path = Store.Path.to_string path in + let roots = oldroot, root in + let fire_watch _ = function + | None -> () +- | Some watches -> List.iter (fun w -> Connection.fire_watch roots w path) watches ++ | Some watches -> List.iter (fun w -> Connection.fire_watch source roots w path) watches + in + let fire_rec _x = function + | None -> () + | Some watches -> +- List.iter (Connection.fire_single_watch roots) watches ++ List.iter (Connection.fire_single_watch source roots) watches + in + Trie.iter_path fire_watch cons.watches key; + if recurse then + Trie.iter fire_rec (Trie.sub cons.watches key) + ++let send_watchevents cons con = ++ cons.has_pending_watchevents <- ++ cons.has_pending_watchevents |> Connection.Watch.Set.filter Connection.Watch.flush_events; ++ Connection.source_flush_watchevents con ++ + let fire_spec_watches root cons specpath = ++ let source = find_domain cons 0 in + iter cons (fun con -> +- List.iter (Connection.fire_single_watch (None, root)) (Connection.get_watches con specpath)) ++ List.iter (Connection.fire_single_watch source (None, root)) (Connection.get_watches con specpath)) + + let set_target cons domain target_domain = + let con = find_domain cons domain in +@@ -197,6 +216,16 @@ let debug cons = + let domains = Hashtbl.fold (fun _ con accu -> Connection.debug con :: accu) cons.domains [] in + String.concat "" (domains @ anonymous) + ++let debug_watchevents 
cons con = ++ (* == (physical equality) ++ has to be used here because w.con.xb.backend might contain a [unit->unit] value causing regular ++ comparison to fail due to having a 'functional value' which cannot be compared. ++ *) ++ let s = cons.has_pending_watchevents |> Connection.Watch.Set.filter (fun w -> w.con == con) in ++ let pending = s |> Connection.Watch.Set.elements ++ |> List.map (fun w -> Connection.Watch.pending_watchevents w) |> List.fold_left (+) 0 in ++ Printf.sprintf "Watches with pending events: %d, pending events total: %d" (Connection.Watch.Set.cardinal s) pending ++ + let filter ~f cons = + let fold _ v acc = if f v then v :: acc else acc in + [] +diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml +index ba63a8147e09..327b6d795ec7 100644 +--- a/tools/ocaml/xenstored/define.ml ++++ b/tools/ocaml/xenstored/define.ml +@@ -24,6 +24,13 @@ let default_config_dir = Paths.xen_config_dir + let maxwatch = ref (100) + let maxtransaction = ref (10) + let maxrequests = ref (1024) (* maximum requests per transaction *) ++let maxoutstanding = ref (1024) (* maximum outstanding requests, i.e. 
in-flight requests / domain *) ++let maxwatchevents = ref (1024) ++(* ++ maximum outstanding watch events per watch, ++ recommended >= maxoutstanding to avoid blocking backend transactions due to ++ malicious frontends ++ *) + + let gc_max_overhead = ref 120 (* 120% see comment in xenstored.ml *) + let conflict_burst_limit = ref 5.0 +diff --git a/tools/ocaml/xenstored/oxenstored.conf.in b/tools/ocaml/xenstored/oxenstored.conf.in +index 4ae48e42d47d..9d034e744b4b 100644 +--- a/tools/ocaml/xenstored/oxenstored.conf.in ++++ b/tools/ocaml/xenstored/oxenstored.conf.in +@@ -62,6 +62,8 @@ quota-maxwatch = 100 + quota-transaction = 10 + quota-maxrequests = 1024 + quota-path-max = 1024 ++quota-maxoutstanding = 1024 ++quota-maxwatchevents = 1024 + + # Activate filed base backend + persistent = false +diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml +index cbf708213796..ce39ce28b5f3 100644 +--- a/tools/ocaml/xenstored/process.ml ++++ b/tools/ocaml/xenstored/process.ml +@@ -57,7 +57,7 @@ let split_one_path data con = + | path :: "" :: [] -> Store.Path.create path (Connection.get_path con) + | _ -> raise Invalid_Cmd_Args + +-let process_watch t cons = ++let process_watch source t cons = + let oldroot = t.Transaction.oldroot in + let newroot = Store.get_root t.store in + let ops = Transaction.get_paths t |> List.rev in +@@ -67,8 +67,9 @@ let process_watch t cons = + | Xenbus.Xb.Op.Rm -> true, None, oldroot + | Xenbus.Xb.Op.Setperms -> false, Some oldroot, newroot + | _ -> raise (Failure "huh ?") in +- Connections.fire_watches ?oldroot root cons (snd op) recurse in +- List.iter (fun op -> do_op_watch op cons) ops ++ Connections.fire_watches ?oldroot source root cons (snd op) recurse in ++ List.iter (fun op -> do_op_watch op cons) ops; ++ Connections.send_watchevents cons source + + let create_implicit_path t perm path = + let dirname = Store.Path.get_parent path in +@@ -234,6 +235,20 @@ let do_debug con t _domains cons data = + | "watches" :: _ -> 
+ let watches = Connections.debug cons in + Some (watches ^ "\000") ++ | "xenbus" :: domid :: _ -> ++ let domid = int_of_string domid in ++ let con = Connections.find_domain cons domid in ++ let s = Printf.sprintf "xenbus: %s; overflow queue length: %d, can_input: %b, has_more_input: %b, has_old_output: %b, has_new_output: %b, has_more_work: %b. pending: %s" ++ (Xenbus.Xb.debug con.xb) ++ (Connection.source_pending_watchevents con) ++ (Connection.can_input con) ++ (Connection.has_more_input con) ++ (Connection.has_old_output con) ++ (Connection.has_new_output con) ++ (Connection.has_more_work con) ++ (Connections.debug_watchevents cons con) ++ in ++ Some s + | "mfn" :: domid :: _ -> + let domid = int_of_string domid in + let con = Connections.find_domain cons domid in +@@ -342,7 +357,7 @@ let reply_ack fct con t doms cons data = + fct con t doms cons data; + Packet.Ack (fun () -> + if Transaction.get_id t = Transaction.none then +- process_watch t cons ++ process_watch con t cons + ) + + let reply_data fct con t doms cons data = +@@ -501,7 +516,7 @@ let do_watch con t _domains cons data = + Packet.Ack (fun () -> + (* xenstore.txt says this watch is fired immediately, + implying even if path doesn't exist or is unreadable *) +- Connection.fire_single_watch_unchecked watch) ++ Connection.fire_single_watch_unchecked con watch) + + let do_unwatch con _t _domains cons data = + let (node, token) = +@@ -532,7 +547,7 @@ let do_transaction_end con t domains cons data = + if not success then + raise Transaction_again; + if commit then begin +- process_watch t cons; ++ process_watch con t cons; + match t.Transaction.ty with + | Transaction.No -> + () (* no need to record anything *) +@@ -700,7 +715,8 @@ let process_packet ~store ~cons ~doms ~con ~req = + let do_input store cons doms con = + let newpacket = + try +- Connection.do_input con ++ if Connection.can_input con then Connection.do_input con ++ else None + with Xenbus.Xb.Reconnect -> + info "%s requests a reconnect" 
(Connection.get_domstr con); + History.reconnect con; +@@ -728,6 +744,7 @@ let do_input store cons doms con = + Connection.incr_ops con + + let do_output _store _cons _doms con = ++ Connection.source_flush_watchevents con; + if Connection.has_output con then ( + if Connection.has_new_output con then ( + let packet = Connection.peek_output con in +diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml +index 3b57ad016dfb..c799e20f1145 100644 +--- a/tools/ocaml/xenstored/xenstored.ml ++++ b/tools/ocaml/xenstored/xenstored.ml +@@ -103,6 +103,8 @@ let parse_config filename = + ("quota-maxentity", Config.Set_int Quota.maxent); + ("quota-maxsize", Config.Set_int Quota.maxsize); + ("quota-maxrequests", Config.Set_int Define.maxrequests); ++ ("quota-maxoutstanding", Config.Set_int Define.maxoutstanding); ++ ("quota-maxwatchevents", Config.Set_int Define.maxwatchevents); + ("quota-path-max", Config.Set_int Define.path_max); + ("gc-max-overhead", Config.Set_int Define.gc_max_overhead); + ("test-eagain", Config.Set_bool Transaction.test_eagain); +-- +2.37.4 + diff --git a/0108-SUPPORT.md-clarify-support-of-untrusted-driver-domai.patch b/0108-SUPPORT.md-clarify-support-of-untrusted-driver-domai.patch new file mode 100644 index 0000000..82773df --- /dev/null +++ b/0108-SUPPORT.md-clarify-support-of-untrusted-driver-domai.patch @@ -0,0 +1,55 @@ +From 26faa6b55881445c25e7e83613c2354090fdff18 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Thu, 29 Sep 2022 13:07:35 +0200 +Subject: [PATCH 108/126] SUPPORT.md: clarify support of untrusted driver + domains with oxenstored + +Add a support statement for the scope of support regarding different +Xenstore variants. Especially oxenstored does not (yet) have security +support of untrusted driver domains, as those might drive oxenstored +out of memory by creating lots of watch events for the guests they are +servicing. + +Add a statement regarding Live Update support of oxenstored. 
+ +This is part of XSA-326. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: George Dunlap <george.dunlap@citrix.com> +Acked-by: Julien Grall <jgrall@amazon.com> +Reviewed-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit c7bc20d8d123851a468402bbfc9e3330efff21ec) +--- + SUPPORT.md | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +diff --git a/SUPPORT.md b/SUPPORT.md +index 0fb262f81f40..48fb462221cf 100644 +--- a/SUPPORT.md ++++ b/SUPPORT.md +@@ -179,13 +179,18 @@ Support for running qemu-xen device model in a linux stubdomain. + + Status: Tech Preview + +-## Liveupdate of C xenstored daemon ++## Xenstore + +- Status: Tech Preview ++### C xenstored daemon + +-## Liveupdate of OCaml xenstored daemon ++ Status: Supported ++ Status, Liveupdate: Tech Preview + +- Status: Tech Preview ++### OCaml xenstored daemon ++ ++ Status: Supported ++ Status, untrusted driver domains: Supported, not security supported ++ Status, Liveupdate: Not functional + + ## Toolstack/3rd party + +-- +2.37.4 + diff --git a/0109-tools-xenstore-don-t-use-conn-in-as-context-for-temp.patch b/0109-tools-xenstore-don-t-use-conn-in-as-context-for-temp.patch new file mode 100644 index 0000000..c9a2e6e --- /dev/null +++ b/0109-tools-xenstore-don-t-use-conn-in-as-context-for-temp.patch @@ -0,0 +1,716 @@ +From 607e186fe094f8d1c78572cd3b1f7a43730203c1 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:10 +0200 +Subject: [PATCH 109/126] tools/xenstore: don't use conn->in as context for + temporary allocations + +Using the struct buffered data pointer of the current processed request +for temporary data allocations has a major drawback: the used area (and +with that the temporary data) is freed only after the response of the +request has been written to the ring page or has been read via the +socket. This can happen much later in case a guest isn't reading its +responses fast enough. 
+ +As the temporary data can be safely freed after creating the response, +add a temporary context for that purpose and use that for allocating +the temporary memory, as it was already the case before commit +cc0612464896 ("xenstore: add small default data buffer to internal +struct"). + +Some sub-functions need to gain the "const" attribute for the talloc +context. + +This is XSA-416 / CVE-2022-42319. + +Fixes: cc0612464896 ("xenstore: add small default data buffer to internal struct") +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 2a587de219cc0765330fbf9fac6827bfaf29e29b) +--- + tools/xenstore/xenstored_control.c | 31 ++++++----- + tools/xenstore/xenstored_control.h | 3 +- + tools/xenstore/xenstored_core.c | 76 ++++++++++++++++---------- + tools/xenstore/xenstored_domain.c | 29 ++++++---- + tools/xenstore/xenstored_domain.h | 21 ++++--- + tools/xenstore/xenstored_transaction.c | 14 +++-- + tools/xenstore/xenstored_transaction.h | 6 +- + tools/xenstore/xenstored_watch.c | 9 +-- + tools/xenstore/xenstored_watch.h | 6 +- + 9 files changed, 118 insertions(+), 77 deletions(-) + +diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c +index 980279fa53ff..95a60bf57858 100644 +--- a/tools/xenstore/xenstored_control.c ++++ b/tools/xenstore/xenstored_control.c +@@ -107,7 +107,7 @@ static const char *lu_begin(struct connection *conn) + + struct cmd_s { + char *cmd; +- int (*func)(void *, struct connection *, char **, int); ++ int (*func)(const void *, struct connection *, char **, int); + char *pars; + /* + * max_pars can be used to limit the size of the parameter vector, +@@ -119,7 +119,7 @@ struct cmd_s { + unsigned int max_pars; + }; + +-static int do_control_check(void *ctx, struct connection *conn, ++static int do_control_check(const void *ctx, struct connection *conn, + char **vec, int num) + { + if (num) +@@ -131,7 +131,7 @@ static int do_control_check(void *ctx, 
struct connection *conn, + return 0; + } + +-static int do_control_log(void *ctx, struct connection *conn, ++static int do_control_log(const void *ctx, struct connection *conn, + char **vec, int num) + { + if (num != 1) +@@ -233,7 +233,7 @@ static int quota_get(const void *ctx, struct connection *conn, + return domain_get_quota(ctx, conn, atoi(vec[0])); + } + +-static int do_control_quota(void *ctx, struct connection *conn, ++static int do_control_quota(const void *ctx, struct connection *conn, + char **vec, int num) + { + if (num == 0) +@@ -245,7 +245,7 @@ static int do_control_quota(void *ctx, struct connection *conn, + return quota_get(ctx, conn, vec, num); + } + +-static int do_control_quota_s(void *ctx, struct connection *conn, ++static int do_control_quota_s(const void *ctx, struct connection *conn, + char **vec, int num) + { + if (num == 0) +@@ -258,7 +258,7 @@ static int do_control_quota_s(void *ctx, struct connection *conn, + } + + #ifdef __MINIOS__ +-static int do_control_memreport(void *ctx, struct connection *conn, ++static int do_control_memreport(const void *ctx, struct connection *conn, + char **vec, int num) + { + if (num) +@@ -270,7 +270,7 @@ static int do_control_memreport(void *ctx, struct connection *conn, + return 0; + } + #else +-static int do_control_logfile(void *ctx, struct connection *conn, ++static int do_control_logfile(const void *ctx, struct connection *conn, + char **vec, int num) + { + if (num != 1) +@@ -285,7 +285,7 @@ static int do_control_logfile(void *ctx, struct connection *conn, + return 0; + } + +-static int do_control_memreport(void *ctx, struct connection *conn, ++static int do_control_memreport(const void *ctx, struct connection *conn, + char **vec, int num) + { + FILE *fp; +@@ -325,7 +325,7 @@ static int do_control_memreport(void *ctx, struct connection *conn, + } + #endif + +-static int do_control_print(void *ctx, struct connection *conn, ++static int do_control_print(const void *ctx, struct connection *conn, + char 
**vec, int num) + { + if (num != 1) +@@ -802,7 +802,7 @@ static const char *lu_start(const void *ctx, struct connection *conn, + return NULL; + } + +-static int do_control_lu(void *ctx, struct connection *conn, ++static int do_control_lu(const void *ctx, struct connection *conn, + char **vec, int num) + { + const char *ret = NULL; +@@ -852,7 +852,7 @@ static int do_control_lu(void *ctx, struct connection *conn, + } + #endif + +-static int do_control_help(void *, struct connection *, char **, int); ++static int do_control_help(const void *, struct connection *, char **, int); + + static struct cmd_s cmds[] = { + { "check", do_control_check, "" }, +@@ -891,7 +891,7 @@ static struct cmd_s cmds[] = { + { "help", do_control_help, "" }, + }; + +-static int do_control_help(void *ctx, struct connection *conn, ++static int do_control_help(const void *ctx, struct connection *conn, + char **vec, int num) + { + int cmd, len = 0; +@@ -927,7 +927,8 @@ static int do_control_help(void *ctx, struct connection *conn, + return 0; + } + +-int do_control(struct connection *conn, struct buffered_data *in) ++int do_control(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + unsigned int cmd, num, off; + char **vec = NULL; +@@ -947,11 +948,11 @@ int do_control(struct connection *conn, struct buffered_data *in) + num = xs_count_strings(in->buffer, in->used); + if (cmds[cmd].max_pars) + num = min(num, cmds[cmd].max_pars); +- vec = talloc_array(in, char *, num); ++ vec = talloc_array(ctx, char *, num); + if (!vec) + return ENOMEM; + if (get_strings(in, vec, num) < num) + return EIO; + +- return cmds[cmd].func(in, conn, vec + 1, num - 1); ++ return cmds[cmd].func(ctx, conn, vec + 1, num - 1); + } +diff --git a/tools/xenstore/xenstored_control.h b/tools/xenstore/xenstored_control.h +index aac61f05908f..6430c3769361 100644 +--- a/tools/xenstore/xenstored_control.h ++++ b/tools/xenstore/xenstored_control.h +@@ -16,5 +16,6 @@ + along with this program; If not, see 
<http://www.gnu.org/licenses/>. + */ + +-int do_control(struct connection *conn, struct buffered_data *in); ++int do_control(const void *ctx, struct connection *conn, ++ struct buffered_data *in); + void lu_read_state(void); +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index f27d5c0101bc..806f24bbab8b 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -1214,11 +1214,13 @@ static struct node *get_node_canonicalized(struct connection *conn, + return get_node(conn, ctx, *canonical_name, perm); + } + +-static int send_directory(struct connection *conn, struct buffered_data *in) ++static int send_directory(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + struct node *node; + +- node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ); ++ node = get_node_canonicalized(conn, ctx, onearg(in), NULL, ++ XS_PERM_READ); + if (!node) + return errno; + +@@ -1227,7 +1229,7 @@ static int send_directory(struct connection *conn, struct buffered_data *in) + return 0; + } + +-static int send_directory_part(struct connection *conn, ++static int send_directory_part(const void *ctx, struct connection *conn, + struct buffered_data *in) + { + unsigned int off, len, maxlen, genlen; +@@ -1239,7 +1241,8 @@ static int send_directory_part(struct connection *conn, + return EINVAL; + + /* First arg is node name. 
*/ +- node = get_node_canonicalized(conn, in, in->buffer, NULL, XS_PERM_READ); ++ node = get_node_canonicalized(conn, ctx, in->buffer, NULL, ++ XS_PERM_READ); + if (!node) + return errno; + +@@ -1266,7 +1269,7 @@ static int send_directory_part(struct connection *conn, + break; + } + +- data = talloc_array(in, char, genlen + len + 1); ++ data = talloc_array(ctx, char, genlen + len + 1); + if (!data) + return ENOMEM; + +@@ -1282,11 +1285,13 @@ static int send_directory_part(struct connection *conn, + return 0; + } + +-static int do_read(struct connection *conn, struct buffered_data *in) ++static int do_read(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + struct node *node; + +- node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ); ++ node = get_node_canonicalized(conn, ctx, onearg(in), NULL, ++ XS_PERM_READ); + if (!node) + return errno; + +@@ -1476,7 +1481,8 @@ err: + } + + /* path, data... */ +-static int do_write(struct connection *conn, struct buffered_data *in) ++static int do_write(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + unsigned int offset, datalen; + struct node *node; +@@ -1490,12 +1496,12 @@ static int do_write(struct connection *conn, struct buffered_data *in) + offset = strlen(vec[0]) + 1; + datalen = in->used - offset; + +- node = get_node_canonicalized(conn, in, vec[0], &name, XS_PERM_WRITE); ++ node = get_node_canonicalized(conn, ctx, vec[0], &name, XS_PERM_WRITE); + if (!node) { + /* No permissions, invalid input? 
*/ + if (errno != ENOENT) + return errno; +- node = create_node(conn, in, name, in->buffer + offset, ++ node = create_node(conn, ctx, name, in->buffer + offset, + datalen); + if (!node) + return errno; +@@ -1506,18 +1512,19 @@ static int do_write(struct connection *conn, struct buffered_data *in) + return errno; + } + +- fire_watches(conn, in, name, node, false, NULL); ++ fire_watches(conn, ctx, name, node, false, NULL); + send_ack(conn, XS_WRITE); + + return 0; + } + +-static int do_mkdir(struct connection *conn, struct buffered_data *in) ++static int do_mkdir(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + struct node *node; + char *name; + +- node = get_node_canonicalized(conn, in, onearg(in), &name, ++ node = get_node_canonicalized(conn, ctx, onearg(in), &name, + XS_PERM_WRITE); + + /* If it already exists, fine. */ +@@ -1527,10 +1534,10 @@ static int do_mkdir(struct connection *conn, struct buffered_data *in) + return errno; + if (!name) + return ENOMEM; +- node = create_node(conn, in, name, NULL, 0); ++ node = create_node(conn, ctx, name, NULL, 0); + if (!node) + return errno; +- fire_watches(conn, in, name, node, false, NULL); ++ fire_watches(conn, ctx, name, node, false, NULL); + } + send_ack(conn, XS_MKDIR); + +@@ -1628,24 +1635,25 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, + } + + +-static int do_rm(struct connection *conn, struct buffered_data *in) ++static int do_rm(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + struct node *node; + int ret; + char *name; + char *parentname; + +- node = get_node_canonicalized(conn, in, onearg(in), &name, ++ node = get_node_canonicalized(conn, ctx, onearg(in), &name, + XS_PERM_WRITE); + if (!node) { + /* Didn't exist already? Fine, if parent exists. 
*/ + if (errno == ENOENT) { + if (!name) + return ENOMEM; +- parentname = get_parent(in, name); ++ parentname = get_parent(ctx, name); + if (!parentname) + return errno; +- node = read_node(conn, in, parentname); ++ node = read_node(conn, ctx, parentname); + if (node) { + send_ack(conn, XS_RM); + return 0; +@@ -1660,7 +1668,7 @@ static int do_rm(struct connection *conn, struct buffered_data *in) + if (streq(name, "/")) + return EINVAL; + +- ret = _rm(conn, in, node, name); ++ ret = _rm(conn, ctx, node, name); + if (ret) + return ret; + +@@ -1670,13 +1678,15 @@ static int do_rm(struct connection *conn, struct buffered_data *in) + } + + +-static int do_get_perms(struct connection *conn, struct buffered_data *in) ++static int do_get_perms(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + struct node *node; + char *strings; + unsigned int len; + +- node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ); ++ node = get_node_canonicalized(conn, ctx, onearg(in), NULL, ++ XS_PERM_READ); + if (!node) + return errno; + +@@ -1689,7 +1699,8 @@ static int do_get_perms(struct connection *conn, struct buffered_data *in) + return 0; + } + +-static int do_set_perms(struct connection *conn, struct buffered_data *in) ++static int do_set_perms(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + struct node_perms perms, old_perms; + char *name, *permstr; +@@ -1706,7 +1717,7 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) + + permstr = in->buffer + strlen(in->buffer) + 1; + +- perms.p = talloc_array(in, struct xs_permissions, perms.num); ++ perms.p = talloc_array(ctx, struct xs_permissions, perms.num); + if (!perms.p) + return ENOMEM; + if (!xs_strings_to_perms(perms.p, perms.num, permstr)) +@@ -1721,7 +1732,7 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) + } + + /* We must own node to do this (tools can do this too). 
*/ +- node = get_node_canonicalized(conn, in, in->buffer, &name, ++ node = get_node_canonicalized(conn, ctx, in->buffer, &name, + XS_PERM_WRITE | XS_PERM_OWNER); + if (!node) + return errno; +@@ -1756,7 +1767,7 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) + return errno; + } + +- fire_watches(conn, in, name, node, false, &old_perms); ++ fire_watches(conn, ctx, name, node, false, &old_perms); + send_ack(conn, XS_SET_PERMS); + + return 0; +@@ -1764,7 +1775,8 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) + + static struct { + const char *str; +- int (*func)(struct connection *conn, struct buffered_data *in); ++ int (*func)(const void *ctx, struct connection *conn, ++ struct buffered_data *in); + unsigned int flags; + #define XS_FLAG_NOTID (1U << 0) /* Ignore transaction id. */ + #define XS_FLAG_PRIV (1U << 1) /* Privileged domain only. */ +@@ -1840,6 +1852,7 @@ static void process_message(struct connection *conn, struct buffered_data *in) + struct transaction *trans; + enum xsd_sockmsg_type type = in->hdr.msg.type; + int ret; ++ void *ctx; + + if ((unsigned int)type >= XS_TYPE_COUNT || !wire_funcs[type].func) { + eprintf("Client unknown operation %i", type); +@@ -1860,10 +1873,17 @@ static void process_message(struct connection *conn, struct buffered_data *in) + return; + } + ++ ctx = talloc_new(NULL); ++ if (!ctx) { ++ send_error(conn, ENOMEM); ++ return; ++ } ++ + assert(conn->transaction == NULL); + conn->transaction = trans; + +- ret = wire_funcs[type].func(conn, in); ++ ret = wire_funcs[type].func(ctx, conn, in); ++ talloc_free(ctx); + if (ret) + send_error(conn, ret); + +diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c +index 3d5142581332..d262f4e9dbdf 100644 +--- a/tools/xenstore/xenstored_domain.c ++++ b/tools/xenstore/xenstored_domain.c +@@ -336,7 +336,7 @@ bool domain_can_write(struct connection *conn) + return ((intf->rsp_prod - intf->rsp_cons) != 
XENSTORE_RING_SIZE); + } + +-static char *talloc_domain_path(void *context, unsigned int domid) ++static char *talloc_domain_path(const void *context, unsigned int domid) + { + return talloc_asprintf(context, "/local/domain/%u", domid); + } +@@ -540,7 +540,8 @@ static struct domain *introduce_domain(const void *ctx, + } + + /* domid, gfn, evtchn, path */ +-int do_introduce(struct connection *conn, struct buffered_data *in) ++int do_introduce(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + struct domain *domain; + char *vec[3]; +@@ -558,7 +559,7 @@ int do_introduce(struct connection *conn, struct buffered_data *in) + if (port <= 0) + return EINVAL; + +- domain = introduce_domain(in, domid, port, false); ++ domain = introduce_domain(ctx, domid, port, false); + if (!domain) + return errno; + +@@ -581,7 +582,8 @@ static struct domain *find_connected_domain(unsigned int domid) + return domain; + } + +-int do_set_target(struct connection *conn, struct buffered_data *in) ++int do_set_target(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + char *vec[2]; + unsigned int domid, tdomid; +@@ -625,7 +627,8 @@ static struct domain *onearg_domain(struct connection *conn, + } + + /* domid */ +-int do_release(struct connection *conn, struct buffered_data *in) ++int do_release(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + struct domain *domain; + +@@ -640,7 +643,8 @@ int do_release(struct connection *conn, struct buffered_data *in) + return 0; + } + +-int do_resume(struct connection *conn, struct buffered_data *in) ++int do_resume(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + struct domain *domain; + +@@ -655,7 +659,8 @@ int do_resume(struct connection *conn, struct buffered_data *in) + return 0; + } + +-int do_get_domain_path(struct connection *conn, struct buffered_data *in) ++int do_get_domain_path(const void *ctx, struct connection *conn, ++ struct buffered_data 
*in) + { + char *path; + const char *domid_str = onearg(in); +@@ -663,18 +668,17 @@ int do_get_domain_path(struct connection *conn, struct buffered_data *in) + if (!domid_str) + return EINVAL; + +- path = talloc_domain_path(conn, atoi(domid_str)); ++ path = talloc_domain_path(ctx, atoi(domid_str)); + if (!path) + return errno; + + send_reply(conn, XS_GET_DOMAIN_PATH, path, strlen(path) + 1); + +- talloc_free(path); +- + return 0; + } + +-int do_is_domain_introduced(struct connection *conn, struct buffered_data *in) ++int do_is_domain_introduced(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + int result; + unsigned int domid; +@@ -695,7 +699,8 @@ int do_is_domain_introduced(struct connection *conn, struct buffered_data *in) + } + + /* Allow guest to reset all watches */ +-int do_reset_watches(struct connection *conn, struct buffered_data *in) ++int do_reset_watches(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + conn_delete_all_watches(conn); + conn_delete_all_transactions(conn); +diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h +index 0f883936f413..da513443cd46 100644 +--- a/tools/xenstore/xenstored_domain.h ++++ b/tools/xenstore/xenstored_domain.h +@@ -24,25 +24,32 @@ void handle_event(void); + void check_domains(bool restore); + + /* domid, mfn, eventchn, path */ +-int do_introduce(struct connection *conn, struct buffered_data *in); ++int do_introduce(const void *ctx, struct connection *conn, ++ struct buffered_data *in); + + /* domid */ +-int do_is_domain_introduced(struct connection *conn, struct buffered_data *in); ++int do_is_domain_introduced(const void *ctx, struct connection *conn, ++ struct buffered_data *in); + + /* domid */ +-int do_release(struct connection *conn, struct buffered_data *in); ++int do_release(const void *ctx, struct connection *conn, ++ struct buffered_data *in); + + /* domid */ +-int do_resume(struct connection *conn, struct buffered_data *in); 
++int do_resume(const void *ctx, struct connection *conn, ++ struct buffered_data *in); + + /* domid, target */ +-int do_set_target(struct connection *conn, struct buffered_data *in); ++int do_set_target(const void *ctx, struct connection *conn, ++ struct buffered_data *in); + + /* domid */ +-int do_get_domain_path(struct connection *conn, struct buffered_data *in); ++int do_get_domain_path(const void *ctx, struct connection *conn, ++ struct buffered_data *in); + + /* Allow guest to reset all watches */ +-int do_reset_watches(struct connection *conn, struct buffered_data *in); ++int do_reset_watches(const void *ctx, struct connection *conn, ++ struct buffered_data *in); + + void domain_init(int evtfd); + void dom0_init(void); +diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c +index 28774813de83..3e3eb47326cc 100644 +--- a/tools/xenstore/xenstored_transaction.c ++++ b/tools/xenstore/xenstored_transaction.c +@@ -481,7 +481,8 @@ struct transaction *transaction_lookup(struct connection *conn, uint32_t id) + return ERR_PTR(-ENOENT); + } + +-int do_transaction_start(struct connection *conn, struct buffered_data *in) ++int do_transaction_start(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + struct transaction *trans, *exists; + char id_str[20]; +@@ -494,8 +495,8 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in) + conn->transaction_started > quota_max_transaction) + return ENOSPC; + +- /* Attach transaction to input for autofree until it's complete */ +- trans = talloc_zero(in, struct transaction); ++ /* Attach transaction to ctx for autofree until it's complete */ ++ trans = talloc_zero(ctx, struct transaction); + if (!trans) + return ENOMEM; + +@@ -544,7 +545,8 @@ static int transaction_fix_domains(struct transaction *trans, bool update) + return 0; + } + +-int do_transaction_end(struct connection *conn, struct buffered_data *in) ++int do_transaction_end(const void *ctx, 
struct connection *conn, ++ struct buffered_data *in) + { + const char *arg = onearg(in); + struct transaction *trans; +@@ -562,8 +564,8 @@ int do_transaction_end(struct connection *conn, struct buffered_data *in) + if (!conn->transaction_started) + conn->ta_start_time = 0; + +- /* Attach transaction to in for auto-cleanup */ +- talloc_steal(in, trans); ++ /* Attach transaction to ctx for auto-cleanup */ ++ talloc_steal(ctx, trans); + + if (streq(arg, "T")) { + if (trans->fail) +diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h +index e3cbd6b23095..39d7f81c5127 100644 +--- a/tools/xenstore/xenstored_transaction.h ++++ b/tools/xenstore/xenstored_transaction.h +@@ -29,8 +29,10 @@ struct transaction; + + extern uint64_t generation; + +-int do_transaction_start(struct connection *conn, struct buffered_data *node); +-int do_transaction_end(struct connection *conn, struct buffered_data *in); ++int do_transaction_start(const void *ctx, struct connection *conn, ++ struct buffered_data *node); ++int do_transaction_end(const void *ctx, struct connection *conn, ++ struct buffered_data *in); + + struct transaction *transaction_lookup(struct connection *conn, uint32_t id); + +diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c +index 4970e9f1a1b9..854bbcad6e45 100644 +--- a/tools/xenstore/xenstored_watch.c ++++ b/tools/xenstore/xenstored_watch.c +@@ -243,7 +243,7 @@ static struct watch *add_watch(struct connection *conn, char *path, char *token, + return NULL; + } + +-int do_watch(struct connection *conn, struct buffered_data *in) ++int do_watch(const void *ctx, struct connection *conn, struct buffered_data *in) + { + struct watch *watch; + char *vec[2]; +@@ -252,7 +252,7 @@ int do_watch(struct connection *conn, struct buffered_data *in) + if (get_strings(in, vec, ARRAY_SIZE(vec)) != ARRAY_SIZE(vec)) + return EINVAL; + +- errno = check_watch_path(conn, in, &(vec[0]), &relative); ++ errno = 
check_watch_path(conn, ctx, &(vec[0]), &relative); + if (errno) + return errno; + +@@ -283,7 +283,8 @@ int do_watch(struct connection *conn, struct buffered_data *in) + return 0; + } + +-int do_unwatch(struct connection *conn, struct buffered_data *in) ++int do_unwatch(const void *ctx, struct connection *conn, ++ struct buffered_data *in) + { + struct watch *watch; + char *node, *vec[2]; +@@ -291,7 +292,7 @@ int do_unwatch(struct connection *conn, struct buffered_data *in) + if (get_strings(in, vec, ARRAY_SIZE(vec)) != ARRAY_SIZE(vec)) + return EINVAL; + +- node = canonicalize(conn, in, vec[0]); ++ node = canonicalize(conn, ctx, vec[0]); + if (!node) + return ENOMEM; + list_for_each_entry(watch, &conn->watches, list) { +diff --git a/tools/xenstore/xenstored_watch.h b/tools/xenstore/xenstored_watch.h +index 0e693f0839cd..091890edca96 100644 +--- a/tools/xenstore/xenstored_watch.h ++++ b/tools/xenstore/xenstored_watch.h +@@ -21,8 +21,10 @@ + + #include "xenstored_core.h" + +-int do_watch(struct connection *conn, struct buffered_data *in); +-int do_unwatch(struct connection *conn, struct buffered_data *in); ++int do_watch(const void *ctx, struct connection *conn, ++ struct buffered_data *in); ++int do_unwatch(const void *ctx, struct connection *conn, ++ struct buffered_data *in); + + /* Fire all watches: !exact means all the children are affected (ie. rm). 
*/ + void fire_watches(struct connection *conn, const void *tmp, const char *name, +-- +2.37.4 + diff --git a/0110-tools-xenstore-fix-checking-node-permissions.patch b/0110-tools-xenstore-fix-checking-node-permissions.patch new file mode 100644 index 0000000..77345f7 --- /dev/null +++ b/0110-tools-xenstore-fix-checking-node-permissions.patch @@ -0,0 +1,143 @@ +From 8012324cb9e676bd342a5adfda1700525f195e2e Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:10 +0200 +Subject: [PATCH 110/126] tools/xenstore: fix checking node permissions + +Today chk_domain_generation() is being used to check whether a node +permission entry is still valid or whether it is referring to a domain +no longer existing. This is done by comparing the node's and the +domain's generation count. + +In case no struct domain is existing for a checked domain, but the +domain itself is valid, chk_domain_generation() assumes it is being +called due to the first node created for a new domain and it will +return success. + +This might be wrong in case the checked permission is related to an +old domain, which has just been replaced with a new domain using the +same domid. + +Fix that by letting chk_domain_generation() fail in case a struct +domain isn't found. In order to cover the case of the first node for +a new domain try to allocate the needed struct domain explicitly when +processing the related SET_PERMS command. In case a referenced domain +isn't existing, flag the related permission to be ignored right away. + +This is XSA-417 / CVE-2022-42320. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit ab128218225d3542596ca3a02aee80d55494bef8) +--- + tools/xenstore/xenstored_core.c | 5 +++++ + tools/xenstore/xenstored_domain.c | 37 +++++++++++++++++++++---------- + tools/xenstore/xenstored_domain.h | 1 + + 3 files changed, 31 insertions(+), 12 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 806f24bbab8b..8aecd425f274 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -1723,6 +1723,11 @@ static int do_set_perms(const void *ctx, struct connection *conn, + if (!xs_strings_to_perms(perms.p, perms.num, permstr)) + return errno; + ++ if (domain_alloc_permrefs(&perms) < 0) ++ return ENOMEM; ++ if (perms.p[0].perms & XS_PERM_IGNORE) ++ return ENOENT; ++ + /* First arg is node name. */ + if (strstarts(in->buffer, "@")) { + if (set_perms_special(conn, in->buffer, &perms)) +diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c +index d262f4e9dbdf..8b503c2dfe07 100644 +--- a/tools/xenstore/xenstored_domain.c ++++ b/tools/xenstore/xenstored_domain.c +@@ -881,7 +881,6 @@ int domain_entry_inc(struct connection *conn, struct node *node) + * count (used for testing whether a node permission is older than a domain). + * + * Return values: +- * -1: error + * 0: domain has higher generation count (it is younger than a node with the + * given count), or domain isn't existing any longer + * 1: domain is older than the node +@@ -889,20 +888,38 @@ int domain_entry_inc(struct connection *conn, struct node *node) + static int chk_domain_generation(unsigned int domid, uint64_t gen) + { + struct domain *d; +- xc_dominfo_t dominfo; + + if (!xc_handle && domid == 0) + return 1; + + d = find_domain_struct(domid); +- if (d) +- return (d->generation <= gen) ? 
1 : 0; + +- if (!get_domain_info(domid, &dominfo)) +- return 0; ++ return (d && d->generation <= gen) ? 1 : 0; ++} + +- d = alloc_domain(NULL, domid); +- return d ? 1 : -1; ++/* ++ * Allocate all missing struct domain referenced by a permission set. ++ * Any permission entries for not existing domains will be marked to be ++ * ignored. ++ */ ++int domain_alloc_permrefs(struct node_perms *perms) ++{ ++ unsigned int i, domid; ++ struct domain *d; ++ xc_dominfo_t dominfo; ++ ++ for (i = 0; i < perms->num; i++) { ++ domid = perms->p[i].id; ++ d = find_domain_struct(domid); ++ if (!d) { ++ if (!get_domain_info(domid, &dominfo)) ++ perms->p[i].perms |= XS_PERM_IGNORE; ++ else if (!alloc_domain(NULL, domid)) ++ return ENOMEM; ++ } ++ } ++ ++ return 0; + } + + /* +@@ -915,8 +932,6 @@ int domain_adjust_node_perms(struct connection *conn, struct node *node) + int ret; + + ret = chk_domain_generation(node->perms.p[0].id, node->generation); +- if (ret < 0) +- return errno; + + /* If the owner doesn't exist any longer give it to priv domain. */ + if (!ret) { +@@ -933,8 +948,6 @@ int domain_adjust_node_perms(struct connection *conn, struct node *node) + continue; + ret = chk_domain_generation(node->perms.p[i].id, + node->generation); +- if (ret < 0) +- return errno; + if (!ret) + node->perms.p[i].perms |= XS_PERM_IGNORE; + } +diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h +index da513443cd46..0b4f56b8146c 100644 +--- a/tools/xenstore/xenstored_domain.h ++++ b/tools/xenstore/xenstored_domain.h +@@ -66,6 +66,7 @@ bool domain_is_unprivileged(struct connection *conn); + + /* Remove node permissions for no longer existing domains. 
*/ + int domain_adjust_node_perms(struct connection *conn, struct node *node); ++int domain_alloc_permrefs(struct node_perms *perms); + + /* Quota manipulation */ + int domain_entry_inc(struct connection *conn, struct node *); +-- +2.37.4 + diff --git a/0111-tools-xenstore-remove-recursion-from-construct_node.patch b/0111-tools-xenstore-remove-recursion-from-construct_node.patch new file mode 100644 index 0000000..aa63d32 --- /dev/null +++ b/0111-tools-xenstore-remove-recursion-from-construct_node.patch @@ -0,0 +1,126 @@ +From 62755d0a90344e704062e7b6943a3fa2dc5e02e6 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:11 +0200 +Subject: [PATCH 111/126] tools/xenstore: remove recursion from + construct_node() + +In order to reduce stack usage due to recursion, switch +construct_node() to use a loop instead. + +This is part of XSA-418 / CVE-2022-42321. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit da8ee25d02a5447ba39a9800ee2a710ae1f54222) +--- + tools/xenstore/xenstored_core.c | 86 +++++++++++++++++++++------------ + 1 file changed, 55 insertions(+), 31 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 8aecd425f274..46a37e5257e5 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -1343,45 +1343,69 @@ static int add_child(const void *ctx, struct node *parent, const char *name) + static struct node *construct_node(struct connection *conn, const void *ctx, + const char *name) + { +- struct node *parent, *node; +- char *parentname = get_parent(ctx, name); ++ const char **names = NULL; ++ unsigned int levels = 0; ++ struct node *node = NULL; ++ struct node *parent = NULL; ++ const char *parentname = talloc_strdup(ctx, name); + + if (!parentname) + return NULL; + +- /* If parent doesn't exist, create it. 
*/ +- parent = read_node(conn, parentname, parentname); +- if (!parent && errno == ENOENT) +- parent = construct_node(conn, ctx, parentname); +- if (!parent) +- return NULL; ++ /* Walk the path up until an existing node is found. */ ++ while (!parent) { ++ names = talloc_realloc(ctx, names, const char *, levels + 1); ++ if (!names) ++ goto nomem; + +- /* Add child to parent. */ +- if (add_child(ctx, parent, name)) +- goto nomem; ++ /* ++ * names[0] is the name of the node to construct initially, ++ * names[1] is its parent, and so on. ++ */ ++ names[levels] = parentname; ++ parentname = get_parent(ctx, parentname); ++ if (!parentname) ++ return NULL; + +- /* Allocate node */ +- node = talloc(ctx, struct node); +- if (!node) +- goto nomem; +- node->name = talloc_strdup(node, name); +- if (!node->name) +- goto nomem; ++ /* Try to read parent node until we found an existing one. */ ++ parent = read_node(conn, ctx, parentname); ++ if (!parent && (errno != ENOENT || !strcmp(parentname, "/"))) ++ return NULL; + +- /* Inherit permissions, except unprivileged domains own what they create */ +- node->perms.num = parent->perms.num; +- node->perms.p = talloc_memdup(node, parent->perms.p, +- node->perms.num * sizeof(*node->perms.p)); +- if (!node->perms.p) +- goto nomem; +- if (domain_is_unprivileged(conn)) +- node->perms.p[0].id = conn->id; ++ levels++; ++ } ++ ++ /* Walk the path down again constructing the missing nodes. */ ++ for (; levels > 0; levels--) { ++ /* Add child to parent. */ ++ if (add_child(ctx, parent, names[levels - 1])) ++ goto nomem; ++ ++ /* Allocate node */ ++ node = talloc(ctx, struct node); ++ if (!node) ++ goto nomem; ++ node->name = talloc_steal(node, names[levels - 1]); ++ ++ /* Inherit permissions, unpriv domains own what they create. 
*/ ++ node->perms.num = parent->perms.num; ++ node->perms.p = talloc_memdup(node, parent->perms.p, ++ node->perms.num * ++ sizeof(*node->perms.p)); ++ if (!node->perms.p) ++ goto nomem; ++ if (domain_is_unprivileged(conn)) ++ node->perms.p[0].id = conn->id; ++ ++ /* No children, no data */ ++ node->children = node->data = NULL; ++ node->childlen = node->datalen = 0; ++ node->acc.memory = 0; ++ node->parent = parent; ++ ++ parent = node; ++ } + +- /* No children, no data */ +- node->children = node->data = NULL; +- node->childlen = node->datalen = 0; +- node->acc.memory = 0; +- node->parent = parent; + return node; + + nomem: +-- +2.37.4 + diff --git a/0112-tools-xenstore-don-t-let-remove_child_entry-call-cor.patch b/0112-tools-xenstore-don-t-let-remove_child_entry-call-cor.patch new file mode 100644 index 0000000..8250ff0 --- /dev/null +++ b/0112-tools-xenstore-don-t-let-remove_child_entry-call-cor.patch @@ -0,0 +1,110 @@ +From b9a005b0b4520261c6c362fca55500782837f119 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:11 +0200 +Subject: [PATCH 112/126] tools/xenstore: don't let remove_child_entry() call + corrupt() + +In case of write_node() returning an error, remove_child_entry() will +call corrupt() today. This could result in an endless recursion, as +remove_child_entry() is called by corrupt(), too: + +corrupt() + check_store() + check_store_() + remove_child_entry() + +Fix that by letting remove_child_entry() return an error instead and +let the caller decide what to do. + +This is part of XSA-418 / CVE-2022-42321. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 0c00c51f3bc8206c7f9cf87d014650157bee2bf4) +--- + tools/xenstore/xenstored_core.c | 36 ++++++++++++++++++--------------- + 1 file changed, 20 insertions(+), 16 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 46a37e5257e5..4c3897721bdd 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -1574,15 +1574,15 @@ static void memdel(void *mem, unsigned off, unsigned len, unsigned total) + memmove(mem + off, mem + off + len, total - off - len); + } + +-static void remove_child_entry(struct connection *conn, struct node *node, +- size_t offset) ++static int remove_child_entry(struct connection *conn, struct node *node, ++ size_t offset) + { + size_t childlen = strlen(node->children + offset); + + memdel(node->children, offset, childlen + 1, node->childlen); + node->childlen -= childlen + 1; +- if (write_node(conn, node, true)) +- corrupt(conn, "Can't update parent node '%s'", node->name); ++ ++ return write_node(conn, node, true); + } + + static void delete_child(struct connection *conn, +@@ -1592,7 +1592,9 @@ static void delete_child(struct connection *conn, + + for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) { + if (streq(node->children+i, childname)) { +- remove_child_entry(conn, node, i); ++ if (remove_child_entry(conn, node, i)) ++ corrupt(conn, "Can't update parent node '%s'", ++ node->name); + return; + } + } +@@ -2226,6 +2228,17 @@ int remember_string(struct hashtable *hash, const char *str) + return hashtable_insert(hash, k, (void *)1); + } + ++static int rm_child_entry(struct node *node, size_t off, size_t len) ++{ ++ if (!recovery) ++ return off; ++ ++ if (remove_child_entry(NULL, node, off)) ++ log("check_store: child entry could not be removed from '%s'", ++ node->name); ++ ++ return off - len - 1; ++} + + /** + * A node has a 
children field that names the children of the node, separated +@@ -2278,12 +2291,7 @@ static int check_store_(const char *name, struct hashtable *reachable) + if (hashtable_search(children, childname)) { + log("check_store: '%s' is duplicated!", + childname); +- +- if (recovery) { +- remove_child_entry(NULL, node, +- i); +- i -= childlen + 1; +- } ++ i = rm_child_entry(node, i, childlen); + } + else { + if (!remember_string(children, +@@ -2300,11 +2308,7 @@ static int check_store_(const char *name, struct hashtable *reachable) + } else if (errno != ENOMEM) { + log("check_store: No child '%s' found!\n", + childname); +- +- if (recovery) { +- remove_child_entry(NULL, node, i); +- i -= childlen + 1; +- } ++ i = rm_child_entry(node, i, childlen); + } else { + log("check_store: ENOMEM"); + ret = ENOMEM; +-- +2.37.4 + diff --git a/0113-tools-xenstore-add-generic-treewalk-function.patch b/0113-tools-xenstore-add-generic-treewalk-function.patch new file mode 100644 index 0000000..b80c574 --- /dev/null +++ b/0113-tools-xenstore-add-generic-treewalk-function.patch @@ -0,0 +1,250 @@ +From 83b6c511a5989a83c50daae83c5b5a683d6dc096 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:11 +0200 +Subject: [PATCH 113/126] tools/xenstore: add generic treewalk function + +Add a generic function to walk the complete node tree. It will start +at "/" and descend recursively into each child, calling a function +specified by the caller. Depending on the return value of the user +specified function the walk will be aborted, continued, or the current +child will be skipped by not descending into its children. + +This is part of XSA-418 / CVE-2022-42321. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 0d7c5d19bc27492360196e7dad2b227908564fff) +--- + tools/xenstore/xenstored_core.c | 143 +++++++++++++++++++++++++++++--- + tools/xenstore/xenstored_core.h | 40 +++++++++ + 2 files changed, 170 insertions(+), 13 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 4c3897721bdd..7463d0a002d7 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -1804,6 +1804,135 @@ static int do_set_perms(const void *ctx, struct connection *conn, + return 0; + } + ++static char *child_name(const void *ctx, const char *s1, const char *s2) ++{ ++ if (strcmp(s1, "/")) ++ return talloc_asprintf(ctx, "%s/%s", s1, s2); ++ return talloc_asprintf(ctx, "/%s", s2); ++} ++ ++static int rm_from_parent(struct connection *conn, struct node *parent, ++ const char *name) ++{ ++ size_t off; ++ ++ if (!parent) ++ return WALK_TREE_ERROR_STOP; ++ ++ for (off = parent->childoff - 1; off && parent->children[off - 1]; ++ off--); ++ if (remove_child_entry(conn, parent, off)) { ++ log("treewalk: child entry could not be removed from '%s'", ++ parent->name); ++ return WALK_TREE_ERROR_STOP; ++ } ++ parent->childoff = off; ++ ++ return WALK_TREE_OK; ++} ++ ++static int walk_call_func(const void *ctx, struct connection *conn, ++ struct node *node, struct node *parent, void *arg, ++ int (*func)(const void *ctx, struct connection *conn, ++ struct node *node, void *arg)) ++{ ++ int ret; ++ ++ if (!func) ++ return WALK_TREE_OK; ++ ++ ret = func(ctx, conn, node, arg); ++ if (ret == WALK_TREE_RM_CHILDENTRY && parent) ++ ret = rm_from_parent(conn, parent, node->name); ++ ++ return ret; ++} ++ ++int walk_node_tree(const void *ctx, struct connection *conn, const char *root, ++ struct walk_funcs *funcs, void *arg) ++{ ++ int ret = 0; ++ void *tmpctx; ++ char *name; ++ struct node *node = NULL; ++ struct node *parent = 
NULL; ++ ++ tmpctx = talloc_new(ctx); ++ if (!tmpctx) { ++ errno = ENOMEM; ++ return WALK_TREE_ERROR_STOP; ++ } ++ name = talloc_strdup(tmpctx, root); ++ if (!name) { ++ errno = ENOMEM; ++ talloc_free(tmpctx); ++ return WALK_TREE_ERROR_STOP; ++ } ++ ++ /* Continue the walk until an error is returned. */ ++ while (ret >= 0) { ++ /* node == NULL possible only for the initial loop iteration. */ ++ if (node) { ++ /* Go one step up if ret or if last child finished. */ ++ if (ret || node->childoff >= node->childlen) { ++ parent = node->parent; ++ /* Call function AFTER processing a node. */ ++ ret = walk_call_func(ctx, conn, node, parent, ++ arg, funcs->exit); ++ /* Last node, so exit loop. */ ++ if (!parent) ++ break; ++ talloc_free(node); ++ /* Continue with parent. */ ++ node = parent; ++ continue; ++ } ++ /* Get next child of current node. */ ++ name = child_name(tmpctx, node->name, ++ node->children + node->childoff); ++ if (!name) { ++ ret = WALK_TREE_ERROR_STOP; ++ break; ++ } ++ /* Point to next child. */ ++ node->childoff += strlen(node->children + ++ node->childoff) + 1; ++ /* Descent into children. */ ++ parent = node; ++ } ++ /* Read next node (root node or next child). */ ++ node = read_node(conn, tmpctx, name); ++ if (!node) { ++ /* Child not found - should not happen! */ ++ /* ENOENT case can be handled by supplied function. */ ++ if (errno == ENOENT && funcs->enoent) ++ ret = funcs->enoent(ctx, conn, parent, name, ++ arg); ++ else ++ ret = WALK_TREE_ERROR_STOP; ++ if (!parent) ++ break; ++ if (ret == WALK_TREE_RM_CHILDENTRY) ++ ret = rm_from_parent(conn, parent, name); ++ if (ret < 0) ++ break; ++ talloc_free(name); ++ node = parent; ++ continue; ++ } ++ talloc_free(name); ++ node->parent = parent; ++ node->childoff = 0; ++ /* Call function BEFORE processing a node. */ ++ ret = walk_call_func(ctx, conn, node, parent, arg, ++ funcs->enter); ++ } ++ ++ talloc_free(tmpctx); ++ ++ return ret < 0 ? 
ret : WALK_TREE_OK; ++} ++ + static struct { + const char *str; + int (*func)(const void *ctx, struct connection *conn, +@@ -2206,18 +2335,6 @@ static int keys_equal_fn(void *key1, void *key2) + return 0 == strcmp((char *)key1, (char *)key2); + } + +- +-static char *child_name(const char *s1, const char *s2) +-{ +- if (strcmp(s1, "/")) { +- return talloc_asprintf(NULL, "%s/%s", s1, s2); +- } +- else { +- return talloc_asprintf(NULL, "/%s", s2); +- } +-} +- +- + int remember_string(struct hashtable *hash, const char *str) + { + char *k = malloc(strlen(str) + 1); +@@ -2277,7 +2394,7 @@ static int check_store_(const char *name, struct hashtable *reachable) + while (i < node->childlen && !ret) { + struct node *childnode; + size_t childlen = strlen(node->children + i); +- char * childname = child_name(node->name, ++ char * childname = child_name(NULL, node->name, + node->children + i); + + if (!childname) { +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index 1eb3708f82dd..f0fd8c352857 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -195,6 +195,7 @@ struct node { + + /* Children, each nul-terminated. */ + unsigned int childlen; ++ unsigned int childoff; /* Used by walk_node_tree() internally. */ + char *children; + + /* Allocation information for node currently in store. */ +@@ -334,6 +335,45 @@ void read_state_buffered_data(const void *ctx, struct connection *conn, + const struct xs_state_connection *sc); + void read_state_node(const void *ctx, const void *state); + ++/* ++ * Walk the node tree below root calling funcs->enter() and funcs->exit() for ++ * each node. funcs->enter() is being called when entering a node, so before ++ * any of the children of the node is processed. funcs->exit() is being ++ * called when leaving the node, so after all children have been processed. ++ * funcs->enoent() is being called when a node isn't existing. 
++ * funcs->*() return values: ++ * < 0: tree walk is stopped, walk_node_tree() returns funcs->*() return value ++ * in case WALK_TREE_ERROR_STOP is returned, errno should be set ++ * WALK_TREE_OK: tree walk is continuing ++ * WALK_TREE_SKIP_CHILDREN: tree walk won't descend below current node, but ++ * walk continues ++ * WALK_TREE_RM_CHILDENTRY: Remove the child entry from its parent and write ++ * the modified parent node back to the data base, implies to not descend ++ * below the current node, but to continue the walk ++ * funcs->*() is allowed to modify the node it is called for in the data base. ++ * In case funcs->enter() is deleting the node, it must not return WALK_TREE_OK ++ * in order to avoid descending into no longer existing children. ++ */ ++/* Return values for funcs->*() and walk_node_tree(). */ ++#define WALK_TREE_SUCCESS_STOP -100 /* Stop walk early, no error. */ ++#define WALK_TREE_ERROR_STOP -1 /* Stop walk due to error. */ ++#define WALK_TREE_OK 0 /* No error. */ ++/* Return value for funcs->*() only. */ ++#define WALK_TREE_SKIP_CHILDREN 1 /* Don't recurse below current node. */ ++#define WALK_TREE_RM_CHILDENTRY 2 /* Remove child entry from parent. 
*/ ++ ++struct walk_funcs { ++ int (*enter)(const void *ctx, struct connection *conn, ++ struct node *node, void *arg); ++ int (*exit)(const void *ctx, struct connection *conn, ++ struct node *node, void *arg); ++ int (*enoent)(const void *ctx, struct connection *conn, ++ struct node *parent, char *name, void *arg); ++}; ++ ++int walk_node_tree(const void *ctx, struct connection *conn, const char *root, ++ struct walk_funcs *funcs, void *arg); ++ + #endif /* _XENSTORED_CORE_H */ + + /* +-- +2.37.4 + diff --git a/0114-tools-xenstore-simplify-check_store.patch b/0114-tools-xenstore-simplify-check_store.patch new file mode 100644 index 0000000..6247114 --- /dev/null +++ b/0114-tools-xenstore-simplify-check_store.patch @@ -0,0 +1,114 @@ +From 4096512a70fd0bb65e40ed4269a1ca74dbb16220 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:12 +0200 +Subject: [PATCH 114/126] tools/xenstore: simplify check_store() + +check_store() is using a hash table for storing all node names it has +found via walking the tree. Additionally it using another hash table +for all children of a node to detect duplicate child names. + +Simplify that by dropping the second hash table as the first one is +already holding all the needed information. + +This is part of XSA-418 / CVE-2022-42321. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 70f719f52a220bc5bc987e4dd28e14a7039a176b) +--- + tools/xenstore/xenstored_core.c | 47 +++++++++++---------------------- + 1 file changed, 15 insertions(+), 32 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 7463d0a002d7..a48255c64cad 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -2378,50 +2378,34 @@ static int check_store_(const char *name, struct hashtable *reachable) + if (node) { + size_t i = 0; + +- struct hashtable * children = +- create_hashtable(16, hash_from_key_fn, keys_equal_fn); +- if (!children) { +- log("check_store create table: ENOMEM"); +- return ENOMEM; +- } +- + if (!remember_string(reachable, name)) { +- hashtable_destroy(children, 0); + log("check_store: ENOMEM"); + return ENOMEM; + } + + while (i < node->childlen && !ret) { +- struct node *childnode; ++ struct node *childnode = NULL; + size_t childlen = strlen(node->children + i); +- char * childname = child_name(NULL, node->name, +- node->children + i); ++ char *childname = child_name(NULL, node->name, ++ node->children + i); + + if (!childname) { + log("check_store: ENOMEM"); + ret = ENOMEM; + break; + } ++ ++ if (hashtable_search(reachable, childname)) { ++ log("check_store: '%s' is duplicated!", ++ childname); ++ i = rm_child_entry(node, i, childlen); ++ goto next; ++ } ++ + childnode = read_node(NULL, childname, childname); +- ++ + if (childnode) { +- if (hashtable_search(children, childname)) { +- log("check_store: '%s' is duplicated!", +- childname); +- i = rm_child_entry(node, i, childlen); +- } +- else { +- if (!remember_string(children, +- childname)) { +- log("check_store: ENOMEM"); +- talloc_free(childnode); +- talloc_free(childname); +- ret = ENOMEM; +- break; +- } +- ret = check_store_(childname, +- reachable); +- } ++ ret = check_store_(childname, reachable); + 
} else if (errno != ENOMEM) { + log("check_store: No child '%s' found!\n", + childname); +@@ -2431,19 +2415,18 @@ static int check_store_(const char *name, struct hashtable *reachable) + ret = ENOMEM; + } + ++ next: + talloc_free(childnode); + talloc_free(childname); + i += childlen + 1; + } + +- hashtable_destroy(children, 0 /* Don't free values (they are +- all (void *)1) */); + talloc_free(node); + } else if (errno != ENOMEM) { + /* Impossible, because no database should ever be without the + root, and otherwise, we've just checked in our caller + (which made a recursive call to get here). */ +- ++ + log("check_store: No child '%s' found: impossible!", name); + } else { + log("check_store: ENOMEM"); +-- +2.37.4 + diff --git a/0115-tools-xenstore-use-treewalk-for-check_store.patch b/0115-tools-xenstore-use-treewalk-for-check_store.patch new file mode 100644 index 0000000..74d58f4 --- /dev/null +++ b/0115-tools-xenstore-use-treewalk-for-check_store.patch @@ -0,0 +1,172 @@ +From a95277ee36e1db2f67e8091f4ea401975d341659 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:12 +0200 +Subject: [PATCH 115/126] tools/xenstore: use treewalk for check_store() + +Instead of doing an open tree walk using call recursion, use +walk_node_tree() when checking the store for inconsistencies. + +This will reduce code size and avoid many nesting levels of function +calls which could potentially exhaust the stack. + +This is part of XSA-418 / CVE-2022-42321. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit a07cc0ec60612f414bedf2bafb26ec38d2602e95) +--- + tools/xenstore/xenstored_core.c | 109 +++++++++----------------------- + 1 file changed, 30 insertions(+), 79 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index a48255c64cad..ed8bc9b02ed2 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -2345,18 +2345,6 @@ int remember_string(struct hashtable *hash, const char *str) + return hashtable_insert(hash, k, (void *)1); + } + +-static int rm_child_entry(struct node *node, size_t off, size_t len) +-{ +- if (!recovery) +- return off; +- +- if (remove_child_entry(NULL, node, off)) +- log("check_store: child entry could not be removed from '%s'", +- node->name); +- +- return off - len - 1; +-} +- + /** + * A node has a children field that names the children of the node, separated + * by NULs. We check whether there are entries in there that are duplicated +@@ -2370,70 +2358,29 @@ static int rm_child_entry(struct node *node, size_t off, size_t len) + * As we go, we record each node in the given reachable hashtable. These + * entries will be used later in clean_store. 
+ */ +-static int check_store_(const char *name, struct hashtable *reachable) ++static int check_store_step(const void *ctx, struct connection *conn, ++ struct node *node, void *arg) + { +- struct node *node = read_node(NULL, name, name); +- int ret = 0; ++ struct hashtable *reachable = arg; + +- if (node) { +- size_t i = 0; +- +- if (!remember_string(reachable, name)) { +- log("check_store: ENOMEM"); +- return ENOMEM; +- } +- +- while (i < node->childlen && !ret) { +- struct node *childnode = NULL; +- size_t childlen = strlen(node->children + i); +- char *childname = child_name(NULL, node->name, +- node->children + i); +- +- if (!childname) { +- log("check_store: ENOMEM"); +- ret = ENOMEM; +- break; +- } +- +- if (hashtable_search(reachable, childname)) { +- log("check_store: '%s' is duplicated!", +- childname); +- i = rm_child_entry(node, i, childlen); +- goto next; +- } +- +- childnode = read_node(NULL, childname, childname); +- +- if (childnode) { +- ret = check_store_(childname, reachable); +- } else if (errno != ENOMEM) { +- log("check_store: No child '%s' found!\n", +- childname); +- i = rm_child_entry(node, i, childlen); +- } else { +- log("check_store: ENOMEM"); +- ret = ENOMEM; +- } +- +- next: +- talloc_free(childnode); +- talloc_free(childname); +- i += childlen + 1; +- } +- +- talloc_free(node); +- } else if (errno != ENOMEM) { +- /* Impossible, because no database should ever be without the +- root, and otherwise, we've just checked in our caller +- (which made a recursive call to get here). */ +- +- log("check_store: No child '%s' found: impossible!", name); +- } else { +- log("check_store: ENOMEM"); +- ret = ENOMEM; ++ if (hashtable_search(reachable, (void *)node->name)) { ++ log("check_store: '%s' is duplicated!", node->name); ++ return recovery ? 
WALK_TREE_RM_CHILDENTRY ++ : WALK_TREE_SKIP_CHILDREN; + } + +- return ret; ++ if (!remember_string(reachable, node->name)) ++ return WALK_TREE_ERROR_STOP; ++ ++ return WALK_TREE_OK; ++} ++ ++static int check_store_enoent(const void *ctx, struct connection *conn, ++ struct node *parent, char *name, void *arg) ++{ ++ log("check_store: node '%s' not found", name); ++ ++ return recovery ? WALK_TREE_RM_CHILDENTRY : WALK_TREE_OK; + } + + +@@ -2482,24 +2429,28 @@ static void clean_store(struct hashtable *reachable) + + void check_store(void) + { +- char * root = talloc_strdup(NULL, "/"); +- struct hashtable * reachable = +- create_hashtable(16, hash_from_key_fn, keys_equal_fn); +- ++ struct hashtable *reachable; ++ struct walk_funcs walkfuncs = { ++ .enter = check_store_step, ++ .enoent = check_store_enoent, ++ }; ++ ++ reachable = create_hashtable(16, hash_from_key_fn, keys_equal_fn); + if (!reachable) { + log("check_store: ENOMEM"); + return; + } + + log("Checking store ..."); +- if (!check_store_(root, reachable) && +- !check_transactions(reachable)) ++ if (walk_node_tree(NULL, NULL, "/", &walkfuncs, reachable)) { ++ if (errno == ENOMEM) ++ log("check_store: ENOMEM"); ++ } else if (!check_transactions(reachable)) + clean_store(reachable); + log("Checking store complete."); + + hashtable_destroy(reachable, 0 /* Don't free values (they are all + (void *)1) */); +- talloc_free(root); + } + + +-- +2.37.4 + diff --git a/0116-tools-xenstore-use-treewalk-for-deleting-nodes.patch b/0116-tools-xenstore-use-treewalk-for-deleting-nodes.patch new file mode 100644 index 0000000..2dcf32e --- /dev/null +++ b/0116-tools-xenstore-use-treewalk-for-deleting-nodes.patch @@ -0,0 +1,180 @@ +From 9ead5845034c04a5c6e04d9b069d9c13141f4f33 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:12 +0200 +Subject: [PATCH 116/126] tools/xenstore: use treewalk for deleting nodes + +Instead of doing an open tree walk using call recursion, use +walk_node_tree() 
when deleting a sub-tree of nodes. + +This will reduce code size and avoid many nesting levels of function +calls which could potentially exhaust the stack. + +This is part of XSA-418 / CVE-2022-42321. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit ea16962053a6849a6e7cada549ba7f8c586d85c6) +--- + tools/xenstore/xenstored_core.c | 99 ++++++++++++++------------------- + 1 file changed, 43 insertions(+), 56 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index ed8bc9b02ed2..9576411757fa 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -1300,21 +1300,6 @@ static int do_read(const void *ctx, struct connection *conn, + return 0; + } + +-static void delete_node_single(struct connection *conn, struct node *node) +-{ +- TDB_DATA key; +- +- if (access_node(conn, node, NODE_ACCESS_DELETE, &key)) +- return; +- +- if (do_tdb_delete(conn, &key, &node->acc) != 0) { +- corrupt(conn, "Could not delete '%s'", node->name); +- return; +- } +- +- domain_entry_dec(conn, node); +-} +- + /* Must not be / */ + static char *basename(const char *name) + { +@@ -1585,69 +1570,59 @@ static int remove_child_entry(struct connection *conn, struct node *node, + return write_node(conn, node, true); + } + +-static void delete_child(struct connection *conn, +- struct node *node, const char *childname) ++static int delete_child(struct connection *conn, ++ struct node *node, const char *childname) + { + unsigned int i; + + for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) { + if (streq(node->children+i, childname)) { +- if (remove_child_entry(conn, node, i)) +- corrupt(conn, "Can't update parent node '%s'", +- node->name); +- return; ++ errno = remove_child_entry(conn, node, i) ? 
EIO : 0; ++ return errno; + } + } + corrupt(conn, "Can't find child '%s' in %s", childname, node->name); ++ ++ errno = EIO; ++ return errno; + } + +-static int delete_node(struct connection *conn, const void *ctx, +- struct node *parent, struct node *node, bool watch_exact) ++static int delnode_sub(const void *ctx, struct connection *conn, ++ struct node *node, void *arg) + { +- char *name; ++ const char *root = arg; ++ bool watch_exact; ++ int ret; ++ TDB_DATA key; + +- /* Delete children. */ +- while (node->childlen) { +- struct node *child; ++ /* Any error here will probably be repeated for all following calls. */ ++ ret = access_node(conn, node, NODE_ACCESS_DELETE, &key); ++ if (ret > 0) ++ return WALK_TREE_SUCCESS_STOP; + +- name = talloc_asprintf(node, "%s/%s", node->name, +- node->children); +- child = name ? read_node(conn, node, name) : NULL; +- if (child) { +- if (delete_node(conn, ctx, node, child, true)) +- return errno; +- } else { +- trace("delete_node: Error deleting child '%s/%s'!\n", +- node->name, node->children); +- /* Quit deleting. */ +- errno = ENOMEM; +- return errno; +- } +- talloc_free(name); +- } ++ /* In case of error stop the walk. */ ++ if (!ret && do_tdb_delete(conn, &key, &node->acc)) ++ return WALK_TREE_SUCCESS_STOP; + + /* + * Fire the watches now, when we can still see the node permissions. + * This fine as we are single threaded and the next possible read will + * be handled only after the node has been really removed. 
+- */ ++ */ ++ watch_exact = strcmp(root, node->name); + fire_watches(conn, ctx, node->name, node, watch_exact, NULL); +- delete_node_single(conn, node); +- delete_child(conn, parent, basename(node->name)); +- talloc_free(node); + +- return 0; ++ domain_entry_dec(conn, node); ++ ++ return WALK_TREE_RM_CHILDENTRY; + } + +-static int _rm(struct connection *conn, const void *ctx, struct node *node, +- const char *name) ++static int _rm(struct connection *conn, const void *ctx, const char *name) + { +- /* +- * Deleting node by node, so the result is always consistent even in +- * case of a failure. +- */ + struct node *parent; + char *parentname = get_parent(ctx, name); ++ struct walk_funcs walkfuncs = { .exit = delnode_sub }; ++ int ret; + + if (!parentname) + return errno; +@@ -1655,9 +1630,21 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, + parent = read_node(conn, ctx, parentname); + if (!parent) + return read_node_can_propagate_errno() ? errno : EINVAL; +- node->parent = parent; + +- return delete_node(conn, ctx, parent, node, false); ++ ret = walk_node_tree(ctx, conn, name, &walkfuncs, (void *)name); ++ if (ret < 0) { ++ if (ret == WALK_TREE_ERROR_STOP) { ++ corrupt(conn, "error when deleting sub-nodes of %s\n", ++ name); ++ errno = EIO; ++ } ++ return errno; ++ } ++ ++ if (delete_child(conn, parent, basename(name))) ++ return errno; ++ ++ return 0; + } + + +@@ -1694,7 +1681,7 @@ static int do_rm(const void *ctx, struct connection *conn, + if (streq(name, "/")) + return EINVAL; + +- ret = _rm(conn, ctx, node, name); ++ ret = _rm(conn, ctx, name); + if (ret) + return ret; + +-- +2.37.4 + diff --git a/0117-tools-xenstore-use-treewalk-for-creating-node-record.patch b/0117-tools-xenstore-use-treewalk-for-creating-node-record.patch new file mode 100644 index 0000000..6271169 --- /dev/null +++ b/0117-tools-xenstore-use-treewalk-for-creating-node-record.patch @@ -0,0 +1,242 @@ +From 84674f206778e9b3d8d67c6c76aa8094a262d5ec Mon Sep 17 
00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:12 +0200 +Subject: [PATCH 117/126] tools/xenstore: use treewalk for creating node + records + +Instead of doing an open tree walk using call recursion, use +walk_node_tree() when creating the node records during a live update. + +This will reduce code size and avoid many nesting levels of function +calls which could potentially exhaust the stack. + +This is part of XSA-418 / CVE-2022-42321. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 297ac246a5d8ed656b349641288f3402dcc0251e) +--- + tools/xenstore/xenstored_core.c | 127 ++++++++++++------------------ + tools/xenstore/xenstored_core.h | 3 +- + tools/xenstore/xenstored_domain.c | 2 +- + 3 files changed, 54 insertions(+), 78 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 9576411757fa..e8cdfeef50c7 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -2990,132 +2990,109 @@ const char *dump_state_buffered_data(FILE *fp, const struct connection *c, + return NULL; + } + +-const char *dump_state_node_perms(FILE *fp, struct xs_state_node *sn, +- const struct xs_permissions *perms, ++const char *dump_state_node_perms(FILE *fp, const struct xs_permissions *perms, + unsigned int n_perms) + { + unsigned int p; + + for (p = 0; p < n_perms; p++) { ++ struct xs_state_node_perm sp; ++ + switch ((int)perms[p].perms & ~XS_PERM_IGNORE) { + case XS_PERM_READ: +- sn->perms[p].access = XS_STATE_NODE_PERM_READ; ++ sp.access = XS_STATE_NODE_PERM_READ; + break; + case XS_PERM_WRITE: +- sn->perms[p].access = XS_STATE_NODE_PERM_WRITE; ++ sp.access = XS_STATE_NODE_PERM_WRITE; + break; + case XS_PERM_READ | XS_PERM_WRITE: +- sn->perms[p].access = XS_STATE_NODE_PERM_BOTH; ++ sp.access = XS_STATE_NODE_PERM_BOTH; + break; + default: +- sn->perms[p].access = XS_STATE_NODE_PERM_NONE; ++ sp.access = 
XS_STATE_NODE_PERM_NONE; + break; + } +- sn->perms[p].flags = (perms[p].perms & XS_PERM_IGNORE) ++ sp.flags = (perms[p].perms & XS_PERM_IGNORE) + ? XS_STATE_NODE_PERM_IGNORE : 0; +- sn->perms[p].domid = perms[p].id; +- } ++ sp.domid = perms[p].id; + +- if (fwrite(sn->perms, sizeof(*sn->perms), n_perms, fp) != n_perms) +- return "Dump node permissions error"; ++ if (fwrite(&sp, sizeof(sp), 1, fp) != 1) ++ return "Dump node permissions error"; ++ } + + return NULL; + } + +-static const char *dump_state_node_tree(FILE *fp, char *path) ++struct dump_node_data { ++ FILE *fp; ++ const char *err; ++}; ++ ++static int dump_state_node_err(struct dump_node_data *data, const char *err) + { +- unsigned int pathlen, childlen, p = 0; ++ data->err = err; ++ return WALK_TREE_ERROR_STOP; ++} ++ ++static int dump_state_node(const void *ctx, struct connection *conn, ++ struct node *node, void *arg) ++{ ++ struct dump_node_data *data = arg; ++ FILE *fp = data->fp; ++ unsigned int pathlen; + struct xs_state_record_header head; + struct xs_state_node sn; +- TDB_DATA key, data; +- const struct xs_tdb_record_hdr *hdr; +- const char *child; + const char *ret; + +- pathlen = strlen(path) + 1; +- +- set_tdb_key(path, &key); +- data = tdb_fetch(tdb_ctx, key); +- if (data.dptr == NULL) +- return "Error reading node"; +- +- /* Clean up in case of failure. 
*/ +- talloc_steal(path, data.dptr); +- +- hdr = (void *)data.dptr; ++ pathlen = strlen(node->name) + 1; + + head.type = XS_STATE_TYPE_NODE; + head.length = sizeof(sn); + sn.conn_id = 0; + sn.ta_id = 0; + sn.ta_access = 0; +- sn.perm_n = hdr->num_perms; ++ sn.perm_n = node->perms.num; + sn.path_len = pathlen; +- sn.data_len = hdr->datalen; +- head.length += hdr->num_perms * sizeof(*sn.perms); ++ sn.data_len = node->datalen; ++ head.length += node->perms.num * sizeof(*sn.perms); + head.length += pathlen; +- head.length += hdr->datalen; ++ head.length += node->datalen; + head.length = ROUNDUP(head.length, 3); + + if (fwrite(&head, sizeof(head), 1, fp) != 1) +- return "Dump node state error"; ++ return dump_state_node_err(data, "Dump node head error"); + if (fwrite(&sn, sizeof(sn), 1, fp) != 1) +- return "Dump node state error"; ++ return dump_state_node_err(data, "Dump node state error"); + +- ret = dump_state_node_perms(fp, &sn, hdr->perms, hdr->num_perms); ++ ret = dump_state_node_perms(fp, node->perms.p, node->perms.num); + if (ret) +- return ret; ++ return dump_state_node_err(data, ret); + +- if (fwrite(path, pathlen, 1, fp) != 1) +- return "Dump node path error"; +- if (hdr->datalen && +- fwrite(hdr->perms + hdr->num_perms, hdr->datalen, 1, fp) != 1) +- return "Dump node data error"; ++ if (fwrite(node->name, pathlen, 1, fp) != 1) ++ return dump_state_node_err(data, "Dump node path error"); ++ ++ if (node->datalen && fwrite(node->data, node->datalen, 1, fp) != 1) ++ return dump_state_node_err(data, "Dump node data error"); + + ret = dump_state_align(fp); + if (ret) +- return ret; ++ return dump_state_node_err(data, ret); + +- child = (char *)(hdr->perms + hdr->num_perms) + hdr->datalen; +- +- /* +- * Use path for constructing children paths. +- * As we don't write out nodes without having written their parent +- * already we will never clobber a part of the path we'll need later. 
+- */ +- pathlen--; +- if (path[pathlen - 1] != '/') { +- path[pathlen] = '/'; +- pathlen++; +- } +- while (p < hdr->childlen) { +- childlen = strlen(child) + 1; +- if (pathlen + childlen > XENSTORE_ABS_PATH_MAX) +- return "Dump node path length error"; +- strcpy(path + pathlen, child); +- ret = dump_state_node_tree(fp, path); +- if (ret) +- return ret; +- p += childlen; +- child += childlen; +- } +- +- talloc_free(data.dptr); +- +- return NULL; ++ return WALK_TREE_OK; + } + + const char *dump_state_nodes(FILE *fp, const void *ctx) + { +- char *path; ++ struct dump_node_data data = { ++ .fp = fp, ++ .err = "Dump node walk error" ++ }; ++ struct walk_funcs walkfuncs = { .enter = dump_state_node }; + +- path = talloc_size(ctx, XENSTORE_ABS_PATH_MAX); +- if (!path) +- return "Path buffer allocation error"; ++ if (walk_node_tree(ctx, NULL, "/", &walkfuncs, &data)) ++ return data.err; + +- strcpy(path, "/"); +- +- return dump_state_node_tree(fp, path); ++ return NULL; + } + + void read_state_global(const void *ctx, const void *state) +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index f0fd8c352857..3190494bbeb5 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -326,8 +326,7 @@ const char *dump_state_buffered_data(FILE *fp, const struct connection *c, + const struct connection *conn, + struct xs_state_connection *sc); + const char *dump_state_nodes(FILE *fp, const void *ctx); +-const char *dump_state_node_perms(FILE *fp, struct xs_state_node *sn, +- const struct xs_permissions *perms, ++const char *dump_state_node_perms(FILE *fp, const struct xs_permissions *perms, + unsigned int n_perms); + + void read_state_global(const void *ctx, const void *state); +diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c +index 8b503c2dfe07..a91cc75ab59b 100644 +--- a/tools/xenstore/xenstored_domain.c ++++ b/tools/xenstore/xenstored_domain.c +@@ -1449,7 +1449,7 @@ static const char 
*dump_state_special_node(FILE *fp, const char *name, + if (fwrite(&sn, sizeof(sn), 1, fp) != 1) + return "Dump special node error"; + +- ret = dump_state_node_perms(fp, &sn, perms->p, perms->num); ++ ret = dump_state_node_perms(fp, perms->p, perms->num); + if (ret) + return ret; + +-- +2.37.4 + diff --git a/0118-tools-xenstore-remove-nodes-owned-by-destroyed-domai.patch b/0118-tools-xenstore-remove-nodes-owned-by-destroyed-domai.patch new file mode 100644 index 0000000..a95a48e --- /dev/null +++ b/0118-tools-xenstore-remove-nodes-owned-by-destroyed-domai.patch @@ -0,0 +1,299 @@ +From da87661d058c4a6cf2ea6439771b9834f1c06223 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:12 +0200 +Subject: [PATCH 118/126] tools/xenstore: remove nodes owned by destroyed + domain + +In case a domain is removed from Xenstore, remove all nodes owned by +it per default. + +This tackles the problem that nodes might be created by a domain +outside its home path in Xenstore, leading to Xenstore hogging more +and more memory. Domain quota don't work in this case if the guest is +rebooting in between. + +Since XSA-322 ownership of such stale nodes is transferred to dom0, +which is helping against unintended access, but not against OOM of +Xenstore. + +As a fallback for weird cases add a Xenstore start parameter for +keeping today's way to handle stale nodes, adding the risk of Xenstore +hitting an OOM situation. + +This is part of XSA-419 / CVE-2022-42322. 
+ +Fixes: 496306324d8d ("tools/xenstore: revoke access rights for removed domains") +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 755d3f9debf8879448211fffb018f556136f6a79) +--- + tools/xenstore/xenstored_core.c | 17 +++++-- + tools/xenstore/xenstored_core.h | 4 ++ + tools/xenstore/xenstored_domain.c | 84 +++++++++++++++++++++++-------- + tools/xenstore/xenstored_domain.h | 2 +- + 4 files changed, 80 insertions(+), 27 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index e8cdfeef50c7..d5b2e59b0db6 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -80,6 +80,7 @@ static bool verbose = false; + LIST_HEAD(connections); + int tracefd = -1; + static bool recovery = true; ++bool keep_orphans = false; + static int reopen_log_pipe[2]; + static int reopen_log_pipe0_pollfd_idx = -1; + char *tracefile = NULL; +@@ -722,7 +723,7 @@ struct node *read_node(struct connection *conn, const void *ctx, + node->perms.p = hdr->perms; + node->acc.domid = node->perms.p[0].id; + node->acc.memory = data.dsize; +- if (domain_adjust_node_perms(conn, node)) ++ if (domain_adjust_node_perms(node)) + goto error; + + /* If owner is gone reset currently accounted memory size. 
*/ +@@ -765,7 +766,7 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, + void *p; + struct xs_tdb_record_hdr *hdr; + +- if (domain_adjust_node_perms(conn, node)) ++ if (domain_adjust_node_perms(node)) + return errno; + + data.dsize = sizeof(*hdr) +@@ -1617,7 +1618,7 @@ static int delnode_sub(const void *ctx, struct connection *conn, + return WALK_TREE_RM_CHILDENTRY; + } + +-static int _rm(struct connection *conn, const void *ctx, const char *name) ++int rm_node(struct connection *conn, const void *ctx, const char *name) + { + struct node *parent; + char *parentname = get_parent(ctx, name); +@@ -1681,7 +1682,7 @@ static int do_rm(const void *ctx, struct connection *conn, + if (streq(name, "/")) + return EINVAL; + +- ret = _rm(conn, ctx, name); ++ ret = rm_node(conn, ctx, name); + if (ret) + return ret; + +@@ -2537,6 +2538,8 @@ static void usage(void) + " -R, --no-recovery to request that no recovery should be attempted when\n" + " the store is corrupted (debug only),\n" + " -I, --internal-db store database in memory, not on disk\n" ++" -K, --keep-orphans don't delete nodes owned by a domain when the\n" ++" domain is deleted (this is a security risk!)\n" + " -V, --verbose to request verbose execution.\n"); + } + +@@ -2561,6 +2564,7 @@ static struct option options[] = { + { "timeout", 1, NULL, 'w' }, + { "no-recovery", 0, NULL, 'R' }, + { "internal-db", 0, NULL, 'I' }, ++ { "keep-orphans", 0, NULL, 'K' }, + { "verbose", 0, NULL, 'V' }, + { "watch-nb", 1, NULL, 'W' }, + #ifndef NO_LIVE_UPDATE +@@ -2641,7 +2645,7 @@ int main(int argc, char *argv[]) + orig_argc = argc; + orig_argv = argv; + +- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:q:T:RVW:w:U", ++ while ((opt = getopt_long(argc, argv, "DE:F:HKNPS:t:A:M:Q:q:T:RVW:w:U", + options, NULL)) != -1) { + switch (opt) { + case 'D': +@@ -2677,6 +2681,9 @@ int main(int argc, char *argv[]) + case 'I': + tdb_flags = TDB_INTERNAL|TDB_NOLOCK; + break; ++ case 'K': ++ keep_orphans = 
true; ++ break; + case 'V': + verbose = true; + break; +diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h +index 3190494bbeb5..9a9dbb2c3c86 100644 +--- a/tools/xenstore/xenstored_core.h ++++ b/tools/xenstore/xenstored_core.h +@@ -233,6 +233,9 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, + struct node *read_node(struct connection *conn, const void *ctx, + const char *name); + ++/* Remove a node and its children. */ ++int rm_node(struct connection *conn, const void *ctx, const char *name); ++ + void setup_structure(bool live_update); + struct connection *new_connection(connwritefn_t *write, connreadfn_t *read); + struct connection *get_connection_by_id(unsigned int conn_id); +@@ -279,6 +282,7 @@ extern int quota_req_outstanding; + extern int quota_trans_nodes; + extern int quota_memory_per_domain_soft; + extern int quota_memory_per_domain_hard; ++extern bool keep_orphans; + + extern unsigned int timeout_watch_event_msec; + +diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c +index a91cc75ab59b..ee4b19387db8 100644 +--- a/tools/xenstore/xenstored_domain.c ++++ b/tools/xenstore/xenstored_domain.c +@@ -196,10 +196,64 @@ static void unmap_interface(void *interface) + xengnttab_unmap(*xgt_handle, interface, 1); + } + ++static int domain_tree_remove_sub(const void *ctx, struct connection *conn, ++ struct node *node, void *arg) ++{ ++ struct domain *domain = arg; ++ TDB_DATA key; ++ int ret = WALK_TREE_OK; ++ ++ if (node->perms.p[0].id != domain->domid) ++ return WALK_TREE_OK; ++ ++ if (keep_orphans) { ++ set_tdb_key(node->name, &key); ++ domain->nbentry--; ++ node->perms.p[0].id = priv_domid; ++ node->acc.memory = 0; ++ domain_entry_inc(NULL, node); ++ if (write_node_raw(NULL, &key, node, true)) { ++ /* That's unfortunate. We only can try to continue. 
*/ ++ syslog(LOG_ERR, ++ "error when moving orphaned node %s to dom0\n", ++ node->name); ++ } else ++ trace("orphaned node %s moved to dom0\n", node->name); ++ } else { ++ if (rm_node(NULL, ctx, node->name)) { ++ /* That's unfortunate. We only can try to continue. */ ++ syslog(LOG_ERR, ++ "error when deleting orphaned node %s\n", ++ node->name); ++ } else ++ trace("orphaned node %s deleted\n", node->name); ++ ++ /* Skip children in all cases in order to avoid more errors. */ ++ ret = WALK_TREE_SKIP_CHILDREN; ++ } ++ ++ return domain->nbentry > 0 ? ret : WALK_TREE_SUCCESS_STOP; ++} ++ ++static void domain_tree_remove(struct domain *domain) ++{ ++ int ret; ++ struct walk_funcs walkfuncs = { .enter = domain_tree_remove_sub }; ++ ++ if (domain->nbentry > 0) { ++ ret = walk_node_tree(domain, NULL, "/", &walkfuncs, domain); ++ if (ret == WALK_TREE_ERROR_STOP) ++ syslog(LOG_ERR, ++ "error when looking for orphaned nodes\n"); ++ } ++} ++ + static int destroy_domain(void *_domain) + { + struct domain *domain = _domain; + ++ domain_tree_remove(domain); ++ + list_del(&domain->list); + + if (!domain->introduced) +@@ -857,15 +911,15 @@ int domain_entry_inc(struct connection *conn, struct node *node) + struct domain *d; + unsigned int domid; + +- if (!conn) ++ if (!node->perms.p) + return 0; + +- domid = node->perms.p ? node->perms.p[0].id : conn->id; ++ domid = node->perms.p[0].id; + +- if (conn->transaction) { ++ if (conn && conn->transaction) { + transaction_entry_inc(conn->transaction, domid); + } else { +- d = (domid == conn->id && conn->domain) ? conn->domain ++ d = (conn && domid == conn->id && conn->domain) ? conn->domain + : find_or_alloc_existing_domain(domid); + if (d) + d->nbentry++; +@@ -926,23 +980,11 @@ int domain_alloc_permrefs(struct node_perms *perms) + * Remove permissions for no longer existing domains in order to avoid a new + * domain with the same domid inheriting the permissions. 
+ */ +-int domain_adjust_node_perms(struct connection *conn, struct node *node) ++int domain_adjust_node_perms(struct node *node) + { + unsigned int i; + int ret; + +- ret = chk_domain_generation(node->perms.p[0].id, node->generation); +- +- /* If the owner doesn't exist any longer give it to priv domain. */ +- if (!ret) { +- /* +- * In theory we'd need to update the number of dom0 nodes here, +- * but we could be called for a read of the node. So better +- * avoid the risk to overflow the node count of dom0. +- */ +- node->perms.p[0].id = priv_domid; +- } +- + for (i = 1; i < node->perms.num; i++) { + if (node->perms.p[i].perms & XS_PERM_IGNORE) + continue; +@@ -960,15 +1002,15 @@ void domain_entry_dec(struct connection *conn, struct node *node) + struct domain *d; + unsigned int domid; + +- if (!conn) ++ if (!node->perms.p) + return; + + domid = node->perms.p ? node->perms.p[0].id : conn->id; + +- if (conn->transaction) { ++ if (conn && conn->transaction) { + transaction_entry_dec(conn->transaction, domid); + } else { +- d = (domid == conn->id && conn->domain) ? conn->domain ++ d = (conn && domid == conn->id && conn->domain) ? conn->domain + : find_domain_struct(domid); + if (d) { + d->nbentry--; +@@ -1087,7 +1129,7 @@ int domain_memory_add(unsigned int domid, int mem, bool no_quota_check) + * exist, as accounting is done either for a domain related to + * the current connection, or for the domain owning a node + * (which is always existing, as the owner of the node is +- * tested to exist and replaced by domid 0 if not). ++ * tested to exist and deleted or replaced by domid 0 if not). + * So not finding the related domain MUST be an error in the + * data base. 
+ */ +diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h +index 0b4f56b8146c..491d7a325bd3 100644 +--- a/tools/xenstore/xenstored_domain.h ++++ b/tools/xenstore/xenstored_domain.h +@@ -65,7 +65,7 @@ bool domain_can_write(struct connection *conn); + bool domain_is_unprivileged(struct connection *conn); + + /* Remove node permissions for no longer existing domains. */ +-int domain_adjust_node_perms(struct connection *conn, struct node *node); ++int domain_adjust_node_perms(struct node *node); + int domain_alloc_permrefs(struct node_perms *perms); + + /* Quota manipulation */ +-- +2.37.4 + diff --git a/0119-tools-xenstore-make-the-internal-memory-data-base-th.patch b/0119-tools-xenstore-make-the-internal-memory-data-base-th.patch new file mode 100644 index 0000000..8c1611b --- /dev/null +++ b/0119-tools-xenstore-make-the-internal-memory-data-base-th.patch @@ -0,0 +1,101 @@ +From 4269999ecedf79452a3fbbfab842f045d1ece16e Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:13 +0200 +Subject: [PATCH 119/126] tools/xenstore: make the internal memory data base + the default + +Having a file backed data base has the only advantage of being capable +to dump the contents of it while Xenstore is running, and potentially +using less swap space in case the data base can't be kept in memory. + +It has the major disadvantage of a huge performance overhead: switching +to keep the data base in memory only speeds up live update of xenstored +with 120000 nodes from 20 minutes to 11 seconds. A complete tree walk +of this configuration will be reduced from 7 seconds to 280 msecs +(measured by "xenstore-control check"). + +So make the internal memory data base the default and enhance the +"--internal-db" command line parameter to take an optional parameter +allowing to switch the internal data base back to the file based one. + +This is part of XSA-419. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit d174fefa90487ddd25ebc618028f67b2e8a1f795) +--- + tools/helpers/init-xenstore-domain.c | 4 ++-- + tools/xenstore/xenstored_core.c | 13 ++++++++----- + 2 files changed, 10 insertions(+), 7 deletions(-) + +diff --git a/tools/helpers/init-xenstore-domain.c b/tools/helpers/init-xenstore-domain.c +index 32689abd7479..d080dae5d3b8 100644 +--- a/tools/helpers/init-xenstore-domain.c ++++ b/tools/helpers/init-xenstore-domain.c +@@ -214,9 +214,9 @@ static int build(xc_interface *xch) + } + + if ( param ) +- snprintf(cmdline, 512, "--event %d --internal-db %s", rv, param); ++ snprintf(cmdline, 512, "--event %d %s", rv, param); + else +- snprintf(cmdline, 512, "--event %d --internal-db", rv); ++ snprintf(cmdline, 512, "--event %d", rv); + + dom->cmdline = xc_dom_strdup(dom, cmdline); + dom->xenstore_domid = domid; +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index d5b2e59b0db6..9ddbd934f794 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -2230,7 +2230,7 @@ static void accept_connection(int sock) + } + #endif + +-static int tdb_flags; ++static int tdb_flags = TDB_INTERNAL | TDB_NOLOCK; + + /* We create initial nodes manually. 
*/ + static void manual_node(const char *name, const char *child) +@@ -2537,7 +2537,8 @@ static void usage(void) + " watch-event: time a watch-event is kept pending\n" + " -R, --no-recovery to request that no recovery should be attempted when\n" + " the store is corrupted (debug only),\n" +-" -I, --internal-db store database in memory, not on disk\n" ++" -I, --internal-db [on|off] store database in memory, not on disk, default is\n" ++" memory, with \"--internal-db off\" it is on disk\n" + " -K, --keep-orphans don't delete nodes owned by a domain when the\n" + " domain is deleted (this is a security risk!)\n" + " -V, --verbose to request verbose execution.\n"); +@@ -2563,7 +2564,7 @@ static struct option options[] = { + { "quota-soft", 1, NULL, 'q' }, + { "timeout", 1, NULL, 'w' }, + { "no-recovery", 0, NULL, 'R' }, +- { "internal-db", 0, NULL, 'I' }, ++ { "internal-db", 2, NULL, 'I' }, + { "keep-orphans", 0, NULL, 'K' }, + { "verbose", 0, NULL, 'V' }, + { "watch-nb", 1, NULL, 'W' }, +@@ -2645,7 +2646,8 @@ int main(int argc, char *argv[]) + orig_argc = argc; + orig_argv = argv; + +- while ((opt = getopt_long(argc, argv, "DE:F:HKNPS:t:A:M:Q:q:T:RVW:w:U", ++ while ((opt = getopt_long(argc, argv, ++ "DE:F:HI::KNPS:t:A:M:Q:q:T:RVW:w:U", + options, NULL)) != -1) { + switch (opt) { + case 'D': +@@ -2679,7 +2681,8 @@ int main(int argc, char *argv[]) + tracefile = optarg; + break; + case 'I': +- tdb_flags = TDB_INTERNAL|TDB_NOLOCK; ++ if (optarg && !strcmp(optarg, "off")) ++ tdb_flags = 0; + break; + case 'K': + keep_orphans = true; +-- +2.37.4 + diff --git a/0120-docs-enhance-xenstore.txt-with-permissions-descripti.patch b/0120-docs-enhance-xenstore.txt-with-permissions-descripti.patch new file mode 100644 index 0000000..e0d7d9e --- /dev/null +++ b/0120-docs-enhance-xenstore.txt-with-permissions-descripti.patch @@ -0,0 +1,51 @@ +From bc3921135cf8590d0f587f460be431922183c4c4 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:13 
+0200 +Subject: [PATCH 120/126] docs: enhance xenstore.txt with permissions + description +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The permission scheme of Xenstore nodes is not really covered by +docs/misc/xenstore.txt, other than referring to the Xen wiki. + +Add a paragraph explaining the permissions of nodes, and especially +mentioning removal of nodes when a domain has been removed from +Xenstore. + +This is part of XSA-419. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit d084d2c6dff7044956ebdf83a259ad6081a1d921) +--- + docs/misc/xenstore.txt | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/docs/misc/xenstore.txt b/docs/misc/xenstore.txt +index a7d006519ae8..eccd596ee38c 100644 +--- a/docs/misc/xenstore.txt ++++ b/docs/misc/xenstore.txt +@@ -43,6 +43,17 @@ bytes are forbidden; clients specifying relative paths should keep + them to within 2048 bytes. (See XENSTORE_*_PATH_MAX in xs_wire.h.) + + ++Each node has one or multiple permission entries. Permissions are ++granted by domain-id, the first permission entry of each node specifies ++the owner of the node. Permissions of a node can be changed by the ++owner of the node, the owner can only be modified by the control ++domain (usually domain id 0). The owner always has the right to read ++and write the node, while other permissions can be setup to allow ++read and/or write access. When a domain is being removed from Xenstore ++nodes owned by that domain will be removed together with all of those ++nodes' children. 
++ ++ + Communication with xenstore is via either sockets, or event channel + and shared memory, as specified in io/xs_wire.h: each message in + either direction is a header formatted as a struct xsd_sockmsg +-- +2.37.4 + diff --git a/0121-tools-ocaml-xenstored-Fix-quota-bypass-on-domain-shu.patch b/0121-tools-ocaml-xenstored-Fix-quota-bypass-on-domain-shu.patch new file mode 100644 index 0000000..722700e --- /dev/null +++ b/0121-tools-ocaml-xenstored-Fix-quota-bypass-on-domain-shu.patch @@ -0,0 +1,93 @@ +From b9ede0950b3a6526d5ccea074841f093e0580948 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Wed, 12 Oct 2022 19:13:06 +0100 +Subject: [PATCH 121/126] tools/ocaml/xenstored: Fix quota bypass on domain + shutdown +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +XSA-322 fixed a domid reuse vulnerability by assigning Dom0 as the owner of +any nodes left after a domain is shutdown (e.g. outside its /local/domain/N +tree). + +However Dom0 has no quota on purpose, so this opened up another potential +attack vector. Avoid it by deleting these nodes instead of assigning them to +Dom0. + +This is part of XSA-419 / CVE-2022-42323. 
+ +Fixes: c46eff921209 ("tools/ocaml/xenstored: clean up permissions for dead domains") +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit db471408edd46af403b8bd44d180a928ad7fbb80) +--- + tools/ocaml/xenstored/perms.ml | 3 +-- + tools/ocaml/xenstored/store.ml | 29 +++++++++++++++++++++-------- + 2 files changed, 22 insertions(+), 10 deletions(-) + +diff --git a/tools/ocaml/xenstored/perms.ml b/tools/ocaml/xenstored/perms.ml +index e8a16221f8fa..84f2503e8e29 100644 +--- a/tools/ocaml/xenstored/perms.ml ++++ b/tools/ocaml/xenstored/perms.ml +@@ -64,8 +64,7 @@ let get_owner perm = perm.owner + * *) + let remove_domid ~domid perm = + let acl = List.filter (fun (acl_domid, _) -> acl_domid <> domid) perm.acl in +- let owner = if perm.owner = domid then 0 else perm.owner in +- { perm with acl; owner } ++ if perm.owner = domid then None else Some { perm with acl; owner = perm.owner } + + let default0 = create 0 NONE [] + +diff --git a/tools/ocaml/xenstored/store.ml b/tools/ocaml/xenstored/store.ml +index 20e67b142746..70f0c83de404 100644 +--- a/tools/ocaml/xenstored/store.ml ++++ b/tools/ocaml/xenstored/store.ml +@@ -87,10 +87,21 @@ let check_owner node connection = + + let rec recurse fct node = fct node; SymbolMap.iter (fun _ -> recurse fct) node.children + +-(** [recurse_map f tree] applies [f] on each node in the tree recursively *) +-let recurse_map f = ++(** [recurse_filter_map f tree] applies [f] on each node in the tree recursively, ++ possibly removing some nodes. ++ Note that the nodes removed this way won't generate watch events. 
++*) ++let recurse_filter_map f = ++ let invalid = -1 in ++ let is_valid _ node = node.perms.owner <> invalid in + let rec walk node = +- f { node with children = SymbolMap.map walk node.children } ++ (* Map.filter_map is Ocaml 4.11+ only *) ++ let node = ++ { node with children = ++ SymbolMap.map walk node.children |> SymbolMap.filter is_valid } in ++ match f node with ++ | Some keep -> keep ++ | None -> { node with perms = {node.perms with owner = invalid } } + in + walk + +@@ -444,11 +455,13 @@ let setperms store perm path nperms = + + let reset_permissions store domid = + Logging.info "store|node" "Cleaning up xenstore ACLs for domid %d" domid; +- store.root <- Node.recurse_map (fun node -> +- let perms = Perms.Node.remove_domid ~domid node.perms in +- if perms <> node.perms then +- Logging.debug "store|node" "Changed permissions for node %s" (Node.get_name node); +- { node with perms } ++ store.root <- Node.recurse_filter_map (fun node -> ++ match Perms.Node.remove_domid ~domid node.perms with ++ | None -> None ++ | Some perms -> ++ if perms <> node.perms then ++ Logging.debug "store|node" "Changed permissions for node %s" (Node.get_name node); ++ Some { node with perms } + ) store.root + + type ops = { +-- +2.37.4 + diff --git a/0122-tools-ocaml-Ensure-packet-size-is-never-negative.patch b/0122-tools-ocaml-Ensure-packet-size-is-never-negative.patch new file mode 100644 index 0000000..35a14f1 --- /dev/null +++ b/0122-tools-ocaml-Ensure-packet-size-is-never-negative.patch @@ -0,0 +1,75 @@ +From d3649d33e1eae49d3925ef34a7ccf39cae8852e6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Wed, 12 Oct 2022 19:13:05 +0100 +Subject: [PATCH 122/126] tools/ocaml: Ensure packet size is never negative +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Integers in Ocaml have 63 or 31 bits of signed precision. 
+ +On 64-bit builds of Ocaml, this is fine because a C uint32_t always fits +within a 63-bit signed integer. + +In 32-bit builds of Ocaml, this goes wrong. The C uint32_t is truncated +first (loses the top bit), then has a unsigned/signed mismatch. + +A "negative" value (i.e. a packet on the ring of between 1G and 2G in size) +will trigger an exception later in Bytes.make in xb.ml, and because the packet +is not removed from the ring, the exception re-triggers on every subsequent +query, creating a livelock. + +Fix both the source of the exception in Xb, and as defence in depth, mark the +domain as bad for any Invalid_argument exceptions to avoid the risk of +livelock. + +This is XSA-420 / CVE-2022-42324. + +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit ae34df4d82636f4c82700b447ea2c93b9f82b3f3) +--- + tools/ocaml/libs/xb/partial.ml | 6 +++--- + tools/ocaml/xenstored/process.ml | 2 +- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/tools/ocaml/libs/xb/partial.ml b/tools/ocaml/libs/xb/partial.ml +index b6e2a716e263..3aa8927eb7f0 100644 +--- a/tools/ocaml/libs/xb/partial.ml ++++ b/tools/ocaml/libs/xb/partial.ml +@@ -36,7 +36,7 @@ let of_string s = + This will leave the guest connection is a bad state and will + be hard to recover from without restarting the connection + (ie rebooting the guest) *) +- let dlen = min xenstore_payload_max dlen in ++ let dlen = max 0 (min xenstore_payload_max dlen) in + { + tid = tid; + rid = rid; +@@ -46,8 +46,8 @@ let of_string s = + } + + let append pkt s sz = +- if pkt.len > 4096 then failwith "Buffer.add: cannot grow buffer"; +- Buffer.add_string pkt.buf (String.sub s 0 sz) ++ if Buffer.length pkt.buf + sz > xenstore_payload_max then failwith "Buffer.add: cannot grow buffer"; ++ Buffer.add_substring pkt.buf s 0 sz + + let to_complete pkt = + pkt.len - (Buffer.length pkt.buf) +diff --git a/tools/ocaml/xenstored/process.ml 
b/tools/ocaml/xenstored/process.ml +index ce39ce28b5f3..6cb990ee7fb2 100644 +--- a/tools/ocaml/xenstored/process.ml ++++ b/tools/ocaml/xenstored/process.ml +@@ -722,7 +722,7 @@ let do_input store cons doms con = + History.reconnect con; + info "%s reconnection complete" (Connection.get_domstr con); + None +- | Failure exp -> ++ | Invalid_argument exp | Failure exp -> + error "caught exception %s" exp; + error "got a bad client %s" (sprintf "%-8s" (Connection.get_domstr con)); + Connection.mark_as_bad con; +-- +2.37.4 + diff --git a/0123-tools-xenstore-fix-deleting-node-in-transaction.patch b/0123-tools-xenstore-fix-deleting-node-in-transaction.patch new file mode 100644 index 0000000..efa7178 --- /dev/null +++ b/0123-tools-xenstore-fix-deleting-node-in-transaction.patch @@ -0,0 +1,46 @@ +From 2d3476effe3a9236867562f14dc26979a6527080 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:13 +0200 +Subject: [PATCH 123/126] tools/xenstore: fix deleting node in transaction + +In case a node has been created in a transaction and it is later +deleted in the same transaction, the transaction will be terminated +with an error. + +As this error is encountered only when handling the deleted node at +transaction finalization, the transaction will have been performed +partially and without updating the accounting information. This will +enable a malicious guest to create arbitrary number of nodes. + +This is part of XSA-421 / CVE-2022-42325. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Tested-by: Julien Grall <jgrall@amazon.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 13ac37f1416cae88d97f7baf6cf2a827edb9a187) +--- + tools/xenstore/xenstored_transaction.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c +index 3e3eb47326cc..7ffe21bb5285 100644 +--- a/tools/xenstore/xenstored_transaction.c ++++ b/tools/xenstore/xenstored_transaction.c +@@ -418,7 +418,13 @@ static int finalize_transaction(struct connection *conn, + true); + talloc_free(data.dptr); + } else { +- ret = do_tdb_delete(conn, &key, NULL); ++ /* ++ * A node having been created and later deleted ++ * in this transaction will have no generation ++ * information stored. ++ */ ++ ret = (i->generation == NO_GENERATION) ++ ? 0 : do_tdb_delete(conn, &key, NULL); + } + if (ret) + goto err; +-- +2.37.4 + diff --git a/0124-tools-xenstore-harden-transaction-finalization-again.patch b/0124-tools-xenstore-harden-transaction-finalization-again.patch new file mode 100644 index 0000000..8279aeb --- /dev/null +++ b/0124-tools-xenstore-harden-transaction-finalization-again.patch @@ -0,0 +1,410 @@ +From e818f4f0dabf83a6138cd77d7464495fab7bfc16 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 13 Sep 2022 07:35:14 +0200 +Subject: [PATCH 124/126] tools/xenstore: harden transaction finalization + against errors + +When finalizing a transaction, any error occurring after checking for +conflicts will result in the transaction being performed only +partially today. Additionally accounting data will not be updated at +the end of the transaction, which might result in further problems +later. 
+ +Avoid those problems by multiple modifications: + +- free any transaction specific nodes which don't need to be committed + as they haven't been written during the transaction as soon as their + generation count has been verified, this will reduce the risk of + out-of-memory situations + +- store the transaction specific node name in struct accessed_node in + order to avoid the need to allocate additional memory for it when + finalizing the transaction + +- don't stop the transaction finalization when hitting an error + condition, but try to continue to handle all modified nodes + +- in case of a detected error do the accounting update as needed and + call the data base checking only after that + +- if writing a node in a transaction is failing (e.g. due to a failed + quota check), fail the transaction, as prior changes to struct + accessed_node can't easily be undone in that case + +This is part of XSA-421 / CVE-2022-42326. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +Tested-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 2dd823ca7237e7fb90c890642d6a3b357a26fcff) +--- + tools/xenstore/xenstored_core.c | 16 ++- + tools/xenstore/xenstored_transaction.c | 171 +++++++++++-------------- + tools/xenstore/xenstored_transaction.h | 4 +- + 3 files changed, 92 insertions(+), 99 deletions(-) + +diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c +index 9ddbd934f794..3c008c8cd455 100644 +--- a/tools/xenstore/xenstored_core.c ++++ b/tools/xenstore/xenstored_core.c +@@ -692,8 +692,7 @@ struct node *read_node(struct connection *conn, const void *ctx, + return NULL; + } + +- if (transaction_prepend(conn, name, &key)) +- return NULL; ++ transaction_prepend(conn, name, &key); + + data = tdb_fetch(tdb_ctx, key); + +@@ -811,10 +810,21 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, + static int write_node(struct connection *conn, struct node *node, + 
bool no_quota_check) + { ++ int ret; ++ + if (access_node(conn, node, NODE_ACCESS_WRITE, &node->key)) + return errno; + +- return write_node_raw(conn, &node->key, node, no_quota_check); ++ ret = write_node_raw(conn, &node->key, node, no_quota_check); ++ if (ret && conn && conn->transaction) { ++ /* ++ * Reverting access_node() is hard, so just fail the ++ * transaction. ++ */ ++ fail_transaction(conn->transaction); ++ } ++ ++ return ret; + } + + enum xs_perm_type perm_for_conn(struct connection *conn, +diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c +index 7ffe21bb5285..ac854197cadb 100644 +--- a/tools/xenstore/xenstored_transaction.c ++++ b/tools/xenstore/xenstored_transaction.c +@@ -114,7 +114,8 @@ struct accessed_node + struct list_head list; + + /* The name of the node. */ +- char *node; ++ char *trans_name; /* Transaction specific name. */ ++ char *node; /* Main data base name. */ + + /* Generation count (or NO_GENERATION) for conflict checking. */ + uint64_t generation; +@@ -199,25 +200,20 @@ static char *transaction_get_node_name(void *ctx, struct transaction *trans, + * Prepend the transaction to name if node has been modified in the current + * transaction. 
+ */ +-int transaction_prepend(struct connection *conn, const char *name, +- TDB_DATA *key) ++void transaction_prepend(struct connection *conn, const char *name, ++ TDB_DATA *key) + { +- char *tdb_name; ++ struct accessed_node *i; + +- if (!conn || !conn->transaction || +- !find_accessed_node(conn->transaction, name)) { +- set_tdb_key(name, key); +- return 0; ++ if (conn && conn->transaction) { ++ i = find_accessed_node(conn->transaction, name); ++ if (i) { ++ set_tdb_key(i->trans_name, key); ++ return; ++ } + } + +- tdb_name = transaction_get_node_name(conn->transaction, +- conn->transaction, name); +- if (!tdb_name) +- return errno; +- +- set_tdb_key(tdb_name, key); +- +- return 0; ++ set_tdb_key(name, key); + } + + /* +@@ -240,7 +236,6 @@ int access_node(struct connection *conn, struct node *node, + struct accessed_node *i = NULL; + struct transaction *trans; + TDB_DATA local_key; +- const char *trans_name = NULL; + int ret; + bool introduce = false; + +@@ -259,10 +254,6 @@ int access_node(struct connection *conn, struct node *node, + + trans = conn->transaction; + +- trans_name = transaction_get_node_name(node, trans, node->name); +- if (!trans_name) +- goto nomem; +- + i = find_accessed_node(trans, node->name); + if (!i) { + if (trans->nodes >= quota_trans_nodes && +@@ -273,9 +264,10 @@ int access_node(struct connection *conn, struct node *node, + i = talloc_zero(trans, struct accessed_node); + if (!i) + goto nomem; +- i->node = talloc_strdup(i, node->name); +- if (!i->node) ++ i->trans_name = transaction_get_node_name(i, trans, node->name); ++ if (!i->trans_name) + goto nomem; ++ i->node = strchr(i->trans_name, '/') + 1; + if (node->generation != NO_GENERATION && node->perms.num) { + i->perms.p = talloc_array(i, struct xs_permissions, + node->perms.num); +@@ -302,7 +294,7 @@ int access_node(struct connection *conn, struct node *node, + i->generation = node->generation; + i->check_gen = true; + if (node->generation != NO_GENERATION) { +- 
set_tdb_key(trans_name, &local_key); ++ set_tdb_key(i->trans_name, &local_key); + ret = write_node_raw(conn, &local_key, node, true); + if (ret) + goto err; +@@ -321,7 +313,7 @@ int access_node(struct connection *conn, struct node *node, + return -1; + + if (key) { +- set_tdb_key(trans_name, key); ++ set_tdb_key(i->trans_name, key); + if (type == NODE_ACCESS_WRITE) + i->ta_node = true; + if (type == NODE_ACCESS_DELETE) +@@ -333,7 +325,6 @@ int access_node(struct connection *conn, struct node *node, + nomem: + ret = ENOMEM; + err: +- talloc_free((void *)trans_name); + talloc_free(i); + trans->fail = true; + errno = ret; +@@ -371,100 +362,90 @@ void queue_watches(struct connection *conn, const char *name, bool watch_exact) + * base. + */ + static int finalize_transaction(struct connection *conn, +- struct transaction *trans) ++ struct transaction *trans, bool *is_corrupt) + { +- struct accessed_node *i; ++ struct accessed_node *i, *n; + TDB_DATA key, ta_key, data; + struct xs_tdb_record_hdr *hdr; + uint64_t gen; +- char *trans_name; +- int ret; + +- list_for_each_entry(i, &trans->accessed, list) { +- if (!i->check_gen) +- continue; ++ list_for_each_entry_safe(i, n, &trans->accessed, list) { ++ if (i->check_gen) { ++ set_tdb_key(i->node, &key); ++ data = tdb_fetch(tdb_ctx, key); ++ hdr = (void *)data.dptr; ++ if (!data.dptr) { ++ if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST) ++ return EIO; ++ gen = NO_GENERATION; ++ } else ++ gen = hdr->generation; ++ talloc_free(data.dptr); ++ if (i->generation != gen) ++ return EAGAIN; ++ } + +- set_tdb_key(i->node, &key); +- data = tdb_fetch(tdb_ctx, key); +- hdr = (void *)data.dptr; +- if (!data.dptr) { +- if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST) +- return EIO; +- gen = NO_GENERATION; +- } else +- gen = hdr->generation; +- talloc_free(data.dptr); +- if (i->generation != gen) +- return EAGAIN; ++ /* Entries for unmodified nodes can be removed early. 
*/ ++ if (!i->modified) { ++ if (i->ta_node) { ++ set_tdb_key(i->trans_name, &ta_key); ++ if (do_tdb_delete(conn, &ta_key, NULL)) ++ return EIO; ++ } ++ list_del(&i->list); ++ talloc_free(i); ++ } + } + + while ((i = list_top(&trans->accessed, struct accessed_node, list))) { +- trans_name = transaction_get_node_name(i, trans, i->node); +- if (!trans_name) +- /* We are doomed: the transaction is only partial. */ +- goto err; +- +- set_tdb_key(trans_name, &ta_key); +- +- if (i->modified) { +- set_tdb_key(i->node, &key); +- if (i->ta_node) { +- data = tdb_fetch(tdb_ctx, ta_key); +- if (!data.dptr) +- goto err; ++ set_tdb_key(i->node, &key); ++ if (i->ta_node) { ++ set_tdb_key(i->trans_name, &ta_key); ++ data = tdb_fetch(tdb_ctx, ta_key); ++ if (data.dptr) { + hdr = (void *)data.dptr; + hdr->generation = ++generation; +- ret = do_tdb_write(conn, &key, &data, NULL, +- true); ++ *is_corrupt |= do_tdb_write(conn, &key, &data, ++ NULL, true); + talloc_free(data.dptr); ++ if (do_tdb_delete(conn, &ta_key, NULL)) ++ *is_corrupt = true; + } else { +- /* +- * A node having been created and later deleted +- * in this transaction will have no generation +- * information stored. +- */ +- ret = (i->generation == NO_GENERATION) +- ? 0 : do_tdb_delete(conn, &key, NULL); +- } +- if (ret) +- goto err; +- if (i->fire_watch) { +- fire_watches(conn, trans, i->node, NULL, +- i->watch_exact, +- i->perms.p ? &i->perms : NULL); ++ *is_corrupt = true; + } ++ } else { ++ /* ++ * A node having been created and later deleted ++ * in this transaction will have no generation ++ * information stored. ++ */ ++ *is_corrupt |= (i->generation == NO_GENERATION) ++ ? false ++ : do_tdb_delete(conn, &key, NULL); + } ++ if (i->fire_watch) ++ fire_watches(conn, trans, i->node, NULL, i->watch_exact, ++ i->perms.p ? 
&i->perms : NULL); + +- if (i->ta_node && do_tdb_delete(conn, &ta_key, NULL)) +- goto err; + list_del(&i->list); + talloc_free(i); + } + + return 0; +- +-err: +- corrupt(conn, "Partial transaction"); +- return EIO; + } + + static int destroy_transaction(void *_transaction) + { + struct transaction *trans = _transaction; + struct accessed_node *i; +- char *trans_name; + TDB_DATA key; + + wrl_ntransactions--; + trace_destroy(trans, "transaction"); + while ((i = list_top(&trans->accessed, struct accessed_node, list))) { + if (i->ta_node) { +- trans_name = transaction_get_node_name(i, trans, +- i->node); +- if (trans_name) { +- set_tdb_key(trans_name, &key); +- do_tdb_delete(trans->conn, &key, NULL); +- } ++ set_tdb_key(i->trans_name, &key); ++ do_tdb_delete(trans->conn, &key, NULL); + } + list_del(&i->list); + talloc_free(i); +@@ -556,6 +537,7 @@ int do_transaction_end(const void *ctx, struct connection *conn, + { + const char *arg = onearg(in); + struct transaction *trans; ++ bool is_corrupt = false; + int ret; + + if (!arg || (!streq(arg, "T") && !streq(arg, "F"))) +@@ -579,13 +561,17 @@ int do_transaction_end(const void *ctx, struct connection *conn, + ret = transaction_fix_domains(trans, false); + if (ret) + return ret; +- if (finalize_transaction(conn, trans)) +- return EAGAIN; ++ ret = finalize_transaction(conn, trans, &is_corrupt); ++ if (ret) ++ return ret; + + wrl_apply_debit_trans_commit(conn); + + /* fix domain entry for each changed domain */ + transaction_fix_domains(trans, true); ++ ++ if (is_corrupt) ++ corrupt(conn, "transaction inconsistency"); + } + send_ack(conn, XS_TRANSACTION_END); + +@@ -660,7 +646,7 @@ int check_transactions(struct hashtable *hash) + struct connection *conn; + struct transaction *trans; + struct accessed_node *i; +- char *tname, *tnode; ++ char *tname; + + list_for_each_entry(conn, &connections, list) { + list_for_each_entry(trans, &conn->transaction_list, list) { +@@ -672,11 +658,8 @@ int check_transactions(struct hashtable 
*hash) + list_for_each_entry(i, &trans->accessed, list) { + if (!i->ta_node) + continue; +- tnode = transaction_get_node_name(tname, trans, +- i->node); +- if (!tnode || !remember_string(hash, tnode)) ++ if (!remember_string(hash, i->trans_name)) + goto nomem; +- talloc_free(tnode); + } + + talloc_free(tname); +diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h +index 39d7f81c5127..3417303f9427 100644 +--- a/tools/xenstore/xenstored_transaction.h ++++ b/tools/xenstore/xenstored_transaction.h +@@ -48,8 +48,8 @@ int __must_check access_node(struct connection *conn, struct node *node, + void queue_watches(struct connection *conn, const char *name, bool watch_exact); + + /* Prepend the transaction to name if appropriate. */ +-int transaction_prepend(struct connection *conn, const char *name, +- TDB_DATA *key); ++void transaction_prepend(struct connection *conn, const char *name, ++ TDB_DATA *key); + + /* Mark the transaction as failed. This will prevent it to be committed. */ + void fail_transaction(struct transaction *trans); +-- +2.37.4 + diff --git a/0125-x86-spec-ctrl-Enumeration-for-IBPB_RET.patch b/0125-x86-spec-ctrl-Enumeration-for-IBPB_RET.patch new file mode 100644 index 0000000..f1667ac --- /dev/null +++ b/0125-x86-spec-ctrl-Enumeration-for-IBPB_RET.patch @@ -0,0 +1,82 @@ +From 07be0fe497349ed423c5201bdc410b6281ebf04f Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Tue, 14 Jun 2022 16:18:36 +0100 +Subject: [PATCH 125/126] x86/spec-ctrl: Enumeration for IBPB_RET + +The IBPB_RET bit indicates that the CPU's implementation of MSR_PRED_CMD.IBPB +does flush the RSB/RAS too. + +This is part of XSA-422 / CVE-2022-23824. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit 24496558e650535bdbd22cc04731e82276cd1b3f) +--- + tools/libs/light/libxl_cpuid.c | 1 + + tools/misc/xen-cpuid.c | 1 + + xen/arch/x86/spec_ctrl.c | 5 +++-- + xen/include/public/arch-x86/cpufeatureset.h | 1 + + 4 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/tools/libs/light/libxl_cpuid.c b/tools/libs/light/libxl_cpuid.c +index 2632efc6adb0..4cc2f211b878 100644 +--- a/tools/libs/light/libxl_cpuid.c ++++ b/tools/libs/light/libxl_cpuid.c +@@ -284,6 +284,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str) + {"ssb-no", 0x80000008, NA, CPUID_REG_EBX, 26, 1}, + {"psfd", 0x80000008, NA, CPUID_REG_EBX, 28, 1}, + {"btc-no", 0x80000008, NA, CPUID_REG_EBX, 29, 1}, ++ {"ibpb-ret", 0x80000008, NA, CPUID_REG_EBX, 30, 1}, + + {"nc", 0x80000008, NA, CPUID_REG_ECX, 0, 8}, + {"apicidsize", 0x80000008, NA, CPUID_REG_ECX, 12, 4}, +diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c +index e83bc4793d6e..5c944c24fe36 100644 +--- a/tools/misc/xen-cpuid.c ++++ b/tools/misc/xen-cpuid.c +@@ -158,6 +158,7 @@ static const char *const str_e8b[32] = + [24] = "amd-ssbd", [25] = "virt-ssbd", + [26] = "ssb-no", + [28] = "psfd", [29] = "btc-no", ++ [30] = "ibpb-ret", + }; + + static const char *const str_7d0[32] = +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 3ff602bd0281..459c64d139b6 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -419,7 +419,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + * Hardware read-only information, stating immunity to certain issues, or + * suggestions of which mitigation to use. + */ +- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", + (caps & ARCH_CAPS_IBRS_ALL) ? 
" IBRS_ALL" : "", + (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", +@@ -436,7 +436,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : "", +- (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : ""); ++ (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : "", ++ (e8b & cpufeat_mask(X86_FEATURE_IBPB_RET)) ? " IBPB_RET" : ""); + + /* Hardware features which need driving to mitigate issues. */ + printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index 1bbc7da4b53c..41a358d575d3 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -266,6 +266,7 @@ XEN_CPUFEATURE(VIRT_SSBD, 8*32+25) /* MSR_VIRT_SPEC_CTRL.SSBD */ + XEN_CPUFEATURE(SSB_NO, 8*32+26) /*A Hardware not vulnerable to SSB */ + XEN_CPUFEATURE(PSFD, 8*32+28) /*S MSR_SPEC_CTRL.PSFD */ + XEN_CPUFEATURE(BTC_NO, 8*32+29) /*A Hardware not vulnerable to Branch Type Confusion */ ++XEN_CPUFEATURE(IBPB_RET, 8*32+30) /*A IBPB clears RSB/RAS too. 
*/ + + /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */ + XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions */ +-- +2.37.4 + diff --git a/0126-x86-spec-ctrl-Mitigate-IBPB-not-flushing-the-RSB-RAS.patch b/0126-x86-spec-ctrl-Mitigate-IBPB-not-flushing-the-RSB-RAS.patch new file mode 100644 index 0000000..2abb0f2 --- /dev/null +++ b/0126-x86-spec-ctrl-Mitigate-IBPB-not-flushing-the-RSB-RAS.patch @@ -0,0 +1,113 @@ +From 32445f23fea6a533fc1d7ade5871246d75210bf1 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Tue, 14 Jun 2022 16:18:36 +0100 +Subject: [PATCH 126/126] x86/spec-ctrl: Mitigate IBPB not flushing the RSB/RAS + +Introduce spec_ctrl_new_guest_context() to encapsulate all logic pertaining to +using MSR_PRED_CMD for a new guest context, even if it only has one user +presently. + +Introduce X86_BUG_IBPB_NO_RET, and use it to extend spec_ctrl_new_guest_context() +with a manual fixup for hardware which mis-implements IBPB. + +This is part of XSA-422 / CVE-2022-23824. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit 2b27967fb89d7904a1571a2fb963b1c9cac548db) +--- + xen/arch/x86/asm-macros.c | 1 + + xen/arch/x86/domain.c | 2 +- + xen/arch/x86/spec_ctrl.c | 8 ++++++++ + xen/include/asm-x86/cpufeatures.h | 1 + + xen/include/asm-x86/spec_ctrl.h | 22 ++++++++++++++++++++++ + 5 files changed, 33 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/asm-macros.c b/xen/arch/x86/asm-macros.c +index 7e536b0d82f5..891d86c7655c 100644 +--- a/xen/arch/x86/asm-macros.c ++++ b/xen/arch/x86/asm-macros.c +@@ -1,2 +1,3 @@ + #include <asm/asm-defns.h> + #include <asm/alternative-asm.h> ++#include <asm/spec_ctrl_asm.h> +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index e9b8ed4c96c2..b82e18dd62d8 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -2069,7 +2069,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next) + */ + if ( *last_id != next_id ) + { +- wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB); ++ spec_ctrl_new_guest_context(); + *last_id = next_id; + } + } +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 459c64d139b6..5636853aae6b 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -775,6 +775,14 @@ static void __init ibpb_calculations(void) + return; + } + ++ /* ++ * AMD/Hygon CPUs to date (June 2022) don't flush the RAS. Future ++ * CPUs are expected to enumerate IBPB_RET when this has been fixed. ++ * Until then, cover the difference with the software sequence. ++ */ ++ if ( boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_IBPB_RET) ) ++ setup_force_cpu_cap(X86_BUG_IBPB_NO_RET); ++ + /* + * IBPB-on-entry mitigations for Branch Type Confusion. 
+ * +diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h +index b233e5835fb5..bdb119a34c5d 100644 +--- a/xen/include/asm-x86/cpufeatures.h ++++ b/xen/include/asm-x86/cpufeatures.h +@@ -48,6 +48,7 @@ XEN_CPUFEATURE(IBPB_ENTRY_HVM, X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for + + #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */ + #define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */ ++#define X86_BUG_IBPB_NO_RET X86_BUG( 3) /* IBPB doesn't flush the RSB/RAS */ + + /* Total number of capability words, inc synth and bug words. */ + #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */ +diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h +index 33e845991b0a..e400ff227391 100644 +--- a/xen/include/asm-x86/spec_ctrl.h ++++ b/xen/include/asm-x86/spec_ctrl.h +@@ -65,6 +65,28 @@ + void init_speculation_mitigations(void); + void spec_ctrl_init_domain(struct domain *d); + ++/* ++ * Switch to a new guest prediction context. ++ * ++ * This flushes all indirect branch predictors (BTB, RSB/RAS), so guest code ++ * which has previously run on this CPU can't attack subsequent guest code. ++ * ++ * As this flushes the RSB/RAS, it destroys the predictions of the calling ++ * context. For best performance, arrange for this to be used when we're going ++ * to jump out of the current context, e.g. with reset_stack_and_jump(). ++ * ++ * For hardware which mis-implements IBPB, fix up by flushing the RSB/RAS ++ * manually. ++ */ ++static always_inline void spec_ctrl_new_guest_context(void) ++{ ++ wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB); ++ ++ /* (ab)use alternative_input() to specify clobbers. 
*/ ++ alternative_input("", "DO_OVERWRITE_RSB", X86_BUG_IBPB_NO_RET, ++ : "rax", "rcx"); ++} ++ + extern int8_t opt_ibpb_ctxt_switch; + extern bool opt_ssbd; + extern int8_t opt_eager_fpu; +-- +2.37.4 + @@ -1,6 +1,6 @@ -Xen upstream patchset #1 for 4.15.4-pre +Xen upstream patchset #2 for 4.15.4-pre Containing patches from RELEASE-4.15.3 (feecaf4abf733e83b7a297190819eca7a7f65168) to -staging-4.15 (816580afdd1730d4f85f64477a242a439af1cdf8) +staging-4.15 (32445f23fea6a533fc1d7ade5871246d75210bf1) |