
xen-kernel: apply XSA-{237..244}

Approved by:	bapt (implicit)
MFH:		2017Q4
Sponsored by:	Citrix Systems R&D
Roger Pau Monné 2017-10-12 15:02:30 +00:00
parent 0e31c2371d
commit aac1e9a2f0
Notes: svn2git 2021-03-31 03:12:20 +00:00
svn path=/head/; revision=451882
14 changed files with 1298 additions and 2 deletions

emulators/xen-kernel/Makefile

@@ -2,7 +2,7 @@
PORTNAME= xen
PORTVERSION= 4.7.2
-PORTREVISION=	5
+PORTREVISION=	6
CATEGORIES= emulators
MASTER_SITES= http://downloads.xenproject.org/release/xen/${PORTVERSION}/
PKGNAMESUFFIX= -kernel
@@ -67,7 +67,20 @@ EXTRA_PATCHES= ${FILESDIR}/0001-xen-logdirty-prevent-preemption-if-finished.patc
${FILESDIR}/xsa231-4.7.patch:-p1 \
${FILESDIR}/xsa232.patch:-p1 \
${FILESDIR}/xsa233.patch:-p1 \
-${FILESDIR}/xsa234-4.8.patch:-p1
+${FILESDIR}/xsa234-4.8.patch:-p1 \
+${FILESDIR}/0001-x86-dont-allow-MSI-pIRQ-mapping-on-unowned-device.patch:-p1 \
+${FILESDIR}/0002-x86-enforce-proper-privilege-when-mapping-pIRQ-s.patch:-p1 \
+${FILESDIR}/0003-x86-MSI-disallow-redundant-enabling.patch:-p1 \
+${FILESDIR}/0004-x86-IRQ-conditionally-preserve-irq-pirq-mapping-on-error.patch:-p1 \
+${FILESDIR}/0005-x86-FLASK-fix-unmap-domain-IRQ-XSM-hook.patch:-p1 \
+${FILESDIR}/xsa238.patch:-p1 \
+${FILESDIR}/xsa239.patch:-p1 \
+${FILESDIR}/0001-x86-limit-linear-page-table-use-to-a-single-level.patch:-p1 \
+${FILESDIR}/0002-x86-mm-Disable-PV-linear-pagetables-by-default.patch:-p1 \
+${FILESDIR}/xsa241-4.8.patch:-p1 \
+${FILESDIR}/xsa242-4.9.patch:-p1 \
+${FILESDIR}/xsa243-4.7.patch:-p1 \
+${FILESDIR}/xsa244-4.7.patch:-p1
.include <bsd.port.options.mk>
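
A brief aside on the mechanics of the block above: each EXTRA_PATCHES entry names a file shipped under the port's files/ directory (${FILESDIR}), and the ":-p1" suffix selects the strip level the ports framework passes to patch(1) when applying it — effectively running patch -p1 inside the extracted Xen source tree, which is what these upstream-generated XSA patches expect.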

emulators/xen-kernel/files/0001-x86-dont-allow-MSI-pIRQ-mapping-on-unowned-device.patch

@@ -0,0 +1,27 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: x86: don't allow MSI pIRQ mapping on unowned device
MSI setup should be permitted only for existing devices owned by the
respective guest (the operation may still be carried out by the domain
controlling that guest).
This is part of XSA-237.
Reported-by: HW42 <hw42@ipsumj.de>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1964,7 +1964,10 @@ int map_domain_pirq(
if ( !cpu_has_apic )
goto done;
- pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
+ pdev = pci_get_pdev_by_domain(d, msi->seg, msi->bus, msi->devfn);
+ if ( !pdev )
+ goto done;
+
ret = pci_enable_msi(msi, &msi_desc);
if ( ret )
{

emulators/xen-kernel/files/0001-x86-limit-linear-page-table-use-to-a-single-level.patch

@@ -0,0 +1,494 @@
From ea7513a3e3f28cfec59dda6e128b6b4968685762 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Thu, 28 Sep 2017 15:17:27 +0100
Subject: [PATCH 1/2] x86: limit linear page table use to a single level
That's the only way that they're meant to be used. Without such a
restriction arbitrarily long chains of same-level page tables can be
built, tearing down of which may then cause arbitrarily deep recursion,
causing a stack overflow. To facilitate this restriction, a counter is
being introduced to track both the number of same-level entries in a
page table as well as the number of uses of a page table in another
same-level one (counting into positive and negative direction
respectively, utilizing the fact that both counts can't be non-zero at
the same time).
Note that the added accounting introduces a restriction on the number
of times a page can be used in other same-level page tables - more than
32k of such uses are no longer possible.
Note also that some put_page_and_type[_preemptible]() calls are
replaced with open-coded equivalents. This seemed preferable to
adding "parent_table" to the matrix of functions.
Note further that cross-domain same-level page table references are no
longer permitted (they probably never should have been).
This is XSA-240.
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
---
xen/arch/x86/domain.c | 1 +
xen/arch/x86/mm.c | 171 ++++++++++++++++++++++++++++++++++++++-----
xen/include/asm-x86/domain.h | 2 +
xen/include/asm-x86/mm.h | 25 +++++--
4 files changed, 175 insertions(+), 24 deletions(-)
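
To make the sign convention described in the message above concrete, here is a minimal standalone model of the counter in plain C11 atomics. The type and function names are invented for illustration; this is a sketch of the idea, not the hypervisor code in the diff that follows.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct page_model {
    _Atomic int16_t linear_pt_count;   /* > 0: entries held; < 0: uses elsewhere */
};

/* Claim one more same-level entry in this page table; fails if the page
 * is currently used as a linear page table by others (negative count). */
bool model_inc_linear_entries(struct page_model *pg)
{
    int16_t oc = atomic_load(&pg->linear_pt_count);

    do {
        if ( oc < 0 || oc == INT16_MAX )
            return false;
    } while ( !atomic_compare_exchange_weak(&pg->linear_pt_count, &oc, oc + 1) );

    return true;
}

/* Record that this page table is now used inside another same-level one;
 * fails if the page itself holds linear entries (positive count). */
bool model_inc_linear_uses(struct page_model *pg)
{
    int16_t oc = atomic_load(&pg->linear_pt_count);

    do {
        if ( oc > 0 || oc == INT16_MIN )
            return false;
    } while ( !atomic_compare_exchange_weak(&pg->linear_pt_count, &oc, oc - 1) );

    return true;
}

The decrement helpers in the actual patch simply undo one step in the corresponding direction and assert that the stored value still has the sign the convention requires.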
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 452748dd5b..44ed2ccd0a 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1237,6 +1237,7 @@ int arch_set_info_guest(
case -EINTR:
rc = -ERESTART;
case -ERESTART:
+ v->arch.old_guest_ptpg = NULL;
v->arch.old_guest_table =
pagetable_get_page(v->arch.guest_table);
v->arch.guest_table = pagetable_null();
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index e97ecccd93..e81a461b91 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -732,6 +732,61 @@ static void put_data_page(
put_page(page);
}
+static bool_t inc_linear_entries(struct page_info *pg)
+{
+ typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+ do {
+ /*
+ * The check below checks for the "linear use" count being non-zero
+ * as well as overflow. Signed integer overflow is undefined behavior
+ * according to the C spec. However, as long as linear_pt_count is
+ * smaller in size than 'int', the arithmetic operation of the
+ * increment below won't overflow; rather the result will be truncated
+ * when stored. Ensure that this is always true.
+ */
+ BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+ oc = nc++;
+ if ( nc <= 0 )
+ return 0;
+ nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+ } while ( oc != nc );
+
+ return 1;
+}
+
+static void dec_linear_entries(struct page_info *pg)
+{
+ typeof(pg->linear_pt_count) oc;
+
+ oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
+ ASSERT(oc > 0);
+}
+
+static bool_t inc_linear_uses(struct page_info *pg)
+{
+ typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+ do {
+ /* See the respective comment in inc_linear_entries(). */
+ BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+ oc = nc--;
+ if ( nc >= 0 )
+ return 0;
+ nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+ } while ( oc != nc );
+
+ return 1;
+}
+
+static void dec_linear_uses(struct page_info *pg)
+{
+ typeof(pg->linear_pt_count) oc;
+
+ oc = arch_fetch_and_add(&pg->linear_pt_count, 1);
+ ASSERT(oc < 0);
+}
+
/*
* We allow root tables to map each other (a.k.a. linear page tables). It
* needs some special care with reference counts and access permissions:
@@ -761,15 +816,35 @@ get_##level##_linear_pagetable( \
\
if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
{ \
+ struct page_info *ptpg = mfn_to_page(pde_pfn); \
+ \
+ /* Make sure the page table belongs to the correct domain. */ \
+ if ( unlikely(page_get_owner(ptpg) != d) ) \
+ return 0; \
+ \
/* Make sure the mapped frame belongs to the correct domain. */ \
if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
return 0; \
\
/* \
- * Ensure that the mapped frame is an already-validated page table. \
+ * Ensure that the mapped frame is an already-validated page table \
+ * and is not itself having linear entries, as well as that the \
+ * containing page table is not itself in use as a linear page table \
+ * elsewhere. \
* If so, atomically increment the count (checking for overflow). \
*/ \
page = mfn_to_page(pfn); \
+ if ( !inc_linear_entries(ptpg) ) \
+ { \
+ put_page(page); \
+ return 0; \
+ } \
+ if ( !inc_linear_uses(page) ) \
+ { \
+ dec_linear_entries(ptpg); \
+ put_page(page); \
+ return 0; \
+ } \
y = page->u.inuse.type_info; \
do { \
x = y; \
@@ -777,6 +852,8 @@ get_##level##_linear_pagetable( \
unlikely((x & (PGT_type_mask|PGT_validated)) != \
(PGT_##level##_page_table|PGT_validated)) ) \
{ \
+ dec_linear_uses(page); \
+ dec_linear_entries(ptpg); \
put_page(page); \
return 0; \
} \
@@ -1201,6 +1278,9 @@ get_page_from_l4e(
l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
} while ( 0 )
+static int _put_page_type(struct page_info *page, bool_t preemptible,
+ struct page_info *ptpg);
+
void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
{
unsigned long pfn = l1e_get_pfn(l1e);
@@ -1270,17 +1350,22 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
if ( l2e_get_flags(l2e) & _PAGE_PSE )
put_superpage(l2e_get_pfn(l2e));
else
- put_page_and_type(l2e_get_page(l2e));
+ {
+ struct page_info *pg = l2e_get_page(l2e);
+ int rc = _put_page_type(pg, 0, mfn_to_page(pfn));
+
+ ASSERT(!rc);
+ put_page(pg);
+ }
return 0;
}
-static int __put_page_type(struct page_info *, int preemptible);
-
static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
int partial, bool_t defer)
{
struct page_info *pg;
+ int rc;
if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
return 1;
@@ -1303,21 +1388,28 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
if ( unlikely(partial > 0) )
{
ASSERT(!defer);
- return __put_page_type(pg, 1);
+ return _put_page_type(pg, 1, mfn_to_page(pfn));
}
if ( defer )
{
+ current->arch.old_guest_ptpg = mfn_to_page(pfn);
current->arch.old_guest_table = pg;
return 0;
}
- return put_page_and_type_preemptible(pg);
+ rc = _put_page_type(pg, 1, mfn_to_page(pfn));
+ if ( likely(!rc) )
+ put_page(pg);
+
+ return rc;
}
static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
int partial, bool_t defer)
{
+ int rc = 1;
+
if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
(l4e_get_pfn(l4e) != pfn) )
{
@@ -1326,18 +1418,22 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
if ( unlikely(partial > 0) )
{
ASSERT(!defer);
- return __put_page_type(pg, 1);
+ return _put_page_type(pg, 1, mfn_to_page(pfn));
}
if ( defer )
{
+ current->arch.old_guest_ptpg = mfn_to_page(pfn);
current->arch.old_guest_table = pg;
return 0;
}
- return put_page_and_type_preemptible(pg);
+ rc = _put_page_type(pg, 1, mfn_to_page(pfn));
+ if ( likely(!rc) )
+ put_page(pg);
}
- return 1;
+
+ return rc;
}
static int alloc_l1_table(struct page_info *page)
@@ -1535,6 +1631,7 @@ static int alloc_l3_table(struct page_info *page)
{
page->nr_validated_ptes = i;
page->partial_pte = 0;
+ current->arch.old_guest_ptpg = NULL;
current->arch.old_guest_table = page;
}
while ( i-- > 0 )
@@ -1627,6 +1724,7 @@ static int alloc_l4_table(struct page_info *page)
{
if ( current->arch.old_guest_table )
page->nr_validated_ptes++;
+ current->arch.old_guest_ptpg = NULL;
current->arch.old_guest_table = page;
}
}
@@ -2369,14 +2467,20 @@ int free_page_type(struct page_info *pag
}
-static int __put_final_page_type(
- struct page_info *page, unsigned long type, int preemptible)
+static int _put_final_page_type(struct page_info *page, unsigned long type,
+ bool_t preemptible, struct page_info *ptpg)
{
int rc = free_page_type(page, type, preemptible);
/* No need for atomic update of type_info here: noone else updates it. */
if ( rc == 0 )
{
+ if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
+ {
+ dec_linear_uses(page);
+ dec_linear_entries(ptpg);
+ }
+ ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
/*
* Record TLB information for flush later. We do not stamp page tables
* when running in shadow mode:
@@ -2412,8 +2516,8 @@ static int __put_final_page_type(
}
-static int __put_page_type(struct page_info *page,
- int preemptible)
+static int _put_page_type(struct page_info *page, bool_t preemptible,
+ struct page_info *ptpg)
{
unsigned long nx, x, y = page->u.inuse.type_info;
int rc = 0;
@@ -2440,12 +2544,28 @@ static int __put_page_type(struct page_info *page,
x, nx)) != x) )
continue;
/* We cleared the 'valid bit' so we do the clean up. */
- rc = __put_final_page_type(page, x, preemptible);
+ rc = _put_final_page_type(page, x, preemptible, ptpg);
+ ptpg = NULL;
if ( x & PGT_partial )
put_page(page);
break;
}
+ if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
+ {
+ /*
+ * page_set_tlbflush_timestamp() accesses the same union
+ * linear_pt_count lives in. Unvalidated page table pages,
+ * however, should occur during domain destruction only
+ * anyway. Updating of linear_pt_count luckily is not
+ * necessary anymore for a dying domain.
+ */
+ ASSERT(page_get_owner(page)->is_dying);
+ ASSERT(page->linear_pt_count < 0);
+ ASSERT(ptpg->linear_pt_count > 0);
+ ptpg = NULL;
+ }
+
/*
* Record TLB information for flush later. We do not stamp page
* tables when running in shadow mode:
@@ -2465,6 +2585,13 @@ static int __put_page_type(struct page_info *page,
return -EINTR;
}
+ if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
+ {
+ ASSERT(!rc);
+ dec_linear_uses(page);
+ dec_linear_entries(ptpg);
+ }
+
return rc;
}
@@ -2599,6 +2726,7 @@ static int __get_page_type(struct page_info *page, unsigned long type,
page->nr_validated_ptes = 0;
page->partial_pte = 0;
}
+ page->linear_pt_count = 0;
rc = alloc_page_type(page, type, preemptible);
}
@@ -2610,7 +2738,7 @@ static int __get_page_type(struct page_info *page, unsigned long type,
void put_page_type(struct page_info *page)
{
- int rc = __put_page_type(page, 0);
+ int rc = _put_page_type(page, 0, NULL);
ASSERT(rc == 0);
(void)rc;
}
@@ -2626,7 +2754,7 @@ int get_page_type(struct page_info *page, unsigned long type)
int put_page_type_preemptible(struct page_info *page)
{
- return __put_page_type(page, 1);
+ return _put_page_type(page, 1, NULL);
}
int get_page_type_preemptible(struct page_info *page, unsigned long type)
@@ -2832,11 +2960,14 @@ int put_old_guest_table(struct vcpu *v)
if ( !v->arch.old_guest_table )
return 0;
- switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table) )
+ switch ( rc = _put_page_type(v->arch.old_guest_table, 1,
+ v->arch.old_guest_ptpg) )
{
case -EINTR:
case -ERESTART:
return -ERESTART;
+ case 0:
+ put_page(v->arch.old_guest_table);
}
v->arch.old_guest_table = NULL;
@@ -2993,6 +3124,7 @@ int new_guest_cr3(unsigned long mfn)
rc = -ERESTART;
/* fallthrough */
case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
curr->arch.old_guest_table = page;
break;
default:
@@ -3260,7 +3392,10 @@ long do_mmuext_op(
if ( type == PGT_l1_page_table )
put_page_and_type(page);
else
+ {
+ curr->arch.old_guest_ptpg = NULL;
curr->arch.old_guest_table = page;
+ }
}
}
@@ -3293,6 +3428,7 @@ long do_mmuext_op(
{
case -EINTR:
case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
curr->arch.old_guest_table = page;
rc = 0;
break;
@@ -3371,6 +3507,7 @@ long do_mmuext_op(
rc = -ERESTART;
/* fallthrough */
case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
curr->arch.old_guest_table = page;
break;
default:
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 165e533ab3..5ef761be8b 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -529,6 +529,8 @@ struct arch_vcpu
pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */
pagetable_t guest_table; /* (MFN) guest notion of cr3 */
struct page_info *old_guest_table; /* partially destructed pagetable */
+ struct page_info *old_guest_ptpg; /* containing page table of the */
+ /* former, if any */
/* guest_table holds a ref to the page, and also a type-count unless
* shadow refcounts are in use */
pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index a30e76db1e..905c7971f2 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -125,11 +125,11 @@ struct page_info
u32 tlbflush_timestamp;
/*
- * When PGT_partial is true then this field is valid and indicates
- * that PTEs in the range [0, @nr_validated_ptes) have been validated.
- * An extra page reference must be acquired (or not dropped) whenever
- * PGT_partial gets set, and it must be dropped when the flag gets
- * cleared. This is so that a get() leaving a page in partially
+ * When PGT_partial is true then the first two fields are valid and
+ * indicate that PTEs in the range [0, @nr_validated_ptes) have been
+ * validated. An extra page reference must be acquired (or not dropped)
+ * whenever PGT_partial gets set, and it must be dropped when the flag
+ * gets cleared. This is so that a get() leaving a page in partially
* validated state (where the caller would drop the reference acquired
* due to the getting of the type [apparently] failing [-ERESTART])
* would not accidentally result in a page left with zero general
@@ -153,10 +153,18 @@ struct page_info
* put_page_from_lNe() (due to the apparent failure), and hence it
* must be dropped when the put operation is resumed (and completes),
* but it must not be acquired if picking up the page for validation.
+ *
+ * The 3rd field, @linear_pt_count, indicates
+ * - by a positive value, how many same-level page table entries a page
+ * table has,
+ * - by a negative value, in how many same-level page tables a page is
+ * in use.
*/
struct {
- u16 nr_validated_ptes;
- s8 partial_pte;
+ u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
+ u16 :16 - PAGETABLE_ORDER - 1 - 2;
+ s16 partial_pte:2;
+ s16 linear_pt_count;
};
/*
@@ -207,6 +215,9 @@ struct page_info
#define PGT_count_width PG_shift(9)
#define PGT_count_mask ((1UL<<PGT_count_width)-1)
+/* Are the 'type mask' bits identical? */
+#define PGT_type_equal(x, y) (!(((x) ^ (y)) & PGT_type_mask))
+
/* Cleared when the owning guest 'frees' this page. */
#define _PGC_allocated PG_shift(1)
#define PGC_allocated PG_mask(1, 1)
--
2.14.1

emulators/xen-kernel/files/0002-x86-enforce-proper-privilege-when-mapping-pIRQ-s.patch

@@ -0,0 +1,66 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: x86: enforce proper privilege when (un)mapping pIRQ-s
(Un)mapping of IRQs, just like other RESOURCE__ADD* / RESOURCE__REMOVE*
actions (in FLASK terms) should be XSM_DM_PRIV rather than XSM_TARGET.
This in turn requires bypassing the XSM check in physdev_unmap_pirq()
for the HVM emuirq case just like is being done in physdev_map_pirq().
The primary security goal, however, is to no longer allow HVM
guests, by specifying their own domain ID instead of DOMID_SELF, to
enter code paths intended for PV guests and the control domains of HVM
guests only.
This is part of XSA-237.
Reported-by: HW42 <hw42@ipsumj.de>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -110,7 +110,7 @@ int physdev_map_pirq(domid_t domid, int
if ( d == NULL )
return -ESRCH;
- ret = xsm_map_domain_pirq(XSM_TARGET, d);
+ ret = xsm_map_domain_pirq(XSM_DM_PRIV, d);
if ( ret )
goto free_domain;
@@ -255,13 +255,14 @@ int physdev_map_pirq(domid_t domid, int
int physdev_unmap_pirq(domid_t domid, int pirq)
{
struct domain *d;
- int ret;
+ int ret = 0;
d = rcu_lock_domain_by_any_id(domid);
if ( d == NULL )
return -ESRCH;
- ret = xsm_unmap_domain_pirq(XSM_TARGET, d);
+ if ( domid != DOMID_SELF || !is_hvm_domain(d) )
+ ret = xsm_unmap_domain_pirq(XSM_DM_PRIV, d);
if ( ret )
goto free_domain;
--- a/xen/include/xsm/dummy.h
+++ b/xen/include/xsm/dummy.h
@@ -453,7 +453,7 @@ static XSM_INLINE char *xsm_show_irq_sid
static XSM_INLINE int xsm_map_domain_pirq(XSM_DEFAULT_ARG struct domain *d)
{
- XSM_ASSERT_ACTION(XSM_TARGET);
+ XSM_ASSERT_ACTION(XSM_DM_PRIV);
return xsm_default_action(action, current->domain, d);
}
@@ -465,7 +465,7 @@ static XSM_INLINE int xsm_map_domain_irq
static XSM_INLINE int xsm_unmap_domain_pirq(XSM_DEFAULT_ARG struct domain *d)
{
- XSM_ASSERT_ACTION(XSM_TARGET);
+ XSM_ASSERT_ACTION(XSM_DM_PRIV);
return xsm_default_action(action, current->domain, d);
}

emulators/xen-kernel/files/0002-x86-mm-Disable-PV-linear-pagetables-by-default.patch

@@ -0,0 +1,82 @@
From 9a4b34729f1bb92eea1e1efe52e6face9f0b17ae Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Fri, 22 Sep 2017 11:46:55 +0100
Subject: [PATCH 2/2] x86/mm: Disable PV linear pagetables by default
Allowing pagetables to point to other pagetables of the same level
(often called 'linear pagetables') has been included in Xen since its
inception. But it is not used by the most common PV guests (Linux,
NetBSD, minios), and has been the source of a number of subtle
reference-counting bugs.
Add a command-line option to control whether PV linear pagetables are
allowed (disabled by default).
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
Changes since v2:
- s/_/-/; in command-line option
- Added __read_mostly
---
docs/misc/xen-command-line.markdown | 15 +++++++++++++++
xen/arch/x86/mm.c | 9 +++++++++
2 files changed, 24 insertions(+)
diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 73f5265fc6..061aff5edc 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1280,6 +1280,21 @@ The following resources are available:
CDP, one COS will corespond two CBMs other than one with CAT, due to the
sum of CBMs is fixed, that means actual `cos_max` in use will automatically
reduce to half when CDP is enabled.
+
+### pv-linear-pt
+> `= <boolean>`
+
+> Default: `false`
+
+Allow PV guests to have pagetable entries pointing to other pagetables
+of the same level (i.e., allowing L2 PTEs to point to other L2 pages).
+This technique is often called "linear pagetables", and is sometimes
+used to allow operating systems a simple way to consistently map the
+current process's pagetables into its own virtual address space.
+
+None of the most common PV operating systems (Linux, NetBSD, MiniOS)
+use this technique, but there may be custom operating systems which
+do.
### reboot
> `= t[riple] | k[bd] | a[cpi] | p[ci] | P[ower] | e[fi] | n[o] [, [w]arm | [c]old]`
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index e81a461b91..f748d4a221 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -799,6 +799,9 @@ static void dec_linear_uses(struct page_info *pg)
* frame if it is mapped by a different root table. This is sufficient and
* also necessary to allow validation of a root table mapping itself.
*/
+static bool_t __read_mostly pv_linear_pt_enable = 0;
+boolean_param("pv-linear-pt", pv_linear_pt_enable);
+
#define define_get_linear_pagetable(level) \
static int \
get_##level##_linear_pagetable( \
@@ -808,6 +811,12 @@ get_##level##_linear_pagetable( \
struct page_info *page; \
unsigned long pfn; \
\
+ if ( !pv_linear_pt_enable ) \
+ { \
+ MEM_LOG("Attempt to create linear p.t. (feature disabled)"); \
+ return 0; \
+ } \
+ \
if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
{ \
MEM_LOG("Attempt to create linear p.t. with write perms"); \
--
2.14.1
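
Should the pv-linear-pt option documented above ever need to be re-enabled on a FreeBSD dom0 built from this port, the natural place is the hypervisor command line — typically the xen_cmdline setting in /boot/loader.conf — by appending pv-linear-pt=true (or =1). This is an illustrative hint under those assumptions, not something the upstream patch prescribes.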

emulators/xen-kernel/files/0003-x86-MSI-disallow-redundant-enabling.patch

@@ -0,0 +1,55 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/MSI: disallow redundant enabling
At the moment, Xen attempts to allow redundant enabling of MSI by
having pci_enable_msi() return 0, and point to the existing MSI
descriptor, when the msi already exists.
Unfortunately, if subsequent errors are encountered, the cleanup
paths assume pci_enable_msi() had done full initialization, and
hence undo everything that was assumed to be done by that
function without also undoing other setup that would normally
occur only after that function was called (in map_domain_pirq()
itself).
Rather than try to make the redundant enabling case work properly, just
forbid it entirely by having pci_enable_msi() return -EEXIST when MSI
is already set up.
This is part of XSA-237.
Reported-by: HW42 <hw42@ipsumj.de>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -1050,11 +1050,10 @@ static int __pci_enable_msi(struct msi_i
old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSI);
if ( old_desc )
{
- printk(XENLOG_WARNING "irq %d already mapped to MSI on %04x:%02x:%02x.%u\n",
+ printk(XENLOG_ERR "irq %d already mapped to MSI on %04x:%02x:%02x.%u\n",
msi->irq, msi->seg, msi->bus,
PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
- *desc = old_desc;
- return 0;
+ return -EEXIST;
}
old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX);
@@ -1118,11 +1117,10 @@ static int __pci_enable_msix(struct msi_
old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX);
if ( old_desc )
{
- printk(XENLOG_WARNING "irq %d already mapped to MSI-X on %04x:%02x:%02x.%u\n",
+ printk(XENLOG_ERR "irq %d already mapped to MSI-X on %04x:%02x:%02x.%u\n",
msi->irq, msi->seg, msi->bus,
PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
- *desc = old_desc;
- return 0;
+ return -EEXIST;
}
old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI);

emulators/xen-kernel/files/0004-x86-IRQ-conditionally-preserve-irq-pirq-mapping-on-error.patch

@@ -0,0 +1,124 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/IRQ: conditionally preserve irq <-> pirq mapping on map error paths
Mappings that had been set up before should not be torn down when
handling unrelated errors.
This is part of XSA-237.
Reported-by: HW42 <hw42@ipsumj.de>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
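
The shape of the fix below — remember exactly which setup steps this call performed and restrict error cleanup to those — can be sketched generically as follows. The helper names here are hypothetical; the real patch tracks the prepared entries with a DECLARE_BITMAP over the MSI vectors.

#include <stdbool.h>
#include <stdio.h>

#define NR_STEPS 8

/* Stand-ins for prepare_domain_irq_pirq()/cleanup_domain_irq_pirq(). */
static int do_setup(int idx)    { return idx == 5 ? -1 : 0; }  /* fail at step 5 */
static void undo_setup(int idx) { printf("undoing step %d\n", idx); }

int setup_many(int n)
{
    bool prepared[NR_STEPS] = { false };
    int i, rc = 0;

    for ( i = 0; i < n && i < NR_STEPS; i++ )
    {
        rc = do_setup(i);
        if ( rc < 0 )
            break;
        prepared[i] = true;            /* record only what this call did */
    }

    if ( rc < 0 )
        while ( i-- > 0 )
            if ( prepared[i] )         /* undo no more than was prepared here */
                undo_setup(i);

    return rc;
}

int main(void)
{
    return setup_many(NR_STEPS) < 0;   /* demonstrates the partial rollback */
}

Calling setup_many(NR_STEPS) rolls back only the steps it completed itself once step 5 fails, which is exactly the behaviour the patch wants for the pIRQ error paths.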
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1252,7 +1252,8 @@ static int prepare_domain_irq_pirq(struc
return -ENOMEM;
}
*pinfo = info;
- return 0;
+
+ return !!err;
}
static void set_domain_irq_pirq(struct domain *d, int irq, struct pirq *pirq)
@@ -1295,7 +1296,10 @@ int init_domain_irq_mapping(struct domai
continue;
err = prepare_domain_irq_pirq(d, i, i, &info);
if ( err )
+ {
+ ASSERT(err < 0);
break;
+ }
set_domain_irq_pirq(d, i, info);
}
@@ -1903,6 +1907,7 @@ int map_domain_pirq(
struct pirq *info;
struct irq_desc *desc;
unsigned long flags;
+ DECLARE_BITMAP(prepared, MAX_MSI_IRQS) = {};
ASSERT(spin_is_locked(&d->event_lock));
@@ -1946,8 +1951,10 @@ int map_domain_pirq(
}
ret = prepare_domain_irq_pirq(d, irq, pirq, &info);
- if ( ret )
+ if ( ret < 0 )
goto revoke;
+ if ( !ret )
+ __set_bit(0, prepared);
desc = irq_to_desc(irq);
@@ -2019,8 +2026,10 @@ int map_domain_pirq(
irq = create_irq(NUMA_NO_NODE);
ret = irq >= 0 ? prepare_domain_irq_pirq(d, irq, pirq + nr, &info)
: irq;
- if ( ret )
+ if ( ret < 0 )
break;
+ if ( !ret )
+ __set_bit(nr, prepared);
msi_desc[nr].irq = irq;
if ( irq_permit_access(d, irq) != 0 )
@@ -2053,15 +2062,15 @@ int map_domain_pirq(
desc->msi_desc = NULL;
spin_unlock_irqrestore(&desc->lock, flags);
}
- while ( nr-- )
+ while ( nr )
{
if ( irq >= 0 && irq_deny_access(d, irq) )
printk(XENLOG_G_ERR
"dom%d: could not revoke access to IRQ%d (pirq %d)\n",
d->domain_id, irq, pirq);
- if ( info )
+ if ( info && test_bit(nr, prepared) )
cleanup_domain_irq_pirq(d, irq, info);
- info = pirq_info(d, pirq + nr);
+ info = pirq_info(d, pirq + --nr);
irq = info->arch.irq;
}
msi_desc->irq = -1;
@@ -2077,12 +2086,14 @@ int map_domain_pirq(
spin_lock_irqsave(&desc->lock, flags);
set_domain_irq_pirq(d, irq, info);
spin_unlock_irqrestore(&desc->lock, flags);
+ ret = 0;
}
done:
if ( ret )
{
- cleanup_domain_irq_pirq(d, irq, info);
+ if ( test_bit(0, prepared) )
+ cleanup_domain_irq_pirq(d, irq, info);
revoke:
if ( irq_deny_access(d, irq) )
printk(XENLOG_G_ERR
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -185,7 +185,7 @@ int physdev_map_pirq(domid_t domid, int
}
else if ( type == MAP_PIRQ_TYPE_MULTI_MSI )
{
- if ( msi->entry_nr <= 0 || msi->entry_nr > 32 )
+ if ( msi->entry_nr <= 0 || msi->entry_nr > MAX_MSI_IRQS )
ret = -EDOM;
else if ( msi->entry_nr != 1 && !iommu_intremap )
ret = -EOPNOTSUPP;
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -55,6 +55,8 @@
/* MAX fixed pages reserved for mapping MSIX tables. */
#define FIX_MSIX_MAX_PAGES 512
+#define MAX_MSI_IRQS 32 /* limited by MSI capability struct properties */
+
struct msi_info {
u16 seg;
u8 bus;

emulators/xen-kernel/files/0005-x86-FLASK-fix-unmap-domain-IRQ-XSM-hook.patch

@@ -0,0 +1,37 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/FLASK: fix unmap-domain-IRQ XSM hook
The caller and the FLASK implementation of xsm_unmap_domain_irq()
disagreed about what the "data" argument points to in the MSI case:
Change both sides to pass/take a PCI device.
This is part of XSA-237.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -2144,7 +2144,8 @@ int unmap_domain_pirq(struct domain *d,
nr = msi_desc->msi.nvec;
}
- ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, msi_desc);
+ ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq,
+ msi_desc ? msi_desc->dev : NULL);
if ( ret )
goto done;
--- a/xen/xsm/flask/hooks.c
+++ b/xen/xsm/flask/hooks.c
@@ -915,8 +915,8 @@ static int flask_unmap_domain_msi (struc
u32 *sid, struct avc_audit_data *ad)
{
#ifdef CONFIG_HAS_PCI
- struct msi_info *msi = data;
- u32 machine_bdf = (msi->seg << 16) | (msi->bus << 8) | msi->devfn;
+ const struct pci_dev *pdev = data;
+ u32 machine_bdf = (pdev->seg << 16) | (pdev->bus << 8) | pdev->devfn;
AVC_AUDIT_DATA_INIT(ad, DEV);
ad->device = machine_bdf;

emulators/xen-kernel/files/xsa238.patch

@@ -0,0 +1,45 @@
From cdc2887076b19b39fab9faec495082586f3113df Mon Sep 17 00:00:00 2001
From: XenProject Security Team <security@xenproject.org>
Date: Tue, 5 Sep 2017 13:41:37 +0200
Subject: x86/ioreq server: correctly handle bogus
XEN_DMOP_{,un}map_io_range_to_ioreq_server arguments
A misbehaving device model can pass incorrect XEN_DMOP_map/
unmap_io_range_to_ioreq_server arguments, namely end < start when
specifying address range. When this happens we hit ASSERT(s <= e) in
rangeset_contains_range()/rangeset_overlaps_range() with debug builds.
Production builds will not trap right away but may misbehave later
while handling such bogus ranges.
This is XSA-238.
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
xen/arch/x86/hvm/ioreq.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index b2a8b0e986..8c8bf1f0ec 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -820,6 +820,9 @@ int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id,
struct hvm_ioreq_server *s;
int rc;
+ if ( start > end )
+ return -EINVAL;
+
spin_lock_recursive(&d->arch.hvm_domain.ioreq_server.lock);
rc = -ENOENT;
@@ -872,6 +875,9 @@ int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
struct hvm_ioreq_server *s;
int rc;
+ if ( start > end )
+ return -EINVAL;
+
spin_lock_recursive(&d->arch.hvm_domain.ioreq_server.lock);
rc = -ENOENT;

emulators/xen-kernel/files/xsa239.patch

@@ -0,0 +1,46 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/HVM: prefill partially used variable on emulation paths
Certain handlers ignore the access size (vioapic_write() being the
example this was found with), perhaps leading to subsequent reads
seeing data that wasn't actually written by the guest. For
consistency and extra safety also do this on the read path of
hvm_process_io_intercept(), even if this doesn't directly affect what
guests get to see, as we've supposedly already dealt with read handlers
leaving data completely uninitialized.
This is XSA-239.
Reported-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
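
As a self-contained illustration of why the prefill matters (a hypothetical narrow handler and a little-endian host are assumed; this is not Xen code): a handler that fills only "size" bytes of a wider variable leaves the remaining bytes holding whatever was there before, unless the caller clears the variable first.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical 1-byte "device register" read that honours only `size`. */
static void narrow_read(void *dst, unsigned int size)
{
    const uint8_t reg = 0xab;

    memcpy(dst, &reg, size < sizeof(reg) ? size : sizeof(reg));
}

int main(void)
{
    uint64_t data = 0xffffffffffffffffULL;   /* stale value from an earlier iteration */

    narrow_read(&data, 1);                   /* without the fix: upper bytes leak */
    printf("unsanitised: 0x%" PRIx64 "\n", data);

    data = 0;                                /* the XSA-239 approach: prefill */
    narrow_read(&data, 1);
    printf("prefilled:   0x%" PRIx64 "\n", data);

    return 0;
}

On a little-endian machine the first call prints 0xffffffffffffffab, the second 0xab.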
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -129,7 +129,7 @@ static int hvmemul_do_io(
.count = *reps,
.dir = dir,
.df = df,
- .data = data,
+ .data = data_is_addr ? data : 0,
.data_is_ptr = data_is_addr, /* ioreq_t field name is misleading */
.state = STATE_IOREQ_READY,
};
--- a/xen/arch/x86/hvm/intercept.c
+++ b/xen/arch/x86/hvm/intercept.c
@@ -127,6 +127,7 @@ int hvm_process_io_intercept(const struc
addr = (p->type == IOREQ_TYPE_COPY) ?
p->addr + step * i :
p->addr;
+ data = 0;
rc = ops->read(handler, addr, p->size, &data);
if ( rc != X86EMUL_OKAY )
break;
@@ -161,6 +162,7 @@ int hvm_process_io_intercept(const struc
{
if ( p->data_is_ptr )
{
+ data = 0;
switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
p->size) )
{

emulators/xen-kernel/files/xsa241-4.8.patch

@@ -0,0 +1,120 @@
x86: don't store possibly stale TLB flush time stamp
While the timing window is extremely narrow, it is theoretically
possible for an update to the TLB flush clock and a subsequent flush
IPI to happen between the read and write parts of the update of the
per-page stamp. Exclude this possibility by disabling interrupts
across the update, preventing the IPI from being serviced in the middle.
This is XSA-241.
Reported-by: Jann Horn <jannh@google.com>
Suggested-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
--- a/xen/arch/arm/smp.c
+++ b/xen/arch/arm/smp.c
@@ -1,4 +1,5 @@
#include <xen/config.h>
+#include <xen/mm.h>
#include <asm/system.h>
#include <asm/smp.h>
#include <asm/cpregs.h>
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2524,7 +2524,7 @@ static int _put_final_page_type(struct p
*/
if ( !(shadow_mode_enabled(page_get_owner(page)) &&
(page->count_info & PGC_page_table)) )
- page->tlbflush_timestamp = tlbflush_current_time();
+ page_set_tlbflush_timestamp(page);
wmb();
page->u.inuse.type_info--;
}
@@ -2534,7 +2534,7 @@ static int _put_final_page_type(struct p
(PGT_count_mask|PGT_validated|PGT_partial)) == 1);
if ( !(shadow_mode_enabled(page_get_owner(page)) &&
(page->count_info & PGC_page_table)) )
- page->tlbflush_timestamp = tlbflush_current_time();
+ page_set_tlbflush_timestamp(page);
wmb();
page->u.inuse.type_info |= PGT_validated;
}
@@ -2588,7 +2588,7 @@ static int _put_page_type(struct page_in
if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
{
/*
- * page_set_tlbflush_timestamp() accesses the same union
+ * set_tlbflush_timestamp() accesses the same union
* linear_pt_count lives in. Unvalidated page table pages,
* however, should occur during domain destruction only
* anyway. Updating of linear_pt_count luckily is not
@@ -2609,7 +2609,7 @@ static int _put_page_type(struct page_in
*/
if ( !(shadow_mode_enabled(page_get_owner(page)) &&
(page->count_info & PGC_page_table)) )
- page->tlbflush_timestamp = tlbflush_current_time();
+ page_set_tlbflush_timestamp(page);
}
if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -1464,7 +1464,7 @@ void shadow_free(struct domain *d, mfn_t
* TLBs when we reuse the page. Because the destructors leave the
* contents of the pages in place, we can delay TLB flushes until
* just before the allocator hands the page out again. */
- sp->tlbflush_timestamp = tlbflush_current_time();
+ page_set_tlbflush_timestamp(sp);
perfc_decr(shadow_alloc_count);
page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
sp = next;
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -960,7 +960,7 @@ static void free_heap_pages(
/* If a page has no owner it will need no safety TLB flush. */
pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
if ( pg[i].u.free.need_tlbflush )
- pg[i].tlbflush_timestamp = tlbflush_current_time();
+ page_set_tlbflush_timestamp(&pg[i]);
/* This page is not a guest frame any more. */
page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
--- a/xen/include/asm-arm/flushtlb.h
+++ b/xen/include/asm-arm/flushtlb.h
@@ -12,6 +12,11 @@ static inline void tlbflush_filter(cpuma
#define tlbflush_current_time() (0)
+static inline void page_set_tlbflush_timestamp(struct page_info *page)
+{
+ page->tlbflush_timestamp = tlbflush_current_time();
+}
+
#if defined(CONFIG_ARM_32)
# include <asm/arm32/flushtlb.h>
#elif defined(CONFIG_ARM_64)
--- a/xen/include/asm-x86/flushtlb.h
+++ b/xen/include/asm-x86/flushtlb.h
@@ -23,6 +23,20 @@ DECLARE_PER_CPU(u32, tlbflush_time);
#define tlbflush_current_time() tlbflush_clock
+static inline void page_set_tlbflush_timestamp(struct page_info *page)
+{
+ /*
+ * Prevent storing a stale time stamp, which could happen if an update
+ * to tlbflush_clock plus a subsequent flush IPI happen between the
+ * reading of tlbflush_clock and the writing of the struct page_info
+ * field.
+ */
+ ASSERT(local_irq_is_enabled());
+ local_irq_disable();
+ page->tlbflush_timestamp = tlbflush_current_time();
+ local_irq_enable();
+}
+
/*
* @cpu_stamp is the timestamp at last TLB flush for the CPU we are testing.
* @lastuse_stamp is a timestamp taken when the PFN we are testing was last

emulators/xen-kernel/files/xsa242-4.9.patch

@@ -0,0 +1,43 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: x86: don't allow page_unlock() to drop the last type reference
Only _put_page_type() does the necessary cleanup, and hence not all
domain pages can be released during guest cleanup (leaving around
zombie domains) if we get this wrong.
This is XSA-242.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1923,7 +1923,11 @@ void page_unlock(struct page_info *page)
do {
x = y;
+ ASSERT((x & PGT_count_mask) && (x & PGT_locked));
+
nx = x - (1 | PGT_locked);
+ /* We must not drop the last reference here. */
+ ASSERT(nx & PGT_count_mask);
} while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
}
@@ -2611,6 +2615,17 @@ static int _put_page_type(struct page_in
(page->count_info & PGC_page_table)) )
page_set_tlbflush_timestamp(page);
}
+ else if ( unlikely((nx & (PGT_locked | PGT_count_mask)) ==
+ (PGT_locked | 1)) )
+ {
+ /*
+ * We must not drop the second to last reference when the page is
+ * locked, as page_unlock() doesn't do any cleanup of the type.
+ */
+ cpu_relax();
+ y = page->u.inuse.type_info;
+ continue;
+ }
if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
break;

emulators/xen-kernel/files/xsa243-4.7.patch

@@ -0,0 +1,93 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/shadow: Don't create self-linear shadow mappings for 4-level translated guests
When initially creating a monitor table for 4-level translated guests, don't
install a shadow-linear mapping. This mapping is actually self-linear, and
trips up the writeable heuristic logic into following Xen's mappings, not the
guests' shadows it was expecting to follow.
A consequence of this is that sh_guess_wrmap() needs to cope with there being
no shadow-linear mapping present, which in practice occurs once each time a
vcpu switches to 4-level paging from a different paging mode.
An appropriate shadow-linear slot will be inserted into the monitor table
either while constructing lower level monitor tables, or by sh_update_cr3().
While fixing this, clarify the safety of the other mappings. Despite
appearing unsafe, it is correct to create a guest-linear mapping for
translated domains; this is self-linear and doesn't point into the translated
domain. Drop a dead clause for translate != external guests.
This is XSA-243.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Tim Deegan <tim@xen.org>
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 428be37..c83932f 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -1472,26 +1472,38 @@ void sh_install_xen_entries_in_l4(struct domain *d, mfn_t gl4mfn, mfn_t sl4mfn)
sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] = shadow_l4e_empty();
}
- /* Shadow linear mapping for 4-level shadows. N.B. for 3-level
- * shadows on 64-bit xen, this linear mapping is later replaced by the
- * monitor pagetable structure, which is built in make_monitor_table
- * and maintained by sh_update_linear_entries. */
- sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
- shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
-
- /* Self linear mapping. */
- if ( shadow_mode_translate(d) && !shadow_mode_external(d) )
+ /*
+ * Linear mapping slots:
+ *
+ * Calling this function with gl4mfn == sl4mfn is used to construct a
+ * monitor table for translated domains. In this case, gl4mfn forms the
+ * self-linear mapping (i.e. not pointing into the translated domain), and
+ * the shadow-linear slot is skipped. The shadow-linear slot is either
+ * filled when constructing lower level monitor tables, or via
+ * sh_update_cr3() for 4-level guests.
+ *
+ * Calling this function with gl4mfn != sl4mfn is used for non-translated
+ * guests, where the shadow-linear slot is actually self-linear, and the
+ * guest-linear slot points into the guests view of its pagetables.
+ */
+ if ( shadow_mode_translate(d) )
{
- // linear tables may not be used with translated PV guests
- sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
+ ASSERT(mfn_x(gl4mfn) == mfn_x(sl4mfn));
+
+ sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
shadow_l4e_empty();
}
else
{
- sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
- shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
+ ASSERT(mfn_x(gl4mfn) != mfn_x(sl4mfn));
+
+ sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
}
+ sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
+ shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
+
unmap_domain_page(sl4e);
}
#endif
@@ -4293,6 +4305,11 @@ static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
/* Carefully look in the shadow linear map for the l1e we expect */
#if SHADOW_PAGING_LEVELS >= 4
+ /* Is a shadow linear map installed in the first place? */
+ sl4p = v->arch.paging.shadow.guest_vtable;
+ sl4p += shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
+ if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
+ return 0;
sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
return 0;

emulators/xen-kernel/files/xsa244-4.7.patch

@@ -0,0 +1,51 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/cpu: fix IST handling during PCPU bringup
Clear IST references in newly allocated IDTs. Nothing good will come of
having them set before the TSS is suitably constructed (although the chances
of the CPU surviving such an IST interrupt/exception are extremely slim).
Uniformly set the IST references after the TSS is in place. This fixes an
issue on AMD hardware, where onlining a PCPU while PCPU0 is in HVM context
will cause IST_NONE to be copied into the new IDT, making that PCPU vulnerable
to privilege escalation from PV guests until it subsequently schedules an HVM
guest.
This is XSA-244.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -617,6 +617,7 @@ void __init early_cpu_init(void)
* - Sets up TSS with stack pointers, including ISTs
* - Inserts TSS selector into regular and compat GDTs
* - Loads GDT, IDT, TR then null LDT
+ * - Sets up IST references in the IDT
*/
void load_system_tables(void)
{
@@ -663,6 +664,10 @@ void load_system_tables(void)
asm volatile ("lidt %0" : : "m" (idtr) );
asm volatile ("ltr %w0" : : "rm" (TSS_ENTRY << 3) );
asm volatile ("lldt %w0" : : "rm" (0) );
+
+ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF);
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
}
/*
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -715,6 +715,9 @@ static int cpu_smpboot_alloc(unsigned in
if ( idt_tables[cpu] == NULL )
goto oom;
memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
+ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )