320 lines
12 KiB
Diff
320 lines
12 KiB
Diff
From: Matt Fleming <matt@codeblueprint.co.uk>
|
|
Date: Fri, 27 Nov 2015 21:09:34 +0000
|
|
Subject: [5/5] x86/efi: Build our own page table structures
|
|
Origin: https://git.kernel.org/cgit/linux/kernel/git/mfleming/efi.git/commit?id=67a9108ed4313b85a9c53406d80dc1ae3f8c3e36
|
|
|
|
With commit e1a58320a38d ("x86/mm: Warn on W^X mappings") all
|
|
users booting on 64-bit UEFI machines see the following warning:
|
|
|
|
------------[ cut here ]------------
|
|
WARNING: CPU: 7 PID: 1 at arch/x86/mm/dump_pagetables.c:225 note_page+0x5dc/0x780()
|
|
x86/mm: Found insecure W+X mapping at address ffff88000005f000/0xffff88000005f000
|
|
...
|
|
x86/mm: Checked W+X mappings: FAILED, 165660 W+X pages found.
|
|
...
|
|
|
|
This is caused by mapping EFI regions with RWX permissions.
|
|
There isn't much we can do to restrict the permissions for these
|
|
regions due to the way the firmware toolchains mix code and
|
|
data, but we can at least isolate these mappings so that they do
|
|
not appear in the regular kernel page tables.
|
|
|
|
In commit d2f7cbe7b26a ("x86/efi: Runtime services virtual
|
|
mapping") we started using 'trampoline_pgd' to map the EFI
|
|
regions because there was an existing identity mapping there
|
|
which we use during the SetVirtualAddressMap() call and for
|
|
broken firmware that accesses those addresses.
|
|
|
|
But 'trampoline_pgd' shares some PGD entries with
|
|
'swapper_pg_dir' and does not provide the isolation we require.
|
|
Notably the virtual addresses for __START_KERNEL_map and
|
|
MODULES_START are mapped by the same PGD entry so we need to be
|
|
more careful when copying changes over in
|
|
efi_sync_low_kernel_mappings().
|
|
|
|
This patch doesn't go the full mile; we still want to share some
|
|
PGD entries with 'swapper_pg_dir'. Having completely separate
|
|
page tables brings its own issues such as synchronising new
|
|
mappings after memory hotplug and module loading. Sharing also
|
|
keeps memory usage down.
|
|
|
|
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
|
|
Reviewed-by: Borislav Petkov <bp@suse.de>
|
|
Acked-by: Borislav Petkov <bp@suse.de>
|
|
Cc: Andrew Morton <akpm@linux-foundation.org>
|
|
Cc: Andy Lutomirski <luto@amacapital.net>
|
|
Cc: Andy Lutomirski <luto@kernel.org>
|
|
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
|
|
Cc: Borislav Petkov <bp@alien8.de>
|
|
Cc: Brian Gerst <brgerst@gmail.com>
|
|
Cc: Dave Jones <davej@codemonkey.org.uk>
|
|
Cc: Denys Vlasenko <dvlasenk@redhat.com>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
|
Cc: Peter Zijlstra <peterz@infradead.org>
|
|
Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
|
|
Cc: Stephen Smalley <sds@tycho.nsa.gov>
|
|
Cc: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: Toshi Kani <toshi.kani@hp.com>
|
|
Cc: linux-efi@vger.kernel.org
|
|
Link: http://lkml.kernel.org/r/1448658575-17029-6-git-send-email-matt@codeblueprint.co.uk
|
|
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
|
---
|
|
arch/x86/include/asm/efi.h | 1 +
|
|
arch/x86/platform/efi/efi.c | 39 ++++++-----------
|
|
arch/x86/platform/efi/efi_32.c | 5 +++
|
|
arch/x86/platform/efi/efi_64.c | 97 +++++++++++++++++++++++++++++++++++-------
|
|
4 files changed, 102 insertions(+), 40 deletions(-)
|
|
|
|
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
|
|
index 347eeacb06a8..8fd9e637629a 100644
|
|
--- a/arch/x86/include/asm/efi.h
|
|
+++ b/arch/x86/include/asm/efi.h
|
|
@@ -136,6 +136,7 @@ extern void __init efi_memory_uc(u64 addr, unsigned long size);
|
|
extern void __init efi_map_region(efi_memory_desc_t *md);
|
|
extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
|
|
extern void efi_sync_low_kernel_mappings(void);
|
|
+extern int __init efi_alloc_page_tables(void);
|
|
extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
|
|
extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages);
|
|
extern void __init old_map_region(efi_memory_desc_t *md);
|
|
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
|
|
index ad285404ea7f..3c1f3cd7b2ba 100644
|
|
--- a/arch/x86/platform/efi/efi.c
|
|
+++ b/arch/x86/platform/efi/efi.c
|
|
@@ -869,7 +869,7 @@ static void __init kexec_enter_virtual_mode(void)
|
|
* This function will switch the EFI runtime services to virtual mode.
|
|
* Essentially, we look through the EFI memmap and map every region that
|
|
* has the runtime attribute bit set in its memory descriptor into the
|
|
- * ->trampoline_pgd page table using a top-down VA allocation scheme.
|
|
+ * efi_pgd page table.
|
|
*
|
|
* The old method which used to update that memory descriptor with the
|
|
* virtual address obtained from ioremap() is still supported when the
|
|
@@ -879,8 +879,8 @@ static void __init kexec_enter_virtual_mode(void)
|
|
*
|
|
* The new method does a pagetable switch in a preemption-safe manner
|
|
* so that we're in a different address space when calling a runtime
|
|
- * function. For function arguments passing we do copy the PGDs of the
|
|
- * kernel page table into ->trampoline_pgd prior to each call.
|
|
+ * function. For function arguments passing we do copy the PUDs of the
|
|
+ * kernel page table into efi_pgd prior to each call.
|
|
*
|
|
* Specially for kexec boot, efi runtime maps in previous kernel should
|
|
* be passed in via setup_data. In that case runtime ranges will be mapped
|
|
@@ -895,6 +895,12 @@ static void __init __efi_enter_virtual_mode(void)
|
|
|
|
efi.systab = NULL;
|
|
|
|
+ if (efi_alloc_page_tables()) {
|
|
+ pr_err("Failed to allocate EFI page tables\n");
|
|
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
|
|
+ return;
|
|
+ }
|
|
+
|
|
efi_merge_regions();
|
|
new_memmap = efi_map_regions(&count, &pg_shift);
|
|
if (!new_memmap) {
|
|
@@ -954,28 +960,11 @@ static void __init __efi_enter_virtual_mode(void)
|
|
efi_runtime_mkexec();
|
|
|
|
/*
|
|
- * We mapped the descriptor array into the EFI pagetable above but we're
|
|
- * not unmapping it here. Here's why:
|
|
- *
|
|
- * We're copying select PGDs from the kernel page table to the EFI page
|
|
- * table and when we do so and make changes to those PGDs like unmapping
|
|
- * stuff from them, those changes appear in the kernel page table and we
|
|
- * go boom.
|
|
- *
|
|
- * From setup_real_mode():
|
|
- *
|
|
- * ...
|
|
- * trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
|
|
- *
|
|
- * In this particular case, our allocation is in PGD 0 of the EFI page
|
|
- * table but we've copied that PGD from PGD[272] of the EFI page table:
|
|
- *
|
|
- * pgd_index(__PAGE_OFFSET = 0xffff880000000000) = 272
|
|
- *
|
|
- * where the direct memory mapping in kernel space is.
|
|
- *
|
|
- * new_memmap's VA comes from that direct mapping and thus clearing it,
|
|
- * it would get cleared in the kernel page table too.
|
|
+ * We mapped the descriptor array into the EFI pagetable above
|
|
+ * but we're not unmapping it here because if we're running in
|
|
+ * EFI mixed mode we need all of memory to be accessible when
|
|
+ * we pass parameters to the EFI runtime services in the
|
|
+ * thunking code.
|
|
*
|
|
* efi_cleanup_page_tables(__pa(new_memmap), 1 << pg_shift);
|
|
*/
|
|
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
|
|
index ed5b67338294..58d669bc8250 100644
|
|
--- a/arch/x86/platform/efi/efi_32.c
|
|
+++ b/arch/x86/platform/efi/efi_32.c
|
|
@@ -38,6 +38,11 @@
|
|
* say 0 - 3G.
|
|
*/
|
|
|
|
+int __init efi_alloc_page_tables(void)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
void efi_sync_low_kernel_mappings(void) {}
|
|
void __init efi_dump_pagetable(void) {}
|
|
int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
|
|
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
|
|
index b19cdac959b2..4897f518760f 100644
|
|
--- a/arch/x86/platform/efi/efi_64.c
|
|
+++ b/arch/x86/platform/efi/efi_64.c
|
|
@@ -40,6 +40,7 @@
|
|
#include <asm/fixmap.h>
|
|
#include <asm/realmode.h>
|
|
#include <asm/time.h>
|
|
+#include <asm/pgalloc.h>
|
|
|
|
/*
|
|
* We allocate runtime services regions bottom-up, starting from -4G, i.e.
|
|
@@ -121,22 +122,92 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
|
|
early_code_mapping_set_exec(0);
|
|
}
|
|
|
|
+static pgd_t *efi_pgd;
|
|
+
|
|
+/*
|
|
+ * We need our own copy of the higher levels of the page tables
|
|
+ * because we want to avoid inserting EFI region mappings (EFI_VA_END
|
|
+ * to EFI_VA_START) into the standard kernel page tables. Everything
|
|
+ * else can be shared, see efi_sync_low_kernel_mappings().
|
|
+ */
|
|
+int __init efi_alloc_page_tables(void)
|
|
+{
|
|
+ pgd_t *pgd;
|
|
+ pud_t *pud;
|
|
+ gfp_t gfp_mask;
|
|
+
|
|
+ if (efi_enabled(EFI_OLD_MEMMAP))
|
|
+ return 0;
|
|
+
|
|
+ gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO;
|
|
+ efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
|
|
+ if (!efi_pgd)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ pgd = efi_pgd + pgd_index(EFI_VA_END);
|
|
+
|
|
+ pud = pud_alloc_one(NULL, 0);
|
|
+ if (!pud) {
|
|
+ free_page((unsigned long)efi_pgd);
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+
|
|
+ pgd_populate(NULL, pgd, pud);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
/*
|
|
* Add low kernel mappings for passing arguments to EFI functions.
|
|
*/
|
|
void efi_sync_low_kernel_mappings(void)
|
|
{
|
|
- unsigned num_pgds;
|
|
- pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
|
|
+ unsigned num_entries;
|
|
+ pgd_t *pgd_k, *pgd_efi;
|
|
+ pud_t *pud_k, *pud_efi;
|
|
|
|
if (efi_enabled(EFI_OLD_MEMMAP))
|
|
return;
|
|
|
|
- num_pgds = pgd_index(MODULES_END - 1) - pgd_index(PAGE_OFFSET);
|
|
+ /*
|
|
+ * We can share all PGD entries apart from the one entry that
|
|
+ * covers the EFI runtime mapping space.
|
|
+ *
|
|
+ * Make sure the EFI runtime region mappings are guaranteed to
|
|
+ * only span a single PGD entry and that the entry also maps
|
|
+ * other important kernel regions.
|
|
+ */
|
|
+ BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END));
|
|
+ BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) !=
|
|
+ (EFI_VA_END & PGDIR_MASK));
|
|
+
|
|
+ pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET);
|
|
+ pgd_k = pgd_offset_k(PAGE_OFFSET);
|
|
+
|
|
+ num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET);
|
|
+ memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries);
|
|
|
|
- memcpy(pgd + pgd_index(PAGE_OFFSET),
|
|
- init_mm.pgd + pgd_index(PAGE_OFFSET),
|
|
- sizeof(pgd_t) * num_pgds);
|
|
+ /*
|
|
+ * We share all the PUD entries apart from those that map the
|
|
+ * EFI regions. Copy around them.
|
|
+ */
|
|
+ BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0);
|
|
+ BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0);
|
|
+
|
|
+ pgd_efi = efi_pgd + pgd_index(EFI_VA_END);
|
|
+ pud_efi = pud_offset(pgd_efi, 0);
|
|
+
|
|
+ pgd_k = pgd_offset_k(EFI_VA_END);
|
|
+ pud_k = pud_offset(pgd_k, 0);
|
|
+
|
|
+ num_entries = pud_index(EFI_VA_END);
|
|
+ memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
|
|
+
|
|
+ pud_efi = pud_offset(pgd_efi, EFI_VA_START);
|
|
+ pud_k = pud_offset(pgd_k, EFI_VA_START);
|
|
+
|
|
+ num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START);
|
|
+ memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
|
|
}
|
|
|
|
int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
|
|
@@ -150,8 +221,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
|
|
if (efi_enabled(EFI_OLD_MEMMAP))
|
|
return 0;
|
|
|
|
- efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
|
|
- pgd = __va(efi_scratch.efi_pgt);
|
|
+ efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd);
|
|
+ pgd = efi_pgd;
|
|
|
|
/*
|
|
* It can happen that the physical address of new_memmap lands in memory
|
|
@@ -216,16 +287,14 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
|
|
|
|
void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
|
|
{
|
|
- pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
|
|
-
|
|
- kernel_unmap_pages_in_pgd(pgd, pa_memmap, num_pages);
|
|
+ kernel_unmap_pages_in_pgd(efi_pgd, pa_memmap, num_pages);
|
|
}
|
|
|
|
static void __init __map_region(efi_memory_desc_t *md, u64 va)
|
|
{
|
|
- pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
|
|
unsigned long flags = 0;
|
|
unsigned long pfn;
|
|
+ pgd_t *pgd = efi_pgd;
|
|
|
|
if (!(md->attribute & EFI_MEMORY_WB))
|
|
flags |= _PAGE_PCD;
|
|
@@ -334,9 +403,7 @@ void __init efi_runtime_mkexec(void)
|
|
void __init efi_dump_pagetable(void)
|
|
{
|
|
#ifdef CONFIG_EFI_PGT_DUMP
|
|
- pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
|
|
-
|
|
- ptdump_walk_pgd_level(NULL, pgd);
|
|
+ ptdump_walk_pgd_level(NULL, efi_pgd);
|
|
#endif
|
|
}
|
|
|