From c7f9edd8f545dba40fb663144d65c3a7cdc64350 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Thu, 5 Jan 2017 18:08:40 -0800 Subject: [PATCH] kernel - Add NUMA awareness to vm_page_alloc() and related functions * Add NUMA awareness to the kernel memory subsystem. This first iteration will primarily affect user pages. kmalloc and objcache are not NUMA-friendly yet (and it's questionable how useful it would be to make them so). * Tested with synth on monster (4-socket opteron / 48 cores) and a 2-socket xeon (32 threads). Appears to dole out localized pages 5:1 to 10:1. --- sys/kern/subr_cpu_topology.c | 29 ++--- sys/platform/pc64/acpica/acpi_srat.c | 148 ++++++++++++++++++++++++++ sys/platform/pc64/conf/files | 1 + sys/platform/pc64/include/smp.h | 1 + sys/platform/pc64/x86_64/mp_machdep.c | 6 ++ sys/platform/vkernel64/include/smp.h | 1 + sys/sys/kernel.h | 3 +- sys/vm/vm_page.c | 81 ++++++++++++++ sys/vm/vm_page.h | 1 + 9 files changed, 258 insertions(+), 13 deletions(-) create mode 100644 sys/platform/pc64/acpica/acpi_srat.c diff --git a/sys/kern/subr_cpu_topology.c b/sys/kern/subr_cpu_topology.c index 557733c3bb..a59821084e 100644 --- a/sys/kern/subr_cpu_topology.c +++ b/sys/kern/subr_cpu_topology.c @@ -165,7 +165,7 @@ migrate_elements(cpu_node_t **a, int n, int pos) * BSP. When we found a match, at that level the CPUs are siblings. 
*/ static void -build_cpu_topology(void) +build_cpu_topology(int assumed_ncpus) { detect_cpu_topology(); int i; @@ -184,7 +184,7 @@ build_cpu_topology(void) * Find the number of siblings within chip * and witin core to build up the topology */ - for (i = 0; i < ncpus; i++) { + for (i = 0; i < assumed_ncpus; i++) { cpumask_t mask; CPUMASK_ASSBIT(mask, i); @@ -203,7 +203,7 @@ build_cpu_topology(void) } cores_per_chip /= threads_per_core; - chips_per_package = ncpus / (cores_per_chip * threads_per_core); + chips_per_package = assumed_ncpus / (cores_per_chip * threads_per_core); if (bootverbose) kprintf("CPU Topology: cores_per_chip: %d; threads_per_core: %d; chips_per_package: %d;\n", @@ -278,7 +278,7 @@ build_cpu_topology(void) bzero(visited, MAXCPU * sizeof(int)); - for (i = 0; i < ncpus; i++) { + for (i = 0; i < assumed_ncpus; i++) { if (visited[i] == 0) { pos = 0; visited[i] = 1; @@ -537,7 +537,7 @@ get_cpu_node_by_chipid(int chip_id) /* init pcpu_sysctl structure info */ static void -init_pcpu_topology_sysctl(void) +init_pcpu_topology_sysctl(int assumed_ncpus) { struct sbuf sb; cpumask_t mask; @@ -549,7 +549,7 @@ init_pcpu_topology_sysctl(void) pcpu_sysctl = kmalloc(sizeof(*pcpu_sysctl) * MAXCPU, M_PCPUSYS, M_INTWAIT | M_ZERO); - for (i = 0; i < ncpus; i++) { + for (i = 0; i < assumed_ncpus; i++) { sbuf_new(&sb, pcpu_sysctl[i].cpu_name, sizeof(pcpu_sysctl[i].cpu_name), SBUF_FIXEDLEN); sbuf_printf(&sb,"cpu%d", i); @@ -602,7 +602,7 @@ init_pcpu_topology_sysctl(void) cpu_topology_phys_ids = max_id - min_id + 1; if (cpu_topology_phys_ids <= 0) /* don't crash */ cpu_topology_phys_ids = 1; - for (i = 0; i < ncpus; i++) { + for (i = 0; i < assumed_ncpus; i++) { pcpu_sysctl[i].physical_id %= cpu_topology_phys_ids; } } @@ -611,7 +611,7 @@ init_pcpu_topology_sysctl(void) * the CPU Topology to user-space. 
*/
 static void
-build_sysctl_cpu_topology(void)
+build_sysctl_cpu_topology(int assumed_ncpus)
 {
 	int i;
 	struct sbuf sb;
@@ -651,7 +651,7 @@ build_sysctl_cpu_topology(void)
 	    "Members of the CPU Topology");
 
 	/* SYSCTL per_cpu info */
-	for (i = 0; i < ncpus; i++) {
+	for (i = 0; i < assumed_ncpus; i++) {
 		/* New leaf : hw.cpu_topology.cpux */
 		sysctl_ctx_init(&pcpu_sysctl[i].sysctl_ctx);
 		pcpu_sysctl[i].sysctl_tree = SYSCTL_ADD_NODE(&pcpu_sysctl[i].sysctl_ctx,
@@ -759,14 +759,19 @@ get_cpu_phys_id(int cpuid)
 	return(0);
 }
 
+extern int naps;	/* NOTE(review): declared here, defined in another file */
+
 /* Build the CPU Topology and SYSCTL Topology tree */
 static void
 init_cpu_topology(void)
 {
-	build_cpu_topology();
+	int assumed_ncpus;	/* cpu count handed to the builders in place of ncpus */
+
+	assumed_ncpus = naps + 1;	/* presumably APs plus the BSP -- TODO confirm */
 
-	init_pcpu_topology_sysctl();
-	build_sysctl_cpu_topology();
+	build_cpu_topology(assumed_ncpus);
+	init_pcpu_topology_sysctl(assumed_ncpus);
+	build_sysctl_cpu_topology(assumed_ncpus);
 }
 SYSINIT(cpu_topology, SI_BOOT2_CPU_TOPOLOGY, SI_ORDER_FIRST,
     init_cpu_topology, NULL);
diff --git a/sys/platform/pc64/acpica/acpi_srat.c b/sys/platform/pc64/acpica/acpi_srat.c
new file mode 100644
index 0000000000..5e0d5491c8
--- /dev/null
+++ b/sys/platform/pc64/acpica/acpi_srat.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Sepherosa Ziehau
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3.
Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +#include "acpi.h" +#include "acpi_sdt_var.h" +#include "acpi_sci_var.h" + +extern int naps; + +#define MADT_VPRINTF(fmt, arg...) 
\
+do {									\
+	if (bootverbose)						\
+		kprintf("ACPI MADT: " fmt , ##arg);			\
+} while (0)
+
+#define MADT_INT_BUS_ISA	0
+
+typedef union srat_entry {
+	ACPI_SUBTABLE_HEADER	head;
+	ACPI_SRAT_CPU_AFFINITY	cpu;
+	ACPI_SRAT_MEM_AFFINITY	mem;
+	ACPI_SRAT_X2APIC_CPU_AFFINITY	x2apic;
+	ACPI_SRAT_GICC_AFFINITY	gicc;
+} srat_entry_t;
+
+/*
+ * Walk the ACPI SRAT.  For every enabled memory affinity entry, find a
+ * CPU affinity entry in the same proximity domain and hand the memory
+ * range plus that CPU's chip id to vm_numa_organize().
+ */
+static void
+srat_probe(void)
+{
+	vm_paddr_t srat_paddr;
+	ACPI_TABLE_SRAT *srat;
+	srat_entry_t *mem;
+	srat_entry_t *cpu = NULL;
+	char *end;
+
+	/* Map the SRAT if it exists */
+	srat_paddr = sdt_search(ACPI_SIG_SRAT);
+	if (srat_paddr == 0) {
+		kprintf("srat_probe: can't locate SRAT\n");
+		return;
+	}
+
+	srat = sdt_sdth_map(srat_paddr);
+	KKASSERT(srat != NULL);
+
+	if (srat->Header.Length < sizeof(*srat)) {
+		kprintf("acpi: invalid SRAT length %u\n", srat->Header.Length);
+		goto done;
+	}
+	end = (char *)srat + srat->Header.Length;
+
+	for (mem = (srat_entry_t *)(srat + 1); (char *)mem < end;
+	     mem = (srat_entry_t *)((char *)mem + mem->head.Length)) {
+		/* A subtable shorter than its header would never advance */
+		if (mem->head.Length < sizeof(mem->head))
+			break;
+		/* Mem scan memory affinity only */
+		if (mem->head.Type != ACPI_SRAT_TYPE_MEMORY_AFFINITY ||
+		    (mem->mem.Flags & ACPI_SRAT_MEM_ENABLED) == 0)
+			continue;
+
+		kprintf("MemAffinity %016jx,%juMB Prox=%u ",
+			(uintmax_t)mem->mem.BaseAddress,
+			(uintmax_t)(mem->mem.Length / (1024 * 1024)),
+			mem->mem.ProximityDomain);
+
+		/* Look for associated cpu affinity (previous match is reused) */
+		if (cpu == NULL ||
+		    mem->mem.ProximityDomain != cpu->cpu.ProximityDomainLo) {
+			for (cpu = (srat_entry_t *)(srat + 1);
+			     (char *)cpu < end;
+			     cpu = (srat_entry_t *)((char *)cpu +
+						    cpu->head.Length)) {
+				if (cpu->head.Length < sizeof(cpu->head)) {
+					cpu = NULL;	/* corrupt table */
+					break;
+				}
+				if (cpu->head.Type !=
+				    ACPI_SRAT_TYPE_CPU_AFFINITY)
+					continue;
+				if ((cpu->cpu.Flags &
+				     ACPI_SRAT_CPU_USE_AFFINITY) == 0)
+					continue;
+				if (mem->mem.ProximityDomain ==
+				    cpu->cpu.ProximityDomainLo)
+					break;
+			}
+			if (cpu != NULL && (char *)cpu >= end)
+				cpu = NULL;
+		}
+		if (cpu) {
+			kprintf("CpuApicId %02x Socket %d\n",
+				cpu->cpu.ApicId,
+				get_chip_ID_from_APICID(cpu->cpu.ApicId));
+			vm_numa_organize(mem->mem.BaseAddress,
+					 mem->mem.Length,
+					 get_chip_ID_from_APICID(cpu->cpu.ApicId));
+		} else {
+			kprintf("(not found)\n");
+		}
+	}
+
+done:
+	sdt_sdth_unmap(&srat->Header);
+}
+
+SYSINIT(srat_probe, SI_BOOT2_NUMA, SI_ORDER_FIRST, srat_probe, 0);
diff --git a/sys/platform/pc64/conf/files b/sys/platform/pc64/conf/files
index ad9f97fbbd..beb5ac33fb 100644
--- a/sys/platform/pc64/conf/files
+++ b/sys/platform/pc64/conf/files
@@ -193,6 +193,7 @@ platform/pc64/x86_64/mptable.c	standard
 platform/pc64/acpica/acpi_sdt.c		standard
 platform/pc64/acpica/acpi_fadt.c	standard
 platform/pc64/acpica/acpi_madt.c	standard
+platform/pc64/acpica/acpi_srat.c	standard
 dev/misc/atkbd/atkbd_isa.c		optional atkbd
 dev/misc/atkbdc_layer/atkbdc_isa.c	optional atkbdc
 platform/pc64/x86_64/efirt.c		optional efirt
diff --git a/sys/platform/pc64/include/smp.h b/sys/platform/pc64/include/smp.h
index fef3375f60..20b4a34ad3 100644
--- a/sys/platform/pc64/include/smp.h
+++ b/sys/platform/pc64/include/smp.h
@@ -86,6 +86,7 @@ int fix_amd_topology(void);
 
 /* Interface functions for IDs calculation */
 int get_chip_ID(int cpuid);
+int get_chip_ID_from_APICID(int apicid);
 int get_core_number_within_chip(int cpuid);
 int get_logical_CPU_number_within_core(int cpuid);
 
diff --git a/sys/platform/pc64/x86_64/mp_machdep.c b/sys/platform/pc64/x86_64/mp_machdep.c
index ec8f013897..7bb55a8568 100644
--- a/sys/platform/pc64/x86_64/mp_machdep.c
+++ b/sys/platform/pc64/x86_64/mp_machdep.c
@@ -1763,6 +1763,16 @@ get_chip_ID(int cpuid)
 	    (logical_CPU_bits + core_bits);
 }
 
+/*
+ * Return the chip id encoded in an APIC id: the id shifted right by
+ * (logical_CPU_bits + core_bits).
+ */
+int
+get_chip_ID_from_APICID(int apicid)
+{
+	return apicid >> (logical_CPU_bits + core_bits);
+}
+
 int
 get_core_number_within_chip(int cpuid)
 {
diff --git a/sys/platform/vkernel64/include/smp.h b/sys/platform/vkernel64/include/smp.h
index 9db37906cc..93cd3e3e49 100644
--- a/sys/platform/vkernel64/include/smp.h
+++ b/sys/platform/vkernel64/include/smp.h
@@ -38,6 +38,7 @@ void
detect_cpu_topology(void);
 
 /* Interface functions for IDs calculation */
 int get_chip_ID(int cpuid);
+int get_chip_ID_from_APICID(int apicid);
 int get_core_number_within_chip(int cpuid);
 int get_logical_CPU_number_within_core(int cpuid);
 
diff --git a/sys/sys/kernel.h b/sys/sys/kernel.h
index 5b8d3cf954..a8ea20cb48 100644
--- a/sys/sys/kernel.h
+++ b/sys/sys/kernel.h
@@ -149,6 +149,8 @@ enum sysinit_sub_id {
 	SI_BOOT2_PRESMP		= 0x1a00000,	/* register SMP configs */
 	SI_BOOT2_START_CPU	= 0x1a40000,	/* start CPU (BSP) */
 	SI_BOOT2_LAPIC		= 0x1a50000,	/* configure Local APIC */
+	SI_BOOT2_CPU_TOPOLOGY	= 0x1a58000,	/* build cpu topology */
+	SI_BOOT2_NUMA		= 0x1a5c000,	/* NUMA page organization */
 	SI_BOOT2_START_APS	= 0x1a60000,	/* start all APs */
 	SI_BOOT2_IOAPIC		= 0x1a70000,	/* configure I/O APIC */
 	SI_BOOT2_FINISH_PIC	= 0x1a80000,	/* finish PIC configure */
@@ -168,7 +170,6 @@ enum sysinit_sub_id {
 	SI_BOOT2_BIOS		= 0x1d00000,
 	SI_BOOT2_MACHDEP	= 0x1d80000,
 	SI_BOOT2_KLD		= 0x1e00000,
-	SI_BOOT2_CPU_TOPOLOGY	= 0x1e40000,
 	SI_BOOT2_USCHED		= 0x1e80000,
 	SI_BOOT2_PROC0		= 0x1f00000,
 
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 94643eba85..ef69f3b5f7 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -222,12 +222,14 @@ vm_add_new_page(vm_paddr_t pa)
 	m->flags = 0;
 	m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
 	m->pat_mode = PAT_WRITE_BACK;
+
 	/*
 	 * Twist for cpu localization in addition to page coloring, so
 	 * different cpus selecting by m->queue get different page colors.
 	 */
 	m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE) & PQ_L2_MASK;
 	m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE)) & PQ_L2_MASK;
+
 	/*
 	 * Reserve a certain number of contiguous low memory pages for
 	 * contigmalloc() to use.
@@ -430,6 +432,85 @@ vm_page_startup(void)
 	virtual_start = vaddr;
 }
 
+/*
+ * Reorganize VM pages based on numa data.  May be called as many times as
+ * necessary.  Will reorganize the vm_page_t page color and related queue(s)
+ * to allow vm_page_alloc() to choose pages based on socket affinity.
+ *
+ * The PQ_L2_SIZE page colors are split into one equal band per socket;
+ * phys_avail[] pages inside [ran_beg, ran_beg + bytes) move to physid's band.
+ */
+void
+vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
+{
+	vm_paddr_t scan_beg;
+	vm_paddr_t scan_end;
+	vm_paddr_t ran_end;
+	struct vpgqueues *vpq;
+	vm_page_t m;
+	int i;
+	int socket_mod;
+	int socket_value;
+	int new_pc;
+
+	/*
+	 * Check if no physical information, or there was only one socket
+	 * (so don't waste time doing nothing!).
+	 */
+	if (cpu_topology_phys_ids <= 1 || cpu_topology_core_ids == 0)
+		return;
+
+	/*
+	 * Setup for our iteration.  Note that ACPI may iterate CPU
+	 * sockets starting at 0 or 1 or some other number.  The
+	 * cpu_topology code mod's it against the socket count.
+	 */
+	ran_end = ran_beg + bytes;
+	physid %= cpu_topology_phys_ids;
+
+	socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
+	if (socket_mod == 0)	/* more sockets than colors: avoid % 0 below */
+		return;
+	socket_value = physid * socket_mod;
+
+	/*
+	 * Adjust vm_page->pc and requeue all affected pages.  The
+	 * allocator will then be able to localize memory allocations
+	 * to some degree.
+	 *
+	 * NOTE(review): no queue lock is taken and no per-queue counter
+	 * is adjusted when a page changes queue; confirm both are safe.
+	 */
+	for (i = 0; phys_avail[i].phys_end; ++i) {
+		scan_beg = phys_avail[i].phys_beg;
+		scan_end = phys_avail[i].phys_end;
+		if (scan_end <= ran_beg || scan_beg >= ran_end)
+			continue;
+		if (scan_beg < ran_beg)
+			scan_beg = ran_beg;
+		if (scan_end > ran_end)
+			scan_end = ran_end;
+		if (atop(scan_end) >= first_page + vm_page_array_size)
+			scan_end = ptoa(first_page + vm_page_array_size);
+
+		m = PHYS_TO_VM_PAGE(scan_beg);
+		while (scan_beg < scan_end) {
+			new_pc = (m->pc % socket_mod) + socket_value;
+			new_pc &= PQ_L2_MASK;
+			if (m->queue != PQ_NONE) {
+				vpq = &vm_page_queues[m->queue];
+				TAILQ_REMOVE(&vpq->pl, m, pageq);
+				m->queue = m->queue - m->pc + new_pc;
+				vpq = &vm_page_queues[m->queue];
+				TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
+			}
+			m->pc = new_pc;
+			scan_beg += PAGE_SIZE;
+			++m;
+		}
+	}
+}
+
 /*
  * We tended to reserve a ton of memory for contigmalloc().
Now that most * drivers have initialized we want to return most the remaining free diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 6ee8ca42f5..06905ced64 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -416,6 +416,7 @@ vm_page_t vm_page_repurpose(struct vm_object *, vm_pindex_t, int, int *, void vm_page_remove (vm_page_t); void vm_page_rename (vm_page_t, struct vm_object *, vm_pindex_t); void vm_page_startup (void); +void vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid); void vm_page_unmanage (vm_page_t); void vm_page_unwire (vm_page_t, int); void vm_page_wire (vm_page_t); -- 2.41.0