kernel - Add NUMA awareness to vm_page_alloc() and related functions
author: Matthew Dillon <dillon@apollo.backplane.com>
Fri, 6 Jan 2017 02:08:40 +0000 (18:08 -0800)
committer: Matthew Dillon <dillon@apollo.backplane.com>
Fri, 6 Jan 2017 02:43:19 +0000 (18:43 -0800)
* Add NUMA awareness to the kernel memory subsystem.  This first iteration
  will primarily affect user pages.  kmalloc and objcache are not
  NUMA-friendly yet (and it's questionable how useful it would be to make
  them so).

* Tested with synth on monster (4-socket opteron / 48 cores) and a 2-socket
  xeon (32 threads).  Appears to dole out localized pages 5:1 to 10:1.

sys/kern/subr_cpu_topology.c
sys/platform/pc64/acpica/acpi_srat.c [new file with mode: 0644]
sys/platform/pc64/conf/files
sys/platform/pc64/include/smp.h
sys/platform/pc64/x86_64/mp_machdep.c
sys/platform/vkernel64/include/smp.h
sys/sys/kernel.h
sys/vm/vm_page.c
sys/vm/vm_page.h

index 557733c..a598210 100644 (file)
@@ -165,7 +165,7 @@ migrate_elements(cpu_node_t **a, int n, int pos)
  * BSP. When we found a match, at that level the CPUs are siblings.
  */
 static void
-build_cpu_topology(void)
+build_cpu_topology(int assumed_ncpus)
 {
        detect_cpu_topology();
        int i;
@@ -184,7 +184,7 @@ build_cpu_topology(void)
         * Find the number of siblings within chip
         * and witin core to build up the topology
         */
-       for (i = 0; i < ncpus; i++) {
+       for (i = 0; i < assumed_ncpus; i++) {
                cpumask_t mask;
 
                CPUMASK_ASSBIT(mask, i);
@@ -203,7 +203,7 @@ build_cpu_topology(void)
        }
 
        cores_per_chip /= threads_per_core;
-       chips_per_package = ncpus / (cores_per_chip * threads_per_core);
+       chips_per_package = assumed_ncpus / (cores_per_chip * threads_per_core);
        
        if (bootverbose)
                kprintf("CPU Topology: cores_per_chip: %d; threads_per_core: %d; chips_per_package: %d;\n",
@@ -278,7 +278,7 @@ build_cpu_topology(void)
 
                bzero(visited, MAXCPU * sizeof(int));
 
-               for (i = 0; i < ncpus; i++) {
+               for (i = 0; i < assumed_ncpus; i++) {
                        if (visited[i] == 0) {
                                pos = 0;
                                visited[i] = 1;
@@ -537,7 +537,7 @@ get_cpu_node_by_chipid(int chip_id)
 
 /* init pcpu_sysctl structure info */
 static void
-init_pcpu_topology_sysctl(void)
+init_pcpu_topology_sysctl(int assumed_ncpus)
 {
        struct sbuf sb;
        cpumask_t mask;
@@ -549,7 +549,7 @@ init_pcpu_topology_sysctl(void)
        pcpu_sysctl = kmalloc(sizeof(*pcpu_sysctl) * MAXCPU, M_PCPUSYS,
                              M_INTWAIT | M_ZERO);
 
-       for (i = 0; i < ncpus; i++) {
+       for (i = 0; i < assumed_ncpus; i++) {
                sbuf_new(&sb, pcpu_sysctl[i].cpu_name,
                    sizeof(pcpu_sysctl[i].cpu_name), SBUF_FIXEDLEN);
                sbuf_printf(&sb,"cpu%d", i);
@@ -602,7 +602,7 @@ init_pcpu_topology_sysctl(void)
        cpu_topology_phys_ids = max_id - min_id + 1;
        if (cpu_topology_phys_ids <= 0)         /* don't crash */
                cpu_topology_phys_ids = 1;
-       for (i = 0; i < ncpus; i++) {
+       for (i = 0; i < assumed_ncpus; i++) {
                pcpu_sysctl[i].physical_id %= cpu_topology_phys_ids;
        }
 }
@@ -611,7 +611,7 @@ init_pcpu_topology_sysctl(void)
  * the CPU Topology to user-space.
  */
 static void
-build_sysctl_cpu_topology(void)
+build_sysctl_cpu_topology(int assumed_ncpus)
 {
        int i;
        struct sbuf sb;
@@ -651,7 +651,7 @@ build_sysctl_cpu_topology(void)
            "Members of the CPU Topology");
 
        /* SYSCTL per_cpu info */
-       for (i = 0; i < ncpus; i++) {
+       for (i = 0; i < assumed_ncpus; i++) {
                /* New leaf : hw.cpu_topology.cpux */
                sysctl_ctx_init(&pcpu_sysctl[i].sysctl_ctx); 
                pcpu_sysctl[i].sysctl_tree = SYSCTL_ADD_NODE(&pcpu_sysctl[i].sysctl_ctx,
@@ -759,14 +759,19 @@ get_cpu_phys_id(int cpuid)
        return(0);
 }
 
+extern int naps;
+
 /* Build the CPU Topology and SYSCTL Topology tree */
 static void
 init_cpu_topology(void)
 {
-       build_cpu_topology();
+       int assumed_ncpus;
+
+       assumed_ncpus = naps + 1;
 
-       init_pcpu_topology_sysctl();
-       build_sysctl_cpu_topology();
+       build_cpu_topology(assumed_ncpus);
+       init_pcpu_topology_sysctl(assumed_ncpus);
+       build_sysctl_cpu_topology(assumed_ncpus);
 }
 SYSINIT(cpu_topology, SI_BOOT2_CPU_TOPOLOGY, SI_ORDER_FIRST,
     init_cpu_topology, NULL);
diff --git a/sys/platform/pc64/acpica/acpi_srat.c b/sys/platform/pc64/acpica/acpi_srat.c
new file mode 100644 (file)
index 0000000..5e0d549
--- /dev/null
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Sepherosa Ziehau <sepherosa@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <vm/vm_page.h>
+
+#include "acpi.h"
+#include "acpi_sdt_var.h"
+#include "acpi_sci_var.h"
+
+extern int naps;
+
+#define MADT_VPRINTF(fmt, arg...) \
+do { \
+       if (bootverbose) \
+               kprintf("ACPI MADT: " fmt , ##arg); \
+} while (0)
+
+#define MADT_INT_BUS_ISA       0
+
+typedef union srat_entry {
+       ACPI_SUBTABLE_HEADER    head;
+       ACPI_SRAT_CPU_AFFINITY  cpu;
+       ACPI_SRAT_MEM_AFFINITY  mem;
+       ACPI_SRAT_X2APIC_CPU_AFFINITY   x2apic;
+       ACPI_SRAT_GICC_AFFINITY gicc;
+} srat_entry_t;
+
+static void
+srat_probe(void)
+{
+       vm_paddr_t srat_paddr;
+       ACPI_TABLE_SRAT *srat;
+       srat_entry_t *mem;
+       srat_entry_t *cpu;
+       int error = 0;
+
+       /*
+        * Map the SRAT if it exists
+        */
+       srat_paddr = sdt_search(ACPI_SIG_SRAT);
+       if (srat_paddr == 0) {
+               kprintf("srat_probe: can't locate SRAT\n");
+               return;
+       }
+
+       srat = sdt_sdth_map(srat_paddr);
+       KKASSERT(srat != NULL);
+
+       if (srat->Header.Length < sizeof(*srat)) {
+               kprintf("acpi: invalid SRAT length %u\n",
+                       srat->Header.Length);
+               error = EINVAL;
+               goto done;
+       }
+
+       cpu = NULL;
+
+       for (mem = (srat_entry_t *)(srat + 1);
+            (char *)mem < (char *)srat + srat->Header.Length;
+            mem = (srat_entry_t *)((char *)mem + mem->head.Length)) {
+               /*
+                * Mem scan memory affinity only
+                */
+               if (mem->head.Type != ACPI_SRAT_TYPE_MEMORY_AFFINITY)
+                       continue;
+               if ((mem->mem.Flags & ACPI_SRAT_MEM_ENABLED) == 0)
+                       continue;
+
+               kprintf("MemAffinity %016jx,%ldMB Prox=%u ",
+                       mem->mem.BaseAddress,
+                       mem->mem.Length / (1024 * 1024),
+                       mem->mem.ProximityDomain);
+
+               /*
+                * Look for associated cpu affinity
+                */
+               if (cpu == NULL ||
+                   mem->mem.ProximityDomain != cpu->cpu.ProximityDomainLo) {
+                       for (cpu = (srat_entry_t *)(srat + 1);
+                            (char *)cpu < (char *)srat + srat->Header.Length;
+                            cpu = (srat_entry_t *)((char *)cpu +
+                                                   cpu->head.Length)) {
+                               if (cpu->head.Type !=
+                                   ACPI_SRAT_TYPE_CPU_AFFINITY)
+                                       continue;
+                               if ((cpu->cpu.Flags &
+                                    ACPI_SRAT_CPU_USE_AFFINITY) == 0)
+                                       continue;
+                               if (mem->mem.ProximityDomain ==
+                                   cpu->cpu.ProximityDomainLo) {
+                                       break;
+                               }
+                       }
+                       if ((char *)cpu >= (char *)srat + srat->Header.Length)
+                               cpu = NULL;
+               }
+               if (cpu) {
+                       kprintf("CpuApicId %02x Socket %d\n",
+                               cpu->cpu.ApicId,
+                               get_chip_ID_from_APICID(cpu->cpu.ApicId));
+                       vm_numa_organize(mem->mem.BaseAddress,
+                                        mem->mem.Length,
+                                   get_chip_ID_from_APICID(cpu->cpu.ApicId));
+               } else {
+                       kprintf("(not found)\n");
+               }
+       }
+
+done:
+       sdt_sdth_unmap(&srat->Header);
+}
+
+SYSINIT(srat_probe, SI_BOOT2_NUMA, SI_ORDER_FIRST, srat_probe, 0);
index ad9f97f..beb5ac3 100644 (file)
@@ -193,6 +193,7 @@ platform/pc64/x86_64/mptable.c              standard
 platform/pc64/acpica/acpi_sdt.c        standard
 platform/pc64/acpica/acpi_fadt.c       standard
 platform/pc64/acpica/acpi_madt.c       standard
+platform/pc64/acpica/acpi_srat.c       standard
 dev/misc/atkbd/atkbd_isa.c             optional        atkbd
 dev/misc/atkbdc_layer/atkbdc_isa.c     optional        atkbdc
 platform/pc64/x86_64/efirt.c           optional        efirt
index fef3375..20b4a34 100644 (file)
@@ -86,6 +86,7 @@ int fix_amd_topology(void);
 
 /* Interface functions for IDs calculation */
 int get_chip_ID(int cpuid);
+int get_chip_ID_from_APICID(int apicid);
 int get_core_number_within_chip(int cpuid);
 int get_logical_CPU_number_within_core(int cpuid);
 
index ec8f013..7bb55a8 100644 (file)
@@ -1763,6 +1763,12 @@ get_chip_ID(int cpuid)
            (logical_CPU_bits + core_bits);
 }
 
+int
+get_chip_ID_from_APICID(int apicid)
+{
+       return apicid >> (logical_CPU_bits + core_bits);
+}
+
 int
 get_core_number_within_chip(int cpuid)
 {
index 9db3790..93cd3e3 100644 (file)
@@ -38,6 +38,7 @@ void detect_cpu_topology(void);
 
 /* Interface functions for IDs calculation */
 int get_chip_ID(int cpuid);
+int get_chip_ID_from_APICID(int apicid);
 int get_core_number_within_chip(int cpuid);
 int get_logical_CPU_number_within_core(int cpuid);
 
index 5b8d3cf..a8ea20c 100644 (file)
@@ -149,6 +149,8 @@ enum sysinit_sub_id {
        SI_BOOT2_PRESMP         = 0x1a00000,    /* register SMP configs */
        SI_BOOT2_START_CPU      = 0x1a40000,    /* start CPU (BSP) */
        SI_BOOT2_LAPIC          = 0x1a50000,    /* configure Local APIC */
+       SI_BOOT2_CPU_TOPOLOGY   = 0x1e58000,
+       SI_BOOT2_NUMA           = 0x1e5c000,
        SI_BOOT2_START_APS      = 0x1a60000,    /* start all APs */
        SI_BOOT2_IOAPIC         = 0x1a70000,    /* configure I/O APIC */
        SI_BOOT2_FINISH_PIC     = 0x1a80000,    /* finish PIC configure */
@@ -168,7 +170,6 @@ enum sysinit_sub_id {
        SI_BOOT2_BIOS           = 0x1d00000,
        SI_BOOT2_MACHDEP        = 0x1d80000,
        SI_BOOT2_KLD            = 0x1e00000,
-       SI_BOOT2_CPU_TOPOLOGY   = 0x1e40000,
        SI_BOOT2_USCHED         = 0x1e80000,
        SI_BOOT2_PROC0          = 0x1f00000,
 
index 94643eb..ef69f3b 100644 (file)
@@ -222,12 +222,14 @@ vm_add_new_page(vm_paddr_t pa)
        m->flags = 0;
        m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
        m->pat_mode = PAT_WRITE_BACK;
+
        /*
         * Twist for cpu localization in addition to page coloring, so
         * different cpus selecting by m->queue get different page colors.
         */
        m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE) & PQ_L2_MASK;
        m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE)) & PQ_L2_MASK;
+
        /*
         * Reserve a certain number of contiguous low memory pages for
         * contigmalloc() to use.
@@ -430,6 +432,85 @@ vm_page_startup(void)
                virtual_start = vaddr;
 }
 
+/*
+ * Reorganize VM pages based on numa data.  May be called as many times as
+ * necessary.  Will reorganize the vm_page_t page color and related queue(s)
+ * to allow vm_page_alloc() to choose pages based on socket affinity.
+ */
+void
+vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
+{
+       vm_paddr_t scan_beg;
+       vm_paddr_t scan_end;
+       vm_paddr_t ran_end;
+       struct vpgqueues *vpq;
+       vm_page_t m;
+       int i;
+       int socket_mod;
+       int socket_value;
+
+       /*
+        * Check if no physical information, or there was only one socket
+        * (so don't waste time doing nothing!).
+        */
+       if (cpu_topology_phys_ids <= 1 ||
+           cpu_topology_core_ids == 0) {
+               return;
+       }
+
+       /*
+        * Setup for our iteration.  Note that ACPI may iterate CPU
+        * sockets starting at 0 or 1 or some other number.  The
+        * cpu_topology code mod's it against the socket count.
+        */
+       ran_end = ran_beg + bytes;
+       physid %= cpu_topology_phys_ids;
+
+       socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
+       socket_value = physid * socket_mod;
+
+       /*
+        * Adjust vm_page->pc and requeue all affected pages.  The
+        * allocator will then be able to localize memory allocations
+        * to some degree.
+        */
+       for (i = 0; phys_avail[i].phys_end; ++i) {
+               scan_beg = phys_avail[i].phys_beg;
+               scan_end = phys_avail[i].phys_end;
+               if (scan_end <= ran_beg)
+                       continue;
+               if (scan_beg >= ran_end)
+                       continue;
+               if (scan_beg < ran_beg)
+                       scan_beg = ran_beg;
+               if (scan_end > ran_end)
+                       scan_end = ran_end;
+               if (atop(scan_end) >= first_page + vm_page_array_size)
+                       scan_end = ptoa(first_page + vm_page_array_size);
+
+               m = PHYS_TO_VM_PAGE(scan_beg);
+               while (scan_beg < scan_end) {
+                       if (m->queue != PQ_NONE) {
+                               vpq = &vm_page_queues[m->queue];
+                               TAILQ_REMOVE(&vpq->pl, m, pageq);
+                               m->queue -= m->pc;
+                               m->pc %= socket_mod;
+                               m->pc += socket_value;
+                               m->pc &= PQ_L2_MASK;
+                               m->queue += m->pc;
+                               vpq = &vm_page_queues[m->queue];
+                               TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
+                       } else {
+                               m->pc %= socket_mod;
+                               m->pc += socket_value;
+                               m->pc &= PQ_L2_MASK;
+                       }
+                       scan_beg += PAGE_SIZE;
+                       ++m;
+               }
+       }
+}
+
 /*
  * We tended to reserve a ton of memory for contigmalloc().  Now that most
  * drivers have initialized we want to return most the remaining free
index 6ee8ca4..06905ce 100644 (file)
@@ -416,6 +416,7 @@ vm_page_t vm_page_repurpose(struct vm_object *, vm_pindex_t, int, int *,
 void vm_page_remove (vm_page_t);
 void vm_page_rename (vm_page_t, struct vm_object *, vm_pindex_t);
 void vm_page_startup (void);
+void vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid);
 void vm_page_unmanage (vm_page_t);
 void vm_page_unwire (vm_page_t, int);
 void vm_page_wire (vm_page_t);