2 * Copyright (c) 2012 The DragonFly Project. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in
12 * the documentation and/or other materials provided with the
14 * 3. Neither the name of The DragonFly Project nor the names of its
15 * contributors may be used to endorse or promote products derived
16 * from this software without specific, prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
28 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/malloc.h>
36 #include <sys/sysctl.h>
38 #include <sys/cpu_topology.h>
40 #include <machine/smp.h>
/*
 * Scratch-buffer size for the ASCII tree-indent prefix built by
 * print_cpu_topology_tree_sysctl_helper(): up to 3 characters
 * ("|  " or "   ") per topology level.  Parenthesized so the
 * expansion is safe in any expression context.
 */
46 #define INDENT_BUF_SIZE (LEVEL_NO * 3)
49 /* Per-cpu sysctl nodes and info */
/*
 * Backing storage for one CPU's hw.cpu_topology.cpuN sysctl subtree:
 * a private sysctl context/tree plus cached topology ids and the
 * pre-rendered sibling-set strings exported read-only to user-space.
 * NOTE(review): the listing is truncated here — fields referenced later
 * (cpu_name, physical_id, core_id) and the closing brace are not
 * visible in this view.
 */
50 struct per_cpu_sysctl_info {
51 struct sysctl_ctx_list sysctl_ctx;	/* context owning this CPU's OIDs */
52 struct sysctl_oid *sysctl_tree;	/* the hw.cpu_topology.cpuN node */
56 int ht_id; /* thread id within core */
57 char physical_siblings[8*MAXCPU];	/* rendered CHIP_LEVEL sibling cpuset */
58 char core_siblings[8*MAXCPU];	/* rendered CORE_LEVEL sibling cpuset */
60 typedef struct per_cpu_sysctl_info per_cpu_sysctl_info_t;
/*
 * File-scope topology state.  cpu_topology_nodes[] is the preallocated
 * node pool consumed by build_topology_tree(); cpu_root_node and
 * root_cpu_node both end up pointing at the root of the built tree.
 * The cpu_topology_*_ids counters are derived during sysctl init and
 * exported read-write under hw.* below.
 */
62 /* Memory for topology */
63 __read_frequently static cpu_node_t cpu_topology_nodes[MAXCPU];
64 /* Root node pointer */
65 __read_frequently static cpu_node_t *cpu_root_node;
67 static struct sysctl_ctx_list cpu_topology_sysctl_ctx;
68 static struct sysctl_oid *cpu_topology_sysctl_tree;
69 static char cpu_topology_members[8*MAXCPU];	/* rendered root cpuset string */
70 static per_cpu_sysctl_info_t *pcpu_sysctl;	/* kmalloc'd array, MAXCPU entries */
71 static void sbuf_print_cpuset(struct sbuf *sb, cpumask_t *mask);
73 __read_frequently int cpu_topology_levels_number = 1;
74 __read_frequently int cpu_topology_ht_ids;
75 __read_frequently int cpu_topology_core_ids;
76 __read_frequently int cpu_topology_phys_ids;
77 __read_frequently cpu_node_t *root_cpu_node;
79 MALLOC_DEFINE(M_PCPUSYS, "pcpusys", "pcpu sysctl topology");
81 SYSCTL_INT(_hw, OID_AUTO, cpu_topology_ht_ids, CTLFLAG_RW,
82 &cpu_topology_ht_ids, 0, "# of logical cores per real core");
83 SYSCTL_INT(_hw, OID_AUTO, cpu_topology_core_ids, CTLFLAG_RW,
84 &cpu_topology_core_ids, 0, "# of real cores per package");
85 SYSCTL_INT(_hw, OID_AUTO, cpu_topology_phys_ids, CTLFLAG_RW,
86 &cpu_topology_phys_ids, 0, "# of physical packages");
88 /* Get the next valid apicid starting
89 * from current apicid (curr_apicid
/*
 * Scans forward from curr_apicid for an APIC id that maps to a real
 * cpuid (get_cpuid_from_apicid() != -1), stopping at NAPICID.  Warns
 * and presumably returns -1 when no valid id remains.
 * NOTE(review): listing truncated — the loop opener/increment and the
 * return statements are not visible in this view.
 */
92 get_next_valid_apicid(int curr_apicid)
94 int next_apicid = curr_apicid;
98 while(get_cpuid_from_apicid(next_apicid) == -1 &&
99 next_apicid < NAPICID);
100 if (next_apicid == NAPICID) {
101 kprintf("Warning: No next valid APICID found. Returning -1\n");
107 /* Generic topology tree. The parameters have the following meaning:
108 * - children_no_per_level : the number of children on each level
109 * - level_types : the type of the level (THREAD, CORE, CHIP, etc)
110 * - cur_level : the current level of the tree
111 * - node : the current node
112 * - last_free_node : the last free node in the global array.
113 * - cpuid : basically these are the ids of the leaves
/*
 * Recursively fills the preallocated cpu_topology_nodes[] pool with a
 * uniform tree: each node takes its fan-out and type from the per-level
 * tables, leaves claim the next valid APIC id, and every parent ORs its
 * children's member masks into its own.
 * NOTE(review): listing truncated — some parameter lines, the recursive
 * call's argument list, and closing braces are not visible here.
 */
116 build_topology_tree(int *children_no_per_level,
117 uint8_t *level_types,
120 cpu_node_t **last_free_node,
125 node->child_no = children_no_per_level[cur_level];
126 node->type = level_types[cur_level];
127 CPUMASK_ASSZERO(node->members);
128 node->compute_unit_id = -1;	/* -1: no AMD compute unit assigned */
130 if (node->child_no == 0) {	/* leaf: bind to the next usable APIC id */
131 *apicid = get_next_valid_apicid(*apicid);
132 CPUMASK_ASSBIT(node->members, get_cpuid_from_apicid(*apicid));
136 if (node->parent_node == NULL)	/* no parent means this is the root */
137 root_cpu_node = node;
139 for (i = 0; i < node->child_no; i++) {
140 node->child_node[i] = *last_free_node;	/* bump-allocate from the pool */
143 node->child_node[i]->parent_node = node;
145 build_topology_tree(children_no_per_level,
152 CPUMASK_ORMASK(node->members, node->child_node[i]->members);
156 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
/*
 * Removes the element at index 'pos' from a[0..n-1] by shifting the
 * tail left one slot (order-preserving delete).  Used by the AMD
 * compute-unit fixup in build_cpu_topology().
 * NOTE(review): listing truncated — the shift body is not visible here.
 */
158 migrate_elements(cpu_node_t **a, int n, int pos)
162 for (i = pos; i < n - 1 ; i++) {
169 /* Build CPU topology. The detection is made by comparing the
170 * chip, core and logical IDs of each CPU with the IDs of the
171 * BSP. When we find a match, at that level the CPUs are siblings.
/*
 * Counts siblings relative to the BSP to derive threads/core,
 * cores/chip and chips/package (topology assumed uniform), builds a
 * 2/3/4-level tree via build_topology_tree(), then — on native x86_64
 * AMD parts — regroups CORE_LEVEL nodes sharing a compute unit under
 * a synthetic parent node.
 * NOTE(review): listing truncated throughout — loop bodies, counter
 * increments and several closing braces are not visible in this view.
 */
174 build_cpu_topology(int assumed_ncpus)
178 int threads_per_core = 0;
179 int cores_per_chip = 0;
180 int chips_per_package = 0;
181 int children_no_per_level[LEVEL_NO];
182 uint8_t level_types[LEVEL_NO];
184 cpu_node_t *root = &cpu_topology_nodes[0];
185 cpu_node_t *last_free_node = root + 1;	/* rest of pool, bump-allocated */
187 detect_cpu_topology();
190 * Assume that the topology is uniform.
191 * Find the number of siblings within the chip
192 * and within the core to build up the topology.
194 for (i = 0; i < assumed_ncpus; i++) {
197 CPUMASK_ASSBIT(mask, i);
200 /* smp_active_mask has not been initialized yet, ignore */
201 if (CPUMASK_TESTMASK(mask, smp_active_mask) == 0)
205 if (get_chip_ID(BSPID) != get_chip_ID(i))	/* not on the BSP's chip */
209 if (get_core_number_within_chip(BSPID) ==
210 get_core_number_within_chip(i)) {
215 cores_per_chip /= threads_per_core;	/* raw count included HT threads */
216 chips_per_package = assumed_ncpus / (cores_per_chip * threads_per_core);
218 kprintf("CPU Topology: cores_per_chip: %d; threads_per_core: %d; "
219 "chips_per_package: %d;\n",
220 cores_per_chip, threads_per_core, chips_per_package);
222 if (threads_per_core > 1) { /* HT available - 4 levels */
224 children_no_per_level[0] = chips_per_package;
225 children_no_per_level[1] = cores_per_chip;
226 children_no_per_level[2] = threads_per_core;
227 children_no_per_level[3] = 0;	/* 0 fan-out terminates at leaves */
229 level_types[0] = PACKAGE_LEVEL;
230 level_types[1] = CHIP_LEVEL;
231 level_types[2] = CORE_LEVEL;
232 level_types[3] = THREAD_LEVEL;
234 build_topology_tree(children_no_per_level,
241 cpu_topology_levels_number = 4;
243 } else if (cores_per_chip > 1) { /* No HT available - 3 levels */
245 children_no_per_level[0] = chips_per_package;
246 children_no_per_level[1] = cores_per_chip;
247 children_no_per_level[2] = 0;
249 level_types[0] = PACKAGE_LEVEL;
250 level_types[1] = CHIP_LEVEL;
251 level_types[2] = CORE_LEVEL;
253 build_topology_tree(children_no_per_level,
260 cpu_topology_levels_number = 3;
262 } else { /* No HT and no Multi-Core - 2 levels */
264 children_no_per_level[0] = chips_per_package;
265 children_no_per_level[1] = 0;
267 level_types[0] = PACKAGE_LEVEL;
268 level_types[1] = CHIP_LEVEL;
270 build_topology_tree(children_no_per_level,
277 cpu_topology_levels_number = 2;
281 cpu_root_node = root;
/*
 * AMD compute-unit fixup: group sibling cores that share a compute
 * unit under a new synthetic CORE_LEVEL node, demoting the originals
 * to THREAD_LEVEL.  fix_amd_topology() == 0 apparently signals an AMD
 * CPU needing the fixup — TODO confirm against its definition.
 */
284 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
285 if (fix_amd_topology() == 0) {
286 int visited[MAXCPU], i, j, pos, cpuid;
287 cpu_node_t *leaf, *parent;
289 bzero(visited, MAXCPU * sizeof(int));
291 for (i = 0; i < assumed_ncpus; i++) {
292 if (visited[i] == 0) {
295 leaf = get_cpu_node_by_cpuid(i);
297 KASSERT(leaf != NULL, ("cpu %d NULL node", i));
298 if (leaf->type == CORE_LEVEL) {
299 parent = leaf->parent_node;
/* seed the synthetic node with this leaf's identity */
301 last_free_node->child_node[0] = leaf;
302 last_free_node->child_no = 1;
303 last_free_node->members = leaf->members;
304 last_free_node->compute_unit_id = leaf->compute_unit_id;
305 last_free_node->parent_node = parent;
306 last_free_node->type = CORE_LEVEL;
/* pull unvisited siblings with the same compute unit under it */
309 for (j = 0; j < parent->child_no; j++) {
310 if (parent->child_node[j] != leaf) {
312 cpuid = BSFCPUMASK(parent->child_node[j]->members);
313 if (visited[cpuid] == 0 &&
314 parent->child_node[j]->compute_unit_id == leaf->compute_unit_id) {
316 last_free_node->child_node[last_free_node->child_no] = parent->child_node[j];
317 last_free_node->child_no++;
318 CPUMASK_ORMASK(last_free_node->members, parent->child_node[j]->members);
320 parent->child_node[j]->type = THREAD_LEVEL;
321 parent->child_node[j]->parent_node = last_free_node;
/* compact the parent's child array over the moved slot */
324 migrate_elements(parent->child_node, parent->child_no, j);
/* only install the synthetic node if it actually grouped >1 core */
332 if (last_free_node->child_no > 1) {
333 parent->child_node[pos] = last_free_node;
334 leaf->type = THREAD_LEVEL;
335 leaf->parent_node = last_free_node;
345 /* Recursive function helper to print the CPU topology tree */
/*
 * Depth-first pretty-printer: emits the accumulated indent prefix from
 * buf[0..buf_len), a "\-" or "|-" branch glyph depending on whether
 * this node is the last sibling, a level-specific label, the node's
 * member cpuset, then recurses into the children with the extended
 * prefix.  buf must hold at least INDENT_BUF_SIZE bytes.
 * NOTE(review): listing truncated — some parameter lines and the
 * if/else around the branch glyphs are not visible here.
 */
347 print_cpu_topology_tree_sysctl_helper(cpu_node_t *node,
356 sbuf_bcat(sb, buf, buf_len);
358 sbuf_printf(sb, "\\-");
359 buf[buf_len] = ' ';buf_len++;
360 buf[buf_len] = ' ';buf_len++;
362 sbuf_printf(sb, "|-");
363 buf[buf_len] = '|';buf_len++;
364 buf[buf_len] = ' ';buf_len++;
/* label ids are derived from the highest-numbered member cpu */
367 bsr_member = BSRCPUMASK(node->members);
369 if (node->type == PACKAGE_LEVEL) {
370 sbuf_printf(sb,"PACKAGE MEMBERS: ");
371 } else if (node->type == CHIP_LEVEL) {
372 sbuf_printf(sb,"CHIP ID %d: ",
373 get_chip_ID(bsr_member));
374 } else if (node->type == CORE_LEVEL) {
375 if (node->compute_unit_id != (uint8_t)-1) {	/* AMD compute unit */
376 sbuf_printf(sb,"Compute Unit ID %d: ",
377 node->compute_unit_id);
379 sbuf_printf(sb,"CORE ID %d: ",
380 get_core_number_within_chip(bsr_member));
382 } else if (node->type == THREAD_LEVEL) {
383 if (node->compute_unit_id != (uint8_t)-1) {
384 sbuf_printf(sb,"THREAD ID %d: ",
385 get_core_number_within_chip(bsr_member));
387 sbuf_printf(sb,"THREAD ID %d: ",
388 get_logical_CPU_number_within_core(bsr_member));
391 sbuf_printf(sb,"UNKNOWN: ");
393 sbuf_print_cpuset(sb, &node->members);
394 sbuf_printf(sb,"\n");
396 for (i = 0; i < node->child_no; i++) {
397 print_cpu_topology_tree_sysctl_helper(node->child_node[i],
398 sb, buf, buf_len, i == (node->child_no -1));
402 /* SYSCTL PROCEDURE for printing the CPU Topology tree */
/*
 * Handler for hw.cpu_topology.tree: renders the whole topology tree
 * into an auto-extending sbuf and copies it out to the request.
 * NOTE(review): listing truncated — the sbuf_finish/sbuf_delete
 * cleanup and the final return are not visible here.
 */
404 print_cpu_topology_tree_sysctl(SYSCTL_HANDLER_ARGS)
408 char buf[INDENT_BUF_SIZE];	/* indent scratch for the helper */
410 KASSERT(cpu_root_node != NULL, ("cpu_root_node isn't initialized"));
412 sb = sbuf_new(NULL, NULL, 500, SBUF_AUTOEXTEND);
416 sbuf_printf(sb,"\n");
417 print_cpu_topology_tree_sysctl_helper(cpu_root_node, sb, buf, 0, 1);
421 ret = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
428 /* SYSCTL PROCEDURE for printing the CPU Topology level description */
/*
 * Handler for hw.cpu_topology.level_description: maps the detected
 * level count (2/3/4) to a human-readable legend of what each level
 * index means.
 * NOTE(review): listing truncated — the sbuf cleanup and return are
 * not visible here.
 */
430 print_cpu_topology_level_description_sysctl(SYSCTL_HANDLER_ARGS)
435 sb = sbuf_new(NULL, NULL, 500, SBUF_AUTOEXTEND);
439 if (cpu_topology_levels_number == 4) /* HT available */
440 sbuf_printf(sb, "0 - thread; 1 - core; 2 - socket; 3 - anything");
441 else if (cpu_topology_levels_number == 3) /* No HT available */
442 sbuf_printf(sb, "0 - core; 1 - socket; 2 - anything");
443 else if (cpu_topology_levels_number == 2) /* No HT and no Multi-Core */
444 sbuf_printf(sb, "0 - socket; 1 - anything");
446 sbuf_printf(sb, "Unknown");
450 ret = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
457 /* Find a cpu_node_t by a mask */
/*
 * Recursive exact-match search: returns the node whose member mask
 * equals 'mask', or (presumably) NULL when no subtree matches.
 * NOTE(review): listing truncated — the second parameter line and the
 * return statements are not visible here.
 */
459 get_cpu_node_by_cpumask(cpu_node_t * node,
462 cpu_node_t * found = NULL;
465 if (CPUMASK_CMPMASKEQ(node->members, mask))
468 for (i = 0; i < node->child_no; i++) {
469 found = get_cpu_node_by_cpumask(node->child_node[i], mask);
/*
 * Returns the leaf node for a single cpuid by searching for the
 * one-bit mask containing just that cpu.
 */
478 get_cpu_node_by_cpuid(int cpuid) {
481 CPUMASK_ASSBIT(mask, cpuid);
483 KASSERT(cpu_root_node != NULL, ("cpu_root_node isn't initialized"));
485 return get_cpu_node_by_cpumask(cpu_root_node, mask);
488 /* Get the mask of siblings for level_type of a cpuid */
/*
 * Walks from the cpu's leaf node up toward the root and returns the
 * member mask of the first ancestor whose type matches level_type;
 * returns an empty mask when the cpu has no node or no ancestor of
 * that type exists.
 * NOTE(review): listing truncated — the level_type parameter line and
 * the returns of the empty mask are not visible here.
 */
490 get_cpumask_from_level(int cpuid,
496 CPUMASK_ASSBIT(mask, cpuid);
498 KASSERT(cpu_root_node != NULL, ("cpu_root_node isn't initialized"));
500 node = get_cpu_node_by_cpumask(cpu_root_node, mask);
503 CPUMASK_ASSZERO(mask);	/* no node for this cpu -> empty result */
507 while (node != NULL) {
508 if (node->type == level_type) {
509 return node->members;
511 node = node->parent_node;
513 CPUMASK_ASSZERO(mask);	/* walked off the root -> empty result */
/*
 * Recursive helper: finds the CHIP_LEVEL node whose chip id (taken
 * from its highest-numbered member cpu) equals chip_id.  Non-chip
 * nodes recurse into their children.
 * NOTE(review): listing truncated — the recursion's second argument
 * and the return statements are not visible here.
 */
518 static const cpu_node_t *
519 get_cpu_node_by_chipid2(const cpu_node_t *node, int chip_id)
523 if (node->type != CHIP_LEVEL) {
524 const cpu_node_t *ret = NULL;
527 for (i = 0; i < node->child_no; ++i) {
528 ret = get_cpu_node_by_chipid2(node->child_node[i],
536 cpuid = BSRCPUMASK(node->members);
537 if (get_chip_ID(cpuid) == chip_id)
/* Public wrapper: look up a CHIP_LEVEL node by chip id from the root. */
543 get_cpu_node_by_chipid(int chip_id)
545 KASSERT(cpu_root_node != NULL, ("cpu_root_node isn't initialized"));
546 return get_cpu_node_by_chipid2(cpu_root_node, chip_id);
549 /* init pcpu_sysctl structure info */
/*
 * Allocates the pcpu_sysctl[] array and fills in, per cpu: the cpuN
 * name, the physical (chip) and core sibling strings rendered via
 * sbuf, and the physical/core/ht ids.  Also accumulates the global
 * cpu_topology_{ht,core,phys}_ids maxima and normalizes physical ids
 * to start at 0 for the VM system.
 * NOTE(review): listing truncated — min_id/max_id declarations, sbuf
 * finish calls and several assignments are not visible here.
 */
551 init_pcpu_topology_sysctl(int assumed_ncpus)
560 pcpu_sysctl = kmalloc(sizeof(*pcpu_sysctl) * MAXCPU, M_PCPUSYS,
563 for (i = 0; i < assumed_ncpus; i++) {
564 sbuf_new(&sb, pcpu_sysctl[i].cpu_name,
565 sizeof(pcpu_sysctl[i].cpu_name), SBUF_FIXEDLEN);
566 sbuf_printf(&sb,"cpu%d", i);
570 /* Get physical siblings */
571 mask = get_cpumask_from_level(i, CHIP_LEVEL);
572 if (CPUMASK_TESTZERO(mask)) {	/* no chip level for this cpu */
573 pcpu_sysctl[i].physical_id = INVALID_ID;
577 sbuf_new(&sb, pcpu_sysctl[i].physical_siblings,
578 sizeof(pcpu_sysctl[i].physical_siblings), SBUF_FIXEDLEN);
579 sbuf_print_cpuset(&sb, &mask);
583 phys_id = get_chip_ID(i);
584 pcpu_sysctl[i].physical_id = phys_id;
/* track the smallest/largest chip id seen for normalization below */
585 if (min_id < 0 || min_id > phys_id)
587 if (max_id < 0 || max_id < phys_id)
590 /* Get core siblings */
591 mask = get_cpumask_from_level(i, CORE_LEVEL);
592 if (CPUMASK_TESTZERO(mask)) {	/* no core level for this cpu */
593 pcpu_sysctl[i].core_id = INVALID_ID;
597 sbuf_new(&sb, pcpu_sysctl[i].core_siblings,
598 sizeof(pcpu_sysctl[i].core_siblings), SBUF_FIXEDLEN);
599 sbuf_print_cpuset(&sb, &mask);
603 pcpu_sysctl[i].core_id = get_core_number_within_chip(i);
604 if (cpu_topology_core_ids < pcpu_sysctl[i].core_id + 1)
605 cpu_topology_core_ids = pcpu_sysctl[i].core_id + 1;
607 pcpu_sysctl[i].ht_id = get_logical_CPU_number_within_core(i);
608 if (cpu_topology_ht_ids < pcpu_sysctl[i].ht_id + 1)
609 cpu_topology_ht_ids = pcpu_sysctl[i].ht_id + 1;
613 * Normalize physical ids so they can be used by the VM system.
614 * Some systems number starting at 0 others number starting at 1.
616 cpu_topology_phys_ids = max_id - min_id + 1;
617 if (cpu_topology_phys_ids <= 0) /* don't crash */
618 cpu_topology_phys_ids = 1;
619 for (i = 0; i < assumed_ncpus; i++) {
620 pcpu_sysctl[i].physical_id %= cpu_topology_phys_ids;
624 /* Build SYSCTL structure for revealing
625 * the CPU Topology to user-space.
/*
 * Creates the hw.cpu_topology sysctl tree: the "tree",
 * "level_description" and "members" entries, then one cpuN subtree per
 * cpu with physical_id/physical_siblings and core_id/core_siblings
 * (each skipped when the corresponding id is INVALID_ID).
 * NOTE(review): listing truncated — sbuf finish calls, some macro
 * argument lines and the closing braces are not visible here.
 */
628 build_sysctl_cpu_topology(int assumed_ncpus)
633 /* SYSCTL new leaf for "cpu_topology" */
634 sysctl_ctx_init(&cpu_topology_sysctl_ctx);
635 cpu_topology_sysctl_tree = SYSCTL_ADD_NODE(&cpu_topology_sysctl_ctx,
636 SYSCTL_STATIC_CHILDREN(_hw),
641 /* SYSCTL cpu_topology "tree" entry */
642 SYSCTL_ADD_PROC(&cpu_topology_sysctl_ctx,
643 SYSCTL_CHILDREN(cpu_topology_sysctl_tree),
644 OID_AUTO, "tree", CTLTYPE_STRING | CTLFLAG_RD,
645 NULL, 0, print_cpu_topology_tree_sysctl, "A",
646 "Tree print of CPU topology");
648 /* SYSCTL cpu_topology "level_description" entry */
649 SYSCTL_ADD_PROC(&cpu_topology_sysctl_ctx,
650 SYSCTL_CHILDREN(cpu_topology_sysctl_tree),
651 OID_AUTO, "level_description", CTLTYPE_STRING | CTLFLAG_RD,
652 NULL, 0, print_cpu_topology_level_description_sysctl, "A",
653 "Level description of CPU topology");
655 /* SYSCTL cpu_topology "members" entry */
656 sbuf_new(&sb, cpu_topology_members,
657 sizeof(cpu_topology_members), SBUF_FIXEDLEN);
658 sbuf_print_cpuset(&sb, &cpu_root_node->members);
661 SYSCTL_ADD_STRING(&cpu_topology_sysctl_ctx,
662 SYSCTL_CHILDREN(cpu_topology_sysctl_tree),
663 OID_AUTO, "members", CTLFLAG_RD,
664 cpu_topology_members, 0,
665 "Members of the CPU Topology");
667 /* SYSCTL per_cpu info */
668 for (i = 0; i < assumed_ncpus; i++) {
669 /* New leaf : hw.cpu_topology.cpux */
670 sysctl_ctx_init(&pcpu_sysctl[i].sysctl_ctx);
671 pcpu_sysctl[i].sysctl_tree = SYSCTL_ADD_NODE(&pcpu_sysctl[i].sysctl_ctx,
672 SYSCTL_CHILDREN(cpu_topology_sysctl_tree),
674 pcpu_sysctl[i].cpu_name,
677 /* Check if the physical_id found is valid */
678 if (pcpu_sysctl[i].physical_id == INVALID_ID) {
682 /* Add physical id info */
683 SYSCTL_ADD_INT(&pcpu_sysctl[i].sysctl_ctx,
684 SYSCTL_CHILDREN(pcpu_sysctl[i].sysctl_tree),
685 OID_AUTO, "physical_id", CTLFLAG_RD,
686 &pcpu_sysctl[i].physical_id, 0,
689 /* Add physical siblings */
690 SYSCTL_ADD_STRING(&pcpu_sysctl[i].sysctl_ctx,
691 SYSCTL_CHILDREN(pcpu_sysctl[i].sysctl_tree),
692 OID_AUTO, "physical_siblings", CTLFLAG_RD,
693 pcpu_sysctl[i].physical_siblings, 0,
694 "Physical siblings");
696 /* Check if the core_id found is valid */
697 if (pcpu_sysctl[i].core_id == INVALID_ID) {
701 /* Add core id info */
702 SYSCTL_ADD_INT(&pcpu_sysctl[i].sysctl_ctx,
703 SYSCTL_CHILDREN(pcpu_sysctl[i].sysctl_tree),
704 OID_AUTO, "core_id", CTLFLAG_RD,
705 &pcpu_sysctl[i].core_id, 0,
708 /*Add core siblings */
709 SYSCTL_ADD_STRING(&pcpu_sysctl[i].sysctl_ctx,
710 SYSCTL_CHILDREN(pcpu_sysctl[i].sysctl_tree),
711 OID_AUTO, "core_siblings", CTLFLAG_RD,
712 pcpu_sysctl[i].core_siblings, 0,
/*
 * Renders a cpumask as "cpus(a, b-c, ...) " into the given sbuf,
 * coalescing runs of consecutive cpu ids into "b-e" ranges; single
 * cpus print as "%d".  The second emission pass flushes the final
 * pending run after the loop.
 * NOTE(review): listing truncated — the run-tracking variables and
 * the branch conditions between the prints are not visible here.
 */
719 sbuf_print_cpuset(struct sbuf *sb, cpumask_t *mask)
726 sbuf_printf(sb, "cpus(");
727 CPUSET_FOREACH(i, *mask) {
738 sbuf_printf(sb, ", ");
740 sbuf_printf(sb, "%d", b);
742 sbuf_printf(sb, "%d-%d", b, e - 1);
749 sbuf_printf(sb, ", ");
752 sbuf_printf(sb, "%d", b);
754 sbuf_printf(sb, "%d-%d", b, e - 1);
757 sbuf_printf(sb, ") ");
/*
 * Cached-id accessors backed by pcpu_sysctl[]; valid only after
 * init_pcpu_topology_sysctl() has run.
 */
/* Logical thread id of 'cpuid' within its core. */
761 get_cpu_ht_id(int cpuid)
764 return(pcpu_sysctl[cpuid].ht_id);
/* Core id of 'cpuid' within its chip. */
769 get_cpu_core_id(int cpuid)
772 return(pcpu_sysctl[cpuid].core_id);
/* Normalized physical package id of 'cpuid'. */
777 get_cpu_phys_id(int cpuid)
780 return(pcpu_sysctl[cpuid].physical_id);
785 * Returns the highest amount of memory attached to any single node.
786 * Returns 0 if the system is not NUMA or only has one node.
788 * This function is used by the scheduler.
/*
 * Multi-node check: only scans when the root is PACKAGE_LEVEL and has
 * at least two children (child_node[1] non-NULL); each child's
 * phys_mem competes for the maximum.
 */
791 get_highest_node_memory(void)
795 if (cpu_root_node && cpu_root_node->type == PACKAGE_LEVEL &&
796 cpu_root_node->child_node[1]) {
800 for (i = 0 ; i < MAXCPU && cpu_root_node->child_node[i]; ++i) {
801 cpup = cpu_root_node->child_node[i];
802 if (highest < cpup->phys_mem)
803 highest = cpup->phys_mem;
811 /* Build the CPU Topology and SYSCTL Topology tree */
/*
 * Boot-time entry point (SI_BOOT2_CPU_TOPOLOGY): sizes the topology
 * from naps + 1 (BSP plus application processors), builds the tree,
 * then the per-cpu info and the sysctl hierarchy, in that order.
 */
813 init_cpu_topology(void)
817 assumed_ncpus = naps + 1;
819 build_cpu_topology(assumed_ncpus);
820 init_pcpu_topology_sysctl(assumed_ncpus);
821 build_sysctl_cpu_topology(assumed_ncpus);
823 SYSINIT(cpu_topology, SI_BOOT2_CPU_TOPOLOGY, SI_ORDER_FIRST,
824 init_cpu_topology, NULL);