powerd: Rework cpu and cpu power domain selection
[dragonfly.git] / usr.sbin / powerd / powerd.c
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
/*
 * The powerd daemon:
 * - Monitors the cpu load and adjusts cpu and cpu power domain
 *   performance accordingly.
 * - Monitors battery life.  Raises alerts and shuts down the machine
 *   if battery life runs low.
 */
42
43 #define _KERNEL_STRUCTURES
44 #include <sys/types.h>
45 #include <sys/sysctl.h>
46 #include <sys/kinfo.h>
47 #include <sys/file.h>
48 #include <sys/queue.h>
49 #include <sys/soundcard.h>
50 #include <sys/time.h>
51 #include <machine/cpufunc.h>
52 #include <err.h>
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <unistd.h>
56 #include <string.h>
57 #include <syslog.h>
58
59 #include "alert1.h"
60
#define MAXDOM          MAXCPU  /* worst case, 1 cpu per domain */

/* Capacity of the frequency array read from hw.acpi.cpu.px_dom%d.avail */
#define MAXFREQ         64
64
/*
 * One cpu power domain, as discovered by acpi_get_cpupwrdom() through
 * the hw.acpi.cpu.px_dom%d.* sysctls.
 */
struct cpu_pwrdom {
        /* List linkage; not referenced in the part of the file shown here */
        TAILQ_ENTRY(cpu_pwrdom) dom_link;
        /* The %d in hw.acpi.cpu.px_dom%d; also the index in cpu_pwrdomain[] */
        int                     dom_id;
        /* Number of "cpuN" entries parsed from the .members sysctl */
        int                     dom_ncpus;
        /* Bit N is set for every "cpuN" listed in the .members sysctl */
        cpumask_t               dom_cpumask;
};
71
/*
 * Load tracking state.  There is one instance per cpu (pcpu_state[])
 * plus one instance for the sum over all cpus (global_cpu_state).
 */
struct cpu_state {
        /* Load over the last poll interval; written by get_cputime() */
        double                  cpu_qavg;
        double                  cpu_uavg;       /* used for speeding up */
        double                  cpu_davg;       /* used for slowing down */
        /* Result of the previous get_nstate() call; stored by mon_perf() */
        int                     cpu_limit;
        /* Upper bound for the value returned by get_nstate() */
        int                     cpu_count;
        /* Label printed in debug output ("cpuN" or "global") */
        char                    cpu_name[8];
};
80
/* Command line / startup helpers */
static void usage(void);
static void get_ncpus(void);

/* usched cpumask */
static void get_uschedcpus(void);
static void set_uschedcpus(void);

/* perfbias(4) */
static int has_perfbias(void);
static void set_perfbias(int, int);

/* acpi(4) P-state */
static void acpi_getcpufreq_str(int, int *, int *);
static int acpi_getcpufreq_bin(int, int *, int *);
static void acpi_get_cpufreq(int, int *, int *);
static void acpi_set_cpufreq(int, int);
static int acpi_get_cpupwrdom(void);

/* Performance monitoring */
static void init_perf(void);
static void mon_perf(double);
static void adj_perf(cpumask_t, cpumask_t);
static void adj_cpu_pwrdom(int, int);
static void adj_cpu_perf(int, int);
static void get_cputime(double);
static int get_nstate(struct cpu_state *, double);
static void add_spare_cpus(const cpumask_t, int);
static void restore_perf(void);

/* Battery monitoring */
static int has_battery(void);
static int mon_battery(void);
static void low_battery_alert(int);

/* Runtime states for performance monitoring */
/* Sum of the per-cpu get_nstate() results from the previous mon_perf() */
static int global_pcpu_limit;
static struct cpu_state pcpu_state[MAXCPU];
static struct cpu_state global_cpu_state;
static cpumask_t cpu_used;              /* cpus w/ high perf */
static cpumask_t cpu_pwrdom_used;       /* cpu power domains w/ high perf */
static cpumask_t usched_cpu_used;       /* cpus for usched */

/* Constants */
static cpumask_t cpu_pwrdom_mask;       /* usable cpu power domains */
static int cpu2pwrdom[MAXCPU];          /* cpu to cpu power domain map */
static struct cpu_pwrdom *cpu_pwrdomain[MAXDOM];
static int NCpus;                       /* # of cpus */

static int DebugOpt;            /* -d: stay in foreground, print debug info */
static int TurboOpt = 1;        /* -t clears it: skip the turbo frequency */
static int PowerFd;             /* fd of /var/run/powerd.pid, <0 if not open */
static int Hysteresis = 10;     /* percentage */
static double TriggerUp = 0.25; /* single-cpu load to force max freq */
static double TriggerDown;      /* load per cpu to force the min freq */
static int HasPerfbias = 1;     /* cleared by -e or if perfbias(4) is absent */

/* Set by sigintr(); terminates the monitoring loop in main() */
static volatile int stopped;

/* Battery life monitoring */
static int BatLifeMin = 2;      /* shutdown the box, if low on battery life */
static struct timespec BatLifePrevT;    /* time of the previous battery poll */
static int BatLifePollIntvl = 5; /* unit: sec */
static struct timespec BatShutdownStartT; /* when the linger period began */
/* <0: not lingering, >0: linger seconds in effect, 0: linger expired */
static int BatShutdownLinger = -1;
static int BatShutdownLingerSet = 60; /* unit: sec */
static int BatShutdownLingerCnt;        /* # of low battery alerts issued */
static int BatShutdownAudioAlert = 1;   /* -Q clears it: no /dev/dsp alert */

static void sigintr(int signo);
150
151 int
152 main(int ac, char **av)
153 {
154         double srt;
155         double pollrate;
156         int ch;
157         char buf[64];
158         int monbat;
159
160         srt = 8.0;      /* time for samples - 8 seconds */
161         pollrate = 1.0; /* polling rate in seconds */
162
163         while ((ch = getopt(ac, av, "dep:r:tu:B:L:P:QT:")) != -1) {
164                 switch(ch) {
165                 case 'd':
166                         DebugOpt = 1;
167                         break;
168                 case 'e':
169                         HasPerfbias = 0;
170                         break;
171                 case 'p':
172                         Hysteresis = (int)strtol(optarg, NULL, 10);
173                         break;
174                 case 'r':
175                         pollrate = strtod(optarg, NULL);
176                         break;
177                 case 't':
178                         TurboOpt = 0;
179                         break;
180                 case 'u':
181                         TriggerUp = (double)strtol(optarg, NULL, 10) / 100;
182                         break;
183                 case 'B':
184                         BatLifeMin = strtol(optarg, NULL, 10);
185                         break;
186                 case 'L':
187                         BatShutdownLingerSet = strtol(optarg, NULL, 10);
188                         if (BatShutdownLingerSet < 0)
189                                 BatShutdownLingerSet = 0;
190                         break;
191                 case 'P':
192                         BatLifePollIntvl = strtol(optarg, NULL, 10);
193                         break;
194                 case 'Q':
195                         BatShutdownAudioAlert = 0;
196                         break;
197                 case 'T':
198                         srt = strtod(optarg, NULL);
199                         break;
200                 default:
201                         usage();
202                         /* NOT REACHED */
203                 }
204         }
205         ac -= optind;
206         av += optind;
207
208         setlinebuf(stdout);
209
210         /* Get number of cpus */
211         get_ncpus();
212
213         if (0 > Hysteresis || Hysteresis > 99) {
214                 fprintf(stderr, "Invalid hysteresis value\n");
215                 exit(1);
216         }
217
218         if (0 > TriggerUp || TriggerUp > 1) {
219                 fprintf(stderr, "Invalid load limit value\n");
220                 exit(1);
221         }
222
223         TriggerDown = TriggerUp - (TriggerUp * (double) Hysteresis / 100);
224
225         /*
226          * Make sure powerd is not already running.
227          */
228         PowerFd = open("/var/run/powerd.pid", O_CREAT|O_RDWR, 0644);
229         if (PowerFd < 0) {
230                 fprintf(stderr,
231                         "Cannot create /var/run/powerd.pid, "
232                         "continuing anyway\n");
233         } else {
234                 if (flock(PowerFd, LOCK_EX|LOCK_NB) < 0) {
235                         fprintf(stderr, "powerd is already running\n");
236                         exit(1);
237                 }
238         }
239
240         /*
241          * Demonize and set pid
242          */
243         if (DebugOpt == 0) {
244                 daemon(0, 0);
245                 openlog("powerd", LOG_CONS | LOG_PID, LOG_DAEMON);
246         }
247
248         if (PowerFd >= 0) {
249                 ftruncate(PowerFd, 0);
250                 snprintf(buf, sizeof(buf), "%d\n", (int)getpid());
251                 write(PowerFd, buf, strlen(buf));
252         }
253
254         /* Do we need to monitor battery life? */
255         if (BatLifePollIntvl <= 0)
256                 monbat = 0;
257         else
258                 monbat = has_battery();
259
260         /* Do we have perfbias(4)? */
261         if (HasPerfbias)
262                 HasPerfbias = has_perfbias();
263
264         /*
265          * Wait hw.acpi.cpu.px_dom* sysctl to be created by kernel.
266          *
267          * Since hw.acpi.cpu.px_dom* creation is queued into ACPI
268          * taskqueue and ACPI taskqueue is shared across various
269          * ACPI modules, any delay in other modules may cause
270          * hw.acpi.cpu.px_dom* to be created at quite a later time
271          * (e.g. cmbat module's task could take quite a lot of time).
272          */
273         for (;;) {
274                 /* Prime delta cputime calculation. */
275                 get_cputime(pollrate);
276
277                 /* Wait for all cpus to appear */
278                 if (acpi_get_cpupwrdom())
279                         break;
280                 usleep((int)(pollrate * 1000000.0));
281         }
282
283         /*
284          * Catch some signals so that max performance could be restored.
285          */
286         signal(SIGINT, sigintr);
287         signal(SIGTERM, sigintr);
288
289         /* Initialize performance states */
290         init_perf();
291
292         srt = srt / pollrate;   /* convert to sample count */
293         if (DebugOpt)
294                 printf("samples for downgrading: %5.2f\n", srt);
295
296         /*
297          * Monitoring loop
298          */
299         while (!stopped) {
300                 /*
301                  * Monitor performance
302                  */
303                 get_cputime(pollrate);
304                 mon_perf(srt);
305
306                 /*
307                  * Monitor battery
308                  */
309                 if (monbat)
310                         monbat = mon_battery();
311
312                 usleep((int)(pollrate * 1000000.0));
313         }
314
315         /*
316          * Set to maximum performance if killed.
317          */
318         syslog(LOG_INFO, "killed, setting max and exiting");
319         restore_perf();
320
321         exit(0);
322 }
323
/*
 * SIGINT/SIGTERM handler: only raises the 'stopped' flag, which makes
 * the monitoring loop in main() terminate.
 */
static void
sigintr(int signo __unused)
{
        stopped = 1;
}
329
330 /*
331  * Figure out the cpu power domains.
332  */
333 static int
334 acpi_get_cpupwrdom(void)
335 {
336         struct cpu_pwrdom *dom;
337         cpumask_t pwrdom_mask;
338         char buf[64];
339         char members[1024];
340         char *str;
341         size_t msize;
342         int n, i, ncpu = 0, dom_id;
343
344         memset(cpu2pwrdom, 0, sizeof(cpu2pwrdom));
345         memset(cpu_pwrdomain, 0, sizeof(cpu_pwrdomain));
346         CPUMASK_ASSZERO(cpu_pwrdom_mask);
347
348         for (i = 0; i < MAXDOM; ++i) {
349                 snprintf(buf, sizeof(buf),
350                          "hw.acpi.cpu.px_dom%d.available", i);
351                 if (sysctlbyname(buf, NULL, NULL, NULL, 0) < 0)
352                         continue;
353
354                 dom = calloc(1, sizeof(*dom));
355                 dom->dom_id = i;
356
357                 if (cpu_pwrdomain[i] != NULL) {
358                         fprintf(stderr, "cpu power domain %d exists\n", i);
359                         exit(1);
360                 }
361                 cpu_pwrdomain[i] = dom;
362                 CPUMASK_ORBIT(cpu_pwrdom_mask, i);
363         }
364         pwrdom_mask = cpu_pwrdom_mask;
365
366         while (CPUMASK_TESTNZERO(pwrdom_mask)) {
367                 dom_id = BSFCPUMASK(pwrdom_mask);
368                 CPUMASK_NANDBIT(pwrdom_mask, dom_id);
369                 dom = cpu_pwrdomain[dom_id];
370
371                 CPUMASK_ASSZERO(dom->dom_cpumask);
372
373                 snprintf(buf, sizeof(buf),
374                          "hw.acpi.cpu.px_dom%d.members", dom->dom_id);
375                 msize = sizeof(members);
376                 if (sysctlbyname(buf, members, &msize, NULL, 0) < 0) {
377                         cpu_pwrdomain[dom_id] = NULL;
378                         free(dom);
379                         continue;
380                 }
381
382                 members[msize] = 0;
383                 for (str = strtok(members, " "); str; str = strtok(NULL, " ")) {
384                         n = -1;
385                         sscanf(str, "cpu%d", &n);
386                         if (n >= 0) {
387                                 ++ncpu;
388                                 ++dom->dom_ncpus;
389                                 CPUMASK_ORBIT(dom->dom_cpumask, n);
390                                 cpu2pwrdom[n] = dom->dom_id;
391                         }
392                 }
393                 if (dom->dom_ncpus == 0) {
394                         cpu_pwrdomain[dom_id] = NULL;
395                         free(dom);
396                         continue;
397                 }
398                 if (DebugOpt) {
399                         printf("dom%d cpumask: ", dom->dom_id);
400                         for (i = 0; i < (int)NELEM(dom->dom_cpumask.ary); ++i) {
401                                 printf("%jx ",
402                                     (uintmax_t)dom->dom_cpumask.ary[i]);
403                         }
404                         printf("\n");
405                 }
406         }
407
408         if (ncpu != NCpus) {
409                 if (DebugOpt)
410                         printf("Found %d cpus, expecting %d\n", ncpu, NCpus);
411
412                 pwrdom_mask = cpu_pwrdom_mask;
413                 while (CPUMASK_TESTNZERO(pwrdom_mask)) {
414                         dom_id = BSFCPUMASK(pwrdom_mask);
415                         CPUMASK_NANDBIT(pwrdom_mask, dom_id);
416                         dom = cpu_pwrdomain[dom_id];
417                         if (dom != NULL)
418                                 free(dom);
419                 }
420                 return 0;
421         }
422         return 1;
423 }
424
425 /*
426  * Save per-cpu load and sum of per-cpu load.
427  */
428 static void
429 get_cputime(double pollrate)
430 {
431         static struct kinfo_cputime ocpu_time[MAXCPU];
432         static struct kinfo_cputime ncpu_time[MAXCPU];
433         size_t slen;
434         int ncpu;
435         int cpu;
436         uint64_t delta;
437
438         bcopy(ncpu_time, ocpu_time, sizeof(struct kinfo_cputime) * NCpus);
439
440         slen = sizeof(ncpu_time);
441         if (sysctlbyname("kern.cputime", &ncpu_time, &slen, NULL, 0) < 0) {
442                 fprintf(stderr, "kern.cputime sysctl not available\n");
443                 exit(1);
444         }
445         ncpu = slen / sizeof(ncpu_time[0]);
446
447         delta = 0;
448         for (cpu = 0; cpu < ncpu; ++cpu) {
449                 uint64_t d;
450
451                 d = (ncpu_time[cpu].cp_user + ncpu_time[cpu].cp_sys +
452                      ncpu_time[cpu].cp_nice + ncpu_time[cpu].cp_intr) -
453                     (ocpu_time[cpu].cp_user + ocpu_time[cpu].cp_sys +
454                      ocpu_time[cpu].cp_nice + ocpu_time[cpu].cp_intr);
455                 pcpu_state[cpu].cpu_qavg = (double)d / (pollrate * 1000000.0);
456
457                 delta += d;
458         }
459         global_cpu_state.cpu_qavg = (double)delta / (pollrate * 1000000.0);
460 }
461
462 static void
463 acpi_getcpufreq_str(int dom_id, int *highest0, int *lowest0)
464 {
465         char buf[256], sysid[64];
466         size_t buflen;
467         char *ptr;
468         int v, highest, lowest;
469
470         /*
471          * Retrieve availability list
472          */
473         snprintf(sysid, sizeof(sysid), "hw.acpi.cpu.px_dom%d.available",
474             dom_id);
475         buflen = sizeof(buf) - 1;
476         if (sysctlbyname(sysid, buf, &buflen, NULL, 0) < 0)
477                 return;
478         buf[buflen] = 0;
479
480         /*
481          * Parse out the highest and lowest cpu frequencies
482          */
483         ptr = buf;
484         highest = lowest = 0;
485         while (ptr && (v = strtol(ptr, &ptr, 10)) > 0) {
486                 if (lowest == 0 || lowest > v)
487                         lowest = v;
488                 if (highest == 0 || highest < v)
489                         highest = v;
490                 /* 
491                  * Detect turbo mode
492                  */
493                 if (!TurboOpt && highest - v == 1)
494                         highest = v;
495         }
496
497         *highest0 = highest;
498         *lowest0 = lowest;
499 }
500
501 static int
502 acpi_getcpufreq_bin(int dom_id, int *highest0, int *lowest0)
503 {
504         char sysid[64];
505         int freq[MAXFREQ];
506         size_t freqlen;
507         int freqcnt;
508
509         /*
510          * Retrieve availability list
511          */
512         snprintf(sysid, sizeof(sysid), "hw.acpi.cpu.px_dom%d.avail", dom_id);
513         freqlen = sizeof(freq);
514         if (sysctlbyname(sysid, freq, &freqlen, NULL, 0) < 0)
515                 return 0;
516
517         freqcnt = freqlen / sizeof(freq[0]);
518         if (freqcnt == 0)
519                 return 0;
520
521         *lowest0 = freq[freqcnt - 1];
522
523         *highest0 = freq[0];
524         if (!TurboOpt && freqcnt > 1 && freq[0] - freq[1] == 1)
525                 *highest0 = freq[1];
526         return 1;
527 }
528
/*
 * Report the highest and lowest frequency of a cpu power domain.
 * The binary sysctl is tried first, the string sysctl is the fallback.
 * Both outputs are 0 if neither source yields a value.
 */
static void
acpi_get_cpufreq(int dom_id, int *highest, int *lowest)
{
        *lowest = 0;
        *highest = 0;

        if (!acpi_getcpufreq_bin(dom_id, highest, lowest))
                acpi_getcpufreq_str(dom_id, highest, lowest);
}
539
/*
 * Print the command line synopsis to stderr and exit with status 1.
 */
static void
usage(void)
{
        static const char synopsis[] =
            "usage: powerd [-dt] [-p hysteresis] [-u trigger_up] "
            "[-T sample_interval] [-r poll_interval] "
            "[-B min_battery_life] [-L low_battery_linger] "
            "[-P battery_poll_interval] [-Q] [-e]\n";

        fputs(synopsis, stderr);
        exit(1);
}
550
/*
 * Fallback for systems whose headers do not provide timespecsub():
 * computes *vvp -= *uvp in place, borrowing one second when the
 * nanosecond field becomes negative.
 */
#ifndef timespecsub
#define timespecsub(vvp, uvp)                                           \
        do {                                                            \
                (vvp)->tv_sec -= (uvp)->tv_sec;                         \
                (vvp)->tv_nsec -= (uvp)->tv_nsec;                       \
                if ((vvp)->tv_nsec < 0) {                               \
                        (vvp)->tv_sec--;                                \
                        (vvp)->tv_nsec += 1000000000;                   \
                }                                                       \
        } while (0)
#endif

/*
 * Longest time a battery related sysctl may take in has_battery()
 * before battery monitoring is given up.
 */
#define BAT_SYSCTL_TIME_MAX     50000000 /* unit: nanosecond */
564
565 static int
566 has_battery(void)
567 {
568         struct timespec s, e;
569         size_t len;
570         int val;
571
572         clock_gettime(CLOCK_MONOTONIC_FAST, &s);
573         BatLifePrevT = s;
574
575         len = sizeof(val);
576         if (sysctlbyname("hw.acpi.acline", &val, &len, NULL, 0) < 0) {
577                 /* No AC line information */
578                 return 0;
579         }
580         clock_gettime(CLOCK_MONOTONIC_FAST, &e);
581
582         timespecsub(&e, &s);
583         if (e.tv_sec > 0 || e.tv_nsec > BAT_SYSCTL_TIME_MAX) {
584                 /* hw.acpi.acline takes to long to be useful */
585                 syslog(LOG_NOTICE, "hw.acpi.acline takes too long");
586                 return 0;
587         }
588
589         clock_gettime(CLOCK_MONOTONIC_FAST, &s);
590         len = sizeof(val);
591         if (sysctlbyname("hw.acpi.battery.life", &val, &len, NULL, 0) < 0) {
592                 /* No battery life */
593                 return 0;
594         }
595         clock_gettime(CLOCK_MONOTONIC_FAST, &e);
596
597         timespecsub(&e, &s);
598         if (e.tv_sec > 0 || e.tv_nsec > BAT_SYSCTL_TIME_MAX) {
599                 /* hw.acpi.battery.life takes to long to be useful */
600                 syslog(LOG_NOTICE, "hw.acpi.battery.life takes too long");
601                 return 0;
602         }
603         return 1;
604 }
605
606 static void
607 low_battery_alert(int life)
608 {
609         int fmt, stereo, freq;
610         int fd;
611
612         syslog(LOG_ALERT, "low battery life %d%%, please plugin AC line, #%d",
613             life, BatShutdownLingerCnt);
614         ++BatShutdownLingerCnt;
615
616         if (!BatShutdownAudioAlert)
617                 return;
618
619         fd = open("/dev/dsp", O_WRONLY);
620         if (fd < 0)
621                 return;
622
623         fmt = AFMT_S16_LE;
624         if (ioctl(fd, SNDCTL_DSP_SETFMT, &fmt, sizeof(fmt)) < 0)
625                 goto done;
626
627         stereo = 0;
628         if (ioctl(fd, SNDCTL_DSP_STEREO, &stereo, sizeof(stereo)) < 0)
629                 goto done;
630
631         freq = 44100;
632         if (ioctl(fd, SNDCTL_DSP_SPEED, &freq, sizeof(freq)) < 0)
633                 goto done;
634
635         write(fd, alert1, sizeof(alert1));
636         write(fd, alert1, sizeof(alert1));
637
638 done:
639         close(fd);
640 }
641
642 static int
643 mon_battery(void)
644 {
645         struct timespec cur, ts;
646         int acline, life;
647         size_t len;
648
649         clock_gettime(CLOCK_MONOTONIC_FAST, &cur);
650         ts = cur;
651         timespecsub(&ts, &BatLifePrevT);
652         if (ts.tv_sec < BatLifePollIntvl)
653                 return 1;
654         BatLifePrevT = cur;
655
656         len = sizeof(acline);
657         if (sysctlbyname("hw.acpi.acline", &acline, &len, NULL, 0) < 0)
658                 return 1;
659         if (acline) {
660                 BatShutdownLinger = -1;
661                 BatShutdownLingerCnt = 0;
662                 return 1;
663         }
664
665         len = sizeof(life);
666         if (sysctlbyname("hw.acpi.battery.life", &life, &len, NULL, 0) < 0)
667                 return 1;
668
669         if (BatShutdownLinger > 0) {
670                 ts = cur;
671                 timespecsub(&ts, &BatShutdownStartT);
672                 if (ts.tv_sec > BatShutdownLinger)
673                         BatShutdownLinger = 0;
674         }
675
676         if (life <= BatLifeMin) {
677                 if (BatShutdownLinger == 0 || BatShutdownLingerSet == 0) {
678                         syslog(LOG_ALERT, "low battery life %d%%, "
679                             "shutting down", life);
680                         if (vfork() == 0)
681                                 execlp("poweroff", "poweroff", NULL);
682                         return 0;
683                 } else if (BatShutdownLinger < 0) {
684                         BatShutdownLinger = BatShutdownLingerSet;
685                         BatShutdownStartT = cur;
686                 }
687                 low_battery_alert(life);
688         }
689         return 1;
690 }
691
692 static void
693 get_ncpus(void)
694 {
695         size_t slen;
696
697         slen = sizeof(NCpus);
698         if (sysctlbyname("hw.ncpu", &NCpus, &slen, NULL, 0) < 0)
699                 err(1, "sysctlbyname hw.ncpu failed");
700         if (DebugOpt)
701                 printf("hw.ncpu %d\n", NCpus);
702 }
703
704 static void
705 get_uschedcpus(void)
706 {
707         size_t slen;
708
709         slen = sizeof(usched_cpu_used);
710         if (sysctlbyname("kern.usched_global_cpumask", &usched_cpu_used, &slen,
711             NULL, 0) < 0)
712                 err(1, "sysctlbyname kern.usched_global_cpumask failed");
713         if (DebugOpt) {
714                 int i;
715
716                 printf("usched cpumask was: ");
717                 for (i = 0; i < (int)NELEM(usched_cpu_used.ary); ++i)
718                         printf("%jx ", (uintmax_t)usched_cpu_used.ary[i]);
719                 printf("\n");
720         }
721 }
722
723 static void
724 set_uschedcpus(void)
725 {
726         if (DebugOpt) {
727                 int i;
728
729                 printf("usched cpumask: ");
730                 for (i = 0; i < (int)NELEM(usched_cpu_used.ary); ++i) {
731                         printf("%jx ",
732                             (uintmax_t)usched_cpu_used.ary[i]);
733                 }
734                 printf("\n");
735         }
736         sysctlbyname("kern.usched_global_cpumask", NULL, 0,
737             &usched_cpu_used, sizeof(usched_cpu_used));
738 }
739
740 static int
741 has_perfbias(void)
742 {
743         size_t len;
744         int hint;
745
746         len = sizeof(hint);
747         if (sysctlbyname("machdep.perfbias0.hint", &hint, &len, NULL, 0) < 0)
748                 return 0;
749         return 1;
750 }
751
752 static void
753 set_perfbias(int cpu, int inc)
754 {
755         int hint = inc ? 0 : 15;
756         char sysid[64];
757
758         if (DebugOpt)
759                 printf("cpu%d set perfbias hint %d\n", cpu, hint);
760         snprintf(sysid, sizeof(sysid), "machdep.perfbias%d.hint", cpu);
761         sysctlbyname(sysid, NULL, NULL, &hint, sizeof(hint));
762 }
763
764 static void
765 init_perf(void)
766 {
767         struct cpu_state *state;
768         int cpu;
769
770         /* Get usched cpumask */
771         get_uschedcpus();
772
773         /*
774          * Assume everything are used and are maxed out, before we
775          * start.
776          */
777
778         CPUMASK_ASSBMASK(cpu_used, NCpus);
779         cpu_pwrdom_used = cpu_pwrdom_mask;
780         global_pcpu_limit = NCpus;
781
782         for (cpu = 0; cpu < NCpus; ++cpu) {
783                 state = &pcpu_state[cpu];
784
785                 state->cpu_uavg = 0.0;
786                 state->cpu_davg = 0.0;
787                 state->cpu_limit = 1;
788                 state->cpu_count = 1;
789                 snprintf(state->cpu_name, sizeof(state->cpu_name), "cpu%d",
790                     cpu);
791         }
792
793         state = &global_cpu_state;
794         state->cpu_uavg = 0.0;
795         state->cpu_davg = 0.0;
796         state->cpu_limit = NCpus;
797         state->cpu_count = NCpus;
798         strlcpy(state->cpu_name, "global", sizeof(state->cpu_name));
799 }
800
801 static int
802 get_nstate(struct cpu_state *state, double srt)
803 {
804         int ustate, dstate, nstate;
805
806         /* speeding up */
807         state->cpu_uavg = (state->cpu_uavg * 2.0 + state->cpu_qavg) / 3.0;
808         /* slowing down */
809         state->cpu_davg = (state->cpu_davg * srt + state->cpu_qavg) / (srt + 1);
810         if (state->cpu_davg < state->cpu_uavg)
811                 state->cpu_davg = state->cpu_uavg;
812
813         ustate = state->cpu_uavg / TriggerUp;
814         if (ustate < state->cpu_limit)
815                 ustate = state->cpu_uavg / TriggerDown;
816         dstate = state->cpu_davg / TriggerUp;
817         if (dstate < state->cpu_limit)
818                 dstate = state->cpu_davg / TriggerDown;
819
820         nstate = (ustate > dstate) ? ustate : dstate;
821         if (nstate > state->cpu_count)
822                 nstate = state->cpu_count;
823
824         if (DebugOpt) {
825                 printf("%s qavg=%5.2f uavg=%5.2f davg=%5.2f "
826                     "%2d ncpus=%d\n", state->cpu_name,
827                     state->cpu_qavg, state->cpu_uavg, state->cpu_davg,
828                     state->cpu_limit, nstate);
829         }
830         return nstate;
831 }
832
833 static void
834 mon_perf(double srt)
835 {
836         cpumask_t ocpu_used, ocpu_pwrdom_used;
837         int pnstate = 0, nstate;
838         int cpu;
839
840         /*
841          * Find cpus requiring performance and their cooresponding power
842          * domains.  Save the number of cpus requiring performance in
843          * pnstate.
844          */
845         ocpu_used = cpu_used;
846         ocpu_pwrdom_used = cpu_pwrdom_used;
847
848         CPUMASK_ASSZERO(cpu_used);
849         CPUMASK_ASSZERO(cpu_pwrdom_used);
850
851         for (cpu = 0; cpu < NCpus; ++cpu) {
852                 struct cpu_state *state = &pcpu_state[cpu];
853                 int s;
854
855                 s = get_nstate(state, srt);
856                 if (s) {
857                         CPUMASK_ORBIT(cpu_used, cpu);
858                         CPUMASK_ORBIT(cpu_pwrdom_used, cpu2pwrdom[cpu]);
859                 }
860                 pnstate += s;
861
862                 state->cpu_limit = s;
863         }
864
865         /*
866          * Calculate nstate, the number of cpus we wish to run at max
867          * performance.
868          */
869         nstate = get_nstate(&global_cpu_state, srt);
870
871         if (nstate == global_cpu_state.cpu_limit &&
872             (pnstate == global_pcpu_limit || nstate > pnstate)) {
873                 /* Nothing changed; keep the sets */
874                 cpu_used = ocpu_used;
875                 cpu_pwrdom_used = ocpu_pwrdom_used;
876
877                 global_pcpu_limit = pnstate;
878                 return;
879         }
880         global_pcpu_limit = pnstate;
881
882         if (nstate > pnstate) {
883                 /*
884                  * Add spare cpus to meet global performance requirement.
885                  */
886                 add_spare_cpus(ocpu_used, nstate - pnstate);
887         }
888
889         global_cpu_state.cpu_limit = nstate;
890
891         /*
892          * Adjust cpu and cpu power domain performance
893          */
894         adj_perf(ocpu_used, ocpu_pwrdom_used);
895 }
896
897 static void
898 add_spare_cpus(const cpumask_t ocpu_used, int ncpu)
899 {
900         cpumask_t saved_pwrdom, xcpu_used;
901         int done = 0, cpu;
902
903         /*
904          * Find more cpus in the previous cpu set.
905          */
906         xcpu_used = cpu_used;
907         CPUMASK_XORMASK(xcpu_used, ocpu_used);
908         while (CPUMASK_TESTNZERO(xcpu_used)) {
909                 cpu = BSFCPUMASK(xcpu_used);
910                 CPUMASK_NANDBIT(xcpu_used, cpu);
911
912                 if (CPUMASK_TESTBIT(ocpu_used, cpu)) {
913                         CPUMASK_ORBIT(cpu_pwrdom_used, cpu2pwrdom[cpu]);
914                         CPUMASK_ORBIT(cpu_used, cpu);
915                         --ncpu;
916                         if (ncpu == 0)
917                                 return;
918                 }
919         }
920
921         /*
922          * Find more cpus in the used cpu power domains.
923          */
924         saved_pwrdom = cpu_pwrdom_used;
925 again:
926         while (CPUMASK_TESTNZERO(saved_pwrdom)) {
927                 cpumask_t unused_cpumask;
928                 int dom;
929
930                 dom = BSFCPUMASK(saved_pwrdom);
931                 CPUMASK_NANDBIT(saved_pwrdom, dom);
932
933                 unused_cpumask = cpu_pwrdomain[dom]->dom_cpumask;
934                 CPUMASK_NANDMASK(unused_cpumask, cpu_used);
935
936                 while (CPUMASK_TESTNZERO(unused_cpumask)) {
937                         cpu = BSFCPUMASK(unused_cpumask);
938                         CPUMASK_NANDBIT(unused_cpumask, cpu);
939
940                         CPUMASK_ORBIT(cpu_pwrdom_used, dom);
941                         CPUMASK_ORBIT(cpu_used, cpu);
942                         --ncpu;
943                         if (ncpu == 0)
944                                 return;
945                 }
946         }
947         if (!done) {
948                 done = 1;
949                 /*
950                  * Find more cpus in unused cpu power domains
951                  */
952                 saved_pwrdom = cpu_pwrdom_mask;
953                 CPUMASK_NANDMASK(saved_pwrdom, cpu_pwrdom_used);
954                 goto again;
955         }
956         if (DebugOpt)
957                 printf("%d cpus not found\n", ncpu);
958 }
959
960 static void
961 acpi_set_cpufreq(int dom, int inc)
962 {
963         int lowest, highest, desired;
964         char sysid[64];
965
966         acpi_get_cpufreq(dom, &highest, &lowest);
967         if (highest == 0 || lowest == 0)
968                 return;
969         desired = inc ? highest : lowest;
970
971         if (DebugOpt)
972                 printf("dom%d set frequency %d\n", dom, desired);
973         snprintf(sysid, sizeof(sysid), "hw.acpi.cpu.px_dom%d.select", dom);
974         sysctlbyname(sysid, NULL, NULL, &desired, sizeof(desired));
975 }
976
/*
 * Adjust the performance of cpu power domain dom; inc is handed
 * to acpi_set_cpufreq() unchanged (non-zero selects the highest
 * frequency there, zero the lowest).
 */
static void
adj_cpu_pwrdom(int dom, int inc)
{
        acpi_set_cpufreq(dom, inc);
}
982
983 static void
984 adj_cpu_perf(int cpu, int inc)
985 {
986         if (DebugOpt) {
987                 if (inc)
988                         printf("cpu%d increase perf\n", cpu);
989                 else
990                         printf("cpu%d decrease perf\n", cpu);
991         }
992
993         if (HasPerfbias)
994                 set_perfbias(cpu, inc);
995 }
996
997 static void
998 adj_perf(cpumask_t xcpu_used, cpumask_t xcpu_pwrdom_used)
999 {
1000         cpumask_t old_usched_used;
1001         int cpu, inc;
1002
1003         /*
1004          * Set cpus requiring performance to the userland process
1005          * scheduler.  Leave the rest of cpus unmapped.
1006          */
1007         old_usched_used = usched_cpu_used;
1008         usched_cpu_used = cpu_used;
1009         if (CPUMASK_TESTZERO(usched_cpu_used))
1010                 CPUMASK_ORBIT(usched_cpu_used, 0);
1011         if (CPUMASK_CMPMASKNEQ(usched_cpu_used, old_usched_used))
1012                 set_uschedcpus();
1013
1014         /*
1015          * Adjust per-cpu performance.
1016          */
1017         CPUMASK_XORMASK(xcpu_used, cpu_used);
1018         while (CPUMASK_TESTNZERO(xcpu_used)) {
1019                 cpu = BSFCPUMASK(xcpu_used);
1020                 CPUMASK_NANDBIT(xcpu_used, cpu);
1021
1022                 if (CPUMASK_TESTBIT(cpu_used, cpu)) {
1023                         /* Increase cpu performance */
1024                         inc = 1;
1025                 } else {
1026                         /* Decrease cpu performance */
1027                         inc = 0;
1028                 }
1029                 adj_cpu_perf(cpu, inc);
1030         }
1031
1032         /*
1033          * Adjust cpu power domain performance.  This could affect
1034          * a set of cpus.
1035          */
1036         CPUMASK_XORMASK(xcpu_pwrdom_used, cpu_pwrdom_used);
1037         while (CPUMASK_TESTNZERO(xcpu_pwrdom_used)) {
1038                 int dom;
1039
1040                 dom = BSFCPUMASK(xcpu_pwrdom_used);
1041                 CPUMASK_NANDBIT(xcpu_pwrdom_used, dom);
1042
1043                 if (CPUMASK_TESTBIT(cpu_pwrdom_used, dom)) {
1044                         /* Increase cpu power domain performance */
1045                         inc = 1;
1046                 } else {
1047                         /* Decrease cpu power domain performance */
1048                         inc = 0;
1049                 }
1050                 adj_cpu_pwrdom(dom, inc);
1051         }
1052 }
1053
1054 static void
1055 restore_perf(void)
1056 {
1057         cpumask_t ocpu_used, ocpu_pwrdom_used;
1058
1059         ocpu_used = cpu_used;
1060         ocpu_pwrdom_used = cpu_pwrdom_used;
1061
1062         /* Max out all cpus and cpu power domains performance */
1063         CPUMASK_ASSBMASK(cpu_used, NCpus);
1064         cpu_pwrdom_used = cpu_pwrdom_mask;
1065
1066         adj_perf(ocpu_used, ocpu_pwrdom_used);
1067 }