powerd: Use linked list for CPU power domain
[dragonfly.git] / usr.sbin / powerd / powerd.c
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 /*
36  * The powerd daemon monitors the cpu load and adjusts cpu frequencies
37  * via hw.acpi.cpu.px_dom*.
38  */
39
40 #define _KERNEL_STRUCTURES
41 #include <sys/types.h>
42 #include <sys/sysctl.h>
43 #include <sys/kinfo.h>
44 #include <sys/file.h>
45 #include <sys/queue.h>
46 #include <sys/soundcard.h>
47 #include <sys/time.h>
48 #include <err.h>
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <unistd.h>
52 #include <string.h>
53 #include <syslog.h>
54
55 #include "alert1.h"
56
#define MAXDOM          MAXCPU  /* worst case, 1 cpu per domain */

/*
 * One CPU power domain, i.e. one hw.acpi.cpu.px_dom<dom_id> sysctl node.
 * Domains are chained together on a cpu_pwrdom_list tail queue.
 */
struct cpu_pwrdom {
        TAILQ_ENTRY(cpu_pwrdom) dom_link;       /* tail queue linkage */
        int                     dom_id;         /* N in hw.acpi.cpu.px_domN */
        int                     dom_ncpus;      /* # of cpus in this domain */
};
TAILQ_HEAD(cpu_pwrdom_list, cpu_pwrdom);
65
66 static void usage(void);
67 static double getcputime(double);
68 static void acpi_setcpufreq(int nstate);
69 static void setupdominfo(void);
70 static int has_battery(void);
71 static int mon_battery(void);
72 static void getncpus(void);
73
74 static struct cpu_pwrdom_list CpuPwrDomain =
75     TAILQ_HEAD_INITIALIZER(CpuPwrDomain);
76 static struct cpu_pwrdom *CpuPwrDomLimit;
77 static struct cpu_pwrdom CpuPwrDomLast;
78 static int NCpuPwrDomUsed;
79
80 static int TotalCpus;
81 int DebugOpt;
82 int TurboOpt = 1;
83 int CpuLimit;           /* # of cpus at max frequency */
84 int PowerFd;
85 int NCpus;
86 int CpuCount[MAXDOM];   /* # of cpus in any given domain */
87 int Hysteresis = 10;    /* percentage */
88 double TriggerUp = 0.25;/* single-cpu load to force max freq */
89 double TriggerDown; /* load per cpu to force the min freq */
90 static int BatLifeMin = 2; /* shutdown the box, if low on battery life */
91 static struct timespec BatLifePrevT;
92 static int BatLifePollIntvl = 5; /* unit: sec */
93
94 static struct timespec BatShutdownStartT;
95 static int BatShutdownLinger = -1;
96 static int BatShutdownLingerSet = 60; /* unit: sec */
97 static int BatShutdownLingerCnt;
98 static int BatShutdownAudioAlert = 1;
99
100 static void sigintr(int signo);
101
102 int
103 main(int ac, char **av)
104 {
105         double qavg;
106         double uavg;    /* uavg - used for speeding up */
107         double davg;    /* davg - used for slowing down */
108         double srt;
109         double pollrate;
110         int ch;
111         int ustate;
112         int dstate;
113         int nstate;
114         char buf[64];
115         int monbat;
116
117         srt = 8.0;      /* time for samples - 8 seconds */
118         pollrate = 1.0; /* polling rate in seconds */
119
120         while ((ch = getopt(ac, av, "dp:r:tu:B:L:P:QT:")) != -1) {
121                 switch(ch) {
122                 case 'd':
123                         DebugOpt = 1;
124                         break;
125                 case 'p':
126                         Hysteresis = (int)strtol(optarg, NULL, 10);
127                         break;
128                 case 'r':
129                         pollrate = strtod(optarg, NULL);
130                         break;
131                 case 't':
132                         TurboOpt = 0;
133                         break;
134                 case 'u':
135                         TriggerUp = (double)strtol(optarg, NULL, 10) / 100;
136                         break;
137                 case 'B':
138                         BatLifeMin = strtol(optarg, NULL, 10);
139                         break;
140                 case 'L':
141                         BatShutdownLingerSet = strtol(optarg, NULL, 10);
142                         if (BatShutdownLingerSet < 0)
143                                 BatShutdownLingerSet = 0;
144                         break;
145                 case 'P':
146                         BatLifePollIntvl = strtol(optarg, NULL, 10);
147                         break;
148                 case 'Q':
149                         BatShutdownAudioAlert = 0;
150                         break;
151                 case 'T':
152                         srt = strtod(optarg, NULL);
153                         break;
154                 default:
155                         usage();
156                         /* NOT REACHED */
157                 }
158         }
159         ac -= optind;
160         av += optind;
161
162         /* Get the number of cpus */
163         getncpus();
164
165         if (0 > Hysteresis || Hysteresis > 99) {
166                 fprintf(stderr, "Invalid hysteresis value\n");
167                 exit(1);
168         }
169
170         if (0 > TriggerUp || TriggerUp > 1) {
171                 fprintf(stderr, "Invalid load limit value\n");
172                 exit(1);
173         }
174
175         TriggerDown = TriggerUp - (TriggerUp * (double) Hysteresis / 100);
176
177         /*
178          * Make sure powerd is not already running.
179          */
180         PowerFd = open("/var/run/powerd.pid", O_CREAT|O_RDWR, 0644);
181         if (PowerFd < 0) {
182                 fprintf(stderr,
183                         "Cannot create /var/run/powerd.pid, "
184                         "continuing anyway\n");
185         } else {
186                 if (flock(PowerFd, LOCK_EX|LOCK_NB) < 0) {
187                         fprintf(stderr, "powerd is already running\n");
188                         exit(1);
189                 }
190         }
191
192         /*
193          * Demonize and set pid
194          */
195         if (DebugOpt == 0) {
196                 daemon(0, 0);
197                 openlog("powerd", LOG_CONS | LOG_PID, LOG_DAEMON);
198         }
199
200         if (PowerFd >= 0) {
201                 ftruncate(PowerFd, 0);
202                 snprintf(buf, sizeof(buf), "%d\n", (int)getpid());
203                 write(PowerFd, buf, strlen(buf));
204         }
205
206         /* Do we need to monitor battery life? */
207         if (BatLifePollIntvl <= 0)
208                 monbat = 0;
209         else
210                 monbat = has_battery();
211
212         /*
213          * Wait hw.acpi.cpu.px_dom* sysctl to be created by kernel
214          *
215          * Since hw.acpi.cpu.px_dom* creation is queued into ACPI
216          * taskqueue and ACPI taskqueue is shared across various
217          * ACPI modules, any delay in other modules may cause
218          * hw.acpi.cpu.px_dom* to be created at quite a later time
219          * (e.g. cmbat module's task could take quite a lot of time).
220          */
221         for (;;) {
222                 /*
223                  * Prime delta cputime calculation, make sure at least
224                  * dom0 exists.
225                  */
226                 getcputime(pollrate);
227
228                 setupdominfo();
229                 if (TAILQ_EMPTY(&CpuPwrDomain)) {
230                         usleep((int)(pollrate * 1000000.0));
231                         continue;
232                 }
233
234                 CpuPwrDomLimit = &CpuPwrDomLast;
235                 CpuLimit = NCpus;
236                 break;
237         }
238
239         /*
240          * Set to maximum performance if killed.
241          */
242         signal(SIGINT, sigintr);
243         signal(SIGTERM, sigintr);
244         uavg = 0.0;
245         davg = 0.0;
246
247         srt = srt / pollrate;   /* convert to sample count */
248
249         if (DebugOpt)
250                 printf("samples for downgrading: %5.2f\n", srt);
251
252         /*
253          * Monitoring loop
254          *
255          * Calculate nstate, the number of cpus we wish to run at max
256          * frequency.  All remaining cpus will be set to their lowest
257          * frequency and mapped out of the user process scheduler.
258          */
259         for (;;) {
260                 qavg = getcputime(pollrate);
261                 uavg = (uavg * 2.0 + qavg) / 3.0;       /* speeding up */
262                 davg = (davg * srt + qavg) / (srt + 1); /* slowing down */
263                 if (davg < uavg)
264                         davg = uavg;
265
266                 ustate = uavg / TriggerUp;
267                 if (ustate < CpuLimit)
268                         ustate = uavg / TriggerDown;
269                 dstate = davg / TriggerUp;
270                 if (dstate < CpuLimit)
271                         dstate = davg / TriggerDown;
272
273                 nstate = (ustate > dstate) ? ustate : dstate;
274                 if (nstate > NCpus)
275                         nstate = NCpus;
276
277                 if (DebugOpt) {
278                         printf("\rqavg=%5.2f uavg=%5.2f davg=%5.2f "
279                                "%2d/%2d ncpus=%d\r",
280                                 qavg, uavg, davg,
281                                 CpuLimit, NCpuPwrDomUsed, nstate);
282                         fflush(stdout);
283                 }
284                 if (nstate != CpuLimit)
285                         acpi_setcpufreq(nstate);
286                 if (monbat)
287                         monbat = mon_battery();
288                 usleep((int)(pollrate * 1000000.0));
289         }
290 }
291
292 static
293 void
294 sigintr(int signo __unused)
295 {
296         syslog(LOG_INFO, "killed, setting max and exiting");
297         acpi_setcpufreq(NCpus);
298         exit(1);
299 }
300
/*
 * Discover the cpu power domains and build the CpuPwrDomain list.
 *
 * Every hw.acpi.cpu.px_dom<N> node (0 <= N < MAXDOM) that has an
 * "available" sysctl is probed; its "members" string is parsed to count
 * the cpus of the domain.  Domains without any cpu are dropped.  NCpus
 * and NCpuPwrDomUsed are advanced for each cpu/domain that is kept.
 * When at least one domain was found, the CpuPwrDomLast sentinel is
 * appended to the list.
 *
 * Exits the program if memory for a domain cannot be allocated.
 */
static void
setupdominfo(void)
{
        struct cpu_pwrdom *dom;
        struct cpu_pwrdom_list tmp_list;
        char buf[64];
        char members[1024];
        char *str;
        size_t msize;
        int n, i;

        /* Pass 1: collect the ids of all domains that exist */
        TAILQ_INIT(&tmp_list);
        for (i = 0; i < MAXDOM; ++i) {
                snprintf(buf, sizeof(buf),
                         "hw.acpi.cpu.px_dom%d.available", i);
                if (sysctlbyname(buf, NULL, NULL, NULL, 0) < 0)
                        continue;

                dom = calloc(1, sizeof(*dom));
                if (dom == NULL)
                        err(1, "calloc failed");
                dom->dom_id = i;
                TAILQ_INSERT_TAIL(&tmp_list, dom, dom_link);
        }

        /* Pass 2: count the member cpus and move domains to CpuPwrDomain */
        while ((dom = TAILQ_FIRST(&tmp_list)) != NULL) {
                int bsp_domain = 0;

                TAILQ_REMOVE(&tmp_list, dom, dom_link);

                snprintf(buf, sizeof(buf),
                         "hw.acpi.cpu.px_dom%d.members", dom->dom_id);
                /* Leave room for the terminator written below */
                msize = sizeof(members) - 1;
                if (sysctlbyname(buf, members, &msize, NULL, 0) < 0) {
                        free(dom);
                        continue;
                }

                members[msize] = 0;
                for (str = strtok(members, " "); str; str = strtok(NULL, " ")) {
                        n = -1;
                        sscanf(str, "cpu%d", &n);
                        if (n >= 0) {
                                ++NCpus;
                                ++dom->dom_ncpus;
                                if (n == 0)
                                        bsp_domain = 1;
                        }
                }
                if (dom->dom_ncpus == 0) {
                        free(dom);
                        continue;
                }

                if (bsp_domain) {
                        /*
                         * Use the power domain containing the BSP as the first
                         * power domain.  So if all CPUs are idle, we could
                         * leave BSP to the usched without too much trouble.
                         */
                        TAILQ_INSERT_HEAD(&CpuPwrDomain, dom, dom_link);
                } else {
                        TAILQ_INSERT_TAIL(&CpuPwrDomain, dom, dom_link);
                }
                ++NCpuPwrDomUsed;
        }
        if (!TAILQ_EMPTY(&CpuPwrDomain)) {
                /* Install sentinel */
                CpuPwrDomLast.dom_id = -1;
                TAILQ_INSERT_TAIL(&CpuPwrDomain, &CpuPwrDomLast, dom_link);
        }
}
374
/*
 * Return the cpu load accumulated since the previous call, summed over
 * all cpus reported by the kern.cputime sysctl and scaled by pollrate.
 * One cpu at 100% will return a value of 1.0.  On a SMP system N cpus
 * running at 100% will return a value of N.
 *
 * The first call only primes the saved sample.  Exits the program if
 * kern.cputime cannot be read.
 */
static double
getcputime(double pollrate)
{
        static struct kinfo_cputime prev_time[MAXCPU];
        static struct kinfo_cputime cur_time[MAXCPU];
        size_t slen = sizeof(cur_time);
        uint64_t busy_now, busy_before;
        uint64_t delta = 0;
        int ncpu;
        int cpu;

        /* The sample of the previous call becomes the reference */
        memcpy(prev_time, cur_time, sizeof(struct kinfo_cputime) * TotalCpus);

        if (sysctlbyname("kern.cputime", &cur_time, &slen, NULL, 0) < 0) {
                fprintf(stderr, "kern.cputime sysctl not available\n");
                exit(1);
        }
        ncpu = slen / sizeof(cur_time[0]);

        /* Everything but idle time counts as load */
        for (cpu = 0; cpu < ncpu; ++cpu) {
                busy_now = cur_time[cpu].cp_user + cur_time[cpu].cp_sys +
                           cur_time[cpu].cp_nice + cur_time[cpu].cp_intr;
                busy_before = prev_time[cpu].cp_user + prev_time[cpu].cp_sys +
                              prev_time[cpu].cp_nice + prev_time[cpu].cp_intr;
                delta += busy_now - busy_before;
        }

        /* pollrate is in seconds; scale it to the unit of the counters */
        return((double)delta / (pollrate * 1000000.0));
}
407
/*
 * nstate is the requested number of cpus that we wish to run at full
 * frequency.  We calculate how many domains we have to adjust to reach
 * this goal.
 *
 * Domains between the previous boundary (CpuPwrDomLimit) and the new one
 * are switched to their highest frequency when nstate grew, and to their
 * lowest frequency otherwise.  CpuPwrDomLimit, CpuLimit and
 * NCpuPwrDomUsed are updated to describe the new state.
 *
 * This function also sets the user scheduler global cpu mask.
 */
static void
acpi_setcpufreq(int nstate)
{
        int ncpus = 0;
        int increasing = (nstate > CpuLimit);
        struct cpu_pwrdom *dom, *domBeg, *domEnd;
        int lowest;
        int highest;
        int desired;
        int v;
        char sysid[64];         /* large enough for any px_dom sysctl name */
        char *ptr;
        char buf[256];
        size_t buflen;
        cpumask_t global_cpumask;

        /*
         * Calculate the ending domain if the number of operating cpus
         * has increased.
         *
         * Calculate the starting domain if the number of operating cpus
         * has decreased.
         */
        NCpuPwrDomUsed = 0;
        for (dom = TAILQ_FIRST(&CpuPwrDomain); dom != &CpuPwrDomLast;
             dom = TAILQ_NEXT(dom, dom_link)) {
                if (ncpus >= nstate)
                        break;
                ncpus += dom->dom_ncpus;
                ++NCpuPwrDomUsed;
        }

        syslog(LOG_INFO, "using %d cpus", nstate);

        /*
         * Set the mask of cpus the userland scheduler is allowed to use.
         * At least one cpu always stays in the mask.
         */
        CPUMASK_ASSBMASK(global_cpumask, nstate == 0 ? 1 : nstate);
        sysctlbyname("kern.usched_global_cpumask", NULL, NULL,
                     &global_cpumask, sizeof(global_cpumask));

        if (increasing) {
                domBeg = CpuPwrDomLimit;
                domEnd = dom;
        } else {
                domBeg = dom;
                domEnd = CpuPwrDomLimit;
        }
        CpuPwrDomLimit = dom;
        CpuLimit = nstate;

        /*
         * Adjust the cpu frequency
         */
        if (DebugOpt)
                printf("\n");
        for (dom = domBeg; dom != domEnd; dom = TAILQ_NEXT(dom, dom_link)) {
                /*
                 * Retrieve availability list
                 */
                snprintf(sysid, sizeof(sysid),
                         "hw.acpi.cpu.px_dom%d.available", dom->dom_id);
                buflen = sizeof(buf) - 1;
                if (sysctlbyname(sysid, buf, &buflen, NULL, 0) < 0)
                        continue;
                buf[buflen] = 0;

                /*
                 * Parse out the highest and lowest cpu frequencies
                 */
                ptr = buf;
                highest = lowest = 0;
                while (ptr && (v = strtol(ptr, &ptr, 10)) > 0) {
                        if (lowest == 0 || lowest > v)
                                lowest = v;
                        if (highest == 0 || highest < v)
                                highest = v;
                        /*
                         * Detect turbo mode: an entry exactly 1 below the
                         * highest one replaces it when turbo is disabled.
                         */
                        if ((highest - v == 1) && ! TurboOpt)
                                highest = v;
                }

                /*
                 * Calculate the desired cpu frequency and set it.
                 * Nothing could be parsed if it is still 0; do not
                 * write a bogus frequency in that case.
                 */
                desired = increasing ? highest : lowest;
                if (desired == 0)
                        continue;

                snprintf(sysid, sizeof(sysid),
                         "hw.acpi.cpu.px_dom%d.select", dom->dom_id);

                /*
                 * The current selection is read but not compared; the
                 * desired frequency is written unconditionally.
                 */
                buflen = sizeof(v);
                v = 0;
                sysctlbyname(sysid, &v, &buflen, NULL, 0);

                if (DebugOpt) {
                        printf("dom%d set frequency %d\n",
                               dom->dom_id, desired);
                }
                sysctlbyname(sysid, NULL, NULL, &desired, sizeof(desired));
        }
}
522
/*
 * Print the command line synopsis to stderr and exit with status 1.
 */
static void
usage(void)
{
        static const char synopsis[] =
            "usage: powerd [-dt] [-p hysteresis] "
            "[-u trigger_up] [-T sample_interval] [-r poll_interval] "
            "[-B min_battery_life] [-L low_battery_linger] "
            "[-P battery_poll_interval] [-Q]\n";

        fputs(synopsis, stderr);
        exit(1);
}
533
#ifndef timespecsub
/*
 * In-place timespec subtraction: *vvp -= *uvp.  A negative nanosecond
 * result borrows one second.  Both arguments are evaluated several times.
 */
#define timespecsub(vvp, uvp)                                           \
        do {                                                            \
                (vvp)->tv_sec -= (uvp)->tv_sec;                         \
                if (((vvp)->tv_nsec -= (uvp)->tv_nsec) < 0) {           \
                        (vvp)->tv_nsec += 1000000000;                   \
                        (vvp)->tv_sec--;                                \
                }                                                       \
        } while (0)
#endif

/* Longest acceptable duration of a battery related sysctl read */
#define BAT_SYSCTL_TIME_MAX     50000000 /* unit: nanosecond */
547
/*
 * Read the integer sysctl `name' once and time the read.  *startp
 * receives the timestamp taken right before the read.
 *
 * Returns -1 if the sysctl cannot be read, 0 if the read took longer
 * than BAT_SYSCTL_TIME_MAX and 1 otherwise.
 */
static int
bat_sysctl_probe(const char *name, struct timespec *startp)
{
        struct timespec elapsed;
        size_t len;
        int val;

        clock_gettime(CLOCK_MONOTONIC_FAST, startp);

        len = sizeof(val);
        if (sysctlbyname(name, &val, &len, NULL, 0) < 0)
                return -1;
        clock_gettime(CLOCK_MONOTONIC_FAST, &elapsed);

        timespecsub(&elapsed, startp);
        if (elapsed.tv_sec > 0 || elapsed.tv_nsec > BAT_SYSCTL_TIME_MAX)
                return 0;
        return 1;
}

/*
 * Decide whether battery monitoring is possible: both hw.acpi.acline and
 * hw.acpi.battery.life must exist and be quick to read.  Also records the
 * starting time in BatLifePrevT.
 *
 * Returns 1 if the battery can be monitored, 0 otherwise.
 */
static int
has_battery(void)
{
        struct timespec start;
        int rc;

        rc = bat_sysctl_probe("hw.acpi.acline", &start);
        BatLifePrevT = start;
        if (rc < 0) {
                /* No AC line information */
                return 0;
        }
        if (rc == 0) {
                /* hw.acpi.acline takes too long to be useful */
                syslog(LOG_NOTICE, "hw.acpi.acline takes too long");
                return 0;
        }

        rc = bat_sysctl_probe("hw.acpi.battery.life", &start);
        if (rc < 0) {
                /* No battery life */
                return 0;
        }
        if (rc == 0) {
                /* hw.acpi.battery.life takes too long to be useful */
                syslog(LOG_NOTICE, "hw.acpi.battery.life takes too long");
                return 0;
        }
        return 1;
}
588
589 static void
590 low_battery_alert(int life)
591 {
592         int fmt, stereo, freq;
593         int fd;
594
595         syslog(LOG_ALERT, "low battery life %d%%, please plugin AC line, #%d",
596             life, BatShutdownLingerCnt);
597         ++BatShutdownLingerCnt;
598
599         if (!BatShutdownAudioAlert)
600                 return;
601
602         fd = open("/dev/dsp", O_WRONLY);
603         if (fd < 0)
604                 return;
605
606         fmt = AFMT_S16_LE;
607         if (ioctl(fd, SNDCTL_DSP_SETFMT, &fmt, sizeof(fmt)) < 0)
608                 goto done;
609
610         stereo = 0;
611         if (ioctl(fd, SNDCTL_DSP_STEREO, &stereo, sizeof(stereo)) < 0)
612                 goto done;
613
614         freq = 44100;
615         if (ioctl(fd, SNDCTL_DSP_SPEED, &freq, sizeof(freq)) < 0)
616                 goto done;
617
618         write(fd, alert1, sizeof(alert1));
619         write(fd, alert1, sizeof(alert1));
620
621 done:
622         close(fd);
623 }
624
/*
 * Poll the battery, at most once every BatLifePollIntvl seconds.
 *
 * While on AC power the shutdown linger state is reset.  On battery with
 * life at or below BatLifeMin, a linger period of BatShutdownLingerSet
 * seconds is started and an alert is raised on every poll; once the
 * linger period has expired (or if it is configured as 0) "poweroff" is
 * executed.
 *
 * Returns 1 if monitoring should continue, 0 once poweroff was attempted.
 */
static int
mon_battery(void)
{
        struct timespec cur, ts;
        int acline, life;
        size_t len;

        clock_gettime(CLOCK_MONOTONIC_FAST, &cur);
        ts = cur;
        timespecsub(&ts, &BatLifePrevT);
        if (ts.tv_sec < BatLifePollIntvl)
                return 1;
        BatLifePrevT = cur;

        len = sizeof(acline);
        if (sysctlbyname("hw.acpi.acline", &acline, &len, NULL, 0) < 0)
                return 1;
        if (acline) {
                /* On AC power: disarm any pending shutdown */
                BatShutdownLinger = -1;
                BatShutdownLingerCnt = 0;
                return 1;
        }

        len = sizeof(life);
        if (sysctlbyname("hw.acpi.battery.life", &life, &len, NULL, 0) < 0)
                return 1;

        if (BatShutdownLinger > 0) {
                /* Linger period running: mark it expired when time is up */
                ts = cur;
                timespecsub(&ts, &BatShutdownStartT);
                if (ts.tv_sec > BatShutdownLinger)
                        BatShutdownLinger = 0;
        }

        if (life <= BatLifeMin) {
                if (BatShutdownLinger == 0 || BatShutdownLingerSet == 0) {
                        syslog(LOG_ALERT, "low battery life %d%%, "
                            "shutting down", life);
                        if (vfork() == 0) {
                                execlp("poweroff", "poweroff", (char *)NULL);
                                /*
                                 * exec failed.  A vfork child must not
                                 * return from this function; leave via
                                 * _exit() only.
                                 */
                                _exit(1);
                        }
                        return 0;
                } else if (BatShutdownLinger < 0) {
                        /* First low reading: start the linger period */
                        BatShutdownLinger = BatShutdownLingerSet;
                        BatShutdownStartT = cur;
                }
                low_battery_alert(life);
        }
        return 1;
}
674
/*
 * Load TotalCpus from the hw.ncpu sysctl and print it when debugging.
 * Exits the program if the sysctl cannot be read.
 */
static void
getncpus(void)
{
        size_t slen = sizeof(TotalCpus);

        if (sysctlbyname("hw.ncpu", &TotalCpus, &slen, NULL, 0) >= 0) {
                if (DebugOpt)
                        printf("hw.ncpu %d\n", TotalCpus);
                return;
        }
        err(1, "sysctlbyname hw.ncpu failed");
}