vmm: Use struct vcpu in the rendezvous code.
[freebsd.git] / sys / amd64 / vmm / io / vlapic.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  * Copyright (c) 2019 Joyent, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34
35 #include "opt_bhyve_snapshot.h"
36
37 #include <sys/param.h>
38 #include <sys/lock.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mutex.h>
42 #include <sys/systm.h>
43 #include <sys/smp.h>
44
45 #include <x86/specialreg.h>
46 #include <x86/apicreg.h>
47
48 #include <machine/clock.h>
49 #include <machine/smp.h>
50
51 #include <machine/vmm.h>
52 #include <machine/vmm_snapshot.h>
53
54 #include "vmm_lapic.h"
55 #include "vmm_ktr.h"
56 #include "vmm_stat.h"
57
58 #include "vlapic.h"
59 #include "vlapic_priv.h"
60 #include "vioapic.h"
61
62 #define PRIO(x)                 ((x) >> 4)
63
64 #define VLAPIC_VERSION          (0x14)
65
66 #define x2apic(vlapic)  (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
67
68 /*
69  * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the
70  * vlapic_callout_handler() and vcpu accesses to:
71  * - timer_freq_bt, timer_period_bt, timer_fire_bt
72  * - timer LVT register
73  */
74 #define VLAPIC_TIMER_LOCK(vlapic)       mtx_lock_spin(&((vlapic)->timer_mtx))
75 #define VLAPIC_TIMER_UNLOCK(vlapic)     mtx_unlock_spin(&((vlapic)->timer_mtx))
76 #define VLAPIC_TIMER_LOCKED(vlapic)     mtx_owned(&((vlapic)->timer_mtx))
77
78 /*
79  * APIC timer frequency:
80  * - arbitrary but chosen to be in the ballpark of contemporary hardware.
81  * - power-of-two to avoid loss of precision when converted to a bintime.
82  */
83 #define VLAPIC_BUS_FREQ         (128 * 1024 * 1024)
84
85 static void vlapic_set_error(struct vlapic *, uint32_t, bool);
86 static void vlapic_callout_handler(void *arg);
87 static void vlapic_reset(struct vlapic *vlapic);
88
89 static __inline uint32_t
90 vlapic_get_id(struct vlapic *vlapic)
91 {
92
93         if (x2apic(vlapic))
94                 return (vlapic->vcpuid);
95         else
96                 return (vlapic->vcpuid << 24);
97 }
98
99 static uint32_t
100 x2apic_ldr(struct vlapic *vlapic)
101 {
102         int apicid;
103         uint32_t ldr;
104
105         apicid = vlapic_get_id(vlapic);
106         ldr = 1 << (apicid & 0xf);
107         ldr |= (apicid & 0xffff0) << 12;
108         return (ldr);
109 }
110
111 void
112 vlapic_dfr_write_handler(struct vlapic *vlapic)
113 {
114         struct LAPIC *lapic;
115
116         lapic = vlapic->apic_page;
117         if (x2apic(vlapic)) {
118                 VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x",
119                     lapic->dfr);
120                 lapic->dfr = 0;
121                 return;
122         }
123
124         lapic->dfr &= APIC_DFR_MODEL_MASK;
125         lapic->dfr |= APIC_DFR_RESERVED;
126
127         if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT)
128                 VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model");
129         else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER)
130                 VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model");
131         else
132                 VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr);
133 }
134
135 void
136 vlapic_ldr_write_handler(struct vlapic *vlapic)
137 {
138         struct LAPIC *lapic;
139
140         lapic = vlapic->apic_page;
141
142         /* LDR is read-only in x2apic mode */
143         if (x2apic(vlapic)) {
144                 VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x",
145                     lapic->ldr);
146                 lapic->ldr = x2apic_ldr(vlapic);
147         } else {
148                 lapic->ldr &= ~APIC_LDR_RESERVED;
149                 VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr);
150         }
151 }
152
153 void
154 vlapic_id_write_handler(struct vlapic *vlapic)
155 {
156         struct LAPIC *lapic;
157
158         /*
159          * We don't allow the ID register to be modified so reset it back to
160          * its default value.
161          */
162         lapic = vlapic->apic_page;
163         lapic->id = vlapic_get_id(vlapic);
164 }
165
166 static int
167 vlapic_timer_divisor(uint32_t dcr)
168 {
169         switch (dcr & 0xB) {
170         case APIC_TDCR_1:
171                 return (1);
172         case APIC_TDCR_2:
173                 return (2);
174         case APIC_TDCR_4:
175                 return (4);
176         case APIC_TDCR_8:
177                 return (8);
178         case APIC_TDCR_16:
179                 return (16);
180         case APIC_TDCR_32:
181                 return (32);
182         case APIC_TDCR_64:
183                 return (64);
184         case APIC_TDCR_128:
185                 return (128);
186         default:
187                 panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
188         }
189 }
190
#if 0
/*
 * Debug helper (currently compiled out): print a single LVT register
 * image with its vector, delivery status and mask fields decoded.
 */
static inline void
vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
{
	printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
	    *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
	    *lvt & APIC_LVTT_M);
}
#endif
200
/*
 * Return the current count (CCR) of the virtual LAPIC timer.
 *
 * The value is derived on demand from the time remaining until the
 * scheduled callout fires rather than from a continuously running
 * counter.  Returns 0 when the timer is not armed or has expired.
 */
static uint32_t
vlapic_get_ccr(struct vlapic *vlapic)
{
	struct bintime bt_now, bt_rem;
	struct LAPIC *lapic __diagused;
	uint32_t ccr;

	ccr = 0;
	lapic = vlapic->apic_page;

	VLAPIC_TIMER_LOCK(vlapic);
	if (callout_active(&vlapic->callout)) {
		/*
		 * If the timer is scheduled to expire in the future then
		 * compute the value of 'ccr' based on the remaining time.
		 */
		binuptime(&bt_now);
		if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) {
			bt_rem = vlapic->timer_fire_bt;
			bintime_sub(&bt_rem, &bt_now);
			/* Convert the remaining bintime into timer ticks. */
			ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt);
			ccr += bt_rem.frac / vlapic->timer_freq_bt.frac;
		}
	}
	/* The current count can never exceed the initial count. */
	KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, "
	    "icr_timer is %#x", ccr, lapic->icr_timer));
	VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x",
	    ccr, lapic->icr_timer);
	VLAPIC_TIMER_UNLOCK(vlapic);
	return (ccr);
}
232
/*
 * Handle a guest write to the divide configuration register (DCR):
 * recompute the cached timer frequency and the derived timer period.
 */
void
vlapic_dcr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	int divisor;

	lapic = vlapic->apic_page;
	VLAPIC_TIMER_LOCK(vlapic);

	divisor = vlapic_timer_divisor(lapic->dcr_timer);
	VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d",
	    lapic->dcr_timer, divisor);

	/*
	 * Update the timer frequency and the timer period.
	 *
	 * XXX changes to the frequency divider will not take effect until
	 * the timer is reloaded.
	 */
	FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt);
	vlapic->timer_period_bt = vlapic->timer_freq_bt;
	bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer);

	VLAPIC_TIMER_UNLOCK(vlapic);
}
258
259 void
260 vlapic_esr_write_handler(struct vlapic *vlapic)
261 {
262         struct LAPIC *lapic;
263
264         lapic = vlapic->apic_page;
265         lapic->esr = vlapic->esr_pending;
266         vlapic->esr_pending = 0;
267 }
268
269 int
270 vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
271 {
272         struct LAPIC *lapic;
273         uint32_t *irrptr, *tmrptr, mask;
274         int idx;
275
276         KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector));
277
278         lapic = vlapic->apic_page;
279         if (!(lapic->svr & APIC_SVR_ENABLE)) {
280                 VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring "
281                     "interrupt %d", vector);
282                 return (0);
283         }
284
285         if (vector < 16) {
286                 vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR,
287                     false);
288                 VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d",
289                     vector);
290                 return (1);
291         }
292
293         if (vlapic->ops.set_intr_ready)
294                 return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level));
295
296         idx = (vector / 32) * 4;
297         mask = 1 << (vector % 32);
298
299         irrptr = &lapic->irr0;
300         atomic_set_int(&irrptr[idx], mask);
301
302         /*
303          * Verify that the trigger-mode of the interrupt matches with
304          * the vlapic TMR registers.
305          */
306         tmrptr = &lapic->tmr0;
307         if ((tmrptr[idx] & mask) != (level ? mask : 0)) {
308                 VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but "
309                     "interrupt is %s-triggered", idx / 4, tmrptr[idx],
310                     level ? "level" : "edge");
311         }
312
313         VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
314         return (1);
315 }
316
317 static __inline uint32_t *
318 vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
319 {
320         struct LAPIC    *lapic = vlapic->apic_page;
321         int              i;
322
323         switch (offset) {
324         case APIC_OFFSET_CMCI_LVT:
325                 return (&lapic->lvt_cmci);
326         case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
327                 i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
328                 return ((&lapic->lvt_timer) + i);
329         default:
330                 panic("vlapic_get_lvt: invalid LVT\n");
331         }
332 }
333
334 static __inline int
335 lvt_off_to_idx(uint32_t offset)
336 {
337         int index;
338
339         switch (offset) {
340         case APIC_OFFSET_CMCI_LVT:
341                 index = APIC_LVT_CMCI;
342                 break;
343         case APIC_OFFSET_TIMER_LVT:
344                 index = APIC_LVT_TIMER;
345                 break;
346         case APIC_OFFSET_THERM_LVT:
347                 index = APIC_LVT_THERMAL;
348                 break;
349         case APIC_OFFSET_PERF_LVT:
350                 index = APIC_LVT_PMC;
351                 break;
352         case APIC_OFFSET_LINT0_LVT:
353                 index = APIC_LVT_LINT0;
354                 break;
355         case APIC_OFFSET_LINT1_LVT:
356                 index = APIC_LVT_LINT1;
357                 break;
358         case APIC_OFFSET_ERROR_LVT:
359                 index = APIC_LVT_ERROR;
360                 break;
361         default:
362                 index = -1;
363                 break;
364         }
365         KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: "
366             "invalid lvt index %d for offset %#x", index, offset));
367
368         return (index);
369 }
370
371 static __inline uint32_t
372 vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
373 {
374         int idx;
375         uint32_t val;
376
377         idx = lvt_off_to_idx(offset);
378         val = atomic_load_acq_32(&vlapic->lvt_last[idx]);
379         return (val);
380 }
381
/*
 * Sanitize a guest write to an LVT register: clear non-writable bits,
 * force the mask bit when the APIC is software-disabled, and publish the
 * result to the lvt_last[] cache for readers using acquire loads.
 */
void
vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset)
{
	uint32_t *lvtptr, mask, val;
	struct LAPIC *lapic;
	int idx;

	lapic = vlapic->apic_page;
	lvtptr = vlapic_get_lvtptr(vlapic, offset);
	val = *lvtptr;
	idx = lvt_off_to_idx(offset);

	/* A software-disabled APIC forces the mask bit on. */
	if (!(lapic->svr & APIC_SVR_ENABLE))
		val |= APIC_LVT_M;
	/* Bits writable in every LVT register. */
	mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR;
	switch (offset) {
	case APIC_OFFSET_TIMER_LVT:
		mask |= APIC_LVTT_TM;
		break;
	case APIC_OFFSET_ERROR_LVT:
		break;
	case APIC_OFFSET_LINT0_LVT:
	case APIC_OFFSET_LINT1_LVT:
		mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP;
		/* FALLTHROUGH */
	default:
		mask |= APIC_LVT_DM;
		break;
	}
	val &= mask;
	*lvtptr = val;
	atomic_store_rel_32(&vlapic->lvt_last[idx], val);
}
415
416 static void
417 vlapic_mask_lvts(struct vlapic *vlapic)
418 {
419         struct LAPIC *lapic = vlapic->apic_page;
420
421         lapic->lvt_cmci |= APIC_LVT_M;
422         vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT);
423
424         lapic->lvt_timer |= APIC_LVT_M;
425         vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT);
426
427         lapic->lvt_thermal |= APIC_LVT_M;
428         vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT);
429
430         lapic->lvt_pcint |= APIC_LVT_M;
431         vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT);
432
433         lapic->lvt_lint0 |= APIC_LVT_M;
434         vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT);
435
436         lapic->lvt_lint1 |= APIC_LVT_M;
437         vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT);
438
439         lapic->lvt_error |= APIC_LVT_M;
440         vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT);
441 }
442
/*
 * Fire the interrupt described by LVT entry 'lvt'.
 *
 * Returns 1 if an interrupt (fixed, NMI or ExtINT) was delivered to the
 * vcpu and 0 if the LVT was masked or had an unsupported delivery mode.
 */
static int
vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt)
{
	uint32_t mode, reg, vec;

	/* Read the sanitized value published by vlapic_lvt_write_handler(). */
	reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]);

	if (reg & APIC_LVT_M)
		return (0);
	vec = reg & APIC_LVT_VECTOR;
	mode = reg & APIC_LVT_DM;

	switch (mode) {
	case APIC_LVT_DM_FIXED:
		if (vec < 16) {
			/*
			 * Tell vlapic_set_error() when the caller is the
			 * error LVT itself so it does not recurse.
			 */
			vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR,
			    lvt == APIC_LVT_ERROR);
			return (0);
		}
		if (vlapic_set_intr_ready(vlapic, vec, false))
			vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true);
		break;
	case APIC_LVT_DM_NMI:
		vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
		break;
	case APIC_LVT_DM_EXTINT:
		vm_inject_extint(vlapic->vm, vlapic->vcpuid);
		break;
	default:
		/* Other delivery modes are ignored. */
		return (0);
	}
	return (1);
}
477
#if 1
/*
 * Debug helper: dump the in-service (ISRx) registers and the isrvec
 * stack when an inconsistency between the two is detected.
 */
static void
dump_isrvec_stk(struct vlapic *vlapic)
{
	int i;
	uint32_t *isrptr;

	isrptr = &vlapic->apic_page->isr0;
	for (i = 0; i < 8; i++)
		printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);

	for (i = 0; i <= vlapic->isrvec_stk_top; i++)
		printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
}
#endif
493
494 /*
495  * Algorithm adopted from section "Interrupt, Task and Processor Priority"
496  * in Intel Architecture Manual Vol 3a.
497  */
498 static void
499 vlapic_update_ppr(struct vlapic *vlapic)
500 {
501         int isrvec, tpr, ppr;
502
503         /*
504          * Note that the value on the stack at index 0 is always 0.
505          *
506          * This is a placeholder for the value of ISRV when none of the
507          * bits is set in the ISRx registers.
508          */
509         isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
510         tpr = vlapic->apic_page->tpr;
511
512 #if 1
513         {
514                 int i, lastprio, curprio, vector, idx;
515                 uint32_t *isrptr;
516
517                 if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
518                         panic("isrvec_stk is corrupted: %d", isrvec);
519
520                 /*
521                  * Make sure that the priority of the nested interrupts is
522                  * always increasing.
523                  */
524                 lastprio = -1;
525                 for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
526                         curprio = PRIO(vlapic->isrvec_stk[i]);
527                         if (curprio <= lastprio) {
528                                 dump_isrvec_stk(vlapic);
529                                 panic("isrvec_stk does not satisfy invariant");
530                         }
531                         lastprio = curprio;
532                 }
533
534                 /*
535                  * Make sure that each bit set in the ISRx registers has a
536                  * corresponding entry on the isrvec stack.
537                  */
538                 i = 1;
539                 isrptr = &vlapic->apic_page->isr0;
540                 for (vector = 0; vector < 256; vector++) {
541                         idx = (vector / 32) * 4;
542                         if (isrptr[idx] & (1 << (vector % 32))) {
543                                 if (i > vlapic->isrvec_stk_top ||
544                                     vlapic->isrvec_stk[i] != vector) {
545                                         dump_isrvec_stk(vlapic);
546                                         panic("ISR and isrvec_stk out of sync");
547                                 }
548                                 i++;
549                         }
550                 }
551         }
552 #endif
553
554         if (PRIO(tpr) >= PRIO(isrvec))
555                 ppr = tpr;
556         else
557                 ppr = isrvec & 0xf0;
558
559         vlapic->apic_page->ppr = ppr;
560         VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
561 }
562
/*
 * Re-derive the PPR after the guest-visible TPR may have been updated
 * outside of the normal register write path.
 */
void
vlapic_sync_tpr(struct vlapic *vlapic)
{
	vlapic_update_ppr(vlapic);
}
568
569 static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt");
570
571 static void
572 vlapic_process_eoi(struct vlapic *vlapic)
573 {
574         struct LAPIC    *lapic = vlapic->apic_page;
575         uint32_t        *isrptr, *tmrptr;
576         int             i, idx, bitpos, vector;
577
578         isrptr = &lapic->isr0;
579         tmrptr = &lapic->tmr0;
580
581         for (i = 7; i >= 0; i--) {
582                 idx = i * 4;
583                 bitpos = fls(isrptr[idx]);
584                 if (bitpos-- != 0) {
585                         if (vlapic->isrvec_stk_top <= 0) {
586                                 panic("invalid vlapic isrvec_stk_top %d",
587                                       vlapic->isrvec_stk_top);
588                         }
589                         isrptr[idx] &= ~(1 << bitpos);
590                         vector = i * 32 + bitpos;
591                         VLAPIC_CTR1(vlapic, "EOI vector %d", vector);
592                         VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
593                         vlapic->isrvec_stk_top--;
594                         vlapic_update_ppr(vlapic);
595                         if ((tmrptr[idx] & (1 << bitpos)) != 0) {
596                                 vioapic_process_eoi(vlapic->vm, vlapic->vcpuid,
597                                     vector);
598                         }
599                         return;
600                 }
601         }
602         VLAPIC_CTR0(vlapic, "Gratuitous EOI");
603         vmm_stat_incr(vlapic->vcpu, VLAPIC_GRATUITOUS_EOI, 1);
604 }
605
static __inline int
vlapic_get_lvt_field(uint32_t lvt, uint32_t mask)
{

	/* Extract the field selected by 'mask' from an LVT register image. */
	return (lvt & mask);
}
612
613 static __inline int
614 vlapic_periodic_timer(struct vlapic *vlapic)
615 {
616         uint32_t lvt;
617
618         lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
619
620         return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
621 }
622
static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic");

/*
 * Accumulate 'mask' into the pending error summary and deliver an error
 * LVT interrupt.  'lvt_error' is true when the caller is the error LVT
 * delivery path itself, in which case no interrupt is generated.
 */
static void
vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error)
{

	vlapic->esr_pending |= mask;

	/*
	 * Avoid infinite recursion if the error LVT itself is configured with
	 * an illegal vector.
	 */
	if (lvt_error)
		return;

	if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) {
		vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_ERROR, 1);
	}
}
642
static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic");

/*
 * Deliver a timer LVT interrupt.  Called with the timer lock held to
 * serialize against vcpu accesses to the timer state.
 */
static void
vlapic_fire_timer(struct vlapic *vlapic)
{

	KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked"));

	if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) {
		VLAPIC_CTR0(vlapic, "vlapic timer fired");
		vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_TIMER, 1);
	}
}
656
657 static VMM_STAT(VLAPIC_INTR_CMC,
658     "corrected machine check interrupts generated by vlapic");
659
660 void
661 vlapic_fire_cmci(struct vlapic *vlapic)
662 {
663
664         if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) {
665                 vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_CMC, 1);
666         }
667 }
668
669 static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1,
670     "lvts triggered");
671
672 int
673 vlapic_trigger_lvt(struct vlapic *vlapic, int vector)
674 {
675
676         if (vlapic_enabled(vlapic) == false) {
677                 /*
678                  * When the local APIC is global/hardware disabled,
679                  * LINT[1:0] pins are configured as INTR and NMI pins,
680                  * respectively.
681                 */
682                 switch (vector) {
683                         case APIC_LVT_LINT0:
684                                 vm_inject_extint(vlapic->vm, vlapic->vcpuid);
685                                 break;
686                         case APIC_LVT_LINT1:
687                                 vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
688                                 break;
689                         default:
690                                 break;
691                 }
692                 return (0);
693         }
694
695         switch (vector) {
696         case APIC_LVT_LINT0:
697         case APIC_LVT_LINT1:
698         case APIC_LVT_TIMER:
699         case APIC_LVT_ERROR:
700         case APIC_LVT_PMC:
701         case APIC_LVT_THERMAL:
702         case APIC_LVT_CMCI:
703                 if (vlapic_fire_lvt(vlapic, vector)) {
704                         vmm_stat_array_incr(vlapic->vcpu, LVTS_TRIGGERRED,
705                             vector, 1);
706                 }
707                 break;
708         default:
709                 return (EINVAL);
710         }
711         return (0);
712 }
713
/*
 * (Re)arm the timer callout to fire 't' sbintime units from now on the
 * current cpu.
 */
static void
vlapic_callout_reset(struct vlapic *vlapic, sbintime_t t)
{
	callout_reset_sbt_curcpu(&vlapic->callout, t, 0,
	    vlapic_callout_handler, vlapic, 0);
}
720
/*
 * Callout handler for the vlapic timer: fire the timer LVT and, for a
 * periodic timer, schedule the next expiration while compensating for
 * callout latency.
 */
static void
vlapic_callout_handler(void *arg)
{
	struct vlapic *vlapic;
	struct bintime bt, btnow;
	sbintime_t rem_sbt;

	vlapic = arg;

	VLAPIC_TIMER_LOCK(vlapic);
	if (callout_pending(&vlapic->callout))	/* callout was reset */
		goto done;

	if (!callout_active(&vlapic->callout))	/* callout was stopped */
		goto done;

	callout_deactivate(&vlapic->callout);

	vlapic_fire_timer(vlapic);

	/* One-shot timers are now done; periodic timers must be re-armed. */
	if (vlapic_periodic_timer(vlapic)) {
		binuptime(&btnow);
		KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
		    ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx",
		    btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
		    vlapic->timer_fire_bt.frac));

		/*
		 * Compute the delta between when the timer was supposed to
		 * fire and the present time.
		 */
		bt = btnow;
		bintime_sub(&bt, &vlapic->timer_fire_bt);

		rem_sbt = bttosbt(vlapic->timer_period_bt);
		if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) {
			/*
			 * Adjust the time until the next countdown downward
			 * to account for the lost time.
			 */
			rem_sbt -= bttosbt(bt);
		} else {
			/*
			 * If the delta is greater than the timer period then
			 * just reset our time base instead of trying to catch
			 * up.
			 */
			vlapic->timer_fire_bt = btnow;
			VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu "
			    "usecs, period is %lu usecs - resetting time base",
			    bttosbt(bt) / SBT_1US,
			    bttosbt(vlapic->timer_period_bt) / SBT_1US);
		}

		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
		vlapic_callout_reset(vlapic, rem_sbt);
	}
done:
	VLAPIC_TIMER_UNLOCK(vlapic);
}
781
/*
 * Handle a write to the initial count register (ICR): recompute the
 * timer period and arm the callout, or stop the timer when the new
 * initial count is 0.
 */
void
vlapic_icrtmr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	sbintime_t sbt;
	uint32_t icr_timer;

	VLAPIC_TIMER_LOCK(vlapic);

	lapic = vlapic->apic_page;
	icr_timer = lapic->icr_timer;

	/* period = icr_timer ticks at the current timer frequency */
	vlapic->timer_period_bt = vlapic->timer_freq_bt;
	bintime_mul(&vlapic->timer_period_bt, icr_timer);

	if (icr_timer != 0) {
		binuptime(&vlapic->timer_fire_bt);
		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);

		sbt = bttosbt(vlapic->timer_period_bt);
		vlapic_callout_reset(vlapic, sbt);
	} else
		callout_stop(&vlapic->callout);

	VLAPIC_TIMER_UNLOCK(vlapic);
}
808
/*
 * This function populates 'dmask' with the set of vcpus that match the
 * addressing specified by the (dest, phys, lowprio) tuple.
 *
 * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit)
 * or xAPIC (8-bit) destination field.
 */
static void
vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
    bool lowprio, bool x2apic_dest)
{
	struct vlapic *vlapic;
	uint32_t dfr, ldr, ldest, cluster;
	uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id;
	cpuset_t amask;
	int vcpuid;

	if ((x2apic_dest && dest == 0xffffffff) ||
	    (!x2apic_dest && dest == 0xff)) {
		/*
		 * Broadcast in both logical and physical modes.
		 */
		*dmask = vm_active_cpus(vm);
		return;
	}

	if (phys) {
		/*
		 * Physical mode: destination is APIC ID.
		 */
		CPU_ZERO(dmask);
		vcpuid = vm_apicid2vcpuid(vm, dest);
		amask = vm_active_cpus(vm);
		/* Only target a vcpu id that is in range and active. */
		if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask))
			CPU_SET(vcpuid, dmask);
	} else {
		/*
		 * In the "Flat Model" the MDA is interpreted as an 8-bit wide
		 * bitmask. This model is only available in the xAPIC mode.
		 */
		mda_flat_ldest = dest & 0xff;

		/*
		 * In the "Cluster Model" the MDA is used to identify a
		 * specific cluster and a set of APICs in that cluster.
		 */
		if (x2apic_dest) {
			mda_cluster_id = dest >> 16;
			mda_cluster_ldest = dest & 0xffff;
		} else {
			mda_cluster_id = (dest >> 4) & 0xf;
			mda_cluster_ldest = dest & 0xf;
		}

		/*
		 * Logical mode: match each APIC that has a bit set
		 * in its LDR that matches a bit in the ldest.
		 */
		CPU_ZERO(dmask);
		amask = vm_active_cpus(vm);
		CPU_FOREACH_ISSET(vcpuid, &amask) {
			vlapic = vm_lapic(vm_vcpu(vm, vcpuid));
			dfr = vlapic->apic_page->dfr;
			ldr = vlapic->apic_page->ldr;

			if ((dfr & APIC_DFR_MODEL_MASK) ==
			    APIC_DFR_MODEL_FLAT) {
				ldest = ldr >> 24;
				mda_ldest = mda_flat_ldest;
			} else if ((dfr & APIC_DFR_MODEL_MASK) ==
			    APIC_DFR_MODEL_CLUSTER) {
				/* x2APIC and xAPIC lay out the LDR differently. */
				if (x2apic(vlapic)) {
					cluster = ldr >> 16;
					ldest = ldr & 0xffff;
				} else {
					cluster = ldr >> 28;
					ldest = (ldr >> 24) & 0xf;
				}
				if (cluster != mda_cluster_id)
					continue;
				mda_ldest = mda_cluster_ldest;
			} else {
				/*
				 * Guest has configured a bad logical
				 * model for this vcpu - skip it.
				 */
				VLAPIC_CTR1(vlapic, "vlapic has bad logical "
				    "model %x - cannot deliver interrupt", dfr);
				continue;
			}

			if ((mda_ldest & ldest) != 0) {
				CPU_SET(vcpuid, dmask);
				/* lowprio delivery selects a single vcpu. */
				if (lowprio)
					break;
			}
		}
	}
}
908
909 static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu");
910
911 static void
912 vlapic_set_tpr(struct vlapic *vlapic, uint8_t val)
913 {
914         struct LAPIC *lapic = vlapic->apic_page;
915
916         if (lapic->tpr != val) {
917                 VLAPIC_CTR2(vlapic, "vlapic TPR changed from %#x to %#x",
918                     lapic->tpr, val);
919                 lapic->tpr = val;
920                 vlapic_update_ppr(vlapic);
921         }
922 }
923
924 static uint8_t
925 vlapic_get_tpr(struct vlapic *vlapic)
926 {
927         struct LAPIC *lapic = vlapic->apic_page;
928
929         return (lapic->tpr);
930 }
931
932 void
933 vlapic_set_cr8(struct vlapic *vlapic, uint64_t val)
934 {
935         uint8_t tpr;
936
937         if (val & ~0xf) {
938                 vm_inject_gp(vlapic->vcpu);
939                 return;
940         }
941
942         tpr = val << 4;
943         vlapic_set_tpr(vlapic, tpr);
944 }
945
946 uint64_t
947 vlapic_get_cr8(struct vlapic *vlapic)
948 {
949         uint8_t tpr;
950
951         tpr = vlapic_get_tpr(vlapic);
952         return (tpr >> 4);
953 }
954
/*
 * Return true if the combination of delivery mode, trigger mode, level
 * and destination shorthand encoded in 'icrval' is architecturally
 * valid and should be emulated; false if the IPI must be ignored.
 */
static bool
vlapic_is_icr_valid(uint64_t icrval)
{
        uint32_t mode = icrval & APIC_DELMODE_MASK;
        uint32_t level = icrval & APIC_LEVEL_MASK;
        uint32_t trigger = icrval & APIC_TRIGMOD_MASK;
        uint32_t shorthand = icrval & APIC_DEST_MASK;

        switch (mode) {
        case APIC_DELMODE_FIXED:
                if (trigger == APIC_TRIGMOD_EDGE)
                        return (true);
                /*
                 * AMD allows a level assert IPI and Intel converts a level
                 * assert IPI into an edge IPI.
                 */
                if (trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_ASSERT)
                        return (true);
                break;
        case APIC_DELMODE_LOWPRIO:
        case APIC_DELMODE_SMI:
        case APIC_DELMODE_NMI:
        case APIC_DELMODE_INIT:
                if (trigger == APIC_TRIGMOD_EDGE &&
                    (shorthand == APIC_DEST_DESTFLD ||
                        shorthand == APIC_DEST_ALLESELF))
                        return (true);
                /*
                 * AMD allows a level assert IPI and Intel converts a level
                 * assert IPI into an edge IPI.
                 */
                if (trigger == APIC_TRIGMOD_LEVEL &&
                    level == APIC_LEVEL_ASSERT &&
                    (shorthand == APIC_DEST_DESTFLD ||
                        shorthand == APIC_DEST_ALLESELF))
                        return (true);
                /*
                 * A level triggered deassert INIT is defined in the Intel
                 * Multiprocessor Specification and the Intel Software Developer
                 * Manual. Due to the MPS it's required to send a level assert
                 * INIT to a cpu and then a level deassert INIT. Some operating
                 * systems e.g. FreeBSD or Linux use that algorithm. According
                 * to the SDM a level deassert INIT is only supported by Pentium
                 * and P6 processors. It's always sent to all cpus regardless of
                 * the destination or shorthand field. It resets the arbitration
                 * id register. This register is not software accessible and
                 * only required for the APIC bus arbitration. So, the level
                 * deassert INIT doesn't need any emulation and we should ignore
                 * it. The SDM also defines that newer processors don't support
                 * the level deassert INIT and it's not valid any more. As it's
                 * defined for older systems, it can't be invalid per se.
                 * Otherwise, backward compatibility would be broken. However,
                 * when returning false here, it'll be ignored which is the
                 * desired behaviour.
                 */
                if (mode == APIC_DELMODE_INIT &&
                    trigger == APIC_TRIGMOD_LEVEL &&
                    level == APIC_LEVEL_DEASSERT)
                        return (false);
                break;
        case APIC_DELMODE_STARTUP:
                if (shorthand == APIC_DEST_DESTFLD ||
                    shorthand == APIC_DEST_ALLESELF)
                        return (true);
                break;
        case APIC_DELMODE_RR:
                /* Only available on AMD! */
                if (trigger == APIC_TRIGMOD_EDGE &&
                    shorthand == APIC_DEST_DESTFLD)
                        return (true);
                break;
        case APIC_DELMODE_RESV:
                return (false);
        default:
                __assert_unreachable();
        }

        return (false);
}
1034
/*
 * Handle a write to the low half of the ICR: decode the pending IPI,
 * resolve the destination set and deliver it, or defer INIT/SIPI
 * handling to userland when the IPI exit is enabled.
 *
 * Returns 0 on success and non-zero for an unhandled delivery mode.
 * '*retu' is set to true when a userland exit is required.
 */
int
vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
{
        int i;
        bool phys;
        cpuset_t dmask, ipimask;
        uint64_t icrval;
        uint32_t dest, vec, mode, shorthand;
        struct vlapic *vlapic2;
        struct vm_exit *vmexit;
        struct LAPIC *lapic;

        lapic = vlapic->apic_page;
        /* The IPI is delivered synchronously so it is never left pending. */
        lapic->icr_lo &= ~APIC_DELSTAT_PEND;
        icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo;

        /* xAPIC encodes the destination in the top 8 bits of ICR_HI. */
        if (x2apic(vlapic))
                dest = icrval >> 32;
        else
                dest = icrval >> (32 + 24);
        vec = icrval & APIC_VECTOR_MASK;
        mode = icrval & APIC_DELMODE_MASK;
        phys = (icrval & APIC_DESTMODE_LOG) == 0;
        shorthand = icrval & APIC_DEST_MASK;

        VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec);

        /* Resolve the destination shorthand into a set of vcpus. */
        switch (shorthand) {
        case APIC_DEST_DESTFLD:
                vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, x2apic(vlapic));
                break;
        case APIC_DEST_SELF:
                CPU_SETOF(vlapic->vcpuid, &dmask);
                break;
        case APIC_DEST_ALLISELF:
                dmask = vm_active_cpus(vlapic->vm);
                break;
        case APIC_DEST_ALLESELF:
                dmask = vm_active_cpus(vlapic->vm);
                CPU_CLR(vlapic->vcpuid, &dmask);
                break;
        default:
                __assert_unreachable();
        }

        /*
         * Ignore invalid combinations of the icr.
         */
        if (!vlapic_is_icr_valid(icrval)) {
                VLAPIC_CTR1(vlapic, "Ignoring invalid ICR %016lx", icrval);
                return (0);
        }

        /*
         * ipimask is a set of vCPUs needing userland handling of the current
         * IPI.
         */
        CPU_ZERO(&ipimask);

        switch (mode) {
        case APIC_DELMODE_FIXED:
                /* Vectors 0-15 are reserved and illegal to send. */
                if (vec < 16) {
                        vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR,
                            false);
                        VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec);
                        return (0);
                }

                CPU_FOREACH_ISSET(i, &dmask) {
                        lapic_intr_edge(vlapic->vm, i, vec);
                        vmm_stat_array_incr(vlapic->vcpu, IPIS_SENT, i, 1);
                        VLAPIC_CTR2(vlapic,
                            "vlapic sending ipi %d to vcpuid %d", vec, i);
                }

                break;
        case APIC_DELMODE_NMI:
                CPU_FOREACH_ISSET(i, &dmask) {
                        vm_inject_nmi(vlapic->vm, i);
                        VLAPIC_CTR1(vlapic,
                            "vlapic sending ipi nmi to vcpuid %d", i);
                }

                break;
        case APIC_DELMODE_INIT:
                if (!vlapic->ipi_exit) {
                        if (!phys)
                                break;

                        i = vm_apicid2vcpuid(vlapic->vm, dest);
                        if (i >= vm_get_maxcpus(vlapic->vm) ||
                            i == vlapic->vcpuid)
                                break;

                        /*
                         * Userland which doesn't support the IPI exit
                         * requires that the boot state is set to SIPI
                         * here.
                         */
                        vlapic2 = vm_lapic(vm_vcpu(vlapic->vm, i));
                        vlapic2->boot_state = BS_SIPI;
                        break;
                }

                CPU_COPY(&dmask, &ipimask);
                break;
        case APIC_DELMODE_STARTUP:
                if (!vlapic->ipi_exit) {
                        if (!phys)
                                break;

                        /*
                         * Old bhyve versions don't support the IPI
                         * exit. Translate it into the old style.
                         */
                        i = vm_apicid2vcpuid(vlapic->vm, dest);
                        if (i >= vm_get_maxcpus(vlapic->vm) ||
                            i == vlapic->vcpuid)
                                break;

                        /*
                         * Ignore SIPIs in any state other than wait-for-SIPI
                         */
                        vlapic2 = vm_lapic(vm_vcpu(vlapic->vm, i));
                        if (vlapic2->boot_state != BS_SIPI)
                                break;
                        vlapic2->boot_state = BS_RUNNING;

                        vmexit = vm_exitinfo(vlapic->vcpu);
                        vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
                        vmexit->u.spinup_ap.vcpu = i;
                        vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;

                        *retu = true;
                        break;
                }

                CPU_FOREACH_ISSET(i, &dmask) {
                        vlapic2 = vm_lapic(vm_vcpu(vlapic->vm, i));

                        /*
                         * Ignore SIPIs in any state other than wait-for-SIPI
                         */
                        if (vlapic2->boot_state != BS_SIPI)
                                continue;
                        vlapic2->boot_state = BS_RUNNING;
                        CPU_SET(i, &ipimask);
                }

                break;
        default:
                return (1);
        }

        /* Hand any deferred INIT/SIPI targets to userland via a VM exit. */
        if (!CPU_EMPTY(&ipimask)) {
                vmexit = vm_exitinfo(vlapic->vcpu);
                vmexit->exitcode = VM_EXITCODE_IPI;
                vmexit->u.ipi.mode = mode;
                vmexit->u.ipi.vector = vec;
                vmexit->u.ipi.dmask = dmask;

                *retu = true;
        }

        return (0);
}
1201
/*
 * Rendezvous callback for an INIT IPI: reset the target vcpu's local
 * APIC and leave it waiting for a SIPI.
 */
static void
vlapic_handle_init(struct vcpu *vcpu, void *arg)
{
        struct vlapic *vlapic = vm_lapic(vcpu);

        vlapic_reset(vlapic);

        /* vlapic_reset modifies the boot state. */
        vlapic->boot_state = BS_SIPI;
}
1212
1213 int
1214 vm_handle_ipi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1215 {
1216         *retu = true;
1217         switch (vme->u.ipi.mode) {
1218         case APIC_DELMODE_INIT:
1219                 vm_smp_rendezvous(vcpu, vme->u.ipi.dmask, vlapic_handle_init,
1220                     NULL);
1221                 break;
1222         case APIC_DELMODE_STARTUP:
1223                 break;
1224         default:
1225                 return (1);
1226         }
1227
1228         return (0);
1229 }
1230
1231 void
1232 vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val)
1233 {
1234         int vec;
1235
1236         KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode"));
1237
1238         vec = val & 0xff;
1239         lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec);
1240         vmm_stat_array_incr(vlapic->vcpu, IPIS_SENT, vlapic->vcpuid, 1);
1241         VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec);
1242 }
1243
/*
 * Return 1 if an interrupt is pending in the IRR with higher priority
 * than the current PPR, 0 otherwise.  If 'vecptr' is not NULL the
 * pending vector is returned through it.
 */
int
vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
{
        struct LAPIC    *lapic = vlapic->apic_page;
        int              idx, i, bitpos, vector;
        uint32_t        *irrptr, val;

        vlapic_update_ppr(vlapic);

        /* Hardware-assisted vlapics track pending interrupts themselves. */
        if (vlapic->ops.pending_intr)
                return ((*vlapic->ops.pending_intr)(vlapic, vecptr));

        irrptr = &lapic->irr0;

        /* Scan the IRR from the highest-priority group downwards. */
        for (i = 7; i >= 0; i--) {
                /* The IRR registers are laid out 16 bytes (4 words) apart. */
                idx = i * 4;
                val = atomic_load_acq_int(&irrptr[idx]);
                bitpos = fls(val);
                if (bitpos != 0) {
                        vector = i * 32 + (bitpos - 1);
                        if (PRIO(vector) > PRIO(lapic->ppr)) {
                                VLAPIC_CTR1(vlapic, "pending intr %d", vector);
                                if (vecptr != NULL)
                                        *vecptr = vector;
                                return (1);
                        } else
                                break;
                }
        }
        return (0);
}
1275
/*
 * Transition 'vector' from pending (IRR) to in-service (ISR) after the
 * vcpu has accepted it, and push it on the in-service vector stack used
 * for PPR computation.
 */
void
vlapic_intr_accepted(struct vlapic *vlapic, int vector)
{
        struct LAPIC    *lapic = vlapic->apic_page;
        uint32_t        *irrptr, *isrptr;
        int             idx, stk_top;

        if (vlapic->ops.intr_accepted)
                return ((*vlapic->ops.intr_accepted)(vlapic, vector));

        /*
         * clear the ready bit for vector being accepted in irr
         * and set the vector as in service in isr.
         */
        idx = (vector / 32) * 4;

        irrptr = &lapic->irr0;
        atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
        VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");

        isrptr = &lapic->isr0;
        isrptr[idx] |= 1 << (vector % 32);
        VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");

        /*
         * Update the PPR
         */
        vlapic->isrvec_stk_top++;

        stk_top = vlapic->isrvec_stk_top;
        if (stk_top >= ISRVEC_STK_SIZE)
                panic("isrvec_stk_top overflow %d", stk_top);

        vlapic->isrvec_stk[stk_top] = vector;
}
1311
/*
 * Handle a write to the spurious vector register.  The interesting state
 * change is a toggle of the software-enable bit: disabling stops the
 * APIC timer and masks all LVT entries, while re-enabling restarts the
 * timer if it was configured in periodic mode.
 */
void
vlapic_svr_write_handler(struct vlapic *vlapic)
{
        struct LAPIC *lapic;
        uint32_t old, new, changed;

        lapic = vlapic->apic_page;

        new = lapic->svr;
        old = vlapic->svr_last;
        vlapic->svr_last = new;

        changed = old ^ new;
        if ((changed & APIC_SVR_ENABLE) != 0) {
                if ((new & APIC_SVR_ENABLE) == 0) {
                        /*
                         * The apic is now disabled so stop the apic timer
                         * and mask all the LVT entries.
                         */
                        VLAPIC_CTR0(vlapic, "vlapic is software-disabled");
                        VLAPIC_TIMER_LOCK(vlapic);
                        callout_stop(&vlapic->callout);
                        VLAPIC_TIMER_UNLOCK(vlapic);
                        vlapic_mask_lvts(vlapic);
                } else {
                        /*
                         * The apic is now enabled so restart the apic timer
                         * if it is configured in periodic mode.
                         */
                        VLAPIC_CTR0(vlapic, "vlapic is software-enabled");
                        if (vlapic_periodic_timer(vlapic))
                                vlapic_icrtmr_write_handler(vlapic);
                }
        }
}
1347
/*
 * Emulate a read of a local APIC register at 'offset'.
 *
 * 'mmio_access' is non-zero for MMIO (xAPIC) accesses and zero for MSR
 * (x2APIC) accesses; a read using the access method that does not match
 * the current APIC mode returns 0.  The register value is returned in
 * '*data'.  Always returns 0; '*retu' is not used by this path.
 */
int
vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset,
    uint64_t *data, bool *retu)
{
        struct LAPIC    *lapic = vlapic->apic_page;
        uint32_t        *reg;
        int              i;

        /* Ignore MMIO accesses in x2APIC mode */
        if (x2apic(vlapic) && mmio_access) {
                VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode",
                    offset);
                *data = 0;
                goto done;
        }

        if (!x2apic(vlapic) && !mmio_access) {
                /*
                 * XXX Generate GP fault for MSR accesses in xAPIC mode
                 */
                VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in "
                    "xAPIC mode", offset);
                *data = 0;
                goto done;
        }

        /* Reads beyond the APIC page return 0. */
        if (offset > sizeof(*lapic)) {
                *data = 0;
                goto done;
        }

        /* Truncate to a 4-byte register boundary. */
        offset &= ~3;
        switch(offset)
        {
                case APIC_OFFSET_ID:
                        *data = lapic->id;
                        break;
                case APIC_OFFSET_VER:
                        *data = lapic->version;
                        break;
                case APIC_OFFSET_TPR:
                        *data = vlapic_get_tpr(vlapic);
                        break;
                case APIC_OFFSET_APR:
                        *data = lapic->apr;
                        break;
                case APIC_OFFSET_PPR:
                        *data = lapic->ppr;
                        break;
                case APIC_OFFSET_EOI:
                        *data = lapic->eoi;
                        break;
                case APIC_OFFSET_LDR:
                        *data = lapic->ldr;
                        break;
                case APIC_OFFSET_DFR:
                        *data = lapic->dfr;
                        break;
                case APIC_OFFSET_SVR:
                        *data = lapic->svr;
                        break;
                case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
                        i = (offset - APIC_OFFSET_ISR0) >> 2;
                        reg = &lapic->isr0;
                        *data = *(reg + i);
                        break;
                case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
                        i = (offset - APIC_OFFSET_TMR0) >> 2;
                        reg = &lapic->tmr0;
                        *data = *(reg + i);
                        break;
                case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
                        i = (offset - APIC_OFFSET_IRR0) >> 2;
                        reg = &lapic->irr0;
                        *data = atomic_load_acq_int(reg + i);
                        break;
                case APIC_OFFSET_ESR:
                        *data = lapic->esr;
                        break;
                case APIC_OFFSET_ICR_LOW:
                        *data = lapic->icr_lo;
                        /* In x2APIC mode the ICR is read as a single 64-bit MSR. */
                        if (x2apic(vlapic))
                                *data |= (uint64_t)lapic->icr_hi << 32;
                        break;
                case APIC_OFFSET_ICR_HI:
                        *data = lapic->icr_hi;
                        break;
                case APIC_OFFSET_CMCI_LVT:
                case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
                        *data = vlapic_get_lvt(vlapic, offset);
#ifdef INVARIANTS
                        reg = vlapic_get_lvtptr(vlapic, offset);
                        KASSERT(*data == *reg, ("inconsistent lvt value at "
                            "offset %#lx: %#lx/%#x", offset, *data, *reg));
#endif
                        break;
                case APIC_OFFSET_TIMER_ICR:
                        *data = lapic->icr_timer;
                        break;
                case APIC_OFFSET_TIMER_CCR:
                        *data = vlapic_get_ccr(vlapic);
                        break;
                case APIC_OFFSET_TIMER_DCR:
                        *data = lapic->dcr_timer;
                        break;
                case APIC_OFFSET_SELF_IPI:
                        /*
                         * XXX generate a GP fault if vlapic is in x2apic mode
                         */
                        *data = 0;
                        break;
                case APIC_OFFSET_RRR:
                default:
                        *data = 0;
                        break;
        }
done:
        VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data);
        return 0;
}
1468
/*
 * Emulate a write of 'data' to the local APIC register at 'offset'.
 *
 * 'mmio_access' is non-zero for MMIO (xAPIC) accesses and zero for MSR
 * (x2APIC) accesses; writes using the access method that does not match
 * the current APIC mode are silently dropped.  '*retu' may be set by
 * the ICR handler when a userland exit is required.  Returns 0 on
 * success or the ICR handler's error.
 */
int
vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset,
    uint64_t data, bool *retu)
{
        struct LAPIC    *lapic = vlapic->apic_page;
        uint32_t        *regptr;
        int             retval;

        KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE,
            ("vlapic_write: invalid offset %#lx", offset));

        VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx",
            offset, data);

        /* Writes beyond the APIC page are dropped. */
        if (offset > sizeof(*lapic))
                return (0);

        /* Ignore MMIO accesses in x2APIC mode */
        if (x2apic(vlapic) && mmio_access) {
                VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx "
                    "in x2APIC mode", data, offset);
                return (0);
        }

        /*
         * XXX Generate GP fault for MSR accesses in xAPIC mode
         */
        if (!x2apic(vlapic) && !mmio_access) {
                VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx "
                    "in xAPIC mode", data, offset);
                return (0);
        }

        retval = 0;
        switch(offset)
        {
                case APIC_OFFSET_ID:
                        lapic->id = data;
                        vlapic_id_write_handler(vlapic);
                        break;
                case APIC_OFFSET_TPR:
                        vlapic_set_tpr(vlapic, data & 0xff);
                        break;
                case APIC_OFFSET_EOI:
                        vlapic_process_eoi(vlapic);
                        break;
                case APIC_OFFSET_LDR:
                        lapic->ldr = data;
                        vlapic_ldr_write_handler(vlapic);
                        break;
                case APIC_OFFSET_DFR:
                        lapic->dfr = data;
                        vlapic_dfr_write_handler(vlapic);
                        break;
                case APIC_OFFSET_SVR:
                        lapic->svr = data;
                        vlapic_svr_write_handler(vlapic);
                        break;
                case APIC_OFFSET_ICR_LOW:
                        lapic->icr_lo = data;
                        /* In x2APIC mode the ICR is written as one 64-bit MSR. */
                        if (x2apic(vlapic))
                                lapic->icr_hi = data >> 32;
                        retval = vlapic_icrlo_write_handler(vlapic, retu);
                        break;
                case APIC_OFFSET_ICR_HI:
                        lapic->icr_hi = data;
                        break;
                case APIC_OFFSET_CMCI_LVT:
                case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
                        regptr = vlapic_get_lvtptr(vlapic, offset);
                        *regptr = data;
                        vlapic_lvt_write_handler(vlapic, offset);
                        break;
                case APIC_OFFSET_TIMER_ICR:
                        lapic->icr_timer = data;
                        vlapic_icrtmr_write_handler(vlapic);
                        break;

                case APIC_OFFSET_TIMER_DCR:
                        lapic->dcr_timer = data;
                        vlapic_dcr_write_handler(vlapic);
                        break;

                case APIC_OFFSET_ESR:
                        vlapic_esr_write_handler(vlapic);
                        break;

                case APIC_OFFSET_SELF_IPI:
                        /* SELF_IPI only exists in x2APIC mode. */
                        if (x2apic(vlapic))
                                vlapic_self_ipi_handler(vlapic, data);
                        break;

                case APIC_OFFSET_VER:
                case APIC_OFFSET_APR:
                case APIC_OFFSET_PPR:
                case APIC_OFFSET_RRR:
                case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
                case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
                case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
                case APIC_OFFSET_TIMER_CCR:
                default:
                        // Read only.
                        break;
        }

        return (retval);
}
1576
/*
 * Reset the APIC page to its power-on defaults: all registers zeroed,
 * ID/version programmed, every LVT masked, the TMR cleared, the timer
 * divisor reinitialized and the boot state set (BSP running, APs
 * waiting for INIT).
 */
static void
vlapic_reset(struct vlapic *vlapic)
{
        struct LAPIC *lapic;

        lapic = vlapic->apic_page;
        bzero(lapic, sizeof(struct LAPIC));

        lapic->id = vlapic_get_id(vlapic);
        lapic->version = VLAPIC_VERSION;
        lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT);
        /* Power-on DFR is flat model (all ones). */
        lapic->dfr = 0xffffffff;
        lapic->svr = APIC_SVR_VECTOR;
        vlapic_mask_lvts(vlapic);
        vlapic_reset_tmr(vlapic);

        lapic->dcr_timer = 0;
        vlapic_dcr_write_handler(vlapic);

        if (vlapic->vcpuid == 0)
                vlapic->boot_state = BS_RUNNING;        /* BSP */
        else
                vlapic->boot_state = BS_INIT;           /* AP */

        /* Keep the cached SVR in sync so the next write sees no spurious toggle. */
        vlapic->svr_last = lapic->svr;
}
1603
/*
 * One-time initialization of a vlapic: validate the backpointers set up
 * by the caller, initialize the timer lock and callout, program the
 * default APIC base MSR and reset the APIC page to its power-on state.
 */
void
vlapic_init(struct vlapic *vlapic)
{
        KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized"));
        KASSERT(vlapic->vcpuid >= 0 &&
            vlapic->vcpuid < vm_get_maxcpus(vlapic->vm),
            ("vlapic_init: vcpuid is not initialized"));
        KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not "
            "initialized"));

        /*
         * If the vlapic is configured in x2apic mode then it will be
         * accessed in the critical section via the MSR emulation code.
         *
         * Therefore the timer mutex must be a spinlock because blockable
         * mutexes cannot be acquired in a critical section.
         */
        mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN);
        callout_init(&vlapic->callout, 1);

        vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;

        /* vcpu 0 is the bootstrap processor. */
        if (vlapic->vcpuid == 0)
                vlapic->msr_apicbase |= APICBASE_BSP;

        /* Userland opts in to IPI exits explicitly. */
        vlapic->ipi_exit = false;

        vlapic_reset(vlapic);
}
1633
/*
 * Tear down a vlapic: drain the timer callout so it cannot fire after
 * the vlapic is freed.
 */
void
vlapic_cleanup(struct vlapic *vlapic)
{

        callout_drain(&vlapic->callout);
}
1640
/*
 * Return the current value of the vcpu's APIC_BASE MSR.
 */
uint64_t
vlapic_get_apicbase(struct vlapic *vlapic)
{

        return (vlapic->msr_apicbase);
}
1647
1648 int
1649 vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new)
1650 {
1651
1652         if (vlapic->msr_apicbase != new) {
1653                 VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx "
1654                     "not supported", vlapic->msr_apicbase, new);
1655                 return (-1);
1656         }
1657
1658         return (0);
1659 }
1660
/*
 * Switch the vcpu's local APIC between xAPIC and x2APIC mode and reset
 * the registers (ID, LDR, DFR) whose contents depend on the mode.
 */
void
vlapic_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state)
{
        struct vlapic *vlapic;
        struct LAPIC *lapic;

        vlapic = vm_lapic(vcpu);

        if (state == X2APIC_DISABLED)
                vlapic->msr_apicbase &= ~APICBASE_X2APIC;
        else
                vlapic->msr_apicbase |= APICBASE_X2APIC;

        /*
         * Reset the local APIC registers whose values are mode-dependent.
         *
         * XXX this works because the APIC mode can be changed only at vcpu
         * initialization time.
         */
        lapic = vlapic->apic_page;
        lapic->id = vlapic_get_id(vlapic);
        if (x2apic(vlapic)) {
                /* x2APIC LDR is read-only and derived from the APIC ID. */
                lapic->ldr = x2apic_ldr(vlapic);
                lapic->dfr = 0;
        } else {
                lapic->ldr = 0;
                lapic->dfr = 0xffffffff;
        }

        /* Let hardware-assisted implementations switch modes too. */
        if (state == X2APIC_ENABLED) {
                if (vlapic->ops.enable_x2apic_mode)
                        (*vlapic->ops.enable_x2apic_mode)(vlapic);
        }
}
1695
/*
 * Deliver an interrupt originating from the ioapic or MSI to all vcpus
 * matching 'dest'.
 *
 * 'level' is the trigger mode, 'phys' selects physical vs logical
 * destination mode and 'vec' is the vector to deliver.  Only the
 * fixed, lowprio and ExtINT delivery modes are supported; anything
 * else is logged and dropped.
 */
void
vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
    int delmode, int vec)
{
        bool lowprio;
        int vcpuid;
        cpuset_t dmask;

        if (delmode != IOART_DELFIXED &&
            delmode != IOART_DELLOPRI &&
            delmode != IOART_DELEXINT) {
                VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode);
                return;
        }
        lowprio = (delmode == IOART_DELLOPRI);

        /*
         * We don't provide any virtual interrupt redirection hardware so
         * all interrupts originating from the ioapic or MSI specify the
         * 'dest' in the legacy xAPIC format.
         */
        vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false);

        CPU_FOREACH_ISSET(vcpuid, &dmask) {
                if (delmode == IOART_DELEXINT) {
                        vm_inject_extint(vm, vcpuid);
                } else {
                        lapic_set_intr(vm, vcpuid, vec, level);
                }
        }
}
1727
/*
 * Notify a running vcpu that it has an interrupt pending to evaluate.
 * 'hostcpu' is the physical cpu the vcpu is currently executing on and
 * 'ipinum' is the IPI vector used for the fallback notification.
 */
void
vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum)
{
        /*
         * Post an interrupt to the vcpu currently running on 'hostcpu'.
         *
         * This is done by leveraging features like Posted Interrupts (Intel)
         * Doorbell MSR (AMD AVIC) that avoid a VM exit.
         *
         * If neither of these features are available then fallback to
         * sending an IPI to 'hostcpu'.
         */
        if (vlapic->ops.post_intr)
                (*vlapic->ops.post_intr)(vlapic, hostcpu);
        else
                ipi_cpu(hostcpu, ipinum);
}
1745
1746 bool
1747 vlapic_enabled(struct vlapic *vlapic)
1748 {
1749         struct LAPIC *lapic = vlapic->apic_page;
1750
1751         if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 &&
1752             (lapic->svr & APIC_SVR_ENABLE) != 0)
1753                 return (true);
1754         else
1755                 return (false);
1756 }
1757
1758 static void
1759 vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level)
1760 {
1761         struct LAPIC *lapic;
1762         uint32_t *tmrptr, mask;
1763         int idx;
1764
1765         lapic = vlapic->apic_page;
1766         tmrptr = &lapic->tmr0;
1767         idx = (vector / 32) * 4;
1768         mask = 1 << (vector % 32);
1769         if (level)
1770                 tmrptr[idx] |= mask;
1771         else
1772                 tmrptr[idx] &= ~mask;
1773
1774         if (vlapic->ops.set_tmr != NULL)
1775                 (*vlapic->ops.set_tmr)(vlapic, vector, level);
1776 }
1777
1778 void
1779 vlapic_reset_tmr(struct vlapic *vlapic)
1780 {
1781         int vector;
1782
1783         VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered");
1784
1785         for (vector = 0; vector <= 255; vector++)
1786                 vlapic_set_tmr(vlapic, vector, false);
1787 }
1788
1789 void
1790 vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
1791     int delmode, int vector)
1792 {
1793         cpuset_t dmask;
1794         bool lowprio;
1795
1796         KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
1797
1798         /*
1799          * A level trigger is valid only for fixed and lowprio delivery modes.
1800          */
1801         if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) {
1802                 VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for "
1803                     "delivery-mode %d", delmode);
1804                 return;
1805         }
1806
1807         lowprio = (delmode == APIC_DELMODE_LOWPRIO);
1808         vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false);
1809
1810         if (!CPU_ISSET(vlapic->vcpuid, &dmask))
1811                 return;
1812
1813         VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector);
1814         vlapic_set_tmr(vlapic, vector, true);
1815 }
1816
1817 #ifdef BHYVE_SNAPSHOT
static void
vlapic_reset_callout(struct vlapic *vlapic, uint32_t ccr)
{
	/*
	 * Rearm the vlapic timer callout after a snapshot restore, using
	 * the current-count register value ('ccr') saved at snapshot time.
	 * The implementation is similar to the one in the
	 * `vlapic_icrtmr_write_handler` function.
	 */
	sbintime_t sbt;
	struct bintime bt;

	VLAPIC_TIMER_LOCK(vlapic);

	/* Time remaining until expiry: 'ccr' ticks at 'timer_freq_bt'. */
	bt = vlapic->timer_freq_bt;
	bintime_mul(&bt, ccr);

	if (ccr != 0) {
		/* Fire 'bt' from now; record the absolute deadline. */
		binuptime(&vlapic->timer_fire_bt);
		bintime_add(&vlapic->timer_fire_bt, &bt);

		sbt = bttosbt(bt);
		vlapic_callout_reset(vlapic, sbt);
	} else {
		/* even if the CCR was 0, periodic timers should be reset */
		if (vlapic_periodic_timer(vlapic)) {
			/* Restart a full period from now. */
			binuptime(&vlapic->timer_fire_bt);
			bintime_add(&vlapic->timer_fire_bt,
				    &vlapic->timer_period_bt);
			sbt = bttosbt(vlapic->timer_period_bt);

			callout_stop(&vlapic->callout);
			vlapic_callout_reset(vlapic, sbt);
		}
	}

	VLAPIC_TIMER_UNLOCK(vlapic);
}
1853
/*
 * Save or restore the vlapic state of every vcpu via the SNAPSHOT_* macros,
 * which jump to 'done' with 'ret' set on failure.  Returns 0 on success.
 */
int
vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta)
{
	int ret;
	struct vlapic *vlapic;
	struct LAPIC *lapic;
	uint32_t ccr;
	uint16_t i, maxcpus;

	KASSERT(vm != NULL, ("%s: arg was NULL", __func__));

	ret = 0;

	maxcpus = vm_get_maxcpus(vm);
	for (i = 0; i < maxcpus; i++) {
		vlapic = vm_lapic(vm_vcpu(vm, i));

		/* snapshot the page first; timer period depends on icr_timer */
		lapic = vlapic->apic_page;
		SNAPSHOT_BUF_OR_LEAVE(lapic, PAGE_SIZE, meta, ret, done);

		SNAPSHOT_VAR_OR_LEAVE(vlapic->esr_pending, meta, ret, done);

		/* 'struct bintime' is serialized field by field. */
		SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.sec,
				      meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.frac,
				      meta, ret, done);

		/*
		 * Timer period is equal to 'icr_timer' ticks at a frequency of
		 * 'timer_freq_bt'.
		 */
		if (meta->op == VM_SNAPSHOT_RESTORE) {
			vlapic->timer_period_bt = vlapic->timer_freq_bt;
			bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer);
		}

		SNAPSHOT_BUF_OR_LEAVE(vlapic->isrvec_stk,
				      sizeof(vlapic->isrvec_stk),
				      meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vlapic->isrvec_stk_top, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vlapic->boot_state, meta, ret, done);

		SNAPSHOT_BUF_OR_LEAVE(vlapic->lvt_last,
				      sizeof(vlapic->lvt_last),
				      meta, ret, done);

		/*
		 * On save, read the live current-count register so a later
		 * restore can recreate the remaining time on the timer.
		 * On restore, 'ccr' is filled in by the macro below.
		 */
		if (meta->op == VM_SNAPSHOT_SAVE)
			ccr = vlapic_get_ccr(vlapic);

		SNAPSHOT_VAR_OR_LEAVE(ccr, meta, ret, done);

		if (meta->op == VM_SNAPSHOT_RESTORE &&
		    vlapic_enabled(vlapic) && lapic->icr_timer != 0) {
			/* Reset the value of the 'timer_fire_bt' and the vlapic
			 * callout based on the value of the current count
			 * register saved when the VM snapshot was created.
			 * If initial count register is 0, timer is not used.
			 * Look at "10.5.4 APIC Timer" in Software Developer Manual.
			 */
			vlapic_reset_callout(vlapic, ccr);
		}
	}

done:
	return (ret);
}
1921 #endif