calendar(1): Significant update to calendars from OpenBSD
[dragonfly.git] / lib / libevtr / evtr.c
1 /*
2  * Copyright (c) 2009, 2010 Aggelos Economopoulos.  All rights reserved.
3  * 
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in
12  *    the documentation and/or other materials provided with the
13  *    distribution.
14  * 3. Neither the name of The DragonFly Project nor the names of its
15  *    contributors may be used to endorse or promote products derived
16  *    from this software without specific, prior written permission.
17  * 
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
22  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
28  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31
32 #include <assert.h>
33 #include <ctype.h>
34 #include <err.h>
35 #include <errno.h>
36 #include <limits.h>
37 #include <stdarg.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <sys/param.h>
42 #include <sys/queue.h>
43 #include <sys/stat.h>
44 #include <sys/tree.h>
45
46
47 #include "evtr.h"
48 #include "internal.h"
49
/* Bitmask of enabled debug message classes; set through evtr_set_debug(). */
unsigned evtr_debug;
51
/*
 * Parse a string of debug flag characters into the bitmask *flags.
 *
 * Every letter 'a'..'z' turns on bit (letter - 'a'); bits that are already
 * set in *flags are preserved.  An 'A' turns on all bits and stops parsing.
 * Any other character terminates the program with exit status 2.
 */
static
void
printd_set_flags(const char *str, unsigned int *flags)
{
        for (; *str; ++str) {
                unsigned char c = (unsigned char)*str;

                if (c == 'A') {
                        *flags = ~0u;
                        return;
                }
                /*
                 * Explicit range check instead of islower(): it is locale
                 * independent and bounds the shift count below to 0..25.
                 */
                if (c < 'a' || c > 'z') {
                        /* errx(), not err(): errno is meaningless here */
                        errx(2, "invalid debug flag %c", c);
                }
                *flags |= 1u << (c - 'a');
        }
}
70
71
/* Library-wide integer constants. */
enum {
        MAX_EVHDR_SIZE = PATH_MAX + 200,
        /* string namespaces */
        EVTR_NS_PATH = 0x1,
        EVTR_NS_FUNC,
        EVTR_NS_DSTR,
        EVTR_NS_MAX,
        /* number of buckets in every struct hashtab */
        NR_BUCKETS = 1021,      /* prime */
        /* size of evtr_query.parse_err_buf */
        PARSE_ERR_BUFSIZE = 256,
        /* records are padded to a multiple of this (see evtr_dump_pad()) */
        REC_ALIGN = 8,
        /* no record crosses a multiple of this (see evtr_dump_avoid_boundary()) */
        REC_BOUNDARY = 1 << 14,
        /* filter flag; evtr_filter_match() asserts it is clear */
        FILTF_ID = 0x10,
        EVTRF_WR = 0x1,         /* open for writing */
        /* evtr_query.flags: ->pending_event holds an event to return */
        EVTRQF_PENDING = 0x1,
};
87
/* 16-bit identifiers stored in struct probe_event_header. */
typedef uint16_t fileid_t;
typedef uint16_t funcid_t;
typedef uint16_t fmtid_t;
91
/*
 * Common leading part of the probe, string and fmt event headers below.
 * Packed, so there is no padding between the fields.
 */
struct trace_event_header {
        uint8_t type;
        uint64_t ts;    /* XXX: this should only be part of probe */
} __attribute__((packed));
96
/*
 * Header of a probe event record (packed).  The file/func/fmt members
 * are ids, not strings.
 */
struct probe_event_header {
        struct trace_event_header eh;
        /*
         * For these fields, 0 implies "not available"
         */
        fileid_t file;
        funcid_t caller1;
        funcid_t caller2;
        funcid_t func;
        uint16_t line;
        fmtid_t fmt;
        uint16_t datalen;
        uint8_t cpu;    /* -1 if n/a */
} __attribute__((packed));
111
/*
 * Header of a string record (packed): namespace, id and length of a
 * string.  NOTE(review): ns is presumably one of the EVTR_NS_* values --
 * confirm against the code that reads/writes this header.
 */
struct string_event_header {
        struct trace_event_header eh;
        uint16_t ns;
        uint32_t id;
        uint16_t len;
} __attribute__((packed));
118
/*
 * Header of a format record (packed).  evtr_dump_fmt() writes it followed
 * by subsys_len bytes of subsystem name and fmt_len bytes of format string
 * (neither is NUL terminated in the file).
 */
struct fmt_event_header {
        struct trace_event_header eh;
        uint16_t id;
        uint8_t subsys_len;
        uint8_t fmt_len;
} __attribute__((packed));
125
126 struct cpuinfo_event_header {
127         double freq;
128         uint8_t cpu;
129 } __attribute__((packed));
130
/* One key/value pair; entries of the same bucket are chained via ->next. */
struct hashentry {
        uintptr_t key;
        uintptr_t val;
        struct hashentry *next;
};

/*
 * Fixed-size chained hash table.
 * hashfunc maps a key to a bucket index; cmpfunc returns 0 when two keys
 * are equal (see hash_find()).
 */
struct hashtab {
        struct hashentry *buckets[NR_BUCKETS];
        uintptr_t (*hashfunc)(uintptr_t);
        uintptr_t (*cmpfunc)(uintptr_t, uintptr_t);
};

/* Symbol table: a string-keyed hashtab whose values are evtr_variables. */
struct symtab {
        struct hashtab tab;
};
146
/* A (subsystem, format string) pair; subsys may be NULL (see event_fmt_cmp()). */
struct event_fmt {
        const char *subsys;
        const char *fmt;
};
151
/* Element of evtr_query.unresolved_filtq; refers to one caller-owned filter. */
struct event_filter_unresolved {
        TAILQ_ENTRY(event_filter_unresolved) link;
        evtr_filter_t filt;
};
156
/* Red-black tree node mapping an integer id to an opaque data pointer. */
struct id_map {
        RB_ENTRY(id_map) rb_node;
        int id;
        const void *data;
};
162
/* Tree of id_map nodes, keyed by id. */
RB_HEAD(id_tree, id_map);
/* id -> string map */
struct string_map {
        struct id_tree root;
};

/* id -> struct event_fmt map */
struct fmt_map {
        struct id_tree root;
};

/* Tree of evtr_thread nodes, keyed by thread id. */
RB_HEAD(thread_tree, evtr_thread);

struct thread_map {
        struct thread_tree root;
};
177
/* A callback registered on a query together with its opaque argument. */
struct event_callback {
        void (*cb)(evtr_event_t, void *data);
        void *data;     /* this field must be malloc()ed */
};
182
/* Per-cpu state tracked while processing events. */
struct cpu {
        struct evtr_thread *td; /* currently executing thread */
        double freq;
};
187
/*
 * State of one event trace stream.  The two anonymous unions hold either
 * the write-side (string -> id hash tables) or the read-side (id -> string
 * trees) bookkeeping.
 */
struct evtr {
        FILE *f;
        int flags;
        int err;                /* errno-style error code, 0 if none */
        const char *errmsg;     /* takes precedence over err in evtr_errmsg() */
        off_t bytes;            /* our own file offset, see assert_foff_in_sync() */
        union {
                /*
                 * When writing, we keep track of the strings we've
                 * already dumped so we only dump them once.
                 * Paths, function names etc belong to different
                 * namespaces.
                 */
                struct hashtab_str *strings[EVTR_NS_MAX - 1];
                /*
                 * When reading, we build a map from id to string.
                 * Every id must be defined at the point of use.
                 */
                struct string_map maps[EVTR_NS_MAX - 1];
        };
        union {
                /* same as above, but for subsys+fmt pairs */
                struct fmt_map fmtmap;
                struct hashtab_str *fmts;
        };
        struct thread_map threads;
        struct cpu *cpus;       /* array of ncpus elements, see evtr_cpu() */
        int ncpus;
};
217
/* State of one query (a filtered iteration) over an evtr stream. */
struct evtr_query {
        evtr_t evtr;
        off_t off;
        evtr_filter_t filt;     /* array of nfilt filters; none means match all */
        int nfilt;
        int nmatched;           /* events that matched some filter */
        int ntried;             /* events checked against the filters */
        void *buf;
        int bufsize;
        struct symtab *symtab;
        int ncbs;               /* number of elements in cbs */
        struct event_callback **cbs;
        /*
         * Filters that have a format specified and we
         * need to resolve that to an fmtid
         */
        TAILQ_HEAD(, event_filter_unresolved) unresolved_filtq;
        int err;
        const char *errmsg;
        char parse_err_buf[PARSE_ERR_BUFSIZE];
        int flags;              /* EVTRQF_* */
        struct evtr_event pending_event;
};
241
/*
 * Set the library debug flags (evtr_debug) from the flag string `str`;
 * see printd_set_flags() for the accepted characters.
 */
void
evtr_set_debug(const char *str)
{
        printd_set_flags(str, &evtr_debug);
}
247
/* Instantiate the id_tree functions; nodes are keyed by id_map.id (an int). */
static int id_map_cmp(struct id_map *, struct id_map *);
RB_PROTOTYPE2(id_tree, id_map, rb_node, id_map_cmp, int);
RB_GENERATE2(id_tree, id_map, rb_node, id_map_cmp, int, id);

/* Instantiate the thread_tree functions; nodes are keyed by evtr_thread.id. */
static int thread_cmp(struct evtr_thread *, struct evtr_thread *);
RB_PROTOTYPE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *);
RB_GENERATE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *, id);
255
256 static inline
257 void
258 validate_string(const char *str)
259 {
260         if (!(evtr_debug & MISC))
261                 return;
262         for (; *str; ++str)
263                 assert(isprint(*str));
264 }
265
266 static
267 void
268 id_tree_free(struct id_tree *root)
269 {
270         struct id_map *v, *n;
271
272         for (v = RB_MIN(id_tree, root); v; v = n) {
273                 n = RB_NEXT(id_tree, root, v);
274                 RB_REMOVE(id_tree, root, v);
275         }
276 }
277
278 static
279 int
280 evtr_register_callback(evtr_query_t q, void (*fn)(evtr_event_t, void *), void *d)
281 {
282         struct event_callback *cb;
283         void *cbs;
284
285         if (!(cb = malloc(sizeof(*cb)))) {
286                 q->err = ENOMEM;
287                 return !0;
288         }
289         cb->cb = fn;
290         cb->data = d;
291         if (!(cbs = realloc(q->cbs, (++q->ncbs) * sizeof(cb)))) {
292                 --q->ncbs;
293                 free(cb);
294                 q->err = ENOMEM;
295                 return !0;
296         }
297         q->cbs = cbs;
298         q->cbs[q->ncbs - 1] = cb;
299         return 0;
300 }
301
302 static
303 void
304 evtr_deregister_callbacks(evtr_query_t q)
305 {
306         int i;
307
308         for (i = 0; i < q->ncbs; ++i) {
309                 free(q->cbs[i]);
310         }
311         free(q->cbs);
312         q->cbs = NULL;
313 }
314
315 static
316 void
317 evtr_run_callbacks(evtr_event_t ev, evtr_query_t q)
318 {
319         struct event_callback *cb;
320         int i;
321
322         for (i = 0; i < q->ncbs; ++i) {
323                 cb = q->cbs[i];
324                 cb->cb(ev, cb->data);
325         }
326 }
327
328 static
329 struct cpu *
330 evtr_cpu(evtr_t evtr, int c)
331 {
332         if ((c < 0) || (c >= evtr->ncpus))
333                 return NULL;
334         return &evtr->cpus[c];
335 }
336
337 static int parse_format_data(evtr_event_t ev, const char *fmt, ...)
338                __printflike(2, 3) __scanflike(2, 3);
339
340 static
341 int
342 parse_format_data(evtr_event_t ev, const char *fmt, ...)
343 {
344         va_list ap;
345         char buf[2048];
346
347         if (strcmp(fmt, ev->fmt))
348                 return 0;
349         vsnprintf(buf, sizeof(buf), fmt, __DECONST(void *, ev->fmtdata));
350         printd(MISC, "string is: %s\n", buf);
351         va_start(ap, fmt);
352         return vsscanf(buf, fmt, ap);
353 }
354
355 static
356 void
357 evtr_deregister_filters(evtr_query_t q, evtr_filter_t filt, int nfilt)
358 {
359         struct event_filter_unresolved *u, *tmp;
360         int i;
361         TAILQ_FOREACH_MUTABLE(u, &q->unresolved_filtq, link, tmp) {
362                 for (i = 0; i < nfilt; ++i) {
363                         if (u->filt == &filt[i]) {
364                                 TAILQ_REMOVE(&q->unresolved_filtq, u, link);
365                         }
366                 }
367         }
368 }
369
370 static
371 int
372 evtr_filter_register(evtr_query_t q, evtr_filter_t filt)
373 {
374         struct event_filter_unresolved *res;
375
376         if (!(res = malloc(sizeof(*res)))) {
377                 q->err = ENOMEM;
378                 return !0;
379         }
380         res->filt = filt;
381         TAILQ_INSERT_TAIL(&q->unresolved_filtq, res, link);
382         return 0;
383 }
384
385 static
386 int
387 evtr_query_needs_parsing(evtr_query_t q)
388 {
389         int i;
390
391         for (i = 0; i < q->nfilt; ++i)
392                 if (q->filt[i].ev_type == EVTR_TYPE_STMT)
393                         return !0;
394         return 0;
395 }
396
397 void
398 evtr_event_data(evtr_event_t ev, char *buf, size_t len)
399 {
400         /*
401          * XXX: we implicitly trust the format string.
402          * We shouldn't.
403          */
404         if (ev->fmtdatalen) {
405                 vsnprintf(buf, len, ev->fmt, __DECONST(void *, ev->fmtdata));
406         } else {
407                 strlcpy(buf, ev->fmt, len);
408         }
409 }
410
411 int
412 evtr_error(evtr_t evtr)
413 {
414         return evtr->err || (evtr->errmsg != NULL);
415 }
416
417 const char *
418 evtr_errmsg(evtr_t evtr)
419 {
420         return evtr->errmsg ? evtr->errmsg : strerror(evtr->err);
421 }
422
423 int
424 evtr_query_error(evtr_query_t q)
425 {
426         return q->err || (q->errmsg != NULL) || evtr_error(q->evtr);
427 }
428
429 const char *
430 evtr_query_errmsg(evtr_query_t q)
431 {
432         return q->errmsg ? q->errmsg :
433                 (q->err ? strerror(q->err) :
434                  (evtr_errmsg(q->evtr)));
435 }
436
437 static
438 int
439 id_map_cmp(struct id_map *a, struct id_map *b)
440 {
441         return a->id - b->id;
442 }
443
444 static
445 int
446 thread_cmp(struct evtr_thread *a, struct evtr_thread *b)
447 {
448         ptrdiff_t d;
449         d =  a->id - b->id;
450         if (d < 0)
451                 return -1;
452         if (!d)
453                 return 0;
454         return 1;
455 }
456
/*
 * Define <prefix>_map_find(tree, id): look `id` up in an id_tree and
 * return the stored data pointer as `type`, or NULL if there is no node
 * with that id.
 */
#define DEFINE_MAP_FIND(prefix, type)           \
        static                                  \
        type                            \
        prefix ## _map_find(struct id_tree *tree, int id)\
        {                                                \
                struct id_map *sid;                      \
                                                        \
                sid = id_tree_RB_LOOKUP(tree, id);      \
                return sid ? sid->data : NULL;          \
        }

/* string_map_find(): id -> string */
DEFINE_MAP_FIND(string, const char *)
/* fmt_map_find(): id -> struct event_fmt */
DEFINE_MAP_FIND(fmt, const struct event_fmt *)
470
/*
 * Return the thread whose id is `id`, i.e. the result of
 * thread_tree_RB_LOOKUP() on the map's tree.
 */
static
struct evtr_thread *
thread_map_find(struct thread_map *map, void *id)
{
        return thread_tree_RB_LOOKUP(&map->root, id);
}
477
/*
 * Define <prefix>_map_insert(tree, data, id): map `id` to a private copy
 * of `data` (made with _dup).
 *
 * Returns 0 on success, and also when `id` is already mapped to an equal
 * value (per _cmp).  Returns EEXIST when `id` is already mapped to a
 * different value and ENOMEM when memory runs out; the tree is unchanged
 * in both cases.
 */
#define DEFINE_MAP_INSERT(prefix, type, _cmp, _dup)     \
        static                                  \
        int                                                             \
        prefix ## _map_insert(struct id_tree *tree, type data, int id) \
        {                                                               \
        struct id_map *sid, *osid;                                      \
                                                                        \
        sid = malloc(sizeof(*sid));                                     \
        if (!sid) {                                                     \
                return ENOMEM;                                          \
        }                                                               \
        sid->id = id;                                                   \
        sid->data = data;                                               \
        if ((osid = id_tree_RB_INSERT(tree, sid))) {                    \
                free(sid);                                              \
                if (_cmp((type)osid->data, data)) {                     \
                        return EEXIST;                                  \
                }                                                       \
                printd(DS, "mapping already exists, skipping\n");               \
                /* we're OK with redefinitions of an id to the same string */ \
                return 0;                                               \
        }                                                               \
        /* only do the dup if we're inserting a new mapping */          \
        sid->data = _dup(data);                                         \
        if (!sid->data) {                                               \
                /* don't leave a node with NULL data in the tree */     \
                RB_REMOVE(id_tree, tree, sid);                          \
                free(sid);                                              \
                return ENOMEM;                                          \
        }                                                               \
        return 0;                                                       \
}
504
505 static
506 void
507 thread_map_insert(struct thread_map *map, struct evtr_thread *td)
508 {
509         struct evtr_thread *otd;
510
511         if ((otd = thread_tree_RB_INSERT(&map->root, td))) {
512                 /*
513                  * Thread addresses might be reused, we're
514                  * ok with that.
515                  * DANGER, Will Robinson: this means the user
516                  * of the API needs to copy event->td if they
517                  * want it to remain stable.
518                  */
519                 free((void *)otd->comm);
520                 otd->comm = td->comm;
521                 free(td);
522         }
523 }
524
525 static
526 int
527 event_fmt_cmp(const struct event_fmt *a, const struct event_fmt *b)
528 {
529         int ret = 0;
530
531         if (a->subsys) {
532                 if (b->subsys) {
533                         ret = strcmp(a->subsys, b->subsys);
534                 } else {
535                         ret = strcmp(a->subsys, "");
536                 }
537         } else if (b->subsys) {
538                         ret = strcmp("", b->subsys);
539         }
540         if (ret)
541                 return ret;
542         return strcmp(a->fmt, b->fmt);
543 }
544
545 static
546 struct event_fmt *
547 event_fmt_dup(const struct event_fmt *o)
548 {
549         struct event_fmt *n;
550
551         if (!(n = malloc(sizeof(*n)))) {
552                 return n;
553         }
554         memcpy(n, o, sizeof(*n));
555         return n;
556 }
557
/* string_map_insert(): values compared with strcmp(), copied with strdup() */
DEFINE_MAP_INSERT(string, const char *, strcmp, strdup)
/* fmt_map_insert(): values compared with event_fmt_cmp(), copied with event_fmt_dup() */
DEFINE_MAP_INSERT(fmt, const struct event_fmt *, event_fmt_cmp, event_fmt_dup)
560
561 int
562 hash_find(const struct hashtab *tab, uintptr_t key, uintptr_t *val)
563 {
564         struct hashentry *ent;
565
566         for(ent = tab->buckets[tab->hashfunc(key)];
567             ent && tab->cmpfunc(ent->key, key);
568             ent = ent->next);
569
570         if (!ent)
571                 return !0;
572         *val = ent->val;
573         return 0;
574 }
575
576 struct hashentry *
577 hash_insert(struct hashtab *tab, uintptr_t key, uintptr_t val)
578 {
579         struct hashentry *ent;
580         int hsh;
581
582         if (!(ent = malloc(sizeof(*ent)))) {
583                 fprintf(stderr, "out of memory\n");
584                 return NULL;
585         }
586         hsh = tab->hashfunc(key);
587         ent->next = tab->buckets[hsh];
588         ent->key = key;
589         ent->val = val;
590         tab->buckets[hsh] = ent;
591         return ent;
592 }
593
/*
 * Key comparison for pointer-keyed tables: the result is zero exactly
 * when the two keys are identical.
 */
static
uintptr_t
cmpfunc_pointer(uintptr_t a, uintptr_t b)
{
        uintptr_t diff = b;

        diff -= a;
        return diff;
}
600
601 static
602 uintptr_t
603 hashfunc_pointer(uintptr_t p)
604 {
605         return p % NR_BUCKETS;
606 }
607
608 struct hashtab *
609 hash_new(void)
610 {
611         struct hashtab *tab;
612         if (!(tab = calloc(sizeof(struct hashtab), 1)))
613                 return tab;
614         tab->hashfunc = &hashfunc_pointer;
615         tab->cmpfunc = &cmpfunc_pointer;
616         return tab;
617 }
618
/*
 * String -> id map.  `id` is the most recently assigned id;
 * strhash_insert() increments it for every new string.
 */
struct hashtab_str {    /* string -> id map */
        struct hashtab tab;
        uint16_t id;
};
623
624 static
625 uintptr_t
626 hashfunc_string(uintptr_t p)
627 {
628         const char *str = (char *)p;
629         unsigned long hash = 5381;
630         int c;
631
632         while ((c = *str++))
633             hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
634         return hash  % NR_BUCKETS;
635 }
636
/*
 * Key comparison for string-keyed tables: `a` and `b` are addresses of
 * NUL-terminated strings; the result is zero exactly when they are equal.
 */
static
uintptr_t
cmpfunc_string(uintptr_t a, uintptr_t b)
{
        const char *lhs = (const char *)a;
        const char *rhs = (const char *)b;
        int order = strcmp(lhs, rhs);

        return order;
}
643
644
645 static
646 struct hashtab_str *
647 strhash_new(void)
648 {
649         struct hashtab_str *strtab;
650         if (!(strtab = calloc(sizeof(struct hashtab_str), 1)))
651                 return strtab;
652         strtab->tab.hashfunc = &hashfunc_string;
653         strtab->tab.cmpfunc = &cmpfunc_string;
654         return strtab;
655 }
656
657 static
658 void
659 strhash_destroy(struct hashtab_str *strtab)
660 {
661         free(strtab);
662 }
663
664 static
665 int
666 strhash_find(struct hashtab_str *strtab, const char *str, uint16_t *id)
667 {
668         uintptr_t val;
669
670         if (hash_find(&strtab->tab, (uintptr_t)str, &val))
671                 return !0;
672         *id = (uint16_t)val;
673         return 0;
674 }
675
676 static
677 int
678 strhash_insert(struct hashtab_str *strtab, const char *str, uint16_t *id)
679 {
680         uintptr_t val;
681
682         val = ++strtab->id;
683         if (strtab->id == 0) {
684                 fprintf(stderr, "too many strings\n");
685                 return ERANGE;
686         }
687         str = strdup(str);
688         if (!str) {
689                 fprintf(stderr, "out of memory\n");
690                 --strtab->id;
691                 return ENOMEM;
692         }
693         hash_insert(&strtab->tab, (uintptr_t)str, (uintptr_t)val);
694         *id = strtab->id;
695         return 0;
696 }
697
698 struct symtab *
699 symtab_new(void)
700 {
701         struct symtab *symtab;
702         if (!(symtab = calloc(sizeof(struct symtab), 1)))
703                 return symtab;
704         symtab->tab.hashfunc = &hashfunc_string;
705         symtab->tab.cmpfunc = &cmpfunc_string;
706         return symtab;
707 }
708
709 void
710 symtab_destroy(struct symtab *symtab)
711 {
712         free(symtab);
713 }
714
715 struct evtr_variable *
716 symtab_find(const struct symtab *symtab, const char *str)
717 {
718         uintptr_t val;
719
720         if (hash_find(&symtab->tab, (uintptr_t)str, &val))
721                 return NULL;
722         return (struct evtr_variable *)val;
723 }
724
725 int
726 symtab_insert(struct symtab *symtab, const char *name,
727                struct evtr_variable *var)
728 {
729         name = strdup(name);
730         if (!name) {
731                 fprintf(stderr, "out of memory\n");
732                 return ENOMEM;
733         }
734         hash_insert(&symtab->tab, (uintptr_t)name, (uintptr_t)var);
735         return 0;
736 }
737
738 static
739 int
740 evtr_filter_match(evtr_query_t q, evtr_filter_t f, evtr_event_t ev)
741 {
742         if ((f->cpu != -1) && (f->cpu != ev->cpu))
743                 return 0;
744
745         assert(!(f->flags & FILTF_ID));
746         if (ev->type != f->ev_type)
747                 return 0;
748         if (ev->type == EVTR_TYPE_PROBE) {
749                 if (f->fmt && strcmp(ev->fmt, f->fmt))
750                         return 0;
751         } else if (ev->type == EVTR_TYPE_STMT) {
752                 struct evtr_variable *var;
753                 /* resolve var */
754                 /* XXX: no need to do that *every* time */
755                 parse_var(f->var, q->symtab, &var, &q->parse_err_buf[0],
756                           PARSE_ERR_BUFSIZE);
757                 /*
758                  * Ignore errors, they're expected since the
759                  * variable might not be instantiated yet
760                  */
761                 if (var != ev->stmt.var)
762                         return 0;
763         }
764         return !0;
765 }
766
767 static
768 int
769 evtr_match_filters(struct evtr_query *q, evtr_event_t ev)
770 {
771         int i;
772
773         /* no filters means we're interested in all events */
774         if (!q->nfilt)
775                 return !0;
776         ++q->ntried;
777         for (i = 0; i < q->nfilt; ++i) {
778                 if (evtr_filter_match(q, &q->filt[i], ev)) {
779                         ++q->nmatched;
780                         return !0;
781                 }
782         }
783         return 0;
784 }
785
786 static
787 void
788 parse_callback(evtr_event_t ev, void *d)
789 {
790         evtr_query_t q = (evtr_query_t)d;
791         if (ev->type != EVTR_TYPE_PROBE)
792                 return;
793         if (!ev->fmt || (ev->fmt[0] != '#'))
794                 return;
795         /*
796          * Copy the event to ->pending_event, then call
797          * the parser to convert it into a synthesized
798          * EVTR_TYPE_STMT event.
799          */
800         memcpy(&q->pending_event, ev, sizeof(*ev));
801         parse_string(&q->pending_event, q->symtab, &ev->fmt[1],
802                      &q->parse_err_buf[0], PARSE_ERR_BUFSIZE);
803         if (q->parse_err_buf[0]) {      /* parse error */
804                 q->errmsg = &q->parse_err_buf[0];
805                 return;
806         }
807         if (!evtr_match_filters(q, &q->pending_event))
808                 return;
809         /*
810          * This will cause us to return ->pending_event next time
811          * we're called.
812          */
813         q->flags |= EVTRQF_PENDING;
814 }
815
816 static
817 void
818 thread_creation_callback(evtr_event_t ev, void *d)
819 {
820         evtr_query_t q = (evtr_query_t)d;
821         evtr_t evtr = q->evtr;
822         struct evtr_thread *td;
823         void *ktd;
824         char buf[20];
825
826         if (parse_format_data(ev, "new_td %p %s", &ktd, buf) != 2) {
827                 return;
828         }
829         buf[19] = '\0';
830
831         if (!(td = malloc(sizeof(*td)))) {
832                 q->err = ENOMEM;
833                 return;
834         }
835         td->id = ktd;
836         td->userdata = NULL;
837         if (!(td->comm = strdup(buf))) {
838                 free(td);
839                 q->err = ENOMEM;
840                 return;
841         }
842         printd(DS, "inserting new thread %p: %s\n", td->id, td->comm);
843         thread_map_insert(&evtr->threads, td);
844 }
845
/*
 * Query callback (d is the evtr_query_t): on a "sw  %p > %p" context
 * switch event, update the event cpu's currently executing thread.
 *
 * Events for an invalid cpu, or with a different format, are ignored.
 * If the thread being switched to is not in the thread map yet, a
 * "new_td" event is synthesized for it (named after its address) and
 * fed to thread_creation_callback().
 */
static
void
thread_switch_callback(evtr_event_t ev, void *d)
{
        evtr_t evtr = ((evtr_query_t)d)->evtr;
        struct evtr_thread *tdp, *tdn;
        void *ktdp, *ktdn;
        struct cpu *cpu;
        static struct evtr_event tdcr;
        static char *fmt = "new_td %p %s";
        char tidstr[40];
        /* argument block for fmt: one pointer followed by one string pointer */
        char fmtdata[sizeof(void *) + sizeof(char *)];

        cpu = evtr_cpu(evtr, ev->cpu);
        if (!cpu) {
                printw("invalid cpu %d\n", ev->cpu);
                return;
        }
        if (parse_format_data(ev, "sw  %p > %p", &ktdp, &ktdn) != 2) {
                return;
        }
        /* the previous thread is only looked up for the debug message */
        tdp = thread_map_find(&evtr->threads, ktdp);
        if (!tdp) {
                printd(DS, "switching from unknown thread %p\n", ktdp);
        }
        tdn = thread_map_find(&evtr->threads, ktdn);
        if (!tdn) {
                /*
                 * Fake a thread creation event for threads we
                 * haven't seen before.
                 */
                tdcr.type = EVTR_TYPE_PROBE;
                tdcr.ts = ev->ts;
                tdcr.file = NULL;
                tdcr.func = NULL;
                tdcr.line = 0;
                tdcr.fmt = fmt;
                tdcr.fmtdata = &fmtdata;
                tdcr.fmtdatalen = sizeof(fmtdata);
                tdcr.cpu = ev->cpu;
                tdcr.td = NULL;
                /* the thread's address doubles as its name */
                snprintf(tidstr, sizeof(tidstr), "%p", ktdn);
                ((void **)fmtdata)[0] = ktdn;
                ((char **)fmtdata)[1] = &tidstr[0];
                thread_creation_callback(&tdcr, d);

                /*
                 * NOTE(review): thread_creation_callback() returns
                 * without inserting on allocation failure, in which
                 * case this assertion fires.
                 */
                tdn = thread_map_find(&evtr->threads, ktdn);
                assert(tdn != NULL);
                printd(DS, "switching to unknown thread %p\n", ktdn);
                cpu->td = tdn;
                return;
        }
        printd(DS, "cpu %d: switching to thread %p\n", ev->cpu, ktdn);
        cpu->td = tdn;
}
901
902 static
903 void
904 assert_foff_in_sync(evtr_t evtr)
905 {
906         off_t off;
907
908         /*
909          * We keep our own offset because we
910          * might want to support mmap()
911          */
912         off = ftello(evtr->f);
913         if (evtr->bytes != off) {
914                 fprintf(stderr, "bytes %jd, off %jd\n", evtr->bytes, off);
915                 abort();
916         }
917 }
918
919 static
920 int
921 evtr_write(evtr_t evtr, const void *buf, size_t bytes)
922 {
923         assert_foff_in_sync(evtr);
924         if (fwrite(buf, bytes, 1, evtr->f) != 1) {
925                 evtr->err = errno;
926                 evtr->errmsg = strerror(errno);
927                 return !0;
928         }
929         evtr->bytes += bytes;
930         assert_foff_in_sync(evtr);
931         return 0;
932 }
933
934 /*
935  * Called after dumping a record to make sure the next
936  * record is REC_ALIGN aligned. This does not make much sense,
937  * as we shouldn't be using packed structs anyway.
938  */
939 static
940 int
941 evtr_dump_pad(evtr_t evtr)
942 {
943         size_t pad;
944         static char buf[REC_ALIGN];
945
946         pad = REC_ALIGN - (evtr->bytes % REC_ALIGN);
947         if (pad > 0) {
948                 return evtr_write(evtr, buf, pad);
949         }
950         return 0;
951 }
952
953 /*
954  * We make sure that there is a new record every REC_BOUNDARY
955  * bytes, this costs next to nothing in space and allows for
956  * fast seeking.
957  */
958 static
959 int
960 evtr_dump_avoid_boundary(evtr_t evtr, size_t bytes)
961 {
962         unsigned pad, i;
963         static char buf[256];
964
965         pad = REC_BOUNDARY - (evtr->bytes % REC_BOUNDARY);
966         /* if adding @bytes would cause us to cross a boundary... */
967         if (bytes > pad) {
968                 /* then pad to the boundary */
969                 for (i = 0; i < (pad / sizeof(buf)); ++i) {
970                         if (evtr_write(evtr, buf, sizeof(buf))) {
971                                 return !0;
972                         }
973                 }
974                 i = pad % sizeof(buf);
975                 if (i) {
976                         if (evtr_write(evtr, buf, i)) {
977                                 return !0;
978                         }
979                 }
980         }
981         return 0;
982 }
983
984 static
985 int
986 evtr_dump_fmt(evtr_t evtr, uint64_t ts, const evtr_event_t ev)
987 {
988         struct fmt_event_header fmt;
989         uint16_t id;
990         int err;
991         char *subsys = "", buf[1024];
992
993         if (strlcpy(buf, subsys, sizeof(buf)) >= sizeof(buf)) {
994                 evtr->errmsg = "name of subsystem is too large";
995                 evtr->err = ERANGE;
996                 return 0;
997         }
998         if (strlcat(buf, ev->fmt, sizeof(buf)) >= sizeof(buf)) {
999                 evtr->errmsg = "fmt + name of subsystem is too large";
1000                 evtr->err = ERANGE;
1001                 return 0;
1002         }
1003
1004         if (!strhash_find(evtr->fmts, buf, &id)) {
1005                 return id;
1006         }
1007         if ((err = strhash_insert(evtr->fmts, buf, &id))) {
1008                 evtr->err = err;
1009                 return 0;
1010         }
1011
1012         fmt.eh.type = EVTR_TYPE_FMT;
1013         fmt.eh.ts = ts;
1014         fmt.subsys_len = strlen(subsys);
1015         fmt.fmt_len = strlen(ev->fmt);
1016         fmt.id = id;
1017         if (evtr_dump_avoid_boundary(evtr, sizeof(fmt) + fmt.subsys_len +
1018                                      fmt.fmt_len))
1019                 return 0;
1020         if (evtr_write(evtr, &fmt, sizeof(fmt)))
1021                 return 0;
1022         if (evtr_write(evtr, subsys, fmt.subsys_len))
1023                 return 0;
1024         if (evtr_write(evtr, ev->fmt, fmt.fmt_len))
1025                 return 0;
1026         if (evtr_dump_pad(evtr))
1027                 return 0;
1028         return fmt.id;
1029 }
1030
/*
 * Replace string pointers or string ids in fmtdata.
 *
 * fmt is scanned as a printf-style format. For every conversion the
 * matching slot in the packed argument buffer fmtdata is stepped over;
 * for every '%s' the pointer-sized slot is passed to replace(ctx, slot)
 * and overwritten with the value replace() returns.
 *
 * Returns the number of '%s' slots that were rewritten, or -1 (after
 * printing a diagnostic on stderr) when an unsupported conversion
 * specifier is found.
 */
static
int
mangle_string_ptrs(const char *fmt, uint8_t *fmtdata,
		   const char *(*replace)(void *, const char *), void *ctx)
{
	const char *f, *p;
	size_t skipsize, intsz;
	int ret = 0;

	for (f = fmt; f[0] != '\0'; ++f) {
		if (f[0] != '%')
			continue;
		++f;
		skipsize = 0;
		/*
		 * Eat flags. Notice this will accept duplicate
		 * flags.
		 */
		for (p = f; p[0] != '\0' && strchr("#0- +'", p[0]) != NULL; ++p)
			;
		/*
		 * Eat minimum field width, if any. The cast keeps
		 * isdigit() defined for bytes with the high bit set.
		 */
		while (isdigit((unsigned char)p[0]))
			++p;
		if (p[0] == '.')
			++p;
		/* Eat precision, if any */
		while (isdigit((unsigned char)p[0]))
			++p;
		/* Length modifier; intsz stays 0 when there is none */
		intsz = 0;
		switch (p[0]) {
		case 'l':
			if (p[1] == 'l') {
				++p;
				intsz = sizeof(long long);
			} else {
				intsz = sizeof(long);
			}
			break;
		case 'j':
			intsz = sizeof(intmax_t);
			break;
		case 't':
			intsz = sizeof(ptrdiff_t);
			break;
		case 'z':
			intsz = sizeof(size_t);
			break;
		default:
			break;
		}
		if (intsz != 0)
			++p;
		else
			intsz = sizeof(int);

		switch (p[0]) {
		case 'd':
		case 'i':
		case 'o':
		case 'u':
		case 'x':
		case 'X':
		case 'c':
			skipsize = intsz;
			break;
		case 'p':
			skipsize = sizeof(void *);
			break;
		case 'f':
			if (p[-1] == 'l')
				skipsize = sizeof(double);
			else
				skipsize = sizeof(float);
			break;
		case 's': {
			const char *str;

			/*
			 * The slot follows arbitrary-sized arguments, so it
			 * need not be pointer-aligned; go through memcpy()
			 * instead of dereferencing a cast pointer.
			 */
			memcpy(&str, fmtdata, sizeof(str));
			str = replace(ctx, str);
			memcpy(fmtdata, &str, sizeof(str));
			skipsize = sizeof(char *);
			++ret;
			break;
		}
		default:
			fprintf(stderr, "Unknown conversion specifier %c "
				"in fmt starting with %s\n", p[0], f - 1);
			return -1;
		}
		fmtdata += skipsize;
	}
	return ret;
}
1136
1137 /* XXX: do we really want the timestamp? */
1138 static
1139 int
1140 evtr_dump_string(evtr_t evtr, uint64_t ts, const char *str, int ns)
1141 {
1142         struct string_event_header s;
1143         int err;
1144         uint16_t id;
1145
1146         assert((0 <= ns) && (ns < EVTR_NS_MAX));
1147         if (!strhash_find(evtr->strings[ns], str, &id)) {
1148                 return id;
1149         }
1150         if ((err = strhash_insert(evtr->strings[ns], str, &id))) {
1151                 evtr->err = err;
1152                 return 0;
1153         }
1154
1155         printd(DS, "hash_insert %s ns %d id %d\n", str, ns, id);
1156         s.eh.type = EVTR_TYPE_STR;
1157         s.eh.ts = ts;
1158         s.ns = ns;
1159         s.id = id;
1160         s.len = strnlen(str, PATH_MAX);
1161
1162         if (evtr_dump_avoid_boundary(evtr, sizeof(s) + s.len))
1163                 return 0;
1164         if (evtr_write(evtr, &s, sizeof(s)))
1165                 return 0;
1166         if (evtr_write(evtr, str, s.len))
1167                 return 0;
1168         if (evtr_dump_pad(evtr))
1169                 return 0;
1170         return s.id;
1171 }
1172
1173 struct replace_ctx {
1174         evtr_t evtr;
1175         uint64_t ts;
1176 };
1177
1178 static
1179 const char *
1180 replace_strptr(void *_ctx, const char *s)
1181 {
1182         struct replace_ctx *ctx = _ctx;
1183         return (const char *)(uintptr_t)evtr_dump_string(ctx->evtr, ctx->ts, s,
1184                                                          EVTR_NS_DSTR);
1185 }
1186
1187 static
1188 const char *
1189 replace_strid(void *_ctx, const char *s)
1190 {
1191         struct replace_ctx *ctx = _ctx;
1192         const char *ret;
1193
1194         ret = string_map_find(&ctx->evtr->maps[EVTR_NS_DSTR - 1].root,
1195                               (int)(uintptr_t)s);
1196         if (!ret) {
1197                 fprintf(stderr, "Unknown id for data string\n");
1198                 ctx->evtr->errmsg = "unknown id for data string";
1199                 ctx->evtr->err = !0;
1200         }
1201         validate_string(ret);
1202         printd(DS, "replacing strid %d (ns %d) with string '%s' (or int %#x)\n",
1203                (int)(uintptr_t)s, EVTR_NS_DSTR, ret ? ret : "NULL", (int)(uintptr_t)ret);
1204         return ret;
1205 }
1206
1207 static
1208 int
1209 evtr_dump_probe(evtr_t evtr, evtr_event_t ev)
1210 {
1211         struct probe_event_header kev;
1212         char buf[1024];
1213
1214         memset(&kev, '\0', sizeof(kev));
1215         kev.eh.type = ev->type;
1216         kev.eh.ts = ev->ts;
1217         kev.line = ev->line;
1218         kev.cpu = ev->cpu;
1219         if (ev->file) {
1220                 kev.file = evtr_dump_string(evtr, kev.eh.ts, ev->file,
1221                                             EVTR_NS_PATH);
1222         }
1223         if (ev->func) {
1224                 kev.func = evtr_dump_string(evtr, kev.eh.ts, ev->func,
1225                                             EVTR_NS_FUNC);
1226         }
1227         if (ev->fmt) {
1228                 kev.fmt = evtr_dump_fmt(evtr, kev.eh.ts, ev);
1229         }
1230         if (ev->fmtdata) {
1231                 struct replace_ctx replctx = {
1232                         .evtr = evtr,
1233                         .ts = ev->ts,
1234                 };
1235                 assert(ev->fmtdatalen <= (int)sizeof(buf));
1236                 kev.datalen = ev->fmtdatalen;
1237                 /*
1238                  * Replace all string pointers with string ids before dumping
1239                  * the data.
1240                  */
1241                 memcpy(buf, ev->fmtdata, ev->fmtdatalen);
1242                 if (mangle_string_ptrs(ev->fmt, buf,
1243                                        replace_strptr, &replctx) < 0)
1244                         return !0;
1245                 if (evtr->err)
1246                         return evtr->err;
1247         }
1248         if (evtr_dump_avoid_boundary(evtr, sizeof(kev) + ev->fmtdatalen))
1249                 return !0;
1250         if (evtr_write(evtr, &kev, sizeof(kev)))
1251                 return !0;
1252         if (evtr_write(evtr, buf, ev->fmtdatalen))
1253                 return !0;
1254         if (evtr_dump_pad(evtr))
1255                 return !0;
1256         return 0;
1257 }
1258
1259 static
1260 int
1261 evtr_dump_sysinfo(evtr_t evtr, evtr_event_t ev)
1262 {
1263         uint8_t type = EVTR_TYPE_SYSINFO;
1264         uint16_t ncpus = ev->ncpus;
1265
1266         if (ncpus <= 0) {
1267                 evtr->errmsg = "invalid number of cpus";
1268                 return !0;
1269         }
1270         if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ncpus)))
1271                 return !0;
1272         if (evtr_write(evtr, &type, sizeof(type))) {
1273                 return !0;
1274         }
1275         if (evtr_write(evtr, &ncpus, sizeof(ncpus))) {
1276                 return !0;
1277         }
1278         if (evtr_dump_pad(evtr))
1279                 return !0;
1280         return 0;
1281 }
1282 static
1283 int
1284 evtr_dump_cpuinfo(evtr_t evtr, evtr_event_t ev)
1285 {
1286         struct cpuinfo_event_header ci;
1287         uint8_t type;
1288
1289         if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ci)))
1290                 return !0;
1291         type = EVTR_TYPE_CPUINFO;
1292         if (evtr_write(evtr, &type, sizeof(type))) {
1293                 return !0;
1294         }
1295         ci.cpu = ev->cpu;
1296         ci.freq = ev->cpuinfo.freq;
1297         if (evtr_dump_avoid_boundary(evtr, sizeof(ci)))
1298                 return !0;
1299         if (evtr_write(evtr, &ci, sizeof(ci))) {
1300                 return !0;
1301         }
1302         if (evtr_dump_pad(evtr))
1303                 return !0;
1304         return 0;
1305 }
1306
1307 int
1308 evtr_rewind(evtr_t evtr)
1309 {
1310         assert((evtr->flags & EVTRF_WR) == 0);
1311         evtr->bytes = 0;
1312         if (fseek(evtr->f, 0, SEEK_SET)) {
1313                 evtr->err = errno;
1314                 return !0;
1315         }
1316         return 0;
1317 }
1318
1319 int
1320 evtr_dump_event(evtr_t evtr, evtr_event_t ev)
1321 {
1322         switch (ev->type) {
1323         case EVTR_TYPE_PROBE:
1324                 return evtr_dump_probe(evtr, ev);
1325         case EVTR_TYPE_SYSINFO:
1326                 return evtr_dump_sysinfo(evtr, ev);
1327         case EVTR_TYPE_CPUINFO:
1328                 return evtr_dump_cpuinfo(evtr, ev);
1329         }
1330         evtr->errmsg = "unknown event type";
1331         return !0;
1332 }
1333
1334 static
1335 evtr_t
1336 evtr_alloc(FILE *f)
1337 {
1338         evtr_t evtr;
1339         if (!(evtr = malloc(sizeof(*evtr)))) {
1340                 return NULL;
1341         }
1342
1343         evtr->f = f;
1344         evtr->err = 0;
1345         evtr->errmsg = NULL;
1346         evtr->bytes = 0;
1347         return evtr;
1348 }
1349
1350 static int evtr_next_event(evtr_t, evtr_event_t);
1351
1352 evtr_t
1353 evtr_open_read(FILE *f)
1354 {
1355         evtr_t evtr;
1356         struct evtr_event ev;
1357         int i;
1358
1359         if (!(evtr = evtr_alloc(f))) {
1360                 return NULL;
1361         }
1362         evtr->flags = 0;
1363         for (i = 0; i < (EVTR_NS_MAX - 1); ++i) {
1364                 RB_INIT(&evtr->maps[i].root);
1365         }
1366         RB_INIT(&evtr->fmtmap.root);
1367         RB_INIT(&evtr->threads.root);
1368         evtr->cpus = NULL;
1369         evtr->ncpus = 0;
1370         /*
1371          * Load the first event so we can pick up any
1372          * sysinfo entries.
1373          */
1374         if (evtr_next_event(evtr, &ev)) {
1375                 goto free_evtr;
1376         }
1377         if (evtr_rewind(evtr))
1378                 goto free_evtr;
1379         return evtr;
1380 free_evtr:
1381         free(evtr);
1382         return NULL;
1383 }
1384
1385 evtr_t
1386 evtr_open_write(FILE *f)
1387 {
1388         evtr_t evtr;
1389         int i, j;
1390
1391         if (!(evtr = evtr_alloc(f))) {
1392                 return NULL;
1393         }
1394
1395         evtr->flags = EVTRF_WR;
1396         if (!(evtr->fmts = strhash_new()))
1397                 goto free_evtr;
1398         for (i = 0; i < EVTR_NS_MAX; ++i) {
1399                 evtr->strings[i] = strhash_new();
1400                 if (!evtr->strings[i]) {
1401                         for (j = 0; j < i; ++j) {
1402                                 strhash_destroy(evtr->strings[j]);
1403                         }
1404                         goto free_fmts;
1405                 }
1406         }
1407
1408         return evtr;
1409 free_fmts:
1410         strhash_destroy(evtr->fmts);
1411 free_evtr:
1412         free(evtr);
1413         return NULL;
1414 }
1415
1416 static
1417 void
1418 hashtab_destroy(struct hashtab *h)
1419 {
1420         struct hashentry *ent, *next;
1421         int i;
1422         for (i = 0; i < NR_BUCKETS; ++i) {
1423                 for (ent = h->buckets[i]; ent; ent = next) {
1424                         next = ent->next;
1425                         free(ent);
1426                 }
1427         }
1428         free(h);
1429 }
1430
1431 void
1432 evtr_close(evtr_t evtr)
1433 {
1434         int i;
1435
1436         if (evtr->flags & EVTRF_WR) {
1437                 hashtab_destroy(&evtr->fmts->tab);
1438                 for (i = 0; i < EVTR_NS_MAX - 1; ++i)
1439                         hashtab_destroy(&evtr->strings[i]->tab);
1440         } else {
1441                 id_tree_free(&evtr->fmtmap.root);
1442                 for (i = 0; i < EVTR_NS_MAX - 1; ++i) {
1443                         id_tree_free(&evtr->maps[i].root);
1444                 }
1445         }
1446         free(evtr);
1447 }
1448
1449 static
1450 int
1451 evtr_read(evtr_t evtr, void *buf, size_t size)
1452 {
1453         assert(size > 0);
1454         assert_foff_in_sync(evtr);
1455         printd(IO, "evtr_read at %#jx, %zd bytes\n", evtr->bytes, size);
1456         if (fread(buf, size, 1, evtr->f) != 1) {
1457                 if (feof(evtr->f)) {
1458                         evtr->errmsg = "incomplete record";
1459                 } else {
1460                         evtr->errmsg = strerror(errno);
1461                 }
1462                 return !0;
1463         }
1464         evtr->bytes += size;
1465         assert_foff_in_sync(evtr);
1466         return 0;
1467 }
1468
1469 static
1470 int
1471 evtr_load_fmt(evtr_query_t q, char *buf)
1472 {
1473         evtr_t evtr = q->evtr;
1474         struct fmt_event_header *evh = (struct fmt_event_header *)buf;
1475         struct event_fmt *fmt;
1476         char *subsys = NULL, *fmtstr;
1477
1478         if (!(fmt = malloc(sizeof(*fmt)))) {
1479                 evtr->err = errno;
1480                 return !0;
1481         }
1482         if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1483                       sizeof(*evh) - sizeof(evh->eh))) {
1484                 goto free_fmt;
1485         }
1486         assert(!evh->subsys_len);
1487         if (evh->subsys_len) {
1488                 if (!(subsys = malloc(evh->subsys_len))) {
1489                         evtr->err = errno;
1490                         goto free_fmt;
1491                 }
1492                 if (evtr_read(evtr, subsys, evh->subsys_len)) {
1493                         goto free_subsys;
1494                 }
1495                 fmt->subsys = subsys;
1496         } else {
1497                 fmt->subsys = "";
1498         }
1499         if (!(fmtstr = malloc(evh->fmt_len + 1))) {
1500                 evtr->err = errno;
1501                 goto free_subsys;
1502         }
1503         if (evtr_read(evtr, fmtstr, evh->fmt_len)) {
1504                 goto free_fmtstr;
1505         }
1506         fmtstr[evh->fmt_len] = '\0';
1507         fmt->fmt = fmtstr;
1508
1509         printd(DS, "fmt_map_insert (%d, %s)\n", evh->id, fmt->fmt);
1510         evtr->err = fmt_map_insert(&evtr->fmtmap.root, fmt, evh->id);
1511         switch (evtr->err) {
1512         case ENOMEM:
1513                 evtr->errmsg = "out of memory";
1514                 break;
1515         case EEXIST:
1516                 evtr->errmsg = "redefinition of an id to a "
1517                         "different format (corrupt input)";
1518                 break;
1519         default:
1520                 ;
1521         }
1522         return evtr->err;
1523
1524 free_fmtstr:
1525         free(fmtstr);
1526 free_subsys:
1527         if (subsys)
1528                 free(subsys);
1529 free_fmt:
1530         free(fmt);
1531         return !0;
1532 }
1533
1534 static
1535 int
1536 evtr_load_string(evtr_t evtr, char *buf)
1537 {
1538         char sbuf[PATH_MAX + 1];
1539         struct string_event_header *evh = (struct string_event_header *)buf;
1540
1541         if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1542                       sizeof(*evh) - sizeof(evh->eh))) {
1543                 return !0;
1544         }
1545         if (evh->len > PATH_MAX) {
1546                 evtr->errmsg = "string too large (corrupt input)";
1547                 return !0;
1548         }
1549         if (evh->len && evtr_read(evtr, sbuf, evh->len)) {
1550                 return !0;
1551         }
1552         sbuf[evh->len] = 0;
1553         if (evh->ns >= EVTR_NS_MAX) {
1554                 evtr->errmsg = "invalid namespace (corrupt input)";
1555                 return !0;
1556         }
1557         validate_string(sbuf);
1558         printd(DS, "evtr_load_string:ns %d id %d : \"%s\"\n", evh->ns, evh->id,
1559                sbuf);
1560         evtr->err = string_map_insert(&evtr->maps[evh->ns - 1].root, sbuf, evh->id);
1561         switch (evtr->err) {
1562         case ENOMEM:
1563                 evtr->errmsg = "out of memory";
1564                 break;
1565         case EEXIST:
1566                 evtr->errmsg = "redefinition of an id to a "
1567                         "different string (corrupt input)";
1568                 break;
1569         default:
1570                 ;
1571         }
1572         return 0;
1573 }
1574
1575 static
1576 int
1577 evtr_skip(evtr_t evtr, off_t bytes)
1578 {
1579         if (fseek(evtr->f, bytes, SEEK_CUR)) {
1580                 evtr->err = errno;
1581                 evtr->errmsg = strerror(errno);
1582                 return !0;
1583         }
1584         evtr->bytes += bytes;
1585         return 0;
1586 }
1587
1588 /*
1589  * Make sure q->buf is at least len bytes
1590  */
1591 static
1592 int
1593 evtr_query_reserve_buf(struct evtr_query *q, int len)
1594 {
1595         void *tmp;
1596
1597         if (q->bufsize >= len)
1598                 return 0;
1599         if (!(tmp = realloc(q->buf, len)))
1600                 return !0;
1601         q->buf = tmp;
1602         q->bufsize = len;
1603         return 0;
1604 }
1605
1606 static
1607 int
1608 evtr_load_probe(evtr_t evtr, evtr_event_t ev, char *buf, struct evtr_query *q)
1609 {
1610         struct probe_event_header *evh = (struct probe_event_header *)buf;
1611         struct cpu *cpu;
1612
1613         if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1614                       sizeof(*evh) - sizeof(evh->eh)))
1615                 return !0;
1616         memset(ev, '\0', sizeof(*ev));
1617         ev->ts = evh->eh.ts;
1618         ev->type = EVTR_TYPE_PROBE;
1619         ev->line = evh->line;
1620         ev->cpu = evh->cpu;
1621         if ((cpu = evtr_cpu(evtr, evh->cpu))) {
1622                 ev->td = cpu->td;
1623         } else {
1624                 ev->td = NULL;
1625         }
1626         if (evh->file) {
1627                 ev->file = string_map_find(
1628                         &evtr->maps[EVTR_NS_PATH - 1].root,
1629                         evh->file);
1630                 if (!ev->file) {
1631                         evtr->errmsg = "unknown id for file path";
1632                         evtr->err = !0;
1633                         ev->file = "<unknown>";
1634                 } else {
1635                         validate_string(ev->file);
1636                 }
1637         } else {
1638                 ev->file = "<unknown>";
1639         }
1640         if (evh->fmt) {
1641                 const struct event_fmt *fmt;
1642                 if (!(fmt = fmt_map_find(&evtr->fmtmap.root, evh->fmt))) {
1643                         evtr->errmsg = "unknown id for event fmt";
1644                         evtr->err = !0;
1645                         ev->fmt = NULL;
1646                 } else {
1647                         ev->fmt = fmt->fmt;
1648                         validate_string(fmt->fmt);
1649                 }
1650         }
1651         if (evh->datalen) {
1652                 if (evtr_query_reserve_buf(q, evh->datalen + 1)) {
1653                         evtr->err = ENOMEM;
1654                 } else if (!evtr_read(evtr, q->buf, evh->datalen)) {
1655                         struct replace_ctx replctx = {
1656                                 .evtr = evtr,
1657                                 .ts = ev->ts,
1658                         };
1659                         assert(ev->fmt);
1660
1661                         ev->fmtdata = q->buf;
1662                         /*
1663                          * If the format specifies any string pointers, there
1664                          * is a string id stored in the fmtdata. Look it up
1665                          * and replace it with a string pointer before
1666                          * returning it to the user.
1667                          */
1668                         if (mangle_string_ptrs(ev->fmt, __DECONST(uint8_t *,
1669                                                                   ev->fmtdata),
1670                                                replace_strid, &replctx) < 0)
1671                                 return evtr->err;
1672                         if (evtr->err)
1673                                 return evtr->err;
1674                         ((char *)ev->fmtdata)[evh->datalen] = '\0';
1675                         ev->fmtdatalen = evh->datalen;
1676                 }
1677         }
1678         evtr_run_callbacks(ev, q);
1679         return evtr->err;
1680 }
1681
1682 static
1683 int
1684 evtr_skip_to_record(evtr_t evtr)
1685 {
1686         int skip;
1687         
1688         skip = REC_ALIGN - (evtr->bytes % REC_ALIGN);
1689         if (skip > 0) {
1690                 if (fseek(evtr->f, skip, SEEK_CUR)) {
1691                         evtr->err = errno;
1692                         evtr->errmsg = strerror(errno);
1693                         return !0;
1694                 }
1695                 evtr->bytes += skip;
1696         }
1697         return 0;
1698 }
1699
1700 static
1701 int
1702 evtr_load_sysinfo(evtr_t evtr)
1703 {
1704         uint16_t ncpus;
1705         int i;
1706
1707         if (evtr_read(evtr, &ncpus, sizeof(ncpus))) {
1708                 return !0;
1709         }
1710         if (evtr->cpus)
1711                 return 0;
1712         evtr->cpus = malloc(ncpus * sizeof(struct cpu));
1713         if (!evtr->cpus) {
1714                 evtr->err = ENOMEM;
1715                 return !0;
1716         }
1717         evtr->ncpus = ncpus;
1718         for (i = 0; i < ncpus; ++i) {
1719                 evtr->cpus[i].td = NULL;
1720                 evtr->cpus[i].freq = -1.0;
1721         }
1722         return 0;
1723 }
1724
1725 static
1726 int
1727 evtr_load_cpuinfo(evtr_t evtr)
1728 {
1729         struct cpuinfo_event_header cih;
1730         struct cpu *cpu;
1731
1732         if (evtr_read(evtr, &cih, sizeof(cih))) {
1733                 return !0;
1734         }
1735         if (cih.freq < 0.0) {
1736                 evtr->errmsg = "cpu freq is negative";
1737                 evtr->err = EINVAL;
1738                 return !0;
1739         }
1740         /*
1741          * Notice that freq is merely a multiplier with
1742          * which we convert a timestamp to seconds; if
1743          * ts is not in cycles, freq is not the frequency.
1744          */
1745         if (!(cpu = evtr_cpu(evtr, cih.cpu))) {
1746                 evtr->errmsg = "freq for invalid cpu";
1747                 evtr->err = EINVAL;
1748                 return !0;
1749         }
1750         cpu->freq = cih.freq;
1751         return 0;
1752 }
1753
1754 static
1755 int
1756 _evtr_next_event(evtr_t evtr, evtr_event_t ev, struct evtr_query *q)
1757 {
1758         char buf[MAX_EVHDR_SIZE];
1759         int ret, err;
1760         struct trace_event_header *evhdr = (struct trace_event_header *)buf;
1761
1762         for (ret = 0; !ret;) {
1763                 if (q->flags & EVTRQF_PENDING) {
1764                         q->off = evtr->bytes;
1765                         memcpy(ev, &q->pending_event, sizeof(*ev));
1766                         q->flags &= ~EVTRQF_PENDING;
1767                         return 0;
1768                 }
1769                 if (evtr_read(evtr, &evhdr->type, 1)) {
1770                         if (feof(evtr->f)) {
1771                                 evtr->errmsg = NULL;
1772                                 evtr->err = 0;
1773                                 return -1;
1774                         }
1775                         return !0;
1776                 }
1777                 /*
1778                  * skip pad records -- this will only happen if there's a
1779                  * variable sized record close to the boundary
1780                  */
1781                 if (evhdr->type == EVTR_TYPE_PAD) {
1782                         evtr_skip_to_record(evtr);
1783                         continue;
1784                 }
1785                 if (evhdr->type == EVTR_TYPE_SYSINFO) {
1786                         evtr_load_sysinfo(evtr);
1787                         continue;
1788                 } else if (evhdr->type == EVTR_TYPE_CPUINFO) {
1789                         evtr_load_cpuinfo(evtr);
1790                         continue;
1791                 }
1792                 if (evtr_read(evtr, buf + 1, sizeof(*evhdr) - 1))
1793                         return feof(evtr->f) ? -1 : !0;
1794                 switch (evhdr->type) {
1795                 case EVTR_TYPE_PROBE:
1796                         if ((err = evtr_load_probe(evtr, ev, buf, q))) {
1797                                 if (err == -1) {
1798                                         /* no match */
1799                                         ret = 0;
1800                                 } else {
1801                                         return !0;
1802                                 }
1803                         } else {
1804                                 ret = !0;
1805                         }
1806                         break;
1807                 case EVTR_TYPE_STR:
1808                         if (evtr_load_string(evtr, buf)) {
1809                                 return !0;
1810                         }
1811                         break;
1812                 case EVTR_TYPE_FMT:
1813                         if (evtr_load_fmt(q, buf)) {
1814                                 return !0;
1815                         }
1816                         break;
1817                 default:
1818                         evtr->err = !0;
1819                         evtr->errmsg = "unknown event type (corrupt input?)";
1820                         return !0;
1821                 }
1822                 evtr_skip_to_record(evtr);
1823                 if (ret) {
1824                         if (!evtr_match_filters(q, ev)) {
1825                                 ret = 0;
1826                                 continue;
1827                         }
1828                         q->off = evtr->bytes;
1829                         return 0;
1830                 }
1831         }
1832         /* can't get here */
1833         return !0;
1834 }
1835
1836 static
1837 int
1838 evtr_next_event(evtr_t evtr, evtr_event_t ev)
1839 {
1840         struct evtr_query *q;
1841         int ret;
1842
1843         if (!(q = evtr_query_init(evtr, NULL, 0))) {
1844                 evtr->err = ENOMEM;
1845                 return !0;
1846         }
1847         ret = _evtr_next_event(evtr, ev, q);
1848         evtr_query_destroy(q);
1849         return ret;
1850 }
1851
1852 int
1853 evtr_last_event(evtr_t evtr, evtr_event_t ev)
1854 {
1855         struct stat st;
1856         int fd;
1857         off_t last_boundary;
1858
1859         if (evtr_error(evtr))
1860                 return !0;
1861
1862         fd = fileno(evtr->f);
1863         if (fstat(fd, &st))
1864                 return !0;
1865         /*
1866          * This skips pseudo records, so we can't provide
1867          * an event with all fields filled in this way.
1868          * It's doable, just needs some care. TBD.
1869          */
1870         if (0 && (st.st_mode & S_IFREG)) {
1871                 /*
1872                  * Skip to last boundary, that's the closest to the EOF
1873                  * location that we are sure contains a header so we can
1874                  * pick up the stream.
1875                  */
1876                 last_boundary = rounddown(st.st_size, REC_BOUNDARY);
1877                 /* XXX: ->bytes should be in query */
1878                 assert(evtr->bytes == 0);
1879                 evtr_skip(evtr, last_boundary);
1880         }
1881
1882
1883         /*
1884          * If we can't seek, we need to go through the whole file.
1885          * Since you can't seek back, this is pretty useless unless
1886          * you really are interested only in the last event.
1887          */
1888         while (!evtr_next_event(evtr, ev))
1889                 ;
1890         if (evtr_error(evtr))
1891                 return !0;
1892         evtr_rewind(evtr);
1893         return 0;
1894 }
1895
1896 struct evtr_query *
1897 evtr_query_init(evtr_t evtr, evtr_filter_t filt, int nfilt)
1898 {
1899         struct evtr_query *q;
1900         int i;
1901
1902         if (!(q = malloc(sizeof(*q)))) {
1903                 return q;
1904         }
1905         q->bufsize = 2;
1906         if (!(q->buf = malloc(q->bufsize))) {
1907                 goto free_q;
1908         }
1909         if (!(q->symtab = symtab_new()))
1910                 goto free_buf;
1911         q->evtr = evtr;
1912         q->off = 0;
1913         q->filt = filt;
1914         q->nfilt = nfilt;
1915         TAILQ_INIT(&q->unresolved_filtq);
1916         q->nmatched = 0;
1917         q->cbs = NULL;
1918         q->ncbs = 0;
1919         q->flags = 0;
1920         memset(&q->pending_event, '\0', sizeof(q->pending_event));
1921         if (evtr_register_callback(q, &thread_creation_callback, q)) {
1922                 goto free_symtab;
1923         }
1924         if (evtr_register_callback(q, &thread_switch_callback, q)) {
1925                 goto free_cbs;
1926         }
1927         if (evtr_query_needs_parsing(q) &&
1928             evtr_register_callback(q, &parse_callback, q)) {
1929                 goto free_cbs;
1930         }
1931
1932         for (i = 0; i < nfilt; ++i) {
1933                 filt[i].flags = 0;
1934                 if (filt[i].fmt == NULL)
1935                         continue;
1936                 if (evtr_filter_register(q, &filt[i])) {
1937                         evtr_deregister_filters(q, filt, i);
1938                         goto free_symtab;
1939                 }
1940         }
1941
1942         return q;
1943 free_cbs:
1944         evtr_deregister_callbacks(q);
1945 free_symtab:
1946         symtab_destroy(q->symtab);
1947 free_buf:
1948         free(q->buf);
1949 free_q:
1950         free(q);
1951         return NULL;
1952 }
1953
1954 void
1955 evtr_query_destroy(struct evtr_query *q)
1956 {
1957         evtr_deregister_filters(q, q->filt, q->nfilt);
1958                 
1959         free(q->buf);
1960         free(q);
1961 }
1962
1963 int
1964 evtr_query_next(struct evtr_query *q, evtr_event_t ev)
1965 {
1966         if (evtr_query_error(q))
1967                 return !0;
1968         /* we may support that in the future */
1969         if (q->off != q->evtr->bytes) {
1970                 q->errmsg = "evtr/query offset mismatch";
1971                 return !0;
1972         }
1973         return _evtr_next_event(q->evtr, ev, q);
1974 }
1975
1976 int
1977 evtr_ncpus(evtr_t evtr)
1978 {
1979         return evtr->ncpus;
1980 }
1981
1982 int
1983 evtr_cpufreqs(evtr_t evtr, double *freqs)
1984 {
1985         int i;
1986
1987         if (!freqs)
1988                 return EINVAL;
1989         for (i = 0; i < evtr->ncpus; ++i) {
1990                 freqs[i] = evtr->cpus[i].freq;
1991         }
1992         return 0;
1993 }