773422b1131a1c1354d51c2afea97d197b66b945
[dragonfly.git] / lib / libevtr / evtr.c
1 /*
2  * Copyright (c) 2009, 2010 Aggelos Economopoulos.  All rights reserved.
3  * 
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in
12  *    the documentation and/or other materials provided with the
13  *    distribution.
14  * 3. Neither the name of The DragonFly Project nor the names of its
15  *    contributors may be used to endorse or promote products derived
16  *    from this software without specific, prior written permission.
17  * 
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
22  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
28  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31
32 #include <assert.h>
33 #include <ctype.h>
34 #include <err.h>
35 #include <errno.h>
36 #include <limits.h>
37 #include <stdarg.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <sys/queue.h>
42 #include <sys/stat.h>
43 #include <sys/tree.h>
44
45
46 #include "evtr.h"
47 #include "internal.h"
48
/*
 * Bitmask of enabled debug categories.  It is filled in by
 * evtr_set_debug() and tested against category bits (e.g. MISC)
 * elsewhere in this file.
 */
unsigned evtr_debug;
50
/*
 * Parse a string of debug-flag letters into a bitmask.
 *
 * Each lowercase letter 'a'..'z' sets bit (letter - 'a') in *flags; a
 * capital 'A' sets every bit and stops parsing.  Bits that are already
 * set in *flags are preserved.  Any other character terminates the
 * program with exit status 2.
 */
static
void
printd_set_flags(const char *str, unsigned int *flags)
{
        for (; *str != '\0'; ++str) {
                if (*str == 'A') {
                        *flags = ~0u;
                        return;
                }
                /*
                 * Only the 26 ASCII lowercase letters map to a flag
                 * bit.  Testing the range directly (rather than with
                 * islower()) avoids handing a negative char to a
                 * <ctype.h> function and keeps locale-specific
                 * letters from producing an out-of-range shift.
                 */
                if (*str < 'a' || *str > 'z') {
                        /* errx(): errno is meaningless here */
                        errx(2, "invalid debug flag %c", *str);
                }
                *flags |= 1u << (*str - 'a');
        }
}
69
70
/*
 * File-local constants.  Several of these describe the on-disk record
 * layout or are used as flag bits, so their values must not change.
 */
enum {
        MAX_EVHDR_SIZE = PATH_MAX + 200,
        /*
         * String namespaces.  Numbering starts at 1; EVTR_NS_MAX is one
         * past the last namespace, so per-namespace arrays are sized
         * EVTR_NS_MAX - 1 (see struct evtr).
         */
        EVTR_NS_PATH = 0x1,
        EVTR_NS_FUNC,
        EVTR_NS_DSTR,
        EVTR_NS_MAX,
        NR_BUCKETS = 1021,      /* buckets per struct hashtab; prime */
        PARSE_ERR_BUFSIZE = 256,        /* size of evtr_query.parse_err_buf */
        REC_ALIGN = 8,          /* padding granularity, see evtr_dump_pad() */
        REC_BOUNDARY = 1 << 14, /* see evtr_dump_avoid_boundary() */
        FILTF_ID = 0x10,        /* filter flag; must be clear in evtr_filter_match() */
        EVTRF_WR = 0x1,         /* open for writing */
        EVTRQF_PENDING = 0x1,   /* query flag: ->pending_event holds an event */
};
86
/* 16-bit identifiers stored in struct probe_event_header. */
typedef uint16_t fileid_t;
typedef uint16_t funcid_t;
typedef uint16_t fmtid_t;
90
/*
 * Packed prefix embedded at the start of the probe, string and fmt
 * event headers below: the record type followed by a timestamp.
 */
struct trace_event_header {
        uint8_t type;
        uint64_t ts;    /* XXX: this should only be part of probe */
} __attribute__((packed));
95
/*
 * Packed header of a probe record.  The file/func/fmt members are
 * 16-bit ids rather than strings.
 */
struct probe_event_header {
        struct trace_event_header eh;
        /*
         * For these fields, 0 implies "not available"
         */
        fileid_t file;
        funcid_t caller1;
        funcid_t caller2;
        funcid_t func;
        uint16_t line;
        fmtid_t fmt;
        uint16_t datalen;
        uint8_t cpu;    /* -1 if n/a */
} __attribute__((packed));
110
/*
 * Packed header of a string record: namespace, id and length.
 * NOTE(review): ns presumably holds one of the EVTR_NS_* values and
 * len the size of the string that follows -- confirm against the
 * reader/writer code.
 */
struct string_event_header {
        struct trace_event_header eh;
        uint16_t ns;
        uint32_t id;
        uint16_t len;
} __attribute__((packed));
117
/*
 * Packed header of a format record.  evtr_dump_fmt() writes it
 * followed by subsys_len bytes of subsystem name and fmt_len bytes of
 * format string.  Both lengths are a single byte wide.
 */
struct fmt_event_header {
        struct trace_event_header eh;
        uint16_t id;
        uint8_t subsys_len;
        uint8_t fmt_len;
} __attribute__((packed));
124
/*
 * Packed per-cpu information record (frequency and cpu number).
 * Unlike the other headers it does not embed a trace_event_header.
 */
struct cpuinfo_event_header {
        double freq;
        uint8_t cpu;
} __attribute__((packed));
129
/* One key/value pair in a hashtab bucket; entries are singly linked. */
struct hashentry {
        uintptr_t key;
        uintptr_t val;
        struct hashentry *next;         /* next entry in the same bucket */
};
135
/*
 * Fixed-size chained hash table.
 *
 * hashfunc's result is used directly as an index into buckets[], so
 * it must be smaller than NR_BUCKETS.  cmpfunc returns 0 when two
 * keys are equal (hash_find() keeps walking while it is nonzero).
 */
struct hashtab {
        struct hashentry *buckets[NR_BUCKETS];
        uintptr_t (*hashfunc)(uintptr_t);
        uintptr_t (*cmpfunc)(uintptr_t, uintptr_t);
};
141
/*
 * Symbol table: a string-keyed hashtab whose values are
 * struct evtr_variable pointers (see symtab_find()/symtab_insert()).
 */
struct symtab {
        struct hashtab tab;
};
145
/*
 * A subsystem name + format string pair.  subsys may be NULL, which
 * event_fmt_cmp() treats like the empty string.
 */
struct event_fmt {
        const char *subsys;
        const char *fmt;
};
150
/*
 * Queue node referring to a filter that still has to be resolved;
 * linked on evtr_query.unresolved_filtq by evtr_filter_register().
 */
struct event_filter_unresolved {
        TAILQ_ENTRY(event_filter_unresolved) link;
        evtr_filter_t filt;
};
155
/*
 * Red-black tree node mapping an integer id to an opaque payload
 * (a string for string maps, a struct event_fmt for fmt maps).
 */
struct id_map {
        RB_ENTRY(id_map) rb_node;
        int id;                 /* lookup key */
        const void *data;
};
161
/* Tree head type shared by the string and fmt maps. */
RB_HEAD(id_tree, id_map);
/* id -> string map. */
struct string_map {
        struct id_tree root;
};
166
/* id -> struct event_fmt map. */
struct fmt_map {
        struct id_tree root;
};
170
/* Tree of struct evtr_thread nodes keyed by their id member. */
RB_HEAD(thread_tree, evtr_thread);

/* Set of threads seen so far, see thread_map_find()/thread_map_insert(). */
struct thread_map {
        struct thread_tree root;
};
176
/*
 * A per-query event callback together with the opaque argument that is
 * passed back to it by evtr_run_callbacks().
 */
struct event_callback {
        void (*cb)(evtr_event_t, void *data);
        void *data;     /* this field must be malloc()ed */
};
181
/* Per-cpu state; elements of evtr.cpus, looked up via evtr_cpu(). */
struct cpu {
        struct evtr_thread *td; /* currently executing thread */
        double freq;
};
186
/*
 * State of one open event trace stream.  The two anonymous unions hold
 * either the writer-side or the reader-side bookkeeping.
 */
struct evtr {
        FILE *f;                /* underlying stream, written by evtr_write() */
        int flags;              /* NOTE(review): presumably EVTRF_* bits -- confirm */
        int err;                /* errno-style code, see evtr_error() */
        const char *errmsg;     /* takes precedence over err in evtr_errmsg() */
        off_t bytes;            /* our own file offset, see assert_foff_in_sync() */
        union {
                /*
                 * When writing, we keep track of the strings we've
                 * already dumped so we only dump them once.
                 * Paths, function names etc belong to different
                 * namespaces.
                 */
                struct hashtab_str *strings[EVTR_NS_MAX - 1];
                /*
                 * When reading, we build a map from id to string.
                 * Every id must be defined at the point of use.
                 */
                struct string_map maps[EVTR_NS_MAX - 1];
        };
        union {
                /* same as above, but for subsys+fmt pairs */
                struct fmt_map fmtmap;
                struct hashtab_str *fmts;
        };
        struct thread_map threads;      /* threads seen so far, keyed by id */
        struct cpu *cpus;       /* array of ncpus entries, see evtr_cpu() */
        int ncpus;
};
216
/* State of one query (a set of filters) running over a trace. */
struct evtr_query {
        evtr_t evtr;            /* trace being queried */
        off_t off;
        evtr_filter_t filt;     /* array of nfilt filters */
        int nfilt;
        int nmatched;           /* events accepted by evtr_match_filters() */
        int ntried;             /* events tested by evtr_match_filters() */
        void *buf;
        int bufsize;
        struct symtab *symtab;  /* handed to parse_var()/parse_string() */
        int ncbs;               /* number of entries in cbs */
        struct event_callback **cbs;
        /*
         * Filters that have a format specified and we
         * need to resolve that to an fmtid
         */
        TAILQ_HEAD(, event_filter_unresolved) unresolved_filtq;
        int err;                /* errno-style code, see evtr_query_error() */
        const char *errmsg;     /* takes precedence over err */
        char parse_err_buf[PARSE_ERR_BUFSIZE];  /* parser error text */
        int flags;              /* EVTRQF_* bits */
        struct evtr_event pending_event;        /* valid while EVTRQF_PENDING is set */
};
240
241 void
242 evtr_set_debug(const char *str)
243 {
244         printd_set_flags(str, &evtr_debug);
245 }
246
/*
 * Generate the red-black tree code for id maps; id_map.id is the
 * lookup key used by id_tree_RB_LOOKUP().
 */
static int id_map_cmp(struct id_map *, struct id_map *);
RB_PROTOTYPE2(id_tree, id_map, rb_node, id_map_cmp, int);
RB_GENERATE2(id_tree, id_map, rb_node, id_map_cmp, int, id);

/*
 * Likewise for the thread tree; evtr_thread.id is the lookup key used
 * by thread_tree_RB_LOOKUP().
 */
static int thread_cmp(struct evtr_thread *, struct evtr_thread *);
RB_PROTOTYPE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *);
RB_GENERATE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *, id);
254
255 static inline
256 void
257 validate_string(const char *str)
258 {
259         if (!(evtr_debug & MISC))
260                 return;
261         for (; *str; ++str)
262                 assert(isprint(*str));
263 }
264
265 static
266 void
267 id_tree_free(struct id_tree *root)
268 {
269         struct id_map *v, *n;
270
271         for (v = RB_MIN(id_tree, root); v; v = n) {
272                 n = RB_NEXT(id_tree, root, v);
273                 RB_REMOVE(id_tree, root, v);
274         }
275 }
276
277 static
278 int
279 evtr_register_callback(evtr_query_t q, void (*fn)(evtr_event_t, void *), void *d)
280 {
281         struct event_callback *cb;
282         void *cbs;
283
284         if (!(cb = malloc(sizeof(*cb)))) {
285                 q->err = ENOMEM;
286                 return !0;
287         }
288         cb->cb = fn;
289         cb->data = d;
290         if (!(cbs = realloc(q->cbs, (++q->ncbs) * sizeof(cb)))) {
291                 --q->ncbs;
292                 free(cb);
293                 q->err = ENOMEM;
294                 return !0;
295         }
296         q->cbs = cbs;
297         q->cbs[q->ncbs - 1] = cb;
298         return 0;
299 }
300
301 static
302 void
303 evtr_deregister_callbacks(evtr_query_t q)
304 {
305         int i;
306
307         for (i = 0; i < q->ncbs; ++i) {
308                 free(q->cbs[i]);
309         }
310         free(q->cbs);
311         q->cbs = NULL;
312 }
313
314 static
315 void
316 evtr_run_callbacks(evtr_event_t ev, evtr_query_t q)
317 {
318         struct event_callback *cb;
319         int i;
320
321         for (i = 0; i < q->ncbs; ++i) {
322                 cb = q->cbs[i];
323                 cb->cb(ev, cb->data);
324         }
325 }
326
327 static
328 struct cpu *
329 evtr_cpu(evtr_t evtr, int c)
330 {
331         if ((c < 0) || (c >= evtr->ncpus))
332                 return NULL;
333         return &evtr->cpus[c];
334 }
335
336 static int parse_format_data(evtr_event_t ev, const char *fmt, ...)
337                __printflike(2, 3) __scanflike(2, 3);
338
339 static
340 int
341 parse_format_data(evtr_event_t ev, const char *fmt, ...)
342 {
343         va_list ap;
344         char buf[2048];
345
346         if (strcmp(fmt, ev->fmt))
347                 return 0;
348         vsnprintf(buf, sizeof(buf), fmt, __DECONST(void *, ev->fmtdata));
349         printd(MISC, "string is: %s\n", buf);
350         va_start(ap, fmt);
351         return vsscanf(buf, fmt, ap);
352 }
353
354 static
355 void
356 evtr_deregister_filters(evtr_query_t q, evtr_filter_t filt, int nfilt)
357 {
358         struct event_filter_unresolved *u, *tmp;
359         int i;
360         TAILQ_FOREACH_MUTABLE(u, &q->unresolved_filtq, link, tmp) {
361                 for (i = 0; i < nfilt; ++i) {
362                         if (u->filt == &filt[i]) {
363                                 TAILQ_REMOVE(&q->unresolved_filtq, u, link);
364                         }
365                 }
366         }
367 }
368
369 static
370 int
371 evtr_filter_register(evtr_query_t q, evtr_filter_t filt)
372 {
373         struct event_filter_unresolved *res;
374
375         if (!(res = malloc(sizeof(*res)))) {
376                 q->err = ENOMEM;
377                 return !0;
378         }
379         res->filt = filt;
380         TAILQ_INSERT_TAIL(&q->unresolved_filtq, res, link);
381         return 0;
382 }
383
384 static
385 int
386 evtr_query_needs_parsing(evtr_query_t q)
387 {
388         int i;
389
390         for (i = 0; i < q->nfilt; ++i)
391                 if (q->filt[i].ev_type == EVTR_TYPE_STMT)
392                         return !0;
393         return 0;
394 }
395
396 void
397 evtr_event_data(evtr_event_t ev, char *buf, size_t len)
398 {
399         /*
400          * XXX: we implicitly trust the format string.
401          * We shouldn't.
402          */
403         if (ev->fmtdatalen) {
404                 vsnprintf(buf, len, ev->fmt, __DECONST(void *, ev->fmtdata));
405         } else {
406                 strlcpy(buf, ev->fmt, len);
407         }
408 }
409
410 int
411 evtr_error(evtr_t evtr)
412 {
413         return evtr->err || (evtr->errmsg != NULL);
414 }
415
416 const char *
417 evtr_errmsg(evtr_t evtr)
418 {
419         return evtr->errmsg ? evtr->errmsg : strerror(evtr->err);
420 }
421
422 int
423 evtr_query_error(evtr_query_t q)
424 {
425         return q->err || (q->errmsg != NULL) || evtr_error(q->evtr);
426 }
427
428 const char *
429 evtr_query_errmsg(evtr_query_t q)
430 {
431         return q->errmsg ? q->errmsg :
432                 (q->err ? strerror(q->err) :
433                  (evtr_errmsg(q->evtr)));
434 }
435
436 static
437 int
438 id_map_cmp(struct id_map *a, struct id_map *b)
439 {
440         return a->id - b->id;
441 }
442
443 static
444 int
445 thread_cmp(struct evtr_thread *a, struct evtr_thread *b)
446 {
447         ptrdiff_t d;
448         d =  a->id - b->id;
449         if (d < 0)
450                 return -1;
451         if (!d)
452                 return 0;
453         return 1;
454 }
455
456 #define DEFINE_MAP_FIND(prefix, type)           \
457         static                                  \
458         type                            \
459         prefix ## _map_find(struct id_tree *tree, int id)\
460         {                                                \
461                 struct id_map *sid;                      \
462                                                         \
463                 sid = id_tree_RB_LOOKUP(tree, id);      \
464                 return sid ? sid->data : NULL;          \
465         }
466
467 DEFINE_MAP_FIND(string, const char *)
468 DEFINE_MAP_FIND(fmt, const struct event_fmt *)
469
470 static
471 struct evtr_thread *
472 thread_map_find(struct thread_map *map, void *id)
473 {
474         return thread_tree_RB_LOOKUP(&map->root, id);
475 }
476
/*
 * Define <prefix>_map_insert(tree, data, id), which binds @id to a
 * private copy of @data (made with _dup) in an id_tree.
 *
 * The generated function returns:
 *   0       on success, or when @id is already bound to an equal value
 *           (as judged by _cmp returning 0);
 *   EEXIST  when @id is already bound to a different value;
 *   ENOMEM  when the node or the copy of @data cannot be allocated,
 *           in which case the tree is left unchanged.
 */
#define DEFINE_MAP_INSERT(prefix, type, _cmp, _dup)     \
        static                                  \
        int                                                             \
        prefix ## _map_insert(struct id_tree *tree, type data, int id) \
        {                                                               \
        struct id_map *sid, *osid;                                      \
                                                                        \
        sid = malloc(sizeof(*sid));                                     \
        if (!sid) {                                                     \
                return ENOMEM;                                          \
        }                                                               \
        sid->id = id;                                                   \
        sid->data = data;                                               \
        if ((osid = id_tree_RB_INSERT(tree, sid))) {                    \
                free(sid);                                              \
                if (_cmp((type)osid->data, data)) {                     \
                        return EEXIST;                                  \
                }                                                       \
                printd(DS, "mapping already exists, skipping\n");               \
                /* we're OK with redefinitions of an id to the same string */ \
                return 0;                                               \
        }                                                               \
        /* only do the copy if we're inserting a new node */            \
        sid->data = _dup(data);                                         \
        if (!sid->data) {                                               \
                /* don't leave a node without payload in the tree */    \
                RB_REMOVE(id_tree, tree, sid);                          \
                free(sid);                                              \
                return ENOMEM;                                          \
        }                                                               \
        return 0;                                                       \
}
503
504 static
505 void
506 thread_map_insert(struct thread_map *map, struct evtr_thread *td)
507 {
508         struct evtr_thread *otd;
509
510         if ((otd = thread_tree_RB_INSERT(&map->root, td))) {
511                 /*
512                  * Thread addresses might be reused, we're
513                  * ok with that.
514                  * DANGER, Will Robinson: this means the user
515                  * of the API needs to copy event->td if they
516                  * want it to remain stable.
517                  */
518                 free((void *)otd->comm);
519                 otd->comm = td->comm;
520                 free(td);
521         }
522 }
523
524 static
525 int
526 event_fmt_cmp(const struct event_fmt *a, const struct event_fmt *b)
527 {
528         int ret = 0;
529
530         if (a->subsys) {
531                 if (b->subsys) {
532                         ret = strcmp(a->subsys, b->subsys);
533                 } else {
534                         ret = strcmp(a->subsys, "");
535                 }
536         } else if (b->subsys) {
537                         ret = strcmp("", b->subsys);
538         }
539         if (ret)
540                 return ret;
541         return strcmp(a->fmt, b->fmt);
542 }
543
544 static
545 struct event_fmt *
546 event_fmt_dup(const struct event_fmt *o)
547 {
548         struct event_fmt *n;
549
550         if (!(n = malloc(sizeof(*n)))) {
551                 return n;
552         }
553         memcpy(n, o, sizeof(*n));
554         return n;
555 }
556
/* string_map_insert(): payloads compared with strcmp(), copied with strdup() */
DEFINE_MAP_INSERT(string, const char *, strcmp, strdup)
/* fmt_map_insert(): payloads compared with event_fmt_cmp(), copied with event_fmt_dup() */
DEFINE_MAP_INSERT(fmt, const struct event_fmt *, event_fmt_cmp, event_fmt_dup)
559
560 int
561 hash_find(const struct hashtab *tab, uintptr_t key, uintptr_t *val)
562 {
563         struct hashentry *ent;
564
565         for(ent = tab->buckets[tab->hashfunc(key)];
566             ent && tab->cmpfunc(ent->key, key);
567             ent = ent->next);
568
569         if (!ent)
570                 return !0;
571         *val = ent->val;
572         return 0;
573 }
574
575 struct hashentry *
576 hash_insert(struct hashtab *tab, uintptr_t key, uintptr_t val)
577 {
578         struct hashentry *ent;
579         int hsh;
580
581         if (!(ent = malloc(sizeof(*ent)))) {
582                 fprintf(stderr, "out of memory\n");
583                 return NULL;
584         }
585         hsh = tab->hashfunc(key);
586         ent->next = tab->buckets[hsh];
587         ent->key = key;
588         ent->val = val;
589         tab->buckets[hsh] = ent;
590         return ent;
591 }
592
/*
 * Key comparison for pointer-keyed tables: yields 0 exactly when the
 * two keys are identical, and the (wrapping) difference b - a
 * otherwise.
 */
static
uintptr_t
cmpfunc_pointer(uintptr_t a, uintptr_t b)
{
        uintptr_t delta = b - a;

        return delta;
}
599
600 static
601 uintptr_t
602 hashfunc_pointer(uintptr_t p)
603 {
604         return p % NR_BUCKETS;
605 }
606
607 struct hashtab *
608 hash_new(void)
609 {
610         struct hashtab *tab;
611         if (!(tab = calloc(sizeof(struct hashtab), 1)))
612                 return tab;
613         tab->hashfunc = &hashfunc_pointer;
614         tab->cmpfunc = &cmpfunc_pointer;
615         return tab;
616 }
617
/*
 * string -> id map.  id is the most recently assigned identifier;
 * strhash_insert() hands out the next one, so 0 is never assigned.
 */
struct hashtab_str {    /* string -> id map */
        struct hashtab tab;
        uint16_t id;
};
622
623 static
624 uintptr_t
625 hashfunc_string(uintptr_t p)
626 {
627         const char *str = (char *)p;
628         unsigned long hash = 5381;
629         int c;
630
631         while ((c = *str++))
632             hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
633         return hash  % NR_BUCKETS;
634 }
635
/*
 * Key comparison for string-keyed tables.  Both arguments are
 * addresses of NUL-terminated strings; the result is 0 exactly when
 * the strings are equal (it is the strcmp() result converted to
 * uintptr_t).
 */
static
uintptr_t
cmpfunc_string(uintptr_t a, uintptr_t b)
{
        const char *lhs = (char *)a;
        const char *rhs = (char *)b;

        return strcmp(lhs, rhs);
}
642
643
644 static
645 struct hashtab_str *
646 strhash_new(void)
647 {
648         struct hashtab_str *strtab;
649         if (!(strtab = calloc(sizeof(struct hashtab_str), 1)))
650                 return strtab;
651         strtab->tab.hashfunc = &hashfunc_string;
652         strtab->tab.cmpfunc = &cmpfunc_string;
653         return strtab;
654 }
655
656 static
657 void
658 strhash_destroy(struct hashtab_str *strtab)
659 {
660         free(strtab);
661 }
662
663 static
664 int
665 strhash_find(struct hashtab_str *strtab, const char *str, uint16_t *id)
666 {
667         uintptr_t val;
668
669         if (hash_find(&strtab->tab, (uintptr_t)str, &val))
670                 return !0;
671         *id = (uint16_t)val;
672         return 0;
673 }
674
675 static
676 int
677 strhash_insert(struct hashtab_str *strtab, const char *str, uint16_t *id)
678 {
679         uintptr_t val;
680
681         val = ++strtab->id;
682         if (strtab->id == 0) {
683                 fprintf(stderr, "too many strings\n");
684                 return ERANGE;
685         }
686         str = strdup(str);
687         if (!str) {
688                 fprintf(stderr, "out of memory\n");
689                 --strtab->id;
690                 return ENOMEM;
691         }
692         hash_insert(&strtab->tab, (uintptr_t)str, (uintptr_t)val);
693         *id = strtab->id;
694         return 0;
695 }
696
697 struct symtab *
698 symtab_new(void)
699 {
700         struct symtab *symtab;
701         if (!(symtab = calloc(sizeof(struct symtab), 1)))
702                 return symtab;
703         symtab->tab.hashfunc = &hashfunc_string;
704         symtab->tab.cmpfunc = &cmpfunc_string;
705         return symtab;
706 }
707
/*
 * Release the symbol table object itself.  The bucket entries, the
 * names and the variables they refer to are not touched by this
 * function.
 */
void
symtab_destroy(struct symtab *symtab)
{
        struct symtab *victim = symtab;

        free(victim);
}
713
714 struct evtr_variable *
715 symtab_find(const struct symtab *symtab, const char *str)
716 {
717         uintptr_t val;
718
719         if (hash_find(&symtab->tab, (uintptr_t)str, &val))
720                 return NULL;
721         return (struct evtr_variable *)val;
722 }
723
724 int
725 symtab_insert(struct symtab *symtab, const char *name,
726                struct evtr_variable *var)
727 {
728         name = strdup(name);
729         if (!name) {
730                 fprintf(stderr, "out of memory\n");
731                 return ENOMEM;
732         }
733         hash_insert(&symtab->tab, (uintptr_t)name, (uintptr_t)var);
734         return 0;
735 }
736
737 static
738 int
739 evtr_filter_match(evtr_query_t q, evtr_filter_t f, evtr_event_t ev)
740 {
741         if ((f->cpu != -1) && (f->cpu != ev->cpu))
742                 return 0;
743
744         assert(!(f->flags & FILTF_ID));
745         if (ev->type != f->ev_type)
746                 return 0;
747         if (ev->type == EVTR_TYPE_PROBE) {
748                 if (f->fmt && strcmp(ev->fmt, f->fmt))
749                         return 0;
750         } else if (ev->type == EVTR_TYPE_STMT) {
751                 struct evtr_variable *var;
752                 /* resolve var */
753                 /* XXX: no need to do that *every* time */
754                 parse_var(f->var, q->symtab, &var, &q->parse_err_buf[0],
755                           PARSE_ERR_BUFSIZE);
756                 /*
757                  * Ignore errors, they're expected since the
758                  * variable might not be instantiated yet
759                  */
760                 if (var != ev->stmt.var)
761                         return 0;
762         }
763         return !0;
764 }
765
766 static
767 int
768 evtr_match_filters(struct evtr_query *q, evtr_event_t ev)
769 {
770         int i;
771
772         /* no filters means we're interested in all events */
773         if (!q->nfilt)
774                 return !0;
775         ++q->ntried;
776         for (i = 0; i < q->nfilt; ++i) {
777                 if (evtr_filter_match(q, &q->filt[i], ev)) {
778                         ++q->nmatched;
779                         return !0;
780                 }
781         }
782         return 0;
783 }
784
785 static
786 void
787 parse_callback(evtr_event_t ev, void *d)
788 {
789         evtr_query_t q = (evtr_query_t)d;
790         if (ev->type != EVTR_TYPE_PROBE)
791                 return;
792         if (!ev->fmt || (ev->fmt[0] != '#'))
793                 return;
794         /*
795          * Copy the event to ->pending_event, then call
796          * the parser to convert it into a synthesized
797          * EVTR_TYPE_STMT event.
798          */
799         memcpy(&q->pending_event, ev, sizeof(*ev));
800         parse_string(&q->pending_event, q->symtab, &ev->fmt[1],
801                      &q->parse_err_buf[0], PARSE_ERR_BUFSIZE);
802         if (q->parse_err_buf[0]) {      /* parse error */
803                 q->errmsg = &q->parse_err_buf[0];
804                 return;
805         }
806         if (!evtr_match_filters(q, &q->pending_event))
807                 return;
808         /*
809          * This will cause us to return ->pending_event next time
810          * we're called.
811          */
812         q->flags |= EVTRQF_PENDING;
813 }
814
815 static
816 void
817 thread_creation_callback(evtr_event_t ev, void *d)
818 {
819         evtr_query_t q = (evtr_query_t)d;
820         evtr_t evtr = q->evtr;
821         struct evtr_thread *td;
822         void *ktd;
823         char buf[20];
824
825         if (parse_format_data(ev, "new_td %p %s", &ktd, buf) != 2) {
826                 return;
827         }
828         buf[19] = '\0';
829
830         if (!(td = malloc(sizeof(*td)))) {
831                 q->err = ENOMEM;
832                 return;
833         }
834         td->id = ktd;
835         td->userdata = NULL;
836         if (!(td->comm = strdup(buf))) {
837                 free(td);
838                 q->err = ENOMEM;
839                 return;
840         }
841         printd(DS, "inserting new thread %p: %s\n", td->id, td->comm);
842         thread_map_insert(&evtr->threads, td);
843 }
844
845 static
846 void
847 thread_switch_callback(evtr_event_t ev, void *d)
848 {
849         evtr_t evtr = ((evtr_query_t)d)->evtr;
850         struct evtr_thread *tdp, *tdn;
851         void *ktdp, *ktdn;
852         struct cpu *cpu;
853         static struct evtr_event tdcr;
854         static char *fmt = "new_td %p %s";
855         char tidstr[40];
856         char fmtdata[sizeof(void *) + sizeof(char *)];
857
858         cpu = evtr_cpu(evtr, ev->cpu);
859         if (!cpu) {
860                 printw("invalid cpu %d\n", ev->cpu);
861                 return;
862         }
863         if (parse_format_data(ev, "sw  %p > %p", &ktdp, &ktdn) != 2) {
864                 return;
865         }
866         tdp = thread_map_find(&evtr->threads, ktdp);
867         if (!tdp) {
868                 printd(DS, "switching from unknown thread %p\n", ktdp);
869         }
870         tdn = thread_map_find(&evtr->threads, ktdn);
871         if (!tdn) {
872                 /*
873                  * Fake a thread creation event for threads we
874                  * haven't seen before.
875                  */
876                 tdcr.type = EVTR_TYPE_PROBE;
877                 tdcr.ts = ev->ts;
878                 tdcr.file = NULL;
879                 tdcr.func = NULL;
880                 tdcr.line = 0;
881                 tdcr.fmt = fmt;
882                 tdcr.fmtdata = &fmtdata;
883                 tdcr.fmtdatalen = sizeof(fmtdata);
884                 tdcr.cpu = ev->cpu;
885                 tdcr.td = NULL;
886                 snprintf(tidstr, sizeof(tidstr), "%p", ktdn);
887                 ((void **)fmtdata)[0] = ktdn;
888                 ((char **)fmtdata)[1] = &tidstr[0];
889                 thread_creation_callback(&tdcr, d);
890
891                 tdn = thread_map_find(&evtr->threads, ktdn);
892                 assert(tdn != NULL);
893                 printd(DS, "switching to unknown thread %p\n", ktdn);
894                 cpu->td = tdn;
895                 return;
896         }
897         printd(DS, "cpu %d: switching to thread %p\n", ev->cpu, ktdn);
898         cpu->td = tdn;
899 }
900
901 static
902 void
903 assert_foff_in_sync(evtr_t evtr)
904 {
905         off_t off;
906
907         /*
908          * We keep our own offset because we
909          * might want to support mmap()
910          */
911         off = ftello(evtr->f);
912         if (evtr->bytes != off) {
913                 fprintf(stderr, "bytes %jd, off %jd\n", evtr->bytes, off);
914                 abort();
915         }
916 }
917
918 static
919 int
920 evtr_write(evtr_t evtr, const void *buf, size_t bytes)
921 {
922         assert_foff_in_sync(evtr);
923         if (fwrite(buf, bytes, 1, evtr->f) != 1) {
924                 evtr->err = errno;
925                 evtr->errmsg = strerror(errno);
926                 return !0;
927         }
928         evtr->bytes += bytes;
929         assert_foff_in_sync(evtr);
930         return 0;
931 }
932
933 /*
934  * Called after dumping a record to make sure the next
935  * record is REC_ALIGN aligned. This does not make much sense,
936  * as we shouldn't be using packed structs anyway.
937  */
938 static
939 int
940 evtr_dump_pad(evtr_t evtr)
941 {
942         size_t pad;
943         static char buf[REC_ALIGN];
944
945         pad = REC_ALIGN - (evtr->bytes % REC_ALIGN);
946         if (pad > 0) {
947                 return evtr_write(evtr, buf, pad);
948         }
949         return 0;
950 }
951
952 /*
953  * We make sure that there is a new record every REC_BOUNDARY
954  * bytes, this costs next to nothing in space and allows for
955  * fast seeking.
956  */
957 static
958 int
959 evtr_dump_avoid_boundary(evtr_t evtr, size_t bytes)
960 {
961         unsigned pad, i;
962         static char buf[256];
963
964         pad = REC_BOUNDARY - (evtr->bytes % REC_BOUNDARY);
965         /* if adding @bytes would cause us to cross a boundary... */
966         if (bytes > pad) {
967                 /* then pad to the boundary */
968                 for (i = 0; i < (pad / sizeof(buf)); ++i) {
969                         if (evtr_write(evtr, buf, sizeof(buf))) {
970                                 return !0;
971                         }
972                 }
973                 i = pad % sizeof(buf);
974                 if (i) {
975                         if (evtr_write(evtr, buf, i)) {
976                                 return !0;
977                         }
978                 }
979         }
980         return 0;
981 }
982
983 static
984 int
985 evtr_dump_fmt(evtr_t evtr, uint64_t ts, const evtr_event_t ev)
986 {
987         struct fmt_event_header fmt;
988         uint16_t id;
989         int err;
990         char *subsys = "", buf[1024];
991
992         if (strlcpy(buf, subsys, sizeof(buf)) >= sizeof(buf)) {
993                 evtr->errmsg = "name of subsystem is too large";
994                 evtr->err = ERANGE;
995                 return 0;
996         }
997         if (strlcat(buf, ev->fmt, sizeof(buf)) >= sizeof(buf)) {
998                 evtr->errmsg = "fmt + name of subsystem is too large";
999                 evtr->err = ERANGE;
1000                 return 0;
1001         }
1002
1003         if (!strhash_find(evtr->fmts, buf, &id)) {
1004                 return id;
1005         }
1006         if ((err = strhash_insert(evtr->fmts, buf, &id))) {
1007                 evtr->err = err;
1008                 return 0;
1009         }
1010
1011         fmt.eh.type = EVTR_TYPE_FMT;
1012         fmt.eh.ts = ts;
1013         fmt.subsys_len = strlen(subsys);
1014         fmt.fmt_len = strlen(ev->fmt);
1015         fmt.id = id;
1016         if (evtr_dump_avoid_boundary(evtr, sizeof(fmt) + fmt.subsys_len +
1017                                      fmt.fmt_len))
1018                 return 0;
1019         if (evtr_write(evtr, &fmt, sizeof(fmt)))
1020                 return 0;
1021         if (evtr_write(evtr, subsys, fmt.subsys_len))
1022                 return 0;
1023         if (evtr_write(evtr, ev->fmt, fmt.fmt_len))
1024                 return 0;
1025         if (evtr_dump_pad(evtr))
1026                 return 0;
1027         return fmt.id;
1028 }
1029
/*
 * Replace string pointers or string ids in fmtdata.
 *
 * fmt is scanned as a printf-style format string and fmtdata is treated as
 * the packed sequence of arguments it describes: every conversion advances
 * the data cursor by the size of its argument, with no padding in between.
 * For each %s conversion the pointer-sized slot at the cursor is replaced
 * by replace(ctx, <current slot value>).
 *
 * A literal "%%" consumes no argument. Supported length modifiers are
 * l, ll, j, t and z; supported conversions are d i o u x X c p f s.
 *
 * Returns the number of %s slots that were rewritten, or -1 (after printing
 * a diagnostic to stderr) if an unsupported conversion is encountered.
 * The caller is responsible for fmtdata being large enough for fmt.
 */
static
int
mangle_string_ptrs(const char *fmt, uint8_t *fmtdata,
                   const char *(*replace)(void *, const char *), void *ctx)
{
        const char *f, *p;
        size_t skipsize, intsz;
        int ret = 0;

        for (f = fmt; f[0] != '\0'; ++f) {
                if (f[0] != '%')
                        continue;
                ++f;
                /* "%%" prints a percent sign and takes no argument */
                if (f[0] == '%')
                        continue;
                skipsize = 0;
                /*
                 * Eat flags. Notice this will accept duplicate
                 * flags.
                 */
                for (p = f; p[0] == '#' || p[0] == '0' || p[0] == '-' ||
                     p[0] == ' ' || p[0] == '+' || p[0] == '\''; ++p)
                        ;
                /* Eat minimum field width, if any */
                for (; isdigit((unsigned char)p[0]); ++p)
                        ;
                if (p[0] == '.')
                        ++p;
                /* Eat precision, if any */
                for (; isdigit((unsigned char)p[0]); ++p)
                        ;
                /* Length modifier: decides the size of integer arguments */
                intsz = 0;
                switch (p[0]) {
                case 'l':
                        if (p[1] == 'l') {
                                ++p;
                                intsz = sizeof(long long);
                        } else {
                                intsz = sizeof(long);
                        }
                        break;
                case 'j':
                        intsz = sizeof(intmax_t);
                        break;
                case 't':
                        intsz = sizeof(ptrdiff_t);
                        break;
                case 'z':
                        intsz = sizeof(size_t);
                        break;
                default:
                        break;
                }
                if (intsz != 0)
                        ++p;
                else
                        intsz = sizeof(int);

                switch (p[0]) {
                case 'd':
                case 'i':
                case 'o':
                case 'u':
                case 'x':
                case 'X':
                case 'c':
                        skipsize = intsz;
                        break;
                case 'p':
                        skipsize = sizeof(void *);
                        break;
                case 'f':
                        if (p[-1] == 'l')
                                skipsize = sizeof(double);
                        else
                                skipsize = sizeof(float);
                        break;
                case 's': {
                        const char *str;

                        /*
                         * The slot is not necessarily aligned for a
                         * pointer (the data is packed), so go through
                         * memcpy instead of dereferencing a char **.
                         */
                        memcpy(&str, fmtdata, sizeof(str));
                        str = replace(ctx, str);
                        memcpy(fmtdata, &str, sizeof(str));
                        skipsize = sizeof(char *);
                        ++ret;
                        break;
                }
                default:
                        fprintf(stderr, "Unknown conversion specifier %c "
                                "in fmt starting with %s\n", p[0], f - 1);
                        return -1;
                }
                fmtdata += skipsize;
                /* resume scanning right after the conversion character */
                f = p;
        }
        return ret;
}
1135
1136 /* XXX: do we really want the timestamp? */
1137 static
1138 int
1139 evtr_dump_string(evtr_t evtr, uint64_t ts, const char *str, int ns)
1140 {
1141         struct string_event_header s;
1142         int err;
1143         uint16_t id;
1144
1145         assert((0 <= ns) && (ns < EVTR_NS_MAX));
1146         if (!strhash_find(evtr->strings[ns], str, &id)) {
1147                 return id;
1148         }
1149         if ((err = strhash_insert(evtr->strings[ns], str, &id))) {
1150                 evtr->err = err;
1151                 return 0;
1152         }
1153
1154         printd(DS, "hash_insert %s ns %d id %d\n", str, ns, id);
1155         s.eh.type = EVTR_TYPE_STR;
1156         s.eh.ts = ts;
1157         s.ns = ns;
1158         s.id = id;
1159         s.len = strnlen(str, PATH_MAX);
1160
1161         if (evtr_dump_avoid_boundary(evtr, sizeof(s) + s.len))
1162                 return 0;
1163         if (evtr_write(evtr, &s, sizeof(s)))
1164                 return 0;
1165         if (evtr_write(evtr, str, s.len))
1166                 return 0;
1167         if (evtr_dump_pad(evtr))
1168                 return 0;
1169         return s.id;
1170 }
1171
/*
 * Context handed (as the opaque ctx argument) to the replacement callbacks
 * replace_strptr() and replace_strid().
 */
struct replace_ctx {
        evtr_t evtr;    /* handle whose string tables/maps are consulted */
        uint64_t ts;    /* timestamp replace_strptr() gives to evtr_dump_string() */
};
1176
1177 static
1178 const char *
1179 replace_strptr(void *_ctx, const char *s)
1180 {
1181         struct replace_ctx *ctx = _ctx;
1182         return (const char *)(uintptr_t)evtr_dump_string(ctx->evtr, ctx->ts, s,
1183                                                          EVTR_NS_DSTR);
1184 }
1185
1186 static
1187 const char *
1188 replace_strid(void *_ctx, const char *s)
1189 {
1190         struct replace_ctx *ctx = _ctx;
1191         const char *ret;
1192
1193         ret = string_map_find(&ctx->evtr->maps[EVTR_NS_DSTR - 1].root,
1194                               (int)(uintptr_t)s);
1195         if (!ret) {
1196                 fprintf(stderr, "Unknown id for data string\n");
1197                 ctx->evtr->errmsg = "unknown id for data string";
1198                 ctx->evtr->err = !0;
1199         }
1200         validate_string(ret);
1201         printd(DS, "replacing strid %d (ns %d) with string '%s' (or int %#x)\n",
1202                (int)(uintptr_t)s, EVTR_NS_DSTR, ret ? ret : "NULL", (int)(uintptr_t)ret);
1203         return ret;
1204 }
1205
1206 static
1207 int
1208 evtr_dump_probe(evtr_t evtr, evtr_event_t ev)
1209 {
1210         struct probe_event_header kev;
1211         char buf[1024];
1212
1213         memset(&kev, '\0', sizeof(kev));
1214         kev.eh.type = ev->type;
1215         kev.eh.ts = ev->ts;
1216         kev.line = ev->line;
1217         kev.cpu = ev->cpu;
1218         if (ev->file) {
1219                 kev.file = evtr_dump_string(evtr, kev.eh.ts, ev->file,
1220                                             EVTR_NS_PATH);
1221         }
1222         if (ev->func) {
1223                 kev.func = evtr_dump_string(evtr, kev.eh.ts, ev->func,
1224                                             EVTR_NS_FUNC);
1225         }
1226         if (ev->fmt) {
1227                 kev.fmt = evtr_dump_fmt(evtr, kev.eh.ts, ev);
1228         }
1229         if (ev->fmtdata) {
1230                 struct replace_ctx replctx = {
1231                         .evtr = evtr,
1232                         .ts = ev->ts,
1233                 };
1234                 assert(ev->fmtdatalen <= (int)sizeof(buf));
1235                 kev.datalen = ev->fmtdatalen;
1236                 /*
1237                  * Replace all string pointers with string ids before dumping
1238                  * the data.
1239                  */
1240                 memcpy(buf, ev->fmtdata, ev->fmtdatalen);
1241                 if (mangle_string_ptrs(ev->fmt, buf,
1242                                        replace_strptr, &replctx) < 0)
1243                         return !0;
1244                 if (evtr->err)
1245                         return evtr->err;
1246         }
1247         if (evtr_dump_avoid_boundary(evtr, sizeof(kev) + ev->fmtdatalen))
1248                 return !0;
1249         if (evtr_write(evtr, &kev, sizeof(kev)))
1250                 return !0;
1251         if (evtr_write(evtr, buf, ev->fmtdatalen))
1252                 return !0;
1253         if (evtr_dump_pad(evtr))
1254                 return !0;
1255         return 0;
1256 }
1257
1258 static
1259 int
1260 evtr_dump_sysinfo(evtr_t evtr, evtr_event_t ev)
1261 {
1262         uint8_t type = EVTR_TYPE_SYSINFO;
1263         uint16_t ncpus = ev->ncpus;
1264
1265         if (ncpus <= 0) {
1266                 evtr->errmsg = "invalid number of cpus";
1267                 return !0;
1268         }
1269         if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ncpus)))
1270                 return !0;
1271         if (evtr_write(evtr, &type, sizeof(type))) {
1272                 return !0;
1273         }
1274         if (evtr_write(evtr, &ncpus, sizeof(ncpus))) {
1275                 return !0;
1276         }
1277         if (evtr_dump_pad(evtr))
1278                 return !0;
1279         return 0;
1280 }
1281 static
1282 int
1283 evtr_dump_cpuinfo(evtr_t evtr, evtr_event_t ev)
1284 {
1285         struct cpuinfo_event_header ci;
1286         uint8_t type;
1287
1288         if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ci)))
1289                 return !0;
1290         type = EVTR_TYPE_CPUINFO;
1291         if (evtr_write(evtr, &type, sizeof(type))) {
1292                 return !0;
1293         }
1294         ci.cpu = ev->cpu;
1295         ci.freq = ev->cpuinfo.freq;
1296         if (evtr_dump_avoid_boundary(evtr, sizeof(ci)))
1297                 return !0;
1298         if (evtr_write(evtr, &ci, sizeof(ci))) {
1299                 return !0;
1300         }
1301         if (evtr_dump_pad(evtr))
1302                 return !0;
1303         return 0;
1304 }
1305
1306 int
1307 evtr_rewind(evtr_t evtr)
1308 {
1309         assert((evtr->flags & EVTRF_WR) == 0);
1310         evtr->bytes = 0;
1311         if (fseek(evtr->f, 0, SEEK_SET)) {
1312                 evtr->err = errno;
1313                 return !0;
1314         }
1315         return 0;
1316 }
1317
1318 int
1319 evtr_dump_event(evtr_t evtr, evtr_event_t ev)
1320 {
1321         switch (ev->type) {
1322         case EVTR_TYPE_PROBE:
1323                 return evtr_dump_probe(evtr, ev);
1324         case EVTR_TYPE_SYSINFO:
1325                 return evtr_dump_sysinfo(evtr, ev);
1326         case EVTR_TYPE_CPUINFO:
1327                 return evtr_dump_cpuinfo(evtr, ev);
1328         }
1329         evtr->errmsg = "unknown event type";
1330         return !0;
1331 }
1332
1333 static
1334 evtr_t
1335 evtr_alloc(FILE *f)
1336 {
1337         evtr_t evtr;
1338         if (!(evtr = malloc(sizeof(*evtr)))) {
1339                 return NULL;
1340         }
1341
1342         evtr->f = f;
1343         evtr->err = 0;
1344         evtr->errmsg = NULL;
1345         evtr->bytes = 0;
1346         return evtr;
1347 }
1348
1349 static int evtr_next_event(evtr_t, evtr_event_t);
1350
1351 evtr_t
1352 evtr_open_read(FILE *f)
1353 {
1354         evtr_t evtr;
1355         struct evtr_event ev;
1356         int i;
1357
1358         if (!(evtr = evtr_alloc(f))) {
1359                 return NULL;
1360         }
1361         evtr->flags = 0;
1362         for (i = 0; i < (EVTR_NS_MAX - 1); ++i) {
1363                 RB_INIT(&evtr->maps[i].root);
1364         }
1365         RB_INIT(&evtr->fmtmap.root);
1366         RB_INIT(&evtr->threads.root);
1367         evtr->cpus = NULL;
1368         evtr->ncpus = 0;
1369         /*
1370          * Load the first event so we can pick up any
1371          * sysinfo entries.
1372          */
1373         if (evtr_next_event(evtr, &ev)) {
1374                 goto free_evtr;
1375         }
1376         if (evtr_rewind(evtr))
1377                 goto free_evtr;
1378         return evtr;
1379 free_evtr:
1380         free(evtr);
1381         return NULL;
1382 }
1383
1384 evtr_t
1385 evtr_open_write(FILE *f)
1386 {
1387         evtr_t evtr;
1388         int i, j;
1389
1390         if (!(evtr = evtr_alloc(f))) {
1391                 return NULL;
1392         }
1393
1394         evtr->flags = EVTRF_WR;
1395         if (!(evtr->fmts = strhash_new()))
1396                 goto free_evtr;
1397         for (i = 0; i < EVTR_NS_MAX; ++i) {
1398                 evtr->strings[i] = strhash_new();
1399                 if (!evtr->strings[i]) {
1400                         for (j = 0; j < i; ++j) {
1401                                 strhash_destroy(evtr->strings[j]);
1402                         }
1403                         goto free_fmts;
1404                 }
1405         }
1406
1407         return evtr;
1408 free_fmts:
1409         strhash_destroy(evtr->fmts);
1410 free_evtr:
1411         free(evtr);
1412         return NULL;
1413 }
1414
1415 static
1416 void
1417 hashtab_destroy(struct hashtab *h)
1418 {
1419         struct hashentry *ent, *next;
1420         int i;
1421         for (i = 0; i < NR_BUCKETS; ++i) {
1422                 for (ent = h->buckets[i]; ent; ent = next) {
1423                         next = ent->next;
1424                         free(ent);
1425                 }
1426         }
1427         free(h);
1428 }
1429
1430 void
1431 evtr_close(evtr_t evtr)
1432 {
1433         int i;
1434
1435         if (evtr->flags & EVTRF_WR) {
1436                 hashtab_destroy(&evtr->fmts->tab);
1437                 for (i = 0; i < EVTR_NS_MAX; ++i)
1438                         hashtab_destroy(&evtr->strings[i]->tab);
1439         } else {
1440                 id_tree_free(&evtr->fmtmap.root);
1441                 for (i = 0; i < EVTR_NS_MAX - 1; ++i) {
1442                         id_tree_free(&evtr->maps[i].root);
1443                 }
1444         }
1445         free(evtr);
1446 }
1447
1448 static
1449 int
1450 evtr_read(evtr_t evtr, void *buf, size_t size)
1451 {
1452         assert(size > 0);
1453         assert_foff_in_sync(evtr);
1454         printd(IO, "evtr_read at %#jx, %zd bytes\n", evtr->bytes, size);
1455         if (fread(buf, size, 1, evtr->f) != 1) {
1456                 if (feof(evtr->f)) {
1457                         evtr->errmsg = "incomplete record";
1458                 } else {
1459                         evtr->errmsg = strerror(errno);
1460                 }
1461                 return !0;
1462         }
1463         evtr->bytes += size;
1464         assert_foff_in_sync(evtr);
1465         return 0;
1466 }
1467
1468 static
1469 int
1470 evtr_load_fmt(evtr_query_t q, char *buf)
1471 {
1472         evtr_t evtr = q->evtr;
1473         struct fmt_event_header *evh = (struct fmt_event_header *)buf;
1474         struct event_fmt *fmt;
1475         char *subsys = NULL, *fmtstr;
1476
1477         if (!(fmt = malloc(sizeof(*fmt)))) {
1478                 evtr->err = errno;
1479                 return !0;
1480         }
1481         if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1482                       sizeof(*evh) - sizeof(evh->eh))) {
1483                 goto free_fmt;
1484         }
1485         assert(!evh->subsys_len);
1486         if (evh->subsys_len) {
1487                 if (!(subsys = malloc(evh->subsys_len))) {
1488                         evtr->err = errno;
1489                         goto free_fmt;
1490                 }
1491                 if (evtr_read(evtr, subsys, evh->subsys_len)) {
1492                         goto free_subsys;
1493                 }
1494                 fmt->subsys = subsys;
1495         } else {
1496                 fmt->subsys = "";
1497         }
1498         if (!(fmtstr = malloc(evh->fmt_len + 1))) {
1499                 evtr->err = errno;
1500                 goto free_subsys;
1501         }
1502         if (evtr_read(evtr, fmtstr, evh->fmt_len)) {
1503                 goto free_fmtstr;
1504         }
1505         fmtstr[evh->fmt_len] = '\0';
1506         fmt->fmt = fmtstr;
1507
1508         printd(DS, "fmt_map_insert (%d, %s)\n", evh->id, fmt->fmt);
1509         evtr->err = fmt_map_insert(&evtr->fmtmap.root, fmt, evh->id);
1510         switch (evtr->err) {
1511         case ENOMEM:
1512                 evtr->errmsg = "out of memory";
1513                 break;
1514         case EEXIST:
1515                 evtr->errmsg = "redefinition of an id to a "
1516                         "different format (corrupt input)";
1517                 break;
1518         default:
1519                 ;
1520         }
1521         return evtr->err;
1522
1523 free_fmtstr:
1524         free(fmtstr);
1525 free_subsys:
1526         if (subsys)
1527                 free(subsys);
1528 free_fmt:
1529         free(fmt);
1530         return !0;
1531 }
1532
1533 static
1534 int
1535 evtr_load_string(evtr_t evtr, char *buf)
1536 {
1537         char sbuf[PATH_MAX + 1];
1538         struct string_event_header *evh = (struct string_event_header *)buf;
1539
1540         if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1541                       sizeof(*evh) - sizeof(evh->eh))) {
1542                 return !0;
1543         }
1544         if (evh->len > PATH_MAX) {
1545                 evtr->errmsg = "string too large (corrupt input)";
1546                 return !0;
1547         }
1548         if (evh->len && evtr_read(evtr, sbuf, evh->len)) {
1549                 return !0;
1550         }
1551         sbuf[evh->len] = 0;
1552         if (evh->ns >= EVTR_NS_MAX) {
1553                 evtr->errmsg = "invalid namespace (corrupt input)";
1554                 return !0;
1555         }
1556         validate_string(sbuf);
1557         printd(DS, "evtr_load_string:ns %d id %d : \"%s\"\n", evh->ns, evh->id,
1558                sbuf);
1559         evtr->err = string_map_insert(&evtr->maps[evh->ns - 1].root, sbuf, evh->id);
1560         switch (evtr->err) {
1561         case ENOMEM:
1562                 evtr->errmsg = "out of memory";
1563                 break;
1564         case EEXIST:
1565                 evtr->errmsg = "redefinition of an id to a "
1566                         "different string (corrupt input)";
1567                 break;
1568         default:
1569                 ;
1570         }
1571         return 0;
1572 }
1573
1574 static
1575 int
1576 evtr_skip(evtr_t evtr, off_t bytes)
1577 {
1578         if (fseek(evtr->f, bytes, SEEK_CUR)) {
1579                 evtr->err = errno;
1580                 evtr->errmsg = strerror(errno);
1581                 return !0;
1582         }
1583         evtr->bytes += bytes;
1584         return 0;
1585 }
1586
1587 /*
1588  * Make sure q->buf is at least len bytes
1589  */
1590 static
1591 int
1592 evtr_query_reserve_buf(struct evtr_query *q, int len)
1593 {
1594         void *tmp;
1595
1596         if (q->bufsize >= len)
1597                 return 0;
1598         if (!(tmp = realloc(q->buf, len)))
1599                 return !0;
1600         q->buf = tmp;
1601         q->bufsize = len;
1602         return 0;
1603 }
1604
1605 static
1606 int
1607 evtr_load_probe(evtr_t evtr, evtr_event_t ev, char *buf, struct evtr_query *q)
1608 {
1609         struct probe_event_header *evh = (struct probe_event_header *)buf;
1610         struct cpu *cpu;
1611
1612         if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1613                       sizeof(*evh) - sizeof(evh->eh)))
1614                 return !0;
1615         memset(ev, '\0', sizeof(*ev));
1616         ev->ts = evh->eh.ts;
1617         ev->type = EVTR_TYPE_PROBE;
1618         ev->line = evh->line;
1619         ev->cpu = evh->cpu;
1620         if ((cpu = evtr_cpu(evtr, evh->cpu))) {
1621                 ev->td = cpu->td;
1622         } else {
1623                 ev->td = NULL;
1624         }
1625         if (evh->file) {
1626                 ev->file = string_map_find(
1627                         &evtr->maps[EVTR_NS_PATH - 1].root,
1628                         evh->file);
1629                 if (!ev->file) {
1630                         evtr->errmsg = "unknown id for file path";
1631                         evtr->err = !0;
1632                         ev->file = "<unknown>";
1633                 } else {
1634                         validate_string(ev->file);
1635                 }
1636         } else {
1637                 ev->file = "<unknown>";
1638         }
1639         if (evh->fmt) {
1640                 const struct event_fmt *fmt;
1641                 if (!(fmt = fmt_map_find(&evtr->fmtmap.root, evh->fmt))) {
1642                         evtr->errmsg = "unknown id for event fmt";
1643                         evtr->err = !0;
1644                         ev->fmt = NULL;
1645                 } else {
1646                         ev->fmt = fmt->fmt;
1647                         validate_string(fmt->fmt);
1648                 }
1649         }
1650         if (evh->datalen) {
1651                 if (evtr_query_reserve_buf(q, evh->datalen + 1)) {
1652                         evtr->err = ENOMEM;
1653                 } else if (!evtr_read(evtr, q->buf, evh->datalen)) {
1654                         struct replace_ctx replctx = {
1655                                 .evtr = evtr,
1656                                 .ts = ev->ts,
1657                         };
1658                         assert(ev->fmt);
1659
1660                         ev->fmtdata = q->buf;
1661                         /*
1662                          * If the format specifies any string pointers, there
1663                          * is a string id stored in the fmtdata. Look it up
1664                          * and replace it with a string pointer before
1665                          * returning it to the user.
1666                          */
1667                         if (mangle_string_ptrs(ev->fmt, __DECONST(uint8_t *,
1668                                                                   ev->fmtdata),
1669                                                replace_strid, &replctx) < 0)
1670                                 return evtr->err;
1671                         if (evtr->err)
1672                                 return evtr->err;
1673                         ((char *)ev->fmtdata)[evh->datalen] = '\0';
1674                         ev->fmtdatalen = evh->datalen;
1675                 }
1676         }
1677         evtr_run_callbacks(ev, q);
1678         return evtr->err;
1679 }
1680
1681 static
1682 int
1683 evtr_skip_to_record(evtr_t evtr)
1684 {
1685         int skip;
1686         
1687         skip = REC_ALIGN - (evtr->bytes % REC_ALIGN);
1688         if (skip > 0) {
1689                 if (fseek(evtr->f, skip, SEEK_CUR)) {
1690                         evtr->err = errno;
1691                         evtr->errmsg = strerror(errno);
1692                         return !0;
1693                 }
1694                 evtr->bytes += skip;
1695         }
1696         return 0;
1697 }
1698
1699 static
1700 int
1701 evtr_load_sysinfo(evtr_t evtr)
1702 {
1703         uint16_t ncpus;
1704         int i;
1705
1706         if (evtr_read(evtr, &ncpus, sizeof(ncpus))) {
1707                 return !0;
1708         }
1709         if (evtr->cpus)
1710                 return 0;
1711         evtr->cpus = malloc(ncpus * sizeof(struct cpu));
1712         if (!evtr->cpus) {
1713                 evtr->err = ENOMEM;
1714                 return !0;
1715         }
1716         evtr->ncpus = ncpus;
1717         for (i = 0; i < ncpus; ++i) {
1718                 evtr->cpus[i].td = NULL;
1719                 evtr->cpus[i].freq = -1.0;
1720         }
1721         return 0;
1722 }
1723
1724 static
1725 int
1726 evtr_load_cpuinfo(evtr_t evtr)
1727 {
1728         struct cpuinfo_event_header cih;
1729         struct cpu *cpu;
1730
1731         if (evtr_read(evtr, &cih, sizeof(cih))) {
1732                 return !0;
1733         }
1734         if (cih.freq < 0.0) {
1735                 evtr->errmsg = "cpu freq is negative";
1736                 evtr->err = EINVAL;
1737                 return !0;
1738         }
1739         /*
1740          * Notice that freq is merely a multiplier with
1741          * which we convert a timestamp to seconds; if
1742          * ts is not in cycles, freq is not the frequency.
1743          */
1744         if (!(cpu = evtr_cpu(evtr, cih.cpu))) {
1745                 evtr->errmsg = "freq for invalid cpu";
1746                 evtr->err = EINVAL;
1747                 return !0;
1748         }
1749         cpu->freq = cih.freq;
1750         return 0;
1751 }
1752
1753 static
1754 int
1755 _evtr_next_event(evtr_t evtr, evtr_event_t ev, struct evtr_query *q)
1756 {
1757         char buf[MAX_EVHDR_SIZE];
1758         int ret, err, ntried, nmatched;
1759         struct trace_event_header *evhdr = (struct trace_event_header *)buf;
1760
1761         for (ret = 0; !ret;) {
1762                 if (q->flags & EVTRQF_PENDING) {
1763                         q->off = evtr->bytes;
1764                         memcpy(ev, &q->pending_event, sizeof(*ev));
1765                         q->flags &= ~EVTRQF_PENDING;
1766                         return 0;
1767                 }
1768                 if (evtr_read(evtr, &evhdr->type, 1)) {
1769                         if (feof(evtr->f)) {
1770                                 evtr->errmsg = NULL;
1771                                 evtr->err = 0;
1772                                 return -1;
1773                         }
1774                         return !0;
1775                 }
1776                 /*
1777                  * skip pad records -- this will only happen if there's a
1778                  * variable sized record close to the boundary
1779                  */
1780                 if (evhdr->type == EVTR_TYPE_PAD) {
1781                         evtr_skip_to_record(evtr);
1782                         continue;
1783                 }
1784                 if (evhdr->type == EVTR_TYPE_SYSINFO) {
1785                         evtr_load_sysinfo(evtr);
1786                         continue;
1787                 } else if (evhdr->type == EVTR_TYPE_CPUINFO) {
1788                         evtr_load_cpuinfo(evtr);
1789                         continue;
1790                 }
1791                 if (evtr_read(evtr, buf + 1, sizeof(*evhdr) - 1))
1792                         return feof(evtr->f) ? -1 : !0;
1793                 switch (evhdr->type) {
1794                 case EVTR_TYPE_PROBE:
1795                         ntried = q->ntried;
1796                         nmatched = q->nmatched;
1797                         if ((err = evtr_load_probe(evtr, ev, buf, q))) {
1798                                 if (err == -1) {
1799                                         /* no match */
1800                                         ret = 0;
1801                                 } else {
1802                                         return !0;
1803                                 }
1804                         } else {
1805                                 ret = !0;
1806                         }
1807                         break;
1808                 case EVTR_TYPE_STR:
1809                         if (evtr_load_string(evtr, buf)) {
1810                                 return !0;
1811                         }
1812                         break;
1813                 case EVTR_TYPE_FMT:
1814                         if (evtr_load_fmt(q, buf)) {
1815                                 return !0;
1816                         }
1817                         break;
1818                 default:
1819                         evtr->err = !0;
1820                         evtr->errmsg = "unknown event type (corrupt input?)";
1821                         return !0;
1822                 }
1823                 evtr_skip_to_record(evtr);
1824                 if (ret) {
1825                         if (!evtr_match_filters(q, ev)) {
1826                                 ret = 0;
1827                                 continue;
1828                         }
1829                         q->off = evtr->bytes;
1830                         return 0;
1831                 }
1832         }
1833         /* can't get here */
1834         return !0;
1835 }
1836
1837 static
1838 int
1839 evtr_next_event(evtr_t evtr, evtr_event_t ev)
1840 {
1841         struct evtr_query *q;
1842         int ret;
1843
1844         if (!(q = evtr_query_init(evtr, NULL, 0))) {
1845                 evtr->err = ENOMEM;
1846                 return !0;
1847         }
1848         ret = _evtr_next_event(evtr, ev, q);
1849         evtr_query_destroy(q);
1850         return ret;
1851 }
1852
1853 int
1854 evtr_last_event(evtr_t evtr, evtr_event_t ev)
1855 {
1856         struct stat st;
1857         int fd;
1858         off_t last_boundary;
1859
1860         if (evtr_error(evtr))
1861                 return !0;
1862
1863         fd = fileno(evtr->f);
1864         if (fstat(fd, &st))
1865                 return !0;
1866         /*
1867          * This skips pseudo records, so we can't provide
1868          * an event with all fields filled in this way.
1869          * It's doable, just needs some care. TBD.
1870          */
1871         if (0 && (st.st_mode & S_IFREG)) {
1872                 /*
1873                  * Skip to last boundary, that's the closest to the EOF
1874                  * location that we are sure contains a header so we can
1875                  * pick up the stream.
1876                  */
1877                 last_boundary = (st.st_size / REC_BOUNDARY) * REC_BOUNDARY;
1878                 /* XXX: ->bytes should be in query */
1879                 assert(evtr->bytes == 0);
1880                 evtr_skip(evtr, last_boundary);
1881         }
1882
1883
1884         /*
1885          * If we can't seek, we need to go through the whole file.
1886          * Since you can't seek back, this is pretty useless unless
1887          * you really are interested only in the last event.
1888          */
1889         while (!evtr_next_event(evtr, ev))
1890                 ;
1891         if (evtr_error(evtr))
1892                 return !0;
1893         evtr_rewind(evtr);
1894         return 0;
1895 }
1896
1897 struct evtr_query *
1898 evtr_query_init(evtr_t evtr, evtr_filter_t filt, int nfilt)
1899 {
1900         struct evtr_query *q;
1901         int i;
1902
1903         if (!(q = malloc(sizeof(*q)))) {
1904                 return q;
1905         }
1906         q->bufsize = 2;
1907         if (!(q->buf = malloc(q->bufsize))) {
1908                 goto free_q;
1909         }
1910         if (!(q->symtab = symtab_new()))
1911                 goto free_buf;
1912         q->evtr = evtr;
1913         q->off = 0;
1914         q->filt = filt;
1915         q->nfilt = nfilt;
1916         TAILQ_INIT(&q->unresolved_filtq);
1917         q->nmatched = 0;
1918         q->cbs = NULL;
1919         q->ncbs = 0;
1920         q->flags = 0;
1921         memset(&q->pending_event, '\0', sizeof(q->pending_event));
1922         if (evtr_register_callback(q, &thread_creation_callback, q)) {
1923                 goto free_symtab;
1924         }
1925         if (evtr_register_callback(q, &thread_switch_callback, q)) {
1926                 goto free_cbs;
1927         }
1928         if (evtr_query_needs_parsing(q) &&
1929             evtr_register_callback(q, &parse_callback, q)) {
1930                 goto free_cbs;
1931         }
1932
1933         for (i = 0; i < nfilt; ++i) {
1934                 filt[i].flags = 0;
1935                 if (filt[i].fmt == NULL)
1936                         continue;
1937                 if (evtr_filter_register(q, &filt[i])) {
1938                         evtr_deregister_filters(q, filt, i);
1939                         goto free_symtab;
1940                 }
1941         }
1942
1943         return q;
1944 free_cbs:
1945         evtr_deregister_callbacks(q);
1946 free_symtab:
1947         symtab_destroy(q->symtab);
1948 free_buf:
1949         free(q->buf);
1950 free_q:
1951         free(q);
1952         return NULL;
1953 }
1954
1955 void
1956 evtr_query_destroy(struct evtr_query *q)
1957 {
1958         evtr_deregister_filters(q, q->filt, q->nfilt);
1959                 
1960         free(q->buf);
1961         free(q);
1962 }
1963
1964 int
1965 evtr_query_next(struct evtr_query *q, evtr_event_t ev)
1966 {
1967         if (evtr_query_error(q))
1968                 return !0;
1969         /* we may support that in the future */
1970         if (q->off != q->evtr->bytes) {
1971                 q->errmsg = "evtr/query offset mismatch";
1972                 return !0;
1973         }
1974         return _evtr_next_event(q->evtr, ev, q);
1975 }
1976
1977 int
1978 evtr_ncpus(evtr_t evtr)
1979 {
1980         return evtr->ncpus;
1981 }
1982
1983 int
1984 evtr_cpufreqs(evtr_t evtr, double *freqs)
1985 {
1986         int i;
1987
1988         if (!freqs)
1989                 return EINVAL;
1990         for (i = 0; i < evtr->ncpus; ++i) {
1991                 freqs[i] = evtr->cpus[i].freq;
1992         }
1993         return 0;
1994 }