tools/perf/builtin-record.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "util/pmu-hybrid.h"
51 #include "util/evlist-hybrid.h"
52 #include "asm/bug.h"
53 #include "perf.h"
54
55 #include <errno.h>
56 #include <inttypes.h>
57 #include <locale.h>
58 #include <poll.h>
59 #include <pthread.h>
60 #include <unistd.h>
61 #include <sched.h>
62 #include <signal.h>
63 #ifdef HAVE_EVENTFD_SUPPORT
64 #include <sys/eventfd.h>
65 #endif
66 #include <sys/mman.h>
67 #include <sys/wait.h>
68 #include <sys/types.h>
69 #include <sys/stat.h>
70 #include <fcntl.h>
71 #include <linux/err.h>
72 #include <linux/string.h>
73 #include <linux/time64.h>
74 #include <linux/zalloc.h>
75 #include <linux/bitmap.h>
76 #include <sys/time.h>
77
78 struct switch_output {
79         bool             enabled;
80         bool             signal;
81         unsigned long    size;
82         unsigned long    time;
83         const char      *str;
84         bool             set;
85         char             **filenames;
86         int              num_files;
87         int              cur_file;
88 };
89
90 struct record {
91         struct perf_tool        tool;
92         struct record_opts      opts;
93         u64                     bytes_written;
94         struct perf_data        data;
95         struct auxtrace_record  *itr;
96         struct evlist   *evlist;
97         struct perf_session     *session;
98         struct evlist           *sb_evlist;
99         pthread_t               thread_id;
100         int                     realtime_prio;
101         bool                    switch_output_event_set;
102         bool                    no_buildid;
103         bool                    no_buildid_set;
104         bool                    no_buildid_cache;
105         bool                    no_buildid_cache_set;
106         bool                    buildid_all;
107         bool                    buildid_mmap;
108         bool                    timestamp_filename;
109         bool                    timestamp_boundary;
110         struct switch_output    switch_output;
111         unsigned long long      samples;
112         struct mmap_cpu_mask    affinity_mask;
113         unsigned long           output_max_size;        /* = 0: unlimited */
114         struct perf_debuginfod  debuginfod;
115 };
116
117 static volatile int done;
118
119 static volatile int auxtrace_record__snapshot_started;
120 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
121 static DEFINE_TRIGGER(switch_output_trigger);
122
123 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
124         "SYS", "NODE", "CPU"
125 };
126
127 static bool switch_output_signal(struct record *rec)
128 {
129         return rec->switch_output.signal &&
130                trigger_is_ready(&switch_output_trigger);
131 }
132
133 static bool switch_output_size(struct record *rec)
134 {
135         return rec->switch_output.size &&
136                trigger_is_ready(&switch_output_trigger) &&
137                (rec->bytes_written >= rec->switch_output.size);
138 }
139
140 static bool switch_output_time(struct record *rec)
141 {
142         return rec->switch_output.time &&
143                trigger_is_ready(&switch_output_trigger);
144 }
145
146 static bool record__output_max_size_exceeded(struct record *rec)
147 {
148         return rec->output_max_size &&
149                (rec->bytes_written >= rec->output_max_size);
150 }
151
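/*
 * Write 'size' bytes from 'bf' to the output file and account them in
 * rec->bytes_written, which is what the --max-size limit and the
 * --switch-output size threshold are checked against.
 */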
152 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
153                          void *bf, size_t size)
154 {
155         struct perf_data_file *file = &rec->session->data->file;
156
157         if (perf_data_file__write(file, bf, size) < 0) {
158                 pr_err("failed to write perf data, error: %m\n");
159                 return -1;
160         }
161
162         rec->bytes_written += size;
163
164         if (record__output_max_size_exceeded(rec) && !done) {
165                 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
166                                 " stopping session ]\n",
167                                 rec->bytes_written >> 10);
168                 done = 1;
169         }
170
171         if (switch_output_size(rec))
172                 trigger_hit(&switch_output_trigger);
173
174         return 0;
175 }
176
177 static int record__aio_enabled(struct record *rec);
178 static int record__comp_enabled(struct record *rec);
179 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
180                             void *src, size_t src_size);
181
182 #ifdef HAVE_AIO_SUPPORT
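/*
 * Queue an asynchronous write of 'size' bytes at offset 'off' using the
 * given control block, retrying while the AIO queue is temporarily full
 * (EAGAIN) and marking the block free (aio_fildes = -1) on other errors.
 */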
183 static int record__aio_write(struct aiocb *cblock, int trace_fd,
184                 void *buf, size_t size, off_t off)
185 {
186         int rc;
187
188         cblock->aio_fildes = trace_fd;
189         cblock->aio_buf    = buf;
190         cblock->aio_nbytes = size;
191         cblock->aio_offset = off;
192         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
193
194         do {
195                 rc = aio_write(cblock);
196                 if (rc == 0) {
197                         break;
198                 } else if (errno != EAGAIN) {
199                         cblock->aio_fildes = -1;
200                         pr_err("failed to queue perf data, error: %m\n");
201                         break;
202                 }
203         } while (1);
204
205         return rc;
206 }
207
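/*
 * Check the aio write tracked by 'cblock': return 0 while it is still in
 * flight, restart it with the remainder after a short (or failed) write,
 * and return 1, dropping the mmap reference, once it has fully completed.
 */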
208 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
209 {
210         void *rem_buf;
211         off_t rem_off;
212         size_t rem_size;
213         int rc, aio_errno;
214         ssize_t aio_ret, written;
215
216         aio_errno = aio_error(cblock);
217         if (aio_errno == EINPROGRESS)
218                 return 0;
219
220         written = aio_ret = aio_return(cblock);
221         if (aio_ret < 0) {
222                 if (aio_errno != EINTR)
223                         pr_err("failed to write perf data, error: %m\n");
224                 written = 0;
225         }
226
227         rem_size = cblock->aio_nbytes - written;
228
229         if (rem_size == 0) {
230                 cblock->aio_fildes = -1;
231                 /*
232                  * md->refcount is incremented in record__aio_pushfn() for
233                  * every aio write request started in record__aio_push(), so
234                  * decrement it now that the request is complete.
235                  */
236                 perf_mmap__put(&md->core);
237                 rc = 1;
238         } else {
239                 /*
240                  * An aio write request may require a restart with the
241                  * remainder if the kernel didn't write the whole
242                  * chunk at once.
243                  */
244                 rem_off = cblock->aio_offset + written;
245                 rem_buf = (void *)(cblock->aio_buf + written);
246                 record__aio_write(cblock, cblock->aio_fildes,
247                                 rem_buf, rem_size, rem_off);
248                 rc = 0;
249         }
250
251         return rc;
252 }
253
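/*
 * Wait for in-flight aio writes on this mmap: with sync_all, block until
 * every control block has completed, otherwise return the index of the
 * first control block that is free for reuse.
 */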
254 static int record__aio_sync(struct mmap *md, bool sync_all)
255 {
256         struct aiocb **aiocb = md->aio.aiocb;
257         struct aiocb *cblocks = md->aio.cblocks;
258         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
259         int i, do_suspend;
260
261         do {
262                 do_suspend = 0;
263                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
264                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
265                                 if (sync_all)
266                                         aiocb[i] = NULL;
267                                 else
268                                         return i;
269                         } else {
270                                 /*
271                                  * The started aio write is not complete yet,
272                                  * so it has to be waited on before the
273                                  * next allocation.
274                                  */
275                                 aiocb[i] = &cblocks[i];
276                                 do_suspend = 1;
277                         }
278                 }
279                 if (!do_suspend)
280                         return -1;
281
282                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
283                         if (!(errno == EAGAIN || errno == EINTR))
284                                 pr_err("failed to sync perf data, error: %m\n");
285                 }
286         } while (1);
287 }
288
289 struct record_aio {
290         struct record   *rec;
291         void            *data;
292         size_t          size;
293 };
294
295 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
296 {
297         struct record_aio *aio = to;
298
299         /*
300          * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
301          * buffer to release space in the kernel buffer as fast as possible, calling
302          * perf_mmap__consume() from the perf_mmap__push() function.
303          *
304          * That lets the kernel proceed with storing more profiling data into
305          * the kernel buffer earlier than other per-cpu kernel buffers are handled.
306          *
307          * Copying can be done in two steps in case the chunk of profiling data
308          * crosses the upper bound of the kernel buffer. In this case we first move
309          * the part of the data from map->start till the upper bound and then the
310          * remainder from the beginning of the kernel buffer till the end of the data chunk.
311          */
312
313         if (record__comp_enabled(aio->rec)) {
314                 size = zstd_compress(aio->rec->session, aio->data + aio->size,
315                                      mmap__mmap_len(map) - aio->size,
316                                      buf, size);
317         } else {
318                 memcpy(aio->data + aio->size, buf, size);
319         }
320
321         if (!aio->size) {
322                 /*
323                  * Increment map->refcount to guard the map->aio.data[] buffer
324                  * from premature deallocation, because the map object can be
325                  * released earlier than the aio write request started on the
326                  * map->aio.data[] buffer completes.
327                  *
328                  * perf_mmap__put() is done at record__aio_complete() after the
329                  * started aio request completes, or at record__aio_push() if
330                  * the request failed to start.
331                  */
332                 perf_mmap__get(&map->core);
333         }
334
335         aio->size += size;
336
337         return size;
338 }
339
340 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
341 {
342         int ret, idx;
343         int trace_fd = rec->session->data->file.fd;
344         struct record_aio aio = { .rec = rec, .size = 0 };
345
346         /*
347          * Call record__aio_sync() to wait till a map->aio.data[] buffer
348          * becomes available after the previous aio write operation.
349          */
350
351         idx = record__aio_sync(map, false);
352         aio.data = map->aio.data[idx];
353         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
354         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
355                 return ret;
356
357         rec->samples++;
358         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
359         if (!ret) {
360                 *off += aio.size;
361                 rec->bytes_written += aio.size;
362                 if (switch_output_size(rec))
363                         trigger_hit(&switch_output_trigger);
364         } else {
365                 /*
366                  * Decrement the map->refcount taken in record__aio_pushfn() if
367                  * the record__aio_write() operation failed to start; otherwise
368                  * map->refcount is decremented in record__aio_complete() after
369                  * the aio write operation finishes successfully.
370                  */
371                 perf_mmap__put(&map->core);
372         }
373
374         return ret;
375 }
376
377 static off_t record__aio_get_pos(int trace_fd)
378 {
379         return lseek(trace_fd, 0, SEEK_CUR);
380 }
381
382 static void record__aio_set_pos(int trace_fd, off_t pos)
383 {
384         lseek(trace_fd, pos, SEEK_SET);
385 }
386
387 static void record__aio_mmap_read_sync(struct record *rec)
388 {
389         int i;
390         struct evlist *evlist = rec->evlist;
391         struct mmap *maps = evlist->mmap;
392
393         if (!record__aio_enabled(rec))
394                 return;
395
396         for (i = 0; i < evlist->core.nr_mmaps; i++) {
397                 struct mmap *map = &maps[i];
398
399                 if (map->core.base)
400                         record__aio_sync(map, true);
401         }
402 }
403
404 static int nr_cblocks_default = 1;
405 static int nr_cblocks_max = 4;
406
407 static int record__aio_parse(const struct option *opt,
408                              const char *str,
409                              int unset)
410 {
411         struct record_opts *opts = (struct record_opts *)opt->value;
412
413         if (unset) {
414                 opts->nr_cblocks = 0;
415         } else {
416                 if (str)
417                         opts->nr_cblocks = strtol(str, NULL, 0);
418                 if (!opts->nr_cblocks)
419                         opts->nr_cblocks = nr_cblocks_default;
420         }
421
422         return 0;
423 }
424 #else /* HAVE_AIO_SUPPORT */
425 static int nr_cblocks_max = 0;
426
427 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
428                             off_t *off __maybe_unused)
429 {
430         return -1;
431 }
432
433 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
434 {
435         return -1;
436 }
437
438 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
439 {
440 }
441
442 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
443 {
444 }
445 #endif
446
447 static int record__aio_enabled(struct record *rec)
448 {
449         return rec->opts.nr_cblocks > 0;
450 }
451
452 #define MMAP_FLUSH_DEFAULT 1
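/*
 * Parse the --mmap-flush value: accept a plain number or one with a
 * B/K/M/G suffix, fall back to MMAP_FLUSH_DEFAULT, and cap the result at a
 * quarter of the mmap ring buffer size.
 */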
453 static int record__mmap_flush_parse(const struct option *opt,
454                                     const char *str,
455                                     int unset)
456 {
457         int flush_max;
458         struct record_opts *opts = (struct record_opts *)opt->value;
459         static struct parse_tag tags[] = {
460                         { .tag  = 'B', .mult = 1       },
461                         { .tag  = 'K', .mult = 1 << 10 },
462                         { .tag  = 'M', .mult = 1 << 20 },
463                         { .tag  = 'G', .mult = 1 << 30 },
464                         { .tag  = 0 },
465         };
466
467         if (unset)
468                 return 0;
469
470         if (str) {
471                 opts->mmap_flush = parse_tag_value(str, tags);
472                 if (opts->mmap_flush == (int)-1)
473                         opts->mmap_flush = strtol(str, NULL, 0);
474         }
475
476         if (!opts->mmap_flush)
477                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
478
479         flush_max = evlist__mmap_size(opts->mmap_pages);
480         flush_max /= 4;
481         if (opts->mmap_flush > flush_max)
482                 opts->mmap_flush = flush_max;
483
484         return 0;
485 }
486
487 #ifdef HAVE_ZSTD_SUPPORT
488 static unsigned int comp_level_default = 1;
489
490 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
491 {
492         struct record_opts *opts = opt->value;
493
494         if (unset) {
495                 opts->comp_level = 0;
496         } else {
497                 if (str)
498                         opts->comp_level = strtol(str, NULL, 0);
499                 if (!opts->comp_level)
500                         opts->comp_level = comp_level_default;
501         }
502
503         return 0;
504 }
505 #endif
506 static unsigned int comp_level_max = 22;
507
508 static int record__comp_enabled(struct record *rec)
509 {
510         return rec->opts.comp_level > 0;
511 }
512
513 static int process_synthesized_event(struct perf_tool *tool,
514                                      union perf_event *event,
515                                      struct perf_sample *sample __maybe_unused,
516                                      struct machine *machine __maybe_unused)
517 {
518         struct record *rec = container_of(tool, struct record, tool);
519         return record__write(rec, NULL, event, event->header.size);
520 }
521
522 static int process_locked_synthesized_event(struct perf_tool *tool,
523                                      union perf_event *event,
524                                      struct perf_sample *sample __maybe_unused,
525                                      struct machine *machine __maybe_unused)
526 {
527         static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
528         int ret;
529
530         pthread_mutex_lock(&synth_lock);
531         ret = process_synthesized_event(tool, event, sample, machine);
532         pthread_mutex_unlock(&synth_lock);
533         return ret;
534 }
535
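/*
 * perf_mmap__push() callback for the serial (non-AIO) path: optionally
 * compress the chunk into map->data, then write it to the output file.
 */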
536 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
537 {
538         struct record *rec = to;
539
540         if (record__comp_enabled(rec)) {
541                 size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
542                 bf   = map->data;
543         }
544
545         rec->samples++;
546         return record__write(rec, map, bf, size);
547 }
548
549 static volatile int signr = -1;
550 static volatile int child_finished;
551 #ifdef HAVE_EVENTFD_SUPPORT
552 static int done_fd = -1;
553 #endif
554
555 static void sig_handler(int sig)
556 {
557         if (sig == SIGCHLD)
558                 child_finished = 1;
559         else
560                 signr = sig;
561
562         done = 1;
563 #ifdef HAVE_EVENTFD_SUPPORT
564 {
565         u64 tmp = 1;
566         /*
567          * It is possible for this signal handler to run after done is checked
568          * in the main loop, but before the perf counter fds are polled. If this
569          * happens, the poll() will continue to wait even though done is set,
570          * and will only break out if either another signal is received, or the
571          * counters are ready for read. To ensure the poll() doesn't sleep when
572          * done is set, use an eventfd (done_fd) to wake up the poll().
573          */
574         if (write(done_fd, &tmp, sizeof(tmp)) < 0)
575                 pr_err("failed to signal wakeup fd, error: %m\n");
576 }
577 #endif // HAVE_EVENTFD_SUPPORT
578 }
579
580 static void sigsegv_handler(int sig)
581 {
582         perf_hooks__recover();
583         sighandler_dump_stack(sig);
584 }
585
586 static void record__sig_exit(void)
587 {
588         if (signr == -1)
589                 return;
590
591         signal(signr, SIG_DFL);
592         raise(signr);
593 }
594
595 #ifdef HAVE_AUXTRACE_SUPPORT
596
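/*
 * Write one AUX area trace chunk (the auxtrace event header, up to two
 * data ranges and trailing padding) to the output, recording its file
 * offset in the auxtrace index when writing to a single, non-pipe file.
 */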
597 static int record__process_auxtrace(struct perf_tool *tool,
598                                     struct mmap *map,
599                                     union perf_event *event, void *data1,
600                                     size_t len1, void *data2, size_t len2)
601 {
602         struct record *rec = container_of(tool, struct record, tool);
603         struct perf_data *data = &rec->data;
604         size_t padding;
605         u8 pad[8] = {0};
606
607         if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
608                 off_t file_offset;
609                 int fd = perf_data__fd(data);
610                 int err;
611
612                 file_offset = lseek(fd, 0, SEEK_CUR);
613                 if (file_offset == -1)
614                         return -1;
615                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
616                                                      event, file_offset);
617                 if (err)
618                         return err;
619         }
620
621         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
622         padding = (len1 + len2) & 7;
623         if (padding)
624                 padding = 8 - padding;
625
626         record__write(rec, map, event, event->header.size);
627         record__write(rec, map, data1, len1);
628         if (len2)
629                 record__write(rec, map, data2, len2);
630         record__write(rec, map, &pad, padding);
631
632         return 0;
633 }
634
635 static int record__auxtrace_mmap_read(struct record *rec,
636                                       struct mmap *map)
637 {
638         int ret;
639
640         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
641                                   record__process_auxtrace);
642         if (ret < 0)
643                 return ret;
644
645         if (ret)
646                 rec->samples++;
647
648         return 0;
649 }
650
651 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
652                                                struct mmap *map)
653 {
654         int ret;
655
656         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
657                                            record__process_auxtrace,
658                                            rec->opts.auxtrace_snapshot_size);
659         if (ret < 0)
660                 return ret;
661
662         if (ret)
663                 rec->samples++;
664
665         return 0;
666 }
667
668 static int record__auxtrace_read_snapshot_all(struct record *rec)
669 {
670         int i;
671         int rc = 0;
672
673         for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
674                 struct mmap *map = &rec->evlist->mmap[i];
675
676                 if (!map->auxtrace_mmap.base)
677                         continue;
678
679                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
680                         rc = -1;
681                         goto out;
682                 }
683         }
684 out:
685         return rc;
686 }
687
688 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
689 {
690         pr_debug("Recording AUX area tracing snapshot\n");
691         if (record__auxtrace_read_snapshot_all(rec) < 0) {
692                 trigger_error(&auxtrace_snapshot_trigger);
693         } else {
694                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
695                         trigger_error(&auxtrace_snapshot_trigger);
696                 else
697                         trigger_ready(&auxtrace_snapshot_trigger);
698         }
699 }
700
701 static int record__auxtrace_snapshot_exit(struct record *rec)
702 {
703         if (trigger_is_error(&auxtrace_snapshot_trigger))
704                 return 0;
705
706         if (!auxtrace_record__snapshot_started &&
707             auxtrace_record__snapshot_start(rec->itr))
708                 return -1;
709
710         record__read_auxtrace_snapshot(rec, true);
711         if (trigger_is_error(&auxtrace_snapshot_trigger))
712                 return -1;
713
714         return 0;
715 }
716
717 static int record__auxtrace_init(struct record *rec)
718 {
719         int err;
720
721         if (!rec->itr) {
722                 rec->itr = auxtrace_record__init(rec->evlist, &err);
723                 if (err)
724                         return err;
725         }
726
727         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
728                                               rec->opts.auxtrace_snapshot_opts);
729         if (err)
730                 return err;
731
732         err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
733                                             rec->opts.auxtrace_sample_opts);
734         if (err)
735                 return err;
736
737         auxtrace_regroup_aux_output(rec->evlist);
738
739         return auxtrace_parse_filters(rec->evlist);
740 }
741
742 #else
743
744 static inline
745 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
746                                struct mmap *map __maybe_unused)
747 {
748         return 0;
749 }
750
751 static inline
752 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
753                                     bool on_exit __maybe_unused)
754 {
755 }
756
757 static inline
758 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
759 {
760         return 0;
761 }
762
763 static inline
764 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
765 {
766         return 0;
767 }
768
769 static int record__auxtrace_init(struct record *rec __maybe_unused)
770 {
771         return 0;
772 }
773
774 #endif
775
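/*
 * Unless such an event is already configured, add a system-wide dummy
 * event that collects text poke and ksymbol events on every CPU, so that
 * modifications to kernel text are captured in the trace.
 */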
776 static int record__config_text_poke(struct evlist *evlist)
777 {
778         struct evsel *evsel;
779         int err;
780
781         /* Nothing to do if text poke is already configured */
782         evlist__for_each_entry(evlist, evsel) {
783                 if (evsel->core.attr.text_poke)
784                         return 0;
785         }
786
787         err = parse_events(evlist, "dummy:u", NULL);
788         if (err)
789                 return err;
790
791         evsel = evlist__last(evlist);
792
793         evsel->core.attr.freq = 0;
794         evsel->core.attr.sample_period = 1;
795         evsel->core.attr.text_poke = 1;
796         evsel->core.attr.ksymbol = 1;
797
798         evsel->core.system_wide = true;
799         evsel->no_aux_samples = true;
800         evsel->immediate = true;
801
802         /* Text poke must be collected on all CPUs */
803         perf_cpu_map__put(evsel->core.own_cpus);
804         evsel->core.own_cpus = perf_cpu_map__new(NULL);
805         perf_cpu_map__put(evsel->core.cpus);
806         evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
807
808         evsel__set_sample_bit(evsel, TIME);
809
810         return 0;
811 }
812
813 static bool record__kcore_readable(struct machine *machine)
814 {
815         char kcore[PATH_MAX];
816         int fd;
817
818         scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
819
820         fd = open(kcore, O_RDONLY);
821         if (fd < 0)
822                 return false;
823
824         close(fd);
825
826         return true;
827 }
828
829 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
830 {
831         char from_dir[PATH_MAX];
832         char kcore_dir[PATH_MAX];
833         int ret;
834
835         snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
836
837         ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
838         if (ret)
839                 return ret;
840
841         return kcore_copy(from_dir, kcore_dir);
842 }
843
844 static int record__mmap_evlist(struct record *rec,
845                                struct evlist *evlist)
846 {
847         struct record_opts *opts = &rec->opts;
848         bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
849                                   opts->auxtrace_sample_mode;
850         char msg[512];
851
852         if (opts->affinity != PERF_AFFINITY_SYS)
853                 cpu__setup_cpunode_map();
854
855         if (evlist__mmap_ex(evlist, opts->mmap_pages,
856                                  opts->auxtrace_mmap_pages,
857                                  auxtrace_overwrite,
858                                  opts->nr_cblocks, opts->affinity,
859                                  opts->mmap_flush, opts->comp_level) < 0) {
860                 if (errno == EPERM) {
861                         pr_err("Permission error mapping pages.\n"
862                                "Consider increasing "
863                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
864                                "or try again with a smaller value of -m/--mmap_pages.\n"
865                                "(current value: %u,%u)\n",
866                                opts->mmap_pages, opts->auxtrace_mmap_pages);
867                         return -errno;
868                 } else {
869                         pr_err("failed to mmap with %d (%s)\n", errno,
870                                 str_error_r(errno, msg, sizeof(msg)));
871                         if (errno)
872                                 return -errno;
873                         else
874                                 return -EINVAL;
875                 }
876         }
877         return 0;
878 }
879
880 static int record__mmap(struct record *rec)
881 {
882         return record__mmap_evlist(rec, rec->evlist);
883 }
884
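/*
 * Open all events in the evlist, retrying with fallbacks or by breaking up
 * weak groups on failure, then apply the event filters and mmap the ring
 * buffers. A dummy tracking event is added first when needed (initial
 * delay, system wide or hybrid systems).
 */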
885 static int record__open(struct record *rec)
886 {
887         char msg[BUFSIZ];
888         struct evsel *pos;
889         struct evlist *evlist = rec->evlist;
890         struct perf_session *session = rec->session;
891         struct record_opts *opts = &rec->opts;
892         int rc = 0;
893
894         /*
895          * For initial_delay, system wide or a hybrid system, we need to add a
896          * dummy event so that we can track PERF_RECORD_MMAP events during the
897          * initial delay or during event synthesis.
898          */
899         if (opts->initial_delay || target__has_cpu(&opts->target) ||
900             perf_pmu__has_hybrid()) {
901                 pos = evlist__get_tracking_event(evlist);
902                 if (!evsel__is_dummy_event(pos)) {
903                         /* Set up dummy event. */
904                         if (evlist__add_dummy(evlist))
905                                 return -ENOMEM;
906                         pos = evlist__last(evlist);
907                         evlist__set_tracking_event(evlist, pos);
908                 }
909
910                 /*
911                  * Enable the dummy event when the process is forked for
912                  * initial_delay, immediately for system wide.
913                  */
914                 if (opts->initial_delay && !pos->immediate &&
915                     !target__has_cpu(&opts->target))
916                         pos->core.attr.enable_on_exec = 1;
917                 else
918                         pos->immediate = 1;
919         }
920
921         evlist__config(evlist, opts, &callchain_param);
922
923         evlist__for_each_entry(evlist, pos) {
924 try_again:
925                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
926                         if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
927                                 if (verbose > 0)
928                                         ui__warning("%s\n", msg);
929                                 goto try_again;
930                         }
931                         if ((errno == EINVAL || errno == EBADF) &&
932                             pos->core.leader != &pos->core &&
933                             pos->weak_group) {
934                                 pos = evlist__reset_weak_group(evlist, pos, true);
935                                 goto try_again;
936                         }
937                         rc = -errno;
938                         evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
939                         ui__error("%s\n", msg);
940                         goto out;
941                 }
942
943                 pos->supported = true;
944         }
945
946         if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
947                 pr_warning(
948 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
949 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
950 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
951 "file is not found in the buildid cache or in the vmlinux path.\n\n"
952 "Samples in kernel modules won't be resolved at all.\n\n"
953 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
954 "even with a suitable vmlinux or kallsyms file.\n\n");
955         }
956
957         if (evlist__apply_filters(evlist, &pos)) {
958                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
959                         pos->filter, evsel__name(pos), errno,
960                         str_error_r(errno, msg, sizeof(msg)));
961                 rc = -1;
962                 goto out;
963         }
964
965         rc = record__mmap(rec);
966         if (rc)
967                 goto out;
968
969         session->evlist = evlist;
970         perf_session__set_id_hdr_size(session);
971 out:
972         return rc;
973 }
974
975 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
976 {
977         if (rec->evlist->first_sample_time == 0)
978                 rec->evlist->first_sample_time = sample_time;
979
980         if (sample_time)
981                 rec->evlist->last_sample_time = sample_time;
982 }
983
984 static int process_sample_event(struct perf_tool *tool,
985                                 union perf_event *event,
986                                 struct perf_sample *sample,
987                                 struct evsel *evsel,
988                                 struct machine *machine)
989 {
990         struct record *rec = container_of(tool, struct record, tool);
991
992         set_timestamp_boundary(rec, sample->time);
993
994         if (rec->buildid_all)
995                 return 0;
996
997         rec->samples++;
998         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
999 }
1000
1001 static int process_buildids(struct record *rec)
1002 {
1003         struct perf_session *session = rec->session;
1004
1005         if (perf_data__size(&rec->data) == 0)
1006                 return 0;
1007
1008         /*
1009          * During this process, it'll load the kernel map and replace
1010          * dso->long_name with the real pathname it found.  In this case
1011          * we prefer the vmlinux path like
1012          *   /lib/modules/3.16.4/build/vmlinux
1013          *
1014          * rather than build-id path (in debug directory).
1015          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1016          */
1017         symbol_conf.ignore_vmlinux_buildid = true;
1018
1019         /*
1020          * If --buildid-all is given, it marks all DSOs regardless of hits,
1021          * so there is no need to process samples. But if timestamp_boundary
1022          * is enabled, it still needs to walk all samples to get the
1023          * timestamps of the first/last samples.
1024          */
1025         if (rec->buildid_all && !rec->timestamp_boundary)
1026                 rec->tool.sample = NULL;
1027
1028         return perf_session__process_events(session);
1029 }
1030
1031 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1032 {
1033         int err;
1034         struct perf_tool *tool = data;
1035         /*
1036          * As for the guest kernel, when processing the record&report
1037          * subcommands we arrange the module mmaps prior to the guest kernel
1038          * mmap and trigger a dso preload, because default guest module
1039          * symbols are loaded from guest kallsyms instead of
1040          * /lib/modules/XXX/XXX. This avoids missing symbols when the first
1041          * address is in a module instead of in the guest kernel.
1042          */
1043         err = perf_event__synthesize_modules(tool, process_synthesized_event,
1044                                              machine);
1045         if (err < 0)
1046                 pr_err("Couldn't record guest kernel [%d]'s module"
1047                        " information.\n", machine->pid);
1048
1049         /*
1050          * We use _stext for the guest kernel because the guest kernel's
1051          * /proc/kallsyms sometimes has no _text.
1052          */
1053         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1054                                                  machine);
1055         if (err < 0)
1056                 pr_err("Couldn't record guest kernel [%d]'s reference"
1057                        " relocation symbol.\n", machine->pid);
1058 }
1059
1060 static struct perf_event_header finished_round_event = {
1061         .size = sizeof(struct perf_event_header),
1062         .type = PERF_RECORD_FINISHED_ROUND,
1063 };
1064
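/*
 * With --affinity=node|cpu, migrate the recording thread to the CPU mask
 * of the ring buffer it is about to read, keeping the reads local to the
 * buffer's NUMA node or CPU.
 */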
1065 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1066 {
1067         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1068             !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1069                           rec->affinity_mask.nbits)) {
1070                 bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1071                 bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1072                           map->affinity_mask.bits, rec->affinity_mask.nbits);
1073                 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1074                                   (cpu_set_t *)rec->affinity_mask.bits);
1075                 if (verbose == 2)
1076                         mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1077         }
1078 }
1079
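/*
 * Callback used by zstd_compress_stream_to_records(): when called with
 * increment == 0 it initializes a new PERF_RECORD_COMPRESSED header,
 * otherwise it grows the header size by 'increment' compressed bytes.
 */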
1080 static size_t process_comp_header(void *record, size_t increment)
1081 {
1082         struct perf_record_compressed *event = record;
1083         size_t size = sizeof(*event);
1084
1085         if (increment) {
1086                 event->header.size += increment;
1087                 return increment;
1088         }
1089
1090         event->header.type = PERF_RECORD_COMPRESSED;
1091         event->header.size = size;
1092
1093         return size;
1094 }
1095
1096 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1097                             void *src, size_t src_size)
1098 {
1099         size_t compressed;
1100         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1101
1102         compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1103                                                      max_record_size, process_comp_header);
1104
1105         session->bytes_transferred += src_size;
1106         session->bytes_compressed  += compressed;
1107
1108         return compressed;
1109 }
1110
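/*
 * Drain the evlist's ring buffers into the output file, through the AIO or
 * the serial push path, read AUX area data where present, and emit a
 * PERF_RECORD_FINISHED_ROUND event if at least one event was written.
 */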
1111 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1112                                     bool overwrite, bool synch)
1113 {
1114         u64 bytes_written = rec->bytes_written;
1115         int i;
1116         int rc = 0;
1117         struct mmap *maps;
1118         int trace_fd = rec->data.file.fd;
1119         off_t off = 0;
1120
1121         if (!evlist)
1122                 return 0;
1123
1124         maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1125         if (!maps)
1126                 return 0;
1127
1128         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1129                 return 0;
1130
1131         if (record__aio_enabled(rec))
1132                 off = record__aio_get_pos(trace_fd);
1133
1134         for (i = 0; i < evlist->core.nr_mmaps; i++) {
1135                 u64 flush = 0;
1136                 struct mmap *map = &maps[i];
1137
1138                 if (map->core.base) {
1139                         record__adjust_affinity(rec, map);
1140                         if (synch) {
1141                                 flush = map->core.flush;
1142                                 map->core.flush = 1;
1143                         }
1144                         if (!record__aio_enabled(rec)) {
1145                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1146                                         if (synch)
1147                                                 map->core.flush = flush;
1148                                         rc = -1;
1149                                         goto out;
1150                                 }
1151                         } else {
1152                                 if (record__aio_push(rec, map, &off) < 0) {
1153                                         record__aio_set_pos(trace_fd, off);
1154                                         if (synch)
1155                                                 map->core.flush = flush;
1156                                         rc = -1;
1157                                         goto out;
1158                                 }
1159                         }
1160                         if (synch)
1161                                 map->core.flush = flush;
1162                 }
1163
1164                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1165                     !rec->opts.auxtrace_sample_mode &&
1166                     record__auxtrace_mmap_read(rec, map) != 0) {
1167                         rc = -1;
1168                         goto out;
1169                 }
1170         }
1171
1172         if (record__aio_enabled(rec))
1173                 record__aio_set_pos(trace_fd, off);
1174
1175         /*
1176          * Mark the round finished if we wrote
1177          * at least one event.
1178          */
1179         if (bytes_written != rec->bytes_written)
1180                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1181
1182         if (overwrite)
1183                 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1184 out:
1185         return rc;
1186 }
1187
1188 static int record__mmap_read_all(struct record *rec, bool synch)
1189 {
1190         int err;
1191
1192         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1193         if (err)
1194                 return err;
1195
1196         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1197 }
1198
1199 static void record__init_features(struct record *rec)
1200 {
1201         struct perf_session *session = rec->session;
1202         int feat;
1203
1204         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1205                 perf_header__set_feat(&session->header, feat);
1206
1207         if (rec->no_buildid)
1208                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1209
1210         if (!have_tracepoints(&rec->evlist->core.entries))
1211                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1212
1213         if (!rec->opts.branch_stack)
1214                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1215
1216         if (!rec->opts.full_auxtrace)
1217                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1218
1219         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1220                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1221
1222         if (!rec->opts.use_clockid)
1223                 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1224
1225         perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1226         if (!record__comp_enabled(rec))
1227                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1228
1229         perf_header__clear_feat(&session->header, HEADER_STAT);
1230 }
1231
1232 static void
1233 record__finish_output(struct record *rec)
1234 {
1235         struct perf_data *data = &rec->data;
1236         int fd = perf_data__fd(data);
1237
1238         if (data->is_pipe)
1239                 return;
1240
1241         rec->session->header.data_size += rec->bytes_written;
1242         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1243
1244         if (!rec->no_buildid) {
1245                 process_buildids(rec);
1246
1247                 if (rec->buildid_all)
1248                         dsos__hit_all(rec->session);
1249         }
1250         perf_session__write_header(rec->session, rec->evlist, fd, true);
1251
1252         return;
1253 }
1254
1255 static int record__synthesize_workload(struct record *rec, bool tail)
1256 {
1257         int err;
1258         struct perf_thread_map *thread_map;
1259         bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1260
1261         if (rec->opts.tail_synthesize != tail)
1262                 return 0;
1263
1264         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1265         if (thread_map == NULL)
1266                 return -1;
1267
1268         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1269                                                  process_synthesized_event,
1270                                                  &rec->session->machines.host,
1271                                                  needs_mmap,
1272                                                  rec->opts.sample_address);
1273         perf_thread_map__put(thread_map);
1274         return err;
1275 }
1276
1277 static int record__synthesize(struct record *rec, bool tail);
1278
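/*
 * Finish the current output file and switch to a new one named by the
 * current timestamp. When a maximum number of files is configured, the
 * oldest file in the rotation is removed, and unless we are exiting the
 * tracking events are synthesized again into the new file.
 */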
1279 static int
1280 record__switch_output(struct record *rec, bool at_exit)
1281 {
1282         struct perf_data *data = &rec->data;
1283         int fd, err;
1284         char *new_filename;
1285
1286         /* Same size as a real timestamp, e.g. "2015122520103046" */
1287         char timestamp[] = "InvalidTimestamp";
1288
1289         record__aio_mmap_read_sync(rec);
1290
1291         record__synthesize(rec, true);
1292         if (target__none(&rec->opts.target))
1293                 record__synthesize_workload(rec, true);
1294
1295         rec->samples = 0;
1296         record__finish_output(rec);
1297         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1298         if (err) {
1299                 pr_err("Failed to get current timestamp\n");
1300                 return -EINVAL;
1301         }
1302
1303         fd = perf_data__switch(data, timestamp,
1304                                     rec->session->header.data_offset,
1305                                     at_exit, &new_filename);
1306         if (fd >= 0 && !at_exit) {
1307                 rec->bytes_written = 0;
1308                 rec->session->header.data_size = 0;
1309         }
1310
1311         if (!quiet)
1312                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1313                         data->path, timestamp);
1314
1315         if (rec->switch_output.num_files) {
1316                 int n = rec->switch_output.cur_file + 1;
1317
1318                 if (n >= rec->switch_output.num_files)
1319                         n = 0;
1320                 rec->switch_output.cur_file = n;
1321                 if (rec->switch_output.filenames[n]) {
1322                         remove(rec->switch_output.filenames[n]);
1323                         zfree(&rec->switch_output.filenames[n]);
1324                 }
1325                 rec->switch_output.filenames[n] = new_filename;
1326         } else {
1327                 free(new_filename);
1328         }
1329
1330         /* Output tracking events */
1331         if (!at_exit) {
1332                 record__synthesize(rec, false);
1333
1334                 /*
1335                  * In 'perf record --switch-output' without -a,
1336                  * record__synthesize() in record__switch_output() won't
1337                  * generate tracking events because there's no thread_map
1338                  * in the evlist, which causes the newly created perf.data
1339                  * to lack map and comm information.
1340                  * Create a fake thread_map and directly call
1341                  * perf_event__synthesize_thread_map() for those events.
1342                  */
1343                 if (target__none(&rec->opts.target))
1344                         record__synthesize_workload(rec, false);
1345         }
1346         return fd;
1347 }
1348
1349 static volatile int workload_exec_errno;
1350
1351 /*
1352  * evlist__prepare_workload() will send a SIGUSR1
1353  * if the fork fails, since we asked for it by setting
1354  * its want_signal to true.
1355  */
1356 static void workload_exec_failed_signal(int signo __maybe_unused,
1357                                         siginfo_t *info,
1358                                         void *ucontext __maybe_unused)
1359 {
1360         workload_exec_errno = info->si_value.sival_int;
1361         done = 1;
1362         child_finished = 1;
1363 }
1364
1365 static void snapshot_sig_handler(int sig);
1366 static void alarm_sig_handler(int sig);
1367
1368 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1369 {
1370         if (evlist) {
1371                 if (evlist->mmap && evlist->mmap[0].core.base)
1372                         return evlist->mmap[0].core.base;
1373                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1374                         return evlist->overwrite_mmap[0].core.base;
1375         }
1376         return NULL;
1377 }
1378
1379 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1380 {
1381         const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1382         if (pc)
1383                 return pc;
1384         return NULL;
1385 }
1386
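/*
 * Synthesize the non-sample events that describe pre-existing system
 * state: kernel and module mmaps, threads, the cpu map, bpf and cgroup
 * events, auxtrace info, etc., so that the samples can be resolved later.
 */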
1387 static int record__synthesize(struct record *rec, bool tail)
1388 {
1389         struct perf_session *session = rec->session;
1390         struct machine *machine = &session->machines.host;
1391         struct perf_data *data = &rec->data;
1392         struct record_opts *opts = &rec->opts;
1393         struct perf_tool *tool = &rec->tool;
1394         int err = 0;
1395         event_op f = process_synthesized_event;
1396
1397         if (rec->opts.tail_synthesize != tail)
1398                 return 0;
1399
1400         if (data->is_pipe) {
1401                 err = perf_event__synthesize_for_pipe(tool, session, data,
1402                                                       process_synthesized_event);
1403                 if (err < 0)
1404                         goto out;
1405
1406                 rec->bytes_written += err;
1407         }
1408
1409         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1410                                           process_synthesized_event, machine);
1411         if (err)
1412                 goto out;
1413
1414         /* Synthesize id_index before auxtrace_info */
1415         if (rec->opts.auxtrace_sample_mode || rec->opts.full_auxtrace) {
1416                 err = perf_event__synthesize_id_index(tool,
1417                                                       process_synthesized_event,
1418                                                       session->evlist, machine);
1419                 if (err)
1420                         goto out;
1421         }
1422
1423         if (rec->opts.full_auxtrace) {
1424                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1425                                         session, process_synthesized_event);
1426                 if (err)
1427                         goto out;
1428         }
1429
1430         if (!evlist__exclude_kernel(rec->evlist)) {
1431                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1432                                                          machine);
1433                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1434                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1435                                    "Check /proc/kallsyms permission or run as root.\n");
1436
1437                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1438                                                      machine);
1439                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1440                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1441                                    "Check /proc/modules permission or run as root.\n");
1442         }
1443
1444         if (perf_guest) {
1445                 machines__process_guests(&session->machines,
1446                                          perf_event__synthesize_guest_os, tool);
1447         }
1448
1449         err = perf_event__synthesize_extra_attr(&rec->tool,
1450                                                 rec->evlist,
1451                                                 process_synthesized_event,
1452                                                 data->is_pipe);
1453         if (err)
1454                 goto out;
1455
1456         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1457                                                  process_synthesized_event,
1458                                                 NULL);
1459         if (err < 0) {
1460                 pr_err("Couldn't synthesize thread map.\n");
1461                 return err;
1462         }
1463
1464         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1465                                              process_synthesized_event, NULL);
1466         if (err < 0) {
1467                 pr_err("Couldn't synthesize cpu map.\n");
1468                 return err;
1469         }
1470
1471         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1472                                                 machine, opts);
1473         if (err < 0)
1474                 pr_warning("Couldn't synthesize bpf events.\n");
1475
1476         if (rec->opts.synth & PERF_SYNTH_CGROUP) {
1477                 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1478                                                      machine);
1479                 if (err < 0)
1480                         pr_warning("Couldn't synthesize cgroup events.\n");
1481         }
1482
1483         if (rec->opts.nr_threads_synthesize > 1) {
1484                 perf_set_multithreaded();
1485                 f = process_locked_synthesized_event;
1486         }
1487
1488         if (rec->opts.synth & PERF_SYNTH_TASK) {
1489                 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1490
1491                 err = __machine__synthesize_threads(machine, tool, &opts->target,
1492                                                     rec->evlist->core.threads,
1493                                                     f, needs_mmap, opts->sample_address,
1494                                                     rec->opts.nr_threads_synthesize);
1495         }
1496
1497         if (rec->opts.nr_threads_synthesize > 1)
1498                 perf_set_singlethreaded();
1499
1500 out:
1501         return err;
1502 }
1503
1504 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1505 {
1506         struct record *rec = data;
1507         pthread_kill(rec->thread_id, SIGUSR2);
1508         return 0;
1509 }
1510
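/*
 * Set up the side band evlist: hook up the --switch-output-event callback if
 * one was requested, add the PERF_RECORD_BPF_EVENT side band event when
 * libbpf support is built in, then start the side band thread.
 */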
1511 static int record__setup_sb_evlist(struct record *rec)
1512 {
1513         struct record_opts *opts = &rec->opts;
1514
1515         if (rec->sb_evlist != NULL) {
1516                 /*
1517                  * We get here if --switch-output-event populated the
1518                  * sb_evlist, so associate a callback that will send a SIGUSR2
1519                  * to the main thread.
1520                  */
1521                 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1522                 rec->thread_id = pthread_self();
1523         }
1524 #ifdef HAVE_LIBBPF_SUPPORT
1525         if (!opts->no_bpf_event) {
1526                 if (rec->sb_evlist == NULL) {
1527                         rec->sb_evlist = evlist__new();
1528
1529                         if (rec->sb_evlist == NULL) {
1530                                 pr_err("Couldn't create side band evlist.\n");
1531                                 return -1;
1532                         }
1533                 }
1534
1535                 if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1536                         pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
1537                         return -1;
1538                 }
1539         }
1540 #endif
1541         if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1542                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1543                 opts->no_bpf_event = true;
1544         }
1545
1546         return 0;
1547 }
1548
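/*
 * When --clockid is used, store the clockid plus reference gettimeofday() and
 * clock_gettime() timestamps in the session header, so that later analysis
 * can relate perf timestamps to wall-clock time.
 */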
1549 static int record__init_clock(struct record *rec)
1550 {
1551         struct perf_session *session = rec->session;
1552         struct timespec ref_clockid;
1553         struct timeval ref_tod;
1554         u64 ref;
1555
1556         if (!rec->opts.use_clockid)
1557                 return 0;
1558
1559         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1560                 session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1561
1562         session->header.env.clock.clockid = rec->opts.clockid;
1563
1564         if (gettimeofday(&ref_tod, NULL) != 0) {
1565                 pr_err("gettimeofday failed, cannot set reference time.\n");
1566                 return -1;
1567         }
1568
1569         if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1570                 pr_err("clock_gettime failed, cannot set reference time.\n");
1571                 return -1;
1572         }
1573
1574         ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1575               (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1576
1577         session->header.env.clock.tod_ns = ref;
1578
1579         ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1580               (u64) ref_clockid.tv_nsec;
1581
1582         session->header.env.clock.clockid_ns = ref;
1583         return 0;
1584 }
1585
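/*
 * Kick off an AUX area tracing snapshot if the trigger is armed; a failure to
 * start the snapshot puts the trigger into the error state.
 */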
1586 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1587 {
1588         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1589                 trigger_hit(&auxtrace_snapshot_trigger);
1590                 auxtrace_record__snapshot_started = 1;
1591                 if (auxtrace_record__snapshot_start(rec->itr))
1592                         trigger_error(&auxtrace_snapshot_trigger);
1593         }
1594 }
1595
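/*
 * On hybrid systems, rewrite plain hybrid event names as "pmu/event/" so that
 * samples from the different core PMUs can be told apart.
 */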
1596 static void record__uniquify_name(struct record *rec)
1597 {
1598         struct evsel *pos;
1599         struct evlist *evlist = rec->evlist;
1600         char *new_name;
1601         int ret;
1602
1603         if (!perf_pmu__has_hybrid())
1604                 return;
1605
1606         evlist__for_each_entry(evlist, pos) {
1607                 if (!evsel__is_hybrid(pos))
1608                         continue;
1609
1610                 if (strchr(pos->name, '/'))
1611                         continue;
1612
1613                 ret = asprintf(&new_name, "%s/%s/",
1614                                pos->pmu_name, pos->name);
1615                 if (ret >= 0) {
1616                         free(pos->name);
1617                         pos->name = new_name;
1618                 }
1619         }
1620 }
1621
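/*
 * The workhorse of 'perf record': create the session, open and mmap the
 * events, synthesize the pre-existing state, optionally fork the workload,
 * then loop draining the mmap ring buffers until the workload exits or the
 * user stops the record.
 */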
1622 static int __cmd_record(struct record *rec, int argc, const char **argv)
1623 {
1624         int err;
1625         int status = 0;
1626         unsigned long waking = 0;
1627         const bool forks = argc > 0;
1628         struct perf_tool *tool = &rec->tool;
1629         struct record_opts *opts = &rec->opts;
1630         struct perf_data *data = &rec->data;
1631         struct perf_session *session;
1632         bool disabled = false, draining = false;
1633         int fd;
1634         float ratio = 0;
1635         enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1636
1637         atexit(record__sig_exit);
1638         signal(SIGCHLD, sig_handler);
1639         signal(SIGINT, sig_handler);
1640         signal(SIGTERM, sig_handler);
1641         signal(SIGSEGV, sigsegv_handler);
1642
1643         if (rec->opts.record_namespaces)
1644                 tool->namespace_events = true;
1645
1646         if (rec->opts.record_cgroup) {
1647 #ifdef HAVE_FILE_HANDLE
1648                 tool->cgroup_events = true;
1649 #else
1650                 pr_err("cgroup tracking is not supported\n");
1651                 return -1;
1652 #endif
1653         }
1654
1655         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1656                 signal(SIGUSR2, snapshot_sig_handler);
1657                 if (rec->opts.auxtrace_snapshot_mode)
1658                         trigger_on(&auxtrace_snapshot_trigger);
1659                 if (rec->switch_output.enabled)
1660                         trigger_on(&switch_output_trigger);
1661         } else {
1662                 signal(SIGUSR2, SIG_IGN);
1663         }
1664
1665         session = perf_session__new(data, tool);
1666         if (IS_ERR(session)) {
1667                 pr_err("Perf session creation failed.\n");
1668                 return PTR_ERR(session);
1669         }
1670
1671         fd = perf_data__fd(data);
1672         rec->session = session;
1673
1674         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1675                 pr_err("Compression initialization failed.\n");
1676                 return -1;
1677         }
1678 #ifdef HAVE_EVENTFD_SUPPORT
1679         done_fd = eventfd(0, EFD_NONBLOCK);
1680         if (done_fd < 0) {
1681                 pr_err("Failed to create wakeup eventfd, error: %m\n");
1682                 status = -1;
1683                 goto out_delete_session;
1684         }
1685         err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
1686         if (err < 0) {
1687                 pr_err("Failed to add wakeup eventfd to poll list\n");
1688                 status = err;
1689                 goto out_delete_session;
1690         }
1691 #endif // HAVE_EVENTFD_SUPPORT
1692
1693         session->header.env.comp_type  = PERF_COMP_ZSTD;
1694         session->header.env.comp_level = rec->opts.comp_level;
1695
1696         if (rec->opts.kcore &&
1697             !record__kcore_readable(&session->machines.host)) {
1698                 pr_err("ERROR: kcore is not readable.\n");
1699                 return -1;
1700         }
1701
1702         if (record__init_clock(rec))
1703                 return -1;
1704
1705         record__init_features(rec);
1706
1707         if (forks) {
1708                 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1709                                                workload_exec_failed_signal);
1710                 if (err < 0) {
1711                         pr_err("Couldn't run the workload!\n");
1712                         status = err;
1713                         goto out_delete_session;
1714                 }
1715         }
1716
1717         /*
1718          * If we have just a single event and are sending data
1719          * through a pipe, we need to force sample id allocation,
1720          * because we synthesize the event name through the pipe
1721          * and need the id for that.
1722          */
1723         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1724                 rec->opts.sample_id = true;
1725
1726         record__uniquify_name(rec);
1727
1728         if (record__open(rec) != 0) {
1729                 err = -1;
1730                 goto out_child;
1731         }
1732         session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1733
1734         if (rec->opts.kcore) {
1735                 err = record__kcore_copy(&session->machines.host, data);
1736                 if (err) {
1737                         pr_err("ERROR: Failed to copy kcore\n");
1738                         goto out_child;
1739                 }
1740         }
1741
1742         err = bpf__apply_obj_config();
1743         if (err) {
1744                 char errbuf[BUFSIZ];
1745
1746                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1747                 pr_err("ERROR: Apply config to BPF failed: %s\n",
1748                          errbuf);
1749                 goto out_child;
1750         }
1751
1752         /*
1753          * Normally perf_session__new would do this, but it doesn't have the
1754          * evlist.
1755          */
1756         if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1757                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1758                 rec->tool.ordered_events = false;
1759         }
1760
1761         if (!rec->evlist->core.nr_groups)
1762                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1763
1764         if (data->is_pipe) {
1765                 err = perf_header__write_pipe(fd);
1766                 if (err < 0)
1767                         goto out_child;
1768         } else {
1769                 err = perf_session__write_header(session, rec->evlist, fd, false);
1770                 if (err < 0)
1771                         goto out_child;
1772         }
1773
1774         err = -1;
1775         if (!rec->no_buildid
1776             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1777                 pr_err("Couldn't generate buildids. "
1778                        "Use --no-buildid to profile anyway.\n");
1779                 goto out_child;
1780         }
1781
1782         err = record__setup_sb_evlist(rec);
1783         if (err)
1784                 goto out_child;
1785
1786         err = record__synthesize(rec, false);
1787         if (err < 0)
1788                 goto out_child;
1789
1790         if (rec->realtime_prio) {
1791                 struct sched_param param;
1792
1793                 param.sched_priority = rec->realtime_prio;
1794                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1795                         pr_err("Could not set realtime priority.\n");
1796                         err = -1;
1797                         goto out_child;
1798                 }
1799         }
1800
1801         /*
1802          * When perf is starting the traced process, all the events
1803          * (apart from group members) have enable_on_exec=1 set,
1804          * so don't spoil it by prematurely enabling them.
1805          */
1806         if (!target__none(&opts->target) && !opts->initial_delay)
1807                 evlist__enable(rec->evlist);
1808
1809         /*
1810          * Let the child rip
1811          */
1812         if (forks) {
1813                 struct machine *machine = &session->machines.host;
1814                 union perf_event *event;
1815                 pid_t tgid;
1816
1817                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1818                 if (event == NULL) {
1819                         err = -ENOMEM;
1820                         goto out_child;
1821                 }
1822
1823                 /*
1824                  * Some H/W events are generated before the COMM event,
1825                  * which is emitted during exec(), so perf script cannot
1826                  * see the correct process name for those events.
1827                  * Synthesize a COMM event to prevent it.
1828                  */
1829                 tgid = perf_event__synthesize_comm(tool, event,
1830                                                    rec->evlist->workload.pid,
1831                                                    process_synthesized_event,
1832                                                    machine);
1833                 free(event);
1834
1835                 if (tgid == -1)
1836                         goto out_child;
1837
1838                 event = malloc(sizeof(event->namespaces) +
1839                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1840                                machine->id_hdr_size);
1841                 if (event == NULL) {
1842                         err = -ENOMEM;
1843                         goto out_child;
1844                 }
1845
1846                 /*
1847                  * Synthesize NAMESPACES event for the command specified.
1848                  */
1849                 perf_event__synthesize_namespaces(tool, event,
1850                                                   rec->evlist->workload.pid,
1851                                                   tgid, process_synthesized_event,
1852                                                   machine);
1853                 free(event);
1854
1855                 evlist__start_workload(rec->evlist);
1856         }
1857
1858         if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1859                 goto out_child;
1860
1861         if (opts->initial_delay) {
1862                 pr_info(EVLIST_DISABLED_MSG);
1863                 if (opts->initial_delay > 0) {
1864                         usleep(opts->initial_delay * USEC_PER_MSEC);
1865                         evlist__enable(rec->evlist);
1866                         pr_info(EVLIST_ENABLED_MSG);
1867                 }
1868         }
1869
1870         trigger_ready(&auxtrace_snapshot_trigger);
1871         trigger_ready(&switch_output_trigger);
1872         perf_hooks__invoke_record_start();
1873         for (;;) {
1874                 unsigned long long hits = rec->samples;
1875
1876                 /*
1877                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
1878                  * here: when done == true and hits != rec->samples in
1879                  * the previous round.
1880                  *
1881                  * evlist__toggle_bkw_mmap() ensures we never convert
1882                  * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1883                  */
1884                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1885                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1886
1887                 if (record__mmap_read_all(rec, false) < 0) {
1888                         trigger_error(&auxtrace_snapshot_trigger);
1889                         trigger_error(&switch_output_trigger);
1890                         err = -1;
1891                         goto out_child;
1892                 }
1893
1894                 if (auxtrace_record__snapshot_started) {
1895                         auxtrace_record__snapshot_started = 0;
1896                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
1897                                 record__read_auxtrace_snapshot(rec, false);
1898                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1899                                 pr_err("AUX area tracing snapshot failed\n");
1900                                 err = -1;
1901                                 goto out_child;
1902                         }
1903                 }
1904
1905                 if (trigger_is_hit(&switch_output_trigger)) {
1906                         /*
1907                          * If switch_output_trigger is hit, the data in the
1908                          * overwritable ring buffer should have been collected,
1909                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1910                          *
1911                          * If SIGUSR2 was raised after or during record__mmap_read_all(),
1912                          * record__mmap_read_all() didn't collect data from the
1913                          * overwritable ring buffer. Read again.
1914                          */
1915                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1916                                 continue;
1917                         trigger_ready(&switch_output_trigger);
1918
1919                         /*
1920                          * Reenable events in overwrite ring buffer after
1921                          * record__mmap_read_all(): we should have collected
1922                          * data from it.
1923                          */
1924                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1925
1926                         if (!quiet)
1927                                 fprintf(stderr, "[ perf record: dump data: Woken up %lu times ]\n",
1928                                         waking);
1929                         waking = 0;
1930                         fd = record__switch_output(rec, false);
1931                         if (fd < 0) {
1932                                 pr_err("Failed to switch to new file\n");
1933                                 trigger_error(&switch_output_trigger);
1934                                 err = fd;
1935                                 goto out_child;
1936                         }
1937
1938                         /* re-arm the alarm */
1939                         if (rec->switch_output.time)
1940                                 alarm(rec->switch_output.time);
1941                 }
1942
1943                 if (hits == rec->samples) {
1944                         if (done || draining)
1945                                 break;
1946                         err = evlist__poll(rec->evlist, -1);
1947                         /*
1948                          * Propagate error, only if there's any. Ignore positive
1949                          * number of returned events and interrupt error.
1950                          */
1951                         if (err > 0 || (err < 0 && errno == EINTR))
1952                                 err = 0;
1953                         waking++;
1954
1955                         if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1956                                 draining = true;
1957                 }
1958
1959                 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1960                         switch (cmd) {
1961                         case EVLIST_CTL_CMD_SNAPSHOT:
1962                                 hit_auxtrace_snapshot_trigger(rec);
1963                                 evlist__ctlfd_ack(rec->evlist);
1964                                 break;
1965                         case EVLIST_CTL_CMD_STOP:
1966                                 done = 1;
1967                                 break;
1968                         case EVLIST_CTL_CMD_ACK:
1969                         case EVLIST_CTL_CMD_UNSUPPORTED:
1970                         case EVLIST_CTL_CMD_ENABLE:
1971                         case EVLIST_CTL_CMD_DISABLE:
1972                         case EVLIST_CTL_CMD_EVLIST:
1973                         case EVLIST_CTL_CMD_PING:
1974                         default:
1975                                 break;
1976                         }
1977                 }
1978
1979                 /*
1980                  * When perf is starting the traced process, the events die
1981                  * with the process at the end and we wait for that. Thus
1982                  * there is no need to disable events in this case.
1983                  */
1984                 if (done && !disabled && !target__none(&opts->target)) {
1985                         trigger_off(&auxtrace_snapshot_trigger);
1986                         evlist__disable(rec->evlist);
1987                         disabled = true;
1988                 }
1989         }
1990
1991         trigger_off(&auxtrace_snapshot_trigger);
1992         trigger_off(&switch_output_trigger);
1993
1994         if (opts->auxtrace_snapshot_on_exit)
1995                 record__auxtrace_snapshot_exit(rec);
1996
1997         if (forks && workload_exec_errno) {
1998                 char msg[STRERR_BUFSIZE], strevsels[2048];
1999                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2000
2001                 evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2002
2003                 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2004                         strevsels, argv[0], emsg);
2005                 err = -1;
2006                 goto out_child;
2007         }
2008
2009         if (!quiet)
2010                 fprintf(stderr, "[ perf record: Woken up %lu times to write data ]\n", waking);
2011
2012         if (target__none(&rec->opts.target))
2013                 record__synthesize_workload(rec, true);
2014
2015 out_child:
2016         evlist__finalize_ctlfd(rec->evlist);
2017         record__mmap_read_all(rec, true);
2018         record__aio_mmap_read_sync(rec);
2019
2020         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2021                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2022                 session->header.env.comp_ratio = ratio + 0.5;
2023         }
2024
2025         if (forks) {
2026                 int exit_status;
2027
2028                 if (!child_finished)
2029                         kill(rec->evlist->workload.pid, SIGTERM);
2030
2031                 wait(&exit_status);
2032
2033                 if (err < 0)
2034                         status = err;
2035                 else if (WIFEXITED(exit_status))
2036                         status = WEXITSTATUS(exit_status);
2037                 else if (WIFSIGNALED(exit_status))
2038                         signr = WTERMSIG(exit_status);
2039         } else
2040                 status = err;
2041
2042         record__synthesize(rec, true);
2043         /* this will be recalculated during process_buildids() */
2044         rec->samples = 0;
2045
2046         if (!err) {
2047                 if (!rec->timestamp_filename) {
2048                         record__finish_output(rec);
2049                 } else {
2050                         fd = record__switch_output(rec, true);
2051                         if (fd < 0) {
2052                                 status = fd;
2053                                 goto out_delete_session;
2054                         }
2055                 }
2056         }
2057
2058         perf_hooks__invoke_record_end();
2059
2060         if (!err && !quiet) {
2061                 char samples[128];
2062                 const char *postfix = rec->timestamp_filename ?
2063                                         ".<timestamp>" : "";
2064
2065                 if (rec->samples && !rec->opts.full_auxtrace)
2066                         scnprintf(samples, sizeof(samples),
2067                                   " (%" PRIu64 " samples)", rec->samples);
2068                 else
2069                         samples[0] = '\0';
2070
2071                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2072                         perf_data__size(data) / 1024.0 / 1024.0,
2073                         data->path, postfix, samples);
2074                 if (ratio) {
2075                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2076                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
2077                                         ratio);
2078                 }
2079                 fprintf(stderr, " ]\n");
2080         }
2081
2082 out_delete_session:
2083 #ifdef HAVE_EVENTFD_SUPPORT
2084         if (done_fd >= 0)
2085                 close(done_fd);
2086 #endif
2087         zstd_fini(&session->zstd_data);
2088         perf_session__delete(session);
2089
2090         if (!opts->no_bpf_event)
2091                 evlist__stop_sb_thread(rec->sb_evlist);
2092         return status;
2093 }
2094
2095 static void callchain_debug(struct callchain_param *callchain)
2096 {
2097         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2098
2099         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2100
2101         if (callchain->record_mode == CALLCHAIN_DWARF)
2102                 pr_debug("callchain: stack dump size %d\n",
2103                          callchain->dump_size);
2104 }
2105
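/*
 * Parse --call-graph/--no-call-graph: pick the callchain record mode and, for
 * DWARF unwinding, also enable sample address collection.
 */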
2106 int record_opts__parse_callchain(struct record_opts *record,
2107                                  struct callchain_param *callchain,
2108                                  const char *arg, bool unset)
2109 {
2110         int ret;
2111         callchain->enabled = !unset;
2112
2113         /* --no-call-graph */
2114         if (unset) {
2115                 callchain->record_mode = CALLCHAIN_NONE;
2116                 pr_debug("callchain: disabled\n");
2117                 return 0;
2118         }
2119
2120         ret = parse_callchain_record_opt(arg, callchain);
2121         if (!ret) {
2122                 /* Enable data address sampling for DWARF unwind. */
2123                 if (callchain->record_mode == CALLCHAIN_DWARF)
2124                         record->sample_address = true;
2125                 callchain_debug(callchain);
2126         }
2127
2128         return ret;
2129 }
2130
2131 int record_parse_callchain_opt(const struct option *opt,
2132                                const char *arg,
2133                                int unset)
2134 {
2135         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2136 }
2137
2138 int record_callchain_opt(const struct option *opt,
2139                          const char *arg __maybe_unused,
2140                          int unset __maybe_unused)
2141 {
2142         struct callchain_param *callchain = opt->value;
2143
2144         callchain->enabled = true;
2145
2146         if (callchain->record_mode == CALLCHAIN_NONE)
2147                 callchain->record_mode = CALLCHAIN_FP;
2148
2149         callchain_debug(callchain);
2150         return 0;
2151 }
2152
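/*
 * perf_config() callback handling the record.* config keys: build-id
 * treatment, call-graph record mode, AIO control blocks and debuginfod URLs.
 */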
2153 static int perf_record_config(const char *var, const char *value, void *cb)
2154 {
2155         struct record *rec = cb;
2156
2157         if (!strcmp(var, "record.build-id")) {
2158                 if (!strcmp(value, "cache"))
2159                         rec->no_buildid_cache = false;
2160                 else if (!strcmp(value, "no-cache"))
2161                         rec->no_buildid_cache = true;
2162                 else if (!strcmp(value, "skip"))
2163                         rec->no_buildid = true;
2164                 else if (!strcmp(value, "mmap"))
2165                         rec->buildid_mmap = true;
2166                 else
2167                         return -1;
2168                 return 0;
2169         }
2170         if (!strcmp(var, "record.call-graph")) {
2171                 var = "call-graph.record-mode";
2172                 return perf_default_config(var, value, cb);
2173         }
2174 #ifdef HAVE_AIO_SUPPORT
2175         if (!strcmp(var, "record.aio")) {
2176                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
2177                 if (!rec->opts.nr_cblocks)
2178                         rec->opts.nr_cblocks = nr_cblocks_default;
2179         }
2180 #endif
2181         if (!strcmp(var, "record.debuginfod")) {
2182                 rec->debuginfod.urls = strdup(value);
2183                 if (!rec->debuginfod.urls)
2184                         return -ENOMEM;
2185                 rec->debuginfod.set = true;
2186         }
2187
2188         return 0;
2189 }
2190
2191
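/*
 * Parse --affinity: "node" or "cpu" select how the trace reading thread is
 * migrated while draining mmap buffers; anything else keeps the default
 * system affinity.
 */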
2192 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2193 {
2194         struct record_opts *opts = (struct record_opts *)opt->value;
2195
2196         if (unset || !str)
2197                 return 0;
2198
2199         if (!strcasecmp(str, "node"))
2200                 opts->affinity = PERF_AFFINITY_NODE;
2201         else if (!strcasecmp(str, "cpu"))
2202                 opts->affinity = PERF_AFFINITY_CPU;
2203
2204         return 0;
2205 }
2206
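/*
 * Parse --max-size: accept a plain byte count or a B/K/M/G suffixed value;
 * --no-max-size resets the limit to 0.
 */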
2207 static int parse_output_max_size(const struct option *opt,
2208                                  const char *str, int unset)
2209 {
2210         unsigned long *s = (unsigned long *)opt->value;
2211         static struct parse_tag tags_size[] = {
2212                 { .tag  = 'B', .mult = 1       },
2213                 { .tag  = 'K', .mult = 1 << 10 },
2214                 { .tag  = 'M', .mult = 1 << 20 },
2215                 { .tag  = 'G', .mult = 1 << 30 },
2216                 { .tag  = 0 },
2217         };
2218         unsigned long val;
2219
2220         if (unset) {
2221                 *s = 0;
2222                 return 0;
2223         }
2224
2225         val = parse_tag_value(str, tags_size);
2226         if (val != (unsigned long) -1) {
2227                 *s = val;
2228                 return 0;
2229         }
2230
2231         return -1;
2232 }
2233
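/*
 * Parse --mmap-pages as "pages[,pages]": the first value sizes the data mmap,
 * the optional second value sizes the AUX area tracing mmap.
 */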
2234 static int record__parse_mmap_pages(const struct option *opt,
2235                                     const char *str,
2236                                     int unset __maybe_unused)
2237 {
2238         struct record_opts *opts = opt->value;
2239         char *s, *p;
2240         unsigned int mmap_pages;
2241         int ret;
2242
2243         if (!str)
2244                 return -EINVAL;
2245
2246         s = strdup(str);
2247         if (!s)
2248                 return -ENOMEM;
2249
2250         p = strchr(s, ',');
2251         if (p)
2252                 *p = '\0';
2253
2254         if (*s) {
2255                 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2256                 if (ret)
2257                         goto out_free;
2258                 opts->mmap_pages = mmap_pages;
2259         }
2260
2261         if (!p) {
2262                 ret = 0;
2263                 goto out_free;
2264         }
2265
2266         ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2267         if (ret)
2268                 goto out_free;
2269
2270         opts->auxtrace_mmap_pages = mmap_pages;
2271
2272 out_free:
2273         free(s);
2274         return ret;
2275 }
2276
2277 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
2278 {
2279 }
2280
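/*
 * Parse --control into the ctl/ack file descriptors, accepting either the
 * fd:ctl-fd[,ack-fd] or the fifo:ctl-fifo[,ack-fifo] form.
 */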
2281 static int parse_control_option(const struct option *opt,
2282                                 const char *str,
2283                                 int unset __maybe_unused)
2284 {
2285         struct record_opts *opts = opt->value;
2286
2287         return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2288 }
2289
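/*
 * Warn when the --switch-output size threshold is smaller than half of the
 * kernel mmap buffer size: data is only checked on wakeups, so the resulting
 * perf.data files can exceed the requested size.
 */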
2290 static void switch_output_size_warn(struct record *rec)
2291 {
2292         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2293         struct switch_output *s = &rec->switch_output;
2294
2295         wakeup_size /= 2;
2296
2297         if (s->size < wakeup_size) {
2298                 char buf[100];
2299
2300                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2301                 pr_warning("WARNING: switch-output data size lower than "
2302                            "wakeup kernel buffer size (%s), "
2303                            "expect bigger perf.data sizes\n", buf);
2304         }
2305 }
2306
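/*
 * Configure --switch-output: "signal", a size threshold (B/K/M/G) or a time
 * threshold (s/m/h/d). Enabling any of these also switches on timestamped
 * output file names.
 */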
2307 static int switch_output_setup(struct record *rec)
2308 {
2309         struct switch_output *s = &rec->switch_output;
2310         static struct parse_tag tags_size[] = {
2311                 { .tag  = 'B', .mult = 1       },
2312                 { .tag  = 'K', .mult = 1 << 10 },
2313                 { .tag  = 'M', .mult = 1 << 20 },
2314                 { .tag  = 'G', .mult = 1 << 30 },
2315                 { .tag  = 0 },
2316         };
2317         static struct parse_tag tags_time[] = {
2318                 { .tag  = 's', .mult = 1        },
2319                 { .tag  = 'm', .mult = 60       },
2320                 { .tag  = 'h', .mult = 60*60    },
2321                 { .tag  = 'd', .mult = 60*60*24 },
2322                 { .tag  = 0 },
2323         };
2324         unsigned long val;
2325
2326         /*
2327          * If we're using --switch-output-event, then we imply
2328          * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2329          * thread to its parent.
2330          */
2331         if (rec->switch_output_event_set)
2332                 goto do_signal;
2333
2334         if (!s->set)
2335                 return 0;
2336
2337         if (!strcmp(s->str, "signal")) {
2338 do_signal:
2339                 s->signal = true;
2340                 pr_debug("switch-output with SIGUSR2 signal\n");
2341                 goto enabled;
2342         }
2343
2344         val = parse_tag_value(s->str, tags_size);
2345         if (val != (unsigned long) -1) {
2346                 s->size = val;
2347                 pr_debug("switch-output with %s size threshold\n", s->str);
2348                 goto enabled;
2349         }
2350
2351         val = parse_tag_value(s->str, tags_time);
2352         if (val != (unsigned long) -1) {
2353                 s->time = val;
2354                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2355                          s->str, s->time);
2356                 goto enabled;
2357         }
2358
2359         return -1;
2360
2361 enabled:
2362         rec->timestamp_filename = true;
2363         s->enabled              = true;
2364
2365         if (s->size && !rec->opts.no_buffering)
2366                 switch_output_size_warn(rec);
2367
2368         return 0;
2369 }
2370
2371 static const char * const __record_usage[] = {
2372         "perf record [<options>] [<command>]",
2373         "perf record [<options>] -- <command> [<options>]",
2374         NULL
2375 };
2376 const char * const *record_usage = __record_usage;
2377
2378 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2379                                   struct perf_sample *sample, struct machine *machine)
2380 {
2381         /*
2382          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2383          * so there is no need to add them twice.
2384          */
2385         if (!(event->header.misc & PERF_RECORD_MISC_USER))
2386                 return 0;
2387         return perf_event__process_mmap(tool, event, sample, machine);
2388 }
2389
2390 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2391                                    struct perf_sample *sample, struct machine *machine)
2392 {
2393         /*
2394          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2395          * so there is no need to add them twice.
2396          */
2397         if (!(event->header.misc & PERF_RECORD_MISC_USER))
2398                 return 0;
2399
2400         return perf_event__process_mmap2(tool, event, sample, machine);
2401 }
2402
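/*
 * Record the sample time as a session time boundary (see --timestamp-boundary);
 * wired up below as the itrace_start and aux tool callbacks.
 */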
2403 static int process_timestamp_boundary(struct perf_tool *tool,
2404                                       union perf_event *event __maybe_unused,
2405                                       struct perf_sample *sample,
2406                                       struct machine *machine __maybe_unused)
2407 {
2408         struct record *rec = container_of(tool, struct record, tool);
2409
2410         set_timestamp_boundary(rec, sample->time);
2411         return 0;
2412 }
2413
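/*
 * Parse --synth: map the no|all|task|mmap|cgroup keywords onto the
 * PERF_SYNTH_* bits controlling which non-sample events get synthesized.
 */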
2414 static int parse_record_synth_option(const struct option *opt,
2415                                      const char *str,
2416                                      int unset __maybe_unused)
2417 {
2418         struct record_opts *opts = opt->value;
2419         char *p = strdup(str);
2420
2421         if (p == NULL)
2422                 return -1;
2423
2424         opts->synth = parse_synth_opt(p);
2425         free(p);
2426
2427         if (opts->synth < 0) {
2428                 pr_err("Invalid synth option: %s\n", str);
2429                 return -1;
2430         }
2431         return 0;
2432 }
2433
2434 /*
2435  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
2436  * because we need to have access to it in record__exit(), which is called
2437  * after cmd_record() exits, but since record_options needs to be accessible to
2438  * builtin-script, leave it here.
2439  *
2440  * At least we don't touch it in all the other functions here directly.
2441  *
2442  * Just say no to tons of global variables, sigh.
2443  */
2444 static struct record record = {
2445         .opts = {
2446                 .sample_time         = true,
2447                 .mmap_pages          = UINT_MAX,
2448                 .user_freq           = UINT_MAX,
2449                 .user_interval       = ULLONG_MAX,
2450                 .freq                = 4000,
2451                 .target              = {
2452                         .uses_mmap   = true,
2453                         .default_per_cpu = true,
2454                 },
2455                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
2456                 .nr_threads_synthesize = 1,
2457                 .ctl_fd              = -1,
2458                 .ctl_fd_ack          = -1,
2459                 .synth               = PERF_SYNTH_ALL,
2460         },
2461         .tool = {
2462                 .sample         = process_sample_event,
2463                 .fork           = perf_event__process_fork,
2464                 .exit           = perf_event__process_exit,
2465                 .comm           = perf_event__process_comm,
2466                 .namespaces     = perf_event__process_namespaces,
2467                 .mmap           = build_id__process_mmap,
2468                 .mmap2          = build_id__process_mmap2,
2469                 .itrace_start   = process_timestamp_boundary,
2470                 .aux            = process_timestamp_boundary,
2471                 .ordered_events = true,
2472         },
2473 };
2474
2475 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2476         "\n\t\t\t\tDefault: fp";
2477
2478 static bool dry_run;
2479
2480 /*
2481  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2482  * with it and switch to use the library functions in perf_evlist that came
2483  * from builtin-record.c, i.e. use record_opts,
2484  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
2485  * using pipes, etc.
2486  */
2487 static struct option __record_options[] = {
2488         OPT_CALLBACK('e', "event", &record.evlist, "event",
2489                      "event selector. use 'perf list' to list available events",
2490                      parse_events_option),
2491         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2492                      "event filter", parse_filter),
2493         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2494                            NULL, "don't record events from perf itself",
2495                            exclude_perf),
2496         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2497                     "record events on existing process id"),
2498         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2499                     "record events on existing thread id"),
2500         OPT_INTEGER('r', "realtime", &record.realtime_prio,
2501                     "collect data with this RT SCHED_FIFO priority"),
2502         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2503                     "collect data without buffering"),
2504         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2505                     "collect raw sample records from all opened counters"),
2506         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2507                             "system-wide collection from all CPUs"),
2508         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2509                     "list of cpus to monitor"),
2510         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2511         OPT_STRING('o', "output", &record.data.path, "file",
2512                     "output file name"),
2513         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2514                         &record.opts.no_inherit_set,
2515                         "child tasks do not inherit counters"),
2516         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2517                     "synthesize non-sample events at the end of output"),
2518         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2519         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2520         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2521                     "Fail if the specified frequency can't be used"),
2522         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2523                      "profile at this frequency",
2524                       record__parse_freq),
2525         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2526                      "number of mmap data pages and AUX area tracing mmap pages",
2527                      record__parse_mmap_pages),
2528         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2529                      "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2530                      record__mmap_flush_parse),
2531         OPT_BOOLEAN(0, "group", &record.opts.group,
2532                     "put the counters into a counter group"),
2533         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2534                            NULL, "enables call-graph recording" ,
2535                            &record_callchain_opt),
2536         OPT_CALLBACK(0, "call-graph", &record.opts,
2537                      "record_mode[,record_size]", record_callchain_help,
2538                      &record_parse_callchain_opt),
2539         OPT_INCR('v', "verbose", &verbose,
2540                     "be more verbose (show counter open errors, etc)"),
2541         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2542         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2543                     "per thread counts"),
2544         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2545         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2546                     "Record the sample physical addresses"),
2547         OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2548                     "Record the sampled data address data page size"),
2549         OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
2550                     "Record the sampled code address (ip) page size"),
2551         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2552         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2553                         &record.opts.sample_time_set,
2554                         "Record the sample timestamps"),
2555         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2556                         "Record the sample period"),
2557         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2558                     "don't sample"),
2559         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2560                         &record.no_buildid_cache_set,
2561                         "do not update the buildid cache"),
2562         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2563                         &record.no_buildid_set,
2564                         "do not collect buildids in perf.data"),
2565         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2566                      "monitor event in cgroup name only",
2567                      parse_cgroups),
2568         OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2569                   "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2570         OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2571         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2572                    "user to profile"),
2573
2574         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2575                      "branch any", "sample any taken branches",
2576                      parse_branch_stack),
2577
2578         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2579                      "branch filter mask", "branch stack filter modes",
2580                      parse_branch_stack),
2581         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2582                     "sample by weight (on special events only)"),
2583         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2584                     "sample transaction flags (special events only)"),
2585         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2586                     "use per-thread mmaps"),
2587         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2588                     "sample selected machine registers on interrupt,"
2589                     " use '-I?' to list register names", parse_intr_regs),
2590         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2591                     "sample selected machine registers in user space,"
2592                     " use '--user-regs=?' to list register names", parse_user_regs),
2593         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2594                     "Record running/enabled time of read (:S) events"),
2595         OPT_CALLBACK('k', "clockid", &record.opts,
2596         "clockid", "clockid to use for events, see clock_gettime()",
2597         parse_clockid),
2598         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2599                           "opts", "AUX area tracing Snapshot Mode", ""),
2600         OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2601                           "opts", "sample AUX area", ""),
2602         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2603                         "per thread proc mmap processing timeout in ms"),
2604         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2605                     "Record namespaces events"),
2606         OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2607                     "Record cgroup events"),
2608         OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2609                         &record.opts.record_switch_events_set,
2610                         "Record context switch events"),
2611         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2612                          "Configure all used events to run in kernel space.",
2613                          PARSE_OPT_EXCLUSIVE),
2614         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2615                          "Configure all used events to run in user space.",
2616                          PARSE_OPT_EXCLUSIVE),
2617         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2618                     "collect kernel callchains"),
2619         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2620                     "collect user callchains"),
2621         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2622                    "clang binary to use for compiling BPF scriptlets"),
2623         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2624                    "options passed to clang when compiling BPF scriptlets"),
2625         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2626                    "file", "vmlinux pathname"),
2627         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2628                     "Record build-id of all DSOs regardless of hits"),
2629         OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
2630                     "Record build-id in map events"),
2631         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2632                     "append timestamp to output filename"),
2633         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2634                     "Record timestamp boundary (time of first/last samples)"),
2635         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2636                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2637                           "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2638                           "signal"),
2639         OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2640                          "switch output event selector. use 'perf list' to list available events",
2641                          parse_events_option_new_evlist),
2642         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2643                    "Limit number of switch output generated files"),
2644         OPT_BOOLEAN(0, "dry-run", &dry_run,
2645                     "Parse options then exit"),
2646 #ifdef HAVE_AIO_SUPPORT
2647         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2648                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2649                      record__aio_parse),
2650 #endif
2651         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2652                      "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2653                      record__parse_affinity),
2654 #ifdef HAVE_ZSTD_SUPPORT
2655         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2656                             "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2657                             record__parse_comp_level),
2658 #endif
2659         OPT_CALLBACK(0, "max-size", &record.output_max_size,
2660                      "size", "Limit the maximum size of the output file", parse_output_max_size),
2661         OPT_UINTEGER(0, "num-thread-synthesize",
2662                      &record.opts.nr_threads_synthesize,
2663                      "number of threads to run for event synthesis"),
2664 #ifdef HAVE_LIBPFM
2665         OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2666                 "libpfm4 event selector. use 'perf list' to list available events",
2667                 parse_libpfm_events_option),
2668 #endif
2669         OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2670                      "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2671                      "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2672                      "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2673                      "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2674                       parse_control_option),
2675         OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
2676                      "Fine-tune event synthesis: default=all", parse_record_synth_option),
2677         OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
2678                           &record.debuginfod.set, "debuginfod urls",
2679                           "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
2680                           "system"),
2681         OPT_END()
2682 };
2683
2684 struct option *record_options = __record_options;
2685
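/*
 * Entry point for the 'perf record' sub-command: parse and validate the
 * options, set up symbols, auxtrace and build-id handling, then do the
 * actual recording.
 */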
2686 int cmd_record(int argc, const char **argv)
2687 {
2688         int err;
2689         struct record *rec = &record;
2690         char errbuf[BUFSIZ];
2691
2692         setlocale(LC_ALL, "");
2693
2694 #ifndef HAVE_LIBBPF_SUPPORT
2695 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2696         set_nobuild('\0', "clang-path", true);
2697         set_nobuild('\0', "clang-opt", true);
2698 # undef set_nobuild
2699 #endif
2700
2701 #ifndef HAVE_BPF_PROLOGUE
2702 # if !defined (HAVE_DWARF_SUPPORT)
2703 #  define REASON  "NO_DWARF=1"
2704 # elif !defined (HAVE_LIBBPF_SUPPORT)
2705 #  define REASON  "NO_LIBBPF=1"
2706 # else
2707 #  define REASON  "this architecture doesn't support BPF prologue"
2708 # endif
2709 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2710         set_nobuild('\0', "vmlinux", true);
2711 # undef set_nobuild
2712 # undef REASON
2713 #endif
2714
2715         rec->opts.affinity = PERF_AFFINITY_SYS;
2716
2717         rec->evlist = evlist__new();
2718         if (rec->evlist == NULL)
2719                 return -ENOMEM;
2720
2721         err = perf_config(perf_record_config, rec);
2722         if (err)
2723                 return err;
2724
2725         argc = parse_options(argc, argv, record_options, record_usage,
2726                             PARSE_OPT_STOP_AT_NON_OPTION);
2727         if (quiet)
2728                 perf_quiet_option();
2729
2730         err = symbol__validate_sym_arguments();
2731         if (err)
2732                 return err;
2733
2734         perf_debuginfod_setup(&record.debuginfod);
2735
2736         /* Make system wide (-a) the default target. */
2737         if (!argc && target__none(&rec->opts.target))
2738                 rec->opts.target.system_wide = true;
2739
2740         if (nr_cgroups && !rec->opts.target.system_wide) {
2741                 usage_with_options_msg(record_usage, record_options,
2742                         "cgroup monitoring only available in system-wide mode");
2743
2744         }
2745
2746         if (rec->buildid_mmap) {
2747                 if (!perf_can_record_build_id()) {
2748                         pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
2749                         err = -EINVAL;
2750                         goto out_opts;
2751                 }
2752                 pr_debug("Enabling build id in mmap2 events.\n");
2753                 /* Enable mmap build id synthesizing. */
2754                 symbol_conf.buildid_mmap2 = true;
2755                 /* Enable perf_event_attr::build_id bit. */
2756                 rec->opts.build_id = true;
2757                 /* Disable build id cache. */
2758                 rec->no_buildid = true;
2759         }
2760
2761         if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
2762                 pr_err("Kernel has no cgroup sampling support.\n");
2763                 err = -EINVAL;
2764                 goto out_opts;
2765         }
2766
2767         if (rec->opts.kcore)
2768                 rec->data.is_dir = true;
2769
2770         if (rec->opts.comp_level != 0) {
2771                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2772                 rec->no_buildid = true;
2773         }
2774
2775         if (rec->opts.record_switch_events &&
2776             !perf_can_record_switch_events()) {
2777                 ui__error("kernel does not support recording context switch events\n");
2778                 parse_options_usage(record_usage, record_options, "switch-events", 0);
2779                 err = -EINVAL;
2780                 goto out_opts;
2781         }
2782
2783         if (switch_output_setup(rec)) {
2784                 parse_options_usage(record_usage, record_options, "switch-output", 0);
2785                 err = -EINVAL;
2786                 goto out_opts;
2787         }
2788
2789         if (rec->switch_output.time) {
2790                 signal(SIGALRM, alarm_sig_handler);
2791                 alarm(rec->switch_output.time);
2792         }
2793
2794         if (rec->switch_output.num_files) {
2795                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2796                                                       sizeof(char *));
2797                 if (!rec->switch_output.filenames) {
2798                         err = -EINVAL;
2799                         goto out_opts;
2800                 }
2801         }
2802
2803         /*
2804          * Allow aliases to facilitate the lookup of symbols for address
2805          * filters. Refer to auxtrace_parse_filters().
2806          */
2807         symbol_conf.allow_aliases = true;
2808
2809         symbol__init(NULL);
2810
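        /*
         * Affinity modes other than 'sys' move the tool onto the CPUs whose
         * buffers are being drained; allocate the CPU mask used for that.
         */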
        if (rec->opts.affinity != PERF_AFFINITY_SYS) {
                rec->affinity_mask.nbits = cpu__max_cpu().cpu;
                rec->affinity_mask.bits = bitmap_zalloc(rec->affinity_mask.nbits);
                if (!rec->affinity_mask.bits) {
                        pr_err("Failed to allocate affinity mask for %zu cpus\n", rec->affinity_mask.nbits);
                        err = -ENOMEM;
                        goto out_opts;
                }
                pr_debug2("affinity mask[%zu]: empty\n", rec->affinity_mask.nbits);
        }

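        /* Probe for and initialize AUX area tracing support (e.g. Intel PT). */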
        err = record__auxtrace_init(rec);
        if (err)
                goto out;

        if (dry_run)
                goto out;

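        /*
         * Set up the special bpf-output event that captures output emitted by
         * BPF programs loaded via -e <file>.c.
         */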
        err = bpf__setup_stdout(rec->evlist);
        if (err) {
                bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
                pr_err("ERROR: Setup BPF stdout failed: %s\n", errbuf);
                goto out;
        }

        err = -ENOMEM;

        if (rec->no_buildid_cache || rec->no_buildid) {
                disable_buildid_cache();
        } else if (rec->switch_output.enabled) {
                /*
                 * In 'perf record --switch-output', disable buildid
                 * generation by default to reduce data file switching
                 * overhead. Still generate build ids if they are explicitly
                 * requested, using:
                 *
                 *  perf record --switch-output --no-no-buildid \
                 *              --no-no-buildid-cache
                 *
                 * The following code is equivalent to:
                 *
                 * if ((rec->no_buildid || !rec->no_buildid_set) &&
                 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
                 *         disable_buildid_cache();
                 */
                bool disable = true;

                if (rec->no_buildid_set && !rec->no_buildid)
                        disable = false;
                if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
                        disable = false;
                if (disable) {
                        rec->no_buildid = true;
                        rec->no_buildid_cache = true;
                        disable_buildid_cache();
                }
        }

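        /*
         * Overwrite (flight recorder) mode keeps only the final contents of
         * the ring buffers, so synthesize the side-band events at the end of
         * the session instead of at the start.
         */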
        if (record.opts.overwrite)
                record.opts.tail_synthesize = true;

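        /*
         * No events were specified on the command line: fall back to the
         * default cycles event (one instance per PMU on hybrid systems).
         */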
        if (rec->evlist->core.nr_entries == 0) {
                if (perf_pmu__has_hybrid()) {
                        err = evlist__add_default_hybrid(rec->evlist,
                                                         !record.opts.no_samples);
                } else {
                        err = __evlist__add_default(rec->evlist,
                                                    !record.opts.no_samples);
                }

                if (err < 0) {
                        pr_err("Not enough memory for event selector list\n");
                        goto out;
                }
        }

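        /*
         * When specific threads are targeted (-t), do not inherit counters
         * into their children unless inheritance was explicitly configured.
         */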
        if (rec->opts.target.tid && !rec->opts.no_inherit_set)
                rec->opts.no_inherit = true;

        err = target__validate(&rec->opts.target);
        if (err) {
                target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
                ui__warning("%s\n", errbuf);
        }

        err = target__parse_uid(&rec->opts.target);
        if (err) {
                int saved_errno = errno;

                target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
                ui__error("%s", errbuf);

                err = -saved_errno;
                goto out;
        }

        /* Enable ignoring missing threads when the -u/-p option is specified. */
        rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;

        if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
                pr_err("failed to use cpu list %s\n",
                       rec->opts.target.cpu_list);
                err = -EINVAL;
                goto out;
        }

        rec->opts.target.hybrid = perf_pmu__has_hybrid();

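        /*
         * With frame pointer call chains the architecture may need to sample
         * extra state, e.g. arm64 records the link register so that leaf
         * frames can be reconstructed.
         */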
        if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
                arch__add_leaf_frame_record_opts(&rec->opts);

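        /* Build the CPU and thread maps for the requested target. */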
        err = -ENOMEM;
        if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
                usage_with_options(record_usage, record_options);

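        /* Let the AUX area PMU, if any, validate and adjust the record options. */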
        err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
        if (err)
                goto out;

        /*
         * We take all buildids when the file contains AUX area tracing data
         * because we do not decode the trace; decoding it would take too long.
         */
        if (rec->opts.full_auxtrace)
                rec->buildid_all = true;

        if (rec->opts.text_poke) {
                err = record__config_text_poke(rec->evlist);
                if (err) {
                        pr_err("record__config_text_poke failed, error %d\n", err);
                        goto out;
                }
        }

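        /*
         * Finalize and validate the remaining record options, e.g. the
         * sampling frequency/period.
         */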
        if (record_opts__config(&rec->opts)) {
                err = -EINVAL;
                goto out;
        }

        if (rec->opts.nr_cblocks > nr_cblocks_max)
                rec->opts.nr_cblocks = nr_cblocks_max;
        pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);

        pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
        pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

        if (rec->opts.comp_level > comp_level_max)
                rec->opts.comp_level = comp_level_max;
        pr_debug("comp level: %d\n", rec->opts.comp_level);

        err = __cmd_record(&record, argc, argv);
out:
        bitmap_free(rec->affinity_mask.bits);
        evlist__delete(rec->evlist);
        symbol__exit();
        auxtrace_record__free(rec->itr);
out_opts:
        evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
        return err;
}

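/*
 * SIGUSR2 handler: take an AUX area snapshot and/or rotate the output file,
 * depending on which of those features is enabled.
 */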
static void snapshot_sig_handler(int sig __maybe_unused)
{
        struct record *rec = &record;

        hit_auxtrace_snapshot_trigger(rec);

        if (switch_output_signal(rec))
                trigger_hit(&switch_output_trigger);
}

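/*
 * SIGALRM handler: fires when the --switch-output time period elapses and
 * requests a rotation of the output file.
 */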
static void alarm_sig_handler(int sig __maybe_unused)
{
        struct record *rec = &record;

        if (switch_output_time(rec))
                trigger_hit(&switch_output_trigger);
}