drivers/md/dm-stats.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/errno.h>
3 #include <linux/numa.h>
4 #include <linux/slab.h>
5 #include <linux/rculist.h>
6 #include <linux/threads.h>
7 #include <linux/preempt.h>
8 #include <linux/irqflags.h>
9 #include <linux/vmalloc.h>
10 #include <linux/mm.h>
11 #include <linux/module.h>
12 #include <linux/device-mapper.h>
13
14 #include "dm-core.h"
15 #include "dm-stats.h"
16
17 #define DM_MSG_PREFIX "stats"
18
19 static int dm_stat_need_rcu_barrier;
20
21 /*
22  * Using 64-bit values to avoid overflow (which is a
23  * problem that block/genhd.c's IO accounting has).
24  */
25 struct dm_stat_percpu {
26         unsigned long long sectors[2];
27         unsigned long long ios[2];
28         unsigned long long merges[2];
29         unsigned long long ticks[2];
30         unsigned long long io_ticks[2];
31         unsigned long long io_ticks_total;
32         unsigned long long time_in_queue;
33         unsigned long long *histogram;
34 };
35
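/*
 * Per-area state shared by all CPUs: the in-flight request counts, the
 * timestamp of the last accounting round, and a scratch area ("tmp") used
 * to sum up the per-CPU counters when a region is printed or cleared.
 */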
36 struct dm_stat_shared {
37         atomic_t in_flight[2];
38         unsigned long long stamp;
39         struct dm_stat_percpu tmp;
40 };
41
42 struct dm_stat {
43         struct list_head list_entry;
44         int id;
45         unsigned int stat_flags;
46         size_t n_entries;
47         sector_t start;
48         sector_t end;
49         sector_t step;
50         unsigned int n_histogram_entries;
51         unsigned long long *histogram_boundaries;
52         const char *program_id;
53         const char *aux_data;
54         struct rcu_head rcu_head;
55         size_t shared_alloc_size;
56         size_t percpu_alloc_size;
57         size_t histogram_alloc_size;
58         struct dm_stat_percpu *stat_percpu[NR_CPUS];
59         struct dm_stat_shared stat_shared[];
60 };
61
62 #define STAT_PRECISE_TIMESTAMPS         1
63
64 struct dm_stats_last_position {
65         sector_t last_sector;
66         unsigned int last_rw;
67 };
68
69 /*
70  * A typo on the command line could possibly make the kernel run out of memory
71  * and crash. To prevent the crash we account all used memory. We fail if we
72  * exhaust 1/4 of all memory or 1/2 of vmalloc space.
73  */
74 #define DM_STATS_MEMORY_FACTOR          4
75 #define DM_STATS_VMALLOC_FACTOR         2
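/*
 * For example (hypothetical machine): with 16 GiB of RAM, statistics
 * allocations are refused once the total would exceed 4 GiB; on 32-bit
 * systems the vmalloc-space limit checked below typically trips first.
 */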
76
77 static DEFINE_SPINLOCK(shared_memory_lock);
78
79 static unsigned long shared_memory_amount;
80
81 static bool __check_shared_memory(size_t alloc_size)
82 {
83         size_t a;
84
85         a = shared_memory_amount + alloc_size;
86         if (a < shared_memory_amount)
87                 return false;
88         if (a >> PAGE_SHIFT > totalram_pages() / DM_STATS_MEMORY_FACTOR)
89                 return false;
90 #ifdef CONFIG_MMU
91         if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
92                 return false;
93 #endif
94         return true;
95 }
96
97 static bool check_shared_memory(size_t alloc_size)
98 {
99         bool ret;
100
101         spin_lock_irq(&shared_memory_lock);
102
103         ret = __check_shared_memory(alloc_size);
104
105         spin_unlock_irq(&shared_memory_lock);
106
107         return ret;
108 }
109
110 static bool claim_shared_memory(size_t alloc_size)
111 {
112         spin_lock_irq(&shared_memory_lock);
113
114         if (!__check_shared_memory(alloc_size)) {
115                 spin_unlock_irq(&shared_memory_lock);
116                 return false;
117         }
118
119         shared_memory_amount += alloc_size;
120
121         spin_unlock_irq(&shared_memory_lock);
122
123         return true;
124 }
125
126 static void free_shared_memory(size_t alloc_size)
127 {
128         unsigned long flags;
129
130         spin_lock_irqsave(&shared_memory_lock, flags);
131
132         if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
133                 spin_unlock_irqrestore(&shared_memory_lock, flags);
134                 DMCRIT("Memory usage accounting bug.");
135                 return;
136         }
137
138         shared_memory_amount -= alloc_size;
139
140         spin_unlock_irqrestore(&shared_memory_lock, flags);
141 }
142
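/*
 * All statistics memory is allocated through dm_kvzalloc()/dm_kvfree() so
 * that every allocation is charged against, and later released from, the
 * shared_memory_amount limit checked above.
 */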
143 static void *dm_kvzalloc(size_t alloc_size, int node)
144 {
145         void *p;
146
147         if (!claim_shared_memory(alloc_size))
148                 return NULL;
149
150         p = kvzalloc_node(alloc_size, GFP_KERNEL | __GFP_NOMEMALLOC, node);
151         if (p)
152                 return p;
153
154         free_shared_memory(alloc_size);
155
156         return NULL;
157 }
158
159 static void dm_kvfree(void *ptr, size_t alloc_size)
160 {
161         if (!ptr)
162                 return;
163
164         free_shared_memory(alloc_size);
165
166         kvfree(ptr);
167 }
168
169 static void dm_stat_free(struct rcu_head *head)
170 {
171         int cpu;
172         struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
173
174         kfree(s->histogram_boundaries);
175         kfree(s->program_id);
176         kfree(s->aux_data);
177         for_each_possible_cpu(cpu) {
178                 dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
179                 dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
180         }
181         dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
182         dm_kvfree(s, s->shared_alloc_size);
183 }
184
185 static int dm_stat_in_flight(struct dm_stat_shared *shared)
186 {
187         return atomic_read(&shared->in_flight[READ]) +
188                atomic_read(&shared->in_flight[WRITE]);
189 }
190
191 int dm_stats_init(struct dm_stats *stats)
192 {
193         int cpu;
194         struct dm_stats_last_position *last;
195
196         mutex_init(&stats->mutex);
197         INIT_LIST_HEAD(&stats->list);
198         stats->precise_timestamps = false;
199         stats->last = alloc_percpu(struct dm_stats_last_position);
200         if (!stats->last)
201                 return -ENOMEM;
202
203         for_each_possible_cpu(cpu) {
204                 last = per_cpu_ptr(stats->last, cpu);
205                 last->last_sector = (sector_t)ULLONG_MAX;
206                 last->last_rw = UINT_MAX;
207         }
208
209         return 0;
210 }
211
212 void dm_stats_cleanup(struct dm_stats *stats)
213 {
214         size_t ni;
215         struct dm_stat *s;
216         struct dm_stat_shared *shared;
217
218         while (!list_empty(&stats->list)) {
219                 s = container_of(stats->list.next, struct dm_stat, list_entry);
220                 list_del(&s->list_entry);
221                 for (ni = 0; ni < s->n_entries; ni++) {
222                         shared = &s->stat_shared[ni];
223                         if (WARN_ON(dm_stat_in_flight(shared))) {
224                                 DMCRIT("leaked in-flight counter at index %lu "
225                                        "(start %llu, end %llu, step %llu): reads %d, writes %d",
226                                        (unsigned long)ni,
227                                        (unsigned long long)s->start,
228                                        (unsigned long long)s->end,
229                                        (unsigned long long)s->step,
230                                        atomic_read(&shared->in_flight[READ]),
231                                        atomic_read(&shared->in_flight[WRITE]));
232                         }
233                         cond_resched();
234                 }
235                 dm_stat_free(&s->rcu_head);
236         }
237         free_percpu(stats->last);
238         mutex_destroy(&stats->mutex);
239 }
240
241 static void dm_stats_recalc_precise_timestamps(struct dm_stats *stats)
242 {
243         struct list_head *l;
244         struct dm_stat *tmp_s;
245         bool precise_timestamps = false;
246
247         list_for_each(l, &stats->list) {
248                 tmp_s = container_of(l, struct dm_stat, list_entry);
249                 if (tmp_s->stat_flags & STAT_PRECISE_TIMESTAMPS) {
250                         precise_timestamps = true;
251                         break;
252                 }
253         }
254         stats->precise_timestamps = precise_timestamps;
255 }
256
257 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
258                            sector_t step, unsigned int stat_flags,
259                            unsigned int n_histogram_entries,
260                            unsigned long long *histogram_boundaries,
261                            const char *program_id, const char *aux_data,
262                            void (*suspend_callback)(struct mapped_device *),
263                            void (*resume_callback)(struct mapped_device *),
264                            struct mapped_device *md)
265 {
266         struct list_head *l;
267         struct dm_stat *s, *tmp_s;
268         sector_t n_entries;
269         size_t ni;
270         size_t shared_alloc_size;
271         size_t percpu_alloc_size;
272         size_t histogram_alloc_size;
273         struct dm_stat_percpu *p;
274         int cpu;
275         int ret_id;
276         int r;
277
278         if (end < start || !step)
279                 return -EINVAL;
280
281         n_entries = end - start;
282         if (dm_sector_div64(n_entries, step))
283                 n_entries++;
284
285         if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
286                 return -EOVERFLOW;
287
288         shared_alloc_size = struct_size(s, stat_shared, n_entries);
289         if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
290                 return -EOVERFLOW;
291
292         percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
293         if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
294                 return -EOVERFLOW;
295
296         histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
297         if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
298                 return -EOVERFLOW;
299
300         if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
301                                  num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
302                 return -ENOMEM;
303
304         s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
305         if (!s)
306                 return -ENOMEM;
307
308         s->stat_flags = stat_flags;
309         s->n_entries = n_entries;
310         s->start = start;
311         s->end = end;
312         s->step = step;
313         s->shared_alloc_size = shared_alloc_size;
314         s->percpu_alloc_size = percpu_alloc_size;
315         s->histogram_alloc_size = histogram_alloc_size;
316
317         s->n_histogram_entries = n_histogram_entries;
318         s->histogram_boundaries = kmemdup(histogram_boundaries,
319                                           s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
320         if (!s->histogram_boundaries) {
321                 r = -ENOMEM;
322                 goto out;
323         }
324
325         s->program_id = kstrdup(program_id, GFP_KERNEL);
326         if (!s->program_id) {
327                 r = -ENOMEM;
328                 goto out;
329         }
330         s->aux_data = kstrdup(aux_data, GFP_KERNEL);
331         if (!s->aux_data) {
332                 r = -ENOMEM;
333                 goto out;
334         }
335
336         for (ni = 0; ni < n_entries; ni++) {
337                 atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
338                 atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
339                 cond_resched();
340         }
341
342         if (s->n_histogram_entries) {
343                 unsigned long long *hi;
344
345                 hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
346                 if (!hi) {
347                         r = -ENOMEM;
348                         goto out;
349                 }
350                 for (ni = 0; ni < n_entries; ni++) {
351                         s->stat_shared[ni].tmp.histogram = hi;
352                         hi += s->n_histogram_entries + 1;
353                         cond_resched();
354                 }
355         }
356
357         for_each_possible_cpu(cpu) {
358                 p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
359                 if (!p) {
360                         r = -ENOMEM;
361                         goto out;
362                 }
363                 s->stat_percpu[cpu] = p;
364                 if (s->n_histogram_entries) {
365                         unsigned long long *hi;
366
367                         hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
368                         if (!hi) {
369                                 r = -ENOMEM;
370                                 goto out;
371                         }
372                         for (ni = 0; ni < n_entries; ni++) {
373                                 p[ni].histogram = hi;
374                                 hi += s->n_histogram_entries + 1;
375                                 cond_resched();
376                         }
377                 }
378         }
379
380         /*
381          * Suspend/resume to make sure there is no i/o in flight,
382          * so that newly created statistics will be exact.
383          *
384          * (note: we couldn't suspend earlier because we must not
385          * allocate memory while suspended)
386          */
387         suspend_callback(md);
388
389         mutex_lock(&stats->mutex);
390         s->id = 0;
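        /*
         * Allocate the lowest unused region id: the list is kept sorted by
         * id, so walk it and stop at the first gap (or at the end).
         */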
391         list_for_each(l, &stats->list) {
392                 tmp_s = container_of(l, struct dm_stat, list_entry);
393                 if (WARN_ON(tmp_s->id < s->id)) {
394                         r = -EINVAL;
395                         goto out_unlock_resume;
396                 }
397                 if (tmp_s->id > s->id)
398                         break;
399                 if (unlikely(s->id == INT_MAX)) {
400                         r = -ENFILE;
401                         goto out_unlock_resume;
402                 }
403                 s->id++;
404         }
405         ret_id = s->id;
406         list_add_tail_rcu(&s->list_entry, l);
407
408         dm_stats_recalc_precise_timestamps(stats);
409
410         if (!static_key_enabled(&stats_enabled.key))
411                 static_branch_enable(&stats_enabled);
412
413         mutex_unlock(&stats->mutex);
414
415         resume_callback(md);
416
417         return ret_id;
418
419 out_unlock_resume:
420         mutex_unlock(&stats->mutex);
421         resume_callback(md);
422 out:
423         dm_stat_free(&s->rcu_head);
424         return r;
425 }
426
427 static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
428 {
429         struct dm_stat *s;
430
431         list_for_each_entry(s, &stats->list, list_entry) {
432                 if (s->id > id)
433                         break;
434                 if (s->id == id)
435                         return s;
436         }
437
438         return NULL;
439 }
440
441 static int dm_stats_delete(struct dm_stats *stats, int id)
442 {
443         struct dm_stat *s;
444         int cpu;
445
446         mutex_lock(&stats->mutex);
447
448         s = __dm_stats_find(stats, id);
449         if (!s) {
450                 mutex_unlock(&stats->mutex);
451                 return -ENOENT;
452         }
453
454         list_del_rcu(&s->list_entry);
455
456         dm_stats_recalc_precise_timestamps(stats);
457
458         mutex_unlock(&stats->mutex);
459
460         /*
461          * vfree can't be called from RCU callback
462          */
463         for_each_possible_cpu(cpu)
464                 if (is_vmalloc_addr(s->stat_percpu[cpu]) ||
465                     is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
466                         goto do_sync_free;
467         if (is_vmalloc_addr(s) ||
468             is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
469 do_sync_free:
470                 synchronize_rcu_expedited();
471                 dm_stat_free(&s->rcu_head);
472         } else {
473                 WRITE_ONCE(dm_stat_need_rcu_barrier, 1);
474                 call_rcu(&s->rcu_head, dm_stat_free);
475         }
476         return 0;
477 }
478
479 static int dm_stats_list(struct dm_stats *stats, const char *program,
480                          char *result, unsigned int maxlen)
481 {
482         struct dm_stat *s;
483         sector_t len;
484         unsigned int sz = 0;
485
486         /*
487          * Output format:
488          *   <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
489          */
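        /*
         * For example (hypothetical region), a region created by "my_program"
         * covering the first 1 GiB of the device in 4 MiB steps would be
         * listed as:
         *   0: 0+2097152 8192 my_program -
         */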
490
491         mutex_lock(&stats->mutex);
492         list_for_each_entry(s, &stats->list, list_entry) {
493                 if (!program || !strcmp(program, s->program_id)) {
494                         len = s->end - s->start;
495                         DMEMIT("%d: %llu+%llu %llu %s %s", s->id,
496                                 (unsigned long long)s->start,
497                                 (unsigned long long)len,
498                                 (unsigned long long)s->step,
499                                 s->program_id,
500                                 s->aux_data);
501                         if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
502                                 DMEMIT(" precise_timestamps");
503                         if (s->n_histogram_entries) {
504                                 unsigned int i;
505
506                                 DMEMIT(" histogram:");
507                                 for (i = 0; i < s->n_histogram_entries; i++) {
508                                         if (i)
509                                                 DMEMIT(",");
510                                         DMEMIT("%llu", s->histogram_boundaries[i]);
511                                 }
512                         }
513                         DMEMIT("\n");
514                 }
515                 cond_resched();
516         }
517         mutex_unlock(&stats->mutex);
518
519         return 1;
520 }
521
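/*
 * Charge the time elapsed since shared->stamp to the busy-time counters
 * (io_ticks, io_ticks_total) and to time_in_queue (weighted by the number
 * of requests in flight), then advance the stamp.
 */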
522 static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
523                           struct dm_stat_percpu *p)
524 {
525         /*
526          * This is racy, but so is part_round_stats_single.
527          */
528         unsigned long long now, difference;
529         unsigned int in_flight_read, in_flight_write;
530
531         if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
532                 now = jiffies;
533         else
534                 now = ktime_to_ns(ktime_get());
535
536         difference = now - shared->stamp;
537         if (!difference)
538                 return;
539
540         in_flight_read = (unsigned int)atomic_read(&shared->in_flight[READ]);
541         in_flight_write = (unsigned int)atomic_read(&shared->in_flight[WRITE]);
542         if (in_flight_read)
543                 p->io_ticks[READ] += difference;
544         if (in_flight_write)
545                 p->io_ticks[WRITE] += difference;
546         if (in_flight_read + in_flight_write) {
547                 p->io_ticks_total += difference;
548                 p->time_in_queue += (in_flight_read + in_flight_write) * difference;
549         }
550         shared->stamp = now;
551 }
552
553 static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
554                               int idx, sector_t len,
555                               struct dm_stats_aux *stats_aux, bool end,
556                               unsigned long duration_jiffies)
557 {
558         struct dm_stat_shared *shared = &s->stat_shared[entry];
559         struct dm_stat_percpu *p;
560
561         /*
562          * For strict correctness we should use local_irq_save/restore
563          * instead of preempt_disable/enable.
564          *
565          * preempt_disable/enable is racy if the driver finishes bios
566          * from non-interrupt context as well as from interrupt context
567          * from non-interrupt context as well as from interrupt context,
568          * or from several different interrupts.
569          * On 64-bit architectures the race only results in not counting some
570          * events, so it is acceptable.  On 32-bit architectures the race could
571          * cause the counter going off by 2^32, so we need to do proper locking
572          * cause the counter to go off by 2^32, so we need to do proper locking
573          *
574          * part_stat_lock()/part_stat_unlock() have this race too.
575          */
576 #if BITS_PER_LONG == 32
577         unsigned long flags;
578
579         local_irq_save(flags);
580 #else
581         preempt_disable();
582 #endif
583         p = &s->stat_percpu[smp_processor_id()][entry];
584
585         if (!end) {
586                 dm_stat_round(s, shared, p);
587                 atomic_inc(&shared->in_flight[idx]);
588         } else {
589                 unsigned long long duration;
590
591                 dm_stat_round(s, shared, p);
592                 atomic_dec(&shared->in_flight[idx]);
593                 p->sectors[idx] += len;
594                 p->ios[idx] += 1;
595                 p->merges[idx] += stats_aux->merged;
596                 if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
597                         p->ticks[idx] += duration_jiffies;
598                         duration = jiffies_to_msecs(duration_jiffies);
599                 } else {
600                         p->ticks[idx] += stats_aux->duration_ns;
601                         duration = stats_aux->duration_ns;
602                 }
603                 if (s->n_histogram_entries) {
604                         unsigned int lo = 0, hi = s->n_histogram_entries + 1;
605
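                        /*
                         * Binary search for the histogram bucket: bucket 0 counts
                         * durations below histogram_boundaries[0], bucket i counts
                         * durations in [boundaries[i-1], boundaries[i]), and the
                         * last bucket counts everything >= boundaries[n-1].
                         */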
606                         while (lo + 1 < hi) {
607                                 unsigned int mid = (lo + hi) / 2;
608
609                                 if (s->histogram_boundaries[mid - 1] > duration)
610                                         hi = mid;
611                                 else
612                                         lo = mid;
613                         }
614                         p->histogram[lo]++;
615                 }
616         }
617
618 #if BITS_PER_LONG == 32
619         local_irq_restore(flags);
620 #else
621         preempt_enable();
622 #endif
623 }
624
625 static void __dm_stat_bio(struct dm_stat *s, int bi_rw,
626                           sector_t bi_sector, sector_t end_sector,
627                           bool end, unsigned long duration_jiffies,
628                           struct dm_stats_aux *stats_aux)
629 {
630         sector_t rel_sector, offset, todo, fragment_len;
631         size_t entry;
632
633         if (end_sector <= s->start || bi_sector >= s->end)
634                 return;
635         if (unlikely(bi_sector < s->start)) {
636                 rel_sector = 0;
637                 todo = end_sector - s->start;
638         } else {
639                 rel_sector = bi_sector - s->start;
640                 todo = end_sector - bi_sector;
641         }
642         if (unlikely(end_sector > s->end))
643                 todo -= (end_sector - s->end);
644
645         offset = dm_sector_div64(rel_sector, s->step);
646         entry = rel_sector;
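        /*
         * Worked example (hypothetical region): with start == 0 and
         * step == 1024, a bio covering sectors 1000..1099 is split into two
         * fragments: 24 sectors accounted to entry 0 and 76 to entry 1.
         */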
647         do {
648                 if (WARN_ON_ONCE(entry >= s->n_entries)) {
649                         DMCRIT("Invalid area access in region id %d", s->id);
650                         return;
651                 }
652                 fragment_len = todo;
653                 if (fragment_len > s->step - offset)
654                         fragment_len = s->step - offset;
655                 dm_stat_for_entry(s, entry, bi_rw, fragment_len,
656                                   stats_aux, end, duration_jiffies);
657                 todo -= fragment_len;
658                 entry++;
659                 offset = 0;
660         } while (unlikely(todo != 0));
661 }
662
663 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
664                          sector_t bi_sector, unsigned int bi_sectors, bool end,
665                          unsigned long start_time,
666                          struct dm_stats_aux *stats_aux)
667 {
668         struct dm_stat *s;
669         sector_t end_sector;
670         struct dm_stats_last_position *last;
671         bool got_precise_time;
672         unsigned long duration_jiffies = 0;
673
674         if (unlikely(!bi_sectors))
675                 return;
676
677         end_sector = bi_sector + bi_sectors;
678
679         if (!end) {
680                 /*
681                  * A race condition can at worst result in the merged flag being
682                  * misrepresented, so we don't have to disable preemption here.
683                  */
684                 last = raw_cpu_ptr(stats->last);
685                 stats_aux->merged =
686                         (bi_sector == READ_ONCE(last->last_sector) &&
687                          ((bi_rw == WRITE) ==
688                           (READ_ONCE(last->last_rw) == WRITE))
689                         );
690                 WRITE_ONCE(last->last_sector, end_sector);
691                 WRITE_ONCE(last->last_rw, bi_rw);
692         } else
693                 duration_jiffies = jiffies - start_time;
694
695         rcu_read_lock();
696
697         got_precise_time = false;
698         list_for_each_entry_rcu(s, &stats->list, list_entry) {
699                 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
700                         /* start (!end) duration_ns is set by DM core's alloc_io() */
701                         if (end)
702                                 stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
703                         got_precise_time = true;
704                 }
705                 __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
706         }
707
708         rcu_read_unlock();
709 }
710
711 static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared,
712                                                    struct dm_stat *s, size_t x)
713 {
714         int cpu;
715         struct dm_stat_percpu *p;
716
717         local_irq_disable();
718         p = &s->stat_percpu[smp_processor_id()][x];
719         dm_stat_round(s, shared, p);
720         local_irq_enable();
721
722         shared->tmp.sectors[READ] = 0;
723         shared->tmp.sectors[WRITE] = 0;
724         shared->tmp.ios[READ] = 0;
725         shared->tmp.ios[WRITE] = 0;
726         shared->tmp.merges[READ] = 0;
727         shared->tmp.merges[WRITE] = 0;
728         shared->tmp.ticks[READ] = 0;
729         shared->tmp.ticks[WRITE] = 0;
730         shared->tmp.io_ticks[READ] = 0;
731         shared->tmp.io_ticks[WRITE] = 0;
732         shared->tmp.io_ticks_total = 0;
733         shared->tmp.time_in_queue = 0;
734
735         if (s->n_histogram_entries)
736                 memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
737
738         for_each_possible_cpu(cpu) {
739                 p = &s->stat_percpu[cpu][x];
740                 shared->tmp.sectors[READ] += READ_ONCE(p->sectors[READ]);
741                 shared->tmp.sectors[WRITE] += READ_ONCE(p->sectors[WRITE]);
742                 shared->tmp.ios[READ] += READ_ONCE(p->ios[READ]);
743                 shared->tmp.ios[WRITE] += READ_ONCE(p->ios[WRITE]);
744                 shared->tmp.merges[READ] += READ_ONCE(p->merges[READ]);
745                 shared->tmp.merges[WRITE] += READ_ONCE(p->merges[WRITE]);
746                 shared->tmp.ticks[READ] += READ_ONCE(p->ticks[READ]);
747                 shared->tmp.ticks[WRITE] += READ_ONCE(p->ticks[WRITE]);
748                 shared->tmp.io_ticks[READ] += READ_ONCE(p->io_ticks[READ]);
749                 shared->tmp.io_ticks[WRITE] += READ_ONCE(p->io_ticks[WRITE]);
750                 shared->tmp.io_ticks_total += READ_ONCE(p->io_ticks_total);
751                 shared->tmp.time_in_queue += READ_ONCE(p->time_in_queue);
752                 if (s->n_histogram_entries) {
753                         unsigned int i;
754
755                         for (i = 0; i < s->n_histogram_entries + 1; i++)
756                                 shared->tmp.histogram[i] += READ_ONCE(p->histogram[i]);
757                 }
758         }
759 }
760
761 static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
762                             bool init_tmp_percpu_totals)
763 {
764         size_t x;
765         struct dm_stat_shared *shared;
766         struct dm_stat_percpu *p;
767
768         for (x = idx_start; x < idx_end; x++) {
769                 shared = &s->stat_shared[x];
770                 if (init_tmp_percpu_totals)
771                         __dm_stat_init_temporary_percpu_totals(shared, s, x);
772                 local_irq_disable();
773                 p = &s->stat_percpu[smp_processor_id()][x];
774                 p->sectors[READ] -= shared->tmp.sectors[READ];
775                 p->sectors[WRITE] -= shared->tmp.sectors[WRITE];
776                 p->ios[READ] -= shared->tmp.ios[READ];
777                 p->ios[WRITE] -= shared->tmp.ios[WRITE];
778                 p->merges[READ] -= shared->tmp.merges[READ];
779                 p->merges[WRITE] -= shared->tmp.merges[WRITE];
780                 p->ticks[READ] -= shared->tmp.ticks[READ];
781                 p->ticks[WRITE] -= shared->tmp.ticks[WRITE];
782                 p->io_ticks[READ] -= shared->tmp.io_ticks[READ];
783                 p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE];
784                 p->io_ticks_total -= shared->tmp.io_ticks_total;
785                 p->time_in_queue -= shared->tmp.time_in_queue;
786                 local_irq_enable();
787                 if (s->n_histogram_entries) {
788                         unsigned int i;
789
790                         for (i = 0; i < s->n_histogram_entries + 1; i++) {
791                                 local_irq_disable();
792                                 p = &s->stat_percpu[smp_processor_id()][x];
793                                 p->histogram[i] -= shared->tmp.histogram[i];
794                                 local_irq_enable();
795                         }
796                 }
797                 cond_resched();
798         }
799 }
800
801 static int dm_stats_clear(struct dm_stats *stats, int id)
802 {
803         struct dm_stat *s;
804
805         mutex_lock(&stats->mutex);
806
807         s = __dm_stats_find(stats, id);
808         if (!s) {
809                 mutex_unlock(&stats->mutex);
810                 return -ENOENT;
811         }
812
813         __dm_stat_clear(s, 0, s->n_entries, true);
814
815         mutex_unlock(&stats->mutex);
816
817         return 1;
818 }
819
820 /*
821  * This is like jiffies_to_msecs(), but works for 64-bit values.
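 * The 64-bit value is split into 22-bit chunks: the low chunk is converted
 * directly, the higher chunks are converted separately and scaled by
 * mult = jiffies_to_msecs(1 << 22) (and by mult squared for the topmost
 * chunk), so jiffies_to_msecs() itself only ever sees values that fit in
 * 32 bits.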
822  */
823 static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
824 {
825         unsigned long long result;
826         unsigned int mult;
827
828         if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
829                 return j;
830
831         result = 0;
832         if (j)
833                 result = jiffies_to_msecs(j & 0x3fffff);
834         if (j >= 1 << 22) {
835                 mult = jiffies_to_msecs(1 << 22);
836                 result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff);
837         }
838         if (j >= 1ULL << 44)
839                 result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44);
840
841         return result;
842 }
843
844 static int dm_stats_print(struct dm_stats *stats, int id,
845                           size_t idx_start, size_t idx_len,
846                           bool clear, char *result, unsigned int maxlen)
847 {
848         unsigned int sz = 0;
849         struct dm_stat *s;
850         size_t x;
851         sector_t start, end, step;
852         size_t idx_end;
853         struct dm_stat_shared *shared;
854
855         /*
856          * Output format:
857          *   <start_sector>+<length> counters
858          */
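        /*
         * The counters are, in order: reads completed, reads merged, sectors
         * read, time spent reading, writes completed, writes merged, sectors
         * written, time spent writing, I/Os currently in flight, time during
         * which any I/O was in flight, weighted time spent in queue, time
         * during which reads were in flight, and time during which writes
         * were in flight.  Times are in milliseconds, or in nanoseconds for
         * regions created with precise_timestamps; the first eleven counters
         * roughly mirror /proc/diskstats.
         */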
859
860         mutex_lock(&stats->mutex);
861
862         s = __dm_stats_find(stats, id);
863         if (!s) {
864                 mutex_unlock(&stats->mutex);
865                 return -ENOENT;
866         }
867
868         idx_end = idx_start + idx_len;
869         if (idx_end < idx_start ||
870             idx_end > s->n_entries)
871                 idx_end = s->n_entries;
872
873         if (idx_start > idx_end)
874                 idx_start = idx_end;
875
876         step = s->step;
877         start = s->start + (step * idx_start);
878
879         for (x = idx_start; x < idx_end; x++, start = end) {
880                 shared = &s->stat_shared[x];
881                 end = start + step;
882                 if (unlikely(end > s->end))
883                         end = s->end;
884
885                 __dm_stat_init_temporary_percpu_totals(shared, s, x);
886
887                 DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
888                        (unsigned long long)start,
889                        (unsigned long long)step,
890                        shared->tmp.ios[READ],
891                        shared->tmp.merges[READ],
892                        shared->tmp.sectors[READ],
893                        dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
894                        shared->tmp.ios[WRITE],
895                        shared->tmp.merges[WRITE],
896                        shared->tmp.sectors[WRITE],
897                        dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
898                        dm_stat_in_flight(shared),
899                        dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
900                        dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
901                        dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
902                        dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
903                 if (s->n_histogram_entries) {
904                         unsigned int i;
905
906                         for (i = 0; i < s->n_histogram_entries + 1; i++)
907                                 DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
908                 }
909                 DMEMIT("\n");
910
911                 if (unlikely(sz + 1 >= maxlen))
912                         goto buffer_overflow;
913
914                 cond_resched();
915         }
916
917         if (clear)
918                 __dm_stat_clear(s, idx_start, idx_end, false);
919
920 buffer_overflow:
921         mutex_unlock(&stats->mutex);
922
923         return 1;
924 }
925
926 static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data)
927 {
928         struct dm_stat *s;
929         const char *new_aux_data;
930
931         mutex_lock(&stats->mutex);
932
933         s = __dm_stats_find(stats, id);
934         if (!s) {
935                 mutex_unlock(&stats->mutex);
936                 return -ENOENT;
937         }
938
939         new_aux_data = kstrdup(aux_data, GFP_KERNEL);
940         if (!new_aux_data) {
941                 mutex_unlock(&stats->mutex);
942                 return -ENOMEM;
943         }
944
945         kfree(s->aux_data);
946         s->aux_data = new_aux_data;
947
948         mutex_unlock(&stats->mutex);
949
950         return 0;
951 }
952
953 static int parse_histogram(const char *h, unsigned int *n_histogram_entries,
954                            unsigned long long **histogram_boundaries)
955 {
956         const char *q;
957         unsigned int n;
958         unsigned long long last;
959
960         *n_histogram_entries = 1;
961         for (q = h; *q; q++)
962                 if (*q == ',')
963                         (*n_histogram_entries)++;
964
965         *histogram_boundaries = kmalloc_array(*n_histogram_entries,
966                                               sizeof(unsigned long long),
967                                               GFP_KERNEL);
968         if (!*histogram_boundaries)
969                 return -ENOMEM;
970
971         n = 0;
972         last = 0;
973         while (1) {
974                 unsigned long long hi;
975                 int s;
976                 char ch;
977
978                 s = sscanf(h, "%llu%c", &hi, &ch);
979                 if (!s || (s == 2 && ch != ','))
980                         return -EINVAL;
981                 if (hi <= last)
982                         return -EINVAL;
983                 last = hi;
984                 (*histogram_boundaries)[n] = hi;
985                 if (s == 1)
986                         return 0;
987                 h = strchr(h, ',') + 1;
988                 n++;
989         }
990 }
991
992 static int message_stats_create(struct mapped_device *md,
993                                 unsigned int argc, char **argv,
994                                 char *result, unsigned int maxlen)
995 {
996         int r;
997         int id;
998         char dummy;
999         unsigned long long start, end, len, step;
1000         unsigned int divisor;
1001         const char *program_id, *aux_data;
1002         unsigned int stat_flags = 0;
1003         unsigned int n_histogram_entries = 0;
1004         unsigned long long *histogram_boundaries = NULL;
1005         struct dm_arg_set as, as_backup;
1006         const char *a;
1007         unsigned int feature_args;
1008
1009         /*
1010          * Input format:
1011          *   <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
1012          */
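        /*
         * Hypothetical examples:
         *   @stats_create - /100
         *     whole device, divided into 100 equally sized areas
         *   @stats_create 0+1024 256 2 precise_timestamps histogram:50,100,200 p0 a0
         *     sectors 0..1023 in 256-sector steps, with precise timestamps,
         *     a four-bucket latency histogram, program_id "p0" and aux_data "a0"
         */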
1013
1014         if (argc < 3)
1015                 goto ret_einval;
1016
1017         as.argc = argc;
1018         as.argv = argv;
1019         dm_consume_args(&as, 1);
1020
1021         a = dm_shift_arg(&as);
1022         if (!strcmp(a, "-")) {
1023                 start = 0;
1024                 len = dm_get_size(md);
1025                 if (!len)
1026                         len = 1;
1027         } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
1028                    start != (sector_t)start || len != (sector_t)len)
1029                 goto ret_einval;
1030
1031         end = start + len;
1032         if (start >= end)
1033                 goto ret_einval;
1034
1035         a = dm_shift_arg(&as);
1036         if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
1037                 if (!divisor)
1038                         return -EINVAL;
1039                 step = end - start;
1040                 if (do_div(step, divisor))
1041                         step++;
1042                 if (!step)
1043                         step = 1;
1044         } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
1045                    step != (sector_t)step || !step)
1046                 goto ret_einval;
1047
1048         as_backup = as;
1049         a = dm_shift_arg(&as);
1050         if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
1051                 while (feature_args--) {
1052                         a = dm_shift_arg(&as);
1053                         if (!a)
1054                                 goto ret_einval;
1055                         if (!strcasecmp(a, "precise_timestamps"))
1056                                 stat_flags |= STAT_PRECISE_TIMESTAMPS;
1057                         else if (!strncasecmp(a, "histogram:", 10)) {
1058                                 if (n_histogram_entries)
1059                                         goto ret_einval;
1060                                 r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries);
1061                                 if (r)
1062                                         goto ret;
1063                         } else
1064                                 goto ret_einval;
1065                 }
1066         } else {
1067                 as = as_backup;
1068         }
1069
1070         program_id = "-";
1071         aux_data = "-";
1072
1073         a = dm_shift_arg(&as);
1074         if (a)
1075                 program_id = a;
1076
1077         a = dm_shift_arg(&as);
1078         if (a)
1079                 aux_data = a;
1080
1081         if (as.argc)
1082                 goto ret_einval;
1083
1084         /*
1085          * If a buffer overflow happens after we created the region,
1086          * it's too late (userspace would retry with a larger
1087          * buffer, but the region id that caused the overflow is already
1088          * leaked).  So we must detect buffer overflow in advance.
1089          */
1090         snprintf(result, maxlen, "%d", INT_MAX);
1091         if (dm_message_test_buffer_overflow(result, maxlen)) {
1092                 r = 1;
1093                 goto ret;
1094         }
1095
1096         id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
1097                              n_histogram_entries, histogram_boundaries, program_id, aux_data,
1098                              dm_internal_suspend_fast, dm_internal_resume_fast, md);
1099         if (id < 0) {
1100                 r = id;
1101                 goto ret;
1102         }
1103
1104         snprintf(result, maxlen, "%d", id);
1105
1106         r = 1;
1107         goto ret;
1108
1109 ret_einval:
1110         r = -EINVAL;
1111 ret:
1112         kfree(histogram_boundaries);
1113         return r;
1114 }
1115
1116 static int message_stats_delete(struct mapped_device *md,
1117                                 unsigned int argc, char **argv)
1118 {
1119         int id;
1120         char dummy;
1121
1122         if (argc != 2)
1123                 return -EINVAL;
1124
1125         if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1126                 return -EINVAL;
1127
1128         return dm_stats_delete(dm_get_stats(md), id);
1129 }
1130
1131 static int message_stats_clear(struct mapped_device *md,
1132                                unsigned int argc, char **argv)
1133 {
1134         int id;
1135         char dummy;
1136
1137         if (argc != 2)
1138                 return -EINVAL;
1139
1140         if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1141                 return -EINVAL;
1142
1143         return dm_stats_clear(dm_get_stats(md), id);
1144 }
1145
1146 static int message_stats_list(struct mapped_device *md,
1147                               unsigned int argc, char **argv,
1148                               char *result, unsigned int maxlen)
1149 {
1150         int r;
1151         const char *program = NULL;
1152
1153         if (argc < 1 || argc > 2)
1154                 return -EINVAL;
1155
1156         if (argc > 1) {
1157                 program = kstrdup(argv[1], GFP_KERNEL);
1158                 if (!program)
1159                         return -ENOMEM;
1160         }
1161
1162         r = dm_stats_list(dm_get_stats(md), program, result, maxlen);
1163
1164         kfree(program);
1165
1166         return r;
1167 }
1168
1169 static int message_stats_print(struct mapped_device *md,
1170                                unsigned int argc, char **argv, bool clear,
1171                                char *result, unsigned int maxlen)
1172 {
1173         int id;
1174         char dummy;
1175         unsigned long idx_start = 0, idx_len = ULONG_MAX;
1176
1177         if (argc != 2 && argc != 4)
1178                 return -EINVAL;
1179
1180         if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1181                 return -EINVAL;
1182
1183         if (argc > 3) {
1184                 if (strcmp(argv[2], "-") &&
1185                     sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1)
1186                         return -EINVAL;
1187                 if (strcmp(argv[3], "-") &&
1188                     sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1)
1189                         return -EINVAL;
1190         }
1191
1192         return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear,
1193                               result, maxlen);
1194 }
1195
1196 static int message_stats_set_aux(struct mapped_device *md,
1197                                  unsigned int argc, char **argv)
1198 {
1199         int id;
1200         char dummy;
1201
1202         if (argc != 3)
1203                 return -EINVAL;
1204
1205         if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1206                 return -EINVAL;
1207
1208         return dm_stats_set_aux(dm_get_stats(md), id, argv[2]);
1209 }
1210
1211 int dm_stats_message(struct mapped_device *md, unsigned int argc, char **argv,
1212                      char *result, unsigned int maxlen)
1213 {
1214         int r;
1215
1216         /* All messages here must start with '@' */
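        /*
         * These messages arrive through the target-message interface, e.g.
         * (hypothetical device): "dmsetup message <device> 0 @stats_list".
         */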
1217         if (!strcasecmp(argv[0], "@stats_create"))
1218                 r = message_stats_create(md, argc, argv, result, maxlen);
1219         else if (!strcasecmp(argv[0], "@stats_delete"))
1220                 r = message_stats_delete(md, argc, argv);
1221         else if (!strcasecmp(argv[0], "@stats_clear"))
1222                 r = message_stats_clear(md, argc, argv);
1223         else if (!strcasecmp(argv[0], "@stats_list"))
1224                 r = message_stats_list(md, argc, argv, result, maxlen);
1225         else if (!strcasecmp(argv[0], "@stats_print"))
1226                 r = message_stats_print(md, argc, argv, false, result, maxlen);
1227         else if (!strcasecmp(argv[0], "@stats_print_clear"))
1228                 r = message_stats_print(md, argc, argv, true, result, maxlen);
1229         else if (!strcasecmp(argv[0], "@stats_set_aux"))
1230                 r = message_stats_set_aux(md, argc, argv);
1231         else
1232                 return 2; /* this wasn't a stats message */
1233
1234         if (r == -EINVAL)
1235                 DMCRIT("Invalid parameters for message %s", argv[0]);
1236
1237         return r;
1238 }
1239
1240 int __init dm_statistics_init(void)
1241 {
1242         shared_memory_amount = 0;
1243         dm_stat_need_rcu_barrier = 0;
1244         return 0;
1245 }
1246
1247 void dm_statistics_exit(void)
1248 {
1249         if (dm_stat_need_rcu_barrier)
1250                 rcu_barrier();
1251         if (WARN_ON(shared_memory_amount))
1252                 DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount);
1253 }
1254
1255 module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, 0444);
1256 MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");