bcachefs: fix unsafety in bch2_stripe_to_text()
fs/bcachefs/ec.c
// SPDX-License-Identifier: GPL-2.0

/* erasure coding */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "util.h"

#include <linux/sort.h>

#ifdef __KERNEL__

#include <linux/raid/pq.h>
#include <linux/raid/xor.h>

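/*
 * Recover a single failed block (or compute xor parity, when failed_idx
 * is the parity block) by xoring together all the other blocks: the
 * failed block is swapped into slot 0 so the remaining blocks can be
 * xored over it in batches of at most MAX_XOR_BLOCKS.
 */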
static void raid5_recov(unsigned disks, unsigned failed_idx,
                        size_t size, void **data)
{
        unsigned i = 2, nr;

        BUG_ON(failed_idx >= disks);

        swap(data[0], data[failed_idx]);
        memcpy(data[0], data[1], size);

        while (i < disks) {
                nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
                xor_blocks(nr, size, data[0], data + i);
                i += nr;
        }

        swap(data[0], data[failed_idx]);
}

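/*
 * Generate parity: p is the xor of the data blocks (computed via
 * raid5_recov(), treating the p block as the "failed" one), q is the
 * raid6 syndrome.
 */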
static void raid_gen(int nd, int np, size_t size, void **v)
{
        if (np >= 1)
                raid5_recov(nd + np, nd, size, v);
        if (np >= 2)
                raid6_call.gen_syndrome(nd + np, size, v);
        BUG_ON(np > 2);
}

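/*
 * Recover up to two failed blocks, given the sorted indices of the
 * failed blocks in @ir: single failures use the raid5 path (or just
 * regenerate q), double failures dispatch to the appropriate raid6
 * recovery routine, and failed parity blocks are simply regenerated.
 */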
static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
        switch (nr) {
        case 0:
                break;
        case 1:
                if (ir[0] < nd + 1)
                        raid5_recov(nd + 1, ir[0], size, v);
                else
                        raid6_call.gen_syndrome(nd + np, size, v);
                break;
        case 2:
                if (ir[1] < nd) {
                        /* data+data failure. */
                        raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
                } else if (ir[0] < nd) {
                        /* data + p/q failure */

                        if (ir[1] == nd) /* data + p failure */
                                raid6_datap_recov(nd + np, size, ir[0], v);
                        else { /* data + q failure */
                                raid5_recov(nd + 1, ir[0], size, v);
                                raid6_call.gen_syndrome(nd + np, size, v);
                        }
                } else {
                        raid_gen(nd, np, size, v);
                }
                break;
        default:
                BUG();
        }
}

#else

#include <raid/raid.h>

#endif

struct ec_bio {
        struct bch_dev          *ca;
        struct ec_stripe_buf    *buf;
        size_t                  idx;
        struct bio              bio;
};

/* Stripes btree keys: */

int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
                        enum bkey_invalid_flags flags,
                        struct printbuf *err)
{
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
        int ret = 0;

        bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
                         bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
                         stripe_pos_bad,
                         "stripe at bad pos");

        bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
                         stripe_val_size_bad,
                         "incorrect value size (%zu < %u)",
                         bkey_val_u64s(k.k), stripe_val_u64s(s));

        ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
        return ret;
}

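/*
 * .to_text() may be called on keys that failed .invalid() (e.g. when
 * printing an error), so it can't trust the value: we copy what fits
 * into a zeroed local struct bch_stripe, and bounds check every pointer
 * against the end of the value before dereferencing it.
 */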
void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
                         struct bkey_s_c k)
{
        const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
        struct bch_stripe s = {};

        memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));

        unsigned nr_data = s.nr_blocks - s.nr_redundant;

        prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
                   s.algorithm,
                   le16_to_cpu(s.sectors),
                   nr_data,
                   s.nr_redundant,
                   s.csum_type,
                   1U << s.csum_granularity_bits);

        for (unsigned i = 0; i < s.nr_blocks; i++) {
                const struct bch_extent_ptr *ptr = sp->ptrs + i;

                if ((void *) ptr >= bkey_val_end(k))
                        break;

                bch2_extent_ptr_to_text(out, c, ptr);

                if (s.csum_type < BCH_CSUM_NR &&
                    i < nr_data &&
                    stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
                        prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
        }
}

/* Triggers: */

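/*
 * Transactional trigger for one block of a stripe: update the alloc key
 * for the bucket it points to, setting (or, on deletion, clearing) the
 * bucket's stripe backreference, and accounting for parity sectors.
 */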
static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
                                         struct bkey_s_c_stripe s,
                                         unsigned idx, bool deleting)
{
        struct bch_fs *c = trans->c;
        const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
        struct btree_iter iter;
        struct bkey_i_alloc_v4 *a;
        enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
                ? BCH_DATA_parity : 0;
        s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
        int ret = 0;

        if (deleting)
                sectors = -sectors;

        a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
        if (IS_ERR(a))
                return PTR_ERR(a);

        ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
                                    a->v.gen, a->v.data_type,
                                    a->v.dirty_sectors);
        if (ret)
                goto err;

        if (!deleting) {
                if (bch2_trans_inconsistent_on(a->v.stripe ||
                                               a->v.stripe_redundancy, trans,
                                "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
                                iter.pos.inode, iter.pos.offset, a->v.gen,
                                bch2_data_type_str(a->v.data_type),
                                a->v.dirty_sectors,
                                a->v.stripe, s.k->p.offset)) {
                        ret = -EIO;
                        goto err;
                }

                if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
                                "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
                                iter.pos.inode, iter.pos.offset, a->v.gen,
                                bch2_data_type_str(a->v.data_type),
                                a->v.dirty_sectors,
                                s.k->p.offset)) {
                        ret = -EIO;
                        goto err;
                }

                a->v.stripe             = s.k->p.offset;
                a->v.stripe_redundancy  = s.v->nr_redundant;
                a->v.data_type          = BCH_DATA_stripe;
        } else {
                if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
                                               a->v.stripe_redundancy != s.v->nr_redundant, trans,
                                "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
                                iter.pos.inode, iter.pos.offset, a->v.gen,
                                s.k->p.offset, a->v.stripe)) {
                        ret = -EIO;
                        goto err;
                }

                a->v.stripe             = 0;
                a->v.stripe_redundancy  = 0;
                a->v.data_type          = alloc_data_type(a->v, BCH_DATA_user);
        }

        a->v.dirty_sectors += sectors;
        if (data_type)
                a->v.data_type = !deleting ? data_type : 0;

        ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

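/*
 * GC version of the above: mark one stripe block's bucket in the
 * in-memory gc bucket array rather than updating the alloc btree.
 */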
static int mark_stripe_bucket(struct btree_trans *trans,
                              struct bkey_s_c k,
                              unsigned ptr_idx,
                              unsigned flags)
{
        struct bch_fs *c = trans->c;
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
        unsigned nr_data = s->nr_blocks - s->nr_redundant;
        bool parity = ptr_idx >= nr_data;
        enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
        s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
        const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        struct bucket old, new, *g;
        struct printbuf buf = PRINTBUF;
        int ret = 0;

        BUG_ON(!(flags & BTREE_TRIGGER_GC));

        /* XXX: doesn't handle deletion */

        percpu_down_read(&c->mark_lock);
        g = PTR_GC_BUCKET(ca, ptr);

        if (g->dirty_sectors ||
            (g->stripe && g->stripe != k.k->p.offset)) {
                bch2_fs_inconsistent(c,
                              "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
                              ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
                              (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
                ret = -EINVAL;
                goto err;
        }

        bucket_lock(g);
        old = *g;

        ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type,
                                    g->gen, g->data_type,
                                    g->dirty_sectors);
        if (ret)
                goto err_unlock;

        g->data_type = data_type;
        g->dirty_sectors += sectors;

        g->stripe               = k.k->p.offset;
        g->stripe_redundancy    = s->nr_redundant;
        new = *g;
err_unlock:
        bucket_unlock(g);
        if (!ret)
                bch2_dev_usage_update_m(c, ca, &old, &new);
err:
        percpu_up_read(&c->mark_lock);
        printbuf_exit(&buf);
        return ret;
}

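/*
 * Main trigger for stripe keys; called in three modes:
 *
 *  - BTREE_TRIGGER_TRANSACTIONAL: update the alloc keys for the buckets
 *    the stripe points to, and the replicas accounting
 *  - BTREE_TRIGGER_ATOMIC: keep the in-memory stripes radix tree and
 *    heap in sync
 *  - BTREE_TRIGGER_GC: rebuild gc_stripes and mark buckets in the gc
 *    bucket array
 */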
int bch2_trigger_stripe(struct btree_trans *trans,
                        enum btree_id btree_id, unsigned level,
                        struct bkey_s_c old, struct bkey_s _new,
                        unsigned flags)
{
        struct bkey_s_c new = _new.s_c;
        struct bch_fs *c = trans->c;
        u64 idx = new.k->p.offset;
        const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
                ? bkey_s_c_to_stripe(old).v : NULL;
        const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
                ? bkey_s_c_to_stripe(new).v : NULL;

        if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
                /*
                 * If the pointers aren't changing, we don't need to do anything:
                 */
                if (new_s && old_s &&
                    new_s->nr_blocks    == old_s->nr_blocks &&
                    new_s->nr_redundant == old_s->nr_redundant &&
                    !memcmp(old_s->ptrs, new_s->ptrs,
                            new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
                        return 0;

                BUG_ON(new_s && old_s &&
                       (new_s->nr_blocks        != old_s->nr_blocks ||
                        new_s->nr_redundant     != old_s->nr_redundant));

                if (new_s) {
                        s64 sectors = le16_to_cpu(new_s->sectors);

                        struct bch_replicas_padded r;
                        bch2_bkey_to_replicas(&r.e, new);
                        int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
                        if (ret)
                                return ret;
                }

                if (old_s) {
                        s64 sectors = -((s64) le16_to_cpu(old_s->sectors));

                        struct bch_replicas_padded r;
                        bch2_bkey_to_replicas(&r.e, old);
                        int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
                        if (ret)
                                return ret;
                }

                unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
                for (unsigned i = 0; i < nr_blocks; i++) {
                        if (new_s && old_s &&
                            !memcmp(&new_s->ptrs[i],
                                    &old_s->ptrs[i],
                                    sizeof(new_s->ptrs[i])))
                                continue;

                        if (new_s) {
                                int ret = bch2_trans_mark_stripe_bucket(trans,
                                                bkey_s_c_to_stripe(new), i, false);
                                if (ret)
                                        return ret;
                        }

                        if (old_s) {
                                int ret = bch2_trans_mark_stripe_bucket(trans,
                                                bkey_s_c_to_stripe(old), i, true);
                                if (ret)
                                        return ret;
                        }
                }
        }

        if (flags & BTREE_TRIGGER_ATOMIC) {
                struct stripe *m = genradix_ptr(&c->stripes, idx);

                if (!m) {
                        struct printbuf buf1 = PRINTBUF;
                        struct printbuf buf2 = PRINTBUF;

                        bch2_bkey_val_to_text(&buf1, c, old);
                        bch2_bkey_val_to_text(&buf2, c, new);
                        bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
                                            "old %s\n"
                                            "new %s", idx, buf1.buf, buf2.buf);
                        printbuf_exit(&buf2);
                        printbuf_exit(&buf1);
                        bch2_inconsistent_error(c);
                        return -1;
                }

                if (!new_s) {
                        bch2_stripes_heap_del(c, m, idx);

                        memset(m, 0, sizeof(*m));
                } else {
                        m->sectors      = le16_to_cpu(new_s->sectors);
                        m->algorithm    = new_s->algorithm;
                        m->nr_blocks    = new_s->nr_blocks;
                        m->nr_redundant = new_s->nr_redundant;
                        m->blocks_nonempty = 0;

                        for (unsigned i = 0; i < new_s->nr_blocks; i++)
                                m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);

                        if (!old_s)
                                bch2_stripes_heap_insert(c, m, idx);
                        else
                                bch2_stripes_heap_update(c, m, idx);
                }
        }

        if (flags & BTREE_TRIGGER_GC) {
                struct gc_stripe *m =
                        genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);

                if (!m) {
                        bch_err(c, "error allocating memory for gc_stripes, idx %llu",
                                idx);
                        return -BCH_ERR_ENOMEM_mark_stripe;
                }
                /*
                 * This will be wrong when we bring back runtime gc: we should
                 * be unmarking the old key and then marking the new key
                 */
                m->alive        = true;
                m->sectors      = le16_to_cpu(new_s->sectors);
                m->nr_blocks    = new_s->nr_blocks;
                m->nr_redundant = new_s->nr_redundant;

                for (unsigned i = 0; i < new_s->nr_blocks; i++)
                        m->ptrs[i] = new_s->ptrs[i];

                bch2_bkey_to_replicas(&m->r.e, new);

                /*
                 * gc recalculates this field from stripe ptr
                 * references:
                 */
                memset(m->block_sectors, 0, sizeof(m->block_sectors));

                for (unsigned i = 0; i < new_s->nr_blocks; i++) {
                        int ret = mark_stripe_bucket(trans, new, i, flags);
                        if (ret)
                                return ret;
                }

                int ret = bch2_update_replicas(c, new, &m->r.e,
                                      ((s64) m->sectors * m->nr_redundant),
                                      0, true);
                if (ret) {
                        struct printbuf buf = PRINTBUF;

                        bch2_bkey_val_to_text(&buf, c, new);
                        bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
                        printbuf_exit(&buf);
                        return ret;
                }
        }

        return 0;
}

/* returns the extent ptr that matched, and sets *block to its block nr in the stripe: */
static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
                                                struct bkey_s_c k, unsigned *block)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        unsigned i, nr_data = s->nr_blocks - s->nr_redundant;

        bkey_for_each_ptr(ptrs, ptr)
                for (i = 0; i < nr_data; i++)
                        if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
                                                      le16_to_cpu(s->sectors))) {
                                *block = i;
                                return ptr;
                        }

        return NULL;
}

static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
        switch (k.k->type) {
        case KEY_TYPE_extent: {
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const union bch_extent_entry *entry;

                extent_for_each_entry(e, entry)
                        if (extent_entry_type(entry) ==
                            BCH_EXTENT_ENTRY_stripe_ptr &&
                            entry->stripe_ptr.idx == idx)
                                return true;

                break;
        }
        }

        return false;
}

/* Stripe bufs: */

static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
{
        if (buf->key.k.type == KEY_TYPE_stripe) {
                struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
                unsigned i;

                for (i = 0; i < s->v.nr_blocks; i++) {
                        kvfree(buf->data[i]);
                        buf->data[i] = NULL;
                }
        }
}

/* XXX: this is a non-mempoolified memory allocation: */
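/*
 * Allocate buffers for the given range of the stripe; offset and size
 * are rounded out to whole checksum-granularity blocks, since checksums
 * can only be computed and verified on csum_granularity boundaries.
 */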
static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
                              unsigned offset, unsigned size)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned csum_granularity = 1U << v->csum_granularity_bits;
        unsigned end = offset + size;
        unsigned i;

        BUG_ON(end > le16_to_cpu(v->sectors));

        offset  = round_down(offset, csum_granularity);
        end     = min_t(unsigned, le16_to_cpu(v->sectors),
                        round_up(end, csum_granularity));

        buf->offset     = offset;
        buf->size       = end - offset;

        memset(buf->valid, 0xFF, sizeof(buf->valid));

        for (i = 0; i < v->nr_blocks; i++) {
                buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
                if (!buf->data[i])
                        goto err;
        }

        return 0;
err:
        ec_stripe_buf_exit(buf);
        return -BCH_ERR_ENOMEM_stripe_buf;
}

/* Checksumming: */

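/*
 * Checksum a single csum-granularity block of one stripe block; @offset
 * is in sectors, relative to the start of the stripe (not the buffer).
 */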
static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
                                         unsigned block, unsigned offset)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned csum_granularity = 1 << v->csum_granularity_bits;
        unsigned end = buf->offset + buf->size;
        unsigned len = min(csum_granularity, end - offset);

        BUG_ON(offset >= end);
        BUG_ON(offset <  buf->offset);
        BUG_ON(offset & (csum_granularity - 1));
        BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
               (len & (csum_granularity - 1)));

        return bch2_checksum(NULL, v->csum_type,
                             null_nonce(),
                             buf->data[block] + ((offset - buf->offset) << 9),
                             len << 9);
}

static void ec_generate_checksums(struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned i, j, csums_per_device = stripe_csums_per_device(v);

        if (!v->csum_type)
                return;

        BUG_ON(buf->offset);
        BUG_ON(buf->size != le16_to_cpu(v->sectors));

        for (i = 0; i < v->nr_blocks; i++)
                for (j = 0; j < csums_per_device; j++)
                        stripe_csum_set(v, i, j,
                                ec_block_checksum(buf, i, j << v->csum_granularity_bits));
}

static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned csum_granularity = 1 << v->csum_granularity_bits;
        unsigned i;

        if (!v->csum_type)
                return;

        for (i = 0; i < v->nr_blocks; i++) {
                unsigned offset = buf->offset;
                unsigned end = buf->offset + buf->size;

                if (!test_bit(i, buf->valid))
                        continue;

                while (offset < end) {
                        unsigned j = offset >> v->csum_granularity_bits;
                        unsigned len = min(csum_granularity, end - offset);
                        struct bch_csum want = stripe_csum_get(v, i, j);
                        struct bch_csum got = ec_block_checksum(buf, i, offset);

                        if (bch2_crc_cmp(want, got)) {
                                struct printbuf err = PRINTBUF;
                                struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);

                                prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
                                           want.hi, want.lo,
                                           got.hi, got.lo,
                                           bch2_csum_types[v->csum_type]);
                                prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
                                bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
                                bch_err_ratelimited(ca, "%s", err.buf);
                                printbuf_exit(&err);

                                clear_bit(i, buf->valid);

                                bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
                                break;
                        }

                        offset += len;
                }
        }
}

/* Erasure coding: */

static void ec_generate_ec(struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned nr_data = v->nr_blocks - v->nr_redundant;
        unsigned bytes = le16_to_cpu(v->sectors) << 9;

        raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
}

static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;

        return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
}

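/*
 * Reconstruct the contents of failed blocks from the blocks we were
 * able to read; fails if more blocks failed than we have redundancy.
 */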
static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
        unsigned nr_data = v->nr_blocks - v->nr_redundant;
        unsigned bytes = buf->size << 9;

        if (ec_nr_failed(buf) > v->nr_redundant) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: unable to read enough blocks");
                return -1;
        }

        for (i = 0; i < nr_data; i++)
                if (!test_bit(i, buf->valid))
                        failed[nr_failed++] = i;

        raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
        return 0;
}

/* IO: */

static void ec_block_endio(struct bio *bio)
{
        struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
        struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
        struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
        struct bch_dev *ca = ec_bio->ca;
        struct closure *cl = bio->bi_private;

        if (bch2_dev_io_err_on(bio->bi_status, ca,
                               bio_data_dir(bio)
                               ? BCH_MEMBER_ERROR_write
                               : BCH_MEMBER_ERROR_read,
                               "erasure coding %s error: %s",
                               bio_data_dir(bio) ? "write" : "read",
                               bch2_blk_status_to_str(bio->bi_status)))
                clear_bit(ec_bio->idx, ec_bio->buf->valid);

        if (ptr_stale(ca, ptr)) {
                bch_err_ratelimited(ca->fs,
                                    "error %s stripe: stale pointer after io",
                                    bio_data_dir(bio) == READ ? "reading from" : "writing to");
                clear_bit(ec_bio->idx, ec_bio->buf->valid);
        }

        bio_put(&ec_bio->bio);
        percpu_ref_put(&ca->io_ref);
        closure_put(cl);
}

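/*
 * Read or write the buffered range of a single stripe block, splitting
 * the IO into bios of at most BIO_MAX_VECS pages each; on failure the
 * block's bit in buf->valid is cleared instead of returning an error.
 */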
static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
                        blk_opf_t opf, unsigned idx, struct closure *cl)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned offset = 0, bytes = buf->size << 9;
        struct bch_extent_ptr *ptr = &v->ptrs[idx];
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
                ? BCH_DATA_user
                : BCH_DATA_parity;
        int rw = op_is_write(opf);

        if (ptr_stale(ca, ptr)) {
                bch_err_ratelimited(c,
                                    "error %s stripe: stale pointer",
                                    rw == READ ? "reading from" : "writing to");
                clear_bit(idx, buf->valid);
                return;
        }

        if (!bch2_dev_get_ioref(ca, rw)) {
                clear_bit(idx, buf->valid);
                return;
        }

        this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);

        while (offset < bytes) {
                unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
                                           DIV_ROUND_UP(bytes, PAGE_SIZE));
                unsigned b = min_t(size_t, bytes - offset,
                                   nr_iovecs << PAGE_SHIFT);
                struct ec_bio *ec_bio;

                ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
                                                       nr_iovecs,
                                                       opf,
                                                       GFP_KERNEL,
                                                       &c->ec_bioset),
                                      struct ec_bio, bio);

                ec_bio->ca                      = ca;
                ec_bio->buf                     = buf;
                ec_bio->idx                     = idx;

                ec_bio->bio.bi_iter.bi_sector   = ptr->offset + buf->offset + (offset >> 9);
                ec_bio->bio.bi_end_io           = ec_block_endio;
                ec_bio->bio.bi_private          = cl;

                bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);

                closure_get(cl);
                percpu_ref_get(&ca->io_ref);

                submit_bio(&ec_bio->bio);

                offset += b;
        }

        percpu_ref_put(&ca->io_ref);
}

static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
                                struct ec_stripe_buf *stripe)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
                               POS(0, idx), BTREE_ITER_SLOTS);
        ret = bkey_err(k);
        if (ret)
                goto err;
        if (k.k->type != KEY_TYPE_stripe) {
                ret = -ENOENT;
                goto err;
        }
        bkey_reassemble(&stripe->key, k);
err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

/* recovery read path: */
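/*
 * Called when a normal read failed: read the rest of the stripe,
 * reconstruct the failed block(s), and copy the requested range out of
 * the reconstructed data.
 */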
int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
{
        struct bch_fs *c = trans->c;
        struct ec_stripe_buf *buf;
        struct closure cl;
        struct bch_stripe *v;
        unsigned i, offset;
        int ret = 0;

        closure_init_stack(&cl);

        BUG_ON(!rbio->pick.has_ec);

        buf = kzalloc(sizeof(*buf), GFP_NOFS);
        if (!buf)
                return -BCH_ERR_ENOMEM_ec_read_extent;

        ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
        if (ret) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: error %i looking up stripe", ret);
                kfree(buf);
                return -EIO;
        }

        v = &bkey_i_to_stripe(&buf->key)->v;

        if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: pointer doesn't match stripe");
                ret = -EIO;
                goto err;
        }

        offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
        if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: read is bigger than stripe");
                ret = -EIO;
                goto err;
        }

        ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
        if (ret)
                goto err;

        for (i = 0; i < v->nr_blocks; i++)
                ec_block_io(c, buf, REQ_OP_READ, i, &cl);

        closure_sync(&cl);

        if (ec_nr_failed(buf) > v->nr_redundant) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: unable to read enough blocks");
                ret = -EIO;
                goto err;
        }

        ec_validate_checksums(c, buf);

        ret = ec_do_recov(c, buf);
        if (ret)
                goto err;

        memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
                      buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
err:
        ec_stripe_buf_exit(buf);
        kfree(buf);
        return ret;
}

/* stripe bucket accounting: */

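/*
 * Ensure the in-memory stripes heap and radix trees can index stripe
 * @idx: the heap is grown by allocating a new, larger heap outside the
 * lock and swapping it in under the lock.
 */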
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
        ec_stripes_heap n, *h = &c->ec_stripes_heap;

        if (idx >= h->size) {
                if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
                        return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

                mutex_lock(&c->ec_stripes_heap_lock);
                if (n.size > h->size) {
                        memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
                        n.used = h->used;
                        swap(*h, n);
                }
                mutex_unlock(&c->ec_stripes_heap_lock);

                free_heap(&n);
        }

        if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
                return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

        if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
            !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
                return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

        return 0;
}

static int ec_stripe_mem_alloc(struct btree_trans *trans,
                               struct btree_iter *iter)
{
        return allocate_dropping_locks_errcode(trans,
                        __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
}

/*
 * Hash table of open stripes:
 * Stripes that are being created or modified are kept in a hash table, so that
 * stripe deletion can skip them.
 */

static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
        unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
        struct ec_stripe_new *s;

        hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
                if (s->idx == idx)
                        return true;
        return false;
}

static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
        bool ret = false;

        spin_lock(&c->ec_stripes_new_lock);
        ret = __bch2_stripe_is_open(c, idx);
        spin_unlock(&c->ec_stripes_new_lock);

        return ret;
}

static bool bch2_try_open_stripe(struct bch_fs *c,
                                 struct ec_stripe_new *s,
                                 u64 idx)
{
        bool ret;

        spin_lock(&c->ec_stripes_new_lock);
        ret = !__bch2_stripe_is_open(c, idx);
        if (ret) {
                unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));

                s->idx = idx;
                hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
        }
        spin_unlock(&c->ec_stripes_new_lock);

        return ret;
}

static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
{
        BUG_ON(!s->idx);

        spin_lock(&c->ec_stripes_new_lock);
        hlist_del_init(&s->hash);
        spin_unlock(&c->ec_stripes_new_lock);

        s->idx = 0;
}

/* Heap of all existing stripes, ordered by blocks_nonempty */

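/*
 * Returns the index of an empty stripe that isn't currently open, if
 * any: since the heap is ordered by blocks_nonempty, only the root
 * needs checking.
 */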
static u64 stripe_idx_to_delete(struct bch_fs *c)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;

        lockdep_assert_held(&c->ec_stripes_heap_lock);

        if (h->used &&
            h->data[0].blocks_nonempty == 0 &&
            !bch2_stripe_is_open(c, h->data[0].idx))
                return h->data[0].idx;

        return 0;
}

static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
                                      struct ec_stripe_heap_entry l,
                                      struct ec_stripe_heap_entry r)
{
        return ((l.blocks_nonempty > r.blocks_nonempty) -
                (l.blocks_nonempty < r.blocks_nonempty));
}

static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
                                                   size_t i)
{
        struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

        genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
}

static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        struct stripe *m = genradix_ptr(&c->stripes, idx);

        BUG_ON(m->heap_idx >= h->used);
        BUG_ON(h->data[m->heap_idx].idx != idx);
}

void bch2_stripes_heap_del(struct bch_fs *c,
                           struct stripe *m, size_t idx)
{
        mutex_lock(&c->ec_stripes_heap_lock);
        heap_verify_backpointer(c, idx);

        heap_del(&c->ec_stripes_heap, m->heap_idx,
                 ec_stripes_heap_cmp,
                 ec_stripes_heap_set_backpointer);
        mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_stripes_heap_insert(struct bch_fs *c,
                              struct stripe *m, size_t idx)
{
        mutex_lock(&c->ec_stripes_heap_lock);
        BUG_ON(heap_full(&c->ec_stripes_heap));

        heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
                        .idx = idx,
                        .blocks_nonempty = m->blocks_nonempty,
                }),
                 ec_stripes_heap_cmp,
                 ec_stripes_heap_set_backpointer);

        heap_verify_backpointer(c, idx);
        mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_stripes_heap_update(struct bch_fs *c,
                              struct stripe *m, size_t idx)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        bool do_deletes;
        size_t i;

        mutex_lock(&c->ec_stripes_heap_lock);
        heap_verify_backpointer(c, idx);

        h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

        i = m->heap_idx;
        heap_sift_up(h,   i, ec_stripes_heap_cmp,
                     ec_stripes_heap_set_backpointer);
        heap_sift_down(h, i, ec_stripes_heap_cmp,
                       ec_stripes_heap_set_backpointer);

        heap_verify_backpointer(c, idx);

        do_deletes = stripe_idx_to_delete(c) != 0;
        mutex_unlock(&c->ec_stripes_heap_lock);

        if (do_deletes)
                bch2_do_stripe_deletes(c);
}

/* stripe deletion */

static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_stripe s;
        int ret;

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
                               BTREE_ITER_INTENT);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (k.k->type != KEY_TYPE_stripe) {
                bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
                ret = -EINVAL;
                goto err;
        }

        s = bkey_s_c_to_stripe(k);
        for (unsigned i = 0; i < s.v->nr_blocks; i++)
                if (stripe_blockcount_get(s.v, i)) {
                        struct printbuf buf = PRINTBUF;

                        bch2_bkey_val_to_text(&buf, c, k);
                        bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
                        printbuf_exit(&buf);
                        ret = -EINVAL;
                        goto err;
                }

        ret = bch2_btree_delete_at(trans, &iter, 0);
err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

static void ec_stripe_delete_work(struct work_struct *work)
{
        struct bch_fs *c =
                container_of(work, struct bch_fs, ec_stripe_delete_work);

        while (1) {
                mutex_lock(&c->ec_stripes_heap_lock);
                u64 idx = stripe_idx_to_delete(c);
                mutex_unlock(&c->ec_stripes_heap_lock);

                if (!idx)
                        break;

                int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                                        ec_stripe_delete(trans, idx));
                bch_err_fn(c, ret);
                if (ret)
                        break;
        }

        bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}

void bch2_do_stripe_deletes(struct bch_fs *c)
{
        if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
            !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
                bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}

/* stripe creation: */

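/*
 * Create or update a stripe key: when updating an existing stripe, the
 * block counts from the old key are preserved, since those are
 * maintained by triggers rather than by the caller.
 */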
static int ec_stripe_key_update(struct btree_trans *trans,
                                struct bkey_i_stripe *new,
                                bool create)
{
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
                               new->k.p, BTREE_ITER_INTENT);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
                bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
                                     create ? "creating" : "updating",
                                     bch2_bkey_types[k.k->type]);
                ret = -EINVAL;
                goto err;
        }

        if (k.k->type == KEY_TYPE_stripe) {
                const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
                unsigned i;

                if (old->nr_blocks != new->v.nr_blocks) {
                        bch_err(c, "error updating stripe: nr_blocks does not match");
                        ret = -EINVAL;
                        goto err;
                }

                for (i = 0; i < new->v.nr_blocks; i++) {
                        unsigned v = stripe_blockcount_get(old, i);

                        BUG_ON(v &&
                               (old->ptrs[i].dev != new->v.ptrs[i].dev ||
                                old->ptrs[i].gen != new->v.ptrs[i].gen ||
                                old->ptrs[i].offset != new->v.ptrs[i].offset));

                        stripe_blockcount_set(&new->v, i, v);
                }
        }

        ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

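/*
 * Follow one backpointer from a stripe bucket to the extent it points
 * to, and rewrite that extent to point into the stripe: drop the other
 * replicas and insert a stripe ptr entry after the remaining pointer.
 */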
static int ec_stripe_update_extent(struct btree_trans *trans,
                                   struct bpos bucket, u8 gen,
                                   struct ec_stripe_buf *s,
                                   struct bpos *bp_pos)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
        struct bch_fs *c = trans->c;
        struct bch_backpointer bp;
        struct btree_iter iter;
        struct bkey_s_c k;
        const struct bch_extent_ptr *ptr_c;
        struct bch_extent_ptr *ptr, *ec_ptr = NULL;
        struct bch_extent_stripe_ptr stripe_ptr;
        struct bkey_i *n;
        int ret, dev, block;

        ret = bch2_get_next_backpointer(trans, bucket, gen,
                                bp_pos, &bp, BTREE_ITER_CACHED);
        if (ret)
                return ret;
        if (bpos_eq(*bp_pos, SPOS_MAX))
                return 0;

        if (bp.level) {
                struct printbuf buf = PRINTBUF;
                struct btree_iter node_iter;
                struct btree *b;

                b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
                bch2_trans_iter_exit(trans, &node_iter);

                if (!b)
                        return 0;

                prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
                bch2_backpointer_to_text(&buf, &bp);

                bch2_fs_inconsistent(c, "%s", buf.buf);
                printbuf_exit(&buf);
                return -EIO;
        }

        k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
        ret = bkey_err(k);
        if (ret)
                return ret;
        if (!k.k) {
                /*
                 * extent no longer exists - we could flush the btree
                 * write buffer and retry to verify, but no need:
                 */
                return 0;
        }

        if (extent_has_stripe_ptr(k, s->key.k.p.offset))
                goto out;

        ptr_c = bkey_matches_stripe(v, k, &block);
        /*
         * It doesn't generally make sense to erasure code cached ptrs:
         * XXX: should we be incrementing a counter?
         */
        if (!ptr_c || ptr_c->cached)
                goto out;

        dev = v->ptrs[block].dev;

        n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
        ret = PTR_ERR_OR_ZERO(n);
        if (ret)
                goto out;

        bkey_reassemble(n, k);

        bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
        ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
        BUG_ON(!ec_ptr);

        stripe_ptr = (struct bch_extent_stripe_ptr) {
                .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
                .block          = block,
                .redundancy     = v->nr_redundant,
                .idx            = s->key.k.p.offset,
        };

        __extent_entry_insert(n,
                        (union bch_extent_entry *) ec_ptr,
                        (union bch_extent_entry *) &stripe_ptr);

        ret = bch2_trans_update(trans, &iter, n, 0);
out:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

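/*
 * Walk all the backpointers for one stripe block's bucket, updating
 * each extent found to point into the new stripe.
 */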
static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
                                   unsigned block)
{
        struct bch_fs *c = trans->c;
        struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
        struct bch_extent_ptr bucket = v->ptrs[block];
        struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
        struct bpos bp_pos = POS_MIN;
        int ret = 0;

        while (1) {
                ret = commit_do(trans, NULL, NULL,
                                BCH_TRANS_COMMIT_no_check_rw|
                                BCH_TRANS_COMMIT_no_enospc,
                        ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
                                                s, &bp_pos));
                if (ret)
                        break;
                if (bkey_eq(bp_pos, POS_MAX))
                        break;

                bp_pos = bpos_nosnap_successor(bp_pos);
        }

        return ret;
}

static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
        struct btree_trans *trans = bch2_trans_get(c);
        struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
        unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
        int ret = 0;

        ret = bch2_btree_write_buffer_flush_sync(trans);
        if (ret)
                goto err;

        for (i = 0; i < nr_data; i++) {
                ret = ec_stripe_update_bucket(trans, s, i);
                if (ret)
                        break;
        }
err:
        bch2_trans_put(trans);

        return ret;
}

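/*
 * Parity is computed over the whole bucket, so if a data bucket wasn't
 * filled completely we have to zero out the unwritten tail - both in
 * the in-memory stripe buffer and on disk.
 */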
static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
                                       struct ec_stripe_new *s,
                                       unsigned block,
                                       struct open_bucket *ob)
{
        struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
        unsigned offset = ca->mi.bucket_size - ob->sectors_free;
        int ret;

        if (!bch2_dev_get_ioref(ca, WRITE)) {
                s->err = -BCH_ERR_erofs_no_writes;
                return;
        }

        memset(s->new_stripe.data[block] + (offset << 9),
               0,
               ob->sectors_free << 9);

        ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
                        ob->bucket * ca->mi.bucket_size + offset,
                        ob->sectors_free,
                        GFP_KERNEL, 0);

        percpu_ref_put(&ca->io_ref);

        if (ret)
                s->err = ret;
}

void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
{
        if (s->idx)
                bch2_stripe_close(c, s);
        kfree(s);
}

/*
 * data buckets of new stripe all written: create the stripe
 */
static void ec_stripe_create(struct ec_stripe_new *s)
{
        struct bch_fs *c = s->c;
        struct open_bucket *ob;
        struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
        unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
        int ret;

        BUG_ON(s->h->s == s);

        closure_sync(&s->iodone);

        if (!s->err) {
                for (i = 0; i < nr_data; i++)
                        if (s->blocks[i]) {
                                ob = c->open_buckets + s->blocks[i];

                                if (ob->sectors_free)
                                        zero_out_rest_of_ec_bucket(c, s, i, ob);
                        }
        }

        if (s->err) {
                if (!bch2_err_matches(s->err, EROFS))
                        bch_err(c, "error creating stripe: error writing data buckets");
                goto err;
        }

        if (s->have_existing_stripe) {
                ec_validate_checksums(c, &s->existing_stripe);

                if (ec_do_recov(c, &s->existing_stripe)) {
                        bch_err(c, "error creating stripe: error reading existing stripe");
                        goto err;
                }

                for (i = 0; i < nr_data; i++)
                        if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
                                swap(s->new_stripe.data[i],
                                     s->existing_stripe.data[i]);

                ec_stripe_buf_exit(&s->existing_stripe);
        }

        BUG_ON(!s->allocated);
        BUG_ON(!s->idx);

        ec_generate_ec(&s->new_stripe);

        ec_generate_checksums(&s->new_stripe);

        /* write p/q: */
        for (i = nr_data; i < v->nr_blocks; i++)
                ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
        closure_sync(&s->iodone);

        if (ec_nr_failed(&s->new_stripe)) {
                bch_err(c, "error creating stripe: error writing redundancy buckets");
                goto err;
        }

        ret = bch2_trans_do(c, &s->res, NULL,
                            BCH_TRANS_COMMIT_no_check_rw|
                            BCH_TRANS_COMMIT_no_enospc,
                            ec_stripe_key_update(trans,
                                        bkey_i_to_stripe(&s->new_stripe.key),
                                        !s->have_existing_stripe));
        bch_err_msg(c, ret, "creating stripe key");
        if (ret)
                goto err;

        ret = ec_stripe_update_extents(c, &s->new_stripe);
        bch_err_msg(c, ret, "error updating extents");
        if (ret)
                goto err;
err:
        bch2_disk_reservation_put(c, &s->res);

        for (i = 0; i < v->nr_blocks; i++)
                if (s->blocks[i]) {
                        ob = c->open_buckets + s->blocks[i];

                        if (i < nr_data) {
                                ob->ec = NULL;
                                __bch2_open_bucket_put(c, ob);
                        } else {
                                bch2_open_bucket_put(c, ob);
                        }
                }

        mutex_lock(&c->ec_stripe_new_lock);
        list_del(&s->list);
        mutex_unlock(&c->ec_stripe_new_lock);
        wake_up(&c->ec_stripe_new_wait);

        ec_stripe_buf_exit(&s->existing_stripe);
        ec_stripe_buf_exit(&s->new_stripe);
        closure_debug_destroy(&s->iodone);

        ec_stripe_new_put(c, s, STRIPE_REF_stripe);
}

1463 static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
1464 {
1465         struct ec_stripe_new *s;
1466
1467         mutex_lock(&c->ec_stripe_new_lock);
1468         list_for_each_entry(s, &c->ec_stripe_new_list, list)
1469                 if (!atomic_read(&s->ref[STRIPE_REF_io]))
1470                         goto out;
1471         s = NULL;
1472 out:
1473         mutex_unlock(&c->ec_stripe_new_lock);
1474
1475         return s;
1476 }
1477
1478 static void ec_stripe_create_work(struct work_struct *work)
1479 {
1480         struct bch_fs *c = container_of(work,
1481                 struct bch_fs, ec_stripe_create_work);
1482         struct ec_stripe_new *s;
1483
1484         while ((s = get_pending_stripe(c)))
1485                 ec_stripe_create(s);
1486
1487         bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
1488 }
1489
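/*
 * Kick off the stripe create worker: we take a write ref so the filesystem
 * stays writable while creates are pending, and drop it again if the work
 * item was already queued (the instance that queued it holds its own ref):
 */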
1490 void bch2_ec_do_stripe_creates(struct bch_fs *c)
1491 {
1492         bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
1493
1494         if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
1495                 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
1496 }
1497
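/*
 * Detach a fully allocated (or failed) stripe from its head and put it on
 * ec_stripe_new_list; dropping the io ref here lets ec_stripe_create() run
 * once any outstanding writes against it complete:
 */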
1498 static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
1499 {
1500         struct ec_stripe_new *s = h->s;
1501
1502         BUG_ON(!s->allocated && !s->err);
1503
1504         h->s            = NULL;
1505         s->pending      = true;
1506
1507         mutex_lock(&c->ec_stripe_new_lock);
1508         list_add(&s->list, &c->ec_stripe_new_list);
1509         mutex_unlock(&c->ec_stripe_new_lock);
1510
1511         ec_stripe_new_put(c, s, STRIPE_REF_io);
1512 }
1513
1514 void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
1515 {
1516         struct ec_stripe_new *s = ob->ec;
1517
1518         s->err = -EIO;
1519 }
1520
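/*
 * If the write point is backed by an erasure coded bucket, return a pointer
 * into the new stripe's data buffer at the bucket's current fill offset;
 * offset is in 512 byte sectors, hence the shift by 9 to convert to bytes:
 */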
1521 void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
1522 {
1523         struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
1524         struct bch_dev *ca;
1525         unsigned offset;
1526
1527         if (!ob)
1528                 return NULL;
1529
1530         BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
1531
1532         ca      = bch_dev_bkey_exists(c, ob->dev);
1533         offset  = ca->mi.bucket_size - ob->sectors_free;
1534
1535         return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
1536 }
1537
1538 static int unsigned_cmp(const void *_l, const void *_r)
1539 {
1540         unsigned l = *((const unsigned *) _l);
1541         unsigned r = *((const unsigned *) _r);
1542
1543         return cmp_int(l, r);
1544 }
1545
1546 /* pick most common bucket size: */
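/*
 * E.g. member bucket sizes { 1024, 1024, 2048 } (in sectors) sort into a run
 * of two 1024s and one 2048, so we pick 1024; on a tie the smaller size wins,
 * since a run only displaces the current best when it's strictly longer:
 */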
1547 static unsigned pick_blocksize(struct bch_fs *c,
1548                                struct bch_devs_mask *devs)
1549 {
1550         unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
1551         struct {
1552                 unsigned nr, size;
1553         } cur = { 0, 0 }, best = { 0, 0 };
1554
1555         for_each_member_device_rcu(c, ca, devs)
1556                 sizes[nr++] = ca->mi.bucket_size;
1557
1558         sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
1559
1560         for (unsigned i = 0; i < nr; i++) {
1561                 if (sizes[i] != cur.size) {
1562                         if (cur.nr > best.nr)
1563                                 best = cur;
1564
1565                         cur.nr = 0;
1566                         cur.size = sizes[i];
1567                 }
1568
1569                 cur.nr++;
1570         }
1571
1572         if (cur.nr > best.nr)
1573                 best = cur;
1574
1575         return best.size;
1576 }
1577
1578 static bool may_create_new_stripe(struct bch_fs *c)
1579 {
1580         return false;
1581 }
1582
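/*
 * Initialize a new stripe key: if per-block checksums at the default
 * granularity would push the value past BKEY_VAL_U64s_MAX, keep doubling the
 * checksum granularity (each increment of csum_granularity_bits roughly
 * halves the number of stored checksums) until the key fits:
 */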
1583 static void ec_stripe_key_init(struct bch_fs *c,
1584                                struct bkey_i *k,
1585                                unsigned nr_data,
1586                                unsigned nr_parity,
1587                                unsigned stripe_size)
1588 {
1589         struct bkey_i_stripe *s = bkey_stripe_init(k);
1590         unsigned u64s;
1591
1592         s->v.sectors                    = cpu_to_le16(stripe_size);
1593         s->v.algorithm                  = 0;
1594         s->v.nr_blocks                  = nr_data + nr_parity;
1595         s->v.nr_redundant               = nr_parity;
1596         s->v.csum_granularity_bits      = ilog2(c->opts.encoded_extent_max >> 9);
1597         s->v.csum_type                  = BCH_CSUM_crc32c;
1598         s->v.pad                        = 0;
1599
1600         while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
1601                 BUG_ON(1 << s->v.csum_granularity_bits >=
1602                        le16_to_cpu(s->v.sectors) ||
1603                        s->v.csum_granularity_bits == U8_MAX);
1604                 s->v.csum_granularity_bits++;
1605         }
1606
1607         set_bkey_val_u64s(&s->k, u64s);
1608 }
1609
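/*
 * Allocate the in-memory state for a new stripe: nr_data is however many
 * active devices we have (capped at BCH_BKEY_PTRS_MAX), less the parity
 * blocks. Both refcounts start at one; the stripe ref is owned via h->s, and
 * the io ref is dropped when the stripe is handed off in
 * ec_stripe_set_pending():
 */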
1610 static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
1611 {
1612         struct ec_stripe_new *s;
1613
1614         lockdep_assert_held(&h->lock);
1615
1616         s = kzalloc(sizeof(*s), GFP_KERNEL);
1617         if (!s)
1618                 return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
1619
1620         mutex_init(&s->lock);
1621         closure_init(&s->iodone, NULL);
1622         atomic_set(&s->ref[STRIPE_REF_stripe], 1);
1623         atomic_set(&s->ref[STRIPE_REF_io], 1);
1624         s->c            = c;
1625         s->h            = h;
1626         s->nr_data      = min_t(unsigned, h->nr_active_devs,
1627                                 BCH_BKEY_PTRS_MAX) - h->redundancy;
1628         s->nr_parity    = h->redundancy;
1629
1630         ec_stripe_key_init(c, &s->new_stripe.key,
1631                            s->nr_data, s->nr_parity, h->blocksize);
1632
1633         h->s = s;
1634         return 0;
1635 }
1636
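/*
 * A stripe head caches in-progress stripe state for a given (target, algo,
 * redundancy, watermark) tuple. Zero-durability devices can't contribute to a
 * stripe, and since every block in a stripe must be the same size, only
 * devices with the most common bucket size are counted as active:
 */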
1637 static struct ec_stripe_head *
1638 ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
1639                          unsigned algo, unsigned redundancy,
1640                          enum bch_watermark watermark)
1641 {
1642         struct ec_stripe_head *h;
1643
1644         h = kzalloc(sizeof(*h), GFP_KERNEL);
1645         if (!h)
1646                 return NULL;
1647
1648         mutex_init(&h->lock);
1649         BUG_ON(!mutex_trylock(&h->lock));
1650
1651         h->target       = target;
1652         h->algo         = algo;
1653         h->redundancy   = redundancy;
1654         h->watermark    = watermark;
1655
1656         rcu_read_lock();
1657         h->devs = target_rw_devs(c, BCH_DATA_user, target);
1658
1659         for_each_member_device_rcu(c, ca, &h->devs)
1660                 if (!ca->mi.durability)
1661                         __clear_bit(ca->dev_idx, h->devs.d);
1662
1663         h->blocksize = pick_blocksize(c, &h->devs);
1664
1665         for_each_member_device_rcu(c, ca, &h->devs)
1666                 if (ca->mi.bucket_size == h->blocksize)
1667                         h->nr_active_devs++;
1668
1669         rcu_read_unlock();
1670
1671         /*
1672          * If we only have redundancy + 1 devices, we're better off with just
1673          * replication:
1674          */
1675         if (h->nr_active_devs < h->redundancy + 2)
1676                 bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
1677                         h->nr_active_devs, h->redundancy + 2);
1678
1679         list_add(&h->list, &c->ec_stripe_head_list);
1680         return h;
1681 }
1682
1683 void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
1684 {
1685         if (h->s &&
1686             h->s->allocated &&
1687             bitmap_weight(h->s->blocks_allocated,
1688                           h->s->nr_data) == h->s->nr_data)
1689                 ec_stripe_set_pending(c, h);
1690
1691         mutex_unlock(&h->lock);
1692 }
1693
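/*
 * Find or create the stripe head for this allocation, returning it locked.
 * bch2_trans_mutex_lock() is used so btree node locks can be dropped if we
 * would block; NULL (rather than an error) is returned when stripes can't be
 * used at all, i.e. zero redundancy or too few devices of matching bucket
 * size:
 */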
1694 static struct ec_stripe_head *
1695 __bch2_ec_stripe_head_get(struct btree_trans *trans,
1696                           unsigned target,
1697                           unsigned algo,
1698                           unsigned redundancy,
1699                           enum bch_watermark watermark)
1700 {
1701         struct bch_fs *c = trans->c;
1702         struct ec_stripe_head *h;
1703         int ret;
1704
1705         if (!redundancy)
1706                 return NULL;
1707
1708         ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
1709         if (ret)
1710                 return ERR_PTR(ret);
1711
1712         if (test_bit(BCH_FS_going_ro, &c->flags)) {
1713                 h = ERR_PTR(-BCH_ERR_erofs_no_writes);
1714                 goto found;
1715         }
1716
1717         list_for_each_entry(h, &c->ec_stripe_head_list, list)
1718                 if (h->target           == target &&
1719                     h->algo             == algo &&
1720                     h->redundancy       == redundancy &&
1721                     h->watermark        == watermark) {
1722                         ret = bch2_trans_mutex_lock(trans, &h->lock);
1723                         if (ret)
1724                                 h = ERR_PTR(ret);
1725                         goto found;
1726                 }
1727
1728         h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
1729 found:
1730         if (!IS_ERR_OR_NULL(h) &&
1731             h->nr_active_devs < h->redundancy + 2) {
1732                 mutex_unlock(&h->lock);
1733                 h = NULL;
1734         }
1735         mutex_unlock(&c->ec_stripe_head_lock);
1736         return h;
1737 }
1738
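/*
 * Allocate buckets for the blocks we don't have yet, parity first, skipping
 * devices that already back a block of this stripe. Note that buckets from a
 * partially successful allocation are recorded before ret is checked, so
 * progress is kept even when we'll have to retry:
 */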
1739 static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
1740                                     enum bch_watermark watermark, struct closure *cl)
1741 {
1742         struct bch_fs *c = trans->c;
1743         struct bch_devs_mask devs = h->devs;
1744         struct open_bucket *ob;
1745         struct open_buckets buckets;
1746         struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
1747         unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
1748         bool have_cache = true;
1749         int ret = 0;
1750
1751         BUG_ON(v->nr_blocks     != h->s->nr_data + h->s->nr_parity);
1752         BUG_ON(v->nr_redundant  != h->s->nr_parity);
1753
1754         for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
1755                 __clear_bit(v->ptrs[i].dev, devs.d);
1756                 if (i < h->s->nr_data)
1757                         nr_have_data++;
1758                 else
1759                         nr_have_parity++;
1760         }
1761
1762         BUG_ON(nr_have_data     > h->s->nr_data);
1763         BUG_ON(nr_have_parity   > h->s->nr_parity);
1764
1765         buckets.nr = 0;
1766         if (nr_have_parity < h->s->nr_parity) {
1767                 ret = bch2_bucket_alloc_set_trans(trans, &buckets,
1768                                             &h->parity_stripe,
1769                                             &devs,
1770                                             h->s->nr_parity,
1771                                             &nr_have_parity,
1772                                             &have_cache, 0,
1773                                             BCH_DATA_parity,
1774                                             watermark,
1775                                             cl);
1776
1777                 open_bucket_for_each(c, &buckets, ob, i) {
1778                         j = find_next_zero_bit(h->s->blocks_gotten,
1779                                                h->s->nr_data + h->s->nr_parity,
1780                                                h->s->nr_data);
1781                         BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
1782
1783                         h->s->blocks[j] = buckets.v[i];
1784                         v->ptrs[j] = bch2_ob_ptr(c, ob);
1785                         __set_bit(j, h->s->blocks_gotten);
1786                 }
1787
1788                 if (ret)
1789                         return ret;
1790         }
1791
1792         buckets.nr = 0;
1793         if (nr_have_data < h->s->nr_data) {
1794                 ret = bch2_bucket_alloc_set_trans(trans, &buckets,
1795                                             &h->block_stripe,
1796                                             &devs,
1797                                             h->s->nr_data,
1798                                             &nr_have_data,
1799                                             &have_cache, 0,
1800                                             BCH_DATA_user,
1801                                             watermark,
1802                                             cl);
1803
1804                 open_bucket_for_each(c, &buckets, ob, i) {
1805                         j = find_next_zero_bit(h->s->blocks_gotten,
1806                                                h->s->nr_data, 0);
1807                         BUG_ON(j >= h->s->nr_data);
1808
1809                         h->s->blocks[j] = buckets.v[i];
1810                         v->ptrs[j] = bch2_ob_ptr(c, ob);
1811                         __set_bit(j, h->s->blocks_gotten);
1812                 }
1813
1814                 if (ret)
1815                         return ret;
1816         }
1817
1818         return 0;
1819 }
1820
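/*
 * Look for an existing stripe with matching geometry (algorithm, redundancy,
 * block size) that has at least one empty block we can write to, and try to
 * open it; returns the stripe index, or -1 if there's nothing to reuse:
 */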
1821 /* XXX: doesn't obey target: */
1822 static s64 get_existing_stripe(struct bch_fs *c,
1823                                struct ec_stripe_head *head)
1824 {
1825         ec_stripes_heap *h = &c->ec_stripes_heap;
1826         struct stripe *m;
1827         size_t heap_idx;
1828         u64 stripe_idx;
1829         s64 ret = -1;
1830
1831         if (may_create_new_stripe(c))
1832                 return -1;
1833
1834         mutex_lock(&c->ec_stripes_heap_lock);
1835         for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
1836                 /* No blocks worth reusing, stripe will just be deleted: */
1837                 if (!h->data[heap_idx].blocks_nonempty)
1838                         continue;
1839
1840                 stripe_idx = h->data[heap_idx].idx;
1841
1842                 m = genradix_ptr(&c->stripes, stripe_idx);
1843
1844                 if (m->algorithm        == head->algo &&
1845                     m->nr_redundant     == head->redundancy &&
1846                     m->sectors          == head->blocksize &&
1847                     m->blocks_nonempty  < m->nr_blocks - m->nr_redundant &&
1848                     bch2_try_open_stripe(c, head->s, stripe_idx)) {
1849                         ret = stripe_idx;
1850                         break;
1851                 }
1852         }
1853         mutex_unlock(&c->ec_stripes_heap_lock);
1854         return ret;
1855 }
1856
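/*
 * Switch this head's new stripe over to reusing an existing stripe: adopt the
 * existing geometry, release the buckets we'd already allocated (they could
 * conflict with blocks being kept), mark blocks that still hold data as
 * gotten and allocated, and kick off reads of every existing block so parity
 * can be regenerated:
 */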
1857 static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
1858 {
1859         struct bch_fs *c = trans->c;
1860         struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
1861         struct bch_stripe *existing_v;
1862         unsigned i;
1863         s64 idx;
1864         int ret;
1865
1866         /*
1867          * If we can't allocate a new stripe, and there are no stripes with
1868          * empty blocks for us to reuse, that means we have to wait on copygc:
1869          */
1870         idx = get_existing_stripe(c, h);
1871         if (idx < 0)
1872                 return -BCH_ERR_stripe_alloc_blocked;
1873
1874         ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
1875         bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
1876                              "reading stripe key: %s", bch2_err_str(ret));
1877         if (ret) {
1878                 bch2_stripe_close(c, h->s);
1879                 return ret;
1880         }
1881
1882         existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
1883
1884         BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
1885         h->s->nr_data = existing_v->nr_blocks -
1886                 existing_v->nr_redundant;
1887
1888         ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
1889         if (ret) {
1890                 bch2_stripe_close(c, h->s);
1891                 return ret;
1892         }
1893
1894         BUG_ON(h->s->existing_stripe.size != h->blocksize);
1895         BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
1896
1897         /*
1898          * Free buckets we initially allocated - they might conflict with
1899          * blocks from the stripe we're reusing:
1900          */
1901         for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
1902                 bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
1903                 h->s->blocks[i] = 0;
1904         }
1905         memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
1906         memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
1907
1908         for (i = 0; i < existing_v->nr_blocks; i++) {
1909                 if (stripe_blockcount_get(existing_v, i)) {
1910                         __set_bit(i, h->s->blocks_gotten);
1911                         __set_bit(i, h->s->blocks_allocated);
1912                 }
1913
1914                 ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
1915         }
1916
1917         bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
1918         h->s->have_existing_stripe = true;
1919
1920         return 0;
1921 }
1922
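/*
 * Reserve space for the new stripe and pick its position in the stripes
 * btree: scan forward from ec_stripe_hint for an empty slot, wrapping around
 * to offset 1 (position 0 is never a valid stripe) before giving up with
 * -BCH_ERR_ENOSPC_stripe_create:
 */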
1923 static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
1924 {
1925         struct bch_fs *c = trans->c;
1926         struct btree_iter iter;
1927         struct bkey_s_c k;
1928         struct bpos min_pos = POS(0, 1);
1929         struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
1930         int ret;
1931
1932         if (!h->s->res.sectors) {
1933                 ret = bch2_disk_reservation_get(c, &h->s->res,
1934                                         h->blocksize,
1935                                         h->s->nr_parity,
1936                                         BCH_DISK_RESERVATION_NOFAIL);
1937                 if (ret)
1938                         return ret;
1939         }
1940
1941         for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
1942                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
1943                 if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
1944                         if (start_pos.offset) {
1945                                 start_pos = min_pos;
1946                                 bch2_btree_iter_set_pos(&iter, start_pos);
1947                                 continue;
1948                         }
1949
1950                         ret = -BCH_ERR_ENOSPC_stripe_create;
1951                         break;
1952                 }
1953
1954                 if (bkey_deleted(k.k) &&
1955                     bch2_try_open_stripe(c, h->s, k.k->p.offset))
1956                         break;
1957         }
1958
1959         c->ec_stripe_hint = iter.pos.offset;
1960
1961         if (ret)
1962                 goto err;
1963
1964         ret = ec_stripe_mem_alloc(trans, &iter);
1965         if (ret) {
1966                 bch2_stripe_close(c, h->s);
1967                 goto err;
1968         }
1969
1970         h->s->new_stripe.key.k.p = iter.pos;
1971 out:
1972         bch2_trans_iter_exit(trans, &iter);
1973         return ret;
1974 err:
1975         bch2_disk_reservation_put(c, &h->s->res);
1976         goto out;
1977 }
1978
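/*
 * Main entry point for stripe allocation: first try to allocate a full new
 * stripe at the stripe watermark, then fall back to reusing an existing
 * stripe with empty blocks. copygc is special cased because it is what frees
 * up blocks in existing stripes; it can't wait on itself, so it retries the
 * full allocation at its own watermark instead:
 */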
1979 struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
1980                                                unsigned target,
1981                                                unsigned algo,
1982                                                unsigned redundancy,
1983                                                enum bch_watermark watermark,
1984                                                struct closure *cl)
1985 {
1986         struct bch_fs *c = trans->c;
1987         struct ec_stripe_head *h;
1988         bool waiting = false;
1989         int ret;
1990
1991         h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
1992         if (IS_ERR_OR_NULL(h))
1993                 return h;
1994
1995         if (!h->s) {
1996                 ret = ec_new_stripe_alloc(c, h);
1997                 if (ret) {
1998                         bch_err(c, "failed to allocate new stripe");
1999                         goto err;
2000                 }
2001         }
2002
2003         if (h->s->allocated)
2004                 goto allocated;
2005
2006         if (h->s->have_existing_stripe)
2007                 goto alloc_existing;
2008
2009         /* First, try to allocate a full stripe: */
2010         ret =   new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
2011                 __bch2_ec_stripe_head_reserve(trans, h);
2012         if (!ret)
2013                 goto allocate_buf;
2014         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
2015             bch2_err_matches(ret, ENOMEM))
2016                 goto err;
2017
2018         /*
2019          * Not enough buckets available for a full stripe: we must reuse an
2020          * existing stripe:
2021          */
2022         while (1) {
2023                 ret = __bch2_ec_stripe_head_reuse(trans, h);
2024                 if (!ret)
2025                         break;
2026                 if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
2027                         goto err;
2028
2029                 if (watermark == BCH_WATERMARK_copygc) {
2030                         ret =   new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
2031                                 __bch2_ec_stripe_head_reserve(trans, h);
2032                         if (ret)
2033                                 goto err;
2034                         goto allocate_buf;
2035                 }
2036
2037                 /* XXX freelist_wait? */
2038                 closure_wait(&c->freelist_wait, cl);
2039                 waiting = true;
2040         }
2041
2042         if (waiting)
2043                 closure_wake_up(&c->freelist_wait);
2044 alloc_existing:
2045         /*
2046          * Retry allocating buckets, with the watermark for this
2047          * particular write:
2048          */
2049         ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
2050         if (ret)
2051                 goto err;
2052
2053 allocate_buf:
2054         ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
2055         if (ret)
2056                 goto err;
2057
2058         h->s->allocated = true;
2059 allocated:
2060         BUG_ON(!h->s->idx);
2061         BUG_ON(!h->s->new_stripe.data[0]);
2062         BUG_ON(trans->restarted);
2063         return h;
2064 err:
2065         bch2_ec_stripe_head_put(c, h);
2066         return ERR_PTR(ret);
2067 }
2068
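/*
 * Error out any in-progress stripe with a block on @ca (or every in-progress
 * stripe, when @ca is NULL) and hand it off for teardown; used when a device
 * is stopped or the filesystem is going read-only:
 */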
2069 static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
2070 {
2071         struct ec_stripe_head *h;
2072         struct open_bucket *ob;
2073         unsigned i;
2074
2075         mutex_lock(&c->ec_stripe_head_lock);
2076         list_for_each_entry(h, &c->ec_stripe_head_list, list) {
2077                 mutex_lock(&h->lock);
2078                 if (!h->s)
2079                         goto unlock;
2080
2081                 if (!ca)
2082                         goto found;
2083
2084                 for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
2085                         if (!h->s->blocks[i])
2086                                 continue;
2087
2088                         ob = c->open_buckets + h->s->blocks[i];
2089                         if (ob->dev == ca->dev_idx)
2090                                 goto found;
2091                 }
2092                 goto unlock;
2093 found:
2094                 h->s->err = -BCH_ERR_erofs_no_writes;
2095                 ec_stripe_set_pending(c, h);
2096 unlock:
2097                 mutex_unlock(&h->lock);
2098         }
2099         mutex_unlock(&c->ec_stripe_head_lock);
2100 }
2101
2102 void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
2103 {
2104         __bch2_ec_stop(c, ca);
2105 }
2106
2107 void bch2_fs_ec_stop(struct bch_fs *c)
2108 {
2109         __bch2_ec_stop(c, NULL);
2110 }
2111
2112 static bool bch2_fs_ec_flush_done(struct bch_fs *c)
2113 {
2114         bool ret;
2115
2116         mutex_lock(&c->ec_stripe_new_lock);
2117         ret = list_empty(&c->ec_stripe_new_list);
2118         mutex_unlock(&c->ec_stripe_new_lock);
2119
2120         return ret;
2121 }
2122
2123 void bch2_fs_ec_flush(struct bch_fs *c)
2124 {
2125         wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
2126 }
2127
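/*
 * At startup, rebuild the in-memory stripe state (the c->stripes radix tree
 * and the stripes heap) from the stripes btree:
 */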
2128 int bch2_stripes_read(struct bch_fs *c)
2129 {
2130         int ret = bch2_trans_run(c,
2131                 for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
2132                                    BTREE_ITER_PREFETCH, k, ({
2133                         if (k.k->type != KEY_TYPE_stripe)
2134                                 continue;
2135
2136                         ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
2137                         if (ret)
2138                                 break;
2139
2140                         const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
2141
2142                         struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
2143                         m->sectors      = le16_to_cpu(s->sectors);
2144                         m->algorithm    = s->algorithm;
2145                         m->nr_blocks    = s->nr_blocks;
2146                         m->nr_redundant = s->nr_redundant;
2147                         m->blocks_nonempty = 0;
2148
2149                         for (unsigned i = 0; i < s->nr_blocks; i++)
2150                                 m->blocks_nonempty += !!stripe_blockcount_get(s, i);
2151
2152                         bch2_stripes_heap_insert(c, m, k.k->p.offset);
2153                         0;      /* loop body is an expression; zero means keep iterating */
2154                 })));
2155         bch_err_fn(c, ret);
2156         return ret;
2157 }
2158
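/* Debug output: print at most the first 50 stripes heap entries: */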
2159 void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
2160 {
2161         ec_stripes_heap *h = &c->ec_stripes_heap;
2162         struct stripe *m;
2163         size_t i;
2164
2165         mutex_lock(&c->ec_stripes_heap_lock);
2166         for (i = 0; i < min_t(size_t, h->used, 50); i++) {
2167                 m = genradix_ptr(&c->stripes, h->data[i].idx);
2168
2169                 prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
2170                        h->data[i].blocks_nonempty,
2171                        m->nr_blocks - m->nr_redundant,
2172                        m->nr_redundant);
2173                 if (bch2_stripe_is_open(c, h->data[i].idx))
2174                         prt_str(out, " open");
2175                 prt_newline(out);
2176         }
2177         mutex_unlock(&c->ec_stripes_heap_lock);
2178 }
2179
2180 void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
2181 {
2182         struct ec_stripe_head *h;
2183         struct ec_stripe_new *s;
2184
2185         mutex_lock(&c->ec_stripe_head_lock);
2186         list_for_each_entry(h, &c->ec_stripe_head_list, list) {
2187                 prt_printf(out, "target %u algo %u redundancy %u %s:\n",
2188                        h->target, h->algo, h->redundancy,
2189                        bch2_watermarks[h->watermark]);
2190
2191                 if (h->s)
2192                         prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
2193                                h->s->idx, h->s->nr_data, h->s->nr_parity,
2194                                bitmap_weight(h->s->blocks_allocated,
2195                                              h->s->nr_data));
2196         }
2197         mutex_unlock(&c->ec_stripe_head_lock);
2198
2199         prt_printf(out, "in flight:\n");
2200
2201         mutex_lock(&c->ec_stripe_new_lock);
2202         list_for_each_entry(s, &c->ec_stripe_new_list, list) {
2203                 prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
2204                            s->idx, s->nr_data, s->nr_parity,
2205                            atomic_read(&s->ref[STRIPE_REF_io]),
2206                            atomic_read(&s->ref[STRIPE_REF_stripe]),
2207                            bch2_watermarks[s->h->watermark]);
2208         }
2209         mutex_unlock(&c->ec_stripe_new_lock);
2210 }
2211
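/*
 * Tear down erasure coding state at shutdown; a stripe head that still has
 * open buckets at this point would be a refcounting bug, hence the BUG_ON:
 */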
2212 void bch2_fs_ec_exit(struct bch_fs *c)
2213 {
2214         struct ec_stripe_head *h;
2215         unsigned i;
2216
2217         while (1) {
2218                 mutex_lock(&c->ec_stripe_head_lock);
2219                 h = list_first_entry_or_null(&c->ec_stripe_head_list,
2220                                              struct ec_stripe_head, list);
2221                 if (h)
2222                         list_del(&h->list);
2223                 mutex_unlock(&c->ec_stripe_head_lock);
2224                 if (!h)
2225                         break;
2226
2227                 if (h->s) {
2228                         for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
2229                                 BUG_ON(h->s->blocks[i]);
2230
2231                         kfree(h->s);
2232                 }
2233                 kfree(h);
2234         }
2235
2236         BUG_ON(!list_empty(&c->ec_stripe_new_list));
2237
2238         free_heap(&c->ec_stripes_heap);
2239         genradix_free(&c->stripes);
2240         bioset_exit(&c->ec_bioset);
2241 }
2242
2243 void bch2_fs_ec_init_early(struct bch_fs *c)
2244 {
2245         spin_lock_init(&c->ec_stripes_new_lock);
2246         mutex_init(&c->ec_stripes_heap_lock);
2247
2248         INIT_LIST_HEAD(&c->ec_stripe_head_list);
2249         mutex_init(&c->ec_stripe_head_lock);
2250
2251         INIT_LIST_HEAD(&c->ec_stripe_new_list);
2252         mutex_init(&c->ec_stripe_new_lock);
2253         init_waitqueue_head(&c->ec_stripe_new_wait);
2254
2255         INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
2256         INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
2257 }
2258
2259 int bch2_fs_ec_init(struct bch_fs *c)
2260 {
2261         return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
2262                            BIOSET_NEED_BVECS);
2263 }