drivers/md/raid5.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * raid5.c : Multiple Devices driver for Linux
   4  *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   5  *         Copyright (C) 1999, 2000 Ingo Molnar
   6  *         Copyright (C) 2002, 2003 H. Peter Anvin
   7  *
   8  * RAID-4/5/6 management functions.
   9  * Thanks to Penguin Computing for making the RAID-6 development possible
  10  * by donating a test server!
  11  */
  12
  13 /*
  14  * BITMAP UNPLUGGING:
  15  *
  16  * The sequencing for updating the bitmap reliably is a little
  17  * subtle (and I got it wrong the first time) so it deserves some
  18  * explanation.
  19  *
  20  * We group bitmap updates into batches.  Each batch has a number.
  21  * We may write out several batches at once, but that isn't very important.
  22  * conf->seq_write is the number of the last batch successfully written.
  23  * conf->seq_flush is the number of the last batch that was closed to
  24  *    new additions.
  25  * When we discover that we will need to write to any block in a stripe
  26  * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
  27  * the number of the batch it will be in. This is seq_flush+1.
  28  * When we are ready to do a write, if that batch hasn't been written yet,
  29  *   we plug the array and queue the stripe for later.
  30  * When an unplug happens, we increment bm_flush, thus closing the current
  31  *   batch.
  32  * When we notice that bm_flush > bm_write, we write out all pending updates
  33  * to the bitmap, and advance bm_write to where bm_flush was.
  34  * This may occasionally write a bit out twice, but is sure never to
  35  * miss any bits.
  36  */
  37
  38 #include <linux/blkdev.h>
  39 #include <linux/delay.h>
  40 #include <linux/kthread.h>
  41 #include <linux/raid/pq.h>
  42 #include <linux/async_tx.h>
  43 #include <linux/module.h>
  44 #include <linux/async.h>
  45 #include <linux/seq_file.h>
  46 #include <linux/cpu.h>
  47 #include <linux/slab.h>
  48 #include <linux/ratelimit.h>
  49 #include <linux/nodemask.h>
  50
  51 #include <trace/events/block.h>
  52 #include <linux/list_sort.h>
  53
  54 #include "md.h"
  55 #include "raid5.h"
  56 #include "raid0.h"
  57 #include "md-bitmap.h"
  58 #include "raid5-log.h"
  59
  60 #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
  61
  62 #define cpu_to_group(cpu) cpu_to_node(cpu)
  63 #define ANY_GROUP NUMA_NO_NODE
  64
  65 #define RAID5_MAX_REQ_STRIPES 256
  66
  67 static bool devices_handle_discard_safely = false;
  68 module_param(devices_handle_discard_safely, bool, 0644);
  69 MODULE_PARM_DESC(devices_handle_discard_safely,
  70                  "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
  71 static struct workqueue_struct *raid5_wq;
  72
  73 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
  74 {
  75         int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
  76         return &conf->stripe_hashtbl[hash];
  77 }
  78
  79 static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
  80 {
  81         return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
  82 }
  83
  84 static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
  85         __acquires(&conf->device_lock)
  86 {
  87         spin_lock_irq(conf->hash_locks + hash);
  88         spin_lock(&conf->device_lock);
  89 }
  90
  91 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
  92         __releases(&conf->device_lock)
  93 {
  94         spin_unlock(&conf->device_lock);
  95         spin_unlock_irq(conf->hash_locks + hash);
  96 }
  97
  98 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
  99         __acquires(&conf->device_lock)
 100 {
 101         int i;
 102         spin_lock_irq(conf->hash_locks);
 103         for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
 104                 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
 105         spin_lock(&conf->device_lock);
 106 }
 107
 108 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
 109         __releases(&conf->device_lock)
 110 {
 111         int i;
 112         spin_unlock(&conf->device_lock);
 113         for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
 114                 spin_unlock(conf->hash_locks + i);
 115         spin_unlock_irq(conf->hash_locks);
 116 }
 117
 118 /* Find first data disk in a raid6 stripe */
 119 static inline int raid6_d0(struct stripe_head *sh)
 120 {
 121         if (sh->ddf_layout)
 122                 /* ddf always start from first device */
 123                 return 0;
 124         /* md starts just after Q block */
 125         if (sh->qd_idx == sh->disks - 1)
 126                 return 0;
 127         else
 128                 return sh->qd_idx + 1;
 129 }
 130 static inline int raid6_next_disk(int disk, int raid_disks)
 131 {
 132         disk++;
 133         return (disk < raid_disks) ? disk : 0;
 134 }
 135
 136 /* When walking through the disks in a raid5, starting at raid6_d0,
 137  * We need to map each disk to a 'slot', where the data disks are slot
 138  * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 139  * is raid_disks-1.  This help does that mapping.
 140  */
 141 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 142                              int *count, int syndrome_disks)
 143 {
 144         int slot = *count;
 145
 146         if (sh->ddf_layout)
 147                 (*count)++;
 148         if (idx == sh->pd_idx)
 149                 return syndrome_disks;
 150         if (idx == sh->qd_idx)
 151                 return syndrome_disks + 1;
 152         if (!sh->ddf_layout)
 153                 (*count)++;
 154         return slot;
 155 }
 156
 157 static void print_raid5_conf (struct r5conf *conf);
 158
 159 static int stripe_operations_active(struct stripe_head *sh)
 160 {
 161         return sh->check_state || sh->reconstruct_state ||
 162                test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
 163                test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 164 }
 165
 166 static bool stripe_is_lowprio(struct stripe_head *sh)
 167 {
 168         return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
 169                 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
 170                !test_bit(STRIPE_R5C_CACHING, &sh->state);
 171 }
 172
 173 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 174         __must_hold(&sh->raid_conf->device_lock)
 175 {
 176         struct r5conf *conf = sh->raid_conf;
 177         struct r5worker_group *group;
 178         int thread_cnt;
 179         int i, cpu = sh->cpu;
 180
 181         if (!cpu_online(cpu)) {
 182                 cpu = cpumask_any(cpu_online_mask);
 183                 sh->cpu = cpu;
 184         }
 185
 186         if (list_empty(&sh->lru)) {
 187                 struct r5worker_group *group;
 188                 group = conf->worker_groups + cpu_to_group(cpu);
 189                 if (stripe_is_lowprio(sh))
 190                         list_add_tail(&sh->lru, &group->loprio_list);
 191                 else
 192                         list_add_tail(&sh->lru, &group->handle_list);
 193                 group->stripes_cnt++;
 194                 sh->group = group;
 195         }
 196
 197         if (conf->worker_cnt_per_group == 0) {
 198                 md_wakeup_thread(conf->mddev->thread);
 199                 return;
 200         }
 201
 202         group = conf->worker_groups + cpu_to_group(sh->cpu);
 203
 204         group->workers[0].working = true;
 205         /* at least one worker should run to avoid race */
 206         queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
 207
 208         thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
 209         /* wakeup more workers */
 210         for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
 211                 if (group->workers[i].working == false) {
 212                         group->workers[i].working = true;
 213                         queue_work_on(sh->cpu, raid5_wq,
 214                                       &group->workers[i].work);
 215                         thread_cnt--;
 216                 }
 217         }
 218 }
 219
 220 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 221                               struct list_head *temp_inactive_list)
 222         __must_hold(&conf->device_lock)
 223 {
 224         int i;
 225         int injournal = 0;      /* number of date pages with R5_InJournal */
 226
 227         BUG_ON(!list_empty(&sh->lru));
 228         BUG_ON(atomic_read(&conf->active_stripes)==0);
 229
 230         if (r5c_is_writeback(conf->log))
 231                 for (i = sh->disks; i--; )
 232                         if (test_bit(R5_InJournal, &sh->dev[i].flags))
 233                                 injournal++;
 234         /*
 235          * In the following cases, the stripe cannot be released to cached
 236          * lists. Therefore, we make the stripe write out and set
 237          * STRIPE_HANDLE:
 238          *   1. when quiesce in r5c write back;
 239          *   2. when resync is requested fot the stripe.
 240          */
 241         if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
 242             (conf->quiesce && r5c_is_writeback(conf->log) &&
 243              !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
 244                 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 245                         r5c_make_stripe_write_out(sh);
 246                 set_bit(STRIPE_HANDLE, &sh->state);
 247         }
 248
 249         if (test_bit(STRIPE_HANDLE, &sh->state)) {
 250                 if (test_bit(STRIPE_DELAYED, &sh->state) &&
 251                     !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 252                         list_add_tail(&sh->lru, &conf->delayed_list);
 253                 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 254                            sh->bm_seq - conf->seq_write > 0)
 255                         list_add_tail(&sh->lru, &conf->bitmap_list);
 256                 else {
 257                         clear_bit(STRIPE_DELAYED, &sh->state);
 258                         clear_bit(STRIPE_BIT_DELAY, &sh->state);
 259                         if (conf->worker_cnt_per_group == 0) {
 260                                 if (stripe_is_lowprio(sh))
 261                                         list_add_tail(&sh->lru,
 262                                                         &conf->loprio_list);
 263                                 else
 264                                         list_add_tail(&sh->lru,
 265                                                         &conf->handle_list);
 266                         } else {
 267                                 raid5_wakeup_stripe_thread(sh);
 268                                 return;
 269                         }
 270                 }
 271                 md_wakeup_thread(conf->mddev->thread);
 272         } else {
 273                 BUG_ON(stripe_operations_active(sh));
 274                 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 275                         if (atomic_dec_return(&conf->preread_active_stripes)
 276                             < IO_THRESHOLD)
 277                                 md_wakeup_thread(conf->mddev->thread);
 278                 atomic_dec(&conf->active_stripes);
 279                 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 280                         if (!r5c_is_writeback(conf->log))
 281                                 list_add_tail(&sh->lru, temp_inactive_list);
 282                         else {
 283                                 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
 284                                 if (injournal == 0)
 285                                         list_add_tail(&sh->lru, temp_inactive_list);
 286                                 else if (injournal == conf->raid_disks - conf->max_degraded) {
 287                                         /* full stripe */
 288                                         if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
 289                                                 atomic_inc(&conf->r5c_cached_full_stripes);
 290                                         if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
 291                                                 atomic_dec(&conf->r5c_cached_partial_stripes);
 292                                         list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
 293                                         r5c_check_cached_full_stripe(conf);
 294                                 } else
 295                                         /*
 296                                          * STRIPE_R5C_PARTIAL_STRIPE is set in
 297                                          * r5c_try_caching_write(). No need to
 298                                          * set it again.
 299                                          */
 300                                         list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
 301                         }
 302                 }
 303         }
 304 }
 305
 306 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
 307                              struct list_head *temp_inactive_list)
 308         __must_hold(&conf->device_lock)
 309 {
 310         if (atomic_dec_and_test(&sh->count))
 311                 do_release_stripe(conf, sh, temp_inactive_list);
 312 }
 313
 314 /*
 315  * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
 316  *
 317  * Be careful: Only one task can add/delete stripes from temp_inactive_list at
 318  * given time. Adding stripes only takes device lock, while deleting stripes
 319  * only takes hash lock.
 320  */
 321 static void release_inactive_stripe_list(struct r5conf *conf,
 322                                          struct list_head *temp_inactive_list,
 323                                          int hash)
 324 {
 325         int size;
 326         bool do_wakeup = false;
 327         unsigned long flags;
 328
 329         if (hash == NR_STRIPE_HASH_LOCKS) {
 330                 size = NR_STRIPE_HASH_LOCKS;
 331                 hash = NR_STRIPE_HASH_LOCKS - 1;
 332         } else
 333                 size = 1;
 334         while (size) {
 335                 struct list_head *list = &temp_inactive_list[size - 1];
 336
 337                 /*
 338                  * We don't hold any lock here yet, raid5_get_active_stripe() might
 339                  * remove stripes from the list
 340                  */
 341                 if (!list_empty_careful(list)) {
 342                         spin_lock_irqsave(conf->hash_locks + hash, flags);
 343                         if (list_empty(conf->inactive_list + hash) &&
 344                             !list_empty(list))
 345                                 atomic_dec(&conf->empty_inactive_list_nr);
 346                         list_splice_tail_init(list, conf->inactive_list + hash);
 347                         do_wakeup = true;
 348                         spin_unlock_irqrestore(conf->hash_locks + hash, flags);
 349                 }
 350                 size--;
 351                 hash--;
 352         }
 353
 354         if (do_wakeup) {
 355                 wake_up(&conf->wait_for_stripe);
 356                 if (atomic_read(&conf->active_stripes) == 0)
 357                         wake_up(&conf->wait_for_quiescent);
 358                 if (conf->retry_read_aligned)
 359                         md_wakeup_thread(conf->mddev->thread);
 360         }
 361 }
 362
 363 static int release_stripe_list(struct r5conf *conf,
 364                                struct list_head *temp_inactive_list)
 365         __must_hold(&conf->device_lock)
 366 {
 367         struct stripe_head *sh, *t;
 368         int count = 0;
 369         struct llist_node *head;
 370
 371         head = llist_del_all(&conf->released_stripes);
 372         head = llist_reverse_order(head);
 373         llist_for_each_entry_safe(sh, t, head, release_list) {
 374                 int hash;
 375
 376                 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
 377                 smp_mb();
 378                 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
 379                 /*
 380                  * Don't worry the bit is set here, because if the bit is set
 381                  * again, the count is always > 1. This is true for
 382                  * STRIPE_ON_UNPLUG_LIST bit too.
 383                  */
 384                 hash = sh->hash_lock_index;
 385                 __release_stripe(conf, sh, &temp_inactive_list[hash]);
 386                 count++;
 387         }
 388
 389         return count;
 390 }
 391
 392 void raid5_release_stripe(struct stripe_head *sh)
 393 {
 394         struct r5conf *conf = sh->raid_conf;
 395         unsigned long flags;
 396         struct list_head list;
 397         int hash;
 398         bool wakeup;
 399
 400         /* Avoid release_list until the last reference.
 401          */
 402         if (atomic_add_unless(&sh->count, -1, 1))
 403                 return;
 404
 405         if (unlikely(!conf->mddev->thread) ||
 406                 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
 407                 goto slow_path;
 408         wakeup = llist_add(&sh->release_list, &conf->released_stripes);
 409         if (wakeup)
 410                 md_wakeup_thread(conf->mddev->thread);
 411         return;
 412 slow_path:
 413         /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
 414         if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
 415                 INIT_LIST_HEAD(&list);
 416                 hash = sh->hash_lock_index;
 417                 do_release_stripe(conf, sh, &list);
 418                 spin_unlock_irqrestore(&conf->device_lock, flags);
 419                 release_inactive_stripe_list(conf, &list, hash);
 420         }
 421 }
 422
 423 static inline void remove_hash(struct stripe_head *sh)
 424 {
 425         pr_debug("remove_hash(), stripe %llu\n",
 426                 (unsigned long long)sh->sector);
 427
 428         hlist_del_init(&sh->hash);
 429 }
 430
 431 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
 432 {
 433         struct hlist_head *hp = stripe_hash(conf, sh->sector);
 434
 435         pr_debug("insert_hash(), stripe %llu\n",
 436                 (unsigned long long)sh->sector);
 437
 438         hlist_add_head(&sh->hash, hp);
 439 }
 440
 441 /* find an idle stripe, make sure it is unhashed, and return it. */
 442 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
 443 {
 444         struct stripe_head *sh = NULL;
 445         struct list_head *first;
 446
 447         if (list_empty(conf->inactive_list + hash))
 448                 goto out;
 449         first = (conf->inactive_list + hash)->next;
 450         sh = list_entry(first, struct stripe_head, lru);
 451         list_del_init(first);
 452         remove_hash(sh);
 453         atomic_inc(&conf->active_stripes);
 454         BUG_ON(hash != sh->hash_lock_index);
 455         if (list_empty(conf->inactive_list + hash))
 456                 atomic_inc(&conf->empty_inactive_list_nr);
 457 out:
 458         return sh;
 459 }
 460
 461 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
 462 static void free_stripe_pages(struct stripe_head *sh)
 463 {
 464         int i;
 465         struct page *p;
 466
 467         /* Have not allocate page pool */
 468         if (!sh->pages)
 469                 return;
 470
 471         for (i = 0; i < sh->nr_pages; i++) {
 472                 p = sh->pages[i];
 473                 if (p)
 474                         put_page(p);
 475                 sh->pages[i] = NULL;
 476         }
 477 }
 478
 479 static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
 480 {
 481         int i;
 482         struct page *p;
 483
 484         for (i = 0; i < sh->nr_pages; i++) {
 485                 /* The page have allocated. */
 486                 if (sh->pages[i])
 487                         continue;
 488
 489                 p = alloc_page(gfp);
 490                 if (!p) {
 491                         free_stripe_pages(sh);
 492                         return -ENOMEM;
 493                 }
 494                 sh->pages[i] = p;
 495         }
 496         return 0;
 497 }
 498
 499 static int
 500 init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
 501 {
 502         int nr_pages, cnt;
 503
 504         if (sh->pages)
 505                 return 0;
 506
 507         /* Each of the sh->dev[i] need one conf->stripe_size */
 508         cnt = PAGE_SIZE / conf->stripe_size;
 509         nr_pages = (disks + cnt - 1) / cnt;
 510
 511         sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
 512         if (!sh->pages)
 513                 return -ENOMEM;
 514         sh->nr_pages = nr_pages;
 515         sh->stripes_per_page = cnt;
 516         return 0;
 517 }
 518 #endif
 519
 520 static void shrink_buffers(struct stripe_head *sh)
 521 {
 522         int i;
 523         int num = sh->raid_conf->pool_size;
 524
 525 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE
 526         for (i = 0; i < num ; i++) {
 527                 struct page *p;
 528
 529                 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
 530                 p = sh->dev[i].page;
 531                 if (!p)
 532                         continue;
 533                 sh->dev[i].page = NULL;
 534                 put_page(p);
 535         }
 536 #else
 537         for (i = 0; i < num; i++)
 538                 sh->dev[i].page = NULL;
 539         free_stripe_pages(sh); /* Free pages */
 540 #endif
 541 }
 542
 543 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 544 {
 545         int i;
 546         int num = sh->raid_conf->pool_size;
 547
 548 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE
 549         for (i = 0; i < num; i++) {
 550                 struct page *page;
 551
 552                 if (!(page = alloc_page(gfp))) {
 553                         return 1;
 554                 }
 555                 sh->dev[i].page = page;
 556                 sh->dev[i].orig_page = page;
 557                 sh->dev[i].offset = 0;
 558         }
 559 #else
 560         if (alloc_stripe_pages(sh, gfp))
 561                 return -ENOMEM;
 562
 563         for (i = 0; i < num; i++) {
 564                 sh->dev[i].page = raid5_get_dev_page(sh, i);
 565                 sh->dev[i].orig_page = sh->dev[i].page;
 566                 sh->dev[i].offset = raid5_get_page_offset(sh, i);
 567         }
 568 #endif
 569         return 0;
 570 }
 571
 572 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 573                             struct stripe_head *sh);
 574
 575 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 576 {
 577         struct r5conf *conf = sh->raid_conf;
 578         int i, seq;
 579
 580         BUG_ON(atomic_read(&sh->count) != 0);
 581         BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
 582         BUG_ON(stripe_operations_active(sh));
 583         BUG_ON(sh->batch_head);
 584
 585         pr_debug("init_stripe called, stripe %llu\n",
 586                 (unsigned long long)sector);
 587 retry:
 588         seq = read_seqcount_begin(&conf->gen_lock);
 589         sh->generation = conf->generation - previous;
 590         sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 591         sh->sector = sector;
 592         stripe_set_idx(sector, conf, previous, sh);
 593         sh->state = 0;
 594
 595         for (i = sh->disks; i--; ) {
 596                 struct r5dev *dev = &sh->dev[i];
 597
 598                 if (dev->toread || dev->read || dev->towrite || dev->written ||
 599                     test_bit(R5_LOCKED, &dev->flags)) {
 600                         pr_err("sector=%llx i=%d %p %p %p %p %d\n",
 601                                (unsigned long long)sh->sector, i, dev->toread,
 602                                dev->read, dev->towrite, dev->written,
 603                                test_bit(R5_LOCKED, &dev->flags));
 604                         WARN_ON(1);
 605                 }
 606                 dev->flags = 0;
 607                 dev->sector = raid5_compute_blocknr(sh, i, previous);
 608         }
 609         if (read_seqcount_retry(&conf->gen_lock, seq))
 610                 goto retry;
 611         sh->overwrite_disks = 0;
 612         insert_hash(conf, sh);
 613         sh->cpu = smp_processor_id();
 614         set_bit(STRIPE_BATCH_READY, &sh->state);
 615 }
 616
 617 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 618                                          short generation)
 619 {
 620         struct stripe_head *sh;
 621
 622         pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
 623         hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
 624                 if (sh->sector == sector && sh->generation == generation)
 625                         return sh;
 626         pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
 627         return NULL;
 628 }
 629
 630 static struct stripe_head *find_get_stripe(struct r5conf *conf,
 631                 sector_t sector, short generation, int hash)
 632 {
 633         int inc_empty_inactive_list_flag;
 634         struct stripe_head *sh;
 635
 636         sh = __find_stripe(conf, sector, generation);
 637         if (!sh)
 638                 return NULL;
 639
 640         if (atomic_inc_not_zero(&sh->count))
 641                 return sh;
 642
 643         /*
 644          * Slow path. The reference count is zero which means the stripe must
 645          * be on a list (sh->lru). Must remove the stripe from the list that
 646          * references it with the device_lock held.
 647          */
 648
 649         spin_lock(&conf->device_lock);
 650         if (!atomic_read(&sh->count)) {
 651                 if (!test_bit(STRIPE_HANDLE, &sh->state))
 652                         atomic_inc(&conf->active_stripes);
 653                 BUG_ON(list_empty(&sh->lru) &&
 654                        !test_bit(STRIPE_EXPANDING, &sh->state));
 655                 inc_empty_inactive_list_flag = 0;
 656                 if (!list_empty(conf->inactive_list + hash))
 657                         inc_empty_inactive_list_flag = 1;
 658                 list_del_init(&sh->lru);
 659                 if (list_empty(conf->inactive_list + hash) &&
 660                     inc_empty_inactive_list_flag)
 661                         atomic_inc(&conf->empty_inactive_list_nr);
 662                 if (sh->group) {
 663                         sh->group->stripes_cnt--;
 664                         sh->group = NULL;
 665                 }
 666         }
 667         atomic_inc(&sh->count);
 668         spin_unlock(&conf->device_lock);
 669
 670         return sh;
 671 }
 672
 673 /*
 674  * Need to check if array has failed when deciding whether to:
 675  *  - start an array
 676  *  - remove non-faulty devices
 677  *  - add a spare
 678  *  - allow a reshape
 679  * This determination is simple when no reshape is happening.
 680  * However if there is a reshape, we need to carefully check
 681  * both the before and after sections.
 682  * This is because some failed devices may only affect one
 683  * of the two sections, and some non-in_sync devices may
 684  * be insync in the section most affected by failed devices.
 685  *
 686  * Most calls to this function hold &conf->device_lock. Calls
 687  * in raid5_run() do not require the lock as no other threads
 688  * have been started yet.
 689  */
 690 int raid5_calc_degraded(struct r5conf *conf)
 691 {
 692         int degraded, degraded2;
 693         int i;
 694
 695         rcu_read_lock();
 696         degraded = 0;
 697         for (i = 0; i < conf->previous_raid_disks; i++) {
 698                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
 699                 if (rdev && test_bit(Faulty, &rdev->flags))
 700                         rdev = rcu_dereference(conf->disks[i].replacement);
 701                 if (!rdev || test_bit(Faulty, &rdev->flags))
 702                         degraded++;
 703                 else if (test_bit(In_sync, &rdev->flags))
 704                         ;
 705                 else
 706                         /* not in-sync or faulty.
 707                          * If the reshape increases the number of devices,
 708                          * this is being recovered by the reshape, so
 709                          * this 'previous' section is not in_sync.
 710                          * If the number of devices is being reduced however,
 711                          * the device can only be part of the array if
 712                          * we are reverting a reshape, so this section will
 713                          * be in-sync.
 714                          */
 715                         if (conf->raid_disks >= conf->previous_raid_disks)
 716                                 degraded++;
 717         }
 718         rcu_read_unlock();
 719         if (conf->raid_disks == conf->previous_raid_disks)
 720                 return degraded;
 721         rcu_read_lock();
 722         degraded2 = 0;
 723         for (i = 0; i < conf->raid_disks; i++) {
 724                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
 725                 if (rdev && test_bit(Faulty, &rdev->flags))
 726                         rdev = rcu_dereference(conf->disks[i].replacement);
 727                 if (!rdev || test_bit(Faulty, &rdev->flags))
 728                         degraded2++;
 729                 else if (test_bit(In_sync, &rdev->flags))
 730                         ;
 731                 else
 732                         /* not in-sync or faulty.
 733                          * If reshape increases the number of devices, this
 734                          * section has already been recovered, else it
 735                          * almost certainly hasn't.
 736                          */
 737                         if (conf->raid_disks <= conf->previous_raid_disks)
 738                                 degraded2++;
 739         }
 740         rcu_read_unlock();
 741         if (degraded2 > degraded)
 742                 return degraded2;
 743         return degraded;
 744 }
 745
 746 static bool has_failed(struct r5conf *conf)
 747 {
 748         int degraded = conf->mddev->degraded;
 749
 750         if (test_bit(MD_BROKEN, &conf->mddev->flags))
 751                 return true;
 752
 753         if (conf->mddev->reshape_position != MaxSector)
 754                 degraded = raid5_calc_degraded(conf);
 755
 756         return degraded > conf->max_degraded;
 757 }
 758
 759 enum stripe_result {
 760         STRIPE_SUCCESS = 0,
 761         STRIPE_RETRY,
 762         STRIPE_SCHEDULE_AND_RETRY,
 763         STRIPE_FAIL,
 764 };
 765
 766 struct stripe_request_ctx {
 767         /* a reference to the last stripe_head for batching */
 768         struct stripe_head *batch_last;
 769
 770         /* first sector in the request */
 771         sector_t first_sector;
 772
 773         /* last sector in the request */
 774         sector_t last_sector;
 775
 776         /*
 777          * bitmap to track stripe sectors that have been added to stripes
 778          * add one to account for unaligned requests
 779          */
 780         DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1);
 781
 782         /* the request had REQ_PREFLUSH, cleared after the first stripe_head */
 783         bool do_flush;
 784 };
 785
 786 /*
 787  * Block until another thread clears R5_INACTIVE_BLOCKED or
 788  * there are fewer than 3/4 the maximum number of active stripes
 789  * and there is an inactive stripe available.
 790  */
 791 static bool is_inactive_blocked(struct r5conf *conf, int hash)
 792 {
 793         if (list_empty(conf->inactive_list + hash))
 794                 return false;
 795
 796         if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
 797                 return true;
 798
 799         return (atomic_read(&conf->active_stripes) <
 800                 (conf->max_nr_stripes * 3 / 4));
 801 }
 802
 803 struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
 804                 struct stripe_request_ctx *ctx, sector_t sector,
 805                 unsigned int flags)
 806 {
 807         struct stripe_head *sh;
 808         int hash = stripe_hash_locks_hash(conf, sector);
 809         int previous = !!(flags & R5_GAS_PREVIOUS);
 810
 811         pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
 812
 813         spin_lock_irq(conf->hash_locks + hash);
 814
 815         for (;;) {
 816                 if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) {
 817                         /*
 818                          * Must release the reference to batch_last before
 819                          * waiting, on quiesce, otherwise the batch_last will
 820                          * hold a reference to a stripe and raid5_quiesce()
 821                          * will deadlock waiting for active_stripes to go to
 822                          * zero.
 823                          */
 824                         if (ctx && ctx->batch_last) {
 825                                 raid5_release_stripe(ctx->batch_last);
 826                                 ctx->batch_last = NULL;
 827                         }
 828
 829                         wait_event_lock_irq(conf->wait_for_quiescent,
 830                                             !conf->quiesce,
 831                                             *(conf->hash_locks + hash));
 832                 }
 833
 834                 sh = find_get_stripe(conf, sector, conf->generation - previous,
 835                                      hash);
 836                 if (sh)
 837                         break;
 838
 839                 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
 840                         sh = get_free_stripe(conf, hash);
 841                         if (sh) {
 842                                 r5c_check_stripe_cache_usage(conf);
 843                                 init_stripe(sh, sector, previous);
 844                                 atomic_inc(&sh->count);
 845                                 break;
 846                         }
 847
 848                         if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
 849                                 set_bit(R5_ALLOC_MORE, &conf->cache_state);
 850                 }
 851
 852                 if (flags & R5_GAS_NOBLOCK)
 853                         break;
 854
 855                 set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
 856                 r5l_wake_reclaim(conf->log, 0);
 857                 wait_event_lock_irq(conf->wait_for_stripe,
 858                                     is_inactive_blocked(conf, hash),
 859                                     *(conf->hash_locks + hash));
 860                 clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
 861         }
 862
 863         spin_unlock_irq(conf->hash_locks + hash);
 864         return sh;
 865 }
 866
 867 static bool is_full_stripe_write(struct stripe_head *sh)
 868 {
 869         BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
 870         return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
 871 }
 872
 873 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 874                 __acquires(&sh1->stripe_lock)
 875                 __acquires(&sh2->stripe_lock)
 876 {
 877         if (sh1 > sh2) {
 878                 spin_lock_irq(&sh2->stripe_lock);
 879                 spin_lock_nested(&sh1->stripe_lock, 1);
 880         } else {
 881                 spin_lock_irq(&sh1->stripe_lock);
 882                 spin_lock_nested(&sh2->stripe_lock, 1);
 883         }
 884 }
 885
 886 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 887                 __releases(&sh1->stripe_lock)
 888                 __releases(&sh2->stripe_lock)
 889 {
 890         spin_unlock(&sh1->stripe_lock);
 891         spin_unlock_irq(&sh2->stripe_lock);
 892 }
 893
 894 /* Only freshly new full stripe normal write stripe can be added to a batch list */
 895 static bool stripe_can_batch(struct stripe_head *sh)
 896 {
 897         struct r5conf *conf = sh->raid_conf;
 898
 899         if (raid5_has_log(conf) || raid5_has_ppl(conf))
 900                 return false;
 901         return test_bit(STRIPE_BATCH_READY, &sh->state) &&
 902                 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
 903                 is_full_stripe_write(sh);
 904 }
 905
 906 /* we only do back search */
 907 static void stripe_add_to_batch_list(struct r5conf *conf,
 908                 struct stripe_head *sh, struct stripe_head *last_sh)
 909 {
 910         struct stripe_head *head;
 911         sector_t head_sector, tmp_sec;
 912         int hash;
 913         int dd_idx;
 914
 915         /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
 916         tmp_sec = sh->sector;
 917         if (!sector_div(tmp_sec, conf->chunk_sectors))
 918                 return;
 919         head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
 920
 921         if (last_sh && head_sector == last_sh->sector) {
 922                 head = last_sh;
 923                 atomic_inc(&head->count);
 924         } else {
 925                 hash = stripe_hash_locks_hash(conf, head_sector);
 926                 spin_lock_irq(conf->hash_locks + hash);
 927                 head = find_get_stripe(conf, head_sector, conf->generation,
 928                                        hash);
 929                 spin_unlock_irq(conf->hash_locks + hash);
 930                 if (!head)
 931                         return;
 932                 if (!stripe_can_batch(head))
 933                         goto out;
 934         }
 935
 936         lock_two_stripes(head, sh);
 937         /* clear_batch_ready clear the flag */
 938         if (!stripe_can_batch(head) || !stripe_can_batch(sh))
 939                 goto unlock_out;
 940
 941         if (sh->batch_head)
 942                 goto unlock_out;
 943
 944         dd_idx = 0;
 945         while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
 946                 dd_idx++;
 947         if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
 948             bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
 949                 goto unlock_out;
 950
 951         if (head->batch_head) {
 952                 spin_lock(&head->batch_head->batch_lock);
 953                 /* This batch list is already running */
 954                 if (!stripe_can_batch(head)) {
 955                         spin_unlock(&head->batch_head->batch_lock);
 956                         goto unlock_out;
 957                 }
 958                 /*
 959                  * We must assign batch_head of this stripe within the
 960                  * batch_lock, otherwise clear_batch_ready of batch head
 961                  * stripe could clear BATCH_READY bit of this stripe and
 962                  * this stripe->batch_head doesn't get assigned, which
 963                  * could confuse clear_batch_ready for this stripe
 964                  */
 965                 sh->batch_head = head->batch_head;
 966
 967                 /*
 968                  * at this point, head's BATCH_READY could be cleared, but we
 969                  * can still add the stripe to batch list
 970                  */
 971                 list_add(&sh->batch_list, &head->batch_list);
 972                 spin_unlock(&head->batch_head->batch_lock);
 973         } else {
 974                 head->batch_head = head;
 975                 sh->batch_head = head->batch_head;
 976                 spin_lock(&head->batch_lock);
 977                 list_add_tail(&sh->batch_list, &head->batch_list);
 978                 spin_unlock(&head->batch_lock);
 979         }
 980
 981         if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 982                 if (atomic_dec_return(&conf->preread_active_stripes)
 983                     < IO_THRESHOLD)
 984                         md_wakeup_thread(conf->mddev->thread);
 985
 986         if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
 987                 int seq = sh->bm_seq;
 988                 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
 989                     sh->batch_head->bm_seq > seq)
 990                         seq = sh->batch_head->bm_seq;
 991                 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
 992                 sh->batch_head->bm_seq = seq;
 993         }
 994
 995         atomic_inc(&sh->count);
 996 unlock_out:
 997         unlock_two_stripes(head, sh);
 998 out:
 999         raid5_release_stripe(head);
1000 }
1001
1002 /* Determine if 'data_offset' or 'new_data_offset' should be used
1003  * in this stripe_head.
1004  */
1005 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
1006 {
1007         sector_t progress = conf->reshape_progress;
1008         /* Need a memory barrier to make sure we see the value
1009          * of conf->generation, or ->data_offset that was set before
1010          * reshape_progress was updated.
1011          */
1012         smp_rmb();
1013         if (progress == MaxSector)
1014                 return 0;
1015         if (sh->generation == conf->generation - 1)
1016                 return 0;
1017         /* We are in a reshape, and this is a new-generation stripe,
1018          * so use new_data_offset.
1019          */
1020         return 1;
1021 }
1022
1023 static void dispatch_bio_list(struct bio_list *tmp)
1024 {
1025         struct bio *bio;
1026
1027         while ((bio = bio_list_pop(tmp)))
1028                 submit_bio_noacct(bio);
1029 }
1030
1031 static int cmp_stripe(void *priv, const struct list_head *a,
1032                       const struct list_head *b)
1033 {
1034         const struct r5pending_data *da = list_entry(a,
1035                                 struct r5pending_data, sibling);
1036         const struct r5pending_data *db = list_entry(b,
1037                                 struct r5pending_data, sibling);
1038         if (da->sector > db->sector)
1039                 return 1;
1040         if (da->sector < db->sector)
1041                 return -1;
1042         return 0;
1043 }
1044
1045 static void dispatch_defer_bios(struct r5conf *conf, int target,
1046                                 struct bio_list *list)
1047 {
1048         struct r5pending_data *data;
1049         struct list_head *first, *next = NULL;
1050         int cnt = 0;
1051
1052         if (conf->pending_data_cnt == 0)
1053                 return;
1054
1055         list_sort(NULL, &conf->pending_list, cmp_stripe);
1056
1057         first = conf->pending_list.next;
1058
1059         /* temporarily move the head */
1060         if (conf->next_pending_data)
1061                 list_move_tail(&conf->pending_list,
1062                                 &conf->next_pending_data->sibling);
1063
1064         while (!list_empty(&conf->pending_list)) {
1065                 data = list_first_entry(&conf->pending_list,
1066                         struct r5pending_data, sibling);
1067                 if (&data->sibling == first)
1068                         first = data->sibling.next;
1069                 next = data->sibling.next;
1070
1071                 bio_list_merge(list, &data->bios);
1072                 list_move(&data->sibling, &conf->free_list);
1073                 cnt++;
1074                 if (cnt >= target)
1075                         break;
1076         }
1077         conf->pending_data_cnt -= cnt;
1078         BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
1079
1080         if (next != &conf->pending_list)
1081                 conf->next_pending_data = list_entry(next,
1082                                 struct r5pending_data, sibling);
1083         else
1084                 conf->next_pending_data = NULL;
1085         /* list isn't empty */
1086         if (first != &conf->pending_list)
1087                 list_move_tail(&conf->pending_list, first);
1088 }
1089
1090 static void flush_deferred_bios(struct r5conf *conf)
1091 {
1092         struct bio_list tmp = BIO_EMPTY_LIST;
1093
1094         if (conf->pending_data_cnt == 0)
1095                 return;
1096
1097         spin_lock(&conf->pending_bios_lock);
1098         dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
1099         BUG_ON(conf->pending_data_cnt != 0);
1100         spin_unlock(&conf->pending_bios_lock);
1101
1102         dispatch_bio_list(&tmp);
1103 }
1104
1105 static void defer_issue_bios(struct r5conf *conf, sector_t sector,
1106                                 struct bio_list *bios)
1107 {
1108         struct bio_list tmp = BIO_EMPTY_LIST;
1109         struct r5pending_data *ent;
1110
1111         spin_lock(&conf->pending_bios_lock);
1112         ent = list_first_entry(&conf->free_list, struct r5pending_data,
1113                                                         sibling);
1114         list_move_tail(&ent->sibling, &conf->pending_list);
1115         ent->sector = sector;
1116         bio_list_init(&ent->bios);
1117         bio_list_merge(&ent->bios, bios);
1118         conf->pending_data_cnt++;
1119         if (conf->pending_data_cnt >= PENDING_IO_MAX)
1120                 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
1121
1122         spin_unlock(&conf->pending_bios_lock);
1123
1124         dispatch_bio_list(&tmp);
1125 }
1126
1127 static void
1128 raid5_end_read_request(struct bio *bi);
1129 static void
1130 raid5_end_write_request(struct bio *bi);
1131
1132 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
1133 {
1134         struct r5conf *conf = sh->raid_conf;
1135         int i, disks = sh->disks;
1136         struct stripe_head *head_sh = sh;
1137         struct bio_list pending_bios = BIO_EMPTY_LIST;
1138         struct r5dev *dev;
1139         bool should_defer;
1140
1141         might_sleep();
1142
1143         if (log_stripe(sh, s) == 0)
1144                 return;
1145
1146         should_defer = conf->batch_bio_dispatch && conf->group_cnt;
1147
1148         for (i = disks; i--; ) {
1149                 enum req_op op;
1150                 blk_opf_t op_flags = 0;
1151                 int replace_only = 0;
1152                 struct bio *bi, *rbi;
1153                 struct md_rdev *rdev, *rrdev = NULL;
1154
1155                 sh = head_sh;
1156                 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
1157                         op = REQ_OP_WRITE;
1158                         if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
1159                                 op_flags = REQ_FUA;
1160                         if (test_bit(R5_Discard, &sh->dev[i].flags))
1161                                 op = REQ_OP_DISCARD;
1162                 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1163                         op = REQ_OP_READ;
1164                 else if (test_and_clear_bit(R5_WantReplace,
1165                                             &sh->dev[i].flags)) {
1166                         op = REQ_OP_WRITE;
1167                         replace_only = 1;
1168                 } else
1169                         continue;
1170                 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
1171                         op_flags |= REQ_SYNC;
1172
1173 again:
1174                 dev = &sh->dev[i];
1175                 bi = &dev->req;
1176                 rbi = &dev->rreq; /* For writing to replacement */
1177
1178                 rcu_read_lock();
1179                 rrdev = rcu_dereference(conf->disks[i].replacement);
1180                 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
1181                 rdev = rcu_dereference(conf->disks[i].rdev);
1182                 if (!rdev) {
1183                         rdev = rrdev;
1184                         rrdev = NULL;
1185                 }
1186                 if (op_is_write(op)) {
1187                         if (replace_only)
1188                                 rdev = NULL;
1189                         if (rdev == rrdev)
1190                                 /* We raced and saw duplicates */
1191                                 rrdev = NULL;
1192                 } else {
1193                         if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1194                                 rdev = rrdev;
1195                         rrdev = NULL;
1196                 }
1197
1198                 if (rdev && test_bit(Faulty, &rdev->flags))
1199                         rdev = NULL;
1200                 if (rdev)
1201                         atomic_inc(&rdev->nr_pending);
1202                 if (rrdev && test_bit(Faulty, &rrdev->flags))
1203                         rrdev = NULL;
1204                 if (rrdev)
1205                         atomic_inc(&rrdev->nr_pending);
1206                 rcu_read_unlock();
1207
1208                 /* We have already checked bad blocks for reads.  Now
1209                  * need to check for writes.  We never accept write errors
1210                  * on the replacement, so we don't to check rrdev.
1211                  */
1212                 while (op_is_write(op) && rdev &&
1213                        test_bit(WriteErrorSeen, &rdev->flags)) {
1214                         sector_t first_bad;
1215                         int bad_sectors;
1216                         int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
1217                                               &first_bad, &bad_sectors);
1218                         if (!bad)
1219                                 break;
1220
1221                         if (bad < 0) {
1222                                 set_bit(BlockedBadBlocks, &rdev->flags);
1223                                 if (!conf->mddev->external &&
1224                                     conf->mddev->sb_flags) {
1225                                         /* It is very unlikely, but we might
1226                                          * still need to write out the
1227                                          * bad block log - better give it
1228                                          * a chance*/
1229                                         md_check_recovery(conf->mddev);
1230                                 }
1231                                 /*
1232                                  * Because md_wait_for_blocked_rdev
1233                                  * will dec nr_pending, we must
1234                                  * increment it first.
1235                                  */
1236                                 atomic_inc(&rdev->nr_pending);
1237                                 md_wait_for_blocked_rdev(rdev, conf->mddev);
1238                         } else {
1239                                 /* Acknowledged bad block - skip the write */
1240                                 rdev_dec_pending(rdev, conf->mddev);
1241                                 rdev = NULL;
1242                         }
1243                 }
1244
1245                 if (rdev) {
1246                         if (s->syncing || s->expanding || s->expanded
1247                             || s->replacing)
1248                                 md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
1249
1250                         set_bit(STRIPE_IO_STARTED, &sh->state);
1251
1252                         bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
1253                         bi->bi_end_io = op_is_write(op)
1254                                 ? raid5_end_write_request
1255                                 : raid5_end_read_request;
1256                         bi->bi_private = sh;
1257
1258                         pr_debug("%s: for %llu schedule op %d on disc %d\n",
1259                                 __func__, (unsigned long long)sh->sector,
1260                                 bi->bi_opf, i);
1261                         atomic_inc(&sh->count);
1262                         if (sh != head_sh)
1263                                 atomic_inc(&head_sh->count);
1264                         if (use_new_offset(conf, sh))
1265                                 bi->bi_iter.bi_sector = (sh->sector
1266                                                  + rdev->new_data_offset);
1267                         else
1268                                 bi->bi_iter.bi_sector = (sh->sector
1269                                                  + rdev->data_offset);
1270                         if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1271                                 bi->bi_opf |= REQ_NOMERGE;
1272
1273                         if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1274                                 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1275
1276                         if (!op_is_write(op) &&
1277                             test_bit(R5_InJournal, &sh->dev[i].flags))
1278                                 /*
1279                                  * issuing read for a page in journal, this
1280                                  * must be preparing for prexor in rmw; read
1281                                  * the data into orig_page
1282                                  */
1283                                 sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1284                         else
1285                                 sh->dev[i].vec.bv_page = sh->dev[i].page;
1286                         bi->bi_vcnt = 1;
1287                         bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1288                         bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1289                         bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1290                         /*
1291                          * If this is discard request, set bi_vcnt 0. We don't
1292                          * want to confuse SCSI because SCSI will replace payload
1293                          */
1294                         if (op == REQ_OP_DISCARD)
1295                                 bi->bi_vcnt = 0;
1296                         if (rrdev)
1297                                 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1298
1299                         if (conf->mddev->gendisk)
1300                                 trace_block_bio_remap(bi,
1301                                                 disk_devt(conf->mddev->gendisk),
1302                                                 sh->dev[i].sector);
1303                         if (should_defer && op_is_write(op))
1304                                 bio_list_add(&pending_bios, bi);
1305                         else
1306                                 submit_bio_noacct(bi);
1307                 }
1308                 if (rrdev) {
1309                         if (s->syncing || s->expanding || s->expanded
1310                             || s->replacing)
1311                                 md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
1312
1313                         set_bit(STRIPE_IO_STARTED, &sh->state);
1314
1315                         bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);
1316                         BUG_ON(!op_is_write(op));
1317                         rbi->bi_end_io = raid5_end_write_request;
1318                         rbi->bi_private = sh;
1319
1320                         pr_debug("%s: for %llu schedule op %d on "
1321                                  "replacement disc %d\n",
1322                                 __func__, (unsigned long long)sh->sector,
1323                                 rbi->bi_opf, i);
1324                         atomic_inc(&sh->count);
1325                         if (sh != head_sh)
1326                                 atomic_inc(&head_sh->count);
1327                         if (use_new_offset(conf, sh))
1328                                 rbi->bi_iter.bi_sector = (sh->sector
1329                                                   + rrdev->new_data_offset);
1330                         else
1331                                 rbi->bi_iter.bi_sector = (sh->sector
1332                                                   + rrdev->data_offset);
1333                         if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1334                                 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1335                         sh->dev[i].rvec.bv_page = sh->dev[i].page;
1336                         rbi->bi_vcnt = 1;
1337                         rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1338                         rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1339                         rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1340                         /*
1341                          * If this is discard request, set bi_vcnt 0. We don't
1342                          * want to confuse SCSI because SCSI will replace payload
1343                          */
1344                         if (op == REQ_OP_DISCARD)
1345                                 rbi->bi_vcnt = 0;
1346                         if (conf->mddev->gendisk)
1347                                 trace_block_bio_remap(rbi,
1348                                                 disk_devt(conf->mddev->gendisk),
1349                                                 sh->dev[i].sector);
1350                         if (should_defer && op_is_write(op))
1351                                 bio_list_add(&pending_bios, rbi);
1352                         else
1353                                 submit_bio_noacct(rbi);
1354                 }
1355                 if (!rdev && !rrdev) {
1356                         if (op_is_write(op))
1357                                 set_bit(STRIPE_DEGRADED, &sh->state);
1358                         pr_debug("skip op %d on disc %d for sector %llu\n",
1359                                 bi->bi_opf, i, (unsigned long long)sh->sector);
1360                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
1361                         set_bit(STRIPE_HANDLE, &sh->state);
1362                 }
1363
1364                 if (!head_sh->batch_head)
1365                         continue;
1366                 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1367                                       batch_list);
1368                 if (sh != head_sh)
1369                         goto again;
1370         }
1371
1372         if (should_defer && !bio_list_empty(&pending_bios))
1373                 defer_issue_bios(conf, head_sh->sector, &pending_bios);
1374 }
1375
1376 static struct dma_async_tx_descriptor *
1377 async_copy_data(int frombio, struct bio *bio, struct page **page,
1378         unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
1379         struct stripe_head *sh, int no_skipcopy)
1380 {
1381         struct bio_vec bvl;
1382         struct bvec_iter iter;
1383         struct page *bio_page;
1384         int page_offset;
1385         struct async_submit_ctl submit;
1386         enum async_tx_flags flags = 0;
1387         struct r5conf *conf = sh->raid_conf;
1388
1389         if (bio->bi_iter.bi_sector >= sector)
1390                 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1391         else
1392                 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1393
1394         if (frombio)
1395                 flags |= ASYNC_TX_FENCE;
1396         init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1397
1398         bio_for_each_segment(bvl, bio, iter) {
1399                 int len = bvl.bv_len;
1400                 int clen;
1401                 int b_offset = 0;
1402
1403                 if (page_offset < 0) {
1404                         b_offset = -page_offset;
1405                         page_offset += b_offset;
1406                         len -= b_offset;
1407                 }
1408
1409                 if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
1410                         clen = RAID5_STRIPE_SIZE(conf) - page_offset;
1411                 else
1412                         clen = len;
1413
1414                 if (clen > 0) {
1415                         b_offset += bvl.bv_offset;
1416                         bio_page = bvl.bv_page;
1417                         if (frombio) {
1418                                 if (conf->skip_copy &&
1419                                     b_offset == 0 && page_offset == 0 &&
1420                                     clen == RAID5_STRIPE_SIZE(conf) &&
1421                                     !no_skipcopy)
1422                                         *page = bio_page;
1423                                 else
1424                                         tx = async_memcpy(*page, bio_page, page_offset + poff,
1425                                                   b_offset, clen, &submit);
1426                         } else
1427                                 tx = async_memcpy(bio_page, *page, b_offset,
1428                                                   page_offset + poff, clen, &submit);
1429                 }
1430                 /* chain the operations */
1431                 submit.depend_tx = tx;
1432
1433                 if (clen < len) /* hit end of page */
1434                         break;
1435                 page_offset +=  len;
1436         }
1437
1438         return tx;
1439 }
1440
1441 static void ops_complete_biofill(void *stripe_head_ref)
1442 {
1443         struct stripe_head *sh = stripe_head_ref;
1444         int i;
1445         struct r5conf *conf = sh->raid_conf;
1446
1447         pr_debug("%s: stripe %llu\n", __func__,
1448                 (unsigned long long)sh->sector);
1449
1450         /* clear completed biofills */
1451         for (i = sh->disks; i--; ) {
1452                 struct r5dev *dev = &sh->dev[i];
1453
1454                 /* acknowledge completion of a biofill operation */
1455                 /* and check if we need to reply to a read request,
1456                  * new R5_Wantfill requests are held off until
1457                  * !STRIPE_BIOFILL_RUN
1458                  */
1459                 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1460                         struct bio *rbi, *rbi2;
1461
1462                         BUG_ON(!dev->read);
1463                         rbi = dev->read;
1464                         dev->read = NULL;
1465                         while (rbi && rbi->bi_iter.bi_sector <
1466                                 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1467                                 rbi2 = r5_next_bio(conf, rbi, dev->sector);
1468                                 bio_endio(rbi);
1469                                 rbi = rbi2;
1470                         }
1471                 }
1472         }
1473         clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1474
1475         set_bit(STRIPE_HANDLE, &sh->state);
1476         raid5_release_stripe(sh);
1477 }
1478
1479 static void ops_run_biofill(struct stripe_head *sh)
1480 {
1481         struct dma_async_tx_descriptor *tx = NULL;
1482         struct async_submit_ctl submit;
1483         int i;
1484         struct r5conf *conf = sh->raid_conf;
1485
1486         BUG_ON(sh->batch_head);
1487         pr_debug("%s: stripe %llu\n", __func__,
1488                 (unsigned long long)sh->sector);
1489
1490         for (i = sh->disks; i--; ) {
1491                 struct r5dev *dev = &sh->dev[i];
1492                 if (test_bit(R5_Wantfill, &dev->flags)) {
1493                         struct bio *rbi;
1494                         spin_lock_irq(&sh->stripe_lock);
1495                         dev->read = rbi = dev->toread;
1496                         dev->toread = NULL;
1497                         spin_unlock_irq(&sh->stripe_lock);
1498                         while (rbi && rbi->bi_iter.bi_sector <
1499                                 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1500                                 tx = async_copy_data(0, rbi, &dev->page,
1501                                                      dev->offset,
1502                                                      dev->sector, tx, sh, 0);
1503                                 rbi = r5_next_bio(conf, rbi, dev->sector);
1504                         }
1505                 }
1506         }
1507
1508         atomic_inc(&sh->count);
1509         init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1510         async_trigger_callback(&submit);
1511 }
1512
1513 static void mark_target_uptodate(struct stripe_head *sh, int target)
1514 {
1515         struct r5dev *tgt;
1516
1517         if (target < 0)
1518                 return;
1519
1520         tgt = &sh->dev[target];
1521         set_bit(R5_UPTODATE, &tgt->flags);
1522         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1523         clear_bit(R5_Wantcompute, &tgt->flags);
1524 }
1525
1526 static void ops_complete_compute(void *stripe_head_ref)
1527 {
1528         struct stripe_head *sh = stripe_head_ref;
1529
1530         pr_debug("%s: stripe %llu\n", __func__,
1531                 (unsigned long long)sh->sector);
1532
1533         /* mark the computed target(s) as uptodate */
1534         mark_target_uptodate(sh, sh->ops.target);
1535         mark_target_uptodate(sh, sh->ops.target2);
1536
1537         clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1538         if (sh->check_state == check_state_compute_run)
1539                 sh->check_state = check_state_compute_result;
1540         set_bit(STRIPE_HANDLE, &sh->state);
1541         raid5_release_stripe(sh);
1542 }
1543
1544 /* return a pointer to the address conversion region of the scribble buffer */
1545 static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1546 {
1547         return percpu->scribble + i * percpu->scribble_obj_size;
1548 }
1549
1550 /* return a pointer to the address conversion region of the scribble buffer */
1551 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1552                                  struct raid5_percpu *percpu, int i)
1553 {
1554         return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
1555 }
1556
1557 /*
1558  * Return a pointer to record offset address.
1559  */
1560 static unsigned int *
1561 to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
1562 {
1563         return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
1564 }
1565
1566 static struct dma_async_tx_descriptor *
1567 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1568 {
1569         int disks = sh->disks;
1570         struct page **xor_srcs = to_addr_page(percpu, 0);
1571         unsigned int *off_srcs = to_addr_offs(sh, percpu);
1572         int target = sh->ops.target;
1573         struct r5dev *tgt = &sh->dev[target];
1574         struct page *xor_dest = tgt->page;
1575         unsigned int off_dest = tgt->offset;
1576         int count = 0;
1577         struct dma_async_tx_descriptor *tx;
1578         struct async_submit_ctl submit;
1579         int i;
1580
1581         BUG_ON(sh->batch_head);
1582
1583         pr_debug("%s: stripe %llu block: %d\n",
1584                 __func__, (unsigned long long)sh->sector, target);
1585         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1586
1587         for (i = disks; i--; ) {
1588                 if (i != target) {
1589                         off_srcs[count] = sh->dev[i].offset;
1590                         xor_srcs[count++] = sh->dev[i].page;
1591                 }
1592         }
1593
1594         atomic_inc(&sh->count);
1595
1596         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1597                           ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1598         if (unlikely(count == 1))
1599                 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
1600                                 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1601         else
1602                 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1603                                 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1604
1605         return tx;
1606 }
1607
1608 /* set_syndrome_sources - populate source buffers for gen_syndrome
1609  * @srcs - (struct page *) array of size sh->disks
1610  * @offs - (unsigned int) array of offset for each page
1611  * @sh - stripe_head to parse
1612  *
1613  * Populates srcs in proper layout order for the stripe and returns the
1614  * 'count' of sources to be used in a call to async_gen_syndrome.  The P
1615  * destination buffer is recorded in srcs[count] and the Q destination
1616  * is recorded in srcs[count+1]].
1617  */
1618 static int set_syndrome_sources(struct page **srcs,
1619                                 unsigned int *offs,
1620                                 struct stripe_head *sh,
1621                                 int srctype)
1622 {
1623         int disks = sh->disks;
1624         int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1625         int d0_idx = raid6_d0(sh);
1626         int count;
1627         int i;
1628
1629         for (i = 0; i < disks; i++)
1630                 srcs[i] = NULL;
1631
1632         count = 0;
1633         i = d0_idx;
1634         do {
1635                 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1636                 struct r5dev *dev = &sh->dev[i];
1637
1638                 if (i == sh->qd_idx || i == sh->pd_idx ||
1639                     (srctype == SYNDROME_SRC_ALL) ||
1640                     (srctype == SYNDROME_SRC_WANT_DRAIN &&
1641                      (test_bit(R5_Wantdrain, &dev->flags) ||
1642                       test_bit(R5_InJournal, &dev->flags))) ||
1643                     (srctype == SYNDROME_SRC_WRITTEN &&
1644                      (dev->written ||
1645                       test_bit(R5_InJournal, &dev->flags)))) {
1646                         if (test_bit(R5_InJournal, &dev->flags))
1647                                 srcs[slot] = sh->dev[i].orig_page;
1648                         else
1649                                 srcs[slot] = sh->dev[i].page;
1650                         /*
1651                          * For R5_InJournal, PAGE_SIZE must be 4KB and will
1652                          * not shared page. In that case, dev[i].offset
1653                          * is 0.
1654                          */
1655                         offs[slot] = sh->dev[i].offset;
1656                 }
1657                 i = raid6_next_disk(i, disks);
1658         } while (i != d0_idx);
1659
1660         return syndrome_disks;
1661 }
1662
1663 static struct dma_async_tx_descriptor *
1664 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1665 {
1666         int disks = sh->disks;
1667         struct page **blocks = to_addr_page(percpu, 0);
1668         unsigned int *offs = to_addr_offs(sh, percpu);
1669         int target;
1670         int qd_idx = sh->qd_idx;
1671         struct dma_async_tx_descriptor *tx;
1672         struct async_submit_ctl submit;
1673         struct r5dev *tgt;
1674         struct page *dest;
1675         unsigned int dest_off;
1676         int i;
1677         int count;
1678
1679         BUG_ON(sh->batch_head);
1680         if (sh->ops.target < 0)
1681                 target = sh->ops.target2;
1682         else if (sh->ops.target2 < 0)
1683                 target = sh->ops.target;
1684         else
1685                 /* we should only have one valid target */
1686                 BUG();
1687         BUG_ON(target < 0);
1688         pr_debug("%s: stripe %llu block: %d\n",
1689                 __func__, (unsigned long long)sh->sector, target);
1690
1691         tgt = &sh->dev[target];
1692         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1693         dest = tgt->page;
1694         dest_off = tgt->offset;
1695
1696         atomic_inc(&sh->count);
1697
1698         if (target == qd_idx) {
1699                 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
1700                 blocks[count] = NULL; /* regenerating p is not necessary */
1701                 BUG_ON(blocks[count+1] != dest); /* q should already be set */
1702                 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1703                                   ops_complete_compute, sh,
1704                                   to_addr_conv(sh, percpu, 0));
1705                 tx = async_gen_syndrome(blocks, offs, count+2,
1706                                 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1707         } else {
1708                 /* Compute any data- or p-drive using XOR */
1709                 count = 0;
1710                 for (i = disks; i-- ; ) {
1711                         if (i == target || i == qd_idx)
1712                                 continue;
1713                         offs[count] = sh->dev[i].offset;
1714                         blocks[count++] = sh->dev[i].page;
1715                 }
1716
1717                 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1718                                   NULL, ops_complete_compute, sh,
1719                                   to_addr_conv(sh, percpu, 0));
1720                 tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1721                                 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1722         }
1723
1724         return tx;
1725 }
1726
1727 static struct dma_async_tx_descriptor *
1728 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1729 {
1730         int i, count, disks = sh->disks;
1731         int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1732         int d0_idx = raid6_d0(sh);
1733         int faila = -1, failb = -1;
1734         int target = sh->ops.target;
1735         int target2 = sh->ops.target2;
1736         struct r5dev *tgt = &sh->dev[target];
1737         struct r5dev *tgt2 = &sh->dev[target2];
1738         struct dma_async_tx_descriptor *tx;
1739         struct page **blocks = to_addr_page(percpu, 0);
1740         unsigned int *offs = to_addr_offs(sh, percpu);
1741         struct async_submit_ctl submit;
1742
1743         BUG_ON(sh->batch_head);
1744         pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1745                  __func__, (unsigned long long)sh->sector, target, target2);
1746         BUG_ON(target < 0 || target2 < 0);
1747         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1748         BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1749
1750         /* we need to open-code set_syndrome_sources to handle the
1751          * slot number conversion for 'faila' and 'failb'
1752          */
1753         for (i = 0; i < disks ; i++) {
1754                 offs[i] = 0;
1755                 blocks[i] = NULL;
1756         }
1757         count = 0;
1758         i = d0_idx;
1759         do {
1760                 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1761
1762                 offs[slot] = sh->dev[i].offset;
1763                 blocks[slot] = sh->dev[i].page;
1764
1765                 if (i == target)
1766                         faila = slot;
1767                 if (i == target2)
1768                         failb = slot;
1769                 i = raid6_next_disk(i, disks);
1770         } while (i != d0_idx);
1771
1772         BUG_ON(faila == failb);
1773         if (failb < faila)
1774                 swap(faila, failb);
1775         pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1776                  __func__, (unsigned long long)sh->sector, faila, failb);
1777
1778         atomic_inc(&sh->count);
1779
1780         if (failb == syndrome_disks+1) {
1781                 /* Q disk is one of the missing disks */
1782                 if (faila == syndrome_disks) {
1783                         /* Missing P+Q, just recompute */
1784                         init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1785                                           ops_complete_compute, sh,
1786                                           to_addr_conv(sh, percpu, 0));
1787                         return async_gen_syndrome(blocks, offs, syndrome_disks+2,
1788                                                   RAID5_STRIPE_SIZE(sh->raid_conf),
1789                                                   &submit);
1790                 } else {
1791                         struct page *dest;
1792                         unsigned int dest_off;
1793                         int data_target;
1794                         int qd_idx = sh->qd_idx;
1795
1796                         /* Missing D+Q: recompute D from P, then recompute Q */
1797                         if (target == qd_idx)
1798                                 data_target = target2;
1799                         else
1800                                 data_target = target;
1801
1802                         count = 0;
1803                         for (i = disks; i-- ; ) {
1804                                 if (i == data_target || i == qd_idx)
1805                                         continue;
1806                                 offs[count] = sh->dev[i].offset;
1807                                 blocks[count++] = sh->dev[i].page;
1808                         }
1809                         dest = sh->dev[data_target].page;
1810                         dest_off = sh->dev[data_target].offset;
1811                         init_async_submit(&submit,
1812                                           ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1813                                           NULL, NULL, NULL,
1814                                           to_addr_conv(sh, percpu, 0));
1815                         tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1816                                        RAID5_STRIPE_SIZE(sh->raid_conf),
1817                                        &submit);
1818
1819                         count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
1820                         init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1821                                           ops_complete_compute, sh,
1822                                           to_addr_conv(sh, percpu, 0));
1823                         return async_gen_syndrome(blocks, offs, count+2,
1824                                                   RAID5_STRIPE_SIZE(sh->raid_conf),
1825                                                   &submit);
1826                 }
1827         } else {
1828                 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1829                                   ops_complete_compute, sh,
1830                                   to_addr_conv(sh, percpu, 0));
1831                 if (failb == syndrome_disks) {
1832                         /* We're missing D+P. */
1833                         return async_raid6_datap_recov(syndrome_disks+2,
1834                                                 RAID5_STRIPE_SIZE(sh->raid_conf),
1835                                                 faila,
1836                                                 blocks, offs, &submit);
1837                 } else {
1838                         /* We're missing D+D. */
1839                         return async_raid6_2data_recov(syndrome_disks+2,
1840                                                 RAID5_STRIPE_SIZE(sh->raid_conf),
1841                                                 faila, failb,
1842                                                 blocks, offs, &submit);
1843                 }
1844         }
1845 }
1846
1847 static void ops_complete_prexor(void *stripe_head_ref)
1848 {
1849         struct stripe_head *sh = stripe_head_ref;
1850
1851         pr_debug("%s: stripe %llu\n", __func__,
1852                 (unsigned long long)sh->sector);
1853
1854         if (r5c_is_writeback(sh->raid_conf->log))
1855                 /*
1856                  * raid5-cache write back uses orig_page during prexor.
1857                  * After prexor, it is time to free orig_page
1858                  */
1859                 r5c_release_extra_page(sh);
1860 }
1861
1862 static struct dma_async_tx_descriptor *
1863 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1864                 struct dma_async_tx_descriptor *tx)
1865 {
1866         int disks = sh->disks;
1867         struct page **xor_srcs = to_addr_page(percpu, 0);
1868         unsigned int *off_srcs = to_addr_offs(sh, percpu);
1869         int count = 0, pd_idx = sh->pd_idx, i;
1870         struct async_submit_ctl submit;
1871
1872         /* existing parity data subtracted */
1873         unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
1874         struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1875
1876         BUG_ON(sh->batch_head);
1877         pr_debug("%s: stripe %llu\n", __func__,
1878                 (unsigned long long)sh->sector);
1879
1880         for (i = disks; i--; ) {
1881                 struct r5dev *dev = &sh->dev[i];
1882                 /* Only process blocks that are known to be uptodate */
1883                 if (test_bit(R5_InJournal, &dev->flags)) {
1884                         /*
1885                          * For this case, PAGE_SIZE must be equal to 4KB and
1886                          * page offset is zero.
1887                          */
1888                         off_srcs[count] = dev->offset;
1889                         xor_srcs[count++] = dev->orig_page;
1890                 } else if (test_bit(R5_Wantdrain, &dev->flags)) {
1891                         off_srcs[count] = dev->offset;
1892                         xor_srcs[count++] = dev->page;
1893                 }
1894         }
1895
1896         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1897                           ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1898         tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1899                         RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1900
1901         return tx;
1902 }
1903
1904 static struct dma_async_tx_descriptor *
1905 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1906                 struct dma_async_tx_descriptor *tx)
1907 {
1908         struct page **blocks = to_addr_page(percpu, 0);
1909         unsigned int *offs = to_addr_offs(sh, percpu);
1910         int count;
1911         struct async_submit_ctl submit;
1912
1913         pr_debug("%s: stripe %llu\n", __func__,
1914                 (unsigned long long)sh->sector);
1915
1916         count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);
1917
1918         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1919                           ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1920         tx = async_gen_syndrome(blocks, offs, count+2,
1921                         RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1922
1923         return tx;
1924 }
1925
1926 static struct dma_async_tx_descriptor *
1927 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1928 {
1929         struct r5conf *conf = sh->raid_conf;
1930         int disks = sh->disks;
1931         int i;
1932         struct stripe_head *head_sh = sh;
1933
1934         pr_debug("%s: stripe %llu\n", __func__,
1935                 (unsigned long long)sh->sector);
1936
1937         for (i = disks; i--; ) {
1938                 struct r5dev *dev;
1939                 struct bio *chosen;
1940
1941                 sh = head_sh;
1942                 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1943                         struct bio *wbi;
1944
1945 again:
1946                         dev = &sh->dev[i];
1947                         /*
1948                          * clear R5_InJournal, so when rewriting a page in
1949                          * journal, it is not skipped by r5l_log_stripe()
1950                          */
1951                         clear_bit(R5_InJournal, &dev->flags);
1952                         spin_lock_irq(&sh->stripe_lock);
1953                         chosen = dev->towrite;
1954                         dev->towrite = NULL;
1955                         sh->overwrite_disks = 0;
1956                         BUG_ON(dev->written);
1957                         wbi = dev->written = chosen;
1958                         spin_unlock_irq(&sh->stripe_lock);
1959                         WARN_ON(dev->page != dev->orig_page);
1960
1961                         while (wbi && wbi->bi_iter.bi_sector <
1962                                 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1963                                 if (wbi->bi_opf & REQ_FUA)
1964                                         set_bit(R5_WantFUA, &dev->flags);
1965                                 if (wbi->bi_opf & REQ_SYNC)
1966                                         set_bit(R5_SyncIO, &dev->flags);
1967                                 if (bio_op(wbi) == REQ_OP_DISCARD)
1968                                         set_bit(R5_Discard, &dev->flags);
1969                                 else {
1970                                         tx = async_copy_data(1, wbi, &dev->page,
1971                                                              dev->offset,
1972                                                              dev->sector, tx, sh,
1973                                                              r5c_is_writeback(conf->log));
1974                                         if (dev->page != dev->orig_page &&
1975                                             !r5c_is_writeback(conf->log)) {
1976                                                 set_bit(R5_SkipCopy, &dev->flags);
1977                                                 clear_bit(R5_UPTODATE, &dev->flags);
1978                                                 clear_bit(R5_OVERWRITE, &dev->flags);
1979                                         }
1980                                 }
1981                                 wbi = r5_next_bio(conf, wbi, dev->sector);
1982                         }
1983
1984                         if (head_sh->batch_head) {
1985                                 sh = list_first_entry(&sh->batch_list,
1986                                                       struct stripe_head,
1987                                                       batch_list);
1988                                 if (sh == head_sh)
1989                                         continue;
1990                                 goto again;
1991                         }
1992                 }
1993         }
1994
1995         return tx;
1996 }
1997
1998 static void ops_complete_reconstruct(void *stripe_head_ref)
1999 {
2000         struct stripe_head *sh = stripe_head_ref;
2001         int disks = sh->disks;
2002         int pd_idx = sh->pd_idx;
2003         int qd_idx = sh->qd_idx;
2004         int i;
2005         bool fua = false, sync = false, discard = false;
2006
2007         pr_debug("%s: stripe %llu\n", __func__,
2008                 (unsigned long long)sh->sector);
2009
2010         for (i = disks; i--; ) {
2011                 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
2012                 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
2013                 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
2014         }
2015
2016         for (i = disks; i--; ) {
2017                 struct r5dev *dev = &sh->dev[i];
2018
2019                 if (dev->written || i == pd_idx || i == qd_idx) {
2020                         if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
2021                                 set_bit(R5_UPTODATE, &dev->flags);
2022                                 if (test_bit(STRIPE_EXPAND_READY, &sh->state))
2023                                         set_bit(R5_Expanded, &dev->flags);
2024                         }
2025                         if (fua)
2026                                 set_bit(R5_WantFUA, &dev->flags);
2027                         if (sync)
2028                                 set_bit(R5_SyncIO, &dev->flags);
2029                 }
2030         }
2031
2032         if (sh->reconstruct_state == reconstruct_state_drain_run)
2033                 sh->reconstruct_state = reconstruct_state_drain_result;
2034         else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
2035                 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
2036         else {
2037                 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
2038                 sh->reconstruct_state = reconstruct_state_result;
2039         }
2040
2041         set_bit(STRIPE_HANDLE, &sh->state);
2042         raid5_release_stripe(sh);
2043 }
2044
2045 static void
2046 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
2047                      struct dma_async_tx_descriptor *tx)
2048 {
2049         int disks = sh->disks;
2050         struct page **xor_srcs;
2051         unsigned int *off_srcs;
2052         struct async_submit_ctl submit;
2053         int count, pd_idx = sh->pd_idx, i;
2054         struct page *xor_dest;
2055         unsigned int off_dest;
2056         int prexor = 0;
2057         unsigned long flags;
2058         int j = 0;
2059         struct stripe_head *head_sh = sh;
2060         int last_stripe;
2061
2062         pr_debug("%s: stripe %llu\n", __func__,
2063                 (unsigned long long)sh->sector);
2064
2065         for (i = 0; i < sh->disks; i++) {
2066                 if (pd_idx == i)
2067                         continue;
2068                 if (!test_bit(R5_Discard, &sh->dev[i].flags))
2069                         break;
2070         }
2071         if (i >= sh->disks) {
2072                 atomic_inc(&sh->count);
2073                 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
2074                 ops_complete_reconstruct(sh);
2075                 return;
2076         }
2077 again:
2078         count = 0;
2079         xor_srcs = to_addr_page(percpu, j);
2080         off_srcs = to_addr_offs(sh, percpu);
2081         /* check if prexor is active which means only process blocks
2082          * that are part of a read-modify-write (written)
2083          */
2084         if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2085                 prexor = 1;
2086                 off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
2087                 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
2088                 for (i = disks; i--; ) {
2089                         struct r5dev *dev = &sh->dev[i];
2090                         if (head_sh->dev[i].written ||
2091                             test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2092                                 off_srcs[count] = dev->offset;
2093                                 xor_srcs[count++] = dev->page;
2094                         }
2095                 }
2096         } else {
2097                 xor_dest = sh->dev[pd_idx].page;
2098                 off_dest = sh->dev[pd_idx].offset;
2099                 for (i = disks; i--; ) {
2100                         struct r5dev *dev = &sh->dev[i];
2101                         if (i != pd_idx) {
2102                                 off_srcs[count] = dev->offset;
2103                                 xor_srcs[count++] = dev->page;
2104                         }
2105                 }
2106         }
2107
2108         /* 1/ if we prexor'd then the dest is reused as a source
2109          * 2/ if we did not prexor then we are redoing the parity
2110          * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
2111          * for the synchronous xor case
2112          */
2113         last_stripe = !head_sh->batch_head ||
2114                 list_first_entry(&sh->batch_list,
2115                                  struct stripe_head, batch_list) == head_sh;
2116         if (last_stripe) {
2117                 flags = ASYNC_TX_ACK |
2118                         (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
2119
2120                 atomic_inc(&head_sh->count);
2121                 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
2122                                   to_addr_conv(sh, percpu, j));
2123         } else {
2124                 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
2125                 init_async_submit(&submit, flags, tx, NULL, NULL,
2126                                   to_addr_conv(sh, percpu, j));
2127         }
2128
2129         if (unlikely(count == 1))
2130                 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
2131                                 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2132         else
2133                 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2134                                 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2135         if (!last_stripe) {
2136                 j++;
2137                 sh = list_first_entry(&sh->batch_list, struct stripe_head,
2138                                       batch_list);
2139                 goto again;
2140         }
2141 }
2142
2143 static void
2144 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
2145                      struct dma_async_tx_descriptor *tx)
2146 {
2147         struct async_submit_ctl submit;
2148         struct page **blocks;
2149         unsigned int *offs;
2150         int count, i, j = 0;
2151         struct stripe_head *head_sh = sh;
2152         int last_stripe;
2153         int synflags;
2154         unsigned long txflags;
2155
2156         pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
2157
2158         for (i = 0; i < sh->disks; i++) {
2159                 if (sh->pd_idx == i || sh->qd_idx == i)
2160                         continue;
2161                 if (!test_bit(R5_Discard, &sh->dev[i].flags))
2162                         break;
2163         }
2164         if (i >= sh->disks) {
2165                 atomic_inc(&sh->count);
2166                 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
2167                 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
2168                 ops_complete_reconstruct(sh);
2169                 return;
2170         }
2171
2172 again:
2173         blocks = to_addr_page(percpu, j);
2174         offs = to_addr_offs(sh, percpu);
2175
2176         if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2177                 synflags = SYNDROME_SRC_WRITTEN;
2178                 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
2179         } else {
2180                 synflags = SYNDROME_SRC_ALL;
2181                 txflags = ASYNC_TX_ACK;
2182         }
2183
2184         count = set_syndrome_sources(blocks, offs, sh, synflags);
2185         last_stripe = !head_sh->batch_head ||
2186                 list_first_entry(&sh->batch_list,
2187                                  struct stripe_head, batch_list) == head_sh;
2188
2189         if (last_stripe) {
2190                 atomic_inc(&head_sh->count);
2191                 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
2192                                   head_sh, to_addr_conv(sh, percpu, j));
2193         } else
2194                 init_async_submit(&submit, 0, tx, NULL, NULL,
2195                                   to_addr_conv(sh, percpu, j));
2196         tx = async_gen_syndrome(blocks, offs, count+2,
2197                         RAID5_STRIPE_SIZE(sh->raid_conf),  &submit);
2198         if (!last_stripe) {
2199                 j++;
2200                 sh = list_first_entry(&sh->batch_list, struct stripe_head,
2201                                       batch_list);
2202                 goto again;
2203         }
2204 }
2205
2206 static void ops_complete_check(void *stripe_head_ref)
2207 {
2208         struct stripe_head *sh = stripe_head_ref;
2209
2210         pr_debug("%s: stripe %llu\n", __func__,
2211                 (unsigned long long)sh->sector);
2212
2213         sh->check_state = check_state_check_result;
2214         set_bit(STRIPE_HANDLE, &sh->state);
2215         raid5_release_stripe(sh);
2216 }
2217
2218 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
2219 {
2220         int disks = sh->disks;
2221         int pd_idx = sh->pd_idx;
2222         int qd_idx = sh->qd_idx;
2223         struct page *xor_dest;
2224         unsigned int off_dest;
2225         struct page **xor_srcs = to_addr_page(percpu, 0);
2226         unsigned int *off_srcs = to_addr_offs(sh, percpu);
2227         struct dma_async_tx_descriptor *tx;
2228         struct async_submit_ctl submit;
2229         int count;
2230         int i;
2231
2232         pr_debug("%s: stripe %llu\n", __func__,
2233                 (unsigned long long)sh->sector);
2234
2235         BUG_ON(sh->batch_head);
2236         count = 0;
2237         xor_dest = sh->dev[pd_idx].page;
2238         off_dest = sh->dev[pd_idx].offset;
2239         off_srcs[count] = off_dest;
2240         xor_srcs[count++] = xor_dest;
2241         for (i = disks; i--; ) {
2242                 if (i == pd_idx || i == qd_idx)
2243                         continue;
2244                 off_srcs[count] = sh->dev[i].offset;
2245                 xor_srcs[count++] = sh->dev[i].page;
2246         }
2247
2248         init_async_submit(&submit, 0, NULL, NULL, NULL,
2249                           to_addr_conv(sh, percpu, 0));
2250         tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2251                            RAID5_STRIPE_SIZE(sh->raid_conf),
2252                            &sh->ops.zero_sum_result, &submit);
2253
2254         atomic_inc(&sh->count);
2255         init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
2256         tx = async_trigger_callback(&submit);
2257 }
2258
2259 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
2260 {
2261         struct page **srcs = to_addr_page(percpu, 0);
2262         unsigned int *offs = to_addr_offs(sh, percpu);
2263         struct async_submit_ctl submit;
2264         int count;
2265
2266         pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2267                 (unsigned long long)sh->sector, checkp);
2268
2269         BUG_ON(sh->batch_head);
2270         count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
2271         if (!checkp)
2272                 srcs[count] = NULL;
2273
2274         atomic_inc(&sh->count);
2275         init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
2276                           sh, to_addr_conv(sh, percpu, 0));
2277         async_syndrome_val(srcs, offs, count+2,
2278                            RAID5_STRIPE_SIZE(sh->raid_conf),
2279                            &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
2280 }
2281
2282 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2283 {
2284         int overlap_clear = 0, i, disks = sh->disks;
2285         struct dma_async_tx_descriptor *tx = NULL;
2286         struct r5conf *conf = sh->raid_conf;
2287         int level = conf->level;
2288         struct raid5_percpu *percpu;
2289
2290         local_lock(&conf->percpu->lock);
2291         percpu = this_cpu_ptr(conf->percpu);
2292         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2293                 ops_run_biofill(sh);
2294                 overlap_clear++;
2295         }
2296
2297         if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2298                 if (level < 6)
2299                         tx = ops_run_compute5(sh, percpu);
2300                 else {
2301                         if (sh->ops.target2 < 0 || sh->ops.target < 0)
2302                                 tx = ops_run_compute6_1(sh, percpu);
2303                         else
2304                                 tx = ops_run_compute6_2(sh, percpu);
2305                 }
2306                 /* terminate the chain if reconstruct is not set to be run */
2307                 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2308                         async_tx_ack(tx);
2309         }
2310
2311         if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2312                 if (level < 6)
2313                         tx = ops_run_prexor5(sh, percpu, tx);
2314                 else
2315                         tx = ops_run_prexor6(sh, percpu, tx);
2316         }
2317
2318         if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2319                 tx = ops_run_partial_parity(sh, percpu, tx);
2320
2321         if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2322                 tx = ops_run_biodrain(sh, tx);
2323                 overlap_clear++;
2324         }
2325
2326         if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2327                 if (level < 6)
2328                         ops_run_reconstruct5(sh, percpu, tx);
2329                 else
2330                         ops_run_reconstruct6(sh, percpu, tx);
2331         }
2332
2333         if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2334                 if (sh->check_state == check_state_run)
2335                         ops_run_check_p(sh, percpu);
2336                 else if (sh->check_state == check_state_run_q)
2337                         ops_run_check_pq(sh, percpu, 0);
2338                 else if (sh->check_state == check_state_run_pq)
2339                         ops_run_check_pq(sh, percpu, 1);
2340                 else
2341                         BUG();
2342         }
2343
2344         if (overlap_clear && !sh->batch_head) {
2345                 for (i = disks; i--; ) {
2346                         struct r5dev *dev = &sh->dev[i];
2347                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
2348                                 wake_up(&sh->raid_conf->wait_for_overlap);
2349                 }
2350         }
2351         local_unlock(&conf->percpu->lock);
2352 }
2353
2354 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2355 {
2356 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2357         kfree(sh->pages);
2358 #endif
2359         if (sh->ppl_page)
2360                 __free_page(sh->ppl_page);
2361         kmem_cache_free(sc, sh);
2362 }
2363
2364 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2365         int disks, struct r5conf *conf)
2366 {
2367         struct stripe_head *sh;
2368
2369         sh = kmem_cache_zalloc(sc, gfp);
2370         if (sh) {
2371                 spin_lock_init(&sh->stripe_lock);
2372                 spin_lock_init(&sh->batch_lock);
2373                 INIT_LIST_HEAD(&sh->batch_list);
2374                 INIT_LIST_HEAD(&sh->lru);
2375                 INIT_LIST_HEAD(&sh->r5c);
2376                 INIT_LIST_HEAD(&sh->log_list);
2377                 atomic_set(&sh->count, 1);
2378                 sh->raid_conf = conf;
2379                 sh->log_start = MaxSector;
2380
2381                 if (raid5_has_ppl(conf)) {
2382                         sh->ppl_page = alloc_page(gfp);
2383                         if (!sh->ppl_page) {
2384                                 free_stripe(sc, sh);
2385                                 return NULL;
2386                         }
2387                 }
2388 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2389                 if (init_stripe_shared_pages(sh, conf, disks)) {
2390                         free_stripe(sc, sh);
2391                         return NULL;
2392                 }
2393 #endif
2394         }
2395         return sh;
2396 }
2397 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2398 {
2399         struct stripe_head *sh;
2400
2401         sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2402         if (!sh)
2403                 return 0;
2404
2405         if (grow_buffers(sh, gfp)) {
2406                 shrink_buffers(sh);
2407                 free_stripe(conf->slab_cache, sh);
2408                 return 0;
2409         }
2410         sh->hash_lock_index =
2411                 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2412         /* we just created an active stripe so... */
2413         atomic_inc(&conf->active_stripes);
2414
2415         raid5_release_stripe(sh);
2416         conf->max_nr_stripes++;
2417         return 1;
2418 }
2419
2420 static int grow_stripes(struct r5conf *conf, int num)
2421 {
2422         struct kmem_cache *sc;
2423         size_t namelen = sizeof(conf->cache_name[0]);
2424         int devs = max(conf->raid_disks, conf->previous_raid_disks);
2425
2426         if (conf->mddev->gendisk)
2427                 snprintf(conf->cache_name[0], namelen,
2428                         "raid%d-%s", conf->level, mdname(conf->mddev));
2429         else
2430                 snprintf(conf->cache_name[0], namelen,
2431                         "raid%d-%p", conf->level, conf->mddev);
2432         snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
2433
2434         conf->active_name = 0;
2435         sc = kmem_cache_create(conf->cache_name[conf->active_name],
2436                                sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2437                                0, 0, NULL);
2438         if (!sc)
2439                 return 1;
2440         conf->slab_cache = sc;
2441         conf->pool_size = devs;
2442         while (num--)
2443                 if (!grow_one_stripe(conf, GFP_KERNEL))
2444                         return 1;
2445
2446         return 0;
2447 }
2448
2449 /**
2450  * scribble_alloc - allocate percpu scribble buffer for required size
2451  *                  of the scribble region
2452  * @percpu: from for_each_present_cpu() of the caller
2453  * @num: total number of disks in the array
2454  * @cnt: scribble objs count for required size of the scribble region
2455  *
2456  * The scribble buffer size must be enough to contain:
2457  * 1/ a struct page pointer for each device in the array +2
2458  * 2/ room to convert each entry in (1) to its corresponding dma
2459  *    (dma_map_page()) or page (page_address()) address.
2460  *
2461  * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
2462  * calculate over all devices (not just the data blocks), using zeros in place
2463  * of the P and Q blocks.
2464  */
2465 static int scribble_alloc(struct raid5_percpu *percpu,
2466                           int num, int cnt)
2467 {
2468         size_t obj_size =
2469                 sizeof(struct page *) * (num + 2) +
2470                 sizeof(addr_conv_t) * (num + 2) +
2471                 sizeof(unsigned int) * (num + 2);
2472         void *scribble;
2473
2474         /*
2475          * If here is in raid array suspend context, it is in memalloc noio
2476          * context as well, there is no potential recursive memory reclaim
2477          * I/Os with the GFP_KERNEL flag.
2478          */
2479         scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
2480         if (!scribble)
2481                 return -ENOMEM;
2482
2483         kvfree(percpu->scribble);
2484
2485         percpu->scribble = scribble;
2486         percpu->scribble_obj_size = obj_size;
2487         return 0;
2488 }
2489
2490 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2491 {
2492         unsigned long cpu;
2493         int err = 0;
2494
2495         /*
2496          * Never shrink. And mddev_suspend() could deadlock if this is called
2497          * from raid5d. In that case, scribble_disks and scribble_sectors
2498          * should equal to new_disks and new_sectors
2499          */
2500         if (conf->scribble_disks >= new_disks &&
2501             conf->scribble_sectors >= new_sectors)
2502                 return 0;
2503         mddev_suspend(conf->mddev);
2504         cpus_read_lock();
2505
2506         for_each_present_cpu(cpu) {
2507                 struct raid5_percpu *percpu;
2508
2509                 percpu = per_cpu_ptr(conf->percpu, cpu);
2510                 err = scribble_alloc(percpu, new_disks,
2511                                      new_sectors / RAID5_STRIPE_SECTORS(conf));
2512                 if (err)
2513                         break;
2514         }
2515
2516         cpus_read_unlock();
2517         mddev_resume(conf->mddev);
2518         if (!err) {
2519                 conf->scribble_disks = new_disks;
2520                 conf->scribble_sectors = new_sectors;
2521         }
2522         return err;
2523 }
2524
2525 static int resize_stripes(struct r5conf *conf, int newsize)
2526 {
2527         /* Make all the stripes able to hold 'newsize' devices.
2528          * New slots in each stripe get 'page' set to a new page.
2529          *
2530          * This happens in stages:
2531          * 1/ create a new kmem_cache and allocate the required number of
2532          *    stripe_heads.
2533          * 2/ gather all the old stripe_heads and transfer the pages across
2534          *    to the new stripe_heads.  This will have the side effect of
2535          *    freezing the array as once all stripe_heads have been collected,
2536          *    no IO will be possible.  Old stripe heads are freed once their
2537          *    pages have been transferred over, and the old kmem_cache is
2538          *    freed when all stripes are done.
2539          * 3/ reallocate conf->disks to be suitable bigger.  If this fails,
2540          *    we simple return a failure status - no need to clean anything up.
2541          * 4/ allocate new pages for the new slots in the new stripe_heads.
2542          *    If this fails, we don't bother trying the shrink the
2543          *    stripe_heads down again, we just leave them as they are.
2544          *    As each stripe_head is processed the new one is released into
2545          *    active service.
2546          *
2547          * Once step2 is started, we cannot afford to wait for a write,
2548          * so we use GFP_NOIO allocations.
2549          */
2550         struct stripe_head *osh, *nsh;
2551         LIST_HEAD(newstripes);
2552         struct disk_info *ndisks;
2553         int err = 0;
2554         struct kmem_cache *sc;
2555         int i;
2556         int hash, cnt;
2557
2558         md_allow_write(conf->mddev);
2559
2560         /* Step 1 */
2561         sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2562                                sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2563                                0, 0, NULL);
2564         if (!sc)
2565                 return -ENOMEM;
2566
2567         /* Need to ensure auto-resizing doesn't interfere */
2568         mutex_lock(&conf->cache_size_mutex);
2569
2570         for (i = conf->max_nr_stripes; i; i--) {
2571                 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2572                 if (!nsh)
2573                         break;
2574
2575                 list_add(&nsh->lru, &newstripes);
2576         }
2577         if (i) {
2578                 /* didn't get enough, give up */
2579                 while (!list_empty(&newstripes)) {
2580                         nsh = list_entry(newstripes.next, struct stripe_head, lru);
2581                         list_del(&nsh->lru);
2582                         free_stripe(sc, nsh);
2583                 }
2584                 kmem_cache_destroy(sc);
2585                 mutex_unlock(&conf->cache_size_mutex);
2586                 return -ENOMEM;
2587         }
2588         /* Step 2 - Must use GFP_NOIO now.
2589          * OK, we have enough stripes, start collecting inactive
2590          * stripes and copying them over
2591          */
2592         hash = 0;
2593         cnt = 0;
2594         list_for_each_entry(nsh, &newstripes, lru) {
2595                 lock_device_hash_lock(conf, hash);
2596                 wait_event_cmd(conf->wait_for_stripe,
2597                                     !list_empty(conf->inactive_list + hash),
2598                                     unlock_device_hash_lock(conf, hash),
2599                                     lock_device_hash_lock(conf, hash));
2600                 osh = get_free_stripe(conf, hash);
2601                 unlock_device_hash_lock(conf, hash);
2602
2603 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2604         for (i = 0; i < osh->nr_pages; i++) {
2605                 nsh->pages[i] = osh->pages[i];
2606                 osh->pages[i] = NULL;
2607         }
2608 #endif
2609                 for(i=0; i<conf->pool_size; i++) {
2610                         nsh->dev[i].page = osh->dev[i].page;
2611                         nsh->dev[i].orig_page = osh->dev[i].page;
2612                         nsh->dev[i].offset = osh->dev[i].offset;
2613                 }
2614                 nsh->hash_lock_index = hash;
2615                 free_stripe(conf->slab_cache, osh);
2616                 cnt++;
2617                 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2618                     !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2619                         hash++;
2620                         cnt = 0;
2621                 }
2622         }
2623         kmem_cache_destroy(conf->slab_cache);
2624
2625         /* Step 3.
2626          * At this point, we are holding all the stripes so the array
2627          * is completely stalled, so now is a good time to resize
2628          * conf->disks and the scribble region
2629          */
2630         ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2631         if (ndisks) {
2632                 for (i = 0; i < conf->pool_size; i++)
2633                         ndisks[i] = conf->disks[i];
2634
2635                 for (i = conf->pool_size; i < newsize; i++) {
2636                         ndisks[i].extra_page = alloc_page(GFP_NOIO);
2637                         if (!ndisks[i].extra_page)
2638                                 err = -ENOMEM;
2639                 }
2640
2641                 if (err) {
2642                         for (i = conf->pool_size; i < newsize; i++)
2643                                 if (ndisks[i].extra_page)
2644                                         put_page(ndisks[i].extra_page);
2645                         kfree(ndisks);
2646                 } else {
2647                         kfree(conf->disks);
2648                         conf->disks = ndisks;
2649                 }
2650         } else
2651                 err = -ENOMEM;
2652
2653         conf->slab_cache = sc;
2654         conf->active_name = 1-conf->active_name;
2655
2656         /* Step 4, return new stripes to service */
2657         while(!list_empty(&newstripes)) {
2658                 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2659                 list_del_init(&nsh->lru);
2660
2661 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2662                 for (i = 0; i < nsh->nr_pages; i++) {
2663                         if (nsh->pages[i])
2664                                 continue;
2665                         nsh->pages[i] = alloc_page(GFP_NOIO);
2666                         if (!nsh->pages[i])
2667                                 err = -ENOMEM;
2668                 }
2669
2670                 for (i = conf->raid_disks; i < newsize; i++) {
2671                         if (nsh->dev[i].page)
2672                                 continue;
2673                         nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2674                         nsh->dev[i].orig_page = nsh->dev[i].page;
2675                         nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2676                 }
2677 #else
2678                 for (i=conf->raid_disks; i < newsize; i++)
2679                         if (nsh->dev[i].page == NULL) {
2680                                 struct page *p = alloc_page(GFP_NOIO);
2681                                 nsh->dev[i].page = p;
2682                                 nsh->dev[i].orig_page = p;
2683                                 nsh->dev[i].offset = 0;
2684                                 if (!p)
2685                                         err = -ENOMEM;
2686                         }
2687 #endif
2688                 raid5_release_stripe(nsh);
2689         }
2690         /* critical section pass, GFP_NOIO no longer needed */
2691
2692         if (!err)
2693                 conf->pool_size = newsize;
2694         mutex_unlock(&conf->cache_size_mutex);
2695
2696         return err;
2697 }
2698
2699 static int drop_one_stripe(struct r5conf *conf)
2700 {
2701         struct stripe_head *sh;
2702         int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2703
2704         spin_lock_irq(conf->hash_locks + hash);
2705         sh = get_free_stripe(conf, hash);
2706         spin_unlock_irq(conf->hash_locks + hash);
2707         if (!sh)
2708                 return 0;
2709         BUG_ON(atomic_read(&sh->count));
2710         shrink_buffers(sh);
2711         free_stripe(conf->slab_cache, sh);
2712         atomic_dec(&conf->active_stripes);
2713         conf->max_nr_stripes--;
2714         return 1;
2715 }
2716
2717 static void shrink_stripes(struct r5conf *conf)
2718 {
2719         while (conf->max_nr_stripes &&
2720                drop_one_stripe(conf))
2721                 ;
2722
2723         kmem_cache_destroy(conf->slab_cache);
2724         conf->slab_cache = NULL;
2725 }
2726
2727 /*
2728  * This helper wraps rcu_dereference_protected() and can be used when
2729  * it is known that the nr_pending of the rdev is elevated.
2730  */
2731 static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev)
2732 {
2733         return rcu_dereference_protected(rdev,
2734                         atomic_read(&rcu_access_pointer(rdev)->nr_pending));
2735 }
2736
2737 /*
2738  * This helper wraps rcu_dereference_protected() and should be used
2739  * when it is known that the mddev_lock() is held. This is safe
2740  * seeing raid5_remove_disk() has the same lock held.
2741  */
2742 static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev,
2743                                          struct md_rdev __rcu *rdev)
2744 {
2745         return rcu_dereference_protected(rdev,
2746                         lockdep_is_held(&mddev->reconfig_mutex));
2747 }
2748
2749 static void raid5_end_read_request(struct bio * bi)
2750 {
2751         struct stripe_head *sh = bi->bi_private;
2752         struct r5conf *conf = sh->raid_conf;
2753         int disks = sh->disks, i;
2754         struct md_rdev *rdev = NULL;
2755         sector_t s;
2756
2757         for (i=0 ; i<disks; i++)
2758                 if (bi == &sh->dev[i].req)
2759                         break;
2760
2761         pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2762                 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2763                 bi->bi_status);
2764         if (i == disks) {
2765                 BUG();
2766                 return;
2767         }
2768         if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2769                 /* If replacement finished while this request was outstanding,
2770                  * 'replacement' might be NULL already.
2771                  * In that case it moved down to 'rdev'.
2772                  * rdev is not removed until all requests are finished.
2773                  */
2774                 rdev = rdev_pend_deref(conf->disks[i].replacement);
2775         if (!rdev)
2776                 rdev = rdev_pend_deref(conf->disks[i].rdev);
2777
2778         if (use_new_offset(conf, sh))
2779                 s = sh->sector + rdev->new_data_offset;
2780         else
2781                 s = sh->sector + rdev->data_offset;
2782         if (!bi->bi_status) {
2783                 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2784                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2785                         /* Note that this cannot happen on a
2786                          * replacement device.  We just fail those on
2787                          * any error
2788                          */
2789                         pr_info_ratelimited(
2790                                 "md/raid:%s: read error corrected (%lu sectors at %llu on %pg)\n",
2791                                 mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
2792                                 (unsigned long long)s,
2793                                 rdev->bdev);
2794                         atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
2795                         clear_bit(R5_ReadError, &sh->dev[i].flags);
2796                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
2797                 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2798                         clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2799
2800                 if (test_bit(R5_InJournal, &sh->dev[i].flags))
2801                         /*
2802                          * end read for a page in journal, this
2803                          * must be preparing for prexor in rmw
2804                          */
2805                         set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2806
2807                 if (atomic_read(&rdev->read_errors))
2808                         atomic_set(&rdev->read_errors, 0);
2809         } else {
2810                 int retry = 0;
2811                 int set_bad = 0;
2812
2813                 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2814                 if (!(bi->bi_status == BLK_STS_PROTECTION))
2815                         atomic_inc(&rdev->read_errors);
2816                 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2817                         pr_warn_ratelimited(
2818                                 "md/raid:%s: read error on replacement device (sector %llu on %pg).\n",
2819                                 mdname(conf->mddev),
2820                                 (unsigned long long)s,
2821                                 rdev->bdev);
2822                 else if (conf->mddev->degraded >= conf->max_degraded) {
2823                         set_bad = 1;
2824                         pr_warn_ratelimited(
2825                                 "md/raid:%s: read error not correctable (sector %llu on %pg).\n",
2826                                 mdname(conf->mddev),
2827                                 (unsigned long long)s,
2828                                 rdev->bdev);
2829                 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2830                         /* Oh, no!!! */
2831                         set_bad = 1;
2832                         pr_warn_ratelimited(
2833                                 "md/raid:%s: read error NOT corrected!! (sector %llu on %pg).\n",
2834                                 mdname(conf->mddev),
2835                                 (unsigned long long)s,
2836                                 rdev->bdev);
2837                 } else if (atomic_read(&rdev->read_errors)
2838                          > conf->max_nr_stripes) {
2839                         if (!test_bit(Faulty, &rdev->flags)) {
2840                                 pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2841                                     mdname(conf->mddev),
2842                                     atomic_read(&rdev->read_errors),
2843                                     conf->max_nr_stripes);
2844                                 pr_warn("md/raid:%s: Too many read errors, failing device %pg.\n",
2845                                     mdname(conf->mddev), rdev->bdev);
2846                         }
2847                 } else
2848                         retry = 1;
2849                 if (set_bad && test_bit(In_sync, &rdev->flags)
2850                     && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2851                         retry = 1;
2852                 if (retry)
2853                         if (sh->qd_idx >= 0 && sh->pd_idx == i)
2854                                 set_bit(R5_ReadError, &sh->dev[i].flags);
2855                         else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2856                                 set_bit(R5_ReadError, &sh->dev[i].flags);
2857                                 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2858                         } else
2859                                 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2860                 else {
2861                         clear_bit(R5_ReadError, &sh->dev[i].flags);
2862                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
2863                         if (!(set_bad
2864                               && test_bit(In_sync, &rdev->flags)
2865                               && rdev_set_badblocks(
2866                                       rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
2867                                 md_error(conf->mddev, rdev);
2868                 }
2869         }
2870         rdev_dec_pending(rdev, conf->mddev);
2871         bio_uninit(bi);
2872         clear_bit(R5_LOCKED, &sh->dev[i].flags);
2873         set_bit(STRIPE_HANDLE, &sh->state);
2874         raid5_release_stripe(sh);
2875 }
2876
2877 static void raid5_end_write_request(struct bio *bi)
2878 {
2879         struct stripe_head *sh = bi->bi_private;
2880         struct r5conf *conf = sh->raid_conf;
2881         int disks = sh->disks, i;
2882         struct md_rdev *rdev;
2883         sector_t first_bad;
2884         int bad_sectors;
2885         int replacement = 0;
2886
2887         for (i = 0 ; i < disks; i++) {
2888                 if (bi == &sh->dev[i].req) {
2889                         rdev = rdev_pend_deref(conf->disks[i].rdev);
2890                         break;
2891                 }
2892                 if (bi == &sh->dev[i].rreq) {
2893                         rdev = rdev_pend_deref(conf->disks[i].replacement);
2894                         if (rdev)
2895                                 replacement = 1;
2896                         else
2897                                 /* rdev was removed and 'replacement'
2898                                  * replaced it.  rdev is not removed
2899                                  * until all requests are finished.
2900                                  */
2901                                 rdev = rdev_pend_deref(conf->disks[i].rdev);
2902                         break;
2903                 }
2904         }
2905         pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2906                 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2907                 bi->bi_status);
2908         if (i == disks) {
2909                 BUG();
2910                 return;
2911         }
2912
2913         if (replacement) {
2914                 if (bi->bi_status)
2915                         md_error(conf->mddev, rdev);
2916                 else if (is_badblock(rdev, sh->sector,
2917                                      RAID5_STRIPE_SECTORS(conf),
2918                                      &first_bad, &bad_sectors))
2919                         set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2920         } else {
2921                 if (bi->bi_status) {
2922                         set_bit(STRIPE_DEGRADED, &sh->state);
2923                         set_bit(WriteErrorSeen, &rdev->flags);
2924                         set_bit(R5_WriteError, &sh->dev[i].flags);
2925                         if (!test_and_set_bit(WantReplacement, &rdev->flags))
2926                                 set_bit(MD_RECOVERY_NEEDED,
2927                                         &rdev->mddev->recovery);
2928                 } else if (is_badblock(rdev, sh->sector,
2929                                        RAID5_STRIPE_SECTORS(conf),
2930                                        &first_bad, &bad_sectors)) {
2931                         set_bit(R5_MadeGood, &sh->dev[i].flags);
2932                         if (test_bit(R5_ReadError, &sh->dev[i].flags))
2933                                 /* That was a successful write so make
2934                                  * sure it looks like we already did
2935                                  * a re-write.
2936                                  */
2937                                 set_bit(R5_ReWrite, &sh->dev[i].flags);
2938                 }
2939         }
2940         rdev_dec_pending(rdev, conf->mddev);
2941
2942         if (sh->batch_head && bi->bi_status && !replacement)
2943                 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2944
2945         bio_uninit(bi);
2946         if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2947                 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2948         set_bit(STRIPE_HANDLE, &sh->state);
2949
2950         if (sh->batch_head && sh != sh->batch_head)
2951                 raid5_release_stripe(sh->batch_head);
2952         raid5_release_stripe(sh);
2953 }
2954
2955 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2956 {
2957         struct r5conf *conf = mddev->private;
2958         unsigned long flags;
2959         pr_debug("raid456: error called\n");
2960
2961         pr_crit("md/raid:%s: Disk failure on %pg, disabling device.\n",
2962                 mdname(mddev), rdev->bdev);
2963
2964         spin_lock_irqsave(&conf->device_lock, flags);
2965         set_bit(Faulty, &rdev->flags);
2966         clear_bit(In_sync, &rdev->flags);
2967         mddev->degraded = raid5_calc_degraded(conf);
2968
2969         if (has_failed(conf)) {
2970                 set_bit(MD_BROKEN, &conf->mddev->flags);
2971                 conf->recovery_disabled = mddev->recovery_disabled;
2972
2973                 pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
2974                         mdname(mddev), mddev->degraded, conf->raid_disks);
2975         } else {
2976                 pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
2977                         mdname(mddev), conf->raid_disks - mddev->degraded);
2978         }
2979
2980         spin_unlock_irqrestore(&conf->device_lock, flags);
2981         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2982
2983         set_bit(Blocked, &rdev->flags);
2984         set_mask_bits(&mddev->sb_flags, 0,
2985                       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2986         r5c_update_on_rdev_error(mddev, rdev);
2987 }
2988
2989 /*
2990  * Input: a 'big' sector number,
2991  * Output: index of the data and parity disk, and the sector # in them.
2992  */
2993 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2994                               int previous, int *dd_idx,
2995                               struct stripe_head *sh)
2996 {
2997         sector_t stripe, stripe2;
2998         sector_t chunk_number;
2999         unsigned int chunk_offset;
3000         int pd_idx, qd_idx;
3001         int ddf_layout = 0;
3002         sector_t new_sector;
3003         int algorithm = previous ? conf->prev_algo
3004                                  : conf->algorithm;
3005         int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3006                                          : conf->chunk_sectors;
3007         int raid_disks = previous ? conf->previous_raid_disks
3008                                   : conf->raid_disks;
3009         int data_disks = raid_disks - conf->max_degraded;
3010
3011         /* First compute the information on this sector */
3012
3013         /*
3014          * Compute the chunk number and the sector offset inside the chunk
3015          */
3016         chunk_offset = sector_div(r_sector, sectors_per_chunk);
3017         chunk_number = r_sector;
3018
3019         /*
3020          * Compute the stripe number
3021          */
3022         stripe = chunk_number;
3023         *dd_idx = sector_div(stripe, data_disks);
3024         stripe2 = stripe;
3025         /*
3026          * Select the parity disk based on the user selected algorithm.
3027          */
3028         pd_idx = qd_idx = -1;
3029         switch(conf->level) {
3030         case 4:
3031                 pd_idx = data_disks;
3032                 break;
3033         case 5:
3034                 switch (algorithm) {
3035                 case ALGORITHM_LEFT_ASYMMETRIC:
3036                         pd_idx = data_disks - sector_div(stripe2, raid_disks);
3037                         if (*dd_idx >= pd_idx)
3038                                 (*dd_idx)++;
3039                         break;
3040                 case ALGORITHM_RIGHT_ASYMMETRIC:
3041                         pd_idx = sector_div(stripe2, raid_disks);
3042                         if (*dd_idx >= pd_idx)
3043                                 (*dd_idx)++;
3044                         break;
3045                 case ALGORITHM_LEFT_SYMMETRIC:
3046                         pd_idx = data_disks - sector_div(stripe2, raid_disks);
3047                         *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3048                         break;
3049                 case ALGORITHM_RIGHT_SYMMETRIC:
3050                         pd_idx = sector_div(stripe2, raid_disks);
3051                         *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3052                         break;
3053                 case ALGORITHM_PARITY_0:
3054                         pd_idx = 0;
3055                         (*dd_idx)++;
3056                         break;
3057                 case ALGORITHM_PARITY_N:
3058                         pd_idx = data_disks;
3059                         break;
3060                 default:
3061                         BUG();
3062                 }
3063                 break;
3064         case 6:
3065
3066                 switch (algorithm) {
3067                 case ALGORITHM_LEFT_ASYMMETRIC:
3068                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3069                         qd_idx = pd_idx + 1;
3070                         if (pd_idx == raid_disks-1) {
3071                                 (*dd_idx)++;    /* Q D D D P */
3072                                 qd_idx = 0;
3073                         } else if (*dd_idx >= pd_idx)
3074                                 (*dd_idx) += 2; /* D D P Q D */
3075                         break;
3076                 case ALGORITHM_RIGHT_ASYMMETRIC:
3077                         pd_idx = sector_div(stripe2, raid_disks);
3078                         qd_idx = pd_idx + 1;
3079                         if (pd_idx == raid_disks-1) {
3080                                 (*dd_idx)++;    /* Q D D D P */
3081                                 qd_idx = 0;
3082                         } else if (*dd_idx >= pd_idx)
3083                                 (*dd_idx) += 2; /* D D P Q D */
3084                         break;
3085                 case ALGORITHM_LEFT_SYMMETRIC:
3086                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3087                         qd_idx = (pd_idx + 1) % raid_disks;
3088                         *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3089                         break;
3090                 case ALGORITHM_RIGHT_SYMMETRIC:
3091                         pd_idx = sector_div(stripe2, raid_disks);
3092                         qd_idx = (pd_idx + 1) % raid_disks;
3093                         *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3094                         break;
3095
3096                 case ALGORITHM_PARITY_0:
3097                         pd_idx = 0;
3098                         qd_idx = 1;
3099                         (*dd_idx) += 2;
3100                         break;
3101                 case ALGORITHM_PARITY_N:
3102                         pd_idx = data_disks;
3103                         qd_idx = data_disks + 1;
3104                         break;
3105
3106                 case ALGORITHM_ROTATING_ZERO_RESTART:
3107                         /* Exactly the same as RIGHT_ASYMMETRIC, but or
3108                          * of blocks for computing Q is different.
3109                          */
3110                         pd_idx = sector_div(stripe2, raid_disks);
3111                         qd_idx = pd_idx + 1;
3112                         if (pd_idx == raid_disks-1) {
3113                                 (*dd_idx)++;    /* Q D D D P */
3114                                 qd_idx = 0;
3115                         } else if (*dd_idx >= pd_idx)
3116                                 (*dd_idx) += 2; /* D D P Q D */
3117                         ddf_layout = 1;
3118                         break;
3119
3120                 case ALGORITHM_ROTATING_N_RESTART:
3121                         /* Same a left_asymmetric, by first stripe is
3122                          * D D D P Q  rather than
3123                          * Q D D D P
3124                          */
3125                         stripe2 += 1;
3126                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3127                         qd_idx = pd_idx + 1;
3128                         if (pd_idx == raid_disks-1) {
3129                                 (*dd_idx)++;    /* Q D D D P */
3130                                 qd_idx = 0;
3131                         } else if (*dd_idx >= pd_idx)
3132                                 (*dd_idx) += 2; /* D D P Q D */
3133                         ddf_layout = 1;
3134                         break;
3135
3136                 case ALGORITHM_ROTATING_N_CONTINUE:
3137                         /* Same as left_symmetric but Q is before P */
3138                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3139                         qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
3140                         *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3141                         ddf_layout = 1;
3142                         break;
3143
3144                 case ALGORITHM_LEFT_ASYMMETRIC_6:
3145                         /* RAID5 left_asymmetric, with Q on last device */
3146                         pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3147                         if (*dd_idx >= pd_idx)
3148                                 (*dd_idx)++;
3149                         qd_idx = raid_disks - 1;
3150                         break;
3151
3152                 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3153                         pd_idx = sector_div(stripe2, raid_disks-1);
3154                         if (*dd_idx >= pd_idx)
3155                                 (*dd_idx)++;
3156                         qd_idx = raid_disks - 1;
3157                         break;
3158
3159                 case ALGORITHM_LEFT_SYMMETRIC_6:
3160                         pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3161                         *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3162                         qd_idx = raid_disks - 1;
3163                         break;
3164
3165                 case ALGORITHM_RIGHT_SYMMETRIC_6:
3166                         pd_idx = sector_div(stripe2, raid_disks-1);
3167                         *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3168                         qd_idx = raid_disks - 1;
3169                         break;
3170
3171                 case ALGORITHM_PARITY_0_6:
3172                         pd_idx = 0;
3173                         (*dd_idx)++;
3174                         qd_idx = raid_disks - 1;
3175                         break;
3176
3177                 default:
3178                         BUG();
3179                 }
3180                 break;
3181         }
3182
3183         if (sh) {
3184                 sh->pd_idx = pd_idx;
3185                 sh->qd_idx = qd_idx;
3186                 sh->ddf_layout = ddf_layout;
3187         }
3188         /*
3189          * Finally, compute the new sector number
3190          */
3191         new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
3192         return new_sector;
3193 }
3194
3195 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
3196 {
3197         struct r5conf *conf = sh->raid_conf;
3198         int raid_disks = sh->disks;
3199         int data_disks = raid_disks - conf->max_degraded;
3200         sector_t new_sector = sh->sector, check;
3201         int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3202                                          : conf->chunk_sectors;
3203         int algorithm = previous ? conf->prev_algo
3204                                  : conf->algorithm;
3205         sector_t stripe;
3206         int chunk_offset;
3207         sector_t chunk_number;
3208         int dummy1, dd_idx = i;
3209         sector_t r_sector;
3210         struct stripe_head sh2;
3211
3212         chunk_offset = sector_div(new_sector, sectors_per_chunk);
3213         stripe = new_sector;
3214
3215         if (i == sh->pd_idx)
3216                 return 0;
3217         switch(conf->level) {
3218         case 4: break;
3219         case 5:
3220                 switch (algorithm) {
3221                 case ALGORITHM_LEFT_ASYMMETRIC:
3222                 case ALGORITHM_RIGHT_ASYMMETRIC:
3223                         if (i > sh->pd_idx)
3224                                 i--;
3225                         break;
3226                 case ALGORITHM_LEFT_SYMMETRIC:
3227                 case ALGORITHM_RIGHT_SYMMETRIC:
3228                         if (i < sh->pd_idx)
3229                                 i += raid_disks;
3230                         i -= (sh->pd_idx + 1);
3231                         break;
3232                 case ALGORITHM_PARITY_0:
3233                         i -= 1;
3234                         break;
3235                 case ALGORITHM_PARITY_N:
3236                         break;
3237                 default:
3238                         BUG();
3239                 }
3240                 break;
3241         case 6:
3242                 if (i == sh->qd_idx)
3243                         return 0; /* It is the Q disk */
3244                 switch (algorithm) {
3245                 case ALGORITHM_LEFT_ASYMMETRIC:
3246                 case ALGORITHM_RIGHT_ASYMMETRIC:
3247                 case ALGORITHM_ROTATING_ZERO_RESTART:
3248                 case ALGORITHM_ROTATING_N_RESTART:
3249                         if (sh->pd_idx == raid_disks-1)
3250                                 i--;    /* Q D D D P */
3251                         else if (i > sh->pd_idx)
3252                                 i -= 2; /* D D P Q D */
3253                         break;
3254                 case ALGORITHM_LEFT_SYMMETRIC:
3255                 case ALGORITHM_RIGHT_SYMMETRIC:
3256                         if (sh->pd_idx == raid_disks-1)
3257                                 i--; /* Q D D D P */
3258                         else {
3259                                 /* D D P Q D */
3260                                 if (i < sh->pd_idx)
3261                                         i += raid_disks;
3262                                 i -= (sh->pd_idx + 2);
3263                         }
3264                         break;
3265                 case ALGORITHM_PARITY_0:
3266                         i -= 2;
3267                         break;
3268                 case ALGORITHM_PARITY_N:
3269                         break;
3270                 case ALGORITHM_ROTATING_N_CONTINUE:
3271                         /* Like left_symmetric, but P is before Q */
3272                         if (sh->pd_idx == 0)
3273                                 i--;    /* P D D D Q */
3274                         else {
3275                                 /* D D Q P D */
3276                                 if (i < sh->pd_idx)
3277                                         i += raid_disks;
3278                                 i -= (sh->pd_idx + 1);
3279                         }
3280                         break;
3281                 case ALGORITHM_LEFT_ASYMMETRIC_6:
3282                 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3283                         if (i > sh->pd_idx)
3284                                 i--;
3285                         break;
3286                 case ALGORITHM_LEFT_SYMMETRIC_6:
3287                 case ALGORITHM_RIGHT_SYMMETRIC_6:
3288                         if (i < sh->pd_idx)
3289                                 i += data_disks + 1;
3290                         i -= (sh->pd_idx + 1);
3291                         break;
3292                 case ALGORITHM_PARITY_0_6:
3293                         i -= 1;
3294                         break;
3295                 default:
3296                         BUG();
3297                 }
3298                 break;
3299         }
3300
3301         chunk_number = stripe * data_disks + i;
3302         r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3303
3304         check = raid5_compute_sector(conf, r_sector,
3305                                      previous, &dummy1, &sh2);
3306         if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3307                 || sh2.qd_idx != sh->qd_idx) {
3308                 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3309                         mdname(conf->mddev));
3310                 return 0;
3311         }
3312         return r_sector;
3313 }
3314
3315 /*
3316  * There are cases where we want handle_stripe_dirtying() and
3317  * schedule_reconstruction() to delay towrite to some dev of a stripe.
3318  *
3319  * This function checks whether we want to delay the towrite. Specifically,
3320  * we delay the towrite when:
3321  *
3322  *   1. degraded stripe has a non-overwrite to the missing dev, AND this
3323  *      stripe has data in journal (for other devices).
3324  *
3325  *      In this case, when reading data for the non-overwrite dev, it is
3326  *      necessary to handle complex rmw of write back cache (prexor with
3327  *      orig_page, and xor with page). To keep read path simple, we would
3328  *      like to flush data in journal to RAID disks first, so complex rmw
3329  *      is handled in the write patch (handle_stripe_dirtying).
3330  *
3331  *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
3332  *
3333  *      It is important to be able to flush all stripes in raid5-cache.
3334  *      Therefore, we need reserve some space on the journal device for
3335  *      these flushes. If flush operation includes pending writes to the
3336  *      stripe, we need to reserve (conf->raid_disk + 1) pages per stripe
3337  *      for the flush out. If we exclude these pending writes from flush
3338  *      operation, we only need (conf->max_degraded + 1) pages per stripe.
3339  *      Therefore, excluding pending writes in these cases enables more
3340  *      efficient use of the journal device.
3341  *
3342  *      Note: To make sure the stripe makes progress, we only delay
3343  *      towrite for stripes with data already in journal (injournal > 0).
3344  *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
3345  *      no_space_stripes list.
3346  *
3347  *   3. during journal failure
3348  *      In journal failure, we try to flush all cached data to raid disks
3349  *      based on data in stripe cache. The array is read-only to upper
3350  *      layers, so we would skip all pending writes.
3351  *
3352  */
3353 static inline bool delay_towrite(struct r5conf *conf,
3354                                  struct r5dev *dev,
3355                                  struct stripe_head_state *s)
3356 {
3357         /* case 1 above */
3358         if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3359             !test_bit(R5_Insync, &dev->flags) && s->injournal)
3360                 return true;
3361         /* case 2 above */
3362         if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3363             s->injournal > 0)
3364                 return true;
3365         /* case 3 above */
3366         if (s->log_failed && s->injournal)
3367                 return true;
3368         return false;
3369 }
3370
3371 static void
3372 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3373                          int rcw, int expand)
3374 {
3375         int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3376         struct r5conf *conf = sh->raid_conf;
3377         int level = conf->level;
3378
3379         if (rcw) {
3380                 /*
3381                  * In some cases, handle_stripe_dirtying initially decided to
3382                  * run rmw and allocates extra page for prexor. However, rcw is
3383                  * cheaper later on. We need to free the extra page now,
3384                  * because we won't be able to do that in ops_complete_prexor().
3385                  */
3386                 r5c_release_extra_page(sh);
3387
3388                 for (i = disks; i--; ) {
3389                         struct r5dev *dev = &sh->dev[i];
3390
3391                         if (dev->towrite && !delay_towrite(conf, dev, s)) {
3392                                 set_bit(R5_LOCKED, &dev->flags);
3393                                 set_bit(R5_Wantdrain, &dev->flags);
3394                                 if (!expand)
3395                                         clear_bit(R5_UPTODATE, &dev->flags);
3396                                 s->locked++;
3397                         } else if (test_bit(R5_InJournal, &dev->flags)) {
3398                                 set_bit(R5_LOCKED, &dev->flags);
3399                                 s->locked++;
3400                         }
3401                 }
3402                 /* if we are not expanding this is a proper write request, and
3403                  * there will be bios with new data to be drained into the
3404                  * stripe cache
3405                  */
3406                 if (!expand) {
3407                         if (!s->locked)
3408                                 /* False alarm, nothing to do */
3409                                 return;
3410                         sh->reconstruct_state = reconstruct_state_drain_run;
3411                         set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3412                 } else
3413                         sh->reconstruct_state = reconstruct_state_run;
3414
3415                 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3416
3417                 if (s->locked + conf->max_degraded == disks)
3418                         if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3419                                 atomic_inc(&conf->pending_full_writes);
3420         } else {
3421                 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3422                         test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3423                 BUG_ON(level == 6 &&
3424                         (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3425                            test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3426
3427                 for (i = disks; i--; ) {
3428                         struct r5dev *dev = &sh->dev[i];
3429                         if (i == pd_idx || i == qd_idx)
3430                                 continue;
3431
3432                         if (dev->towrite &&
3433                             (test_bit(R5_UPTODATE, &dev->flags) ||
3434                              test_bit(R5_Wantcompute, &dev->flags))) {
3435                                 set_bit(R5_Wantdrain, &dev->flags);
3436                                 set_bit(R5_LOCKED, &dev->flags);
3437                                 clear_bit(R5_UPTODATE, &dev->flags);
3438                                 s->locked++;
3439                         } else if (test_bit(R5_InJournal, &dev->flags)) {
3440                                 set_bit(R5_LOCKED, &dev->flags);
3441                                 s->locked++;
3442                         }
3443                 }
3444                 if (!s->locked)
3445                         /* False alarm - nothing to do */
3446                         return;
3447                 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3448                 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3449                 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3450                 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3451         }
3452
3453         /* keep the parity disk(s) locked while asynchronous operations
3454          * are in flight
3455          */
3456         set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3457         clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3458         s->locked++;
3459
3460         if (level == 6) {
3461                 int qd_idx = sh->qd_idx;
3462                 struct r5dev *dev = &sh->dev[qd_idx];
3463
3464                 set_bit(R5_LOCKED, &dev->flags);
3465                 clear_bit(R5_UPTODATE, &dev->flags);
3466                 s->locked++;
3467         }
3468
3469         if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3470             test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3471             !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3472             test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3473                 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3474
3475         pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3476                 __func__, (unsigned long long)sh->sector,
3477                 s->locked, s->ops_request);
3478 }
3479
3480 static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
3481                                 int dd_idx, int forwrite)
3482 {
3483         struct r5conf *conf = sh->raid_conf;
3484         struct bio **bip;
3485
3486         pr_debug("checking bi b#%llu to stripe s#%llu\n",
3487                  bi->bi_iter.bi_sector, sh->sector);
3488
3489         /* Don't allow new IO added to stripes in batch list */
3490         if (sh->batch_head)
3491                 return true;
3492
3493         if (forwrite)
3494                 bip = &sh->dev[dd_idx].towrite;
3495         else
3496                 bip = &sh->dev[dd_idx].toread;
3497
3498         while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3499                 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3500                         return true;
3501                 bip = &(*bip)->bi_next;
3502         }
3503
3504         if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3505                 return true;
3506
3507         if (forwrite && raid5_has_ppl(conf)) {
3508                 /*
3509                  * With PPL only writes to consecutive data chunks within a
3510                  * stripe are allowed because for a single stripe_head we can
3511                  * only have one PPL entry at a time, which describes one data
3512                  * range. Not really an overlap, but wait_for_overlap can be
3513                  * used to handle this.
3514                  */
3515                 sector_t sector;
3516                 sector_t first = 0;
3517                 sector_t last = 0;
3518                 int count = 0;
3519                 int i;
3520
3521                 for (i = 0; i < sh->disks; i++) {
3522                         if (i != sh->pd_idx &&
3523                             (i == dd_idx || sh->dev[i].towrite)) {
3524                                 sector = sh->dev[i].sector;
3525                                 if (count == 0 || sector < first)
3526                                         first = sector;
3527                                 if (sector > last)
3528                                         last = sector;
3529                                 count++;
3530                         }
3531                 }
3532
3533                 if (first + conf->chunk_sectors * (count - 1) != last)
3534                         return true;
3535         }
3536
3537         return false;
3538 }
3539
3540 static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
3541                              int dd_idx, int forwrite, int previous)
3542 {
3543         struct r5conf *conf = sh->raid_conf;
3544         struct bio **bip;
3545         int firstwrite = 0;
3546
3547         if (forwrite) {
3548                 bip = &sh->dev[dd_idx].towrite;
3549                 if (!*bip)
3550                         firstwrite = 1;
3551         } else {
3552                 bip = &sh->dev[dd_idx].toread;
3553         }
3554
3555         while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector)
3556                 bip = &(*bip)->bi_next;
3557
3558         if (!forwrite || previous)
3559                 clear_bit(STRIPE_BATCH_READY, &sh->state);
3560
3561         BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3562         if (*bip)
3563                 bi->bi_next = *bip;
3564         *bip = bi;
3565         bio_inc_remaining(bi);
3566         md_write_inc(conf->mddev, bi);
3567
3568         if (forwrite) {
3569                 /* check if page is covered */
3570                 sector_t sector = sh->dev[dd_idx].sector;
3571                 for (bi=sh->dev[dd_idx].towrite;
3572                      sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
3573                              bi && bi->bi_iter.bi_sector <= sector;
3574                      bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
3575                         if (bio_end_sector(bi) >= sector)
3576                                 sector = bio_end_sector(bi);
3577                 }
3578                 if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
3579                         if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3580                                 sh->overwrite_disks++;
3581         }
3582
3583         pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n",
3584                  (*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
3585                  sh->dev[dd_idx].sector);
3586
3587         if (conf->mddev->bitmap && firstwrite) {
3588                 /* Cannot hold spinlock over bitmap_startwrite,
3589                  * but must ensure this isn't added to a batch until
3590                  * we have added to the bitmap and set bm_seq.
3591                  * So set STRIPE_BITMAP_PENDING to prevent
3592                  * batching.
3593                  * If multiple __add_stripe_bio() calls race here they
3594                  * much all set STRIPE_BITMAP_PENDING.  So only the first one
3595                  * to complete "bitmap_startwrite" gets to set
3596                  * STRIPE_BIT_DELAY.  This is important as once a stripe
3597                  * is added to a batch, STRIPE_BIT_DELAY cannot be changed
3598                  * any more.
3599                  */
3600                 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3601                 spin_unlock_irq(&sh->stripe_lock);
3602                 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3603                                      RAID5_STRIPE_SECTORS(conf), 0);
3604                 spin_lock_irq(&sh->stripe_lock);
3605                 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3606                 if (!sh->batch_head) {
3607                         sh->bm_seq = conf->seq_flush+1;
3608                         set_bit(STRIPE_BIT_DELAY, &sh->state);
3609                 }
3610         }
3611 }
3612
3613 /*
3614  * Each stripe/dev can have one or more bios attached.
3615  * toread/towrite point to the first in a chain.
3616  * The bi_next chain must be in order.
3617  */
3618 static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi,
3619                            int dd_idx, int forwrite, int previous)
3620 {
3621         spin_lock_irq(&sh->stripe_lock);
3622
3623         if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
3624                 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3625                 spin_unlock_irq(&sh->stripe_lock);
3626                 return false;
3627         }
3628
3629         __add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
3630         spin_unlock_irq(&sh->stripe_lock);
3631         return true;
3632 }
3633
3634 static void end_reshape(struct r5conf *conf);
3635
3636 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3637                             struct stripe_head *sh)
3638 {
3639         int sectors_per_chunk =
3640                 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3641         int dd_idx;
3642         int chunk_offset = sector_div(stripe, sectors_per_chunk);
3643         int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3644
3645         raid5_compute_sector(conf,
3646                              stripe * (disks - conf->max_degraded)
3647                              *sectors_per_chunk + chunk_offset,
3648                              previous,
3649                              &dd_idx, sh);
3650 }
3651
3652 static void
3653 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3654                      struct stripe_head_state *s, int disks)
3655 {
3656         int i;
3657         BUG_ON(sh->batch_head);
3658         for (i = disks; i--; ) {
3659                 struct bio *bi;
3660                 int bitmap_end = 0;
3661
3662                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3663                         struct md_rdev *rdev;
3664                         rcu_read_lock();
3665                         rdev = rcu_dereference(conf->disks[i].rdev);
3666                         if (rdev && test_bit(In_sync, &rdev->flags) &&
3667                             !test_bit(Faulty, &rdev->flags))
3668                                 atomic_inc(&rdev->nr_pending);
3669                         else
3670                                 rdev = NULL;
3671                         rcu_read_unlock();
3672                         if (rdev) {
3673                                 if (!rdev_set_badblocks(
3674                                             rdev,
3675                                             sh->sector,
3676                                             RAID5_STRIPE_SECTORS(conf), 0))
3677                                         md_error(conf->mddev, rdev);
3678                                 rdev_dec_pending(rdev, conf->mddev);
3679                         }
3680                 }
3681                 spin_lock_irq(&sh->stripe_lock);
3682                 /* fail all writes first */
3683                 bi = sh->dev[i].towrite;
3684                 sh->dev[i].towrite = NULL;
3685                 sh->overwrite_disks = 0;
3686                 spin_unlock_irq(&sh->stripe_lock);
3687                 if (bi)
3688                         bitmap_end = 1;
3689
3690                 log_stripe_write_finished(sh);
3691
3692                 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3693                         wake_up(&conf->wait_for_overlap);
3694
3695                 while (bi && bi->bi_iter.bi_sector <
3696                         sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3697                         struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
3698
3699                         md_write_end(conf->mddev);
3700                         bio_io_error(bi);
3701                         bi = nextbi;
3702                 }
3703                 if (bitmap_end)
3704                         md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3705                                            RAID5_STRIPE_SECTORS(conf), 0, 0);
3706                 bitmap_end = 0;
3707                 /* and fail all 'written' */
3708                 bi = sh->dev[i].written;
3709                 sh->dev[i].written = NULL;
3710                 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3711                         WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3712                         sh->dev[i].page = sh->dev[i].orig_page;
3713                 }
3714
3715                 if (bi) bitmap_end = 1;
3716                 while (bi && bi->bi_iter.bi_sector <
3717                        sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3718                         struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
3719
3720                         md_write_end(conf->mddev);
3721                         bio_io_error(bi);
3722                         bi = bi2;
3723                 }
3724
3725                 /* fail any reads if this device is non-operational and
3726                  * the data has not reached the cache yet.
3727                  */
3728                 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3729                     s->failed > conf->max_degraded &&
3730                     (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3731                       test_bit(R5_ReadError, &sh->dev[i].flags))) {
3732                         spin_lock_irq(&sh->stripe_lock);
3733                         bi = sh->dev[i].toread;
3734                         sh->dev[i].toread = NULL;
3735                         spin_unlock_irq(&sh->stripe_lock);
3736                         if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3737                                 wake_up(&conf->wait_for_overlap);
3738                         if (bi)
3739                                 s->to_read--;
3740                         while (bi && bi->bi_iter.bi_sector <
3741                                sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3742                                 struct bio *nextbi =
3743                                         r5_next_bio(conf, bi, sh->dev[i].sector);
3744
3745                                 bio_io_error(bi);
3746                                 bi = nextbi;
3747                         }
3748                 }
3749                 if (bitmap_end)
3750                         md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3751                                            RAID5_STRIPE_SECTORS(conf), 0, 0);
3752                 /* If we were in the middle of a write the parity block might
3753                  * still be locked - so just clear all R5_LOCKED flags
3754                  */
3755                 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3756         }
3757         s->to_write = 0;
3758         s->written = 0;
3759
3760         if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3761                 if (atomic_dec_and_test(&conf->pending_full_writes))
3762                         md_wakeup_thread(conf->mddev->thread);
3763 }
3764
3765 static void
3766 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3767                    struct stripe_head_state *s)
3768 {
3769         int abort = 0;
3770         int i;
3771
3772         BUG_ON(sh->batch_head);
3773         clear_bit(STRIPE_SYNCING, &sh->state);
3774         if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3775                 wake_up(&conf->wait_for_overlap);
3776         s->syncing = 0;
3777         s->replacing = 0;
3778         /* There is nothing more to do for sync/check/repair.
3779          * Don't even need to abort as that is handled elsewhere
3780          * if needed, and not always wanted e.g. if there is a known
3781          * bad block here.
3782          * For recover/replace we need to record a bad block on all
3783          * non-sync devices, or abort the recovery
3784          */
3785         if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3786                 /* During recovery devices cannot be removed, so
3787                  * locking and refcounting of rdevs is not needed
3788                  */
3789                 rcu_read_lock();
3790                 for (i = 0; i < conf->raid_disks; i++) {
3791                         struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3792                         if (rdev
3793                             && !test_bit(Faulty, &rdev->flags)
3794                             && !test_bit(In_sync, &rdev->flags)
3795                             && !rdev_set_badblocks(rdev, sh->sector,
3796                                                    RAID5_STRIPE_SECTORS(conf), 0))
3797                                 abort = 1;
3798                         rdev = rcu_dereference(conf->disks[i].replacement);
3799                         if (rdev
3800                             && !test_bit(Faulty, &rdev->flags)
3801                             && !test_bit(In_sync, &rdev->flags)
3802                             && !rdev_set_badblocks(rdev, sh->sector,
3803                                                    RAID5_STRIPE_SECTORS(conf), 0))
3804                                 abort = 1;
3805                 }
3806                 rcu_read_unlock();
3807                 if (abort)
3808                         conf->recovery_disabled =
3809                                 conf->mddev->recovery_disabled;
3810         }
3811         md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
3812 }
3813
3814 static int want_replace(struct stripe_head *sh, int disk_idx)
3815 {
3816         struct md_rdev *rdev;
3817         int rv = 0;
3818
3819         rcu_read_lock();
3820         rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3821         if (rdev
3822             && !test_bit(Faulty, &rdev->flags)
3823             && !test_bit(In_sync, &rdev->flags)
3824             && (rdev->recovery_offset <= sh->sector
3825                 || rdev->mddev->recovery_cp <= sh->sector))
3826                 rv = 1;
3827         rcu_read_unlock();
3828         return rv;
3829 }
3830
3831 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3832                            int disk_idx, int disks)
3833 {
3834         struct r5dev *dev = &sh->dev[disk_idx];
3835         struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3836                                   &sh->dev[s->failed_num[1]] };
3837         int i;
3838         bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
3839
3840
3841         if (test_bit(R5_LOCKED, &dev->flags) ||
3842             test_bit(R5_UPTODATE, &dev->flags))
3843                 /* No point reading this as we already have it or have
3844                  * decided to get it.
3845                  */
3846                 return 0;
3847
3848         if (dev->toread ||
3849             (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3850                 /* We need this block to directly satisfy a request */
3851                 return 1;
3852
3853         if (s->syncing || s->expanding ||
3854             (s->replacing && want_replace(sh, disk_idx)))
3855                 /* When syncing, or expanding we read everything.
3856                  * When replacing, we need the replaced block.
3857                  */
3858                 return 1;
3859
3860         if ((s->failed >= 1 && fdev[0]->toread) ||
3861             (s->failed >= 2 && fdev[1]->toread))
3862                 /* If we want to read from a failed device, then
3863                  * we need to actually read every other device.
3864                  */
3865                 return 1;
3866
3867         /* Sometimes neither read-modify-write nor reconstruct-write
3868          * cycles can work.  In those cases we read every block we
3869          * can.  Then the parity-update is certain to have enough to
3870          * work with.
3871          * This can only be a problem when we need to write something,
3872          * and some device has failed.  If either of those tests
3873          * fail we need look no further.
3874          */
3875         if (!s->failed || !s->to_write)
3876                 return 0;
3877
3878         if (test_bit(R5_Insync, &dev->flags) &&
3879             !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3880                 /* Pre-reads at not permitted until after short delay
3881                  * to gather multiple requests.  However if this
3882                  * device is no Insync, the block could only be computed
3883                  * and there is no need to delay that.
3884                  */
3885                 return 0;
3886
3887         for (i = 0; i < s->failed && i < 2; i++) {
3888                 if (fdev[i]->towrite &&
3889                     !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3890                     !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3891                         /* If we have a partial write to a failed
3892                          * device, then we will need to reconstruct
3893                          * the content of that device, so all other
3894                          * devices must be read.
3895                          */
3896                         return 1;
3897
3898                 if (s->failed >= 2 &&
3899                     (fdev[i]->towrite ||
3900                      s->failed_num[i] == sh->pd_idx ||
3901                      s->failed_num[i] == sh->qd_idx) &&
3902                     !test_bit(R5_UPTODATE, &fdev[i]->flags))
3903                         /* In max degraded raid6, If the failed disk is P, Q,
3904                          * or we want to read the failed disk, we need to do
3905                          * reconstruct-write.
3906                          */
3907                         force_rcw = true;
3908         }
3909
3910         /* If we are forced to do a reconstruct-write, because parity
3911          * cannot be trusted and we are currently recovering it, there
3912          * is extra need to be careful.
3913          * If one of the devices that we would need to read, because
3914          * it is not being overwritten (and maybe not written at all)
3915          * is missing/faulty, then we need to read everything we can.
3916          */
3917         if (!force_rcw &&
3918             sh->sector < sh->raid_conf->mddev->recovery_cp)
3919                 /* reconstruct-write isn't being forced */
3920                 return 0;
3921         for (i = 0; i < s->failed && i < 2; i++) {
3922                 if (s->failed_num[i] != sh->pd_idx &&
3923                     s->failed_num[i] != sh->qd_idx &&
3924                     !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3925                     !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3926                         return 1;
3927         }
3928
3929         return 0;
3930 }
3931
3932 /* fetch_block - checks the given member device to see if its data needs
3933  * to be read or computed to satisfy a request.
3934  *
3935  * Returns 1 when no more member devices need to be checked, otherwise returns
3936  * 0 to tell the loop in handle_stripe_fill to continue
3937  */
3938 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3939                        int disk_idx, int disks)
3940 {
3941         struct r5dev *dev = &sh->dev[disk_idx];
3942
3943         /* is the data in this block needed, and can we get it? */
3944         if (need_this_block(sh, s, disk_idx, disks)) {
3945                 /* we would like to get this block, possibly by computing it,
3946                  * otherwise read it if the backing disk is insync
3947                  */
3948                 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3949                 BUG_ON(test_bit(R5_Wantread, &dev->flags));
3950                 BUG_ON(sh->batch_head);
3951
3952                 /*
3953                  * In the raid6 case if the only non-uptodate disk is P
3954                  * then we already trusted P to compute the other failed
3955                  * drives. It is safe to compute rather than re-read P.
3956                  * In other cases we only compute blocks from failed
3957                  * devices, otherwise check/repair might fail to detect
3958                  * a real inconsistency.
3959                  */
3960
3961                 if ((s->uptodate == disks - 1) &&
3962                     ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3963                     (s->failed && (disk_idx == s->failed_num[0] ||
3964                                    disk_idx == s->failed_num[1])))) {
3965                         /* have disk failed, and we're requested to fetch it;
3966                          * do compute it
3967                          */
3968                         pr_debug("Computing stripe %llu block %d\n",
3969                                (unsigned long long)sh->sector, disk_idx);
3970                         set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3971                         set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3972                         set_bit(R5_Wantcompute, &dev->flags);
3973                         sh->ops.target = disk_idx;
3974                         sh->ops.target2 = -1; /* no 2nd target */
3975                         s->req_compute = 1;
3976                         /* Careful: from this point on 'uptodate' is in the eye
3977                          * of raid_run_ops which services 'compute' operations
3978                          * before writes. R5_Wantcompute flags a block that will
3979                          * be R5_UPTODATE by the time it is needed for a
3980                          * subsequent operation.
3981                          */
3982                         s->uptodate++;
3983                         return 1;
3984                 } else if (s->uptodate == disks-2 && s->failed >= 2) {
3985                         /* Computing 2-failure is *very* expensive; only
3986                          * do it if failed >= 2
3987                          */
3988                         int other;
3989                         for (other = disks; other--; ) {
3990                                 if (other == disk_idx)
3991                                         continue;
3992                                 if (!test_bit(R5_UPTODATE,
3993                                       &sh->dev[other].flags))
3994                                         break;
3995                         }
3996                         BUG_ON(other < 0);
3997                         pr_debug("Computing stripe %llu blocks %d,%d\n",
3998                                (unsigned long long)sh->sector,
3999                                disk_idx, other);
4000                         set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4001                         set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4002                         set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
4003                         set_bit(R5_Wantcompute, &sh->dev[other].flags);
4004                         sh->ops.target = disk_idx;
4005                         sh->ops.target2 = other;
4006                         s->uptodate += 2;
4007                         s->req_compute = 1;
4008                         return 1;
4009                 } else if (test_bit(R5_Insync, &dev->flags)) {
4010                         set_bit(R5_LOCKED, &dev->flags);
4011                         set_bit(R5_Wantread, &dev->flags);
4012                         s->locked++;
4013                         pr_debug("Reading block %d (sync=%d)\n",
4014                                 disk_idx, s->syncing);
4015                 }
4016         }
4017
4018         return 0;
4019 }
4020
4021 /*
4022  * handle_stripe_fill - read or compute data to satisfy pending requests.
4023  */
4024 static void handle_stripe_fill(struct stripe_head *sh,
4025                                struct stripe_head_state *s,
4026                                int disks)
4027 {
4028         int i;
4029
4030         /* look for blocks to read/compute, skip this if a compute
4031          * is already in flight, or if the stripe contents are in the
4032          * midst of changing due to a write
4033          */
4034         if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
4035             !sh->reconstruct_state) {
4036
4037                 /*
4038                  * For degraded stripe with data in journal, do not handle
4039                  * read requests yet, instead, flush the stripe to raid
4040                  * disks first, this avoids handling complex rmw of write
4041                  * back cache (prexor with orig_page, and then xor with
4042                  * page) in the read path
4043                  */
4044                 if (s->to_read && s->injournal && s->failed) {
4045                         if (test_bit(STRIPE_R5C_CACHING, &sh->state))
4046                                 r5c_make_stripe_write_out(sh);
4047                         goto out;
4048                 }
4049
4050                 for (i = disks; i--; )
4051                         if (fetch_block(sh, s, i, disks))
4052                                 break;
4053         }
4054 out:
4055         set_bit(STRIPE_HANDLE, &sh->state);
4056 }
4057
4058 static void break_stripe_batch_list(struct stripe_head *head_sh,
4059                                     unsigned long handle_flags);
4060 /* handle_stripe_clean_event
4061  * any written block on an uptodate or failed drive can be returned.
4062  * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
4063  * never LOCKED, so we don't need to test 'failed' directly.
4064  */
4065 static void handle_stripe_clean_event(struct r5conf *conf,
4066         struct stripe_head *sh, int disks)
4067 {
4068         int i;
4069         struct r5dev *dev;
4070         int discard_pending = 0;
4071         struct stripe_head *head_sh = sh;
4072         bool do_endio = false;
4073
4074         for (i = disks; i--; )
4075                 if (sh->dev[i].written) {
4076                         dev = &sh->dev[i];
4077                         if (!test_bit(R5_LOCKED, &dev->flags) &&
4078                             (test_bit(R5_UPTODATE, &dev->flags) ||
4079                              test_bit(R5_Discard, &dev->flags) ||
4080                              test_bit(R5_SkipCopy, &dev->flags))) {
4081                                 /* We can return any write requests */
4082                                 struct bio *wbi, *wbi2;
4083                                 pr_debug("Return write for disc %d\n", i);
4084                                 if (test_and_clear_bit(R5_Discard, &dev->flags))
4085                                         clear_bit(R5_UPTODATE, &dev->flags);
4086                                 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
4087                                         WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
4088                                 }
4089                                 do_endio = true;
4090
4091 returnbi:
4092                                 dev->page = dev->orig_page;
4093                                 wbi = dev->written;
4094                                 dev->written = NULL;
4095                                 while (wbi && wbi->bi_iter.bi_sector <
4096                                         dev->sector + RAID5_STRIPE_SECTORS(conf)) {
4097                                         wbi2 = r5_next_bio(conf, wbi, dev->sector);
4098                                         md_write_end(conf->mddev);
4099                                         bio_endio(wbi);
4100                                         wbi = wbi2;
4101                                 }
4102                                 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
4103                                                    RAID5_STRIPE_SECTORS(conf),
4104                                                    !test_bit(STRIPE_DEGRADED, &sh->state),
4105                                                    0);
4106                                 if (head_sh->batch_head) {
4107                                         sh = list_first_entry(&sh->batch_list,
4108                                                               struct stripe_head,
4109                                                               batch_list);
4110                                         if (sh != head_sh) {
4111                                                 dev = &sh->dev[i];
4112                                                 goto returnbi;
4113                                         }
4114                                 }
4115                                 sh = head_sh;
4116                                 dev = &sh->dev[i];
4117                         } else if (test_bit(R5_Discard, &dev->flags))
4118                                 discard_pending = 1;
4119                 }
4120
4121         log_stripe_write_finished(sh);
4122
4123         if (!discard_pending &&
4124             test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
4125                 int hash;
4126                 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
4127                 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4128                 if (sh->qd_idx >= 0) {
4129                         clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
4130                         clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
4131                 }
4132                 /* now that discard is done we can proceed with any sync */
4133                 clear_bit(STRIPE_DISCARD, &sh->state);
4134                 /*
4135                  * SCSI discard will change some bio fields and the stripe has
4136                  * no updated data, so remove it from hash list and the stripe
4137                  * will be reinitialized
4138                  */
4139 unhash:
4140                 hash = sh->hash_lock_index;
4141                 spin_lock_irq(conf->hash_locks + hash);
4142                 remove_hash(sh);
4143                 spin_unlock_irq(conf->hash_locks + hash);
4144                 if (head_sh->batch_head) {
4145                         sh = list_first_entry(&sh->batch_list,
4146                                               struct stripe_head, batch_list);
4147                         if (sh != head_sh)
4148                                         goto unhash;
4149                 }
4150                 sh = head_sh;
4151
4152                 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
4153                         set_bit(STRIPE_HANDLE, &sh->state);
4154
4155         }
4156
4157         if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
4158                 if (atomic_dec_and_test(&conf->pending_full_writes))
4159                         md_wakeup_thread(conf->mddev->thread);
4160
4161         if (head_sh->batch_head && do_endio)
4162                 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
4163 }
4164
4165 /*
4166  * For RMW in write back cache, we need extra page in prexor to store the
4167  * old data. This page is stored in dev->orig_page.
4168  *
4169  * This function checks whether we have data for prexor. The exact logic
4170  * is:
4171  *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
4172  */
4173 static inline bool uptodate_for_rmw(struct r5dev *dev)
4174 {
4175         return (test_bit(R5_UPTODATE, &dev->flags)) &&
4176                 (!test_bit(R5_InJournal, &dev->flags) ||
4177                  test_bit(R5_OrigPageUPTDODATE, &dev->flags));
4178 }
4179
4180 static int handle_stripe_dirtying(struct r5conf *conf,
4181                                   struct stripe_head *sh,
4182                                   struct stripe_head_state *s,
4183                                   int disks)
4184 {
4185         int rmw = 0, rcw = 0, i;
4186         sector_t recovery_cp = conf->mddev->recovery_cp;
4187
4188         /* Check whether resync is now happening or should start.
4189          * If yes, then the array is dirty (after unclean shutdown or
4190          * initial creation), so parity in some stripes might be inconsistent.
4191          * In this case, we need to always do reconstruct-write, to ensure
4192          * that in case of drive failure or read-error correction, we
4193          * generate correct data from the parity.
4194          */
4195         if (conf->rmw_level == PARITY_DISABLE_RMW ||
4196             (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
4197              s->failed == 0)) {
4198                 /* Calculate the real rcw later - for now make it
4199                  * look like rcw is cheaper
4200                  */
4201                 rcw = 1; rmw = 2;
4202                 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
4203                          conf->rmw_level, (unsigned long long)recovery_cp,
4204                          (unsigned long long)sh->sector);
4205         } else for (i = disks; i--; ) {
4206                 /* would I have to read this buffer for read_modify_write */
4207                 struct r5dev *dev = &sh->dev[i];
4208                 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4209                      i == sh->pd_idx || i == sh->qd_idx ||
4210                      test_bit(R5_InJournal, &dev->flags)) &&
4211                     !test_bit(R5_LOCKED, &dev->flags) &&
4212                     !(uptodate_for_rmw(dev) ||
4213                       test_bit(R5_Wantcompute, &dev->flags))) {
4214                         if (test_bit(R5_Insync, &dev->flags))
4215                                 rmw++;
4216                         else
4217                                 rmw += 2*disks;  /* cannot read it */
4218                 }
4219                 /* Would I have to read this buffer for reconstruct_write */
4220                 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4221                     i != sh->pd_idx && i != sh->qd_idx &&
4222                     !test_bit(R5_LOCKED, &dev->flags) &&
4223                     !(test_bit(R5_UPTODATE, &dev->flags) ||
4224                       test_bit(R5_Wantcompute, &dev->flags))) {
4225                         if (test_bit(R5_Insync, &dev->flags))
4226                                 rcw++;
4227                         else
4228                                 rcw += 2*disks;
4229                 }
4230         }
4231
4232         pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
4233                  (unsigned long long)sh->sector, sh->state, rmw, rcw);
4234         set_bit(STRIPE_HANDLE, &sh->state);
4235         if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
4236                 /* prefer read-modify-write, but need to get some data */
4237                 if (conf->mddev->queue)
4238                         blk_add_trace_msg(conf->mddev->queue,
4239                                           "raid5 rmw %llu %d",
4240                                           (unsigned long long)sh->sector, rmw);
4241                 for (i = disks; i--; ) {
4242                         struct r5dev *dev = &sh->dev[i];
4243                         if (test_bit(R5_InJournal, &dev->flags) &&
4244                             dev->page == dev->orig_page &&
4245                             !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
4246                                 /* alloc page for prexor */
4247                                 struct page *p = alloc_page(GFP_NOIO);
4248
4249                                 if (p) {
4250                                         dev->orig_page = p;
4251                                         continue;
4252                                 }
4253
4254                                 /*
4255                                  * alloc_page() failed, try use
4256                                  * disk_info->extra_page
4257                                  */
4258                                 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
4259                                                       &conf->cache_state)) {
4260                                         r5c_use_extra_page(sh);
4261                                         break;
4262                                 }
4263
4264                                 /* extra_page in use, add to delayed_list */
4265                                 set_bit(STRIPE_DELAYED, &sh->state);
4266                                 s->waiting_extra_page = 1;
4267                                 return -EAGAIN;
4268                         }
4269                 }
4270
4271                 for (i = disks; i--; ) {
4272                         struct r5dev *dev = &sh->dev[i];
4273                         if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4274                              i == sh->pd_idx || i == sh->qd_idx ||
4275                              test_bit(R5_InJournal, &dev->flags)) &&
4276                             !test_bit(R5_LOCKED, &dev->flags) &&
4277                             !(uptodate_for_rmw(dev) ||
4278                               test_bit(R5_Wantcompute, &dev->flags)) &&
4279                             test_bit(R5_Insync, &dev->flags)) {
4280                                 if (test_bit(STRIPE_PREREAD_ACTIVE,
4281                                              &sh->state)) {
4282                                         pr_debug("Read_old block %d for r-m-w\n",
4283                                                  i);
4284                                         set_bit(R5_LOCKED, &dev->flags);
4285                                         set_bit(R5_Wantread, &dev->flags);
4286                                         s->locked++;
4287                                 } else
4288                                         set_bit(STRIPE_DELAYED, &sh->state);
4289                         }
4290                 }
4291         }
4292         if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
4293                 /* want reconstruct write, but need to get some data */
4294                 int qread =0;
4295                 rcw = 0;
4296                 for (i = disks; i--; ) {
4297                         struct r5dev *dev = &sh->dev[i];
4298                         if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4299                             i != sh->pd_idx && i != sh->qd_idx &&
4300                             !test_bit(R5_LOCKED, &dev->flags) &&
4301                             !(test_bit(R5_UPTODATE, &dev->flags) ||
4302                               test_bit(R5_Wantcompute, &dev->flags))) {
4303                                 rcw++;
4304                                 if (test_bit(R5_Insync, &dev->flags) &&
4305                                     test_bit(STRIPE_PREREAD_ACTIVE,
4306                                              &sh->state)) {
4307                                         pr_debug("Read_old block "
4308                                                 "%d for Reconstruct\n", i);
4309                                         set_bit(R5_LOCKED, &dev->flags);
4310                                         set_bit(R5_Wantread, &dev->flags);
4311                                         s->locked++;
4312                                         qread++;
4313                                 } else
4314                                         set_bit(STRIPE_DELAYED, &sh->state);
4315                         }
4316                 }
4317                 if (rcw && conf->mddev->queue)
4318                         blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4319                                           (unsigned long long)sh->sector,
4320                                           rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4321         }
4322
4323         if (rcw > disks && rmw > disks &&
4324             !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4325                 set_bit(STRIPE_DELAYED, &sh->state);
4326
4327         /* now if nothing is locked, and if we have enough data,
4328          * we can start a write request
4329          */
4330         /* since handle_stripe can be called at any time we need to handle the
4331          * case where a compute block operation has been submitted and then a
4332          * subsequent call wants to start a write request.  raid_run_ops only
4333          * handles the case where compute block and reconstruct are requested
4334          * simultaneously.  If this is not the case then new writes need to be
4335          * held off until the compute completes.
4336          */
4337         if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4338             (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4339              !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4340                 schedule_reconstruction(sh, s, rcw == 0, 0);
4341         return 0;
4342 }
4343
4344 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4345                                 struct stripe_head_state *s, int disks)
4346 {
4347         struct r5dev *dev = NULL;
4348
4349         BUG_ON(sh->batch_head);
4350         set_bit(STRIPE_HANDLE, &sh->state);
4351
4352         switch (sh->check_state) {
4353         case check_state_idle:
4354                 /* start a new check operation if there are no failures */
4355                 if (s->failed == 0) {
4356                         BUG_ON(s->uptodate != disks);
4357                         sh->check_state = check_state_run;
4358                         set_bit(STRIPE_OP_CHECK, &s->ops_request);
4359                         clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4360                         s->uptodate--;
4361                         break;
4362                 }
4363                 dev = &sh->dev[s->failed_num[0]];
4364                 fallthrough;
4365         case check_state_compute_result:
4366                 sh->check_state = check_state_idle;
4367                 if (!dev)
4368                         dev = &sh->dev[sh->pd_idx];
4369
4370                 /* check that a write has not made the stripe insync */
4371                 if (test_bit(STRIPE_INSYNC, &sh->state))
4372                         break;
4373
4374                 /* either failed parity check, or recovery is happening */
4375                 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4376                 BUG_ON(s->uptodate != disks);
4377
4378                 set_bit(R5_LOCKED, &dev->flags);
4379                 s->locked++;
4380                 set_bit(R5_Wantwrite, &dev->flags);
4381
4382                 clear_bit(STRIPE_DEGRADED, &sh->state);
4383                 set_bit(STRIPE_INSYNC, &sh->state);
4384                 break;
4385         case check_state_run:
4386                 break; /* we will be called again upon completion */
4387         case check_state_check_result:
4388                 sh->check_state = check_state_idle;
4389
4390                 /* if a failure occurred during the check operation, leave
4391                  * STRIPE_INSYNC not set and let the stripe be handled again
4392                  */
4393                 if (s->failed)
4394                         break;
4395
4396                 /* handle a successful check operation, if parity is correct
4397                  * we are done.  Otherwise update the mismatch count and repair
4398                  * parity if !MD_RECOVERY_CHECK
4399                  */
4400                 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
4401                         /* parity is correct (on disc,
4402                          * not in buffer any more)
4403                          */
4404                         set_bit(STRIPE_INSYNC, &sh->state);
4405                 else {
4406                         atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4407                         if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4408                                 /* don't try to repair!! */
4409                                 set_bit(STRIPE_INSYNC, &sh->state);
4410                                 pr_warn_ratelimited("%s: mismatch sector in range "
4411                                                     "%llu-%llu\n", mdname(conf->mddev),
4412                                                     (unsigned long long) sh->sector,
4413                                                     (unsigned long long) sh->sector +
4414                                                     RAID5_STRIPE_SECTORS(conf));
4415                         } else {
4416                                 sh->check_state = check_state_compute_run;
4417                                 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4418                                 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4419                                 set_bit(R5_Wantcompute,
4420                                         &sh->dev[sh->pd_idx].flags);
4421                                 sh->ops.target = sh->pd_idx;
4422                                 sh->ops.target2 = -1;
4423                                 s->uptodate++;
4424                         }
4425                 }
4426                 break;
4427         case check_state_compute_run:
4428                 break;
4429         default:
4430                 pr_err("%s: unknown check_state: %d sector: %llu\n",
4431                        __func__, sh->check_state,
4432                        (unsigned long long) sh->sector);
4433                 BUG();
4434         }
4435 }
4436
4437 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4438                                   struct stripe_head_state *s,
4439                                   int disks)
4440 {
4441         int pd_idx = sh->pd_idx;
4442         int qd_idx = sh->qd_idx;
4443         struct r5dev *dev;
4444
4445         BUG_ON(sh->batch_head);
4446         set_bit(STRIPE_HANDLE, &sh->state);
4447
4448         BUG_ON(s->failed > 2);
4449
4450         /* Want to check and possibly repair P and Q.
4451          * However there could be one 'failed' device, in which
4452          * case we can only check one of them, possibly using the
4453          * other to generate missing data
4454          */
4455
4456         switch (sh->check_state) {
4457         case check_state_idle:
4458                 /* start a new check operation if there are < 2 failures */
4459                 if (s->failed == s->q_failed) {
4460                         /* The only possible failed device holds Q, so it
4461                          * makes sense to check P (If anything else were failed,
4462                          * we would have used P to recreate it).
4463                          */
4464                         sh->check_state = check_state_run;
4465                 }
4466                 if (!s->q_failed && s->failed < 2) {
4467                         /* Q is not failed, and we didn't use it to generate
4468                          * anything, so it makes sense to check it
4469                          */
4470                         if (sh->check_state == check_state_run)
4471                                 sh->check_state = check_state_run_pq;
4472                         else
4473                                 sh->check_state = check_state_run_q;
4474                 }
4475
4476                 /* discard potentially stale zero_sum_result */
4477                 sh->ops.zero_sum_result = 0;
4478
4479                 if (sh->check_state == check_state_run) {
4480                         /* async_xor_zero_sum destroys the contents of P */
4481                         clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4482                         s->uptodate--;
4483                 }
4484                 if (sh->check_state >= check_state_run &&
4485                     sh->check_state <= check_state_run_pq) {
4486                         /* async_syndrome_zero_sum preserves P and Q, so
4487                          * no need to mark them !uptodate here
4488                          */
4489                         set_bit(STRIPE_OP_CHECK, &s->ops_request);
4490                         break;
4491                 }
4492
4493                 /* we have 2-disk failure */
4494                 BUG_ON(s->failed != 2);
4495                 fallthrough;
4496         case check_state_compute_result:
4497                 sh->check_state = check_state_idle;
4498
4499                 /* check that a write has not made the stripe insync */
4500                 if (test_bit(STRIPE_INSYNC, &sh->state))
4501                         break;
4502
4503                 /* now write out any block on a failed drive,
4504                  * or P or Q if they were recomputed
4505                  */
4506                 dev = NULL;
4507                 if (s->failed == 2) {
4508                         dev = &sh->dev[s->failed_num[1]];
4509                         s->locked++;
4510                         set_bit(R5_LOCKED, &dev->flags);
4511                         set_bit(R5_Wantwrite, &dev->flags);
4512                 }
4513                 if (s->failed >= 1) {
4514                         dev = &sh->dev[s->failed_num[0]];
4515                         s->locked++;
4516                         set_bit(R5_LOCKED, &dev->flags);
4517                         set_bit(R5_Wantwrite, &dev->flags);
4518                 }
4519                 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4520                         dev = &sh->dev[pd_idx];
4521                         s->locked++;
4522                         set_bit(R5_LOCKED, &dev->flags);
4523                         set_bit(R5_Wantwrite, &dev->flags);
4524                 }
4525                 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4526                         dev = &sh->dev[qd_idx];
4527                         s->locked++;
4528                         set_bit(R5_LOCKED, &dev->flags);
4529                         set_bit(R5_Wantwrite, &dev->flags);
4530                 }
4531                 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4532                               "%s: disk%td not up to date\n",
4533                               mdname(conf->mddev),
4534                               dev - (struct r5dev *) &sh->dev)) {
4535                         clear_bit(R5_LOCKED, &dev->flags);
4536                         clear_bit(R5_Wantwrite, &dev->flags);
4537                         s->locked--;
4538                 }
4539                 clear_bit(STRIPE_DEGRADED, &sh->state);
4540
4541                 set_bit(STRIPE_INSYNC, &sh->state);
4542                 break;
4543         case check_state_run:
4544         case check_state_run_q:
4545         case check_state_run_pq:
4546                 break; /* we will be called again upon completion */
4547         case check_state_check_result:
4548                 sh->check_state = check_state_idle;
4549
4550                 /* handle a successful check operation, if parity is correct
4551                  * we are done.  Otherwise update the mismatch count and repair
4552                  * parity if !MD_RECOVERY_CHECK
4553                  */
4554                 if (sh->ops.zero_sum_result == 0) {
4555                         /* both parities are correct */
4556                         if (!s->failed)
4557                                 set_bit(STRIPE_INSYNC, &sh->state);
4558                         else {
4559                                 /* in contrast to the raid5 case we can validate
4560                                  * parity, but still have a failure to write
4561                                  * back
4562                                  */
4563                                 sh->check_state = check_state_compute_result;
4564                                 /* Returning at this point means that we may go
4565                                  * off and bring p and/or q uptodate again so
4566                                  * we make sure to check zero_sum_result again
4567                                  * to verify if p or q need writeback
4568                                  */
4569                         }
4570                 } else {
4571                         atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4572                         if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4573                                 /* don't try to repair!! */
4574                                 set_bit(STRIPE_INSYNC, &sh->state);
4575                                 pr_warn_ratelimited("%s: mismatch sector in range "
4576                                                     "%llu-%llu\n", mdname(conf->mddev),
4577                                                     (unsigned long long) sh->sector,
4578                                                     (unsigned long long) sh->sector +
4579                                                     RAID5_STRIPE_SECTORS(conf));
4580                         } else {
4581                                 int *target = &sh->ops.target;
4582
4583                                 sh->ops.target = -1;
4584                                 sh->ops.target2 = -1;
4585                                 sh->check_state = check_state_compute_run;
4586                                 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4587                                 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4588                                 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4589                                         set_bit(R5_Wantcompute,
4590                                                 &sh->dev[pd_idx].flags);
4591                                         *target = pd_idx;
4592                                         target = &sh->ops.target2;
4593                                         s->uptodate++;
4594                                 }
4595                                 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4596                                         set_bit(R5_Wantcompute,
4597                                                 &sh->dev[qd_idx].flags);
4598                                         *target = qd_idx;
4599                                         s->uptodate++;
4600                                 }
4601                         }
4602                 }
4603                 break;
4604         case check_state_compute_run:
4605                 break;
4606         default:
4607                 pr_warn("%s: unknown check_state: %d sector: %llu\n",
4608                         __func__, sh->check_state,
4609                         (unsigned long long) sh->sector);
4610                 BUG();
4611         }
4612 }
4613
4614 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4615 {
4616         int i;
4617
4618         /* We have read all the blocks in this stripe and now we need to
4619          * copy some of them into a target stripe for expand.
4620          */
4621         struct dma_async_tx_descriptor *tx = NULL;
4622         BUG_ON(sh->batch_head);
4623         clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4624         for (i = 0; i < sh->disks; i++)
4625                 if (i != sh->pd_idx && i != sh->qd_idx) {
4626                         int dd_idx, j;
4627                         struct stripe_head *sh2;
4628                         struct async_submit_ctl submit;
4629
4630                         sector_t bn = raid5_compute_blocknr(sh, i, 1);
4631                         sector_t s = raid5_compute_sector(conf, bn, 0,
4632                                                           &dd_idx, NULL);
4633                         sh2 = raid5_get_active_stripe(conf, NULL, s,
4634                                 R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE);
4635                         if (sh2 == NULL)
4636                                 /* so far only the early blocks of this stripe
4637                                  * have been requested.  When later blocks
4638                                  * get requested, we will try again
4639                                  */
4640                                 continue;
4641                         if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4642                            test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4643                                 /* must have already done this block */
4644                                 raid5_release_stripe(sh2);
4645                                 continue;
4646                         }
4647
4648                         /* place all the copies on one channel */
4649                         init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4650                         tx = async_memcpy(sh2->dev[dd_idx].page,
4651                                           sh->dev[i].page, sh2->dev[dd_idx].offset,
4652                                           sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
4653                                           &submit);
4654
4655                         set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4656                         set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4657                         for (j = 0; j < conf->raid_disks; j++)
4658                                 if (j != sh2->pd_idx &&
4659                                     j != sh2->qd_idx &&
4660                                     !test_bit(R5_Expanded, &sh2->dev[j].flags))
4661                                         break;
4662                         if (j == conf->raid_disks) {
4663                                 set_bit(STRIPE_EXPAND_READY, &sh2->state);
4664                                 set_bit(STRIPE_HANDLE, &sh2->state);
4665                         }
4666                         raid5_release_stripe(sh2);
4667
4668                 }
4669         /* done submitting copies, wait for them to complete */
4670         async_tx_quiesce(&tx);
4671 }
4672
4673 /*
4674  * handle_stripe - do things to a stripe.
4675  *
4676  * We lock the stripe by setting STRIPE_ACTIVE and then examine the
4677  * state of various bits to see what needs to be done.
4678  * Possible results:
4679  *    return some read requests which now have data
4680  *    return some write requests which are safely on storage
4681  *    schedule a read on some buffers
4682  *    schedule a write of some buffers
4683  *    return confirmation of parity correctness
4684  *
4685  */
4686
4687 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4688 {
4689         struct r5conf *conf = sh->raid_conf;
4690         int disks = sh->disks;
4691         struct r5dev *dev;
4692         int i;
4693         int do_recovery = 0;
4694
4695         memset(s, 0, sizeof(*s));
4696
4697         s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4698         s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4699         s->failed_num[0] = -1;
4700         s->failed_num[1] = -1;
4701         s->log_failed = r5l_log_disk_error(conf);
4702
4703         /* Now to look around and see what can be done */
4704         rcu_read_lock();
4705         for (i=disks; i--; ) {
4706                 struct md_rdev *rdev;
4707                 sector_t first_bad;
4708                 int bad_sectors;
4709                 int is_bad = 0;
4710
4711                 dev = &sh->dev[i];
4712
4713                 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4714                          i, dev->flags,
4715                          dev->toread, dev->towrite, dev->written);
4716                 /* maybe we can reply to a read
4717                  *
4718                  * new wantfill requests are only permitted while
4719                  * ops_complete_biofill is guaranteed to be inactive
4720                  */
4721                 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4722                     !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4723                         set_bit(R5_Wantfill, &dev->flags);
4724
4725                 /* now count some things */
4726                 if (test_bit(R5_LOCKED, &dev->flags))
4727                         s->locked++;
4728                 if (test_bit(R5_UPTODATE, &dev->flags))
4729                         s->uptodate++;
4730                 if (test_bit(R5_Wantcompute, &dev->flags)) {
4731                         s->compute++;
4732                         BUG_ON(s->compute > 2);
4733                 }
4734
4735                 if (test_bit(R5_Wantfill, &dev->flags))
4736                         s->to_fill++;
4737                 else if (dev->toread)
4738                         s->to_read++;
4739                 if (dev->towrite) {
4740                         s->to_write++;
4741                         if (!test_bit(R5_OVERWRITE, &dev->flags))
4742                                 s->non_overwrite++;
4743                 }
4744                 if (dev->written)
4745                         s->written++;
4746                 /* Prefer to use the replacement for reads, but only
4747                  * if it is recovered enough and has no bad blocks.
4748                  */
4749                 rdev = rcu_dereference(conf->disks[i].replacement);
4750                 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4751                     rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4752                     !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4753                                  &first_bad, &bad_sectors))
4754                         set_bit(R5_ReadRepl, &dev->flags);
4755                 else {
4756                         if (rdev && !test_bit(Faulty, &rdev->flags))
4757                                 set_bit(R5_NeedReplace, &dev->flags);
4758                         else
4759                                 clear_bit(R5_NeedReplace, &dev->flags);
4760                         rdev = rcu_dereference(conf->disks[i].rdev);
4761                         clear_bit(R5_ReadRepl, &dev->flags);
4762                 }
4763                 if (rdev && test_bit(Faulty, &rdev->flags))
4764                         rdev = NULL;
4765                 if (rdev) {
4766                         is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4767                                              &first_bad, &bad_sectors);
4768                         if (s->blocked_rdev == NULL
4769                             && (test_bit(Blocked, &rdev->flags)
4770                                 || is_bad < 0)) {
4771                                 if (is_bad < 0)
4772                                         set_bit(BlockedBadBlocks,
4773                                                 &rdev->flags);
4774                                 s->blocked_rdev = rdev;
4775                                 atomic_inc(&rdev->nr_pending);
4776                         }
4777                 }
4778                 clear_bit(R5_Insync, &dev->flags);
4779                 if (!rdev)
4780                         /* Not in-sync */;
4781                 else if (is_bad) {
4782                         /* also not in-sync */
4783                         if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4784                             test_bit(R5_UPTODATE, &dev->flags)) {
4785                                 /* treat as in-sync, but with a read error
4786                                  * which we can now try to correct
4787                                  */
4788                                 set_bit(R5_Insync, &dev->flags);
4789                                 set_bit(R5_ReadError, &dev->flags);
4790                         }
4791                 } else if (test_bit(In_sync, &rdev->flags))
4792                         set_bit(R5_Insync, &dev->flags);
4793                 else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
4794                         /* in sync if before recovery_offset */
4795                         set_bit(R5_Insync, &dev->flags);
4796                 else if (test_bit(R5_UPTODATE, &dev->flags) &&
4797                          test_bit(R5_Expanded, &dev->flags))
4798                         /* If we've reshaped into here, we assume it is Insync.
4799                          * We will shortly update recovery_offset to make
4800                          * it official.
4801                          */
4802                         set_bit(R5_Insync, &dev->flags);
4803
4804                 if (test_bit(R5_WriteError, &dev->flags)) {
4805                         /* This flag does not apply to '.replacement'
4806                          * only to .rdev, so make sure to check that*/
4807                         struct md_rdev *rdev2 = rcu_dereference(
4808                                 conf->disks[i].rdev);
4809                         if (rdev2 == rdev)
4810                                 clear_bit(R5_Insync, &dev->flags);
4811                         if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4812                                 s->handle_bad_blocks = 1;
4813                                 atomic_inc(&rdev2->nr_pending);
4814                         } else
4815                                 clear_bit(R5_WriteError, &dev->flags);
4816                 }
4817                 if (test_bit(R5_MadeGood, &dev->flags)) {
4818                         /* This flag does not apply to '.replacement'
4819                          * only to .rdev, so make sure to check that*/
4820                         struct md_rdev *rdev2 = rcu_dereference(
4821                                 conf->disks[i].rdev);
4822                         if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4823                                 s->handle_bad_blocks = 1;
4824                                 atomic_inc(&rdev2->nr_pending);
4825                         } else
4826                                 clear_bit(R5_MadeGood, &dev->flags);
4827                 }
4828                 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4829                         struct md_rdev *rdev2 = rcu_dereference(
4830                                 conf->disks[i].replacement);
4831                         if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4832                                 s->handle_bad_blocks = 1;
4833                                 atomic_inc(&rdev2->nr_pending);
4834                         } else
4835                                 clear_bit(R5_MadeGoodRepl, &dev->flags);
4836                 }
4837                 if (!test_bit(R5_Insync, &dev->flags)) {
4838                         /* The ReadError flag will just be confusing now */
4839                         clear_bit(R5_ReadError, &dev->flags);
4840                         clear_bit(R5_ReWrite, &dev->flags);
4841                 }
4842                 if (test_bit(R5_ReadError, &dev->flags))
4843                         clear_bit(R5_Insync, &dev->flags);
4844                 if (!test_bit(R5_Insync, &dev->flags)) {
4845                         if (s->failed < 2)
4846                                 s->failed_num[s->failed] = i;
4847                         s->failed++;
4848                         if (rdev && !test_bit(Faulty, &rdev->flags))
4849                                 do_recovery = 1;
4850                         else if (!rdev) {
4851                                 rdev = rcu_dereference(
4852                                     conf->disks[i].replacement);
4853                                 if (rdev && !test_bit(Faulty, &rdev->flags))
4854                                         do_recovery = 1;
4855                         }
4856                 }
4857
4858                 if (test_bit(R5_InJournal, &dev->flags))
4859                         s->injournal++;
4860                 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4861                         s->just_cached++;
4862         }
4863         if (test_bit(STRIPE_SYNCING, &sh->state)) {
4864                 /* If there is a failed device being replaced,
4865                  *     we must be recovering.
4866                  * else if we are after recovery_cp, we must be syncing
4867                  * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4868                  * else we can only be replacing
4869                  * sync and recovery both need to read all devices, and so
4870                  * use the same flag.
4871                  */
4872                 if (do_recovery ||
4873                     sh->sector >= conf->mddev->recovery_cp ||
4874                     test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4875                         s->syncing = 1;
4876                 else
4877                         s->replacing = 1;
4878         }
4879         rcu_read_unlock();
4880 }
4881
4882 /*
4883  * Return '1' if this is a member of batch, or '0' if it is a lone stripe or
4884  * a head which can now be handled.
4885  */
4886 static int clear_batch_ready(struct stripe_head *sh)
4887 {
4888         struct stripe_head *tmp;
4889         if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4890                 return (sh->batch_head && sh->batch_head != sh);
4891         spin_lock(&sh->stripe_lock);
4892         if (!sh->batch_head) {
4893                 spin_unlock(&sh->stripe_lock);
4894                 return 0;
4895         }
4896
4897         /*
4898          * this stripe could be added to a batch list before we check
4899          * BATCH_READY, skips it
4900          */
4901         if (sh->batch_head != sh) {
4902                 spin_unlock(&sh->stripe_lock);
4903                 return 1;
4904         }
4905         spin_lock(&sh->batch_lock);
4906         list_for_each_entry(tmp, &sh->batch_list, batch_list)
4907                 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4908         spin_unlock(&sh->batch_lock);
4909         spin_unlock(&sh->stripe_lock);
4910
4911         /*
4912          * BATCH_READY is cleared, no new stripes can be added.
4913          * batch_list can be accessed without lock
4914          */
4915         return 0;
4916 }
4917
4918 static void break_stripe_batch_list(struct stripe_head *head_sh,
4919                                     unsigned long handle_flags)
4920 {
4921         struct stripe_head *sh, *next;
4922         int i;
4923         int do_wakeup = 0;
4924
4925         list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4926
4927                 list_del_init(&sh->batch_list);
4928
4929                 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4930                                           (1 << STRIPE_SYNCING) |
4931                                           (1 << STRIPE_REPLACED) |
4932                                           (1 << STRIPE_DELAYED) |
4933                                           (1 << STRIPE_BIT_DELAY) |
4934                                           (1 << STRIPE_FULL_WRITE) |
4935                                           (1 << STRIPE_BIOFILL_RUN) |
4936                                           (1 << STRIPE_COMPUTE_RUN)  |
4937                                           (1 << STRIPE_DISCARD) |
4938                                           (1 << STRIPE_BATCH_READY) |
4939                                           (1 << STRIPE_BATCH_ERR) |
4940                                           (1 << STRIPE_BITMAP_PENDING)),
4941                         "stripe state: %lx\n", sh->state);
4942                 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4943                                               (1 << STRIPE_REPLACED)),
4944                         "head stripe state: %lx\n", head_sh->state);
4945
4946                 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4947                                             (1 << STRIPE_PREREAD_ACTIVE) |
4948                                             (1 << STRIPE_DEGRADED) |
4949                                             (1 << STRIPE_ON_UNPLUG_LIST)),
4950                               head_sh->state & (1 << STRIPE_INSYNC));
4951
4952                 sh->check_state = head_sh->check_state;
4953                 sh->reconstruct_state = head_sh->reconstruct_state;
4954                 spin_lock_irq(&sh->stripe_lock);
4955                 sh->batch_head = NULL;
4956                 spin_unlock_irq(&sh->stripe_lock);
4957                 for (i = 0; i < sh->disks; i++) {
4958                         if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4959                                 do_wakeup = 1;
4960                         sh->dev[i].flags = head_sh->dev[i].flags &
4961                                 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4962                 }
4963                 if (handle_flags == 0 ||
4964                     sh->state & handle_flags)
4965                         set_bit(STRIPE_HANDLE, &sh->state);
4966                 raid5_release_stripe(sh);
4967         }
4968         spin_lock_irq(&head_sh->stripe_lock);
4969         head_sh->batch_head = NULL;
4970         spin_unlock_irq(&head_sh->stripe_lock);
4971         for (i = 0; i < head_sh->disks; i++)
4972                 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4973                         do_wakeup = 1;
4974         if (head_sh->state & handle_flags)
4975                 set_bit(STRIPE_HANDLE, &head_sh->state);
4976
4977         if (do_wakeup)
4978                 wake_up(&head_sh->raid_conf->wait_for_overlap);
4979 }
4980
4981 static void handle_stripe(struct stripe_head *sh)
4982 {
4983         struct stripe_head_state s;
4984         struct r5conf *conf = sh->raid_conf;
4985         int i;
4986         int prexor;
4987         int disks = sh->disks;
4988         struct r5dev *pdev, *qdev;
4989
4990         clear_bit(STRIPE_HANDLE, &sh->state);
4991
4992         /*
4993          * handle_stripe should not continue handle the batched stripe, only
4994          * the head of batch list or lone stripe can continue. Otherwise we
4995          * could see break_stripe_batch_list warns about the STRIPE_ACTIVE
4996          * is set for the batched stripe.
4997          */
4998         if (clear_batch_ready(sh))
4999                 return;
5000
5001         if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
5002                 /* already being handled, ensure it gets handled
5003                  * again when current action finishes */
5004                 set_bit(STRIPE_HANDLE, &sh->state);
5005                 return;
5006         }
5007
5008         if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
5009                 break_stripe_batch_list(sh, 0);
5010
5011         if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
5012                 spin_lock(&sh->stripe_lock);
5013                 /*
5014                  * Cannot process 'sync' concurrently with 'discard'.
5015                  * Flush data in r5cache before 'sync'.
5016                  */
5017                 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
5018                     !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
5019                     !test_bit(STRIPE_DISCARD, &sh->state) &&
5020                     test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
5021                         set_bit(STRIPE_SYNCING, &sh->state);
5022                         clear_bit(STRIPE_INSYNC, &sh->state);
5023                         clear_bit(STRIPE_REPLACED, &sh->state);
5024                 }
5025                 spin_unlock(&sh->stripe_lock);
5026         }
5027         clear_bit(STRIPE_DELAYED, &sh->state);
5028
5029         pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
5030                 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
5031                (unsigned long long)sh->sector, sh->state,
5032                atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
5033                sh->check_state, sh->reconstruct_state);
5034
5035         analyse_stripe(sh, &s);
5036
5037         if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
5038                 goto finish;
5039
5040         if (s.handle_bad_blocks ||
5041             test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
5042                 set_bit(STRIPE_HANDLE, &sh->state);
5043                 goto finish;
5044         }
5045
5046         if (unlikely(s.blocked_rdev)) {
5047                 if (s.syncing || s.expanding || s.expanded ||
5048                     s.replacing || s.to_write || s.written) {
5049                         set_bit(STRIPE_HANDLE, &sh->state);
5050                         goto finish;
5051                 }
5052                 /* There is nothing for the blocked_rdev to block */
5053                 rdev_dec_pending(s.blocked_rdev, conf->mddev);
5054                 s.blocked_rdev = NULL;
5055         }
5056
5057         if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
5058                 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
5059                 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
5060         }
5061
5062         pr_debug("locked=%d uptodate=%d to_read=%d"
5063                " to_write=%d failed=%d failed_num=%d,%d\n",
5064                s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
5065                s.failed_num[0], s.failed_num[1]);
5066         /*
5067          * check if the array has lost more than max_degraded devices and,
5068          * if so, some requests might need to be failed.
5069          *
5070          * When journal device failed (log_failed), we will only process
5071          * the stripe if there is data need write to raid disks
5072          */
5073         if (s.failed > conf->max_degraded ||
5074             (s.log_failed && s.injournal == 0)) {
5075                 sh->check_state = 0;
5076                 sh->reconstruct_state = 0;
5077                 break_stripe_batch_list(sh, 0);
5078                 if (s.to_read+s.to_write+s.written)
5079                         handle_failed_stripe(conf, sh, &s, disks);
5080                 if (s.syncing + s.replacing)
5081                         handle_failed_sync(conf, sh, &s);
5082         }
5083
5084         /* Now we check to see if any write operations have recently
5085          * completed
5086          */
5087         prexor = 0;
5088         if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
5089                 prexor = 1;
5090         if (sh->reconstruct_state == reconstruct_state_drain_result ||
5091             sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
5092                 sh->reconstruct_state = reconstruct_state_idle;
5093
5094                 /* All the 'written' buffers and the parity block are ready to
5095                  * be written back to disk
5096                  */
5097                 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
5098                        !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
5099                 BUG_ON(sh->qd_idx >= 0 &&
5100                        !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
5101                        !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
5102                 for (i = disks; i--; ) {
5103                         struct r5dev *dev = &sh->dev[i];
5104                         if (test_bit(R5_LOCKED, &dev->flags) &&
5105                                 (i == sh->pd_idx || i == sh->qd_idx ||
5106                                  dev->written || test_bit(R5_InJournal,
5107                                                           &dev->flags))) {
5108                                 pr_debug("Writing block %d\n", i);
5109                                 set_bit(R5_Wantwrite, &dev->flags);
5110                                 if (prexor)
5111                                         continue;
5112                                 if (s.failed > 1)
5113                                         continue;
5114                                 if (!test_bit(R5_Insync, &dev->flags) ||
5115                                     ((i == sh->pd_idx || i == sh->qd_idx)  &&
5116                                      s.failed == 0))
5117                                         set_bit(STRIPE_INSYNC, &sh->state);
5118                         }
5119                 }
5120                 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5121                         s.dec_preread_active = 1;
5122         }
5123
5124         /*
5125          * might be able to return some write requests if the parity blocks
5126          * are safe, or on a failed drive
5127          */
5128         pdev = &sh->dev[sh->pd_idx];
5129         s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
5130                 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
5131         qdev = &sh->dev[sh->qd_idx];
5132         s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
5133                 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
5134                 || conf->level < 6;
5135
5136         if (s.written &&
5137             (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
5138                              && !test_bit(R5_LOCKED, &pdev->flags)
5139                              && (test_bit(R5_UPTODATE, &pdev->flags) ||
5140                                  test_bit(R5_Discard, &pdev->flags))))) &&
5141             (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
5142                              && !test_bit(R5_LOCKED, &qdev->flags)
5143                              && (test_bit(R5_UPTODATE, &qdev->flags) ||
5144                                  test_bit(R5_Discard, &qdev->flags))))))
5145                 handle_stripe_clean_event(conf, sh, disks);
5146
5147         if (s.just_cached)
5148                 r5c_handle_cached_data_endio(conf, sh, disks);
5149         log_stripe_write_finished(sh);
5150
5151         /* Now we might consider reading some blocks, either to check/generate
5152          * parity, or to satisfy requests
5153          * or to load a block that is being partially written.
5154          */
5155         if (s.to_read || s.non_overwrite
5156             || (s.to_write && s.failed)
5157             || (s.syncing && (s.uptodate + s.compute < disks))
5158             || s.replacing
5159             || s.expanding)
5160                 handle_stripe_fill(sh, &s, disks);
5161
5162         /*
5163          * When the stripe finishes full journal write cycle (write to journal
5164          * and raid disk), this is the clean up procedure so it is ready for
5165          * next operation.
5166          */
5167         r5c_finish_stripe_write_out(conf, sh, &s);
5168
5169         /*
5170          * Now to consider new write requests, cache write back and what else,
5171          * if anything should be read.  We do not handle new writes when:
5172          * 1/ A 'write' operation (copy+xor) is already in flight.
5173          * 2/ A 'check' operation is in flight, as it may clobber the parity
5174          *    block.
5175          * 3/ A r5c cache log write is in flight.
5176          */
5177
5178         if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
5179                 if (!r5c_is_writeback(conf->log)) {
5180                         if (s.to_write)
5181                                 handle_stripe_dirtying(conf, sh, &s, disks);
5182                 } else { /* write back cache */
5183                         int ret = 0;
5184
5185                         /* First, try handle writes in caching phase */
5186                         if (s.to_write)
5187                                 ret = r5c_try_caching_write(conf, sh, &s,
5188                                                             disks);
5189                         /*
5190                          * If caching phase failed: ret == -EAGAIN
5191                          *    OR
5192                          * stripe under reclaim: !caching && injournal
5193                          *
5194                          * fall back to handle_stripe_dirtying()
5195                          */
5196                         if (ret == -EAGAIN ||
5197                             /* stripe under reclaim: !caching && injournal */
5198                             (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
5199                              s.injournal > 0)) {
5200                                 ret = handle_stripe_dirtying(conf, sh, &s,
5201                                                              disks);
5202                                 if (ret == -EAGAIN)
5203                                         goto finish;
5204                         }
5205                 }
5206         }
5207
5208         /* maybe we need to check and possibly fix the parity for this stripe
5209          * Any reads will already have been scheduled, so we just see if enough
5210          * data is available.  The parity check is held off while parity
5211          * dependent operations are in flight.
5212          */
5213         if (sh->check_state ||
5214             (s.syncing && s.locked == 0 &&
5215              !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5216              !test_bit(STRIPE_INSYNC, &sh->state))) {
5217                 if (conf->level == 6)
5218                         handle_parity_checks6(conf, sh, &s, disks);
5219                 else
5220                         handle_parity_checks5(conf, sh, &s, disks);
5221         }
5222
5223         if ((s.replacing || s.syncing) && s.locked == 0
5224             && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
5225             && !test_bit(STRIPE_REPLACED, &sh->state)) {
5226                 /* Write out to replacement devices where possible */
5227                 for (i = 0; i < conf->raid_disks; i++)
5228                         if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
5229                                 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
5230                                 set_bit(R5_WantReplace, &sh->dev[i].flags);
5231                                 set_bit(R5_LOCKED, &sh->dev[i].flags);
5232                                 s.locked++;
5233                         }
5234                 if (s.replacing)
5235                         set_bit(STRIPE_INSYNC, &sh->state);
5236                 set_bit(STRIPE_REPLACED, &sh->state);
5237         }
5238         if ((s.syncing || s.replacing) && s.locked == 0 &&
5239             !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5240             test_bit(STRIPE_INSYNC, &sh->state)) {
5241                 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5242                 clear_bit(STRIPE_SYNCING, &sh->state);
5243                 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
5244                         wake_up(&conf->wait_for_overlap);
5245         }
5246
5247         /* If the failed drives are just a ReadError, then we might need
5248          * to progress the repair/check process
5249          */
5250         if (s.failed <= conf->max_degraded && !conf->mddev->ro)
5251                 for (i = 0; i < s.failed; i++) {
5252                         struct r5dev *dev = &sh->dev[s.failed_num[i]];
5253                         if (test_bit(R5_ReadError, &dev->flags)
5254                             && !test_bit(R5_LOCKED, &dev->flags)
5255                             && test_bit(R5_UPTODATE, &dev->flags)
5256                                 ) {
5257                                 if (!test_bit(R5_ReWrite, &dev->flags)) {
5258                                         set_bit(R5_Wantwrite, &dev->flags);
5259                                         set_bit(R5_ReWrite, &dev->flags);
5260                                 } else
5261                                         /* let's read it back */
5262                                         set_bit(R5_Wantread, &dev->flags);
5263                                 set_bit(R5_LOCKED, &dev->flags);
5264                                 s.locked++;
5265                         }
5266                 }
5267
5268         /* Finish reconstruct operations initiated by the expansion process */
5269         if (sh->reconstruct_state == reconstruct_state_result) {
5270                 struct stripe_head *sh_src
5271                         = raid5_get_active_stripe(conf, NULL, sh->sector,
5272                                         R5_GAS_PREVIOUS | R5_GAS_NOBLOCK |
5273                                         R5_GAS_NOQUIESCE);
5274                 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
5275                         /* sh cannot be written until sh_src has been read.
5276                          * so arrange for sh to be delayed a little
5277                          */
5278                         set_bit(STRIPE_DELAYED, &sh->state);
5279                         set_bit(STRIPE_HANDLE, &sh->state);
5280                         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
5281                                               &sh_src->state))
5282                                 atomic_inc(&conf->preread_active_stripes);
5283                         raid5_release_stripe(sh_src);
5284                         goto finish;
5285                 }
5286                 if (sh_src)
5287                         raid5_release_stripe(sh_src);
5288
5289                 sh->reconstruct_state = reconstruct_state_idle;
5290                 clear_bit(STRIPE_EXPANDING, &sh->state);
5291                 for (i = conf->raid_disks; i--; ) {
5292                         set_bit(R5_Wantwrite, &sh->dev[i].flags);
5293                         set_bit(R5_LOCKED, &sh->dev[i].flags);
5294                         s.locked++;
5295                 }
5296         }
5297
5298         if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
5299             !sh->reconstruct_state) {
5300                 /* Need to write out all blocks after computing parity */
5301                 sh->disks = conf->raid_disks;
5302                 stripe_set_idx(sh->sector, conf, 0, sh);
5303                 schedule_reconstruction(sh, &s, 1, 1);
5304         } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
5305                 clear_bit(STRIPE_EXPAND_READY, &sh->state);
5306                 atomic_dec(&conf->reshape_stripes);
5307                 wake_up(&conf->wait_for_overlap);
5308                 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5309         }
5310
5311         if (s.expanding && s.locked == 0 &&
5312             !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
5313                 handle_stripe_expansion(conf, sh);
5314
5315 finish:
5316         /* wait for this device to become unblocked */
5317         if (unlikely(s.blocked_rdev)) {
5318                 if (conf->mddev->external)
5319                         md_wait_for_blocked_rdev(s.blocked_rdev,
5320                                                  conf->mddev);
5321                 else
5322                         /* Internal metadata will immediately
5323                          * be written by raid5d, so we don't
5324                          * need to wait here.
5325                          */
5326                         rdev_dec_pending(s.blocked_rdev,
5327                                          conf->mddev);
5328         }
5329
5330         if (s.handle_bad_blocks)
5331                 for (i = disks; i--; ) {
5332                         struct md_rdev *rdev;
5333                         struct r5dev *dev = &sh->dev[i];
5334                         if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
5335                                 /* We own a safe reference to the rdev */
5336                                 rdev = rdev_pend_deref(conf->disks[i].rdev);
5337                                 if (!rdev_set_badblocks(rdev, sh->sector,
5338                                                         RAID5_STRIPE_SECTORS(conf), 0))
5339                                         md_error(conf->mddev, rdev);
5340                                 rdev_dec_pending(rdev, conf->mddev);
5341                         }
5342                         if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5343                                 rdev = rdev_pend_deref(conf->disks[i].rdev);
5344                                 rdev_clear_badblocks(rdev, sh->sector,
5345                                                      RAID5_STRIPE_SECTORS(conf), 0);
5346                                 rdev_dec_pending(rdev, conf->mddev);
5347                         }
5348                         if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5349                                 rdev = rdev_pend_deref(conf->disks[i].replacement);
5350                                 if (!rdev)
5351                                         /* rdev have been moved down */
5352                                         rdev = rdev_pend_deref(conf->disks[i].rdev);
5353                                 rdev_clear_badblocks(rdev, sh->sector,
5354                                                      RAID5_STRIPE_SECTORS(conf), 0);
5355                                 rdev_dec_pending(rdev, conf->mddev);
5356                         }
5357                 }
5358
5359         if (s.ops_request)
5360                 raid_run_ops(sh, s.ops_request);
5361
5362         ops_run_io(sh, &s);
5363
5364         if (s.dec_preread_active) {
5365                 /* We delay this until after ops_run_io so that if make_request
5366                  * is waiting on a flush, it won't continue until the writes
5367                  * have actually been submitted.
5368                  */
5369                 atomic_dec(&conf->preread_active_stripes);
5370                 if (atomic_read(&conf->preread_active_stripes) <
5371                     IO_THRESHOLD)
5372                         md_wakeup_thread(conf->mddev->thread);
5373         }
5374
5375         clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5376 }
5377
5378 static void raid5_activate_delayed(struct r5conf *conf)
5379         __must_hold(&conf->device_lock)
5380 {
5381         if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5382                 while (!list_empty(&conf->delayed_list)) {
5383                         struct list_head *l = conf->delayed_list.next;
5384                         struct stripe_head *sh;
5385                         sh = list_entry(l, struct stripe_head, lru);
5386                         list_del_init(l);
5387                         clear_bit(STRIPE_DELAYED, &sh->state);
5388                         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5389                                 atomic_inc(&conf->preread_active_stripes);
5390                         list_add_tail(&sh->lru, &conf->hold_list);
5391                         raid5_wakeup_stripe_thread(sh);
5392                 }
5393         }
5394 }
5395
5396 static void activate_bit_delay(struct r5conf *conf,
5397                 struct list_head *temp_inactive_list)
5398         __must_hold(&conf->device_lock)
5399 {
5400         struct list_head head;
5401         list_add(&head, &conf->bitmap_list);
5402         list_del_init(&conf->bitmap_list);
5403         while (!list_empty(&head)) {
5404                 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5405                 int hash;
5406                 list_del_init(&sh->lru);
5407                 atomic_inc(&sh->count);
5408                 hash = sh->hash_lock_index;
5409                 __release_stripe(conf, sh, &temp_inactive_list[hash]);
5410         }
5411 }
5412
5413 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5414 {
5415         struct r5conf *conf = mddev->private;
5416         sector_t sector = bio->bi_iter.bi_sector;
5417         unsigned int chunk_sectors;
5418         unsigned int bio_sectors = bio_sectors(bio);
5419
5420         chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5421         return  chunk_sectors >=
5422                 ((sector & (chunk_sectors - 1)) + bio_sectors);
5423 }
5424
5425 /*
5426  *  add bio to the retry LIFO  ( in O(1) ... we are in interrupt )
5427  *  later sampled by raid5d.
5428  */
5429 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
5430 {
5431         unsigned long flags;
5432
5433         spin_lock_irqsave(&conf->device_lock, flags);
5434
5435         bi->bi_next = conf->retry_read_aligned_list;
5436         conf->retry_read_aligned_list = bi;
5437
5438         spin_unlock_irqrestore(&conf->device_lock, flags);
5439         md_wakeup_thread(conf->mddev->thread);
5440 }
5441
5442 static struct bio *remove_bio_from_retry(struct r5conf *conf,
5443                                          unsigned int *offset)
5444 {
5445         struct bio *bi;
5446
5447         bi = conf->retry_read_aligned;
5448         if (bi) {
5449                 *offset = conf->retry_read_offset;
5450                 conf->retry_read_aligned = NULL;
5451                 return bi;
5452         }
5453         bi = conf->retry_read_aligned_list;
5454         if(bi) {
5455                 conf->retry_read_aligned_list = bi->bi_next;
5456                 bi->bi_next = NULL;
5457                 *offset = 0;
5458         }
5459
5460         return bi;
5461 }
5462
5463 /*
5464  *  The "raid5_align_endio" should check if the read succeeded and if it
5465  *  did, call bio_endio on the original bio (having bio_put the new bio
5466  *  first).
5467  *  If the read failed..
5468  */
5469 static void raid5_align_endio(struct bio *bi)
5470 {
5471         struct md_io_acct *md_io_acct = bi->bi_private;
5472         struct bio *raid_bi = md_io_acct->orig_bio;
5473         struct mddev *mddev;
5474         struct r5conf *conf;
5475         struct md_rdev *rdev;
5476         blk_status_t error = bi->bi_status;
5477         unsigned long start_time = md_io_acct->start_time;
5478
5479         bio_put(bi);
5480
5481         rdev = (void*)raid_bi->bi_next;
5482         raid_bi->bi_next = NULL;
5483         mddev = rdev->mddev;
5484         conf = mddev->private;
5485
5486         rdev_dec_pending(rdev, conf->mddev);
5487
5488         if (!error) {
5489                 if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
5490                         bio_end_io_acct(raid_bi, start_time);
5491                 bio_endio(raid_bi);
5492                 if (atomic_dec_and_test(&conf->active_aligned_reads))
5493                         wake_up(&conf->wait_for_quiescent);
5494                 return;
5495         }
5496
5497         pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5498
5499         add_bio_to_retry(raid_bi, conf);
5500 }
5501
5502 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5503 {
5504         struct r5conf *conf = mddev->private;
5505         struct bio *align_bio;
5506         struct md_rdev *rdev;
5507         sector_t sector, end_sector, first_bad;
5508         int bad_sectors, dd_idx;
5509         struct md_io_acct *md_io_acct;
5510         bool did_inc;
5511
5512         if (!in_chunk_boundary(mddev, raid_bio)) {
5513                 pr_debug("%s: non aligned\n", __func__);
5514                 return 0;
5515         }
5516
5517         sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0,
5518                                       &dd_idx, NULL);
5519         end_sector = bio_end_sector(raid_bio);
5520
5521         rcu_read_lock();
5522         if (r5c_big_stripe_cached(conf, sector))
5523                 goto out_rcu_unlock;
5524
5525         rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5526         if (!rdev || test_bit(Faulty, &rdev->flags) ||
5527             rdev->recovery_offset < end_sector) {
5528                 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5529                 if (!rdev)
5530                         goto out_rcu_unlock;
5531                 if (test_bit(Faulty, &rdev->flags) ||
5532                     !(test_bit(In_sync, &rdev->flags) ||
5533                       rdev->recovery_offset >= end_sector))
5534                         goto out_rcu_unlock;
5535         }
5536
5537         atomic_inc(&rdev->nr_pending);
5538         rcu_read_unlock();
5539
5540         if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
5541                         &bad_sectors)) {
5542                 rdev_dec_pending(rdev, mddev);
5543                 return 0;
5544         }
5545
5546         align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
5547                                     &mddev->io_acct_set);
5548         md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
5549         raid_bio->bi_next = (void *)rdev;
5550         if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
5551                 md_io_acct->start_time = bio_start_io_acct(raid_bio);
5552         md_io_acct->orig_bio = raid_bio;
5553
5554         align_bio->bi_end_io = raid5_align_endio;
5555         align_bio->bi_private = md_io_acct;
5556         align_bio->bi_iter.bi_sector = sector;
5557
5558         /* No reshape active, so we can trust rdev->data_offset */
5559         align_bio->bi_iter.bi_sector += rdev->data_offset;
5560
5561         did_inc = false;
5562         if (conf->quiesce == 0) {
5563                 atomic_inc(&conf->active_aligned_reads);
5564                 did_inc = true;
5565         }
5566         /* need a memory barrier to detect the race with raid5_quiesce() */
5567         if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {
5568                 /* quiesce is in progress, so we need to undo io activation and wait
5569                  * for it to finish
5570                  */
5571                 if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads))
5572                         wake_up(&conf->wait_for_quiescent);
5573                 spin_lock_irq(&conf->device_lock);
5574                 wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
5575                                     conf->device_lock);
5576                 atomic_inc(&conf->active_aligned_reads);
5577                 spin_unlock_irq(&conf->device_lock);
5578         }
5579
5580         if (mddev->gendisk)
5581                 trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
5582                                       raid_bio->bi_iter.bi_sector);
5583         submit_bio_noacct(align_bio);
5584         return 1;
5585
5586 out_rcu_unlock:
5587         rcu_read_unlock();
5588         return 0;
5589 }
5590
5591 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5592 {
5593         struct bio *split;
5594         sector_t sector = raid_bio->bi_iter.bi_sector;
5595         unsigned chunk_sects = mddev->chunk_sectors;
5596         unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5597
5598         if (sectors < bio_sectors(raid_bio)) {
5599                 struct r5conf *conf = mddev->private;
5600                 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
5601                 bio_chain(split, raid_bio);
5602                 submit_bio_noacct(raid_bio);
5603                 raid_bio = split;
5604         }
5605
5606         if (!raid5_read_one_chunk(mddev, raid_bio))
5607                 return raid_bio;
5608
5609         return NULL;
5610 }
5611
5612 /* __get_priority_stripe - get the next stripe to process
5613  *
5614  * Full stripe writes are allowed to pass preread active stripes up until
5615  * the bypass_threshold is exceeded.  In general the bypass_count
5616  * increments when the handle_list is handled before the hold_list; however, it
5617  * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
5618  * stripe with in flight i/o.  The bypass_count will be reset when the
5619  * head of the hold_list has changed, i.e. the head was promoted to the
5620  * handle_list.
5621  */
5622 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5623         __must_hold(&conf->device_lock)
5624 {
5625         struct stripe_head *sh, *tmp;
5626         struct list_head *handle_list = NULL;
5627         struct r5worker_group *wg;
5628         bool second_try = !r5c_is_writeback(conf->log) &&
5629                 !r5l_log_disk_error(conf);
5630         bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5631                 r5l_log_disk_error(conf);
5632
5633 again:
5634         wg = NULL;
5635         sh = NULL;
5636         if (conf->worker_cnt_per_group == 0) {
5637                 handle_list = try_loprio ? &conf->loprio_list :
5638                                         &conf->handle_list;
5639         } else if (group != ANY_GROUP) {
5640                 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5641                                 &conf->worker_groups[group].handle_list;
5642                 wg = &conf->worker_groups[group];
5643         } else {
5644                 int i;
5645                 for (i = 0; i < conf->group_cnt; i++) {
5646                         handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5647                                 &conf->worker_groups[i].handle_list;
5648                         wg = &conf->worker_groups[i];
5649                         if (!list_empty(handle_list))
5650                                 break;
5651                 }
5652         }
5653
5654         pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5655                   __func__,
5656                   list_empty(handle_list) ? "empty" : "busy",
5657                   list_empty(&conf->hold_list) ? "empty" : "busy",
5658                   atomic_read(&conf->pending_full_writes), conf->bypass_count);
5659
5660         if (!list_empty(handle_list)) {
5661                 sh = list_entry(handle_list->next, typeof(*sh), lru);
5662
5663                 if (list_empty(&conf->hold_list))
5664                         conf->bypass_count = 0;
5665                 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5666                         if (conf->hold_list.next == conf->last_hold)
5667                                 conf->bypass_count++;
5668                         else {
5669                                 conf->last_hold = conf->hold_list.next;
5670                                 conf->bypass_count -= conf->bypass_threshold;
5671                                 if (conf->bypass_count < 0)
5672                                         conf->bypass_count = 0;
5673                         }
5674                 }
5675         } else if (!list_empty(&conf->hold_list) &&
5676                    ((conf->bypass_threshold &&
5677                      conf->bypass_count > conf->bypass_threshold) ||
5678                     atomic_read(&conf->pending_full_writes) == 0)) {
5679
5680                 list_for_each_entry(tmp, &conf->hold_list,  lru) {
5681                         if (conf->worker_cnt_per_group == 0 ||
5682                             group == ANY_GROUP ||
5683                             !cpu_online(tmp->cpu) ||
5684                             cpu_to_group(tmp->cpu) == group) {
5685                                 sh = tmp;
5686                                 break;
5687                         }
5688                 }
5689
5690                 if (sh) {
5691                         conf->bypass_count -= conf->bypass_threshold;
5692                         if (conf->bypass_count < 0)
5693                                 conf->bypass_count = 0;
5694                 }
5695                 wg = NULL;
5696         }
5697
5698         if (!sh) {
5699                 if (second_try)
5700                         return NULL;
5701                 second_try = true;
5702                 try_loprio = !try_loprio;
5703                 goto again;
5704         }
5705
5706         if (wg) {
5707                 wg->stripes_cnt--;
5708                 sh->group = NULL;
5709         }
5710         list_del_init(&sh->lru);
5711         BUG_ON(atomic_inc_return(&sh->count) != 1);
5712         return sh;
5713 }
5714
5715 struct raid5_plug_cb {
5716         struct blk_plug_cb      cb;
5717         struct list_head        list;
5718         struct list_head        temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5719 };
5720
5721 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5722 {
5723         struct raid5_plug_cb *cb = container_of(
5724                 blk_cb, struct raid5_plug_cb, cb);
5725         struct stripe_head *sh;
5726         struct mddev *mddev = cb->cb.data;
5727         struct r5conf *conf = mddev->private;
5728         int cnt = 0;
5729         int hash;
5730
5731         if (cb->list.next && !list_empty(&cb->list)) {
5732                 spin_lock_irq(&conf->device_lock);
5733                 while (!list_empty(&cb->list)) {
5734                         sh = list_first_entry(&cb->list, struct stripe_head, lru);
5735                         list_del_init(&sh->lru);
5736                         /*
5737                          * avoid race release_stripe_plug() sees
5738                          * STRIPE_ON_UNPLUG_LIST clear but the stripe
5739                          * is still in our list
5740                          */
5741                         smp_mb__before_atomic();
5742                         clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5743                         /*
5744                          * STRIPE_ON_RELEASE_LIST could be set here. In that
5745                          * case, the count is always > 1 here
5746                          */
5747                         hash = sh->hash_lock_index;
5748                         __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5749                         cnt++;
5750                 }
5751                 spin_unlock_irq(&conf->device_lock);
5752         }
5753         release_inactive_stripe_list(conf, cb->temp_inactive_list,
5754                                      NR_STRIPE_HASH_LOCKS);
5755         if (mddev->queue)
5756                 trace_block_unplug(mddev->queue, cnt, !from_schedule);
5757         kfree(cb);
5758 }
5759
5760 static void release_stripe_plug(struct mddev *mddev,
5761                                 struct stripe_head *sh)
5762 {
5763         struct blk_plug_cb *blk_cb = blk_check_plugged(
5764                 raid5_unplug, mddev,
5765                 sizeof(struct raid5_plug_cb));
5766         struct raid5_plug_cb *cb;
5767
5768         if (!blk_cb) {
5769                 raid5_release_stripe(sh);
5770                 return;
5771         }
5772
5773         cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5774
5775         if (cb->list.next == NULL) {
5776                 int i;
5777                 INIT_LIST_HEAD(&cb->list);
5778                 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5779                         INIT_LIST_HEAD(cb->temp_inactive_list + i);
5780         }
5781
5782         if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5783                 list_add_tail(&sh->lru, &cb->list);
5784         else
5785                 raid5_release_stripe(sh);
5786 }
5787
5788 static void make_discard_request(struct mddev *mddev, struct bio *bi)
5789 {
5790         struct r5conf *conf = mddev->private;
5791         sector_t logical_sector, last_sector;
5792         struct stripe_head *sh;
5793         int stripe_sectors;
5794
5795         /* We need to handle this when io_uring supports discard/trim */
5796         if (WARN_ON_ONCE(bi->bi_opf & REQ_NOWAIT))
5797                 return;
5798
5799         if (mddev->reshape_position != MaxSector)
5800                 /* Skip discard while reshape is happening */
5801                 return;
5802
5803         logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5804         last_sector = bio_end_sector(bi);
5805
5806         bi->bi_next = NULL;
5807
5808         stripe_sectors = conf->chunk_sectors *
5809                 (conf->raid_disks - conf->max_degraded);
5810         logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5811                                                stripe_sectors);
5812         sector_div(last_sector, stripe_sectors);
5813
5814         logical_sector *= conf->chunk_sectors;
5815         last_sector *= conf->chunk_sectors;
5816
5817         for (; logical_sector < last_sector;
5818              logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5819                 DEFINE_WAIT(w);
5820                 int d;
5821         again:
5822                 sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0);
5823                 prepare_to_wait(&conf->wait_for_overlap, &w,
5824                                 TASK_UNINTERRUPTIBLE);
5825                 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5826                 if (test_bit(STRIPE_SYNCING, &sh->state)) {
5827                         raid5_release_stripe(sh);
5828                         schedule();
5829                         goto again;
5830                 }
5831                 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5832                 spin_lock_irq(&sh->stripe_lock);
5833                 for (d = 0; d < conf->raid_disks; d++) {
5834                         if (d == sh->pd_idx || d == sh->qd_idx)
5835                                 continue;
5836                         if (sh->dev[d].towrite || sh->dev[d].toread) {
5837                                 set_bit(R5_Overlap, &sh->dev[d].flags);
5838                                 spin_unlock_irq(&sh->stripe_lock);
5839                                 raid5_release_stripe(sh);
5840                                 schedule();
5841                                 goto again;
5842                         }
5843                 }
5844                 set_bit(STRIPE_DISCARD, &sh->state);
5845                 finish_wait(&conf->wait_for_overlap, &w);
5846                 sh->overwrite_disks = 0;
5847                 for (d = 0; d < conf->raid_disks; d++) {
5848                         if (d == sh->pd_idx || d == sh->qd_idx)
5849                                 continue;
5850                         sh->dev[d].towrite = bi;
5851                         set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5852                         bio_inc_remaining(bi);
5853                         md_write_inc(mddev, bi);
5854                         sh->overwrite_disks++;
5855                 }
5856                 spin_unlock_irq(&sh->stripe_lock);
5857                 if (conf->mddev->bitmap) {
5858                         for (d = 0;
5859                              d < conf->raid_disks - conf->max_degraded;
5860                              d++)
5861                                 md_bitmap_startwrite(mddev->bitmap,
5862                                                      sh->sector,
5863                                                      RAID5_STRIPE_SECTORS(conf),
5864                                                      0);
5865                         sh->bm_seq = conf->seq_flush + 1;
5866                         set_bit(STRIPE_BIT_DELAY, &sh->state);
5867                 }
5868
5869                 set_bit(STRIPE_HANDLE, &sh->state);
5870                 clear_bit(STRIPE_DELAYED, &sh->state);
5871                 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5872                         atomic_inc(&conf->preread_active_stripes);
5873                 release_stripe_plug(mddev, sh);
5874         }
5875
5876         bio_endio(bi);
5877 }
5878
5879 static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
5880                              sector_t reshape_sector)
5881 {
5882         return mddev->reshape_backwards ? sector < reshape_sector :
5883                                           sector >= reshape_sector;
5884 }
5885
5886 static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min,
5887                                    sector_t max, sector_t reshape_sector)
5888 {
5889         return mddev->reshape_backwards ? max < reshape_sector :
5890                                           min >= reshape_sector;
5891 }
5892
5893 static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf,
5894                                     struct stripe_head *sh)
5895 {
5896         sector_t max_sector = 0, min_sector = MaxSector;
5897         bool ret = false;
5898         int dd_idx;
5899
5900         for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
5901                 if (dd_idx == sh->pd_idx)
5902                         continue;
5903
5904                 min_sector = min(min_sector, sh->dev[dd_idx].sector);
5905                 max_sector = min(max_sector, sh->dev[dd_idx].sector);
5906         }
5907
5908         spin_lock_irq(&conf->device_lock);
5909
5910         if (!range_ahead_of_reshape(mddev, min_sector, max_sector,
5911                                      conf->reshape_progress))
5912                 /* mismatch, need to try again */
5913                 ret = true;
5914
5915         spin_unlock_irq(&conf->device_lock);
5916
5917         return ret;
5918 }
5919
5920 static int add_all_stripe_bios(struct r5conf *conf,
5921                 struct stripe_request_ctx *ctx, struct stripe_head *sh,
5922                 struct bio *bi, int forwrite, int previous)
5923 {
5924         int dd_idx;
5925         int ret = 1;
5926
5927         spin_lock_irq(&sh->stripe_lock);
5928
5929         for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
5930                 struct r5dev *dev = &sh->dev[dd_idx];
5931
5932                 if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
5933                         continue;
5934
5935                 if (dev->sector < ctx->first_sector ||
5936                     dev->sector >= ctx->last_sector)
5937                         continue;
5938
5939                 if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
5940                         set_bit(R5_Overlap, &dev->flags);
5941                         ret = 0;
5942                         continue;
5943                 }
5944         }
5945
5946         if (!ret)
5947                 goto out;
5948
5949         for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
5950                 struct r5dev *dev = &sh->dev[dd_idx];
5951
5952                 if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
5953                         continue;
5954
5955                 if (dev->sector < ctx->first_sector ||
5956                     dev->sector >= ctx->last_sector)
5957                         continue;
5958
5959                 __add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
5960                 clear_bit((dev->sector - ctx->first_sector) >>
5961                           RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
5962         }
5963
5964 out:
5965         spin_unlock_irq(&sh->stripe_lock);
5966         return ret;
5967 }
5968
5969 static enum stripe_result make_stripe_request(struct mddev *mddev,
5970                 struct r5conf *conf, struct stripe_request_ctx *ctx,
5971                 sector_t logical_sector, struct bio *bi)
5972 {
5973         const int rw = bio_data_dir(bi);
5974         enum stripe_result ret;
5975         struct stripe_head *sh;
5976         sector_t new_sector;
5977         int previous = 0, flags = 0;
5978         int seq, dd_idx;
5979
5980         seq = read_seqcount_begin(&conf->gen_lock);
5981
5982         if (unlikely(conf->reshape_progress != MaxSector)) {
5983                 /*
5984                  * Spinlock is needed as reshape_progress may be
5985                  * 64bit on a 32bit platform, and so it might be
5986                  * possible to see a half-updated value
5987                  * Of course reshape_progress could change after
5988                  * the lock is dropped, so once we get a reference
5989                  * to the stripe that we think it is, we will have
5990                  * to check again.
5991                  */
5992                 spin_lock_irq(&conf->device_lock);
5993                 if (ahead_of_reshape(mddev, logical_sector,
5994                                      conf->reshape_progress)) {
5995                         previous = 1;
5996                 } else {
5997                         if (ahead_of_reshape(mddev, logical_sector,
5998                                              conf->reshape_safe)) {
5999                                 spin_unlock_irq(&conf->device_lock);
6000                                 return STRIPE_SCHEDULE_AND_RETRY;
6001                         }
6002                 }
6003                 spin_unlock_irq(&conf->device_lock);
6004         }
6005
6006         new_sector = raid5_compute_sector(conf, logical_sector, previous,
6007                                           &dd_idx, NULL);
6008         pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
6009                  new_sector, logical_sector);
6010
6011         if (previous)
6012                 flags |= R5_GAS_PREVIOUS;
6013         if (bi->bi_opf & REQ_RAHEAD)
6014                 flags |= R5_GAS_NOBLOCK;
6015         sh = raid5_get_active_stripe(conf, ctx, new_sector, flags);
6016         if (unlikely(!sh)) {
6017                 /* cannot get stripe, just give-up */
6018                 bi->bi_status = BLK_STS_IOERR;
6019                 return STRIPE_FAIL;
6020         }
6021
6022         if (unlikely(previous) &&
6023             stripe_ahead_of_reshape(mddev, conf, sh)) {
6024                 /*
6025                  * Expansion moved on while waiting for a stripe.
6026                  * Expansion could still move past after this
6027                  * test, but as we are holding a reference to
6028                  * 'sh', we know that if that happens,
6029                  *  STRIPE_EXPANDING will get set and the expansion
6030                  * won't proceed until we finish with the stripe.
6031                  */
6032                 ret = STRIPE_SCHEDULE_AND_RETRY;
6033                 goto out_release;
6034         }
6035
6036         if (read_seqcount_retry(&conf->gen_lock, seq)) {
6037                 /* Might have got the wrong stripe_head by accident */
6038                 ret = STRIPE_RETRY;
6039                 goto out_release;
6040         }
6041
6042         if (test_bit(STRIPE_EXPANDING, &sh->state) ||
6043             !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
6044                 /*
6045                  * Stripe is busy expanding or add failed due to
6046                  * overlap. Flush everything and wait a while.
6047                  */
6048                 md_wakeup_thread(mddev->thread);
6049                 ret = STRIPE_SCHEDULE_AND_RETRY;
6050                 goto out_release;
6051         }
6052
6053         if (stripe_can_batch(sh)) {
6054                 stripe_add_to_batch_list(conf, sh, ctx->batch_last);
6055                 if (ctx->batch_last)
6056                         raid5_release_stripe(ctx->batch_last);
6057                 atomic_inc(&sh->count);
6058                 ctx->batch_last = sh;
6059         }
6060
6061         if (ctx->do_flush) {
6062                 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
6063                 /* we only need flush for one stripe */
6064                 ctx->do_flush = false;
6065         }
6066
6067         set_bit(STRIPE_HANDLE, &sh->state);
6068         clear_bit(STRIPE_DELAYED, &sh->state);
6069         if ((!sh->batch_head || sh == sh->batch_head) &&
6070             (bi->bi_opf & REQ_SYNC) &&
6071             !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
6072                 atomic_inc(&conf->preread_active_stripes);
6073
6074         release_stripe_plug(mddev, sh);
6075         return STRIPE_SUCCESS;
6076
6077 out_release:
6078         raid5_release_stripe(sh);
6079         return ret;
6080 }
6081
6082 /*
6083  * If the bio covers multiple data disks, find sector within the bio that has
6084  * the lowest chunk offset in the first chunk.
6085  */
6086 static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
6087                                               struct bio *bi)
6088 {
6089         int sectors_per_chunk = conf->chunk_sectors;
6090         int raid_disks = conf->raid_disks;
6091         int dd_idx;
6092         struct stripe_head sh;
6093         unsigned int chunk_offset;
6094         sector_t r_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
6095         sector_t sector;
6096
6097         /* We pass in fake stripe_head to get back parity disk numbers */
6098         sector = raid5_compute_sector(conf, r_sector, 0, &dd_idx, &sh);
6099         chunk_offset = sector_div(sector, sectors_per_chunk);
6100         if (sectors_per_chunk - chunk_offset >= bio_sectors(bi))
6101                 return r_sector;
6102         /*
6103          * Bio crosses to the next data disk. Check whether it's in the same
6104          * chunk.
6105          */
6106         dd_idx++;
6107         while (dd_idx == sh.pd_idx || dd_idx == sh.qd_idx)
6108                 dd_idx++;
6109         if (dd_idx >= raid_disks)
6110                 return r_sector;
6111         return r_sector + sectors_per_chunk - chunk_offset;
6112 }
6113
6114 static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
6115 {
6116         DEFINE_WAIT_FUNC(wait, woken_wake_function);
6117         struct r5conf *conf = mddev->private;
6118         sector_t logical_sector;
6119         struct stripe_request_ctx ctx = {};
6120         const int rw = bio_data_dir(bi);
6121         enum stripe_result res;
6122         int s, stripe_cnt;
6123
6124         if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
6125                 int ret = log_handle_flush_request(conf, bi);
6126
6127                 if (ret == 0)
6128                         return true;
6129                 if (ret == -ENODEV) {
6130                         if (md_flush_request(mddev, bi))
6131                                 return true;
6132                 }
6133                 /* ret == -EAGAIN, fallback */
6134                 /*
6135                  * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
6136                  * we need to flush journal device
6137                  */
6138                 ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
6139         }
6140
6141         if (!md_write_start(mddev, bi))
6142                 return false;
6143         /*
6144          * If array is degraded, better not do chunk aligned read because
6145          * later we might have to read it again in order to reconstruct
6146          * data on failed drives.
6147          */
6148         if (rw == READ && mddev->degraded == 0 &&
6149             mddev->reshape_position == MaxSector) {
6150                 bi = chunk_aligned_read(mddev, bi);
6151                 if (!bi)
6152                         return true;
6153         }
6154
6155         if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
6156                 make_discard_request(mddev, bi);
6157                 md_write_end(mddev);
6158                 return true;
6159         }
6160
6161         logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
6162         ctx.first_sector = logical_sector;
6163         ctx.last_sector = bio_end_sector(bi);
6164         bi->bi_next = NULL;
6165
6166         stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
6167                                            RAID5_STRIPE_SECTORS(conf));
6168         bitmap_set(ctx.sectors_to_do, 0, stripe_cnt);
6169
6170         pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
6171                  bi->bi_iter.bi_sector, ctx.last_sector);
6172
6173         /* Bail out if conflicts with reshape and REQ_NOWAIT is set */
6174         if ((bi->bi_opf & REQ_NOWAIT) &&
6175             (conf->reshape_progress != MaxSector) &&
6176             !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) &&
6177             ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) {
6178                 bio_wouldblock_error(bi);
6179                 if (rw == WRITE)
6180                         md_write_end(mddev);
6181                 return true;
6182         }
6183         md_account_bio(mddev, &bi);
6184
6185         /*
6186          * Lets start with the stripe with the lowest chunk offset in the first
6187          * chunk. That has the best chances of creating IOs adjacent to
6188          * previous IOs in case of sequential IO and thus creates the most
6189          * sequential IO pattern. We don't bother with the optimization when
6190          * reshaping as the performance benefit is not worth the complexity.
6191          */
6192         if (likely(conf->reshape_progress == MaxSector))
6193                 logical_sector = raid5_bio_lowest_chunk_sector(conf, bi);
6194         s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
6195
6196         add_wait_queue(&conf->wait_for_overlap, &wait);
6197         while (1) {
6198                 res = make_stripe_request(mddev, conf, &ctx, logical_sector,
6199                                           bi);
6200                 if (res == STRIPE_FAIL)
6201                         break;
6202
6203                 if (res == STRIPE_RETRY)
6204                         continue;
6205
6206                 if (res == STRIPE_SCHEDULE_AND_RETRY) {
6207                         /*
6208                          * Must release the reference to batch_last before
6209                          * scheduling and waiting for work to be done,
6210                          * otherwise the batch_last stripe head could prevent
6211                          * raid5_activate_delayed() from making progress
6212                          * and thus deadlocking.
6213                          */
6214                         if (ctx.batch_last) {
6215                                 raid5_release_stripe(ctx.batch_last);
6216                                 ctx.batch_last = NULL;
6217                         }
6218
6219                         wait_woken(&wait, TASK_UNINTERRUPTIBLE,
6220                                    MAX_SCHEDULE_TIMEOUT);
6221                         continue;
6222                 }
6223
6224                 s = find_next_bit_wrap(ctx.sectors_to_do, stripe_cnt, s);
6225                 if (s == stripe_cnt)
6226                         break;
6227
6228                 logical_sector = ctx.first_sector +
6229                         (s << RAID5_STRIPE_SHIFT(conf));
6230         }
6231         remove_wait_queue(&conf->wait_for_overlap, &wait);
6232
6233         if (ctx.batch_last)
6234                 raid5_release_stripe(ctx.batch_last);
6235
6236         if (rw == WRITE)
6237                 md_write_end(mddev);
6238         bio_endio(bi);
6239         return true;
6240 }
6241
6242 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
6243
6244 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
6245 {
6246         /* reshaping is quite different to recovery/resync so it is
6247          * handled quite separately ... here.
6248          *
6249          * On each call to sync_request, we gather one chunk worth of
6250          * destination stripes and flag them as expanding.
6251          * Then we find all the source stripes and request reads.
6252          * As the reads complete, handle_stripe will copy the data
6253          * into the destination stripe and release that stripe.
6254          */
6255         struct r5conf *conf = mddev->private;
6256         struct stripe_head *sh;
6257         struct md_rdev *rdev;
6258         sector_t first_sector, last_sector;
6259         int raid_disks = conf->previous_raid_disks;
6260         int data_disks = raid_disks - conf->max_degraded;
6261         int new_data_disks = conf->raid_disks - conf->max_degraded;
6262         int i;
6263         int dd_idx;
6264         sector_t writepos, readpos, safepos;
6265         sector_t stripe_addr;
6266         int reshape_sectors;
6267         struct list_head stripes;
6268         sector_t retn;
6269
6270         if (sector_nr == 0) {
6271                 /* If restarting in the middle, skip the initial sectors */
6272                 if (mddev->reshape_backwards &&
6273                     conf->reshape_progress < raid5_size(mddev, 0, 0)) {
6274                         sector_nr = raid5_size(mddev, 0, 0)
6275                                 - conf->reshape_progress;
6276                 } else if (mddev->reshape_backwards &&
6277                            conf->reshape_progress == MaxSector) {
6278                         /* shouldn't happen, but just in case, finish up.*/
6279                         sector_nr = MaxSector;
6280                 } else if (!mddev->reshape_backwards &&
6281                            conf->reshape_progress > 0)
6282                         sector_nr = conf->reshape_progress;
6283                 sector_div(sector_nr, new_data_disks);
6284                 if (sector_nr) {
6285                         mddev->curr_resync_completed = sector_nr;
6286                         sysfs_notify_dirent_safe(mddev->sysfs_completed);
6287                         *skipped = 1;
6288                         retn = sector_nr;
6289                         goto finish;
6290                 }
6291         }
6292
6293         /* We need to process a full chunk at a time.
6294          * If old and new chunk sizes differ, we need to process the
6295          * largest of these
6296          */
6297
6298         reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
6299
6300         /* We update the metadata at least every 10 seconds, or when
6301          * the data about to be copied would over-write the source of
6302          * the data at the front of the range.  i.e. one new_stripe
6303          * along from reshape_progress new_maps to after where
6304          * reshape_safe old_maps to
6305          */
6306         writepos = conf->reshape_progress;
6307         sector_div(writepos, new_data_disks);
6308         readpos = conf->reshape_progress;
6309         sector_div(readpos, data_disks);
6310         safepos = conf->reshape_safe;
6311         sector_div(safepos, data_disks);
6312         if (mddev->reshape_backwards) {
6313                 BUG_ON(writepos < reshape_sectors);
6314                 writepos -= reshape_sectors;
6315                 readpos += reshape_sectors;
6316                 safepos += reshape_sectors;
6317         } else {
6318                 writepos += reshape_sectors;
6319                 /* readpos and safepos are worst-case calculations.
6320                  * A negative number is overly pessimistic, and causes
6321                  * obvious problems for unsigned storage.  So clip to 0.
6322                  */
6323                 readpos -= min_t(sector_t, reshape_sectors, readpos);
6324                 safepos -= min_t(sector_t, reshape_sectors, safepos);
6325         }
6326
6327         /* Having calculated the 'writepos' possibly use it
6328          * to set 'stripe_addr' which is where we will write to.
6329          */
6330         if (mddev->reshape_backwards) {
6331                 BUG_ON(conf->reshape_progress == 0);
6332                 stripe_addr = writepos;
6333                 BUG_ON((mddev->dev_sectors &
6334                         ~((sector_t)reshape_sectors - 1))
6335                        - reshape_sectors - stripe_addr
6336                        != sector_nr);
6337         } else {
6338                 BUG_ON(writepos != sector_nr + reshape_sectors);
6339                 stripe_addr = sector_nr;
6340         }
6341
6342         /* 'writepos' is the most advanced device address we might write.
6343          * 'readpos' is the least advanced device address we might read.
6344          * 'safepos' is the least address recorded in the metadata as having
6345          *     been reshaped.
6346          * If there is a min_offset_diff, these are adjusted either by
6347          * increasing the safepos/readpos if diff is negative, or
6348          * increasing writepos if diff is positive.
6349          * If 'readpos' is then behind 'writepos', there is no way that we can
6350          * ensure safety in the face of a crash - that must be done by userspace
6351          * making a backup of the data.  So in that case there is no particular
6352          * rush to update metadata.
6353          * Otherwise if 'safepos' is behind 'writepos', then we really need to
6354          * update the metadata to advance 'safepos' to match 'readpos' so that
6355          * we can be safe in the event of a crash.
6356          * So we insist on updating metadata if safepos is behind writepos and
6357          * readpos is beyond writepos.
6358          * In any case, update the metadata every 10 seconds.
6359          * Maybe that number should be configurable, but I'm not sure it is
6360          * worth it.... maybe it could be a multiple of safemode_delay???
6361          */
6362         if (conf->min_offset_diff < 0) {
6363                 safepos += -conf->min_offset_diff;
6364                 readpos += -conf->min_offset_diff;
6365         } else
6366                 writepos += conf->min_offset_diff;
6367
6368         if ((mddev->reshape_backwards
6369              ? (safepos > writepos && readpos < writepos)
6370              : (safepos < writepos && readpos > writepos)) ||
6371             time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
6372                 /* Cannot proceed until we've updated the superblock... */
6373                 wait_event(conf->wait_for_overlap,
6374                            atomic_read(&conf->reshape_stripes)==0
6375                            || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6376                 if (atomic_read(&conf->reshape_stripes) != 0)
6377                         return 0;
6378                 mddev->reshape_position = conf->reshape_progress;
6379                 mddev->curr_resync_completed = sector_nr;
6380                 if (!mddev->reshape_backwards)
6381                         /* Can update recovery_offset */
6382                         rdev_for_each(rdev, mddev)
6383                                 if (rdev->raid_disk >= 0 &&
6384                                     !test_bit(Journal, &rdev->flags) &&
6385                                     !test_bit(In_sync, &rdev->flags) &&
6386                                     rdev->recovery_offset < sector_nr)
6387                                         rdev->recovery_offset = sector_nr;
6388
6389                 conf->reshape_checkpoint = jiffies;
6390                 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6391                 md_wakeup_thread(mddev->thread);
6392                 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
6393                            test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6394                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6395                         return 0;
6396                 spin_lock_irq(&conf->device_lock);
6397                 conf->reshape_safe = mddev->reshape_position;
6398                 spin_unlock_irq(&conf->device_lock);
6399                 wake_up(&conf->wait_for_overlap);
6400                 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6401         }
6402
6403         INIT_LIST_HEAD(&stripes);
6404         for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
6405                 int j;
6406                 int skipped_disk = 0;
6407                 sh = raid5_get_active_stripe(conf, NULL, stripe_addr+i,
6408                                              R5_GAS_NOQUIESCE);
6409                 set_bit(STRIPE_EXPANDING, &sh->state);
6410                 atomic_inc(&conf->reshape_stripes);
6411                 /* If any of this stripe is beyond the end of the old
6412                  * array, then we need to zero those blocks
6413                  */
6414                 for (j=sh->disks; j--;) {
6415                         sector_t s;
6416                         if (j == sh->pd_idx)
6417                                 continue;
6418                         if (conf->level == 6 &&
6419                             j == sh->qd_idx)
6420                                 continue;
6421                         s = raid5_compute_blocknr(sh, j, 0);
6422                         if (s < raid5_size(mddev, 0, 0)) {
6423                                 skipped_disk = 1;
6424                                 continue;
6425                         }
6426                         memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
6427                         set_bit(R5_Expanded, &sh->dev[j].flags);
6428                         set_bit(R5_UPTODATE, &sh->dev[j].flags);
6429                 }
6430                 if (!skipped_disk) {
6431                         set_bit(STRIPE_EXPAND_READY, &sh->state);
6432                         set_bit(STRIPE_HANDLE, &sh->state);
6433                 }
6434                 list_add(&sh->lru, &stripes);
6435         }
6436         spin_lock_irq(&conf->device_lock);
6437         if (mddev->reshape_backwards)
6438                 conf->reshape_progress -= reshape_sectors * new_data_disks;
6439         else
6440                 conf->reshape_progress += reshape_sectors * new_data_disks;
6441         spin_unlock_irq(&conf->device_lock);
6442         /* Ok, those stripe are ready. We can start scheduling
6443          * reads on the source stripes.
6444          * The source stripes are determined by mapping the first and last
6445          * block on the destination stripes.
6446          */
6447         first_sector =
6448                 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
6449                                      1, &dd_idx, NULL);
6450         last_sector =
6451                 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
6452                                             * new_data_disks - 1),
6453                                      1, &dd_idx, NULL);
6454         if (last_sector >= mddev->dev_sectors)
6455                 last_sector = mddev->dev_sectors - 1;
6456         while (first_sector <= last_sector) {
6457                 sh = raid5_get_active_stripe(conf, NULL, first_sector,
6458                                 R5_GAS_PREVIOUS | R5_GAS_NOQUIESCE);
6459                 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
6460                 set_bit(STRIPE_HANDLE, &sh->state);
6461                 raid5_release_stripe(sh);
6462                 first_sector += RAID5_STRIPE_SECTORS(conf);
6463         }
6464         /* Now that the sources are clearly marked, we can release
6465          * the destination stripes
6466          */
6467         while (!list_empty(&stripes)) {
6468                 sh = list_entry(stripes.next, struct stripe_head, lru);
6469                 list_del_init(&sh->lru);
6470                 raid5_release_stripe(sh);
6471         }
6472         /* If this takes us to the resync_max point where we have to pause,
6473          * then we need to write out the superblock.
6474          */
6475         sector_nr += reshape_sectors;
6476         retn = reshape_sectors;
6477 finish:
6478         if (mddev->curr_resync_completed > mddev->resync_max ||
6479             (sector_nr - mddev->curr_resync_completed) * 2
6480             >= mddev->resync_max - mddev->curr_resync_completed) {
6481                 /* Cannot proceed until we've updated the superblock... */
6482                 wait_event(conf->wait_for_overlap,
6483                            atomic_read(&conf->reshape_stripes) == 0
6484                            || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6485                 if (atomic_read(&conf->reshape_stripes) != 0)
6486                         goto ret;
6487                 mddev->reshape_position = conf->reshape_progress;
6488                 mddev->curr_resync_completed = sector_nr;
6489                 if (!mddev->reshape_backwards)
6490                         /* Can update recovery_offset */
6491                         rdev_for_each(rdev, mddev)
6492                                 if (rdev->raid_disk >= 0 &&
6493                                     !test_bit(Journal, &rdev->flags) &&
6494                                     !test_bit(In_sync, &rdev->flags) &&
6495                                     rdev->recovery_offset < sector_nr)
6496                                         rdev->recovery_offset = sector_nr;
6497                 conf->reshape_checkpoint = jiffies;
6498                 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6499                 md_wakeup_thread(mddev->thread);
6500                 wait_event(mddev->sb_wait,
6501                            !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6502                            || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6503                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6504                         goto ret;
6505                 spin_lock_irq(&conf->device_lock);
6506                 conf->reshape_safe = mddev->reshape_position;
6507                 spin_unlock_irq(&conf->device_lock);
6508                 wake_up(&conf->wait_for_overlap);
6509                 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6510         }
6511 ret:
6512         return retn;
6513 }
6514
6515 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6516                                           int *skipped)
6517 {
6518         struct r5conf *conf = mddev->private;
6519         struct stripe_head *sh;
6520         sector_t max_sector = mddev->dev_sectors;
6521         sector_t sync_blocks;
6522         int still_degraded = 0;
6523         int i;
6524
6525         if (sector_nr >= max_sector) {
6526                 /* just being told to finish up .. nothing much to do */
6527
6528                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6529                         end_reshape(conf);
6530                         return 0;
6531                 }
6532
6533                 if (mddev->curr_resync < max_sector) /* aborted */
6534                         md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6535                                            &sync_blocks, 1);
6536                 else /* completed sync */
6537                         conf->fullsync = 0;
6538                 md_bitmap_close_sync(mddev->bitmap);
6539
6540                 return 0;
6541         }
6542
6543         /* Allow raid5_quiesce to complete */
6544         wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6545
6546         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6547                 return reshape_request(mddev, sector_nr, skipped);
6548
6549         /* No need to check resync_max as we never do more than one
6550          * stripe, and as resync_max will always be on a chunk boundary,
6551          * if the check in md_do_sync didn't fire, there is no chance
6552          * of overstepping resync_max here
6553          */
6554
6555         /* if there is too many failed drives and we are trying
6556          * to resync, then assert that we are finished, because there is
6557          * nothing we can do.
6558          */
6559         if (mddev->degraded >= conf->max_degraded &&
6560             test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6561                 sector_t rv = mddev->dev_sectors - sector_nr;
6562                 *skipped = 1;
6563                 return rv;
6564         }
6565         if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6566             !conf->fullsync &&
6567             !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6568             sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
6569                 /* we can skip this block, and probably more */
6570                 do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
6571                 *skipped = 1;
6572                 /* keep things rounded to whole stripes */
6573                 return sync_blocks * RAID5_STRIPE_SECTORS(conf);
6574         }
6575
6576         md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6577
6578         sh = raid5_get_active_stripe(conf, NULL, sector_nr,
6579                                      R5_GAS_NOBLOCK);
6580         if (sh == NULL) {
6581                 sh = raid5_get_active_stripe(conf, NULL, sector_nr, 0);
6582                 /* make sure we don't swamp the stripe cache if someone else
6583                  * is trying to get access
6584                  */
6585                 schedule_timeout_uninterruptible(1);
6586         }
6587         /* Need to check if array will still be degraded after recovery/resync
6588          * Note in case of > 1 drive failures it's possible we're rebuilding
6589          * one drive while leaving another faulty drive in array.
6590          */
6591         rcu_read_lock();
6592         for (i = 0; i < conf->raid_disks; i++) {
6593                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
6594
6595                 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6596                         still_degraded = 1;
6597         }
6598         rcu_read_unlock();
6599
6600         md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6601
6602         set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6603         set_bit(STRIPE_HANDLE, &sh->state);
6604
6605         raid5_release_stripe(sh);
6606
6607         return RAID5_STRIPE_SECTORS(conf);
6608 }
6609
6610 static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6611                                unsigned int offset)
6612 {
6613         /* We may not be able to submit a whole bio at once as there
6614          * may not be enough stripe_heads available.
6615          * We cannot pre-allocate enough stripe_heads as we may need
6616          * more than exist in the cache (if we allow ever large chunks).
6617          * So we do one stripe head at a time and record in
6618          * ->bi_hw_segments how many have been done.
6619          *
6620          * We *know* that this entire raid_bio is in one chunk, so
6621          * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
6622          */
6623         struct stripe_head *sh;
6624         int dd_idx;
6625         sector_t sector, logical_sector, last_sector;
6626         int scnt = 0;
6627         int handled = 0;
6628
6629         logical_sector = raid_bio->bi_iter.bi_sector &
6630                 ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
6631         sector = raid5_compute_sector(conf, logical_sector,
6632                                       0, &dd_idx, NULL);
6633         last_sector = bio_end_sector(raid_bio);
6634
6635         for (; logical_sector < last_sector;
6636              logical_sector += RAID5_STRIPE_SECTORS(conf),
6637                      sector += RAID5_STRIPE_SECTORS(conf),
6638                      scnt++) {
6639
6640                 if (scnt < offset)
6641                         /* already done this stripe */
6642                         continue;
6643
6644                 sh = raid5_get_active_stripe(conf, NULL, sector,
6645                                 R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE);
6646                 if (!sh) {
6647                         /* failed to get a stripe - must wait */
6648                         conf->retry_read_aligned = raid_bio;
6649                         conf->retry_read_offset = scnt;
6650                         return handled;
6651                 }
6652
6653                 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6654                         raid5_release_stripe(sh);
6655                         conf->retry_read_aligned = raid_bio;
6656                         conf->retry_read_offset = scnt;
6657                         return handled;
6658                 }
6659
6660                 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6661                 handle_stripe(sh);
6662                 raid5_release_stripe(sh);
6663                 handled++;
6664         }
6665
6666         bio_endio(raid_bio);
6667
6668         if (atomic_dec_and_test(&conf->active_aligned_reads))
6669                 wake_up(&conf->wait_for_quiescent);
6670         return handled;
6671 }
6672
6673 static int handle_active_stripes(struct r5conf *conf, int group,
6674                                  struct r5worker *worker,
6675                                  struct list_head *temp_inactive_list)
6676                 __must_hold(&conf->device_lock)
6677 {
6678         struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6679         int i, batch_size = 0, hash;
6680         bool release_inactive = false;
6681
6682         while (batch_size < MAX_STRIPE_BATCH &&
6683                         (sh = __get_priority_stripe(conf, group)) != NULL)
6684                 batch[batch_size++] = sh;
6685
6686         if (batch_size == 0) {
6687                 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6688                         if (!list_empty(temp_inactive_list + i))
6689                                 break;
6690                 if (i == NR_STRIPE_HASH_LOCKS) {
6691                         spin_unlock_irq(&conf->device_lock);
6692                         log_flush_stripe_to_raid(conf);
6693                         spin_lock_irq(&conf->device_lock);
6694                         return batch_size;
6695                 }
6696                 release_inactive = true;
6697         }
6698         spin_unlock_irq(&conf->device_lock);
6699
6700         release_inactive_stripe_list(conf, temp_inactive_list,
6701                                      NR_STRIPE_HASH_LOCKS);
6702
6703         r5l_flush_stripe_to_raid(conf->log);
6704         if (release_inactive) {
6705                 spin_lock_irq(&conf->device_lock);
6706                 return 0;
6707         }
6708
6709         for (i = 0; i < batch_size; i++)
6710                 handle_stripe(batch[i]);
6711         log_write_stripe_run(conf);
6712
6713         cond_resched();
6714
6715         spin_lock_irq(&conf->device_lock);
6716         for (i = 0; i < batch_size; i++) {
6717                 hash = batch[i]->hash_lock_index;
6718                 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6719         }
6720         return batch_size;
6721 }
6722
6723 static void raid5_do_work(struct work_struct *work)
6724 {
6725         struct r5worker *worker = container_of(work, struct r5worker, work);
6726         struct r5worker_group *group = worker->group;
6727         struct r5conf *conf = group->conf;
6728         struct mddev *mddev = conf->mddev;
6729         int group_id = group - conf->worker_groups;
6730         int handled;
6731         struct blk_plug plug;
6732
6733         pr_debug("+++ raid5worker active\n");
6734
6735         blk_start_plug(&plug);
6736         handled = 0;
6737         spin_lock_irq(&conf->device_lock);
6738         while (1) {
6739                 int batch_size, released;
6740
6741                 released = release_stripe_list(conf, worker->temp_inactive_list);
6742
6743                 batch_size = handle_active_stripes(conf, group_id, worker,
6744                                                    worker->temp_inactive_list);
6745                 worker->working = false;
6746                 if (!batch_size && !released)
6747                         break;
6748                 handled += batch_size;
6749                 wait_event_lock_irq(mddev->sb_wait,
6750                         !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6751                         conf->device_lock);
6752         }
6753         pr_debug("%d stripes handled\n", handled);
6754
6755         spin_unlock_irq(&conf->device_lock);
6756
6757         flush_deferred_bios(conf);
6758
6759         r5l_flush_stripe_to_raid(conf->log);
6760
6761         async_tx_issue_pending_all();
6762         blk_finish_plug(&plug);
6763
6764         pr_debug("--- raid5worker inactive\n");
6765 }
6766
6767 /*
6768  * This is our raid5 kernel thread.
6769  *
6770  * We scan the hash table for stripes which can be handled now.
6771  * During the scan, completed stripes are saved for us by the interrupt
6772  * handler, so that they will not have to wait for our next wakeup.
6773  */
6774 static void raid5d(struct md_thread *thread)
6775 {
6776         struct mddev *mddev = thread->mddev;
6777         struct r5conf *conf = mddev->private;
6778         int handled;
6779         struct blk_plug plug;
6780
6781         pr_debug("+++ raid5d active\n");
6782
6783         md_check_recovery(mddev);
6784
6785         blk_start_plug(&plug);
6786         handled = 0;
6787         spin_lock_irq(&conf->device_lock);
6788         while (1) {
6789                 struct bio *bio;
6790                 int batch_size, released;
6791                 unsigned int offset;
6792
6793                 released = release_stripe_list(conf, conf->temp_inactive_list);
6794                 if (released)
6795                         clear_bit(R5_DID_ALLOC, &conf->cache_state);
6796
6797                 if (
6798                     !list_empty(&conf->bitmap_list)) {
6799                         /* Now is a good time to flush some bitmap updates */
6800                         conf->seq_flush++;
6801                         spin_unlock_irq(&conf->device_lock);
6802                         md_bitmap_unplug(mddev->bitmap);
6803                         spin_lock_irq(&conf->device_lock);
6804                         conf->seq_write = conf->seq_flush;
6805                         activate_bit_delay(conf, conf->temp_inactive_list);
6806                 }
6807                 raid5_activate_delayed(conf);
6808
6809                 while ((bio = remove_bio_from_retry(conf, &offset))) {
6810                         int ok;
6811                         spin_unlock_irq(&conf->device_lock);
6812                         ok = retry_aligned_read(conf, bio, offset);
6813                         spin_lock_irq(&conf->device_lock);
6814                         if (!ok)
6815                                 break;
6816                         handled++;
6817                 }
6818
6819                 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6820                                                    conf->temp_inactive_list);
6821                 if (!batch_size && !released)
6822                         break;
6823                 handled += batch_size;
6824
6825                 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6826                         spin_unlock_irq(&conf->device_lock);
6827                         md_check_recovery(mddev);
6828                         spin_lock_irq(&conf->device_lock);
6829
6830                         /*
6831                          * Waiting on MD_SB_CHANGE_PENDING below may deadlock
6832                          * seeing md_check_recovery() is needed to clear
6833                          * the flag when using mdmon.
6834                          */
6835                         continue;
6836                 }
6837
6838                 wait_event_lock_irq(mddev->sb_wait,
6839                         !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6840                         conf->device_lock);
6841         }
6842         pr_debug("%d stripes handled\n", handled);
6843
6844         spin_unlock_irq(&conf->device_lock);
6845         if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6846             mutex_trylock(&conf->cache_size_mutex)) {
6847                 grow_one_stripe(conf, __GFP_NOWARN);
6848                 /* Set flag even if allocation failed.  This helps
6849                  * slow down allocation requests when mem is short
6850                  */
6851                 set_bit(R5_DID_ALLOC, &conf->cache_state);
6852                 mutex_unlock(&conf->cache_size_mutex);
6853         }
6854
6855         flush_deferred_bios(conf);
6856
6857         r5l_flush_stripe_to_raid(conf->log);
6858
6859         async_tx_issue_pending_all();
6860         blk_finish_plug(&plug);
6861
6862         pr_debug("--- raid5d inactive\n");
6863 }
6864
6865 static ssize_t
6866 raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6867 {
6868         struct r5conf *conf;
6869         int ret = 0;
6870         spin_lock(&mddev->lock);
6871         conf = mddev->private;
6872         if (conf)
6873                 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6874         spin_unlock(&mddev->lock);
6875         return ret;
6876 }
6877
6878 int
6879 raid5_set_cache_size(struct mddev *mddev, int size)
6880 {
6881         int result = 0;
6882         struct r5conf *conf = mddev->private;
6883
6884         if (size <= 16 || size > 32768)
6885                 return -EINVAL;
6886
6887         conf->min_nr_stripes = size;
6888         mutex_lock(&conf->cache_size_mutex);
6889         while (size < conf->max_nr_stripes &&
6890                drop_one_stripe(conf))
6891                 ;
6892         mutex_unlock(&conf->cache_size_mutex);
6893
6894         md_allow_write(mddev);
6895
6896         mutex_lock(&conf->cache_size_mutex);
6897         while (size > conf->max_nr_stripes)
6898                 if (!grow_one_stripe(conf, GFP_KERNEL)) {
6899                         conf->min_nr_stripes = conf->max_nr_stripes;
6900                         result = -ENOMEM;
6901                         break;
6902                 }
6903         mutex_unlock(&conf->cache_size_mutex);
6904
6905         return result;
6906 }
6907 EXPORT_SYMBOL(raid5_set_cache_size);
6908
6909 static ssize_t
6910 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6911 {
6912         struct r5conf *conf;
6913         unsigned long new;
6914         int err;
6915
6916         if (len >= PAGE_SIZE)
6917                 return -EINVAL;
6918         if (kstrtoul(page, 10, &new))
6919                 return -EINVAL;
6920         err = mddev_lock(mddev);
6921         if (err)
6922                 return err;
6923         conf = mddev->private;
6924         if (!conf)
6925                 err = -ENODEV;
6926         else
6927                 err = raid5_set_cache_size(mddev, new);
6928         mddev_unlock(mddev);
6929
6930         return err ?: len;
6931 }
6932
6933 static struct md_sysfs_entry
6934 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6935                                 raid5_show_stripe_cache_size,
6936                                 raid5_store_stripe_cache_size);
6937
6938 static ssize_t
6939 raid5_show_rmw_level(struct mddev  *mddev, char *page)
6940 {
6941         struct r5conf *conf = mddev->private;
6942         if (conf)
6943                 return sprintf(page, "%d\n", conf->rmw_level);
6944         else
6945                 return 0;
6946 }
6947
6948 static ssize_t
6949 raid5_store_rmw_level(struct mddev  *mddev, const char *page, size_t len)
6950 {
6951         struct r5conf *conf = mddev->private;
6952         unsigned long new;
6953
6954         if (!conf)
6955                 return -ENODEV;
6956
6957         if (len >= PAGE_SIZE)
6958                 return -EINVAL;
6959
6960         if (kstrtoul(page, 10, &new))
6961                 return -EINVAL;
6962
6963         if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6964                 return -EINVAL;
6965
6966         if (new != PARITY_DISABLE_RMW &&
6967             new != PARITY_ENABLE_RMW &&
6968             new != PARITY_PREFER_RMW)
6969                 return -EINVAL;
6970
6971         conf->rmw_level = new;
6972         return len;
6973 }
6974
6975 static struct md_sysfs_entry
6976 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6977                          raid5_show_rmw_level,
6978                          raid5_store_rmw_level);
6979
6980 static ssize_t
6981 raid5_show_stripe_size(struct mddev  *mddev, char *page)
6982 {
6983         struct r5conf *conf;
6984         int ret = 0;
6985
6986         spin_lock(&mddev->lock);
6987         conf = mddev->private;
6988         if (conf)
6989                 ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
6990         spin_unlock(&mddev->lock);
6991         return ret;
6992 }
6993
6994 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6995 static ssize_t
6996 raid5_store_stripe_size(struct mddev  *mddev, const char *page, size_t len)
6997 {
6998         struct r5conf *conf;
6999         unsigned long new;
7000         int err;
7001         int size;
7002
7003         if (len >= PAGE_SIZE)
7004                 return -EINVAL;
7005         if (kstrtoul(page, 10, &new))
7006                 return -EINVAL;
7007
7008         /*
7009          * The value should not be bigger than PAGE_SIZE. It requires to
7010          * be multiple of DEFAULT_STRIPE_SIZE and the value should be power
7011          * of two.
7012          */
7013         if (new % DEFAULT_STRIPE_SIZE != 0 ||
7014                         new > PAGE_SIZE || new == 0 ||
7015                         new != roundup_pow_of_two(new))
7016                 return -EINVAL;
7017
7018         err = mddev_lock(mddev);
7019         if (err)
7020                 return err;
7021
7022         conf = mddev->private;
7023         if (!conf) {
7024                 err = -ENODEV;
7025                 goto out_unlock;
7026         }
7027
7028         if (new == conf->stripe_size)
7029                 goto out_unlock;
7030
7031         pr_debug("md/raid: change stripe_size from %lu to %lu\n",
7032                         conf->stripe_size, new);
7033
7034         if (mddev->sync_thread ||
7035                 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7036                 mddev->reshape_position != MaxSector ||
7037                 mddev->sysfs_active) {
7038                 err = -EBUSY;
7039                 goto out_unlock;
7040         }
7041
7042         mddev_suspend(mddev);
7043         mutex_lock(&conf->cache_size_mutex);
7044         size = conf->max_nr_stripes;
7045
7046         shrink_stripes(conf);
7047
7048         conf->stripe_size = new;
7049         conf->stripe_shift = ilog2(new) - 9;
7050         conf->stripe_sectors = new >> 9;
7051         if (grow_stripes(conf, size)) {
7052                 pr_warn("md/raid:%s: couldn't allocate buffers\n",
7053                                 mdname(mddev));
7054                 err = -ENOMEM;
7055         }
7056         mutex_unlock(&conf->cache_size_mutex);
7057         mddev_resume(mddev);
7058
7059 out_unlock:
7060         mddev_unlock(mddev);
7061         return err ?: len;
7062 }
7063
7064 static struct md_sysfs_entry
7065 raid5_stripe_size = __ATTR(stripe_size, 0644,
7066                          raid5_show_stripe_size,
7067                          raid5_store_stripe_size);
7068 #else
7069 static struct md_sysfs_entry
7070 raid5_stripe_size = __ATTR(stripe_size, 0444,
7071                          raid5_show_stripe_size,
7072                          NULL);
7073 #endif
7074
7075 static ssize_t
7076 raid5_show_preread_threshold(struct mddev *mddev, char *page)
7077 {
7078         struct r5conf *conf;
7079         int ret = 0;
7080         spin_lock(&mddev->lock);
7081         conf = mddev->private;
7082         if (conf)
7083                 ret = sprintf(page, "%d\n", conf->bypass_threshold);
7084         spin_unlock(&mddev->lock);
7085         return ret;
7086 }
7087
7088 static ssize_t
7089 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
7090 {
7091         struct r5conf *conf;
7092         unsigned long new;
7093         int err;
7094
7095         if (len >= PAGE_SIZE)
7096                 return -EINVAL;
7097         if (kstrtoul(page, 10, &new))
7098                 return -EINVAL;
7099
7100         err = mddev_lock(mddev);
7101         if (err)
7102                 return err;
7103         conf = mddev->private;
7104         if (!conf)
7105                 err = -ENODEV;
7106         else if (new > conf->min_nr_stripes)
7107                 err = -EINVAL;
7108         else
7109                 conf->bypass_threshold = new;
7110         mddev_unlock(mddev);
7111         return err ?: len;
7112 }
7113
7114 static struct md_sysfs_entry
7115 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
7116                                         S_IRUGO | S_IWUSR,
7117                                         raid5_show_preread_threshold,
7118                                         raid5_store_preread_threshold);
7119
7120 static ssize_t
7121 raid5_show_skip_copy(struct mddev *mddev, char *page)
7122 {
7123         struct r5conf *conf;
7124         int ret = 0;
7125         spin_lock(&mddev->lock);
7126         conf = mddev->private;
7127         if (conf)
7128                 ret = sprintf(page, "%d\n", conf->skip_copy);
7129         spin_unlock(&mddev->lock);
7130         return ret;
7131 }
7132
7133 static ssize_t
7134 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
7135 {
7136         struct r5conf *conf;
7137         unsigned long new;
7138         int err;
7139
7140         if (len >= PAGE_SIZE)
7141                 return -EINVAL;
7142         if (kstrtoul(page, 10, &new))
7143                 return -EINVAL;
7144         new = !!new;
7145
7146         err = mddev_lock(mddev);
7147         if (err)
7148                 return err;
7149         conf = mddev->private;
7150         if (!conf)
7151                 err = -ENODEV;
7152         else if (new != conf->skip_copy) {
7153                 struct request_queue *q = mddev->queue;
7154
7155                 mddev_suspend(mddev);
7156                 conf->skip_copy = new;
7157                 if (new)
7158                         blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
7159                 else
7160                         blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
7161                 mddev_resume(mddev);
7162         }
7163         mddev_unlock(mddev);
7164         return err ?: len;
7165 }
7166
7167 static struct md_sysfs_entry
7168 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
7169                                         raid5_show_skip_copy,
7170                                         raid5_store_skip_copy);
7171
7172 static ssize_t
7173 stripe_cache_active_show(struct mddev *mddev, char *page)
7174 {
7175         struct r5conf *conf = mddev->private;
7176         if (conf)
7177                 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
7178         else
7179                 return 0;
7180 }
7181
7182 static struct md_sysfs_entry
7183 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
7184
7185 static ssize_t
7186 raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
7187 {
7188         struct r5conf *conf;
7189         int ret = 0;
7190         spin_lock(&mddev->lock);
7191         conf = mddev->private;
7192         if (conf)
7193                 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
7194         spin_unlock(&mddev->lock);
7195         return ret;
7196 }
7197
7198 static int alloc_thread_groups(struct r5conf *conf, int cnt,
7199                                int *group_cnt,
7200                                struct r5worker_group **worker_groups);
7201 static ssize_t
7202 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
7203 {
7204         struct r5conf *conf;
7205         unsigned int new;
7206         int err;
7207         struct r5worker_group *new_groups, *old_groups;
7208         int group_cnt;
7209
7210         if (len >= PAGE_SIZE)
7211                 return -EINVAL;
7212         if (kstrtouint(page, 10, &new))
7213                 return -EINVAL;
7214         /* 8192 should be big enough */
7215         if (new > 8192)
7216                 return -EINVAL;
7217
7218         err = mddev_lock(mddev);
7219         if (err)
7220                 return err;
7221         conf = mddev->private;
7222         if (!conf)
7223                 err = -ENODEV;
7224         else if (new != conf->worker_cnt_per_group) {
7225                 mddev_suspend(mddev);
7226
7227                 old_groups = conf->worker_groups;
7228                 if (old_groups)
7229                         flush_workqueue(raid5_wq);
7230
7231                 err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
7232                 if (!err) {
7233                         spin_lock_irq(&conf->device_lock);
7234                         conf->group_cnt = group_cnt;
7235                         conf->worker_cnt_per_group = new;
7236                         conf->worker_groups = new_groups;
7237                         spin_unlock_irq(&conf->device_lock);
7238
7239                         if (old_groups)
7240                                 kfree(old_groups[0].workers);
7241                         kfree(old_groups);
7242                 }
7243                 mddev_resume(mddev);
7244         }
7245         mddev_unlock(mddev);
7246
7247         return err ?: len;
7248 }
7249
7250 static struct md_sysfs_entry
7251 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
7252                                 raid5_show_group_thread_cnt,
7253                                 raid5_store_group_thread_cnt);
7254
7255 static struct attribute *raid5_attrs[] =  {
7256         &raid5_stripecache_size.attr,
7257         &raid5_stripecache_active.attr,
7258         &raid5_preread_bypass_threshold.attr,
7259         &raid5_group_thread_cnt.attr,
7260         &raid5_skip_copy.attr,
7261         &raid5_rmw_level.attr,
7262         &raid5_stripe_size.attr,
7263         &r5c_journal_mode.attr,
7264         &ppl_write_hint.attr,
7265         NULL,
7266 };
7267 static const struct attribute_group raid5_attrs_group = {
7268         .name = NULL,
7269         .attrs = raid5_attrs,
7270 };
7271
7272 static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
7273                                struct r5worker_group **worker_groups)
7274 {
7275         int i, j, k;
7276         ssize_t size;
7277         struct r5worker *workers;
7278
7279         if (cnt == 0) {
7280                 *group_cnt = 0;
7281                 *worker_groups = NULL;
7282                 return 0;
7283         }
7284         *group_cnt = num_possible_nodes();
7285         size = sizeof(struct r5worker) * cnt;
7286         workers = kcalloc(size, *group_cnt, GFP_NOIO);
7287         *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
7288                                  GFP_NOIO);
7289         if (!*worker_groups || !workers) {
7290                 kfree(workers);
7291                 kfree(*worker_groups);
7292                 return -ENOMEM;
7293         }
7294
7295         for (i = 0; i < *group_cnt; i++) {
7296                 struct r5worker_group *group;
7297
7298                 group = &(*worker_groups)[i];
7299                 INIT_LIST_HEAD(&group->handle_list);
7300                 INIT_LIST_HEAD(&group->loprio_list);
7301                 group->conf = conf;
7302                 group->workers = workers + i * cnt;
7303
7304                 for (j = 0; j < cnt; j++) {
7305                         struct r5worker *worker = group->workers + j;
7306                         worker->group = group;
7307                         INIT_WORK(&worker->work, raid5_do_work);
7308
7309                         for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
7310                                 INIT_LIST_HEAD(worker->temp_inactive_list + k);
7311                 }
7312         }
7313
7314         return 0;
7315 }
7316
7317 static void free_thread_groups(struct r5conf *conf)
7318 {
7319         if (conf->worker_groups)
7320                 kfree(conf->worker_groups[0].workers);
7321         kfree(conf->worker_groups);
7322         conf->worker_groups = NULL;
7323 }
7324
7325 static sector_t
7326 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
7327 {
7328         struct r5conf *conf = mddev->private;
7329
7330         if (!sectors)
7331                 sectors = mddev->dev_sectors;
7332         if (!raid_disks)
7333                 /* size is defined by the smallest of previous and new size */
7334                 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
7335
7336         sectors &= ~((sector_t)conf->chunk_sectors - 1);
7337         sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
7338         return sectors * (raid_disks - conf->max_degraded);
7339 }
7340
7341 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7342 {
7343         safe_put_page(percpu->spare_page);
7344         percpu->spare_page = NULL;
7345         kvfree(percpu->scribble);
7346         percpu->scribble = NULL;
7347 }
7348
7349 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7350 {
7351         if (conf->level == 6 && !percpu->spare_page) {
7352                 percpu->spare_page = alloc_page(GFP_KERNEL);
7353                 if (!percpu->spare_page)
7354                         return -ENOMEM;
7355         }
7356
7357         if (scribble_alloc(percpu,
7358                            max(conf->raid_disks,
7359                                conf->previous_raid_disks),
7360                            max(conf->chunk_sectors,
7361                                conf->prev_chunk_sectors)
7362                            / RAID5_STRIPE_SECTORS(conf))) {
7363                 free_scratch_buffer(conf, percpu);
7364                 return -ENOMEM;
7365         }
7366
7367         local_lock_init(&percpu->lock);
7368         return 0;
7369 }
7370
7371 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
7372 {
7373         struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7374
7375         free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
7376         return 0;
7377 }
7378
7379 static void raid5_free_percpu(struct r5conf *conf)
7380 {
7381         if (!conf->percpu)
7382                 return;
7383
7384         cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7385         free_percpu(conf->percpu);
7386 }
7387
7388 static void free_conf(struct r5conf *conf)
7389 {
7390         int i;
7391
7392         log_exit(conf);
7393
7394         unregister_shrinker(&conf->shrinker);
7395         free_thread_groups(conf);
7396         shrink_stripes(conf);
7397         raid5_free_percpu(conf);
7398         for (i = 0; i < conf->pool_size; i++)
7399                 if (conf->disks[i].extra_page)
7400                         put_page(conf->disks[i].extra_page);
7401         kfree(conf->disks);
7402         bioset_exit(&conf->bio_split);
7403         kfree(conf->stripe_hashtbl);
7404         kfree(conf->pending_data);
7405         kfree(conf);
7406 }
7407
7408 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7409 {
7410         struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7411         struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
7412
7413         if (alloc_scratch_buffer(conf, percpu)) {
7414                 pr_warn("%s: failed memory allocation for cpu%u\n",
7415                         __func__, cpu);
7416                 return -ENOMEM;
7417         }
7418         return 0;
7419 }
7420
7421 static int raid5_alloc_percpu(struct r5conf *conf)
7422 {
7423         int err = 0;
7424
7425         conf->percpu = alloc_percpu(struct raid5_percpu);
7426         if (!conf->percpu)
7427                 return -ENOMEM;
7428
7429         err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7430         if (!err) {
7431                 conf->scribble_disks = max(conf->raid_disks,
7432                         conf->previous_raid_disks);
7433                 conf->scribble_sectors = max(conf->chunk_sectors,
7434                         conf->prev_chunk_sectors);
7435         }
7436         return err;
7437 }
7438
7439 static unsigned long raid5_cache_scan(struct shrinker *shrink,
7440                                       struct shrink_control *sc)
7441 {
7442         struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7443         unsigned long ret = SHRINK_STOP;
7444
7445         if (mutex_trylock(&conf->cache_size_mutex)) {
7446                 ret= 0;
7447                 while (ret < sc->nr_to_scan &&
7448                        conf->max_nr_stripes > conf->min_nr_stripes) {
7449                         if (drop_one_stripe(conf) == 0) {
7450                                 ret = SHRINK_STOP;
7451                                 break;
7452                         }
7453                         ret++;
7454                 }
7455                 mutex_unlock(&conf->cache_size_mutex);
7456         }
7457         return ret;
7458 }
7459
7460 static unsigned long raid5_cache_count(struct shrinker *shrink,
7461                                        struct shrink_control *sc)
7462 {
7463         struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7464
7465         if (conf->max_nr_stripes < conf->min_nr_stripes)
7466                 /* unlikely, but not impossible */
7467                 return 0;
7468         return conf->max_nr_stripes - conf->min_nr_stripes;
7469 }
7470
7471 static struct r5conf *setup_conf(struct mddev *mddev)
7472 {
7473         struct r5conf *conf;
7474         int raid_disk, memory, max_disks;
7475         struct md_rdev *rdev;
7476         struct disk_info *disk;
7477         char pers_name[6];
7478         int i;
7479         int group_cnt;
7480         struct r5worker_group *new_group;
7481         int ret = -ENOMEM;
7482
7483         if (mddev->new_level != 5
7484             && mddev->new_level != 4
7485             && mddev->new_level != 6) {
7486                 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
7487                         mdname(mddev), mddev->new_level);
7488                 return ERR_PTR(-EIO);
7489         }
7490         if ((mddev->new_level == 5
7491              && !algorithm_valid_raid5(mddev->new_layout)) ||
7492             (mddev->new_level == 6
7493              && !algorithm_valid_raid6(mddev->new_layout))) {
7494                 pr_warn("md/raid:%s: layout %d not supported\n",
7495                         mdname(mddev), mddev->new_layout);
7496                 return ERR_PTR(-EIO);
7497         }
7498         if (mddev->new_level == 6 && mddev->raid_disks < 4) {
7499                 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
7500                         mdname(mddev), mddev->raid_disks);
7501                 return ERR_PTR(-EINVAL);
7502         }
7503
7504         if (!mddev->new_chunk_sectors ||
7505             (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
7506             !is_power_of_2(mddev->new_chunk_sectors)) {
7507                 pr_warn("md/raid:%s: invalid chunk size %d\n",
7508                         mdname(mddev), mddev->new_chunk_sectors << 9);
7509                 return ERR_PTR(-EINVAL);
7510         }
7511
7512         conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
7513         if (conf == NULL)
7514                 goto abort;
7515
7516 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7517         conf->stripe_size = DEFAULT_STRIPE_SIZE;
7518         conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
7519         conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
7520 #endif
7521         INIT_LIST_HEAD(&conf->free_list);
7522         INIT_LIST_HEAD(&conf->pending_list);
7523         conf->pending_data = kcalloc(PENDING_IO_MAX,
7524                                      sizeof(struct r5pending_data),
7525                                      GFP_KERNEL);
7526         if (!conf->pending_data)
7527                 goto abort;
7528         for (i = 0; i < PENDING_IO_MAX; i++)
7529                 list_add(&conf->pending_data[i].sibling, &conf->free_list);
7530         /* Don't enable multi-threading by default*/
7531         if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
7532                 conf->group_cnt = group_cnt;
7533                 conf->worker_cnt_per_group = 0;
7534                 conf->worker_groups = new_group;
7535         } else
7536                 goto abort;
7537         spin_lock_init(&conf->device_lock);
7538         seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
7539         mutex_init(&conf->cache_size_mutex);
7540
7541         init_waitqueue_head(&conf->wait_for_quiescent);
7542         init_waitqueue_head(&conf->wait_for_stripe);
7543         init_waitqueue_head(&conf->wait_for_overlap);
7544         INIT_LIST_HEAD(&conf->handle_list);
7545         INIT_LIST_HEAD(&conf->loprio_list);
7546         INIT_LIST_HEAD(&conf->hold_list);
7547         INIT_LIST_HEAD(&conf->delayed_list);
7548         INIT_LIST_HEAD(&conf->bitmap_list);
7549         init_llist_head(&conf->released_stripes);
7550         atomic_set(&conf->active_stripes, 0);
7551         atomic_set(&conf->preread_active_stripes, 0);
7552         atomic_set(&conf->active_aligned_reads, 0);
7553         spin_lock_init(&conf->pending_bios_lock);
7554         conf->batch_bio_dispatch = true;
7555         rdev_for_each(rdev, mddev) {
7556                 if (test_bit(Journal, &rdev->flags))
7557                         continue;
7558                 if (bdev_nonrot(rdev->bdev)) {
7559                         conf->batch_bio_dispatch = false;
7560                         break;
7561                 }
7562         }
7563
7564         conf->bypass_threshold = BYPASS_THRESHOLD;
7565         conf->recovery_disabled = mddev->recovery_disabled - 1;
7566
7567         conf->raid_disks = mddev->raid_disks;
7568         if (mddev->reshape_position == MaxSector)
7569                 conf->previous_raid_disks = mddev->raid_disks;
7570         else
7571                 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
7572         max_disks = max(conf->raid_disks, conf->previous_raid_disks);
7573
7574         conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
7575                               GFP_KERNEL);
7576
7577         if (!conf->disks)
7578                 goto abort;
7579
7580         for (i = 0; i < max_disks; i++) {
7581                 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
7582                 if (!conf->disks[i].extra_page)
7583                         goto abort;
7584         }
7585
7586         ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
7587         if (ret)
7588                 goto abort;
7589         conf->mddev = mddev;
7590
7591         ret = -ENOMEM;
7592         conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL);
7593         if (!conf->stripe_hashtbl)
7594                 goto abort;
7595
7596         /* We init hash_locks[0] separately to that it can be used
7597          * as the reference lock in the spin_lock_nest_lock() call
7598          * in lock_all_device_hash_locks_irq in order to convince
7599          * lockdep that we know what we are doing.
7600          */
7601         spin_lock_init(conf->hash_locks);
7602         for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
7603                 spin_lock_init(conf->hash_locks + i);
7604
7605         for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7606                 INIT_LIST_HEAD(conf->inactive_list + i);
7607
7608         for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7609                 INIT_LIST_HEAD(conf->temp_inactive_list + i);
7610
7611         atomic_set(&conf->r5c_cached_full_stripes, 0);
7612         INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
7613         atomic_set(&conf->r5c_cached_partial_stripes, 0);
7614         INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
7615         atomic_set(&conf->r5c_flushing_full_stripes, 0);
7616         atomic_set(&conf->r5c_flushing_partial_stripes, 0);
7617
7618         conf->level = mddev->new_level;
7619         conf->chunk_sectors = mddev->new_chunk_sectors;
7620         ret = raid5_alloc_percpu(conf);
7621         if (ret)
7622                 goto abort;
7623
7624         pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7625
7626         ret = -EIO;
7627         rdev_for_each(rdev, mddev) {
7628                 raid_disk = rdev->raid_disk;
7629                 if (raid_disk >= max_disks
7630                     || raid_disk < 0 || test_bit(Journal, &rdev->flags))
7631                         continue;
7632                 disk = conf->disks + raid_disk;
7633
7634                 if (test_bit(Replacement, &rdev->flags)) {
7635                         if (disk->replacement)
7636                                 goto abort;
7637                         RCU_INIT_POINTER(disk->replacement, rdev);
7638                 } else {
7639                         if (disk->rdev)
7640                                 goto abort;
7641                         RCU_INIT_POINTER(disk->rdev, rdev);
7642                 }
7643
7644                 if (test_bit(In_sync, &rdev->flags)) {
7645                         pr_info("md/raid:%s: device %pg operational as raid disk %d\n",
7646                                 mdname(mddev), rdev->bdev, raid_disk);
7647                 } else if (rdev->saved_raid_disk != raid_disk)
7648                         /* Cannot rely on bitmap to complete recovery */
7649                         conf->fullsync = 1;
7650         }
7651
7652         conf->level = mddev->new_level;
7653         if (conf->level == 6) {
7654                 conf->max_degraded = 2;
7655                 if (raid6_call.xor_syndrome)
7656                         conf->rmw_level = PARITY_ENABLE_RMW;
7657                 else
7658                         conf->rmw_level = PARITY_DISABLE_RMW;
7659         } else {
7660                 conf->max_degraded = 1;
7661                 conf->rmw_level = PARITY_ENABLE_RMW;
7662         }
7663         conf->algorithm = mddev->new_layout;
7664         conf->reshape_progress = mddev->reshape_position;
7665         if (conf->reshape_progress != MaxSector) {
7666                 conf->prev_chunk_sectors = mddev->chunk_sectors;
7667                 conf->prev_algo = mddev->layout;
7668         } else {
7669                 conf->prev_chunk_sectors = conf->chunk_sectors;
7670                 conf->prev_algo = conf->algorithm;
7671         }
7672
7673         conf->min_nr_stripes = NR_STRIPES;
7674         if (mddev->reshape_position != MaxSector) {
7675                 int stripes = max_t(int,
7676                         ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
7677                         ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
7678                 conf->min_nr_stripes = max(NR_STRIPES, stripes);
7679                 if (conf->min_nr_stripes != NR_STRIPES)
7680                         pr_info("md/raid:%s: force stripe size %d for reshape\n",
7681                                 mdname(mddev), conf->min_nr_stripes);
7682         }
7683         memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7684                  max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
7685         atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7686         if (grow_stripes(conf, conf->min_nr_stripes)) {
7687                 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7688                         mdname(mddev), memory);
7689                 ret = -ENOMEM;
7690                 goto abort;
7691         } else
7692                 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7693         /*
7694          * Losing a stripe head costs more than the time to refill it,
7695          * it reduces the queue depth and so can hurt throughput.
7696          * So set it rather large, scaled by number of devices.
7697          */
7698         conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7699         conf->shrinker.scan_objects = raid5_cache_scan;
7700         conf->shrinker.count_objects = raid5_cache_count;
7701         conf->shrinker.batch = 128;
7702         conf->shrinker.flags = 0;
7703         ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev));
7704         if (ret) {
7705                 pr_warn("md/raid:%s: couldn't register shrinker.\n",
7706                         mdname(mddev));
7707                 goto abort;
7708         }
7709
7710         sprintf(pers_name, "raid%d", mddev->new_level);
7711         conf->thread = md_register_thread(raid5d, mddev, pers_name);
7712         if (!conf->thread) {
7713                 pr_warn("md/raid:%s: couldn't allocate thread.\n",
7714                         mdname(mddev));
7715                 ret = -ENOMEM;
7716                 goto abort;
7717         }
7718
7719         return conf;
7720
7721  abort:
7722         if (conf)
7723                 free_conf(conf);
7724         return ERR_PTR(ret);
7725 }
7726
7727 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7728 {
7729         switch (algo) {
7730         case ALGORITHM_PARITY_0:
7731                 if (raid_disk < max_degraded)
7732                         return 1;
7733                 break;
7734         case ALGORITHM_PARITY_N:
7735                 if (raid_disk >= raid_disks - max_degraded)
7736                         return 1;
7737                 break;
7738         case ALGORITHM_PARITY_0_6:
7739                 if (raid_disk == 0 ||
7740                     raid_disk == raid_disks - 1)
7741                         return 1;
7742                 break;
7743         case ALGORITHM_LEFT_ASYMMETRIC_6:
7744         case ALGORITHM_RIGHT_ASYMMETRIC_6:
7745         case ALGORITHM_LEFT_SYMMETRIC_6:
7746         case ALGORITHM_RIGHT_SYMMETRIC_6:
7747                 if (raid_disk == raid_disks - 1)
7748                         return 1;
7749         }
7750         return 0;
7751 }
7752
7753 static void raid5_set_io_opt(struct r5conf *conf)
7754 {
7755         blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
7756                          (conf->raid_disks - conf->max_degraded));
7757 }
7758
7759 static int raid5_run(struct mddev *mddev)
7760 {
7761         struct r5conf *conf;
7762         int dirty_parity_disks = 0;
7763         struct md_rdev *rdev;
7764         struct md_rdev *journal_dev = NULL;
7765         sector_t reshape_offset = 0;
7766         int i, ret = 0;
7767         long long min_offset_diff = 0;
7768         int first = 1;
7769
7770         if (acct_bioset_init(mddev)) {
7771                 pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
7772                 return -ENOMEM;
7773         }
7774
7775         if (mddev_init_writes_pending(mddev) < 0) {
7776                 ret = -ENOMEM;
7777                 goto exit_acct_set;
7778         }
7779
7780         if (mddev->recovery_cp != MaxSector)
7781                 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7782                           mdname(mddev));
7783
7784         rdev_for_each(rdev, mddev) {
7785                 long long diff;
7786
7787                 if (test_bit(Journal, &rdev->flags)) {
7788                         journal_dev = rdev;
7789                         continue;
7790                 }
7791                 if (rdev->raid_disk < 0)
7792                         continue;
7793                 diff = (rdev->new_data_offset - rdev->data_offset);
7794                 if (first) {
7795                         min_offset_diff = diff;
7796                         first = 0;
7797                 } else if (mddev->reshape_backwards &&
7798                          diff < min_offset_diff)
7799                         min_offset_diff = diff;
7800                 else if (!mddev->reshape_backwards &&
7801                          diff > min_offset_diff)
7802                         min_offset_diff = diff;
7803         }
7804
7805         if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
7806             (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
7807                 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7808                           mdname(mddev));
7809                 ret = -EINVAL;
7810                 goto exit_acct_set;
7811         }
7812
7813         if (mddev->reshape_position != MaxSector) {
7814                 /* Check that we can continue the reshape.
7815                  * Difficulties arise if the stripe we would write to
7816                  * next is at or after the stripe we would read from next.
7817                  * For a reshape that changes the number of devices, this
7818                  * is only possible for a very short time, and mdadm makes
7819                  * sure that time appears to have past before assembling
7820                  * the array.  So we fail if that time hasn't passed.
7821                  * For a reshape that keeps the number of devices the same
7822                  * mdadm must be monitoring the reshape can keeping the
7823                  * critical areas read-only and backed up.  It will start
7824                  * the array in read-only mode, so we check for that.
7825                  */
7826                 sector_t here_new, here_old;
7827                 int old_disks;
7828                 int max_degraded = (mddev->level == 6 ? 2 : 1);
7829                 int chunk_sectors;
7830                 int new_data_disks;
7831
7832                 if (journal_dev) {
7833                         pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7834                                 mdname(mddev));
7835                         ret = -EINVAL;
7836                         goto exit_acct_set;
7837                 }
7838
7839                 if (mddev->new_level != mddev->level) {
7840                         pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7841                                 mdname(mddev));
7842                         ret = -EINVAL;
7843                         goto exit_acct_set;
7844                 }
7845                 old_disks = mddev->raid_disks - mddev->delta_disks;
7846                 /* reshape_position must be on a new-stripe boundary, and one
7847                  * further up in new geometry must map after here in old
7848                  * geometry.
7849                  * If the chunk sizes are different, then as we perform reshape
7850                  * in units of the largest of the two, reshape_position needs
7851                  * be a multiple of the largest chunk size times new data disks.
7852                  */
7853                 here_new = mddev->reshape_position;
7854                 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7855                 new_data_disks = mddev->raid_disks - max_degraded;
7856                 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7857                         pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7858                                 mdname(mddev));
7859                         ret = -EINVAL;
7860                         goto exit_acct_set;
7861                 }
7862                 reshape_offset = here_new * chunk_sectors;
7863                 /* here_new is the stripe we will write to */
7864                 here_old = mddev->reshape_position;
7865                 sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7866                 /* here_old is the first stripe that we might need to read
7867                  * from */
7868                 if (mddev->delta_disks == 0) {
7869                         /* We cannot be sure it is safe to start an in-place
7870                          * reshape.  It is only safe if user-space is monitoring
7871                          * and taking constant backups.
7872                          * mdadm always starts a situation like this in
7873                          * readonly mode so it can take control before
7874                          * allowing any writes.  So just check for that.
7875                          */
7876                         if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7877                             abs(min_offset_diff) >= mddev->new_chunk_sectors)
7878                                 /* not really in-place - so OK */;
7879                         else if (mddev->ro == 0) {
7880                                 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7881                                         mdname(mddev));
7882                                 ret = -EINVAL;
7883                                 goto exit_acct_set;
7884                         }
7885                 } else if (mddev->reshape_backwards
7886                     ? (here_new * chunk_sectors + min_offset_diff <=
7887                        here_old * chunk_sectors)
7888                     : (here_new * chunk_sectors >=
7889                        here_old * chunk_sectors + (-min_offset_diff))) {
7890                         /* Reading from the same stripe as writing to - bad */
7891                         pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7892                                 mdname(mddev));
7893                         ret = -EINVAL;
7894                         goto exit_acct_set;
7895                 }
7896                 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7897                 /* OK, we should be able to continue; */
7898         } else {
7899                 BUG_ON(mddev->level != mddev->new_level);
7900                 BUG_ON(mddev->layout != mddev->new_layout);
7901                 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7902                 BUG_ON(mddev->delta_disks != 0);
7903         }
7904
7905         if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7906             test_bit(MD_HAS_PPL, &mddev->flags)) {
7907                 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7908                         mdname(mddev));
7909                 clear_bit(MD_HAS_PPL, &mddev->flags);
7910                 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
7911         }
7912
7913         if (mddev->private == NULL)
7914                 conf = setup_conf(mddev);
7915         else
7916                 conf = mddev->private;
7917
7918         if (IS_ERR(conf)) {
7919                 ret = PTR_ERR(conf);
7920                 goto exit_acct_set;
7921         }
7922
7923         if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7924                 if (!journal_dev) {
7925                         pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7926                                 mdname(mddev));
7927                         mddev->ro = 1;
7928                         set_disk_ro(mddev->gendisk, 1);
7929                 } else if (mddev->recovery_cp == MaxSector)
7930                         set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7931         }
7932
7933         conf->min_offset_diff = min_offset_diff;
7934         mddev->thread = conf->thread;
7935         conf->thread = NULL;
7936         mddev->private = conf;
7937
7938         for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7939              i++) {
7940                 rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
7941                 if (!rdev && conf->disks[i].replacement) {
7942                         /* The replacement is all we have yet */
7943                         rdev = rdev_mdlock_deref(mddev,
7944                                                  conf->disks[i].replacement);
7945                         conf->disks[i].replacement = NULL;
7946                         clear_bit(Replacement, &rdev->flags);
7947                         rcu_assign_pointer(conf->disks[i].rdev, rdev);
7948                 }
7949                 if (!rdev)
7950                         continue;
7951                 if (rcu_access_pointer(conf->disks[i].replacement) &&
7952                     conf->reshape_progress != MaxSector) {
7953                         /* replacements and reshape simply do not mix. */
7954                         pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7955                         goto abort;
7956                 }
7957                 if (test_bit(In_sync, &rdev->flags))
7958                         continue;
7959                 /* This disc is not fully in-sync.  However if it
7960                  * just stored parity (beyond the recovery_offset),
7961                  * when we don't need to be concerned about the
7962                  * array being dirty.
7963                  * When reshape goes 'backwards', we never have
7964                  * partially completed devices, so we only need
7965                  * to worry about reshape going forwards.
7966                  */
7967                 /* Hack because v0.91 doesn't store recovery_offset properly. */
7968                 if (mddev->major_version == 0 &&
7969                     mddev->minor_version > 90)
7970                         rdev->recovery_offset = reshape_offset;
7971
7972                 if (rdev->recovery_offset < reshape_offset) {
7973                         /* We need to check old and new layout */
7974                         if (!only_parity(rdev->raid_disk,
7975                                          conf->algorithm,
7976                                          conf->raid_disks,
7977                                          conf->max_degraded))
7978                                 continue;
7979                 }
7980                 if (!only_parity(rdev->raid_disk,
7981                                  conf->prev_algo,
7982                                  conf->previous_raid_disks,
7983                                  conf->max_degraded))
7984                         continue;
7985                 dirty_parity_disks++;
7986         }
7987
7988         /*
7989          * 0 for a fully functional array, 1 or 2 for a degraded array.
7990          */
7991         mddev->degraded = raid5_calc_degraded(conf);
7992
7993         if (has_failed(conf)) {
7994                 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7995                         mdname(mddev), mddev->degraded, conf->raid_disks);
7996                 goto abort;
7997         }
7998
7999         /* device size must be a multiple of chunk size */
8000         mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1);
8001         mddev->resync_max_sectors = mddev->dev_sectors;
8002
8003         if (mddev->degraded > dirty_parity_disks &&
8004             mddev->recovery_cp != MaxSector) {
8005                 if (test_bit(MD_HAS_PPL, &mddev->flags))
8006                         pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
8007                                 mdname(mddev));
8008                 else if (mddev->ok_start_degraded)
8009                         pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
8010                                 mdname(mddev));
8011                 else {
8012                         pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
8013                                 mdname(mddev));
8014                         goto abort;
8015                 }
8016         }
8017
8018         pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
8019                 mdname(mddev), conf->level,
8020                 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
8021                 mddev->new_layout);
8022
8023         print_raid5_conf(conf);
8024
8025         if (conf->reshape_progress != MaxSector) {
8026                 conf->reshape_safe = conf->reshape_progress;
8027                 atomic_set(&conf->reshape_stripes, 0);
8028                 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8029                 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8030                 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8031                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8032                 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
8033                                                         "reshape");
8034                 if (!mddev->sync_thread)
8035                         goto abort;
8036         }
8037
8038         /* Ok, everything is just fine now */
8039         if (mddev->to_remove == &raid5_attrs_group)
8040                 mddev->to_remove = NULL;
8041         else if (mddev->kobj.sd &&
8042             sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
8043                 pr_warn("raid5: failed to create sysfs attributes for %s\n",
8044                         mdname(mddev));
8045         md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
8046
8047         if (mddev->queue) {
8048                 int chunk_size;
8049                 /* read-ahead size must cover two whole stripes, which
8050                  * is 2 * (datadisks) * chunksize where 'n' is the
8051                  * number of raid devices
8052                  */
8053                 int data_disks = conf->previous_raid_disks - conf->max_degraded;
8054                 int stripe = data_disks *
8055                         ((mddev->chunk_sectors << 9) / PAGE_SIZE);
8056
8057                 chunk_size = mddev->chunk_sectors << 9;
8058                 blk_queue_io_min(mddev->queue, chunk_size);
8059                 raid5_set_io_opt(conf);
8060                 mddev->queue->limits.raid_partial_stripes_expensive = 1;
8061                 /*
8062                  * We can only discard a whole stripe. It doesn't make sense to
8063                  * discard data disk but write parity disk
8064                  */
8065                 stripe = stripe * PAGE_SIZE;
8066                 stripe = roundup_pow_of_two(stripe);
8067                 mddev->queue->limits.discard_granularity = stripe;
8068
8069                 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
8070
8071                 rdev_for_each(rdev, mddev) {
8072                         disk_stack_limits(mddev->gendisk, rdev->bdev,
8073                                           rdev->data_offset << 9);
8074                         disk_stack_limits(mddev->gendisk, rdev->bdev,
8075                                           rdev->new_data_offset << 9);
8076                 }
8077
8078                 /*
8079                  * zeroing is required, otherwise data
8080                  * could be lost. Consider a scenario: discard a stripe
8081                  * (the stripe could be inconsistent if
8082                  * discard_zeroes_data is 0); write one disk of the
8083                  * stripe (the stripe could be inconsistent again
8084                  * depending on which disks are used to calculate
8085                  * parity); the disk is broken; The stripe data of this
8086                  * disk is lost.
8087                  *
8088                  * We only allow DISCARD if the sysadmin has confirmed that
8089                  * only safe devices are in use by setting a module parameter.
8090                  * A better idea might be to turn DISCARD into WRITE_ZEROES
8091                  * requests, as that is required to be safe.
8092                  */
8093                 if (!devices_handle_discard_safely ||
8094                     mddev->queue->limits.max_discard_sectors < (stripe >> 9) ||
8095                     mddev->queue->limits.discard_granularity < stripe)
8096                         blk_queue_max_discard_sectors(mddev->queue, 0);
8097
8098                 /*
8099                  * Requests require having a bitmap for each stripe.
8100                  * Limit the max sectors based on this.
8101                  */
8102                 blk_queue_max_hw_sectors(mddev->queue,
8103                         RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
8104
8105                 /* No restrictions on the number of segments in the request */
8106                 blk_queue_max_segments(mddev->queue, USHRT_MAX);
8107         }
8108
8109         if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
8110                 goto abort;
8111
8112         return 0;
8113 abort:
8114         md_unregister_thread(&mddev->thread);
8115         print_raid5_conf(conf);
8116         free_conf(conf);
8117         mddev->private = NULL;
8118         pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
8119         ret = -EIO;
8120 exit_acct_set:
8121         acct_bioset_exit(mddev);
8122         return ret;
8123 }
8124
8125 static void raid5_free(struct mddev *mddev, void *priv)
8126 {
8127         struct r5conf *conf = priv;
8128
8129         free_conf(conf);
8130         acct_bioset_exit(mddev);
8131         mddev->to_remove = &raid5_attrs_group;
8132 }
8133
8134 static void raid5_status(struct seq_file *seq, struct mddev *mddev)
8135 {
8136         struct r5conf *conf = mddev->private;
8137         int i;
8138
8139         seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
8140                 conf->chunk_sectors / 2, mddev->layout);
8141         seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
8142         rcu_read_lock();
8143         for (i = 0; i < conf->raid_disks; i++) {
8144                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
8145                 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
8146         }
8147         rcu_read_unlock();
8148         seq_printf (seq, "]");
8149 }
8150
8151 static void print_raid5_conf (struct r5conf *conf)
8152 {
8153         struct md_rdev *rdev;
8154         int i;
8155
8156         pr_debug("RAID conf printout:\n");
8157         if (!conf) {
8158                 pr_debug("(conf==NULL)\n");
8159                 return;
8160         }
8161         pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
8162                conf->raid_disks,
8163                conf->raid_disks - conf->mddev->degraded);
8164
8165         rcu_read_lock();
8166         for (i = 0; i < conf->raid_disks; i++) {
8167                 rdev = rcu_dereference(conf->disks[i].rdev);
8168                 if (rdev)
8169                         pr_debug(" disk %d, o:%d, dev:%pg\n",
8170                                i, !test_bit(Faulty, &rdev->flags),
8171                                rdev->bdev);
8172         }
8173         rcu_read_unlock();
8174 }
8175
8176 static int raid5_spare_active(struct mddev *mddev)
8177 {
8178         int i;
8179         struct r5conf *conf = mddev->private;
8180         struct md_rdev *rdev, *replacement;
8181         int count = 0;
8182         unsigned long flags;
8183
8184         for (i = 0; i < conf->raid_disks; i++) {
8185                 rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
8186                 replacement = rdev_mdlock_deref(mddev,
8187                                                 conf->disks[i].replacement);
8188                 if (replacement
8189                     && replacement->recovery_offset == MaxSector
8190                     && !test_bit(Faulty, &replacement->flags)
8191                     && !test_and_set_bit(In_sync, &replacement->flags)) {
8192                         /* Replacement has just become active. */
8193                         if (!rdev
8194                             || !test_and_clear_bit(In_sync, &rdev->flags))
8195                                 count++;
8196                         if (rdev) {
8197                                 /* Replaced device not technically faulty,
8198                                  * but we need to be sure it gets removed
8199                                  * and never re-added.
8200                                  */
8201                                 set_bit(Faulty, &rdev->flags);
8202                                 sysfs_notify_dirent_safe(
8203                                         rdev->sysfs_state);
8204                         }
8205                         sysfs_notify_dirent_safe(replacement->sysfs_state);
8206                 } else if (rdev
8207                     && rdev->recovery_offset == MaxSector
8208                     && !test_bit(Faulty, &rdev->flags)
8209                     && !test_and_set_bit(In_sync, &rdev->flags)) {
8210                         count++;
8211                         sysfs_notify_dirent_safe(rdev->sysfs_state);
8212                 }
8213         }
8214         spin_lock_irqsave(&conf->device_lock, flags);
8215         mddev->degraded = raid5_calc_degraded(conf);
8216         spin_unlock_irqrestore(&conf->device_lock, flags);
8217         print_raid5_conf(conf);
8218         return count;
8219 }
8220
8221 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
8222 {
8223         struct r5conf *conf = mddev->private;
8224         int err = 0;
8225         int number = rdev->raid_disk;
8226         struct md_rdev __rcu **rdevp;
8227         struct disk_info *p;
8228         struct md_rdev *tmp;
8229
8230         print_raid5_conf(conf);
8231         if (test_bit(Journal, &rdev->flags) && conf->log) {
8232                 /*
8233                  * we can't wait pending write here, as this is called in
8234                  * raid5d, wait will deadlock.
8235                  * neilb: there is no locking about new writes here,
8236                  * so this cannot be safe.
8237                  */
8238                 if (atomic_read(&conf->active_stripes) ||
8239                     atomic_read(&conf->r5c_cached_full_stripes) ||
8240                     atomic_read(&conf->r5c_cached_partial_stripes)) {
8241                         return -EBUSY;
8242                 }
8243                 log_exit(conf);
8244                 return 0;
8245         }
8246         if (unlikely(number >= conf->pool_size))
8247                 return 0;
8248         p = conf->disks + number;
8249         if (rdev == rcu_access_pointer(p->rdev))
8250                 rdevp = &p->rdev;
8251         else if (rdev == rcu_access_pointer(p->replacement))
8252                 rdevp = &p->replacement;
8253         else
8254                 return 0;
8255
8256         if (number >= conf->raid_disks &&
8257             conf->reshape_progress == MaxSector)
8258                 clear_bit(In_sync, &rdev->flags);
8259
8260         if (test_bit(In_sync, &rdev->flags) ||
8261             atomic_read(&rdev->nr_pending)) {
8262                 err = -EBUSY;
8263                 goto abort;
8264         }
8265         /* Only remove non-faulty devices if recovery
8266          * isn't possible.
8267          */
8268         if (!test_bit(Faulty, &rdev->flags) &&
8269             mddev->recovery_disabled != conf->recovery_disabled &&
8270             !has_failed(conf) &&
8271             (!rcu_access_pointer(p->replacement) ||
8272              rcu_access_pointer(p->replacement) == rdev) &&
8273             number < conf->raid_disks) {
8274                 err = -EBUSY;
8275                 goto abort;
8276         }
8277         *rdevp = NULL;
8278         if (!test_bit(RemoveSynchronized, &rdev->flags)) {
8279                 lockdep_assert_held(&mddev->reconfig_mutex);
8280                 synchronize_rcu();
8281                 if (atomic_read(&rdev->nr_pending)) {
8282                         /* lost the race, try later */
8283                         err = -EBUSY;
8284                         rcu_assign_pointer(*rdevp, rdev);
8285                 }
8286         }
8287         if (!err) {
8288                 err = log_modify(conf, rdev, false);
8289                 if (err)
8290                         goto abort;
8291         }
8292
8293         tmp = rcu_access_pointer(p->replacement);
8294         if (tmp) {
8295                 /* We must have just cleared 'rdev' */
8296                 rcu_assign_pointer(p->rdev, tmp);
8297                 clear_bit(Replacement, &tmp->flags);
8298                 smp_mb(); /* Make sure other CPUs may see both as identical
8299                            * but will never see neither - if they are careful
8300                            */
8301                 rcu_assign_pointer(p->replacement, NULL);
8302
8303                 if (!err)
8304                         err = log_modify(conf, tmp, true);
8305         }
8306
8307         clear_bit(WantReplacement, &rdev->flags);
8308 abort:
8309
8310         print_raid5_conf(conf);
8311         return err;
8312 }
8313
8314 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
8315 {
8316         struct r5conf *conf = mddev->private;
8317         int ret, err = -EEXIST;
8318         int disk;
8319         struct disk_info *p;
8320         struct md_rdev *tmp;
8321         int first = 0;
8322         int last = conf->raid_disks - 1;
8323
8324         if (test_bit(Journal, &rdev->flags)) {
8325                 if (conf->log)
8326                         return -EBUSY;
8327
8328                 rdev->raid_disk = 0;
8329                 /*
8330                  * The array is in readonly mode if journal is missing, so no
8331                  * write requests running. We should be safe
8332                  */
8333                 ret = log_init(conf, rdev, false);
8334                 if (ret)
8335                         return ret;
8336
8337                 ret = r5l_start(conf->log);
8338                 if (ret)
8339                         return ret;
8340
8341                 return 0;
8342         }
8343         if (mddev->recovery_disabled == conf->recovery_disabled)
8344                 return -EBUSY;
8345
8346         if (rdev->saved_raid_disk < 0 && has_failed(conf))
8347                 /* no point adding a device */
8348                 return -EINVAL;
8349
8350         if (rdev->raid_disk >= 0)
8351                 first = last = rdev->raid_disk;
8352
8353         /*
8354          * find the disk ... but prefer rdev->saved_raid_disk
8355          * if possible.
8356          */
8357         if (rdev->saved_raid_disk >= first &&
8358             rdev->saved_raid_disk <= last &&
8359             conf->disks[rdev->saved_raid_disk].rdev == NULL)
8360                 first = rdev->saved_raid_disk;
8361
8362         for (disk = first; disk <= last; disk++) {
8363                 p = conf->disks + disk;
8364                 if (p->rdev == NULL) {
8365                         clear_bit(In_sync, &rdev->flags);
8366                         rdev->raid_disk = disk;
8367                         if (rdev->saved_raid_disk != disk)
8368                                 conf->fullsync = 1;
8369                         rcu_assign_pointer(p->rdev, rdev);
8370
8371                         err = log_modify(conf, rdev, true);
8372
8373                         goto out;
8374                 }
8375         }
8376         for (disk = first; disk <= last; disk++) {
8377                 p = conf->disks + disk;
8378                 tmp = rdev_mdlock_deref(mddev, p->rdev);
8379                 if (test_bit(WantReplacement, &tmp->flags) &&
8380                     p->replacement == NULL) {
8381                         clear_bit(In_sync, &rdev->flags);
8382                         set_bit(Replacement, &rdev->flags);
8383                         rdev->raid_disk = disk;
8384                         err = 0;
8385                         conf->fullsync = 1;
8386                         rcu_assign_pointer(p->replacement, rdev);
8387                         break;
8388                 }
8389         }
8390 out:
8391         print_raid5_conf(conf);
8392         return err;
8393 }
8394
8395 static int raid5_resize(struct mddev *mddev, sector_t sectors)
8396 {
8397         /* no resync is happening, and there is enough space
8398          * on all devices, so we can resize.
8399          * We need to make sure resync covers any new space.
8400          * If the array is shrinking we should possibly wait until
8401          * any io in the removed space completes, but it hardly seems
8402          * worth it.
8403          */
8404         sector_t newsize;
8405         struct r5conf *conf = mddev->private;
8406
8407         if (raid5_has_log(conf) || raid5_has_ppl(conf))
8408                 return -EINVAL;
8409         sectors &= ~((sector_t)conf->chunk_sectors - 1);
8410         newsize = raid5_size(mddev, sectors, mddev->raid_disks);
8411         if (mddev->external_size &&
8412             mddev->array_sectors > newsize)
8413                 return -EINVAL;
8414         if (mddev->bitmap) {
8415                 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
8416                 if (ret)
8417                         return ret;
8418         }
8419         md_set_array_sectors(mddev, newsize);
8420         if (sectors > mddev->dev_sectors &&
8421             mddev->recovery_cp > mddev->dev_sectors) {
8422                 mddev->recovery_cp = mddev->dev_sectors;
8423                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8424         }
8425         mddev->dev_sectors = sectors;
8426         mddev->resync_max_sectors = sectors;
8427         return 0;
8428 }
8429
8430 static int check_stripe_cache(struct mddev *mddev)
8431 {
8432         /* Can only proceed if there are plenty of stripe_heads.
8433          * We need a minimum of one full stripe,, and for sensible progress
8434          * it is best to have about 4 times that.
8435          * If we require 4 times, then the default 256 4K stripe_heads will
8436          * allow for chunk sizes up to 256K, which is probably OK.
8437          * If the chunk size is greater, user-space should request more
8438          * stripe_heads first.
8439          */
8440         struct r5conf *conf = mddev->private;
8441         if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8442             > conf->min_nr_stripes ||
8443             ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8444             > conf->min_nr_stripes) {
8445                 pr_warn("md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
8446                         mdname(mddev),
8447                         ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
8448                          / RAID5_STRIPE_SIZE(conf))*4);
8449                 return 0;
8450         }
8451         return 1;
8452 }
8453
8454 static int check_reshape(struct mddev *mddev)
8455 {
8456         struct r5conf *conf = mddev->private;
8457
8458         if (raid5_has_log(conf) || raid5_has_ppl(conf))
8459                 return -EINVAL;
8460         if (mddev->delta_disks == 0 &&
8461             mddev->new_layout == mddev->layout &&
8462             mddev->new_chunk_sectors == mddev->chunk_sectors)
8463                 return 0; /* nothing to do */
8464         if (has_failed(conf))
8465                 return -EINVAL;
8466         if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
8467                 /* We might be able to shrink, but the devices must
8468                  * be made bigger first.
8469                  * For raid6, 4 is the minimum size.
8470                  * Otherwise 2 is the minimum
8471                  */
8472                 int min = 2;
8473                 if (mddev->level == 6)
8474                         min = 4;
8475                 if (mddev->raid_disks + mddev->delta_disks < min)
8476                         return -EINVAL;
8477         }
8478
8479         if (!check_stripe_cache(mddev))
8480                 return -ENOSPC;
8481
8482         if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
8483             mddev->delta_disks > 0)
8484                 if (resize_chunks(conf,
8485                                   conf->previous_raid_disks
8486                                   + max(0, mddev->delta_disks),
8487                                   max(mddev->new_chunk_sectors,
8488                                       mddev->chunk_sectors)
8489                             ) < 0)
8490                         return -ENOMEM;
8491
8492         if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
8493                 return 0; /* never bother to shrink */
8494         return resize_stripes(conf, (conf->previous_raid_disks
8495                                      + mddev->delta_disks));
8496 }
8497
8498 static int raid5_start_reshape(struct mddev *mddev)
8499 {
8500         struct r5conf *conf = mddev->private;
8501         struct md_rdev *rdev;
8502         int spares = 0;
8503         unsigned long flags;
8504
8505         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8506                 return -EBUSY;
8507
8508         if (!check_stripe_cache(mddev))
8509                 return -ENOSPC;
8510
8511         if (has_failed(conf))
8512                 return -EINVAL;
8513
8514         rdev_for_each(rdev, mddev) {
8515                 if (!test_bit(In_sync, &rdev->flags)
8516                     && !test_bit(Faulty, &rdev->flags))
8517                         spares++;
8518         }
8519
8520         if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
8521                 /* Not enough devices even to make a degraded array
8522                  * of that size
8523                  */
8524                 return -EINVAL;
8525
8526         /* Refuse to reduce size of the array.  Any reductions in
8527          * array size must be through explicit setting of array_size
8528          * attribute.
8529          */
8530         if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
8531             < mddev->array_sectors) {
8532                 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
8533                         mdname(mddev));
8534                 return -EINVAL;
8535         }
8536
8537         atomic_set(&conf->reshape_stripes, 0);
8538         spin_lock_irq(&conf->device_lock);
8539         write_seqcount_begin(&conf->gen_lock);
8540         conf->previous_raid_disks = conf->raid_disks;
8541         conf->raid_disks += mddev->delta_disks;
8542         conf->prev_chunk_sectors = conf->chunk_sectors;
8543         conf->chunk_sectors = mddev->new_chunk_sectors;
8544         conf->prev_algo = conf->algorithm;
8545         conf->algorithm = mddev->new_layout;
8546         conf->generation++;
8547         /* Code that selects data_offset needs to see the generation update
8548          * if reshape_progress has been set - so a memory barrier needed.
8549          */
8550         smp_mb();
8551         if (mddev->reshape_backwards)
8552                 conf->reshape_progress = raid5_size(mddev, 0, 0);
8553         else
8554                 conf->reshape_progress = 0;
8555         conf->reshape_safe = conf->reshape_progress;
8556         write_seqcount_end(&conf->gen_lock);
8557         spin_unlock_irq(&conf->device_lock);
8558
8559         /* Now make sure any requests that proceeded on the assumption
8560          * the reshape wasn't running - like Discard or Read - have
8561          * completed.
8562          */
8563         mddev_suspend(mddev);
8564         mddev_resume(mddev);
8565
8566         /* Add some new drives, as many as will fit.
8567          * We know there are enough to make the newly sized array work.
8568          * Don't add devices if we are reducing the number of
8569          * devices in the array.  This is because it is not possible
8570          * to correctly record the "partially reconstructed" state of
8571          * such devices during the reshape and confusion could result.
8572          */
8573         if (mddev->delta_disks >= 0) {
8574                 rdev_for_each(rdev, mddev)
8575                         if (rdev->raid_disk < 0 &&
8576                             !test_bit(Faulty, &rdev->flags)) {
8577                                 if (raid5_add_disk(mddev, rdev) == 0) {
8578                                         if (rdev->raid_disk
8579                                             >= conf->previous_raid_disks)
8580                                                 set_bit(In_sync, &rdev->flags);
8581                                         else
8582                                                 rdev->recovery_offset = 0;
8583
8584                                         /* Failure here is OK */
8585                                         sysfs_link_rdev(mddev, rdev);
8586                                 }
8587                         } else if (rdev->raid_disk >= conf->previous_raid_disks
8588                                    && !test_bit(Faulty, &rdev->flags)) {
8589                                 /* This is a spare that was manually added */
8590                                 set_bit(In_sync, &rdev->flags);
8591                         }
8592
8593                 /* When a reshape changes the number of devices,
8594                  * ->degraded is measured against the larger of the
8595                  * pre and post number of devices.
8596                  */
8597                 spin_lock_irqsave(&conf->device_lock, flags);
8598                 mddev->degraded = raid5_calc_degraded(conf);
8599                 spin_unlock_irqrestore(&conf->device_lock, flags);
8600         }
8601         mddev->raid_disks = conf->raid_disks;
8602         mddev->reshape_position = conf->reshape_progress;
8603         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8604
8605         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8606         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8607         clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8608         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8609         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8610         mddev->sync_thread = md_register_thread(md_do_sync, mddev,
8611                                                 "reshape");
8612         if (!mddev->sync_thread) {
8613                 mddev->recovery = 0;
8614                 spin_lock_irq(&conf->device_lock);
8615                 write_seqcount_begin(&conf->gen_lock);
8616                 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
8617                 mddev->new_chunk_sectors =
8618                         conf->chunk_sectors = conf->prev_chunk_sectors;
8619                 mddev->new_layout = conf->algorithm = conf->prev_algo;
8620                 rdev_for_each(rdev, mddev)
8621                         rdev->new_data_offset = rdev->data_offset;
8622                 smp_wmb();
8623                 conf->generation --;
8624                 conf->reshape_progress = MaxSector;
8625                 mddev->reshape_position = MaxSector;
8626                 write_seqcount_end(&conf->gen_lock);
8627                 spin_unlock_irq(&conf->device_lock);
8628                 return -EAGAIN;
8629         }
8630         conf->reshape_checkpoint = jiffies;
8631         md_wakeup_thread(mddev->sync_thread);
8632         md_new_event();
8633         return 0;
8634 }
8635
8636 /* This is called from the reshape thread and should make any
8637  * changes needed in 'conf'
8638  */
8639 static void end_reshape(struct r5conf *conf)
8640 {
8641
8642         if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
8643                 struct md_rdev *rdev;
8644
8645                 spin_lock_irq(&conf->device_lock);
8646                 conf->previous_raid_disks = conf->raid_disks;
8647                 md_finish_reshape(conf->mddev);
8648                 smp_wmb();
8649                 conf->reshape_progress = MaxSector;
8650                 conf->mddev->reshape_position = MaxSector;
8651                 rdev_for_each(rdev, conf->mddev)
8652                         if (rdev->raid_disk >= 0 &&
8653                             !test_bit(Journal, &rdev->flags) &&
8654                             !test_bit(In_sync, &rdev->flags))
8655                                 rdev->recovery_offset = MaxSector;
8656                 spin_unlock_irq(&conf->device_lock);
8657                 wake_up(&conf->wait_for_overlap);
8658
8659                 if (conf->mddev->queue)
8660                         raid5_set_io_opt(conf);
8661         }
8662 }
8663
8664 /* This is called from the raid5d thread with mddev_lock held.
8665  * It makes config changes to the device.
8666  */
8667 static void raid5_finish_reshape(struct mddev *mddev)
8668 {
8669         struct r5conf *conf = mddev->private;
8670         struct md_rdev *rdev;
8671
8672         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8673
8674                 if (mddev->delta_disks <= 0) {
8675                         int d;
8676                         spin_lock_irq(&conf->device_lock);
8677                         mddev->degraded = raid5_calc_degraded(conf);
8678                         spin_unlock_irq(&conf->device_lock);
8679                         for (d = conf->raid_disks ;
8680                              d < conf->raid_disks - mddev->delta_disks;
8681                              d++) {
8682                                 rdev = rdev_mdlock_deref(mddev,
8683                                                          conf->disks[d].rdev);
8684                                 if (rdev)
8685                                         clear_bit(In_sync, &rdev->flags);
8686                                 rdev = rdev_mdlock_deref(mddev,
8687                                                 conf->disks[d].replacement);
8688                                 if (rdev)
8689                                         clear_bit(In_sync, &rdev->flags);
8690                         }
8691                 }
8692                 mddev->layout = conf->algorithm;
8693                 mddev->chunk_sectors = conf->chunk_sectors;
8694                 mddev->reshape_position = MaxSector;
8695                 mddev->delta_disks = 0;
8696                 mddev->reshape_backwards = 0;
8697         }
8698 }
8699
8700 static void raid5_quiesce(struct mddev *mddev, int quiesce)
8701 {
8702         struct r5conf *conf = mddev->private;
8703
8704         if (quiesce) {
8705                 /* stop all writes */
8706                 lock_all_device_hash_locks_irq(conf);
8707                 /* '2' tells resync/reshape to pause so that all
8708                  * active stripes can drain
8709                  */
8710                 r5c_flush_cache(conf, INT_MAX);
8711                 /* need a memory barrier to make sure read_one_chunk() sees
8712                  * quiesce started and reverts to slow (locked) path.
8713                  */
8714                 smp_store_release(&conf->quiesce, 2);
8715                 wait_event_cmd(conf->wait_for_quiescent,
8716                                     atomic_read(&conf->active_stripes) == 0 &&
8717                                     atomic_read(&conf->active_aligned_reads) == 0,
8718                                     unlock_all_device_hash_locks_irq(conf),
8719                                     lock_all_device_hash_locks_irq(conf));
8720                 conf->quiesce = 1;
8721                 unlock_all_device_hash_locks_irq(conf);
8722                 /* allow reshape to continue */
8723                 wake_up(&conf->wait_for_overlap);
8724         } else {
8725                 /* re-enable writes */
8726                 lock_all_device_hash_locks_irq(conf);
8727                 conf->quiesce = 0;
8728                 wake_up(&conf->wait_for_quiescent);
8729                 wake_up(&conf->wait_for_overlap);
8730                 unlock_all_device_hash_locks_irq(conf);
8731         }
8732         log_quiesce(conf, quiesce);
8733 }
8734
8735 static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8736 {
8737         struct r0conf *raid0_conf = mddev->private;
8738         sector_t sectors;
8739
8740         /* for raid0 takeover only one zone is supported */
8741         if (raid0_conf->nr_strip_zones > 1) {
8742                 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8743                         mdname(mddev));
8744                 return ERR_PTR(-EINVAL);
8745         }
8746
8747         sectors = raid0_conf->strip_zone[0].zone_end;
8748         sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8749         mddev->dev_sectors = sectors;
8750         mddev->new_level = level;
8751         mddev->new_layout = ALGORITHM_PARITY_N;
8752         mddev->new_chunk_sectors = mddev->chunk_sectors;
8753         mddev->raid_disks += 1;
8754         mddev->delta_disks = 1;
8755         /* make sure it will be not marked as dirty */
8756         mddev->recovery_cp = MaxSector;
8757
8758         return setup_conf(mddev);
8759 }
8760
8761 static void *raid5_takeover_raid1(struct mddev *mddev)
8762 {
8763         int chunksect;
8764         void *ret;
8765
8766         if (mddev->raid_disks != 2 ||
8767             mddev->degraded > 1)
8768                 return ERR_PTR(-EINVAL);
8769
8770         /* Should check if there are write-behind devices? */
8771
8772         chunksect = 64*2; /* 64K by default */
8773
8774         /* The array must be an exact multiple of chunksize */
8775         while (chunksect && (mddev->array_sectors & (chunksect-1)))
8776                 chunksect >>= 1;
8777
8778         if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
8779                 /* array size does not allow a suitable chunk size */
8780                 return ERR_PTR(-EINVAL);
8781
8782         mddev->new_level = 5;
8783         mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8784         mddev->new_chunk_sectors = chunksect;
8785
8786         ret = setup_conf(mddev);
8787         if (!IS_ERR(ret))
8788                 mddev_clear_unsupported_flags(mddev,
8789                         UNSUPPORTED_MDDEV_FLAGS);
8790         return ret;
8791 }
8792
8793 static void *raid5_takeover_raid6(struct mddev *mddev)
8794 {
8795         int new_layout;
8796
8797         switch (mddev->layout) {
8798         case ALGORITHM_LEFT_ASYMMETRIC_6:
8799                 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8800                 break;
8801         case ALGORITHM_RIGHT_ASYMMETRIC_6:
8802                 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8803                 break;
8804         case ALGORITHM_LEFT_SYMMETRIC_6:
8805                 new_layout = ALGORITHM_LEFT_SYMMETRIC;
8806                 break;
8807         case ALGORITHM_RIGHT_SYMMETRIC_6:
8808                 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8809                 break;
8810         case ALGORITHM_PARITY_0_6:
8811                 new_layout = ALGORITHM_PARITY_0;
8812                 break;
8813         case ALGORITHM_PARITY_N:
8814                 new_layout = ALGORITHM_PARITY_N;
8815                 break;
8816         default:
8817                 return ERR_PTR(-EINVAL);
8818         }
8819         mddev->new_level = 5;
8820         mddev->new_layout = new_layout;
8821         mddev->delta_disks = -1;
8822         mddev->raid_disks -= 1;
8823         return setup_conf(mddev);
8824 }
8825
8826 static int raid5_check_reshape(struct mddev *mddev)
8827 {
8828         /* For a 2-drive array, the layout and chunk size can be changed
8829          * immediately as not restriping is needed.
8830          * For larger arrays we record the new value - after validation
8831          * to be used by a reshape pass.
8832          */
8833         struct r5conf *conf = mddev->private;
8834         int new_chunk = mddev->new_chunk_sectors;
8835
8836         if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
8837                 return -EINVAL;
8838         if (new_chunk > 0) {
8839                 if (!is_power_of_2(new_chunk))
8840                         return -EINVAL;
8841                 if (new_chunk < (PAGE_SIZE>>9))
8842                         return -EINVAL;
8843                 if (mddev->array_sectors & (new_chunk-1))
8844                         /* not factor of array size */
8845                         return -EINVAL;
8846         }
8847
8848         /* They look valid */
8849
8850         if (mddev->raid_disks == 2) {
8851                 /* can make the change immediately */
8852                 if (mddev->new_layout >= 0) {
8853                         conf->algorithm = mddev->new_layout;
8854                         mddev->layout = mddev->new_layout;
8855                 }
8856                 if (new_chunk > 0) {
8857                         conf->chunk_sectors = new_chunk ;
8858                         mddev->chunk_sectors = new_chunk;
8859                 }
8860                 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8861                 md_wakeup_thread(mddev->thread);
8862         }
8863         return check_reshape(mddev);
8864 }
8865
8866 static int raid6_check_reshape(struct mddev *mddev)
8867 {
8868         int new_chunk = mddev->new_chunk_sectors;
8869
8870         if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
8871                 return -EINVAL;
8872         if (new_chunk > 0) {
8873                 if (!is_power_of_2(new_chunk))
8874                         return -EINVAL;
8875                 if (new_chunk < (PAGE_SIZE >> 9))
8876                         return -EINVAL;
8877                 if (mddev->array_sectors & (new_chunk-1))
8878                         /* not factor of array size */
8879                         return -EINVAL;
8880         }
8881
8882         /* They look valid */
8883         return check_reshape(mddev);
8884 }
8885
8886 static void *raid5_takeover(struct mddev *mddev)
8887 {
8888         /* raid5 can take over:
8889          *  raid0 - if there is only one strip zone - make it a raid4 layout
8890          *  raid1 - if there are two drives.  We need to know the chunk size
8891          *  raid4 - trivial - just use a raid4 layout.
8892          *  raid6 - Providing it is a *_6 layout
8893          */
8894         if (mddev->level == 0)
8895                 return raid45_takeover_raid0(mddev, 5);
8896         if (mddev->level == 1)
8897                 return raid5_takeover_raid1(mddev);
8898         if (mddev->level == 4) {
8899                 mddev->new_layout = ALGORITHM_PARITY_N;
8900                 mddev->new_level = 5;
8901                 return setup_conf(mddev);
8902         }
8903         if (mddev->level == 6)
8904                 return raid5_takeover_raid6(mddev);
8905
8906         return ERR_PTR(-EINVAL);
8907 }
8908
8909 static void *raid4_takeover(struct mddev *mddev)
8910 {
8911         /* raid4 can take over:
8912          *  raid0 - if there is only one strip zone
8913          *  raid5 - if layout is right
8914          */
8915         if (mddev->level == 0)
8916                 return raid45_takeover_raid0(mddev, 4);
8917         if (mddev->level == 5 &&
8918             mddev->layout == ALGORITHM_PARITY_N) {
8919                 mddev->new_layout = 0;
8920                 mddev->new_level = 4;
8921                 return setup_conf(mddev);
8922         }
8923         return ERR_PTR(-EINVAL);
8924 }
8925
8926 static struct md_personality raid5_personality;
8927
8928 static void *raid6_takeover(struct mddev *mddev)
8929 {
8930         /* Currently can only take over a raid5.  We map the
8931          * personality to an equivalent raid6 personality
8932          * with the Q block at the end.
8933          */
8934         int new_layout;
8935
8936         if (mddev->pers != &raid5_personality)
8937                 return ERR_PTR(-EINVAL);
8938         if (mddev->degraded > 1)
8939                 return ERR_PTR(-EINVAL);
8940         if (mddev->raid_disks > 253)
8941                 return ERR_PTR(-EINVAL);
8942         if (mddev->raid_disks < 3)
8943                 return ERR_PTR(-EINVAL);
8944
8945         switch (mddev->layout) {
8946         case ALGORITHM_LEFT_ASYMMETRIC:
8947                 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8948                 break;
8949         case ALGORITHM_RIGHT_ASYMMETRIC:
8950                 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8951                 break;
8952         case ALGORITHM_LEFT_SYMMETRIC:
8953                 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8954                 break;
8955         case ALGORITHM_RIGHT_SYMMETRIC:
8956                 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8957                 break;
8958         case ALGORITHM_PARITY_0:
8959                 new_layout = ALGORITHM_PARITY_0_6;
8960                 break;
8961         case ALGORITHM_PARITY_N:
8962                 new_layout = ALGORITHM_PARITY_N;
8963                 break;
8964         default:
8965                 return ERR_PTR(-EINVAL);
8966         }
8967         mddev->new_level = 6;
8968         mddev->new_layout = new_layout;
8969         mddev->delta_disks = 1;
8970         mddev->raid_disks += 1;
8971         return setup_conf(mddev);
8972 }
8973
8974 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
8975 {
8976         struct r5conf *conf;
8977         int err;
8978
8979         err = mddev_lock(mddev);
8980         if (err)
8981                 return err;
8982         conf = mddev->private;
8983         if (!conf) {
8984                 mddev_unlock(mddev);
8985                 return -ENODEV;
8986         }
8987
8988         if (strncmp(buf, "ppl", 3) == 0) {
8989                 /* ppl only works with RAID 5 */
8990                 if (!raid5_has_ppl(conf) && conf->level == 5) {
8991                         err = log_init(conf, NULL, true);
8992                         if (!err) {
8993                                 err = resize_stripes(conf, conf->pool_size);
8994                                 if (err) {
8995                                         mddev_suspend(mddev);
8996                                         log_exit(conf);
8997                                         mddev_resume(mddev);
8998                                 }
8999                         }
9000                 } else
9001                         err = -EINVAL;
9002         } else if (strncmp(buf, "resync", 6) == 0) {
9003                 if (raid5_has_ppl(conf)) {
9004                         mddev_suspend(mddev);
9005                         log_exit(conf);
9006                         mddev_resume(mddev);
9007                         err = resize_stripes(conf, conf->pool_size);
9008                 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
9009                            r5l_log_disk_error(conf)) {
9010                         bool journal_dev_exists = false;
9011                         struct md_rdev *rdev;
9012
9013                         rdev_for_each(rdev, mddev)
9014                                 if (test_bit(Journal, &rdev->flags)) {
9015                                         journal_dev_exists = true;
9016                                         break;
9017                                 }
9018
9019                         if (!journal_dev_exists) {
9020                                 mddev_suspend(mddev);
9021                                 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
9022                                 mddev_resume(mddev);
9023                         } else  /* need remove journal device first */
9024                                 err = -EBUSY;
9025                 } else
9026                         err = -EINVAL;
9027         } else {
9028                 err = -EINVAL;
9029         }
9030
9031         if (!err)
9032                 md_update_sb(mddev, 1);
9033
9034         mddev_unlock(mddev);
9035
9036         return err;
9037 }
9038
9039 static int raid5_start(struct mddev *mddev)
9040 {
9041         struct r5conf *conf = mddev->private;
9042
9043         return r5l_start(conf->log);
9044 }
9045
9046 static struct md_personality raid6_personality =
9047 {
9048         .name           = "raid6",
9049         .level          = 6,
9050         .owner          = THIS_MODULE,
9051         .make_request   = raid5_make_request,
9052         .run            = raid5_run,
9053         .start          = raid5_start,
9054         .free           = raid5_free,
9055         .status         = raid5_status,
9056         .error_handler  = raid5_error,
9057         .hot_add_disk   = raid5_add_disk,
9058         .hot_remove_disk= raid5_remove_disk,
9059         .spare_active   = raid5_spare_active,
9060         .sync_request   = raid5_sync_request,
9061         .resize         = raid5_resize,
9062         .size           = raid5_size,
9063         .check_reshape  = raid6_check_reshape,
9064         .start_reshape  = raid5_start_reshape,
9065         .finish_reshape = raid5_finish_reshape,
9066         .quiesce        = raid5_quiesce,
9067         .takeover       = raid6_takeover,
9068         .change_consistency_policy = raid5_change_consistency_policy,
9069 };
9070 static struct md_personality raid5_personality =
9071 {
9072         .name           = "raid5",
9073         .level          = 5,
9074         .owner          = THIS_MODULE,
9075         .make_request   = raid5_make_request,
9076         .run            = raid5_run,
9077         .start          = raid5_start,
9078         .free           = raid5_free,
9079         .status         = raid5_status,
9080         .error_handler  = raid5_error,
9081         .hot_add_disk   = raid5_add_disk,
9082         .hot_remove_disk= raid5_remove_disk,
9083         .spare_active   = raid5_spare_active,
9084         .sync_request   = raid5_sync_request,
9085         .resize         = raid5_resize,
9086         .size           = raid5_size,
9087         .check_reshape  = raid5_check_reshape,
9088         .start_reshape  = raid5_start_reshape,
9089         .finish_reshape = raid5_finish_reshape,
9090         .quiesce        = raid5_quiesce,
9091         .takeover       = raid5_takeover,
9092         .change_consistency_policy = raid5_change_consistency_policy,
9093 };
9094
9095 static struct md_personality raid4_personality =
9096 {
9097         .name           = "raid4",
9098         .level          = 4,
9099         .owner          = THIS_MODULE,
9100         .make_request   = raid5_make_request,
9101         .run            = raid5_run,
9102         .start          = raid5_start,
9103         .free           = raid5_free,
9104         .status         = raid5_status,
9105         .error_handler  = raid5_error,
9106         .hot_add_disk   = raid5_add_disk,
9107         .hot_remove_disk= raid5_remove_disk,
9108         .spare_active   = raid5_spare_active,
9109         .sync_request   = raid5_sync_request,
9110         .resize         = raid5_resize,
9111         .size           = raid5_size,
9112         .check_reshape  = raid5_check_reshape,
9113         .start_reshape  = raid5_start_reshape,
9114         .finish_reshape = raid5_finish_reshape,
9115         .quiesce        = raid5_quiesce,
9116         .takeover       = raid4_takeover,
9117         .change_consistency_policy = raid5_change_consistency_policy,
9118 };
9119
9120 static int __init raid5_init(void)
9121 {
9122         int ret;
9123
9124         raid5_wq = alloc_workqueue("raid5wq",
9125                 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
9126         if (!raid5_wq)
9127                 return -ENOMEM;
9128
9129         ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
9130                                       "md/raid5:prepare",
9131                                       raid456_cpu_up_prepare,
9132                                       raid456_cpu_dead);
9133         if (ret) {
9134                 destroy_workqueue(raid5_wq);
9135                 return ret;
9136         }
9137         register_md_personality(&raid6_personality);
9138         register_md_personality(&raid5_personality);
9139         register_md_personality(&raid4_personality);
9140         return 0;
9141 }
9142
9143 static void raid5_exit(void)
9144 {
9145         unregister_md_personality(&raid6_personality);
9146         unregister_md_personality(&raid5_personality);
9147         unregister_md_personality(&raid4_personality);
9148         cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
9149         destroy_workqueue(raid5_wq);
9150 }
9151
9152 module_init(raid5_init);
9153 module_exit(raid5_exit);
9154 MODULE_LICENSE("GPL");
9155 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
9156 MODULE_ALIAS("md-personality-4"); /* RAID5 */
9157 MODULE_ALIAS("md-raid5");
9158 MODULE_ALIAS("md-raid4");
9159 MODULE_ALIAS("md-level-5");
9160 MODULE_ALIAS("md-level-4");
9161 MODULE_ALIAS("md-personality-8"); /* RAID6 */
9162 MODULE_ALIAS("md-raid6");
9163 MODULE_ALIAS("md-level-6");
9164
9165 /* This used to be two separate modules, they were: */
9166 MODULE_ALIAS("raid5");
9167 MODULE_ALIAS("raid6");