mm/page_io.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *  linux/mm/page_io.c
   4  *
   5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   6  *
   7  *  Swap reorganised 29.12.95,
   8  *  Asynchronous swapping added 30.12.95. Stephen Tweedie
   9  *  Removed race in async swapping. 14.4.1996. Bruno Haible
  10  *  Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
  11  *  Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
  12  */
  13
  14 #include <linux/mm.h>
  15 #include <linux/kernel_stat.h>
  16 #include <linux/gfp.h>
  17 #include <linux/pagemap.h>
  18 #include <linux/swap.h>
  19 #include <linux/bio.h>
  20 #include <linux/swapops.h>
  21 #include <linux/writeback.h>
  22 #include <linux/frontswap.h>
  23 #include <linux/blkdev.h>
  24 #include <linux/psi.h>
  25 #include <linux/uio.h>
  26 #include <linux/sched/task.h>
  27 #include <linux/delayacct.h>
  28 #include "swap.h"
  29
  30 static void __end_swap_bio_write(struct bio *bio)
  31 {
  32         struct page *page = bio_first_page_all(bio);
  33
  34         if (bio->bi_status) {
  35                 SetPageError(page);
  36                 /*
  37                  * We failed to write the page out to swap-space.
  38                  * Re-dirty the page in order to avoid it being reclaimed.
  39                  * Also print a dire warning that things will go BAD (tm)
  40                  * very quickly.
  41                  *
  42                  * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
  43                  */
  44                 set_page_dirty(page);
  45                 pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
  46                                      MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
  47                                      (unsigned long long)bio->bi_iter.bi_sector);
  48                 ClearPageReclaim(page);
  49         }
  50         end_page_writeback(page);
  51 }
  52
  53 static void end_swap_bio_write(struct bio *bio)
  54 {
  55         __end_swap_bio_write(bio);
  56         bio_put(bio);
  57 }
  58
  59 static void __end_swap_bio_read(struct bio *bio)
  60 {
  61         struct page *page = bio_first_page_all(bio);
  62
  63         if (bio->bi_status) {
  64                 SetPageError(page);
  65                 ClearPageUptodate(page);
  66                 pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
  67                                      MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
  68                                      (unsigned long long)bio->bi_iter.bi_sector);
  69         } else {
  70                 SetPageUptodate(page);
  71         }
  72         unlock_page(page);
  73 }
  74
  75 static void end_swap_bio_read(struct bio *bio)
  76 {
  77         __end_swap_bio_read(bio);
  78         bio_put(bio);
  79 }
  80
  81 int generic_swapfile_activate(struct swap_info_struct *sis,
  82                                 struct file *swap_file,
  83                                 sector_t *span)
  84 {
  85         struct address_space *mapping = swap_file->f_mapping;
  86         struct inode *inode = mapping->host;
  87         unsigned blocks_per_page;
  88         unsigned long page_no;
  89         unsigned blkbits;
  90         sector_t probe_block;
  91         sector_t last_block;
  92         sector_t lowest_block = -1;
  93         sector_t highest_block = 0;
  94         int nr_extents = 0;
  95         int ret;
  96
  97         blkbits = inode->i_blkbits;
  98         blocks_per_page = PAGE_SIZE >> blkbits;
  99
 100         /*
 101          * Map all the blocks into the extent tree.  This code doesn't try
 102          * to be very smart.
 103          */
 104         probe_block = 0;
 105         page_no = 0;
 106         last_block = i_size_read(inode) >> blkbits;
 107         while ((probe_block + blocks_per_page) <= last_block &&
 108                         page_no < sis->max) {
 109                 unsigned block_in_page;
 110                 sector_t first_block;
 111
 112                 cond_resched();
 113
 114                 first_block = probe_block;
 115                 ret = bmap(inode, &first_block);
 116                 if (ret || !first_block)
 117                         goto bad_bmap;
 118
 119                 /*
 120                  * It must be PAGE_SIZE aligned on-disk
 121                  */
 122                 if (first_block & (blocks_per_page - 1)) {
 123                         probe_block++;
 124                         goto reprobe;
 125                 }
 126
 127                 for (block_in_page = 1; block_in_page < blocks_per_page;
 128                                         block_in_page++) {
 129                         sector_t block;
 130
 131                         block = probe_block + block_in_page;
 132                         ret = bmap(inode, &block);
 133                         if (ret || !block)
 134                                 goto bad_bmap;
 135
 136                         if (block != first_block + block_in_page) {
 137                                 /* Discontiguity */
 138                                 probe_block++;
 139                                 goto reprobe;
 140                         }
 141                 }
 142
 143                 first_block >>= (PAGE_SHIFT - blkbits);
 144                 if (page_no) {  /* exclude the header page */
 145                         if (first_block < lowest_block)
 146                                 lowest_block = first_block;
 147                         if (first_block > highest_block)
 148                                 highest_block = first_block;
 149                 }
 150
 151                 /*
 152                  * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
 153                  */
 154                 ret = add_swap_extent(sis, page_no, 1, first_block);
 155                 if (ret < 0)
 156                         goto out;
 157                 nr_extents += ret;
 158                 page_no++;
 159                 probe_block += blocks_per_page;
 160 reprobe:
 161                 continue;
 162         }
 163         ret = nr_extents;
 164         *span = 1 + highest_block - lowest_block;
 165         if (page_no == 0)
 166                 page_no = 1;    /* force Empty message */
 167         sis->max = page_no;
 168         sis->pages = page_no - 1;
 169         sis->highest_bit = page_no - 1;
 170 out:
 171         return ret;
 172 bad_bmap:
 173         pr_err("swapon: swapfile has holes\n");
 174         ret = -EINVAL;
 175         goto out;
 176 }
 177
 178 /*
 179  * We may have stale swap cache pages in memory: notice
 180  * them here and get rid of the unnecessary final write.
 181  */
 182 int swap_writepage(struct page *page, struct writeback_control *wbc)
 183 {
 184         struct folio *folio = page_folio(page);
 185         int ret;
 186
 187         if (folio_free_swap(folio)) {
 188                 folio_unlock(folio);
 189                 return 0;
 190         }
 191         /*
 192          * Arch code may have to preserve more data than just the page
 193          * contents, e.g. memory tags.
 194          */
 195         ret = arch_prepare_to_swap(&folio->page);
 196         if (ret) {
 197                 folio_mark_dirty(folio);
 198                 folio_unlock(folio);
 199                 return ret;
 200         }
 201         if (frontswap_store(&folio->page) == 0) {
 202                 folio_start_writeback(folio);
 203                 folio_unlock(folio);
 204                 folio_end_writeback(folio);
 205                 return 0;
 206         }
 207         __swap_writepage(&folio->page, wbc);
 208         return 0;
 209 }
 210
 211 static inline void count_swpout_vm_event(struct page *page)
 212 {
 213 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 214         if (unlikely(PageTransHuge(page)))
 215                 count_vm_event(THP_SWPOUT);
 216 #endif
 217         count_vm_events(PSWPOUT, thp_nr_pages(page));
 218 }
 219
 220 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
 221 static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 222 {
 223         struct cgroup_subsys_state *css;
 224         struct mem_cgroup *memcg;
 225
 226         memcg = page_memcg(page);
 227         if (!memcg)
 228                 return;
 229
 230         rcu_read_lock();
 231         css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
 232         bio_associate_blkg_from_css(bio, css);
 233         rcu_read_unlock();
 234 }
 235 #else
 236 #define bio_associate_blkg_from_page(bio, page)         do { } while (0)
 237 #endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
 238
 239 struct swap_iocb {
 240         struct kiocb            iocb;
 241         struct bio_vec          bvec[SWAP_CLUSTER_MAX];
 242         int                     pages;
 243         int                     len;
 244 };
 245 static mempool_t *sio_pool;
 246
 247 int sio_pool_init(void)
 248 {
 249         if (!sio_pool) {
 250                 mempool_t *pool = mempool_create_kmalloc_pool(
 251                         SWAP_CLUSTER_MAX, sizeof(struct swap_iocb));
 252                 if (cmpxchg(&sio_pool, NULL, pool))
 253                         mempool_destroy(pool);
 254         }
 255         if (!sio_pool)
 256                 return -ENOMEM;
 257         return 0;
 258 }
 259
 260 static void sio_write_complete(struct kiocb *iocb, long ret)
 261 {
 262         struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
 263         struct page *page = sio->bvec[0].bv_page;
 264         int p;
 265
 266         if (ret != sio->len) {
 267                 /*
 268                  * In the case of swap-over-nfs, this can be a
 269                  * temporary failure if the system has limited
 270                  * memory for allocating transmit buffers.
 271                  * Mark the page dirty and avoid
 272                  * folio_rotate_reclaimable but rate-limit the
 273                  * messages but do not flag PageError like
 274                  * the normal direct-to-bio case as it could
 275                  * be temporary.
 276                  */
 277                 pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
 278                                    ret, page_file_offset(page));
 279                 for (p = 0; p < sio->pages; p++) {
 280                         page = sio->bvec[p].bv_page;
 281                         set_page_dirty(page);
 282                         ClearPageReclaim(page);
 283                 }
 284         } else {
 285                 for (p = 0; p < sio->pages; p++)
 286                         count_swpout_vm_event(sio->bvec[p].bv_page);
 287         }
 288
 289         for (p = 0; p < sio->pages; p++)
 290                 end_page_writeback(sio->bvec[p].bv_page);
 291
 292         mempool_free(sio, sio_pool);
 293 }
 294
 295 static void swap_writepage_fs(struct page *page, struct writeback_control *wbc)
 296 {
 297         struct swap_iocb *sio = NULL;
 298         struct swap_info_struct *sis = page_swap_info(page);
 299         struct file *swap_file = sis->swap_file;
 300         loff_t pos = page_file_offset(page);
 301
 302         set_page_writeback(page);
 303         unlock_page(page);
 304         if (wbc->swap_plug)
 305                 sio = *wbc->swap_plug;
 306         if (sio) {
 307                 if (sio->iocb.ki_filp != swap_file ||
 308                     sio->iocb.ki_pos + sio->len != pos) {
 309                         swap_write_unplug(sio);
 310                         sio = NULL;
 311                 }
 312         }
 313         if (!sio) {
 314                 sio = mempool_alloc(sio_pool, GFP_NOIO);
 315                 init_sync_kiocb(&sio->iocb, swap_file);
 316                 sio->iocb.ki_complete = sio_write_complete;
 317                 sio->iocb.ki_pos = pos;
 318                 sio->pages = 0;
 319                 sio->len = 0;
 320         }
 321         bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
 322         sio->len += thp_size(page);
 323         sio->pages += 1;
 324         if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
 325                 swap_write_unplug(sio);
 326                 sio = NULL;
 327         }
 328         if (wbc->swap_plug)
 329                 *wbc->swap_plug = sio;
 330 }
 331
 332 static void swap_writepage_bdev_sync(struct page *page,
 333                 struct writeback_control *wbc, struct swap_info_struct *sis)
 334 {
 335         struct bio_vec bv;
 336         struct bio bio;
 337
 338         bio_init(&bio, sis->bdev, &bv, 1,
 339                  REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
 340         bio.bi_iter.bi_sector = swap_page_sector(page);
 341         bio_add_page(&bio, page, thp_size(page), 0);
 342
 343         bio_associate_blkg_from_page(&bio, page);
 344         count_swpout_vm_event(page);
 345
 346         set_page_writeback(page);
 347         unlock_page(page);
 348
 349         submit_bio_wait(&bio);
 350         __end_swap_bio_write(&bio);
 351 }
 352
 353 static void swap_writepage_bdev_async(struct page *page,
 354                 struct writeback_control *wbc, struct swap_info_struct *sis)
 355 {
 356         struct bio *bio;
 357
 358         bio = bio_alloc(sis->bdev, 1,
 359                         REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
 360                         GFP_NOIO);
 361         bio->bi_iter.bi_sector = swap_page_sector(page);
 362         bio->bi_end_io = end_swap_bio_write;
 363         bio_add_page(bio, page, thp_size(page), 0);
 364
 365         bio_associate_blkg_from_page(bio, page);
 366         count_swpout_vm_event(page);
 367         set_page_writeback(page);
 368         unlock_page(page);
 369         submit_bio(bio);
 370 }
 371
 372 void __swap_writepage(struct page *page, struct writeback_control *wbc)
 373 {
 374         struct swap_info_struct *sis = page_swap_info(page);
 375
 376         VM_BUG_ON_PAGE(!PageSwapCache(page), page);
 377         /*
 378          * ->flags can be updated non-atomicially (scan_swap_map_slots),
 379          * but that will never affect SWP_FS_OPS, so the data_race
 380          * is safe.
 381          */
 382         if (data_race(sis->flags & SWP_FS_OPS))
 383                 swap_writepage_fs(page, wbc);
 384         else if (sis->flags & SWP_SYNCHRONOUS_IO)
 385                 swap_writepage_bdev_sync(page, wbc, sis);
 386         else
 387                 swap_writepage_bdev_async(page, wbc, sis);
 388 }
 389
 390 void swap_write_unplug(struct swap_iocb *sio)
 391 {
 392         struct iov_iter from;
 393         struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
 394         int ret;
 395
 396         iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
 397         ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
 398         if (ret != -EIOCBQUEUED)
 399                 sio_write_complete(&sio->iocb, ret);
 400 }
 401
 402 static void sio_read_complete(struct kiocb *iocb, long ret)
 403 {
 404         struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
 405         int p;
 406
 407         if (ret == sio->len) {
 408                 for (p = 0; p < sio->pages; p++) {
 409                         struct page *page = sio->bvec[p].bv_page;
 410
 411                         SetPageUptodate(page);
 412                         unlock_page(page);
 413                 }
 414                 count_vm_events(PSWPIN, sio->pages);
 415         } else {
 416                 for (p = 0; p < sio->pages; p++) {
 417                         struct page *page = sio->bvec[p].bv_page;
 418
 419                         SetPageError(page);
 420                         ClearPageUptodate(page);
 421                         unlock_page(page);
 422                 }
 423                 pr_alert_ratelimited("Read-error on swap-device\n");
 424         }
 425         mempool_free(sio, sio_pool);
 426 }
 427
 428 static void swap_readpage_fs(struct page *page,
 429                              struct swap_iocb **plug)
 430 {
 431         struct swap_info_struct *sis = page_swap_info(page);
 432         struct swap_iocb *sio = NULL;
 433         loff_t pos = page_file_offset(page);
 434
 435         if (plug)
 436                 sio = *plug;
 437         if (sio) {
 438                 if (sio->iocb.ki_filp != sis->swap_file ||
 439                     sio->iocb.ki_pos + sio->len != pos) {
 440                         swap_read_unplug(sio);
 441                         sio = NULL;
 442                 }
 443         }
 444         if (!sio) {
 445                 sio = mempool_alloc(sio_pool, GFP_KERNEL);
 446                 init_sync_kiocb(&sio->iocb, sis->swap_file);
 447                 sio->iocb.ki_pos = pos;
 448                 sio->iocb.ki_complete = sio_read_complete;
 449                 sio->pages = 0;
 450                 sio->len = 0;
 451         }
 452         bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
 453         sio->len += thp_size(page);
 454         sio->pages += 1;
 455         if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
 456                 swap_read_unplug(sio);
 457                 sio = NULL;
 458         }
 459         if (plug)
 460                 *plug = sio;
 461 }
 462
 463 static void swap_readpage_bdev_sync(struct page *page,
 464                 struct swap_info_struct *sis)
 465 {
 466         struct bio_vec bv;
 467         struct bio bio;
 468
 469         bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
 470         bio.bi_iter.bi_sector = swap_page_sector(page);
 471         bio_add_page(&bio, page, thp_size(page), 0);
 472         /*
 473          * Keep this task valid during swap readpage because the oom killer may
 474          * attempt to access it in the page fault retry time check.
 475          */
 476         get_task_struct(current);
 477         count_vm_event(PSWPIN);
 478         submit_bio_wait(&bio);
 479         __end_swap_bio_read(&bio);
 480         put_task_struct(current);
 481 }
 482
 483 static void swap_readpage_bdev_async(struct page *page,
 484                 struct swap_info_struct *sis)
 485 {
 486         struct bio *bio;
 487
 488         bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
 489         bio->bi_iter.bi_sector = swap_page_sector(page);
 490         bio->bi_end_io = end_swap_bio_read;
 491         bio_add_page(bio, page, thp_size(page), 0);
 492         count_vm_event(PSWPIN);
 493         submit_bio(bio);
 494 }
 495
 496 void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug)
 497 {
 498         struct swap_info_struct *sis = page_swap_info(page);
 499         bool workingset = PageWorkingset(page);
 500         unsigned long pflags;
 501         bool in_thrashing;
 502
 503         VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
 504         VM_BUG_ON_PAGE(!PageLocked(page), page);
 505         VM_BUG_ON_PAGE(PageUptodate(page), page);
 506
 507         /*
 508          * Count submission time as memory stall and delay. When the device
 509          * is congested, or the submitting cgroup IO-throttled, submission
 510          * can be a significant part of overall IO time.
 511          */
 512         if (workingset) {
 513                 delayacct_thrashing_start(&in_thrashing);
 514                 psi_memstall_enter(&pflags);
 515         }
 516         delayacct_swapin_start();
 517
 518         if (frontswap_load(page) == 0) {
 519                 SetPageUptodate(page);
 520                 unlock_page(page);
 521         } else if (data_race(sis->flags & SWP_FS_OPS)) {
 522                 swap_readpage_fs(page, plug);
 523         } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) {
 524                 swap_readpage_bdev_sync(page, sis);
 525         } else {
 526                 swap_readpage_bdev_async(page, sis);
 527         }
 528
 529         if (workingset) {
 530                 delayacct_thrashing_end(&in_thrashing);
 531                 psi_memstall_leave(&pflags);
 532         }
 533         delayacct_swapin_end();
 534 }
 535
 536 void __swap_read_unplug(struct swap_iocb *sio)
 537 {
 538         struct iov_iter from;
 539         struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
 540         int ret;
 541
 542         iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
 543         ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
 544         if (ret != -EIOCBQUEUED)
 545                 sio_read_complete(&sio->iocb, ret);
 546 }