ITER_PIPE: helper for getting pipe buffer by index
linux.git: lib/iov_iter.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <crypto/hash.h>
3 #include <linux/export.h>
4 #include <linux/bvec.h>
5 #include <linux/fault-inject-usercopy.h>
6 #include <linux/uio.h>
7 #include <linux/pagemap.h>
8 #include <linux/highmem.h>
9 #include <linux/slab.h>
10 #include <linux/vmalloc.h>
11 #include <linux/splice.h>
12 #include <linux/compat.h>
13 #include <net/checksum.h>
14 #include <linux/scatterlist.h>
15 #include <linux/instrumented.h>
16
17 #define PIPE_PARANOIA /* for now */
18
19 /* covers ubuf and kbuf alike */
20 #define iterate_buf(i, n, base, len, off, __p, STEP) {          \
21         size_t __maybe_unused off = 0;                          \
22         len = n;                                                \
23         base = __p + i->iov_offset;                             \
24         len -= (STEP);                                          \
25         i->iov_offset += len;                                   \
26         n = len;                                                \
27 }
28
29 /* covers iovec and kvec alike */
30 #define iterate_iovec(i, n, base, len, off, __p, STEP) {        \
31         size_t off = 0;                                         \
32         size_t skip = i->iov_offset;                            \
33         do {                                                    \
34                 len = min(n, __p->iov_len - skip);              \
35                 if (likely(len)) {                              \
36                         base = __p->iov_base + skip;            \
37                         len -= (STEP);                          \
38                         off += len;                             \
39                         skip += len;                            \
40                         n -= len;                               \
41                         if (skip < __p->iov_len)                \
42                                 break;                          \
43                 }                                               \
44                 __p++;                                          \
45                 skip = 0;                                       \
46         } while (n);                                            \
47         i->iov_offset = skip;                                   \
48         n = off;                                                \
49 }
50
51 #define iterate_bvec(i, n, base, len, off, p, STEP) {           \
52         size_t off = 0;                                         \
53         unsigned skip = i->iov_offset;                          \
54         while (n) {                                             \
55                 unsigned offset = p->bv_offset + skip;          \
56                 unsigned left;                                  \
57                 void *kaddr = kmap_local_page(p->bv_page +      \
58                                         offset / PAGE_SIZE);    \
59                 base = kaddr + offset % PAGE_SIZE;              \
60                 len = min(min(n, (size_t)(p->bv_len - skip)),   \
61                      (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \
62                 left = (STEP);                                  \
63                 kunmap_local(kaddr);                            \
64                 len -= left;                                    \
65                 off += len;                                     \
66                 skip += len;                                    \
67                 if (skip == p->bv_len) {                        \
68                         skip = 0;                               \
69                         p++;                                    \
70                 }                                               \
71                 n -= len;                                       \
72                 if (left)                                       \
73                         break;                                  \
74         }                                                       \
75         i->iov_offset = skip;                                   \
76         n = off;                                                \
77 }
78
79 #define iterate_xarray(i, n, base, len, __off, STEP) {          \
80         __label__ __out;                                        \
81         size_t __off = 0;                                       \
82         struct folio *folio;                                    \
83         loff_t start = i->xarray_start + i->iov_offset;         \
84         pgoff_t index = start / PAGE_SIZE;                      \
85         XA_STATE(xas, i->xarray, index);                        \
86                                                                 \
87         len = PAGE_SIZE - offset_in_page(start);                \
88         rcu_read_lock();                                        \
89         xas_for_each(&xas, folio, ULONG_MAX) {                  \
90                 unsigned left;                                  \
91                 size_t offset;                                  \
92                 if (xas_retry(&xas, folio))                     \
93                         continue;                               \
94                 if (WARN_ON(xa_is_value(folio)))                \
95                         break;                                  \
96                 if (WARN_ON(folio_test_hugetlb(folio)))         \
97                         break;                                  \
98                 offset = offset_in_folio(folio, start + __off); \
99                 while (offset < folio_size(folio)) {            \
100                         base = kmap_local_folio(folio, offset); \
101                         len = min(n, len);                      \
102                         left = (STEP);                          \
103                         kunmap_local(base);                     \
104                         len -= left;                            \
105                         __off += len;                           \
106                         n -= len;                               \
107                         if (left || n == 0)                     \
108                                 goto __out;                     \
109                         offset += len;                          \
110                         len = PAGE_SIZE;                        \
111                 }                                               \
112         }                                                       \
113 __out:                                                          \
114         rcu_read_unlock();                                      \
115         i->iov_offset += __off;                                 \
116         n = __off;                                              \
117 }
118
119 #define __iterate_and_advance(i, n, base, len, off, I, K) {     \
120         if (unlikely(i->count < n))                             \
121                 n = i->count;                                   \
122         if (likely(n)) {                                        \
123                 if (likely(iter_is_ubuf(i))) {                  \
124                         void __user *base;                      \
125                         size_t len;                             \
126                         iterate_buf(i, n, base, len, off,       \
127                                                 i->ubuf, (I))   \
128                 } else if (likely(iter_is_iovec(i))) {          \
129                         const struct iovec *iov = i->iov;       \
130                         void __user *base;                      \
131                         size_t len;                             \
132                         iterate_iovec(i, n, base, len, off,     \
133                                                 iov, (I))       \
134                         i->nr_segs -= iov - i->iov;             \
135                         i->iov = iov;                           \
136                 } else if (iov_iter_is_bvec(i)) {               \
137                         const struct bio_vec *bvec = i->bvec;   \
138                         void *base;                             \
139                         size_t len;                             \
140                         iterate_bvec(i, n, base, len, off,      \
141                                                 bvec, (K))      \
142                         i->nr_segs -= bvec - i->bvec;           \
143                         i->bvec = bvec;                         \
144                 } else if (iov_iter_is_kvec(i)) {               \
145                         const struct kvec *kvec = i->kvec;      \
146                         void *base;                             \
147                         size_t len;                             \
148                         iterate_iovec(i, n, base, len, off,     \
149                                                 kvec, (K))      \
150                         i->nr_segs -= kvec - i->kvec;           \
151                         i->kvec = kvec;                         \
152                 } else if (iov_iter_is_xarray(i)) {             \
153                         void *base;                             \
154                         size_t len;                             \
155                         iterate_xarray(i, n, base, len, off,    \
156                                                         (K))    \
157                 }                                               \
158                 i->count -= n;                                  \
159         }                                                       \
160 }
161 #define iterate_and_advance(i, n, base, len, off, I, K) \
162         __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))
163
164 static int copyout(void __user *to, const void *from, size_t n)
165 {
166         if (should_fail_usercopy())
167                 return n;
168         if (access_ok(to, n)) {
169                 instrument_copy_to_user(to, from, n);
170                 n = raw_copy_to_user(to, from, n);
171         }
172         return n;
173 }
174
175 static int copyin(void *to, const void __user *from, size_t n)
176 {
177         if (should_fail_usercopy())
178                 return n;
179         if (access_ok(from, n)) {
180                 instrument_copy_from_user(to, from, n);
181                 n = raw_copy_from_user(to, from, n);
182         }
183         return n;
184 }
185
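/*
 * Editor's note, not part of lib/iov_iter.c: a minimal sketch of what the
 * iterate_* machinery above boils down to for the simplest case, an
 * ITER_UBUF destination with copyout() as the user-side step.  The step
 * returns the number of bytes it could NOT process, and the iterator is
 * advanced by what succeeded.  The function name is hypothetical.
 */
static size_t example_expanded_ubuf_copyout(const void *addr, size_t n,
                                            struct iov_iter *i)
{
        void __user *base = i->ubuf + i->iov_offset;
        size_t len;

        if (n > i->count)                       /* clamp, as __iterate_and_advance does */
                n = i->count;
        len = n - copyout(base, addr, n);       /* bytes actually copied */
        i->iov_offset += len;
        i->count -= len;
        return len;
}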
186 static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe,
187                                            unsigned int slot)
188 {
189         return &pipe->bufs[slot & (pipe->ring_size - 1)];
190 }
191
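/*
 * Editor's note, not part of the file: pipe_buf() is the helper this commit
 * introduces.  It hides the ring-index masking that the rest of this file
 * still open-codes as
 *
 *      struct pipe_buffer *buf = &pipe->bufs[slot & (pipe->ring_size - 1)];
 *
 * which is valid because the pipe ring size is always a power of two.
 */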
192 #ifdef PIPE_PARANOIA
193 static bool sanity(const struct iov_iter *i)
194 {
195         struct pipe_inode_info *pipe = i->pipe;
196         unsigned int p_head = pipe->head;
197         unsigned int p_tail = pipe->tail;
198         unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
199         unsigned int i_head = i->head;
200         unsigned int idx;
201
202         if (i->iov_offset) {
203                 struct pipe_buffer *p;
204                 if (unlikely(p_occupancy == 0))
205                         goto Bad;       // pipe must be non-empty
206                 if (unlikely(i_head != p_head - 1))
207                         goto Bad;       // must be at the last buffer...
208
209                 p = pipe_buf(pipe, i_head);
210                 if (unlikely(p->offset + p->len != i->iov_offset))
211                         goto Bad;       // ... at the end of segment
212         } else {
213                 if (i_head != p_head)
214                         goto Bad;       // must be right after the last buffer
215         }
216         return true;
217 Bad:
218         printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
219         printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
220                         p_head, p_tail, pipe->ring_size);
221         for (idx = 0; idx < pipe->ring_size; idx++)
222                 printk(KERN_ERR "[%p %p %d %d]\n",
223                         pipe->bufs[idx].ops,
224                         pipe->bufs[idx].page,
225                         pipe->bufs[idx].offset,
226                         pipe->bufs[idx].len);
227         WARN_ON(1);
228         return false;
229 }
230 #else
231 #define sanity(i) true
232 #endif
233
234 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
235                          struct iov_iter *i)
236 {
237         struct pipe_inode_info *pipe = i->pipe;
238         struct pipe_buffer *buf;
239         unsigned int p_tail = pipe->tail;
240         unsigned int p_mask = pipe->ring_size - 1;
241         unsigned int i_head = i->head;
242         size_t off;
243
244         if (unlikely(bytes > i->count))
245                 bytes = i->count;
246
247         if (unlikely(!bytes))
248                 return 0;
249
250         if (!sanity(i))
251                 return 0;
252
253         off = i->iov_offset;
254         buf = &pipe->bufs[i_head & p_mask];
255         if (off) {
256                 if (offset == off && buf->page == page) {
257                         /* merge with the last one */
258                         buf->len += bytes;
259                         i->iov_offset += bytes;
260                         goto out;
261                 }
262                 i_head++;
263                 buf = &pipe->bufs[i_head & p_mask];
264         }
265         if (pipe_full(i_head, p_tail, pipe->max_usage))
266                 return 0;
267
268         buf->ops = &page_cache_pipe_buf_ops;
269         buf->flags = 0;
270         get_page(page);
271         buf->page = page;
272         buf->offset = offset;
273         buf->len = bytes;
274
275         pipe->head = i_head + 1;
276         i->iov_offset = offset + bytes;
277         i->head = i_head;
278 out:
279         i->count -= bytes;
280         return bytes;
281 }
282
283 /*
284  * fault_in_iov_iter_readable - fault in iov iterator for reading
285  * @i: iterator
286  * @size: maximum length
287  *
288  * Fault in one or more iovecs of the given iov_iter, to a maximum length of
289  * @size.  For each iovec, fault in each page that constitutes the iovec.
290  *
291  * Returns the number of bytes not faulted in (like copy_to_user() and
292  * copy_from_user()).
293  *
294  * Always returns 0 for non-userspace iterators.
295  */
296 size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
297 {
298         if (iter_is_ubuf(i)) {
299                 size_t n = min(size, iov_iter_count(i));
300                 n -= fault_in_readable(i->ubuf + i->iov_offset, n);
301                 return size - n;
302         } else if (iter_is_iovec(i)) {
303                 size_t count = min(size, iov_iter_count(i));
304                 const struct iovec *p;
305                 size_t skip;
306
307                 size -= count;
308                 for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
309                         size_t len = min(count, p->iov_len - skip);
310                         size_t ret;
311
312                         if (unlikely(!len))
313                                 continue;
314                         ret = fault_in_readable(p->iov_base + skip, len);
315                         count -= len - ret;
316                         if (ret)
317                                 break;
318                 }
319                 return count + size;
320         }
321         return 0;
322 }
323 EXPORT_SYMBOL(fault_in_iov_iter_readable);
324
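/*
 * Editor's sketch, not part of the file: the usual caller-side pattern,
 * modeled on the generic buffered-write path.  Fault the user pages in while
 * no locks are held, then do the copy with page faults disabled (the atomic
 * copy below kmaps the page with faults off); a short copy just means the
 * caller should retry.  The helper name is hypothetical and it assumes
 * offset + bytes fits within @page.
 */
static ssize_t example_write_chunk(struct page *page, size_t offset,
                                   size_t bytes, struct iov_iter *from)
{
        size_t copied;

        if (unlikely(fault_in_iov_iter_readable(from, bytes) == bytes))
                return -EFAULT;         /* nothing could be faulted in */

        /* normally done under the page/folio lock */
        copied = copy_page_from_iter_atomic(page, offset, bytes, from);
        return copied;                  /* may be short; caller retries */
}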
325 /*
326  * fault_in_iov_iter_writeable - fault in iov iterator for writing
327  * @i: iterator
328  * @size: maximum length
329  *
330  * Faults in the iterator using get_user_pages(), i.e., without triggering
331  * hardware page faults.  This is primarily useful when we already know that
332  * some or all of the pages in @i aren't in memory.
333  *
334  * Returns the number of bytes not faulted in, like copy_to_user() and
335  * copy_from_user().
336  *
337  * Always returns 0 for non-user-space iterators.
338  */
339 size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
340 {
341         if (iter_is_ubuf(i)) {
342                 size_t n = min(size, iov_iter_count(i));
343                 n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
344                 return size - n;
345         } else if (iter_is_iovec(i)) {
346                 size_t count = min(size, iov_iter_count(i));
347                 const struct iovec *p;
348                 size_t skip;
349
350                 size -= count;
351                 for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
352                         size_t len = min(count, p->iov_len - skip);
353                         size_t ret;
354
355                         if (unlikely(!len))
356                                 continue;
357                         ret = fault_in_safe_writeable(p->iov_base + skip, len);
358                         count -= len - ret;
359                         if (ret)
360                                 break;
361                 }
362                 return count + size;
363         }
364         return 0;
365 }
366 EXPORT_SYMBOL(fault_in_iov_iter_writeable);
367
368 void iov_iter_init(struct iov_iter *i, unsigned int direction,
369                         const struct iovec *iov, unsigned long nr_segs,
370                         size_t count)
371 {
372         WARN_ON(direction & ~(READ | WRITE));
373         *i = (struct iov_iter) {
374                 .iter_type = ITER_IOVEC,
375                 .nofault = false,
376                 .user_backed = true,
377                 .data_source = direction,
378                 .iov = iov,
379                 .nr_segs = nr_segs,
380                 .iov_offset = 0,
381                 .count = count
382         };
383 }
384 EXPORT_SYMBOL(iov_iter_init);
385
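/*
 * Editor's sketch, not part of the file: how the syscall paths wrap a user
 * iovec array.  READ means data will be copied *to* the user buffers (a
 * read), WRITE means the buffers are the data source (a write).  Variables
 * are illustrative only.
 */
static void example_init_read_iter(struct iov_iter *iter,
                                   const struct iovec *vec,
                                   unsigned long nr_segs, size_t total)
{
        iov_iter_init(iter, READ, vec, nr_segs, total);
        /* iov_iter_count(iter) == total until the iterator is advanced */
}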
386 static inline bool allocated(struct pipe_buffer *buf)
387 {
388         return buf->ops == &default_pipe_buf_ops;
389 }
390
391 static inline void data_start(const struct iov_iter *i,
392                               unsigned int *iter_headp, size_t *offp)
393 {
394         unsigned int iter_head = i->head;
395         size_t off = i->iov_offset;
396
397         if (off && (!allocated(pipe_buf(i->pipe, iter_head)) ||
398                     off == PAGE_SIZE)) {
399                 iter_head++;
400                 off = 0;
401         }
402         *iter_headp = iter_head;
403         *offp = off;
404 }
405
406 static size_t push_pipe(struct iov_iter *i, size_t size,
407                         int *iter_headp, size_t *offp)
408 {
409         struct pipe_inode_info *pipe = i->pipe;
410         unsigned int p_tail = pipe->tail;
411         unsigned int p_mask = pipe->ring_size - 1;
412         unsigned int iter_head;
413         size_t off;
414         ssize_t left;
415
416         if (unlikely(size > i->count))
417                 size = i->count;
418         if (unlikely(!size))
419                 return 0;
420
421         left = size;
422         data_start(i, &iter_head, &off);
423         *iter_headp = iter_head;
424         *offp = off;
425         if (off) {
426                 left -= PAGE_SIZE - off;
427                 if (left <= 0) {
428                         pipe->bufs[iter_head & p_mask].len += size;
429                         return size;
430                 }
431                 pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
432                 iter_head++;
433         }
434         while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
435                 struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
436                 struct page *page = alloc_page(GFP_USER);
437                 if (!page)
438                         break;
439
440                 buf->ops = &default_pipe_buf_ops;
441                 buf->flags = 0;
442                 buf->page = page;
443                 buf->offset = 0;
444                 buf->len = min_t(ssize_t, left, PAGE_SIZE);
445                 left -= buf->len;
446                 iter_head++;
447                 pipe->head = iter_head;
448
449                 if (left == 0)
450                         return size;
451         }
452         return size - left;
453 }
454
455 static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
456                                 struct iov_iter *i)
457 {
458         struct pipe_inode_info *pipe = i->pipe;
459         unsigned int p_mask = pipe->ring_size - 1;
460         unsigned int i_head;
461         size_t n, off;
462
463         if (!sanity(i))
464                 return 0;
465
466         bytes = n = push_pipe(i, bytes, &i_head, &off);
467         if (unlikely(!n))
468                 return 0;
469         do {
470                 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
471                 memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
472                 i->head = i_head;
473                 i->iov_offset = off + chunk;
474                 n -= chunk;
475                 addr += chunk;
476                 off = 0;
477                 i_head++;
478         } while (n);
479         i->count -= bytes;
480         return bytes;
481 }
482
483 static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
484                               __wsum sum, size_t off)
485 {
486         __wsum next = csum_partial_copy_nocheck(from, to, len);
487         return csum_block_add(sum, next, off);
488 }
489
490 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
491                                          struct iov_iter *i, __wsum *sump)
492 {
493         struct pipe_inode_info *pipe = i->pipe;
494         unsigned int p_mask = pipe->ring_size - 1;
495         __wsum sum = *sump;
496         size_t off = 0;
497         unsigned int i_head;
498         size_t r;
499
500         if (!sanity(i))
501                 return 0;
502
503         bytes = push_pipe(i, bytes, &i_head, &r);
504         while (bytes) {
505                 size_t chunk = min_t(size_t, bytes, PAGE_SIZE - r);
506                 char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
507                 sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off);
508                 kunmap_local(p);
509                 i->head = i_head;
510                 i->iov_offset = r + chunk;
511                 bytes -= chunk;
512                 off += chunk;
513                 r = 0;
514                 i_head++;
515         }
516         *sump = sum;
517         i->count -= off;
518         return off;
519 }
520
521 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
522 {
523         if (unlikely(iov_iter_is_pipe(i)))
524                 return copy_pipe_to_iter(addr, bytes, i);
525         if (user_backed_iter(i))
526                 might_fault();
527         iterate_and_advance(i, bytes, base, len, off,
528                 copyout(base, addr + off, len),
529                 memcpy(base, addr + off, len)
530         )
531
532         return bytes;
533 }
534 EXPORT_SYMBOL(_copy_to_iter);
535
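/*
 * Editor's sketch, not part of the file: a typical external consumer of the
 * copy-out primitive (via the copy_to_iter() wrapper from <linux/uio.h>), as
 * a read_iter()-style handler would use it.  The message and helper name are
 * made up for illustration.
 */
static ssize_t example_emit_string(struct iov_iter *to)
{
        static const char msg[] = "hello from the kernel\n";
        size_t n = min_t(size_t, iov_iter_count(to), sizeof(msg));

        n = copy_to_iter(msg, n, to);   /* advances @to by the bytes copied */
        return n ? n : -EFAULT;
}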
536 #ifdef CONFIG_ARCH_HAS_COPY_MC
537 static int copyout_mc(void __user *to, const void *from, size_t n)
538 {
539         if (access_ok(to, n)) {
540                 instrument_copy_to_user(to, from, n);
541                 n = copy_mc_to_user((__force void *) to, from, n);
542         }
543         return n;
544 }
545
546 static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
547                                 struct iov_iter *i)
548 {
549         struct pipe_inode_info *pipe = i->pipe;
550         unsigned int p_mask = pipe->ring_size - 1;
551         unsigned int i_head;
552         unsigned int valid = pipe->head;
553         size_t n, off, xfer = 0;
554
555         if (!sanity(i))
556                 return 0;
557
558         n = push_pipe(i, bytes, &i_head, &off);
559         while (n) {
560                 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
561                 char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
562                 unsigned long rem;
563                 rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
564                 chunk -= rem;
565                 kunmap_local(p);
566                 if (chunk) {
567                         i->head = i_head;
568                         i->iov_offset = off + chunk;
569                         xfer += chunk;
570                         valid = i_head + 1;
571                 }
572                 if (rem) {
573                         pipe->bufs[i_head & p_mask].len -= rem;
574                         pipe_discard_from(pipe, valid);
575                         break;
576                 }
577                 n -= chunk;
578                 off = 0;
579                 i_head++;
580         }
581         i->count -= xfer;
582         return xfer;
583 }
584
585 /**
586  * _copy_mc_to_iter - copy to iter with source memory error exception handling
587  * @addr: source kernel address
588  * @bytes: total transfer length
589  * @i: destination iterator
590  *
591  * The pmem driver deploys this for the dax operation
592  * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
593  * block-layer). Upon #MC, read(2) aborts and returns EIO or the number
594  * of bytes successfully copied.
595  *
596  * The main differences between this and the typical _copy_to_iter() are:
597  *
598  * * Typical tail/residue handling after a fault retries the copy
599  *   byte-by-byte until the fault happens again. Re-triggering machine
600  *   checks is potentially fatal so the implementation uses source
601  *   alignment and poison alignment assumptions to avoid re-triggering
602  *   hardware exceptions.
603  *
604  * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
605  *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
606  *   a short copy.
607  *
608  * Return: number of bytes copied (may be %0)
609  */
610 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
611 {
612         if (unlikely(iov_iter_is_pipe(i)))
613                 return copy_mc_pipe_to_iter(addr, bytes, i);
614         if (user_backed_iter(i))
615                 might_fault();
616         __iterate_and_advance(i, bytes, base, len, off,
617                 copyout_mc(base, addr + off, len),
618                 copy_mc_to_kernel(base, addr + off, len)
619         )
620
621         return bytes;
622 }
623 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
624 #endif /* CONFIG_ARCH_HAS_COPY_MC */
625
626 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
627 {
628         if (unlikely(iov_iter_is_pipe(i))) {
629                 WARN_ON(1);
630                 return 0;
631         }
632         if (user_backed_iter(i))
633                 might_fault();
634         iterate_and_advance(i, bytes, base, len, off,
635                 copyin(addr + off, base, len),
636                 memcpy(addr + off, base, len)
637         )
638
639         return bytes;
640 }
641 EXPORT_SYMBOL(_copy_from_iter);
642
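/*
 * Editor's sketch, not part of the file: the mirror image of the example
 * above, pulling data from a source iterator into a kernel buffer the way a
 * write_iter()-style handler would (through the copy_from_iter() wrapper).
 * A short copy here means the user memory faulted part-way through.
 */
static ssize_t example_absorb(void *buf, size_t len, struct iov_iter *from)
{
        size_t want = min_t(size_t, len, iov_iter_count(from));

        if (copy_from_iter(buf, want, from) != want)
                return -EFAULT;
        return want;
}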
643 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
644 {
645         if (unlikely(iov_iter_is_pipe(i))) {
646                 WARN_ON(1);
647                 return 0;
648         }
649         iterate_and_advance(i, bytes, base, len, off,
650                 __copy_from_user_inatomic_nocache(addr + off, base, len),
651                 memcpy(addr + off, base, len)
652         )
653
654         return bytes;
655 }
656 EXPORT_SYMBOL(_copy_from_iter_nocache);
657
658 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
659 /**
660  * _copy_from_iter_flushcache - write destination through cpu cache
661  * @addr: destination kernel address
662  * @bytes: total transfer length
663  * @i: source iterator
664  *
665  * The pmem driver arranges for filesystem-dax to use this facility via
666  * dax_copy_from_iter() for ensuring that writes to persistent memory
667  * are flushed through the CPU cache. It is differentiated from
668  * _copy_from_iter_nocache() in that guarantees all data is flushed for
669  * all iterator types. The _copy_from_iter_nocache() only attempts to
670  * bypass the cache for the ITER_IOVEC case, and on some archs may use
671  * instructions that strand dirty-data in the cache.
672  *
673  * Return: number of bytes copied (may be %0)
674  */
675 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
676 {
677         if (unlikely(iov_iter_is_pipe(i))) {
678                 WARN_ON(1);
679                 return 0;
680         }
681         iterate_and_advance(i, bytes, base, len, off,
682                 __copy_from_user_flushcache(addr + off, base, len),
683                 memcpy_flushcache(addr + off, base, len)
684         )
685
686         return bytes;
687 }
688 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
689 #endif
690
691 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
692 {
693         struct page *head;
694         size_t v = n + offset;
695
696         /*
697          * The general case needs to access the page order in order
698          * to compute the page size.
699          * However, we mostly deal with order-0 pages and thus can
700          * avoid a possible cache line miss for requests that fit in a
701          * single page (and hence in a compound page of any order).
702          */
703         if (n <= v && v <= PAGE_SIZE)
704                 return true;
705
706         head = compound_head(page);
707         v += (page - head) << PAGE_SHIFT;
708
709         if (likely(n <= v && v <= (page_size(head))))
710                 return true;
711         WARN_ON(1);
712         return false;
713 }
714
715 static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
716                          struct iov_iter *i)
717 {
718         if (unlikely(iov_iter_is_pipe(i))) {
719                 return copy_page_to_iter_pipe(page, offset, bytes, i);
720         } else {
721                 void *kaddr = kmap_local_page(page);
722                 size_t wanted = _copy_to_iter(kaddr + offset, bytes, i);
723                 kunmap_local(kaddr);
724                 return wanted;
725         }
726 }
727
728 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
729                          struct iov_iter *i)
730 {
731         size_t res = 0;
732         if (unlikely(!page_copy_sane(page, offset, bytes)))
733                 return 0;
734         page += offset / PAGE_SIZE; // first subpage
735         offset %= PAGE_SIZE;
736         while (1) {
737                 size_t n = __copy_page_to_iter(page, offset,
738                                 min(bytes, (size_t)PAGE_SIZE - offset), i);
739                 res += n;
740                 bytes -= n;
741                 if (!bytes || !n)
742                         break;
743                 offset += n;
744                 if (offset == PAGE_SIZE) {
745                         page++;
746                         offset = 0;
747                 }
748         }
749         return res;
750 }
751 EXPORT_SYMBOL(copy_page_to_iter);
752
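/*
 * Editor's sketch, not part of the file: how a read path hands (part of) a
 * page to the destination iterator.  copy_page_to_iter() does the
 * kmap/kunmap itself, or splices a page reference for ITER_PIPE, so the
 * caller only has to track how far it got.  Names are illustrative.
 */
static size_t example_send_page(struct page *page, size_t offset, size_t len,
                                struct iov_iter *to)
{
        size_t copied = copy_page_to_iter(page, offset, len, to);

        /* may be short if the user buffer faulted or the pipe filled up */
        return copied;
}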
753 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
754                          struct iov_iter *i)
755 {
756         if (page_copy_sane(page, offset, bytes)) {
757                 void *kaddr = kmap_local_page(page);
758                 size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
759                 kunmap_local(kaddr);
760                 return wanted;
761         }
762         return 0;
763 }
764 EXPORT_SYMBOL(copy_page_from_iter);
765
766 static size_t pipe_zero(size_t bytes, struct iov_iter *i)
767 {
768         struct pipe_inode_info *pipe = i->pipe;
769         unsigned int p_mask = pipe->ring_size - 1;
770         unsigned int i_head;
771         size_t n, off;
772
773         if (!sanity(i))
774                 return 0;
775
776         bytes = n = push_pipe(i, bytes, &i_head, &off);
777         if (unlikely(!n))
778                 return 0;
779
780         do {
781                 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
782                 char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
783                 memset(p + off, 0, chunk);
784                 kunmap_local(p);
785                 i->head = i_head;
786                 i->iov_offset = off + chunk;
787                 n -= chunk;
788                 off = 0;
789                 i_head++;
790         } while (n);
791         i->count -= bytes;
792         return bytes;
793 }
794
795 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
796 {
797         if (unlikely(iov_iter_is_pipe(i)))
798                 return pipe_zero(bytes, i);
799         iterate_and_advance(i, bytes, base, len, count,
800                 clear_user(base, len),
801                 memset(base, 0, len)
802         )
803
804         return bytes;
805 }
806 EXPORT_SYMBOL(iov_iter_zero);
807
808 size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
809                                   struct iov_iter *i)
810 {
811         char *kaddr = kmap_atomic(page), *p = kaddr + offset;
812         if (unlikely(!page_copy_sane(page, offset, bytes))) {
813                 kunmap_atomic(kaddr);
814                 return 0;
815         }
816         if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
817                 kunmap_atomic(kaddr);
818                 WARN_ON(1);
819                 return 0;
820         }
821         iterate_and_advance(i, bytes, base, len, off,
822                 copyin(p + off, base, len),
823                 memcpy(p + off, base, len)
824         )
825         kunmap_atomic(kaddr);
826         return bytes;
827 }
828 EXPORT_SYMBOL(copy_page_from_iter_atomic);
829
830 static inline void pipe_truncate(struct iov_iter *i)
831 {
832         struct pipe_inode_info *pipe = i->pipe;
833         unsigned int p_tail = pipe->tail;
834         unsigned int p_head = pipe->head;
835         unsigned int p_mask = pipe->ring_size - 1;
836
837         if (!pipe_empty(p_head, p_tail)) {
838                 struct pipe_buffer *buf;
839                 unsigned int i_head = i->head;
840                 size_t off = i->iov_offset;
841
842                 if (off) {
843                         buf = &pipe->bufs[i_head & p_mask];
844                         buf->len = off - buf->offset;
845                         i_head++;
846                 }
847                 while (p_head != i_head) {
848                         p_head--;
849                         pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
850                 }
851
852                 pipe->head = p_head;
853         }
854 }
855
856 static void pipe_advance(struct iov_iter *i, size_t size)
857 {
858         struct pipe_inode_info *pipe = i->pipe;
859         if (size) {
860                 struct pipe_buffer *buf;
861                 unsigned int p_mask = pipe->ring_size - 1;
862                 unsigned int i_head = i->head;
863                 size_t off = i->iov_offset, left = size;
864
865                 if (off) /* make it relative to the beginning of buffer */
866                         left += off - pipe->bufs[i_head & p_mask].offset;
867                 while (1) {
868                         buf = &pipe->bufs[i_head & p_mask];
869                         if (left <= buf->len)
870                                 break;
871                         left -= buf->len;
872                         i_head++;
873                 }
874                 i->head = i_head;
875                 i->iov_offset = buf->offset + left;
876         }
877         i->count -= size;
878         /* ... and discard everything past that point */
879         pipe_truncate(i);
880 }
881
882 static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
883 {
884         const struct bio_vec *bvec, *end;
885
886         if (!i->count)
887                 return;
888         i->count -= size;
889
890         size += i->iov_offset;
891
892         for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
893                 if (likely(size < bvec->bv_len))
894                         break;
895                 size -= bvec->bv_len;
896         }
897         i->iov_offset = size;
898         i->nr_segs -= bvec - i->bvec;
899         i->bvec = bvec;
900 }
901
902 static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
903 {
904         const struct iovec *iov, *end;
905
906         if (!i->count)
907                 return;
908         i->count -= size;
909
910         size += i->iov_offset; // from beginning of current segment
911         for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
912                 if (likely(size < iov->iov_len))
913                         break;
914                 size -= iov->iov_len;
915         }
916         i->iov_offset = size;
917         i->nr_segs -= iov - i->iov;
918         i->iov = iov;
919 }
920
921 void iov_iter_advance(struct iov_iter *i, size_t size)
922 {
923         if (unlikely(i->count < size))
924                 size = i->count;
925         if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
926                 i->iov_offset += size;
927                 i->count -= size;
928         } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
929                 /* iovec and kvec have identical layouts */
930                 iov_iter_iovec_advance(i, size);
931         } else if (iov_iter_is_bvec(i)) {
932                 iov_iter_bvec_advance(i, size);
933         } else if (iov_iter_is_pipe(i)) {
934                 pipe_advance(i, size);
935         } else if (iov_iter_is_discard(i)) {
936                 i->count -= size;
937         }
938 }
939 EXPORT_SYMBOL(iov_iter_advance);
940
941 void iov_iter_revert(struct iov_iter *i, size_t unroll)
942 {
943         if (!unroll)
944                 return;
945         if (WARN_ON(unroll > MAX_RW_COUNT))
946                 return;
947         i->count += unroll;
948         if (unlikely(iov_iter_is_pipe(i))) {
949                 struct pipe_inode_info *pipe = i->pipe;
950                 unsigned int p_mask = pipe->ring_size - 1;
951                 unsigned int i_head = i->head;
952                 size_t off = i->iov_offset;
953                 while (1) {
954                         struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
955                         size_t n = off - b->offset;
956                         if (unroll < n) {
957                                 off -= unroll;
958                                 break;
959                         }
960                         unroll -= n;
961                         if (!unroll && i_head == i->start_head) {
962                                 off = 0;
963                                 break;
964                         }
965                         i_head--;
966                         b = &pipe->bufs[i_head & p_mask];
967                         off = b->offset + b->len;
968                 }
969                 i->iov_offset = off;
970                 i->head = i_head;
971                 pipe_truncate(i);
972                 return;
973         }
974         if (unlikely(iov_iter_is_discard(i)))
975                 return;
976         if (unroll <= i->iov_offset) {
977                 i->iov_offset -= unroll;
978                 return;
979         }
980         unroll -= i->iov_offset;
981         if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
982                 BUG(); /* We should never go beyond the start of the specified
983                         * range since we might then be straying into pages that
984                         * aren't pinned.
985                         */
986         } else if (iov_iter_is_bvec(i)) {
987                 const struct bio_vec *bvec = i->bvec;
988                 while (1) {
989                         size_t n = (--bvec)->bv_len;
990                         i->nr_segs++;
991                         if (unroll <= n) {
992                                 i->bvec = bvec;
993                                 i->iov_offset = n - unroll;
994                                 return;
995                         }
996                         unroll -= n;
997                 }
998         } else { /* same logics for iovec and kvec */
999                 const struct iovec *iov = i->iov;
1000                 while (1) {
1001                         size_t n = (--iov)->iov_len;
1002                         i->nr_segs++;
1003                         if (unroll <= n) {
1004                                 i->iov = iov;
1005                                 i->iov_offset = n - unroll;
1006                                 return;
1007                         }
1008                         unroll -= n;
1009                 }
1010         }
1011 }
1012 EXPORT_SYMBOL(iov_iter_revert);
1013
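/*
 * Editor's sketch, not part of the file: the advance/revert pairing used by
 * callers that hand the iterator to a lower layer and then learn that the
 * operation only partially succeeded.  "attempted" is how far the lower
 * layer advanced the iterator, "written" how much actually made it out;
 * both names are illustrative.
 */
static void example_rewind_short_io(struct iov_iter *i, size_t attempted,
                                    size_t written)
{
        if (written < attempted)
                iov_iter_revert(i, attempted - written);
}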
1014 /*
1015  * Return the count of just the current iov_iter segment.
1016  */
1017 size_t iov_iter_single_seg_count(const struct iov_iter *i)
1018 {
1019         if (i->nr_segs > 1) {
1020                 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1021                         return min(i->count, i->iov->iov_len - i->iov_offset);
1022                 if (iov_iter_is_bvec(i))
1023                         return min(i->count, i->bvec->bv_len - i->iov_offset);
1024         }
1025         return i->count;
1026 }
1027 EXPORT_SYMBOL(iov_iter_single_seg_count);
1028
1029 void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
1030                         const struct kvec *kvec, unsigned long nr_segs,
1031                         size_t count)
1032 {
1033         WARN_ON(direction & ~(READ | WRITE));
1034         *i = (struct iov_iter){
1035                 .iter_type = ITER_KVEC,
1036                 .data_source = direction,
1037                 .kvec = kvec,
1038                 .nr_segs = nr_segs,
1039                 .iov_offset = 0,
1040                 .count = count
1041         };
1042 }
1043 EXPORT_SYMBOL(iov_iter_kvec);
1044
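/*
 * Editor's sketch, not part of the file: wrapping a plain kernel buffer in
 * an ITER_KVEC so it can be fed to interfaces that only take an iov_iter.
 * READ here means data will be copied into @buf.  The kvec must stay alive
 * for as long as the iterator is in use; names are illustrative.
 */
static void example_kvec_iter(struct iov_iter *iter, struct kvec *kv,
                              void *buf, size_t len)
{
        kv->iov_base = buf;
        kv->iov_len = len;
        iov_iter_kvec(iter, READ, kv, 1, len);
}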
1045 void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
1046                         const struct bio_vec *bvec, unsigned long nr_segs,
1047                         size_t count)
1048 {
1049         WARN_ON(direction & ~(READ | WRITE));
1050         *i = (struct iov_iter){
1051                 .iter_type = ITER_BVEC,
1052                 .data_source = direction,
1053                 .bvec = bvec,
1054                 .nr_segs = nr_segs,
1055                 .iov_offset = 0,
1056                 .count = count
1057         };
1058 }
1059 EXPORT_SYMBOL(iov_iter_bvec);
1060
1061 void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
1062                         struct pipe_inode_info *pipe,
1063                         size_t count)
1064 {
1065         BUG_ON(direction != READ);
1066         WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
1067         *i = (struct iov_iter){
1068                 .iter_type = ITER_PIPE,
1069                 .data_source = false,
1070                 .pipe = pipe,
1071                 .head = pipe->head,
1072                 .start_head = pipe->head,
1073                 .iov_offset = 0,
1074                 .count = count
1075         };
1076 }
1077 EXPORT_SYMBOL(iov_iter_pipe);
1078
1079 /**
1080  * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
1081  * @i: The iterator to initialise.
1082  * @direction: The direction of the transfer.
1083  * @xarray: The xarray to access.
1084  * @start: The start file position.
1085  * @count: The size of the I/O buffer in bytes.
1086  *
1087  * Set up an I/O iterator to either draw data out of the pages attached to an
1088  * inode or to inject data into those pages.  The pages *must* be prevented
1089  * from evaporation, either by taking a ref on them or locking them by the
1090  * caller.
1091  */
1092 void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
1093                      struct xarray *xarray, loff_t start, size_t count)
1094 {
1095         BUG_ON(direction & ~1);
1096         *i = (struct iov_iter) {
1097                 .iter_type = ITER_XARRAY,
1098                 .data_source = direction,
1099                 .xarray = xarray,
1100                 .xarray_start = start,
1101                 .count = count,
1102                 .iov_offset = 0
1103         };
1104 }
1105 EXPORT_SYMBOL(iov_iter_xarray);
1106
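/*
 * Editor's note, not part of the file: network filesystems use this to make
 * a span of an inode's pagecache addressable as an iterator, roughly
 *
 *      iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, len);
 *
 * where "mapping" is the address_space whose pages the caller already holds
 * or has locked, "pos" the starting file position and "len" the byte count.
 * The identifiers illustrate the pattern rather than quote a specific caller.
 */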
1107 /**
1108  * iov_iter_discard - Initialise an I/O iterator that discards data
1109  * @i: The iterator to initialise.
1110  * @direction: The direction of the transfer.
1111  * @count: The size of the I/O buffer in bytes.
1112  *
1113  * Set up an I/O iterator that just discards everything that's written to it.
1114  * It's only available as a READ iterator.
1115  */
1116 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1117 {
1118         BUG_ON(direction != READ);
1119         *i = (struct iov_iter){
1120                 .iter_type = ITER_DISCARD,
1121                 .data_source = false,
1122                 .count = count,
1123                 .iov_offset = 0
1124         };
1125 }
1126 EXPORT_SYMBOL(iov_iter_discard);
1127
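/*
 * Editor's sketch, not part of the file: a discard iterator is handy when a
 * protocol needs to consume bytes nobody wants, e.g. skipping part of a
 * stream.  Anything "copied" to it merely decrements the count; no memory is
 * touched.  The helper name is hypothetical.
 */
static void example_skip_bytes(struct iov_iter *junk, size_t count)
{
        iov_iter_discard(junk, READ, count);
        /* copy_to_iter(src, n, junk) will now "succeed" for any n <= count */
}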
1128 static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
1129                                    unsigned len_mask)
1130 {
1131         size_t size = i->count;
1132         size_t skip = i->iov_offset;
1133         unsigned k;
1134
1135         for (k = 0; k < i->nr_segs; k++, skip = 0) {
1136                 size_t len = i->iov[k].iov_len - skip;
1137
1138                 if (len > size)
1139                         len = size;
1140                 if (len & len_mask)
1141                         return false;
1142                 if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask)
1143                         return false;
1144
1145                 size -= len;
1146                 if (!size)
1147                         break;
1148         }
1149         return true;
1150 }
1151
1152 static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
1153                                   unsigned len_mask)
1154 {
1155         size_t size = i->count;
1156         unsigned skip = i->iov_offset;
1157         unsigned k;
1158
1159         for (k = 0; k < i->nr_segs; k++, skip = 0) {
1160                 size_t len = i->bvec[k].bv_len - skip;
1161
1162                 if (len > size)
1163                         len = size;
1164                 if (len & len_mask)
1165                         return false;
1166                 if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask)
1167                         return false;
1168
1169                 size -= len;
1170                 if (!size)
1171                         break;
1172         }
1173         return true;
1174 }
1175
1176 /**
1177  * iov_iter_is_aligned() - Check if the addresses and lengths of each segment
1178  *      are aligned to the parameters.
1179  *
1180  * @i: &struct iov_iter to check
1181  * @addr_mask: bit mask to check against each segment's address
1182  * @len_mask: bit mask to check against each segment's length
1183  *
1184  * Return: false if any address or length has bits set within the given masks
1185  */
1186 bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
1187                          unsigned len_mask)
1188 {
1189         if (likely(iter_is_ubuf(i))) {
1190                 if (i->count & len_mask)
1191                         return false;
1192                 if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask)
1193                         return false;
1194                 return true;
1195         }
1196
1197         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1198                 return iov_iter_aligned_iovec(i, addr_mask, len_mask);
1199
1200         if (iov_iter_is_bvec(i))
1201                 return iov_iter_aligned_bvec(i, addr_mask, len_mask);
1202
1203         if (iov_iter_is_pipe(i)) {
1204                 unsigned int p_mask = i->pipe->ring_size - 1;
1205                 size_t size = i->count;
1206
1207                 if (size & len_mask)
1208                         return false;
1209                 if (size && allocated(&i->pipe->bufs[i->head & p_mask])) {
1210                         if (i->iov_offset & addr_mask)
1211                                 return false;
1212                 }
1213
1214                 return true;
1215         }
1216
1217         if (iov_iter_is_xarray(i)) {
1218                 if (i->count & len_mask)
1219                         return false;
1220                 if ((i->xarray_start + i->iov_offset) & addr_mask)
1221                         return false;
1222         }
1223
1224         return true;
1225 }
1226 EXPORT_SYMBOL_GPL(iov_iter_is_aligned);
1227
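/*
 * Editor's sketch, not part of the file: a direct-I/O style alignment gate.
 * The masks are "must-be-zero" bits, so for a 512-byte logical block size
 * every segment address and length must be a multiple of 512.  The block
 * size is an assumed value for illustration.
 */
static bool example_dio_aligned(const struct iov_iter *i)
{
        const unsigned int lbs = 512;   /* assumed logical block size */

        return iov_iter_is_aligned(i, lbs - 1, lbs - 1);
}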
1228 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
1229 {
1230         unsigned long res = 0;
1231         size_t size = i->count;
1232         size_t skip = i->iov_offset;
1233         unsigned k;
1234
1235         for (k = 0; k < i->nr_segs; k++, skip = 0) {
1236                 size_t len = i->iov[k].iov_len - skip;
1237                 if (len) {
1238                         res |= (unsigned long)i->iov[k].iov_base + skip;
1239                         if (len > size)
1240                                 len = size;
1241                         res |= len;
1242                         size -= len;
1243                         if (!size)
1244                                 break;
1245                 }
1246         }
1247         return res;
1248 }
1249
1250 static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
1251 {
1252         unsigned res = 0;
1253         size_t size = i->count;
1254         unsigned skip = i->iov_offset;
1255         unsigned k;
1256
1257         for (k = 0; k < i->nr_segs; k++, skip = 0) {
1258                 size_t len = i->bvec[k].bv_len - skip;
1259                 res |= (unsigned long)i->bvec[k].bv_offset + skip;
1260                 if (len > size)
1261                         len = size;
1262                 res |= len;
1263                 size -= len;
1264                 if (!size)
1265                         break;
1266         }
1267         return res;
1268 }
1269
1270 unsigned long iov_iter_alignment(const struct iov_iter *i)
1271 {
1272         if (likely(iter_is_ubuf(i))) {
1273                 size_t size = i->count;
1274                 if (size)
1275                         return ((unsigned long)i->ubuf + i->iov_offset) | size;
1276                 return 0;
1277         }
1278
1279         /* iovec and kvec have identical layouts */
1280         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1281                 return iov_iter_alignment_iovec(i);
1282
1283         if (iov_iter_is_bvec(i))
1284                 return iov_iter_alignment_bvec(i);
1285
1286         if (iov_iter_is_pipe(i)) {
1287                 size_t size = i->count;
1288
1289                 if (size && i->iov_offset && allocated(pipe_buf(i->pipe, i->head)))
1290                         return size | i->iov_offset;
1291                 return size;
1292         }
1293
1294         if (iov_iter_is_xarray(i))
1295                 return (i->xarray_start + i->iov_offset) | i->count;
1296
1297         return 0;
1298 }
1299 EXPORT_SYMBOL(iov_iter_alignment);
1300
1301 unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1302 {
1303         unsigned long res = 0;
1304         unsigned long v = 0;
1305         size_t size = i->count;
1306         unsigned k;
1307
1308         if (iter_is_ubuf(i))
1309                 return 0;
1310
1311         if (WARN_ON(!iter_is_iovec(i)))
1312                 return ~0U;
1313
1314         for (k = 0; k < i->nr_segs; k++) {
1315                 if (i->iov[k].iov_len) {
1316                         unsigned long base = (unsigned long)i->iov[k].iov_base;
1317                         if (v) // if not the first one
1318                                 res |= base | v; // this start | previous end
1319                         v = base + i->iov[k].iov_len;
1320                         if (size <= i->iov[k].iov_len)
1321                                 break;
1322                         size -= i->iov[k].iov_len;
1323                 }
1324         }
1325         return res;
1326 }
1327 EXPORT_SYMBOL(iov_iter_gap_alignment);
1328
1329 static inline ssize_t __pipe_get_pages(struct iov_iter *i,
1330                                 size_t maxsize,
1331                                 struct page **pages,
1332                                 int iter_head,
1333                                 size_t *start)
1334 {
1335         struct pipe_inode_info *pipe = i->pipe;
1336         unsigned int p_mask = pipe->ring_size - 1;
1337         ssize_t n = push_pipe(i, maxsize, &iter_head, start);
1338         if (!n)
1339                 return -EFAULT;
1340
1341         maxsize = n;
1342         n += *start;
1343         while (n > 0) {
1344                 get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1345                 iter_head++;
1346                 n -= PAGE_SIZE;
1347         }
1348
1349         return maxsize;
1350 }
1351
1352 static ssize_t pipe_get_pages(struct iov_iter *i,
1353                    struct page **pages, size_t maxsize, unsigned maxpages,
1354                    size_t *start)
1355 {
1356         unsigned int iter_head, npages;
1357         size_t capacity;
1358
1359         if (!sanity(i))
1360                 return -EFAULT;
1361
1362         data_start(i, &iter_head, start);
1363         /* Amount of free space: some of this one + all after this one */
1364         npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1365         capacity = min(npages, maxpages) * PAGE_SIZE - *start;
1366
1367         return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
1368 }
1369
1370 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
1371                                           pgoff_t index, unsigned int nr_pages)
1372 {
1373         XA_STATE(xas, xa, index);
1374         struct page *page;
1375         unsigned int ret = 0;
1376
1377         rcu_read_lock();
1378         for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1379                 if (xas_retry(&xas, page))
1380                         continue;
1381
1382                 /* Has the page moved or been split? */
1383                 if (unlikely(page != xas_reload(&xas))) {
1384                         xas_reset(&xas);
1385                         continue;
1386                 }
1387
1388                 pages[ret] = find_subpage(page, xas.xa_index);
1389                 get_page(pages[ret]);
1390                 if (++ret == nr_pages)
1391                         break;
1392         }
1393         rcu_read_unlock();
1394         return ret;
1395 }
1396
1397 static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1398                                      struct page **pages, size_t maxsize,
1399                                      unsigned maxpages, size_t *_start_offset)
1400 {
1401         unsigned nr, offset;
1402         pgoff_t index, count;
1403         size_t size = maxsize;
1404         loff_t pos;
1405
1406         if (!size || !maxpages)
1407                 return 0;
1408
1409         pos = i->xarray_start + i->iov_offset;
1410         index = pos >> PAGE_SHIFT;
1411         offset = pos & ~PAGE_MASK;
1412         *_start_offset = offset;
1413
1414         count = 1;
1415         if (size > PAGE_SIZE - offset) {
1416                 size -= PAGE_SIZE - offset;
1417                 count += size >> PAGE_SHIFT;
1418                 size &= ~PAGE_MASK;
1419                 if (size)
1420                         count++;
1421         }
1422
1423         if (count > maxpages)
1424                 count = maxpages;
1425
1426         nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
1427         if (nr == 0)
1428                 return 0;
1429
1430         return min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
1431 }
1432
1433 /* must be done on a non-empty ITER_UBUF or ITER_IOVEC iterator */
1434 static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
1435 {
1436         size_t skip;
1437         long k;
1438
1439         if (iter_is_ubuf(i))
1440                 return (unsigned long)i->ubuf + i->iov_offset;
1441
1442         for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1443                 size_t len = i->iov[k].iov_len - skip;
1444
1445                 if (unlikely(!len))
1446                         continue;
1447                 if (*size > len)
1448                         *size = len;
1449                 return (unsigned long)i->iov[k].iov_base + skip;
1450         }
1451         BUG(); // if it had been empty, we wouldn't get called
1452 }
1453
1454 /* must be done on a non-empty ITER_BVEC iterator */
1455 static struct page *first_bvec_segment(const struct iov_iter *i,
1456                                        size_t *size, size_t *start)
1457 {
1458         struct page *page;
1459         size_t skip = i->iov_offset, len;
1460
1461         len = i->bvec->bv_len - skip;
1462         if (*size > len)
1463                 *size = len;
1464         skip += i->bvec->bv_offset;
1465         page = i->bvec->bv_page + skip / PAGE_SIZE;
1466         *start = skip % PAGE_SIZE;
1467         return page;
1468 }
1469
1470 ssize_t iov_iter_get_pages(struct iov_iter *i,
1471                    struct page **pages, size_t maxsize, unsigned maxpages,
1472                    size_t *start)
1473 {
1474         int n, res;
1475
1476         if (maxsize > i->count)
1477                 maxsize = i->count;
1478         if (!maxsize)
1479                 return 0;
1480         if (maxsize > MAX_RW_COUNT)
1481                 maxsize = MAX_RW_COUNT;
1482
1483         if (likely(user_backed_iter(i))) {
1484                 unsigned int gup_flags = 0;
1485                 unsigned long addr;
1486
1487                 if (iov_iter_rw(i) != WRITE)
1488                         gup_flags |= FOLL_WRITE;
1489                 if (i->nofault)
1490                         gup_flags |= FOLL_NOFAULT;
1491
1492                 addr = first_iovec_segment(i, &maxsize);
1493                 *start = addr % PAGE_SIZE;
1494                 addr &= PAGE_MASK;
1495                 n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1496                 if (n > maxpages)
1497                         n = maxpages;
1498                 res = get_user_pages_fast(addr, n, gup_flags, pages);
1499                 if (unlikely(res <= 0))
1500                         return res;
1501                 return min_t(size_t, maxsize, res * PAGE_SIZE - *start);
1502         }
1503         if (iov_iter_is_bvec(i)) {
1504                 struct page *page;
1505
1506                 page = first_bvec_segment(i, &maxsize, start);
1507                 n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1508                 if (n > maxpages)
1509                         n = maxpages;
1510                 for (int k = 0; k < n; k++)
1511                         get_page(*pages++ = page++);
1512                 return min_t(size_t, maxsize, n * PAGE_SIZE - *start);
1513         }
1514         if (iov_iter_is_pipe(i))
1515                 return pipe_get_pages(i, pages, maxsize, maxpages, start);
1516         if (iov_iter_is_xarray(i))
1517                 return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1518         return -EFAULT;
1519 }
1520 EXPORT_SYMBOL(iov_iter_get_pages);
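
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file; the function name and on-stack batch size are hypothetical).  It
 * shows the usual pattern around iov_iter_get_pages(): the helper pins page
 * references but does not advance the iterator, so the caller drops each
 * page with put_page() (and advances the iterator separately) once the data
 * has actually been transferred.
 */
static void example_pin_and_release(struct iov_iter *iter)
{
	struct page *pages[16];		/* arbitrary batch size */
	size_t offset;
	ssize_t bytes;
	int n, k;

	bytes = iov_iter_get_pages(iter, pages, ARRAY_SIZE(pages) * PAGE_SIZE,
				   ARRAY_SIZE(pages), &offset);
	if (bytes <= 0)
		return;

	/* pages[0] + offset is where the first of @bytes bytes lives */
	n = DIV_ROUND_UP(offset + bytes, PAGE_SIZE);

	/* ... do the actual I/O against pages[0..n-1] here ... */

	for (k = 0; k < n; k++)
		put_page(pages[k]);
}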
1521
1522 static struct page **get_pages_array(size_t n)
1523 {
1524         return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1525 }
1526
1527 static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1528                    struct page ***pages, size_t maxsize,
1529                    size_t *start)
1530 {
1531         struct page **p;
1532         unsigned int iter_head, npages;
1533         ssize_t n;
1534
1535         if (!sanity(i))
1536                 return -EFAULT;
1537
1538         data_start(i, &iter_head, start);
1539         /* Amount of free space: some of this one + all after this one */
1540         npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1541         n = npages * PAGE_SIZE - *start;
1542         if (maxsize > n)
1543                 maxsize = n;
1544         else
1545                 npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1546         p = get_pages_array(npages);
1547         if (!p)
1548                 return -ENOMEM;
1549         n = __pipe_get_pages(i, maxsize, p, iter_head, start);
1550         if (n > 0)
1551                 *pages = p;
1552         else
1553                 kvfree(p);
1554         return n;
1555 }
1556
1557 static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
1558                                            struct page ***pages, size_t maxsize,
1559                                            size_t *_start_offset)
1560 {
1561         struct page **p;
1562         unsigned nr, offset;
1563         pgoff_t index, count;
1564         size_t size = maxsize;
1565         loff_t pos;
1566
1567         if (!size)
1568                 return 0;
1569
1570         pos = i->xarray_start + i->iov_offset;
1571         index = pos >> PAGE_SHIFT;
1572         offset = pos & ~PAGE_MASK;
1573         *_start_offset = offset;
1574
1575         count = 1;
1576         if (size > PAGE_SIZE - offset) {
1577                 size -= PAGE_SIZE - offset;
1578                 count += size >> PAGE_SHIFT;
1579                 size &= ~PAGE_MASK;
1580                 if (size)
1581                         count++;
1582         }
1583
1584         p = get_pages_array(count);
1585         if (!p)
1586                 return -ENOMEM;
1587         *pages = p;
1588
1589         nr = iter_xarray_populate_pages(p, i->xarray, index, count);
1590         if (nr == 0)
1591                 return 0;
1592
1593         return min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
1594 }
1595
1596 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1597                    struct page ***pages, size_t maxsize,
1598                    size_t *start)
1599 {
1600         struct page **p;
1601         int n, res;
1602
1603         if (maxsize > i->count)
1604                 maxsize = i->count;
1605         if (!maxsize)
1606                 return 0;
1607         if (maxsize > MAX_RW_COUNT)
1608                 maxsize = MAX_RW_COUNT;
1609
1610         if (likely(user_backed_iter(i))) {
1611                 unsigned int gup_flags = 0;
1612                 unsigned long addr;
1613
1614                 if (iov_iter_rw(i) != WRITE)
1615                         gup_flags |= FOLL_WRITE;
1616                 if (i->nofault)
1617                         gup_flags |= FOLL_NOFAULT;
1618
1619                 addr = first_iovec_segment(i, &maxsize);
1620                 *start = addr % PAGE_SIZE;
1621                 addr &= PAGE_MASK;
1622                 n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1623                 p = get_pages_array(n);
1624                 if (!p)
1625                         return -ENOMEM;
1626                 res = get_user_pages_fast(addr, n, gup_flags, p);
1627                 if (unlikely(res <= 0)) {
1628                         kvfree(p);
1629                         *pages = NULL;
1630                         return res;
1631                 }
1632                 *pages = p;
1633                 return min_t(size_t, maxsize, res * PAGE_SIZE - *start);
1634         }
1635         if (iov_iter_is_bvec(i)) {
1636                 struct page *page;
1637
1638                 page = first_bvec_segment(i, &maxsize, start);
1639                 n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1640                 *pages = p = get_pages_array(n);
1641                 if (!p)
1642                         return -ENOMEM;
1643                 for (int k = 0; k < n; k++)
1644                         get_page(*p++ = page++);
1645                 return min_t(size_t, maxsize, n * PAGE_SIZE - *start);
1646         }
1647         if (iov_iter_is_pipe(i))
1648                 return pipe_get_pages_alloc(i, pages, maxsize, start);
1649         if (iov_iter_is_xarray(i))
1650                 return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
1651         return -EFAULT;
1652 }
1653 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
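
/*
 * Illustrative sketch (editorial addition; name hypothetical) of the _alloc
 * variant: the page array returned through @pages was obtained with
 * kvmalloc_array() above, so the caller frees it with kvfree() after
 * dropping the page references.
 */
static void example_get_pages_alloc(struct iov_iter *iter, size_t len)
{
	struct page **pages = NULL;
	size_t offset;
	ssize_t bytes;
	int n, k;

	bytes = iov_iter_get_pages_alloc(iter, &pages, len, &offset);
	if (bytes <= 0)
		return;

	n = DIV_ROUND_UP(offset + bytes, PAGE_SIZE);

	/* ... do the actual I/O against pages[0..n-1] here ... */

	for (k = 0; k < n; k++)
		put_page(pages[k]);
	kvfree(pages);
}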
1654
1655 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1656                                struct iov_iter *i)
1657 {
1658         __wsum sum, next;
1659         sum = *csum;
1660         if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1661                 WARN_ON(1);
1662                 return 0;
1663         }
1664         iterate_and_advance(i, bytes, base, len, off, ({
1665                 next = csum_and_copy_from_user(base, addr + off, len);
1666                 sum = csum_block_add(sum, next, off);
1667                 next ? 0 : len;
1668         }), ({
1669                 sum = csum_and_memcpy(addr + off, base, len, sum, off);
1670         })
1671         )
1672         *csum = sum;
1673         return bytes;
1674 }
1675 EXPORT_SYMBOL(csum_and_copy_from_iter);
1676
1677 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
1678                              struct iov_iter *i)
1679 {
1680         struct csum_state *csstate = _csstate;
1681         __wsum sum, next;
1682
1683         if (unlikely(iov_iter_is_discard(i))) {
1684                 WARN_ON(1);     /* for now */
1685                 return 0;
1686         }
1687
1688         sum = csum_shift(csstate->csum, csstate->off);
1689         if (unlikely(iov_iter_is_pipe(i)))
1690                 bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum);
1691         else iterate_and_advance(i, bytes, base, len, off, ({
1692                 next = csum_and_copy_to_user(addr + off, base, len);
1693                 sum = csum_block_add(sum, next, off);
1694                 next ? 0 : len;
1695         }), ({
1696                 sum = csum_and_memcpy(base, addr + off, len, sum, off);
1697         })
1698         )
1699         csstate->csum = csum_shift(sum, csstate->off);
1700         csstate->off += bytes;
1701         return bytes;
1702 }
1703 EXPORT_SYMBOL(csum_and_copy_to_iter);
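
/*
 * Illustrative sketch (editorial addition; name hypothetical) of driving
 * csum_and_copy_to_iter(): the caller seeds a struct csum_state with the
 * running checksum and offset, and folds the 32-bit partial sum down to
 * 16 bits once everything has been copied.
 */
static __sum16 example_copy_and_checksum(const void *buf, size_t len,
					 struct iov_iter *to)
{
	struct csum_state csstate = { .csum = 0, .off = 0 };
	size_t copied;

	copied = csum_and_copy_to_iter(buf, len, &csstate, to);
	if (copied != len) {
		/* short copy: a real caller would unwind and retry here */
	}
	return csum_fold(csstate.csum);
}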
1704
1705 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1706                 struct iov_iter *i)
1707 {
1708 #ifdef CONFIG_CRYPTO_HASH
1709         struct ahash_request *hash = hashp;
1710         struct scatterlist sg;
1711         size_t copied;
1712
1713         copied = copy_to_iter(addr, bytes, i);
1714         sg_init_one(&sg, addr, copied);
1715         ahash_request_set_crypt(hash, &sg, NULL, copied);
1716         crypto_ahash_update(hash);
1717         return copied;
1718 #else
1719         return 0;
1720 #endif
1721 }
1722 EXPORT_SYMBOL(hash_and_copy_to_iter);
1723
1724 static int iov_npages(const struct iov_iter *i, int maxpages)
1725 {
1726         size_t skip = i->iov_offset, size = i->count;
1727         const struct iovec *p;
1728         int npages = 0;
1729
1730         for (p = i->iov; size; skip = 0, p++) {
1731                 unsigned offs = offset_in_page(p->iov_base + skip);
1732                 size_t len = min(p->iov_len - skip, size);
1733
1734                 if (len) {
1735                         size -= len;
1736                         npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1737                         if (unlikely(npages > maxpages))
1738                                 return maxpages;
1739                 }
1740         }
1741         return npages;
1742 }
1743
1744 static int bvec_npages(const struct iov_iter *i, int maxpages)
1745 {
1746         size_t skip = i->iov_offset, size = i->count;
1747         const struct bio_vec *p;
1748         int npages = 0;
1749
1750         for (p = i->bvec; size; skip = 0, p++) {
1751                 unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1752                 size_t len = min(p->bv_len - skip, size);
1753
1754                 size -= len;
1755                 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1756                 if (unlikely(npages > maxpages))
1757                         return maxpages;
1758         }
1759         return npages;
1760 }
1761
1762 int iov_iter_npages(const struct iov_iter *i, int maxpages)
1763 {
1764         if (unlikely(!i->count))
1765                 return 0;
1766         if (likely(iter_is_ubuf(i))) {
1767                 unsigned offs = offset_in_page(i->ubuf + i->iov_offset);
1768                 int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
1769                 return min(npages, maxpages);
1770         }
1771         /* iovec and kvec have identical layouts */
1772         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1773                 return iov_npages(i, maxpages);
1774         if (iov_iter_is_bvec(i))
1775                 return bvec_npages(i, maxpages);
1776         if (iov_iter_is_pipe(i)) {
1777                 unsigned int iter_head;
1778                 int npages;
1779                 size_t off;
1780
1781                 if (!sanity(i))
1782                         return 0;
1783
1784                 data_start(i, &iter_head, &off);
1785                 /* some of this one + all after this one */
1786                 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1787                 return min(npages, maxpages);
1788         }
1789         if (iov_iter_is_xarray(i)) {
1790                 unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1791                 int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1792                 return min(npages, maxpages);
1793         }
1794         return 0;
1795 }
1796 EXPORT_SYMBOL(iov_iter_npages);
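
/*
 * Illustrative sketch (editorial addition; name hypothetical): a typical
 * caller uses iov_iter_npages() to size a page-pointer array (or a bio)
 * before actually pinning anything with iov_iter_get_pages().
 */
static struct page **example_alloc_page_array(const struct iov_iter *iter,
					      int maxpages, int *npages)
{
	*npages = iov_iter_npages(iter, maxpages);
	if (!*npages)
		return NULL;
	return kvmalloc_array(*npages, sizeof(struct page *), GFP_KERNEL);
}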
1797
1798 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1799 {
1800         *new = *old;
1801         if (unlikely(iov_iter_is_pipe(new))) {
1802                 WARN_ON(1);
1803                 return NULL;
1804         }
1805         if (iov_iter_is_bvec(new))
1806                 return new->bvec = kmemdup(new->bvec,
1807                                     new->nr_segs * sizeof(struct bio_vec),
1808                                     flags);
1809         else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
1810                 /* iovec and kvec have identical layout */
1811                 return new->iov = kmemdup(new->iov,
1812                                    new->nr_segs * sizeof(struct iovec),
1813                                    flags);
1814         return NULL;
1815 }
1816 EXPORT_SYMBOL(dup_iter);
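
/*
 * Illustrative sketch (editorial addition; name hypothetical): dup_iter()
 * copies *old into *new and, for iovec/kvec/bvec iterators, kmemdup()s the
 * segment array so the copy stays usable after the caller's original array
 * goes away (e.g. across an asynchronous completion).  The duplicated
 * segment array is what dup_iter() returns, and it is freed with kfree().
 */
static int example_dup_for_async(struct iov_iter *async_copy,
				 struct iov_iter *src)
{
	const void *segs = dup_iter(async_copy, src, GFP_KERNEL);

	/* only the segment-carrying iterator types need an allocation */
	if (!segs && (iov_iter_is_bvec(src) || iter_is_iovec(src) ||
		      iov_iter_is_kvec(src)))
		return -ENOMEM;

	/* ... use *async_copy, then, once it is no longer needed ... */
	kfree(segs);
	return 0;
}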
1817
1818 static int copy_compat_iovec_from_user(struct iovec *iov,
1819                 const struct iovec __user *uvec, unsigned long nr_segs)
1820 {
1821         const struct compat_iovec __user *uiov =
1822                 (const struct compat_iovec __user *)uvec;
1823         int ret = -EFAULT, i;
1824
1825         if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1826                 return -EFAULT;
1827
1828         for (i = 0; i < nr_segs; i++) {
1829                 compat_uptr_t buf;
1830                 compat_ssize_t len;
1831
1832                 unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1833                 unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1834
1835                 /* check for compat_size_t not fitting in compat_ssize_t .. */
1836                 if (len < 0) {
1837                         ret = -EINVAL;
1838                         goto uaccess_end;
1839                 }
1840                 iov[i].iov_base = compat_ptr(buf);
1841                 iov[i].iov_len = len;
1842         }
1843
1844         ret = 0;
1845 uaccess_end:
1846         user_access_end();
1847         return ret;
1848 }
1849
1850 static int copy_iovec_from_user(struct iovec *iov,
1851                 const struct iovec __user *uvec, unsigned long nr_segs)
1852 {
1853         unsigned long seg;
1854
1855         if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1856                 return -EFAULT;
1857         for (seg = 0; seg < nr_segs; seg++) {
1858                 if ((ssize_t)iov[seg].iov_len < 0)
1859                         return -EINVAL;
1860         }
1861
1862         return 0;
1863 }
1864
1865 struct iovec *iovec_from_user(const struct iovec __user *uvec,
1866                 unsigned long nr_segs, unsigned long fast_segs,
1867                 struct iovec *fast_iov, bool compat)
1868 {
1869         struct iovec *iov = fast_iov;
1870         int ret;
1871
1872         /*
1873          * SuS says "The readv() function *may* fail if the iovcnt argument was
1874          * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
1875          * traditionally returned zero for zero segments, so...
1876          */
1877         if (nr_segs == 0)
1878                 return iov;
1879         if (nr_segs > UIO_MAXIOV)
1880                 return ERR_PTR(-EINVAL);
1881         if (nr_segs > fast_segs) {
1882                 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1883                 if (!iov)
1884                         return ERR_PTR(-ENOMEM);
1885         }
1886
1887         if (compat)
1888                 ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1889         else
1890                 ret = copy_iovec_from_user(iov, uvec, nr_segs);
1891         if (ret) {
1892                 if (iov != fast_iov)
1893                         kfree(iov);
1894                 return ERR_PTR(ret);
1895         }
1896
1897         return iov;
1898 }
1899
1900 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1901                  unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1902                  struct iov_iter *i, bool compat)
1903 {
1904         ssize_t total_len = 0;
1905         unsigned long seg;
1906         struct iovec *iov;
1907
1908         iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1909         if (IS_ERR(iov)) {
1910                 *iovp = NULL;
1911                 return PTR_ERR(iov);
1912         }
1913
1914         /*
1915          * According to the Single Unix Specification we should return EINVAL if
1916          * an element length is < 0 when cast to ssize_t or if the total length
1917          * would overflow the ssize_t return value of the system call.
1918          *
1919          * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1920          * overflow case.
1921          */
1922         for (seg = 0; seg < nr_segs; seg++) {
1923                 ssize_t len = (ssize_t)iov[seg].iov_len;
1924
1925                 if (!access_ok(iov[seg].iov_base, len)) {
1926                         if (iov != *iovp)
1927                                 kfree(iov);
1928                         *iovp = NULL;
1929                         return -EFAULT;
1930                 }
1931
1932                 if (len > MAX_RW_COUNT - total_len) {
1933                         len = MAX_RW_COUNT - total_len;
1934                         iov[seg].iov_len = len;
1935                 }
1936                 total_len += len;
1937         }
1938
1939         iov_iter_init(i, type, iov, nr_segs, total_len);
1940         if (iov == *iovp)
1941                 *iovp = NULL;
1942         else
1943                 *iovp = iov;
1944         return total_len;
1945 }
1946
1947 /**
1948  * import_iovec() - Copy an array of &struct iovec from userspace
1949  *     into the kernel, check that it is valid, and initialize a new
1950  *     &struct iov_iter iterator to access it.
1951  *
1952  * @type: One of %READ or %WRITE.
1953  * @uvec: Pointer to the userspace array.
1954  * @nr_segs: Number of elements in userspace array.
1955  * @fast_segs: Number of elements in the array pointed to by *@iovp.
1956  * @iovp: (input and output parameter) Pointer to pointer to (usually small
1957  *     on-stack) kernel array.
1958  * @i: Pointer to iterator that will be initialized on success.
1959  *
1960  * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1961  * then this function places %NULL in *@iovp on return. Otherwise, a new
1962  * array will be allocated and the result placed in *@iovp. This means that
1963  * the caller may call kfree() on *@iovp regardless of whether the small
1964  * on-stack array was used or not (and regardless of whether this function
1965  * returns an error or not).
1966  *
1967  * Return: Negative error code on error, bytes imported on success
1968  */
1969 ssize_t import_iovec(int type, const struct iovec __user *uvec,
1970                  unsigned nr_segs, unsigned fast_segs,
1971                  struct iovec **iovp, struct iov_iter *i)
1972 {
1973         return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1974                               in_compat_syscall());
1975 }
1976 EXPORT_SYMBOL(import_iovec);
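
/*
 * Illustrative sketch (editorial addition; names hypothetical) of the usual
 * import_iovec() calling convention on a readv()-style path.  kfree(iov) is
 * always safe at the end: if the on-stack array was used (or an error was
 * returned), *iovp has been set to NULL and kfree(NULL) is a no-op.
 */
static ssize_t example_import_readv(const struct iovec __user *uvec,
				    unsigned int nr_segs,
				    struct iov_iter *iter)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	ssize_t ret;

	ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
			   &iov, iter);
	if (ret < 0)
		return ret;

	/* ... perform the actual I/O against *iter ... */

	kfree(iov);
	return ret;
}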
1977
1978 int import_single_range(int rw, void __user *buf, size_t len,
1979                  struct iovec *iov, struct iov_iter *i)
1980 {
1981         if (len > MAX_RW_COUNT)
1982                 len = MAX_RW_COUNT;
1983         if (unlikely(!access_ok(buf, len)))
1984                 return -EFAULT;
1985
1986         iov->iov_base = buf;
1987         iov->iov_len = len;
1988         iov_iter_init(i, rw, iov, 1, len);
1989         return 0;
1990 }
1991 EXPORT_SYMBOL(import_single_range);
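
/*
 * Illustrative sketch (editorial addition; name hypothetical) for the
 * single-buffer case: one struct iovec is enough, and since the iterator
 * keeps pointing at it, both must stay live for as long as the iterator
 * is in use -- here, for the duration of the function.
 */
static ssize_t example_import_single(void __user *buf, size_t len)
{
	struct iovec iov;
	struct iov_iter iter;
	int ret;

	ret = import_single_range(WRITE, buf, len, &iov, &iter);
	if (ret)
		return ret;

	/* ... perform the actual I/O against &iter ... */

	return iov_iter_count(&iter);
}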
1992
1993 /**
1994  * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
1995  *     iov_iter_save_state() was called.
1996  *
1997  * @i: &struct iov_iter to restore
1998  * @state: state to restore from
1999  *
2000  * Used after iov_iter_save_state() to restore @i, if operations may
2001  * have advanced it.
2002  *
2003  * Note: only works on ITER_IOVEC, ITER_KVEC, ITER_BVEC, and ITER_UBUF
2004  */
2005 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
2006 {
2007         if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
2008                          !iov_iter_is_kvec(i) && !iter_is_ubuf(i)))
2009                 return;
2010         i->iov_offset = state->iov_offset;
2011         i->count = state->count;
2012         if (iter_is_ubuf(i))
2013                 return;
2014         /*
2015          * For the *vec iters, nr_segs + iov is constant - if we increment
2016          * the vec, then we also decrement the nr_segs count. Hence we don't
2017          * need to track both of these; just one is enough, and we can deduce
2018          * the other from it. ITER_KVEC and ITER_IOVEC are the same struct
2019          * size, so we can just adjust the iov pointer, as the two share a union.
2020          * ITER_BVEC _may_ be the same size on some archs, but on others it is
2021          * not. Be safe and handle it separately.
2022          */
2023         BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
2024         if (iov_iter_is_bvec(i))
2025                 i->bvec -= state->nr_segs - i->nr_segs;
2026         else
2027                 i->iov -= state->nr_segs - i->nr_segs;
2028         i->nr_segs = state->nr_segs;
2029 }
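
/*
 * Illustrative sketch (editorial addition; names hypothetical) of the
 * save/restore pairing iov_iter_restore() exists for: snapshot the
 * iterator, let an operation consume from it, and roll it back so a
 * retry sees exactly the same data.
 */
static ssize_t example_op_with_rollback(struct iov_iter *iter,
					ssize_t (*op)(struct iov_iter *))
{
	struct iov_iter_state state;
	ssize_t ret;

	iov_iter_save_state(iter, &state);
	ret = op(iter);
	if (ret == -EAGAIN)
		iov_iter_restore(iter, &state);
	return ret;
}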