Linux 6.10-rc1
[linux.git] / arch / powerpc / kernel / iommu.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
4  * 
5  * Rewrite, cleanup, new allocation schemes, virtual merging: 
6  * Copyright (C) 2004 Olof Johansson, IBM Corporation
7  *               and  Ben. Herrenschmidt, IBM Corporation
8  *
9  * Dynamic DMA mapping support, bus-independent parts.
10  */
11
12
13 #include <linux/init.h>
14 #include <linux/types.h>
15 #include <linux/slab.h>
16 #include <linux/mm.h>
17 #include <linux/spinlock.h>
18 #include <linux/string.h>
19 #include <linux/dma-mapping.h>
20 #include <linux/bitmap.h>
21 #include <linux/iommu-helper.h>
22 #include <linux/crash_dump.h>
23 #include <linux/hash.h>
24 #include <linux/fault-inject.h>
25 #include <linux/pci.h>
26 #include <linux/iommu.h>
27 #include <linux/sched.h>
28 #include <linux/debugfs.h>
29 #include <linux/vmalloc.h>
30 #include <asm/io.h>
31 #include <asm/iommu.h>
32 #include <asm/pci-bridge.h>
33 #include <asm/machdep.h>
34 #include <asm/kdump.h>
35 #include <asm/fadump.h>
36 #include <asm/vio.h>
37 #include <asm/tce.h>
38 #include <asm/mmu_context.h>
39 #include <asm/ppc-pci.h>
40
41 #define DBG(...)
42
43 #ifdef CONFIG_IOMMU_DEBUGFS
44 static int iommu_debugfs_weight_get(void *data, u64 *val)
45 {
46         struct iommu_table *tbl = data;
47         *val = bitmap_weight(tbl->it_map, tbl->it_size);
48         return 0;
49 }
50 DEFINE_DEBUGFS_ATTRIBUTE(iommu_debugfs_fops_weight, iommu_debugfs_weight_get, NULL, "%llu\n");
51
52 static void iommu_debugfs_add(struct iommu_table *tbl)
53 {
54         char name[10];
55         struct dentry *liobn_entry;
56
57         sprintf(name, "%08lx", tbl->it_index);
58         liobn_entry = debugfs_create_dir(name, iommu_debugfs_dir);
59
60         debugfs_create_file_unsafe("weight", 0400, liobn_entry, tbl, &iommu_debugfs_fops_weight);
61         debugfs_create_ulong("it_size", 0400, liobn_entry, &tbl->it_size);
62         debugfs_create_ulong("it_page_shift", 0400, liobn_entry, &tbl->it_page_shift);
63         debugfs_create_ulong("it_reserved_start", 0400, liobn_entry, &tbl->it_reserved_start);
64         debugfs_create_ulong("it_reserved_end", 0400, liobn_entry, &tbl->it_reserved_end);
65         debugfs_create_ulong("it_indirect_levels", 0400, liobn_entry, &tbl->it_indirect_levels);
66         debugfs_create_ulong("it_level_size", 0400, liobn_entry, &tbl->it_level_size);
67 }
68
69 static void iommu_debugfs_del(struct iommu_table *tbl)
70 {
71         char name[10];
72
73         sprintf(name, "%08lx", tbl->it_index);
74         debugfs_lookup_and_remove(name, iommu_debugfs_dir);
75 }
76 #else
/* No-op stand-ins used when CONFIG_IOMMU_DEBUGFS is not enabled. */
static void iommu_debugfs_add(struct iommu_table *tbl)
{
}

static void iommu_debugfs_del(struct iommu_table *tbl)
{
}
79 #endif
80
81 static int novmerge;
82
83 static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
84
85 static int __init setup_iommu(char *str)
86 {
87         if (!strcmp(str, "novmerge"))
88                 novmerge = 1;
89         else if (!strcmp(str, "vmerge"))
90                 novmerge = 0;
91         return 1;
92 }
93
94 __setup("iommu=", setup_iommu);
95
96 static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
97
98 /*
99  * We precalculate the hash to avoid doing it on every allocation.
100  *
101  * The hash is important to spread CPUs across all the pools. For example,
102  * on a POWER7 with 4 way SMT we want interrupts on the primary threads and
103  * with 4 pools all primary threads would map to the same pool.
104  */
105 static int __init setup_iommu_pool_hash(void)
106 {
107         unsigned int i;
108
109         for_each_possible_cpu(i)
110                 per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
111
112         return 0;
113 }
114 subsys_initcall(setup_iommu_pool_hash);
115
116 #ifdef CONFIG_FAIL_IOMMU
117
118 static DECLARE_FAULT_ATTR(fail_iommu);
119
120 static int __init setup_fail_iommu(char *str)
121 {
122         return setup_fault_attr(&fail_iommu, str);
123 }
124 __setup("fail_iommu=", setup_fail_iommu);
125
126 static bool should_fail_iommu(struct device *dev)
127 {
128         return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1);
129 }
130
131 static int __init fail_iommu_debugfs(void)
132 {
133         struct dentry *dir = fault_create_debugfs_attr("fail_iommu",
134                                                        NULL, &fail_iommu);
135
136         return PTR_ERR_OR_ZERO(dir);
137 }
138 late_initcall(fail_iommu_debugfs);
139
140 static ssize_t fail_iommu_show(struct device *dev,
141                                struct device_attribute *attr, char *buf)
142 {
143         return sprintf(buf, "%d\n", dev->archdata.fail_iommu);
144 }
145
146 static ssize_t fail_iommu_store(struct device *dev,
147                                 struct device_attribute *attr, const char *buf,
148                                 size_t count)
149 {
150         int i;
151
152         if (count > 0 && sscanf(buf, "%d", &i) > 0)
153                 dev->archdata.fail_iommu = (i == 0) ? 0 : 1;
154
155         return count;
156 }
157
158 static DEVICE_ATTR_RW(fail_iommu);
159
160 static int fail_iommu_bus_notify(struct notifier_block *nb,
161                                  unsigned long action, void *data)
162 {
163         struct device *dev = data;
164
165         if (action == BUS_NOTIFY_ADD_DEVICE) {
166                 if (device_create_file(dev, &dev_attr_fail_iommu))
167                         pr_warn("Unable to create IOMMU fault injection sysfs "
168                                 "entries\n");
169         } else if (action == BUS_NOTIFY_DEL_DEVICE) {
170                 device_remove_file(dev, &dev_attr_fail_iommu);
171         }
172
173         return 0;
174 }
175
176 /*
177  * PCI and VIO buses need separate notifier_block structs, since they're linked
178  * list nodes.  Sharing a notifier_block would mean that any notifiers later
179  * registered for PCI buses would also get called by VIO buses and vice versa.
180  */
181 static struct notifier_block fail_iommu_pci_bus_notifier = {
182         .notifier_call = fail_iommu_bus_notify
183 };
184
185 #ifdef CONFIG_IBMVIO
186 static struct notifier_block fail_iommu_vio_bus_notifier = {
187         .notifier_call = fail_iommu_bus_notify
188 };
189 #endif
190
191 static int __init fail_iommu_setup(void)
192 {
193 #ifdef CONFIG_PCI
194         bus_register_notifier(&pci_bus_type, &fail_iommu_pci_bus_notifier);
195 #endif
196 #ifdef CONFIG_IBMVIO
197         bus_register_notifier(&vio_bus_type, &fail_iommu_vio_bus_notifier);
198 #endif
199
200         return 0;
201 }
202 /*
203  * Must execute after PCI and VIO subsystem have initialised but before
204  * devices are probed.
205  */
206 arch_initcall(fail_iommu_setup);
207 #else
208 static inline bool should_fail_iommu(struct device *dev)
209 {
210         return false;
211 }
212 #endif
213
214 static unsigned long iommu_range_alloc(struct device *dev,
215                                        struct iommu_table *tbl,
216                                        unsigned long npages,
217                                        unsigned long *handle,
218                                        unsigned long mask,
219                                        unsigned int align_order)
220
221         unsigned long n, end, start;
222         unsigned long limit;
223         int largealloc = npages > 15;
224         int pass = 0;
225         unsigned long align_mask;
226         unsigned long flags;
227         unsigned int pool_nr;
228         struct iommu_pool *pool;
229
230         align_mask = (1ull << align_order) - 1;
231
232         /* This allocator was derived from x86_64's bit string search */
233
234         /* Sanity check */
235         if (unlikely(npages == 0)) {
236                 if (printk_ratelimit())
237                         WARN_ON(1);
238                 return DMA_MAPPING_ERROR;
239         }
240
241         if (should_fail_iommu(dev))
242                 return DMA_MAPPING_ERROR;
243
244         /*
245          * We don't need to disable preemption here because any CPU can
246          * safely use any IOMMU pool.
247          */
248         pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1);
249
250         if (largealloc)
251                 pool = &(tbl->large_pool);
252         else
253                 pool = &(tbl->pools[pool_nr]);
254
255         spin_lock_irqsave(&(pool->lock), flags);
256
257 again:
258         if ((pass == 0) && handle && *handle &&
259             (*handle >= pool->start) && (*handle < pool->end))
260                 start = *handle;
261         else
262                 start = pool->hint;
263
264         limit = pool->end;
265
266         /* The case below can happen if we have a small segment appended
267          * to a large, or when the previous alloc was at the very end of
268          * the available space. If so, go back to the initial start.
269          */
270         if (start >= limit)
271                 start = pool->start;
272
273         if (limit + tbl->it_offset > mask) {
274                 limit = mask - tbl->it_offset + 1;
275                 /* If we're constrained on address range, first try
276                  * at the masked hint to avoid O(n) search complexity,
277                  * but on second pass, start at 0 in pool 0.
278                  */
279                 if ((start & mask) >= limit || pass > 0) {
280                         spin_unlock(&(pool->lock));
281                         pool = &(tbl->pools[0]);
282                         spin_lock(&(pool->lock));
283                         start = pool->start;
284                 } else {
285                         start &= mask;
286                 }
287         }
288
289         n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
290                         dma_get_seg_boundary_nr_pages(dev, tbl->it_page_shift),
291                         align_mask);
292         if (n == -1) {
293                 if (likely(pass == 0)) {
294                         /* First try the pool from the start */
295                         pool->hint = pool->start;
296                         pass++;
297                         goto again;
298
299                 } else if (pass <= tbl->nr_pools) {
300                         /* Now try scanning all the other pools */
301                         spin_unlock(&(pool->lock));
302                         pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
303                         pool = &tbl->pools[pool_nr];
304                         spin_lock(&(pool->lock));
305                         pool->hint = pool->start;
306                         pass++;
307                         goto again;
308
309                 } else if (pass == tbl->nr_pools + 1) {
310                         /* Last resort: try largepool */
311                         spin_unlock(&pool->lock);
312                         pool = &tbl->large_pool;
313                         spin_lock(&pool->lock);
314                         pool->hint = pool->start;
315                         pass++;
316                         goto again;
317
318                 } else {
319                         /* Give up */
320                         spin_unlock_irqrestore(&(pool->lock), flags);
321                         return DMA_MAPPING_ERROR;
322                 }
323         }
324
325         end = n + npages;
326
327         /* Bump the hint to a new block for small allocs. */
328         if (largealloc) {
329                 /* Don't bump to new block to avoid fragmentation */
330                 pool->hint = end;
331         } else {
332                 /* Overflow will be taken care of at the next allocation */
333                 pool->hint = (end + tbl->it_blocksize - 1) &
334                                 ~(tbl->it_blocksize - 1);
335         }
336
337         /* Update handle for SG allocations */
338         if (handle)
339                 *handle = end;
340
341         spin_unlock_irqrestore(&(pool->lock), flags);
342
343         return n;
344 }
345
346 static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
347                               void *page, unsigned int npages,
348                               enum dma_data_direction direction,
349                               unsigned long mask, unsigned int align_order,
350                               unsigned long attrs)
351 {
352         unsigned long entry;
353         dma_addr_t ret = DMA_MAPPING_ERROR;
354         int build_fail;
355
356         entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
357
358         if (unlikely(entry == DMA_MAPPING_ERROR))
359                 return DMA_MAPPING_ERROR;
360
361         entry += tbl->it_offset;        /* Offset into real TCE table */
362         ret = entry << tbl->it_page_shift;      /* Set the return dma address */
363
364         /* Put the TCEs in the HW table */
365         build_fail = tbl->it_ops->set(tbl, entry, npages,
366                                       (unsigned long)page &
367                                       IOMMU_PAGE_MASK(tbl), direction, attrs);
368
369         /* tbl->it_ops->set() only returns non-zero for transient errors.
370          * Clean up the table bitmap in this case and return
371          * DMA_MAPPING_ERROR. For all other errors the functionality is
372          * not altered.
373          */
374         if (unlikely(build_fail)) {
375                 __iommu_free(tbl, ret, npages);
376                 return DMA_MAPPING_ERROR;
377         }
378
379         /* Flush/invalidate TLB caches if necessary */
380         if (tbl->it_ops->flush)
381                 tbl->it_ops->flush(tbl);
382
383         /* Make sure updates are seen by hardware */
384         mb();
385
386         return ret;
387 }
388
389 static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
390                              unsigned int npages)
391 {
392         unsigned long entry, free_entry;
393
394         entry = dma_addr >> tbl->it_page_shift;
395         free_entry = entry - tbl->it_offset;
396
397         if (((free_entry + npages) > tbl->it_size) ||
398             (entry < tbl->it_offset)) {
399                 if (printk_ratelimit()) {
400                         printk(KERN_INFO "iommu_free: invalid entry\n");
401                         printk(KERN_INFO "\tentry     = 0x%lx\n", entry); 
402                         printk(KERN_INFO "\tdma_addr  = 0x%llx\n", (u64)dma_addr);
403                         printk(KERN_INFO "\tTable     = 0x%llx\n", (u64)tbl);
404                         printk(KERN_INFO "\tbus#      = 0x%llx\n", (u64)tbl->it_busno);
405                         printk(KERN_INFO "\tsize      = 0x%llx\n", (u64)tbl->it_size);
406                         printk(KERN_INFO "\tstartOff  = 0x%llx\n", (u64)tbl->it_offset);
407                         printk(KERN_INFO "\tindex     = 0x%llx\n", (u64)tbl->it_index);
408                         WARN_ON(1);
409                 }
410
411                 return false;
412         }
413
414         return true;
415 }
416
417 static struct iommu_pool *get_pool(struct iommu_table *tbl,
418                                    unsigned long entry)
419 {
420         struct iommu_pool *p;
421         unsigned long largepool_start = tbl->large_pool.start;
422
423         /* The large pool is the last pool at the top of the table */
424         if (entry >= largepool_start) {
425                 p = &tbl->large_pool;
426         } else {
427                 unsigned int pool_nr = entry / tbl->poolsize;
428
429                 BUG_ON(pool_nr > tbl->nr_pools);
430                 p = &tbl->pools[pool_nr];
431         }
432
433         return p;
434 }
435
436 static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
437                          unsigned int npages)
438 {
439         unsigned long entry, free_entry;
440         unsigned long flags;
441         struct iommu_pool *pool;
442
443         entry = dma_addr >> tbl->it_page_shift;
444         free_entry = entry - tbl->it_offset;
445
446         pool = get_pool(tbl, free_entry);
447
448         if (!iommu_free_check(tbl, dma_addr, npages))
449                 return;
450
451         tbl->it_ops->clear(tbl, entry, npages);
452
453         spin_lock_irqsave(&(pool->lock), flags);
454         bitmap_clear(tbl->it_map, free_entry, npages);
455         spin_unlock_irqrestore(&(pool->lock), flags);
456 }
457
458 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
459                 unsigned int npages)
460 {
461         __iommu_free(tbl, dma_addr, npages);
462
463         /* Make sure TLB cache is flushed if the HW needs it. We do
464          * not do an mb() here on purpose, it is not needed on any of
465          * the current platforms.
466          */
467         if (tbl->it_ops->flush)
468                 tbl->it_ops->flush(tbl);
469 }
470
471 int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
472                      struct scatterlist *sglist, int nelems,
473                      unsigned long mask, enum dma_data_direction direction,
474                      unsigned long attrs)
475 {
476         dma_addr_t dma_next = 0, dma_addr;
477         struct scatterlist *s, *outs, *segstart;
478         int outcount, incount, i, build_fail = 0;
479         unsigned int align;
480         unsigned long handle;
481         unsigned int max_seg_size;
482
483         BUG_ON(direction == DMA_NONE);
484
485         if ((nelems == 0) || !tbl)
486                 return -EINVAL;
487
488         outs = s = segstart = &sglist[0];
489         outcount = 1;
490         incount = nelems;
491         handle = 0;
492
493         /* Init first segment length for backout at failure */
494         outs->dma_length = 0;
495
496         DBG("sg mapping %d elements:\n", nelems);
497
498         max_seg_size = dma_get_max_seg_size(dev);
499         for_each_sg(sglist, s, nelems, i) {
500                 unsigned long vaddr, npages, entry, slen;
501
502                 slen = s->length;
503                 /* Sanity check */
504                 if (slen == 0) {
505                         dma_next = 0;
506                         continue;
507                 }
508                 /* Allocate iommu entries for that segment */
509                 vaddr = (unsigned long) sg_virt(s);
510                 npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl));
511                 align = 0;
512                 if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE &&
513                     (vaddr & ~PAGE_MASK) == 0)
514                         align = PAGE_SHIFT - tbl->it_page_shift;
515                 entry = iommu_range_alloc(dev, tbl, npages, &handle,
516                                           mask >> tbl->it_page_shift, align);
517
518                 DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
519
520                 /* Handle failure */
521                 if (unlikely(entry == DMA_MAPPING_ERROR)) {
522                         if (!(attrs & DMA_ATTR_NO_WARN) &&
523                             printk_ratelimit())
524                                 dev_info(dev, "iommu_alloc failed, tbl %p "
525                                          "vaddr %lx npages %lu\n", tbl, vaddr,
526                                          npages);
527                         goto failure;
528                 }
529
530                 /* Convert entry to a dma_addr_t */
531                 entry += tbl->it_offset;
532                 dma_addr = entry << tbl->it_page_shift;
533                 dma_addr |= (vaddr & ~IOMMU_PAGE_MASK(tbl));
534
535                 DBG("  - %lu pages, entry: %lx, dma_addr: %lx\n",
536                             npages, entry, dma_addr);
537
538                 /* Insert into HW table */
539                 build_fail = tbl->it_ops->set(tbl, entry, npages,
540                                               vaddr & IOMMU_PAGE_MASK(tbl),
541                                               direction, attrs);
542                 if(unlikely(build_fail))
543                         goto failure;
544
545                 /* If we are in an open segment, try merging */
546                 if (segstart != s) {
547                         DBG("  - trying merge...\n");
548                         /* We cannot merge if:
549                          * - allocated dma_addr isn't contiguous to previous allocation
550                          */
551                         if (novmerge || (dma_addr != dma_next) ||
552                             (outs->dma_length + s->length > max_seg_size)) {
553                                 /* Can't merge: create a new segment */
554                                 segstart = s;
555                                 outcount++;
556                                 outs = sg_next(outs);
557                                 DBG("    can't merge, new segment.\n");
558                         } else {
559                                 outs->dma_length += s->length;
560                                 DBG("    merged, new len: %ux\n", outs->dma_length);
561                         }
562                 }
563
564                 if (segstart == s) {
565                         /* This is a new segment, fill entries */
566                         DBG("  - filling new segment.\n");
567                         outs->dma_address = dma_addr;
568                         outs->dma_length = slen;
569                 }
570
571                 /* Calculate next page pointer for contiguous check */
572                 dma_next = dma_addr + slen;
573
574                 DBG("  - dma next is: %lx\n", dma_next);
575         }
576
577         /* Flush/invalidate TLB caches if necessary */
578         if (tbl->it_ops->flush)
579                 tbl->it_ops->flush(tbl);
580
581         DBG("mapped %d elements:\n", outcount);
582
583         /* For the sake of ppc_iommu_unmap_sg, we clear out the length in the
584          * next entry of the sglist if we didn't fill the list completely
585          */
586         if (outcount < incount) {
587                 outs = sg_next(outs);
588                 outs->dma_length = 0;
589         }
590
591         /* Make sure updates are seen by hardware */
592         mb();
593
594         return outcount;
595
596  failure:
597         for_each_sg(sglist, s, nelems, i) {
598                 if (s->dma_length != 0) {
599                         unsigned long vaddr, npages;
600
601                         vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl);
602                         npages = iommu_num_pages(s->dma_address, s->dma_length,
603                                                  IOMMU_PAGE_SIZE(tbl));
604                         __iommu_free(tbl, vaddr, npages);
605                         s->dma_length = 0;
606                 }
607                 if (s == outs)
608                         break;
609         }
610         return -EIO;
611 }
612
613
614 void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
615                         int nelems, enum dma_data_direction direction,
616                         unsigned long attrs)
617 {
618         struct scatterlist *sg;
619
620         BUG_ON(direction == DMA_NONE);
621
622         if (!tbl)
623                 return;
624
625         sg = sglist;
626         while (nelems--) {
627                 unsigned int npages;
628                 dma_addr_t dma_handle = sg->dma_address;
629
630                 if (sg->dma_length == 0)
631                         break;
632                 npages = iommu_num_pages(dma_handle, sg->dma_length,
633                                          IOMMU_PAGE_SIZE(tbl));
634                 __iommu_free(tbl, dma_handle, npages);
635                 sg = sg_next(sg);
636         }
637
638         /* Flush/invalidate TLBs if necessary. As for iommu_free(), we
639          * do not do an mb() here, the affected platforms do not need it
640          * when freeing.
641          */
642         if (tbl->it_ops->flush)
643                 tbl->it_ops->flush(tbl);
644 }
645
646 static void iommu_table_clear(struct iommu_table *tbl)
647 {
648         /*
649          * In case of firmware assisted dump system goes through clean
650          * reboot process at the time of system crash. Hence it's safe to
651          * clear the TCE entries if firmware assisted dump is active.
652          */
653         if (!is_kdump_kernel() || is_fadump_active()) {
654                 /* Clear the table in case firmware left allocations in it */
655                 tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size);
656                 return;
657         }
658
659 #ifdef CONFIG_CRASH_DUMP
660         if (tbl->it_ops->get) {
661                 unsigned long index, tceval, tcecount = 0;
662
663                 /* Reserve the existing mappings left by the first kernel. */
664                 for (index = 0; index < tbl->it_size; index++) {
665                         tceval = tbl->it_ops->get(tbl, index + tbl->it_offset);
666                         /*
667                          * Freed TCE entry contains 0x7fffffffffffffff on JS20
668                          */
669                         if (tceval && (tceval != 0x7fffffffffffffffUL)) {
670                                 __set_bit(index, tbl->it_map);
671                                 tcecount++;
672                         }
673                 }
674
675                 if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) {
676                         printk(KERN_WARNING "TCE table is full; freeing ");
677                         printk(KERN_WARNING "%d entries for the kdump boot\n",
678                                 KDUMP_MIN_TCE_ENTRIES);
679                         for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
680                                 index < tbl->it_size; index++)
681                                 __clear_bit(index, tbl->it_map);
682                 }
683         }
684 #endif
685 }
686
687 static void iommu_table_reserve_pages(struct iommu_table *tbl,
688                 unsigned long res_start, unsigned long res_end)
689 {
690         int i;
691
692         WARN_ON_ONCE(res_end < res_start);
693         /*
694          * Reserve page 0 so it will not be used for any mappings.
695          * This avoids buggy drivers that consider page 0 to be invalid
696          * to crash the machine or even lose data.
697          */
698         if (tbl->it_offset == 0)
699                 set_bit(0, tbl->it_map);
700
701         if (res_start < tbl->it_offset)
702                 res_start = tbl->it_offset;
703
704         if (res_end > (tbl->it_offset + tbl->it_size))
705                 res_end = tbl->it_offset + tbl->it_size;
706
707         /* Check if res_start..res_end is a valid range in the table */
708         if (res_start >= res_end) {
709                 tbl->it_reserved_start = tbl->it_offset;
710                 tbl->it_reserved_end = tbl->it_offset;
711                 return;
712         }
713
714         tbl->it_reserved_start = res_start;
715         tbl->it_reserved_end = res_end;
716
717         for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
718                 set_bit(i - tbl->it_offset, tbl->it_map);
719 }
720
721 /*
722  * Build a iommu_table structure.  This contains a bit map which
723  * is used to manage allocation of the tce space.
724  */
725 struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
726                 unsigned long res_start, unsigned long res_end)
727 {
728         unsigned long sz;
729         static int welcomed = 0;
730         unsigned int i;
731         struct iommu_pool *p;
732
733         BUG_ON(!tbl->it_ops);
734
735         /* number of bytes needed for the bitmap */
736         sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
737
738         tbl->it_map = vzalloc_node(sz, nid);
739         if (!tbl->it_map) {
740                 pr_err("%s: Can't allocate %ld bytes\n", __func__, sz);
741                 return NULL;
742         }
743
744         iommu_table_reserve_pages(tbl, res_start, res_end);
745
746         /* We only split the IOMMU table if we have 1GB or more of space */
747         if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
748                 tbl->nr_pools = IOMMU_NR_POOLS;
749         else
750                 tbl->nr_pools = 1;
751
752         /* We reserve the top 1/4 of the table for large allocations */
753         tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
754
755         for (i = 0; i < tbl->nr_pools; i++) {
756                 p = &tbl->pools[i];
757                 spin_lock_init(&(p->lock));
758                 p->start = tbl->poolsize * i;
759                 p->hint = p->start;
760                 p->end = p->start + tbl->poolsize;
761         }
762
763         p = &tbl->large_pool;
764         spin_lock_init(&(p->lock));
765         p->start = tbl->poolsize * i;
766         p->hint = p->start;
767         p->end = tbl->it_size;
768
769         iommu_table_clear(tbl);
770
771         if (!welcomed) {
772                 printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
773                        novmerge ? "disabled" : "enabled");
774                 welcomed = 1;
775         }
776
777         iommu_debugfs_add(tbl);
778
779         return tbl;
780 }
781
782 bool iommu_table_in_use(struct iommu_table *tbl)
783 {
784         unsigned long start = 0, end;
785
786         /* ignore reserved bit0 */
787         if (tbl->it_offset == 0)
788                 start = 1;
789
790         /* Simple case with no reserved MMIO32 region */
791         if (!tbl->it_reserved_start && !tbl->it_reserved_end)
792                 return find_next_bit(tbl->it_map, tbl->it_size, start) != tbl->it_size;
793
794         end = tbl->it_reserved_start - tbl->it_offset;
795         if (find_next_bit(tbl->it_map, end, start) != end)
796                 return true;
797
798         start = tbl->it_reserved_end - tbl->it_offset;
799         end = tbl->it_size;
800         return find_next_bit(tbl->it_map, end, start) != end;
801 }
802
803 static void iommu_table_free(struct kref *kref)
804 {
805         struct iommu_table *tbl;
806
807         tbl = container_of(kref, struct iommu_table, it_kref);
808
809         if (tbl->it_ops->free)
810                 tbl->it_ops->free(tbl);
811
812         if (!tbl->it_map) {
813                 kfree(tbl);
814                 return;
815         }
816
817         iommu_debugfs_del(tbl);
818
819         /* verify that table contains no entries */
820         if (iommu_table_in_use(tbl))
821                 pr_warn("%s: Unexpected TCEs\n", __func__);
822
823         /* free bitmap */
824         vfree(tbl->it_map);
825
826         /* free table */
827         kfree(tbl);
828 }
829
830 struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
831 {
832         if (kref_get_unless_zero(&tbl->it_kref))
833                 return tbl;
834
835         return NULL;
836 }
837 EXPORT_SYMBOL_GPL(iommu_tce_table_get);
838
839 int iommu_tce_table_put(struct iommu_table *tbl)
840 {
841         if (WARN_ON(!tbl))
842                 return 0;
843
844         return kref_put(&tbl->it_kref, iommu_table_free);
845 }
846 EXPORT_SYMBOL_GPL(iommu_tce_table_put);
847
848 /* Creates TCEs for a user provided buffer.  The user buffer must be
849  * contiguous real kernel storage (not vmalloc).  The address passed here
850  * comprises a page address and offset into that page. The dma_addr_t
851  * returned will point to the same byte within the page as was passed in.
852  */
853 dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
854                           struct page *page, unsigned long offset, size_t size,
855                           unsigned long mask, enum dma_data_direction direction,
856                           unsigned long attrs)
857 {
858         dma_addr_t dma_handle = DMA_MAPPING_ERROR;
859         void *vaddr;
860         unsigned long uaddr;
861         unsigned int npages, align;
862
863         BUG_ON(direction == DMA_NONE);
864
865         vaddr = page_address(page) + offset;
866         uaddr = (unsigned long)vaddr;
867
868         if (tbl) {
869                 npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl));
870                 align = 0;
871                 if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE &&
872                     ((unsigned long)vaddr & ~PAGE_MASK) == 0)
873                         align = PAGE_SHIFT - tbl->it_page_shift;
874
875                 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
876                                          mask >> tbl->it_page_shift, align,
877                                          attrs);
878                 if (dma_handle == DMA_MAPPING_ERROR) {
879                         if (!(attrs & DMA_ATTR_NO_WARN) &&
880                             printk_ratelimit())  {
881                                 dev_info(dev, "iommu_alloc failed, tbl %p "
882                                          "vaddr %p npages %d\n", tbl, vaddr,
883                                          npages);
884                         }
885                 } else
886                         dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl));
887         }
888
889         return dma_handle;
890 }
891
892 void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
893                       size_t size, enum dma_data_direction direction,
894                       unsigned long attrs)
895 {
896         unsigned int npages;
897
898         BUG_ON(direction == DMA_NONE);
899
900         if (tbl) {
901                 npages = iommu_num_pages(dma_handle, size,
902                                          IOMMU_PAGE_SIZE(tbl));
903                 iommu_free(tbl, dma_handle, npages);
904         }
905 }
906
907 /* Allocates a contiguous real buffer and creates mappings over it.
908  * Returns the virtual address of the buffer and sets dma_handle
909  * to the dma address (mapping) of the first page.
910  */
911 void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
912                            size_t size, dma_addr_t *dma_handle,
913                            unsigned long mask, gfp_t flag, int node)
914 {
915         void *ret = NULL;
916         dma_addr_t mapping;
917         unsigned int order;
918         unsigned int nio_pages, io_order;
919         struct page *page;
920         int tcesize = (1 << tbl->it_page_shift);
921
922         size = PAGE_ALIGN(size);
923         order = get_order(size);
924
925         /*
926          * Client asked for way too much space.  This is checked later
927          * anyway.  It is easier to debug here for the drivers than in
928          * the tce tables.
929          */
930         if (order >= IOMAP_MAX_ORDER) {
931                 dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n",
932                          size);
933                 return NULL;
934         }
935
936         if (!tbl)
937                 return NULL;
938
939         /* Alloc enough pages (and possibly more) */
940         page = alloc_pages_node(node, flag, order);
941         if (!page)
942                 return NULL;
943         ret = page_address(page);
944         memset(ret, 0, size);
945
946         /* Set up tces to cover the allocated range */
947         nio_pages = IOMMU_PAGE_ALIGN(size, tbl) >> tbl->it_page_shift;
948
949         io_order = get_iommu_order(size, tbl);
950         mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
951                               mask >> tbl->it_page_shift, io_order, 0);
952         if (mapping == DMA_MAPPING_ERROR) {
953                 free_pages((unsigned long)ret, order);
954                 return NULL;
955         }
956
957         *dma_handle = mapping | ((u64)ret & (tcesize - 1));
958         return ret;
959 }
960
961 void iommu_free_coherent(struct iommu_table *tbl, size_t size,
962                          void *vaddr, dma_addr_t dma_handle)
963 {
964         if (tbl) {
965                 unsigned int nio_pages;
966
967                 size = PAGE_ALIGN(size);
968                 nio_pages = IOMMU_PAGE_ALIGN(size, tbl) >> tbl->it_page_shift;
969                 iommu_free(tbl, dma_handle, nio_pages);
970                 size = PAGE_ALIGN(size);
971                 free_pages((unsigned long)vaddr, get_order(size));
972         }
973 }
974
975 unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
976 {
977         switch (dir) {
978         case DMA_BIDIRECTIONAL:
979                 return TCE_PCI_READ | TCE_PCI_WRITE;
980         case DMA_FROM_DEVICE:
981                 return TCE_PCI_WRITE;
982         case DMA_TO_DEVICE:
983                 return TCE_PCI_READ;
984         default:
985                 return 0;
986         }
987 }
988 EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
989
990 #ifdef CONFIG_IOMMU_API
991 /*
992  * SPAPR TCE API
993  */
994 static void group_release(void *iommu_data)
995 {
996         struct iommu_table_group *table_group = iommu_data;
997
998         table_group->group = NULL;
999 }
1000
1001 void iommu_register_group(struct iommu_table_group *table_group,
1002                 int pci_domain_number, unsigned long pe_num)
1003 {
1004         struct iommu_group *grp;
1005         char *name;
1006
1007         grp = iommu_group_alloc();
1008         if (IS_ERR(grp)) {
1009                 pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
1010                                 PTR_ERR(grp));
1011                 return;
1012         }
1013         table_group->group = grp;
1014         iommu_group_set_iommudata(grp, table_group, group_release);
1015         name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
1016                         pci_domain_number, pe_num);
1017         if (!name)
1018                 return;
1019         iommu_group_set_name(grp, name);
1020         kfree(name);
1021 }
1022
1023 enum dma_data_direction iommu_tce_direction(unsigned long tce)
1024 {
1025         if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
1026                 return DMA_BIDIRECTIONAL;
1027         else if (tce & TCE_PCI_READ)
1028                 return DMA_TO_DEVICE;
1029         else if (tce & TCE_PCI_WRITE)
1030                 return DMA_FROM_DEVICE;
1031         else
1032                 return DMA_NONE;
1033 }
1034 EXPORT_SYMBOL_GPL(iommu_tce_direction);
1035
1036 void iommu_flush_tce(struct iommu_table *tbl)
1037 {
1038         /* Flush/invalidate TLB caches if necessary */
1039         if (tbl->it_ops->flush)
1040                 tbl->it_ops->flush(tbl);
1041
1042         /* Make sure updates are seen by hardware */
1043         mb();
1044 }
1045 EXPORT_SYMBOL_GPL(iommu_flush_tce);
1046
/*
 * Validate an I/O bus address range against a DMA window.
 *
 * @page_shift: log2 of the IOMMU page size
 * @offset:     first IOMMU page number of the window
 * @size:       number of IOMMU pages in the window
 * @ioba:       I/O bus address (in bytes) of the first page of the range
 * @npages:     number of IOMMU pages in the range
 *
 * Returns 0 when @ioba is IOMMU-page aligned, its page lies inside
 * [@offset, @offset + @size) and the @npages pages starting there do
 * not run past the end of the window; -EINVAL otherwise.
 */
int iommu_tce_check_ioba(unsigned long page_shift,
		unsigned long offset, unsigned long size,
		unsigned long ioba, unsigned long npages)
{
	unsigned long mask = (1UL << page_shift) - 1;
	unsigned long end = offset + size;

	if (ioba & mask)
		return -EINVAL;

	/* From here on @ioba is a page number, like @offset and @end. */
	ioba >>= page_shift;

	/* The first page must always be inside the window. */
	if (ioba < offset || ioba >= end)
		return -EINVAL;

	/*
	 * The whole range must fit as well.  Comparing against the room
	 * left in the window (rather than computing ioba + npages) keeps
	 * the check safe against wrap-around for huge @npages.
	 */
	if (npages > end - ioba)
		return -EINVAL;

	return 0;
}
1065 EXPORT_SYMBOL_GPL(iommu_tce_check_ioba);
1066
/*
 * Check that @gpa is aligned to a page of 2^@page_shift bytes.
 *
 * Returns 0 when aligned, -EINVAL when any of the low @page_shift bits
 * is set.
 */
int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
{
	const unsigned long low_bits = (1UL << page_shift) - 1;

	return (gpa & low_bits) ? -EINVAL : 0;
}
1076 EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
1077
1078 long iommu_tce_xchg_no_kill(struct mm_struct *mm,
1079                             struct iommu_table *tbl,
1080                             unsigned long entry, unsigned long *hpa,
1081                             enum dma_data_direction *direction)
1082 {
1083         long ret;
1084         unsigned long size = 0;
1085
1086         ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction);
1087         if (!ret && ((*direction == DMA_FROM_DEVICE) ||
1088                         (*direction == DMA_BIDIRECTIONAL)) &&
1089                         !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift,
1090                                         &size))
1091                 SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
1092
1093         return ret;
1094 }
1095 EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill);
1096
1097 void iommu_tce_kill(struct iommu_table *tbl,
1098                 unsigned long entry, unsigned long pages)
1099 {
1100         if (tbl->it_ops->tce_kill)
1101                 tbl->it_ops->tce_kill(tbl, entry, pages);
1102 }
1103 EXPORT_SYMBOL_GPL(iommu_tce_kill);
1104
1105 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
1106 static int iommu_take_ownership(struct iommu_table *tbl)
1107 {
1108         unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
1109         int ret = 0;
1110
1111         /*
1112          * VFIO does not control TCE entries allocation and the guest
1113          * can write new TCEs on top of existing ones so iommu_tce_build()
1114          * must be able to release old pages. This functionality
1115          * requires exchange() callback defined so if it is not
1116          * implemented, we disallow taking ownership over the table.
1117          */
1118         if (!tbl->it_ops->xchg_no_kill)
1119                 return -EINVAL;
1120
1121         spin_lock_irqsave(&tbl->large_pool.lock, flags);
1122         for (i = 0; i < tbl->nr_pools; i++)
1123                 spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
1124
1125         if (iommu_table_in_use(tbl)) {
1126                 pr_err("iommu_tce: it_map is not empty");
1127                 ret = -EBUSY;
1128         } else {
1129                 memset(tbl->it_map, 0xff, sz);
1130         }
1131
1132         for (i = 0; i < tbl->nr_pools; i++)
1133                 spin_unlock(&tbl->pools[i].lock);
1134         spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
1135
1136         return ret;
1137 }
1138
1139 static void iommu_release_ownership(struct iommu_table *tbl)
1140 {
1141         unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
1142
1143         spin_lock_irqsave(&tbl->large_pool.lock, flags);
1144         for (i = 0; i < tbl->nr_pools; i++)
1145                 spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
1146
1147         memset(tbl->it_map, 0, sz);
1148
1149         iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
1150                         tbl->it_reserved_end);
1151
1152         for (i = 0; i < tbl->nr_pools; i++)
1153                 spin_unlock(&tbl->pools[i].lock);
1154         spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
1155 }
1156 #endif
1157
1158 int iommu_add_device(struct iommu_table_group *table_group, struct device *dev)
1159 {
1160         /*
1161          * The sysfs entries should be populated before
1162          * binding IOMMU group. If sysfs entries isn't
1163          * ready, we simply bail.
1164          */
1165         if (!device_is_registered(dev))
1166                 return -ENOENT;
1167
1168         if (device_iommu_mapped(dev)) {
1169                 pr_debug("%s: Skipping device %s with iommu group %d\n",
1170                          __func__, dev_name(dev),
1171                          iommu_group_id(dev->iommu_group));
1172                 return -EBUSY;
1173         }
1174
1175         pr_debug("%s: Adding %s to iommu group %d\n",
1176                  __func__, dev_name(dev),  iommu_group_id(table_group->group));
1177         /*
1178          * This is still not adding devices via the IOMMU bus notifier because
1179          * of pcibios_init() from arch/powerpc/kernel/pci_64.c which calls
1180          * pcibios_scan_phb() first (and this guy adds devices and triggers
1181          * the notifier) and only then it calls pci_bus_add_devices() which
1182          * configures DMA for buses which also creates PEs and IOMMU groups.
1183          */
1184         return iommu_probe_device(dev);
1185 }
1186 EXPORT_SYMBOL_GPL(iommu_add_device);
1187
1188 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
/*
 * A simple iommu_table_group_ops which only allows reusing the existing
 * iommu_table. This handles VFIO for POWER7 or the nested KVM.
 * These ops do not allow creating windows; they only allow reusing the
 * existing one if it matches table_group->tce32_start/tce32_size/page_shift.
 */
1195 static unsigned long spapr_tce_get_table_size(__u32 page_shift,
1196                                               __u64 window_size, __u32 levels)
1197 {
1198         unsigned long size;
1199
1200         if (levels > 1)
1201                 return ~0U;
1202         size = window_size >> (page_shift - 3);
1203         return size;
1204 }
1205
1206 static long spapr_tce_create_table(struct iommu_table_group *table_group, int num,
1207                                    __u32 page_shift, __u64 window_size, __u32 levels,
1208                                    struct iommu_table **ptbl)
1209 {
1210         struct iommu_table *tbl = table_group->tables[0];
1211
1212         if (num > 0)
1213                 return -EPERM;
1214
1215         if (tbl->it_page_shift != page_shift ||
1216             tbl->it_size != (window_size >> page_shift) ||
1217             tbl->it_indirect_levels != levels - 1)
1218                 return -EINVAL;
1219
1220         *ptbl = iommu_tce_table_get(tbl);
1221         return 0;
1222 }
1223
1224 static long spapr_tce_set_window(struct iommu_table_group *table_group,
1225                                  int num, struct iommu_table *tbl)
1226 {
1227         return tbl == table_group->tables[num] ? 0 : -EPERM;
1228 }
1229
/* Unsetting a window does nothing here and always reports success. */
static long spapr_tce_unset_window(struct iommu_table_group *table_group, int num)
{
	return 0;
}
1234
1235 static long spapr_tce_take_ownership(struct iommu_table_group *table_group)
1236 {
1237         int i, j, rc = 0;
1238
1239         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1240                 struct iommu_table *tbl = table_group->tables[i];
1241
1242                 if (!tbl || !tbl->it_map)
1243                         continue;
1244
1245                 rc = iommu_take_ownership(tbl);
1246                 if (!rc)
1247                         continue;
1248
1249                 for (j = 0; j < i; ++j)
1250                         iommu_release_ownership(table_group->tables[j]);
1251                 return rc;
1252         }
1253         return 0;
1254 }
1255
1256 static void spapr_tce_release_ownership(struct iommu_table_group *table_group)
1257 {
1258         int i;
1259
1260         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1261                 struct iommu_table *tbl = table_group->tables[i];
1262
1263                 if (!tbl)
1264                         continue;
1265
1266                 iommu_table_clear(tbl);
1267                 if (tbl->it_map)
1268                         iommu_release_ownership(tbl);
1269         }
1270 }
1271
1272 struct iommu_table_group_ops spapr_tce_table_group_ops = {
1273         .get_table_size = spapr_tce_get_table_size,
1274         .create_table = spapr_tce_create_table,
1275         .set_window = spapr_tce_set_window,
1276         .unset_window = spapr_tce_unset_window,
1277         .take_ownership = spapr_tce_take_ownership,
1278         .release_ownership = spapr_tce_release_ownership,
1279 };
1280
1281 /*
1282  * A simple iommu_ops to allow less cruft in generic VFIO code.
1283  */
1284 static int
1285 spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain,
1286                                     struct device *dev)
1287 {
1288         struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
1289         struct iommu_table_group *table_group;
1290         struct iommu_group *grp;
1291
1292         /* At first attach the ownership is already set */
1293         if (!domain)
1294                 return 0;
1295
1296         grp = iommu_group_get(dev);
1297         table_group = iommu_group_get_iommudata(grp);
1298         /*
1299          * The domain being set to PLATFORM from earlier
1300          * BLOCKED. The table_group ownership has to be released.
1301          */
1302         table_group->ops->release_ownership(table_group);
1303         iommu_group_put(grp);
1304
1305         return 0;
1306 }
1307
1308 static const struct iommu_domain_ops spapr_tce_platform_domain_ops = {
1309         .attach_dev = spapr_tce_platform_iommu_attach_dev,
1310 };
1311
1312 static struct iommu_domain spapr_tce_platform_domain = {
1313         .type = IOMMU_DOMAIN_PLATFORM,
1314         .ops = &spapr_tce_platform_domain_ops,
1315 };
1316
1317 static int
1318 spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain,
1319                                      struct device *dev)
1320 {
1321         struct iommu_group *grp = iommu_group_get(dev);
1322         struct iommu_table_group *table_group;
1323         int ret = -EINVAL;
1324
1325         /*
1326          * FIXME: SPAPR mixes blocked and platform behaviors, the blocked domain
1327          * also sets the dma_api ops
1328          */
1329         table_group = iommu_group_get_iommudata(grp);
1330         ret = table_group->ops->take_ownership(table_group);
1331         iommu_group_put(grp);
1332
1333         return ret;
1334 }
1335
1336 static const struct iommu_domain_ops spapr_tce_blocked_domain_ops = {
1337         .attach_dev = spapr_tce_blocked_iommu_attach_dev,
1338 };
1339
1340 static struct iommu_domain spapr_tce_blocked_domain = {
1341         .type = IOMMU_DOMAIN_BLOCKED,
1342         .ops = &spapr_tce_blocked_domain_ops,
1343 };
1344
1345 static bool spapr_tce_iommu_capable(struct device *dev, enum iommu_cap cap)
1346 {
1347         switch (cap) {
1348         case IOMMU_CAP_CACHE_COHERENCY:
1349                 return true;
1350         default:
1351                 break;
1352         }
1353
1354         return false;
1355 }
1356
1357 static struct iommu_device *spapr_tce_iommu_probe_device(struct device *dev)
1358 {
1359         struct pci_dev *pdev;
1360         struct pci_controller *hose;
1361
1362         if (!dev_is_pci(dev))
1363                 return ERR_PTR(-ENODEV);
1364
1365         pdev = to_pci_dev(dev);
1366         hose = pdev->bus->sysdata;
1367
1368         return &hose->iommu;
1369 }
1370
/* Release callback: intentionally empty, there is nothing to undo here. */
static void spapr_tce_iommu_release_device(struct device *dev)
{
}
1374
1375 static struct iommu_group *spapr_tce_iommu_device_group(struct device *dev)
1376 {
1377         struct pci_controller *hose;
1378         struct pci_dev *pdev;
1379
1380         pdev = to_pci_dev(dev);
1381         hose = pdev->bus->sysdata;
1382
1383         if (!hose->controller_ops.device_group)
1384                 return ERR_PTR(-ENOENT);
1385
1386         return hose->controller_ops.device_group(hose, pdev);
1387 }
1388
1389 static const struct iommu_ops spapr_tce_iommu_ops = {
1390         .default_domain = &spapr_tce_platform_domain,
1391         .blocked_domain = &spapr_tce_blocked_domain,
1392         .capable = spapr_tce_iommu_capable,
1393         .probe_device = spapr_tce_iommu_probe_device,
1394         .release_device = spapr_tce_iommu_release_device,
1395         .device_group = spapr_tce_iommu_device_group,
1396 };
1397
1398 static struct attribute *spapr_tce_iommu_attrs[] = {
1399         NULL,
1400 };
1401
1402 static struct attribute_group spapr_tce_iommu_group = {
1403         .name = "spapr-tce-iommu",
1404         .attrs = spapr_tce_iommu_attrs,
1405 };
1406
1407 static const struct attribute_group *spapr_tce_iommu_groups[] = {
1408         &spapr_tce_iommu_group,
1409         NULL,
1410 };
1411
1412 void ppc_iommu_register_device(struct pci_controller *phb)
1413 {
1414         iommu_device_sysfs_add(&phb->iommu, phb->parent,
1415                                 spapr_tce_iommu_groups, "iommu-phb%04x",
1416                                 phb->global_number);
1417         iommu_device_register(&phb->iommu, &spapr_tce_iommu_ops,
1418                                 phb->parent);
1419 }
1420
1421 void ppc_iommu_unregister_device(struct pci_controller *phb)
1422 {
1423         iommu_device_unregister(&phb->iommu);
1424         iommu_device_sysfs_remove(&phb->iommu);
1425 }
1426
1427 /*
1428  * This registers IOMMU devices of PHBs. This needs to happen
1429  * after core_initcall(iommu_init) + postcore_initcall(pci_driver_init) and
1430  * before subsys_initcall(iommu_subsys_init).
1431  */
1432 static int __init spapr_tce_setup_phb_iommus_initcall(void)
1433 {
1434         struct pci_controller *hose;
1435
1436         list_for_each_entry(hose, &hose_list, list_node) {
1437                 ppc_iommu_register_device(hose);
1438         }
1439         return 0;
1440 }
1441 postcore_initcall_sync(spapr_tce_setup_phb_iommus_initcall);
1442 #endif
1443
1444 #endif /* CONFIG_IOMMU_API */