intel-iommu: optimize sg map/unmap calls
[safe/jmp/linux-2.6] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18  * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19  * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
20  */
21
22 #include <linux/init.h>
23 #include <linux/bitmap.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/sysdev.h>
28 #include <linux/spinlock.h>
29 #include <linux/pci.h>
30 #include <linux/dmar.h>
31 #include <linux/dma-mapping.h>
32 #include <linux/mempool.h>
33 #include "iova.h"
34 #include "intel-iommu.h"
35 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
36 #include <asm/cacheflush.h>
37 #include <asm/iommu.h>
38 #include "pci.h"
39
/* class-code tests used when deciding per-device mapping policy */
#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)

/* IO-APIC MMIO window and lowest IOVA handed out (0 is kept invalid) */
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

/* default guest address width, in bits, for a newly created domain */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

/* how long to poll hardware for command completion before panicking */
#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */

/* highest DMA address representable with guest address width 'gaw' bits */
#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
52
static void domain_remove_dev_info(struct dmar_domain *domain);

/* set by "intel_iommu=off": disable DMA remapping entirely */
static int dmar_disabled;
/* cleared by "intel_iommu=igfx_off": skip mapping graphics devices */
static int __initdata dmar_map_gfx = 1;
/* set by "intel_iommu=forcedac": force 64bit DMA addressing for PCI */
static int dmar_forcedac;

/*
 * Sentinel (-1) device_domain_info pointer; marks a device handled
 * specially rather than via a real info struct -- NOTE(review): exact
 * usage is outside this chunk, confirm against the rest of the file.
 */
#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
/* protects device_domain_list and per-device domain bookkeeping */
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);
62
63 static int __init intel_iommu_setup(char *str)
64 {
65         if (!str)
66                 return -EINVAL;
67         while (*str) {
68                 if (!strncmp(str, "off", 3)) {
69                         dmar_disabled = 1;
70                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
71                 } else if (!strncmp(str, "igfx_off", 8)) {
72                         dmar_map_gfx = 0;
73                         printk(KERN_INFO
74                                 "Intel-IOMMU: disable GFX device mapping\n");
75                 } else if (!strncmp(str, "forcedac", 8)) {
76                         printk (KERN_INFO
77                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
78                         dmar_forcedac = 1;
79                 }
80
81                 str += strcspn(str, ",");
82                 while (*str == ',')
83                         str++;
84         }
85         return 0;
86 }
87 __setup("intel_iommu=", intel_iommu_setup);
88
/* slab caches for domain, device_domain_info and iova objects */
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;
92
93 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
94 {
95         unsigned int flags;
96         void *vaddr;
97
98         /* trying to avoid low memory issues */
99         flags = current->flags & PF_MEMALLOC;
100         current->flags |= PF_MEMALLOC;
101         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
102         current->flags &= (~PF_MEMALLOC | flags);
103         return vaddr;
104 }
105
106
107 static inline void *alloc_pgtable_page(void)
108 {
109         unsigned int flags;
110         void *vaddr;
111
112         /* trying to avoid low memory issues */
113         flags = current->flags & PF_MEMALLOC;
114         current->flags |= PF_MEMALLOC;
115         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
116         current->flags &= (~PF_MEMALLOC | flags);
117         return vaddr;
118 }
119
/* Release a page obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
124
/* Allocate a dmar_domain object from its slab cache (may return NULL). */
static inline void *alloc_domain_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_domain_cache);
}
129
/* Return a dmar_domain object to its slab cache. */
static inline void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}
134
135 static inline void * alloc_devinfo_mem(void)
136 {
137         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
138 }
139
/* Return a device_domain_info object to its slab cache. */
static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}
144
/* Allocate an iova descriptor; exported to the iova allocator (iova.h). */
struct iova *alloc_iova_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_iova_cache);
}
149
/* Return an iova descriptor to its slab cache. */
void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}
154
155 static inline void __iommu_flush_cache(
156         struct intel_iommu *iommu, void *addr, int size)
157 {
158         if (!ecap_coherent(iommu->ecap))
159                 clflush_cache_range(addr, size);
160 }
161
/*
 * Gets the context entry for a given bus and devfn, allocating the
 * per-bus page of context entries on first use.
 *
 * The root entry for @bus points at a 4K page of context entries.  If
 * none exists yet, a zeroed page is allocated, flushed to memory so
 * the IOMMU can see it, and installed in the root entry -- all under
 * iommu->lock.
 *
 * Returns a pointer to the entry for @devfn, or NULL if the page
 * allocation failed.
 */
static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
		u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		context = (struct context_entry *)alloc_pgtable_page();
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		/* push the zeroed page to memory before the IOMMU sees it */
		__iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
		phy_addr = virt_to_phys((void *)context);
		set_root_value(root, phy_addr);
		set_root_present(root);
		/* and push the updated root entry itself */
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}
189
190 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
191 {
192         struct root_entry *root;
193         struct context_entry *context;
194         int ret;
195         unsigned long flags;
196
197         spin_lock_irqsave(&iommu->lock, flags);
198         root = &iommu->root_entry[bus];
199         context = get_context_addr_from_root(root);
200         if (!context) {
201                 ret = 0;
202                 goto out;
203         }
204         ret = context_present(context[devfn]);
205 out:
206         spin_unlock_irqrestore(&iommu->lock, flags);
207         return ret;
208 }
209
210 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
211 {
212         struct root_entry *root;
213         struct context_entry *context;
214         unsigned long flags;
215
216         spin_lock_irqsave(&iommu->lock, flags);
217         root = &iommu->root_entry[bus];
218         context = get_context_addr_from_root(root);
219         if (context) {
220                 context_clear_entry(context[devfn]);
221                 __iommu_flush_cache(iommu, &context[devfn], \
222                         sizeof(*context));
223         }
224         spin_unlock_irqrestore(&iommu->lock, flags);
225 }
226
227 static void free_context_table(struct intel_iommu *iommu)
228 {
229         struct root_entry *root;
230         int i;
231         unsigned long flags;
232         struct context_entry *context;
233
234         spin_lock_irqsave(&iommu->lock, flags);
235         if (!iommu->root_entry) {
236                 goto out;
237         }
238         for (i = 0; i < ROOT_ENTRY_NR; i++) {
239                 root = &iommu->root_entry[i];
240                 context = get_context_addr_from_root(root);
241                 if (context)
242                         free_pgtable_page(context);
243         }
244         free_pgtable_page(iommu->root_entry);
245         iommu->root_entry = NULL;
246 out:
247         spin_unlock_irqrestore(&iommu->lock, flags);
248 }
249
/* page table handling */
/* each page-table level decodes LEVEL_STRIDE (9) bits of the address */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
253
/* Number of page-table levels for an adjusted guest address width. */
static inline int agaw_to_level(int agaw)
{
	return 2 + agaw;
}
258
259 static inline int agaw_to_width(int agaw)
260 {
261         return 30 + agaw * LEVEL_STRIDE;
262
263 }
264
/* Inverse of agaw_to_width(): agaw needed for 'width' bits (rounds down). */
static inline int width_to_agaw(int width)
{
	return (width - 30) / LEVEL_STRIDE;
}
269
270 static inline unsigned int level_to_offset_bits(int level)
271 {
272         return (12 + (level - 1) * LEVEL_STRIDE);
273 }
274
275 static inline int address_level_offset(u64 addr, int level)
276 {
277         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
278 }
279
280 static inline u64 level_mask(int level)
281 {
282         return ((u64)-1 << level_to_offset_bits(level));
283 }
284
285 static inline u64 level_size(int level)
286 {
287         return ((u64)1 << level_to_offset_bits(level));
288 }
289
290 static inline u64 align_to_level(u64 addr, int level)
291 {
292         return ((addr + level_size(level) - 1) & level_mask(level));
293 }
294
/*
 * Walk (and build, as needed) the domain's page table down to the
 * leaf (level 1) pte for @addr.  Missing intermediate table pages are
 * allocated on the way down, flushed to memory, and linked in with
 * both read and write permission -- only the leaf level controls
 * actual access rights.
 *
 * Runs under domain->mapping_lock.  Returns the leaf pte, or NULL if
 * a table-page allocation failed.
 */
static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
{
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;
	unsigned long flags;

	BUG_ON(!domain->pgd);

	/* truncate the address to the domain's address width */
	addr &= (((u64)1) << addr_width) - 1;
	parent = domain->pgd;

	spin_lock_irqsave(&domain->mapping_lock, flags);
	while (level > 0) {
		void *tmp_page;

		offset = address_level_offset(addr, level);
		pte = &parent[offset];
		if (level == 1)
			break;

		if (!dma_pte_present(*pte)) {
			tmp_page = alloc_pgtable_page();

			if (!tmp_page) {
				spin_unlock_irqrestore(&domain->mapping_lock,
					flags);
				return NULL;
			}
			/* make the new table page visible to the IOMMU */
			__iommu_flush_cache(domain->iommu, tmp_page,
					PAGE_SIZE_4K);
			dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
			/*
			 * high level table always sets r/w, last level page
			 * table control read/write
			 */
			dma_set_pte_readable(*pte);
			dma_set_pte_writable(*pte);
			__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
		}
		parent = phys_to_virt(dma_pte_addr(*pte));
		level--;
	}

	spin_unlock_irqrestore(&domain->mapping_lock, flags);
	return pte;
}
343
/*
 * Return address's pte at a specific level, without allocating.
 *
 * Walks down from the top level ('total') toward @level; if any
 * intermediate entry on the way is not present, the mapping does not
 * exist and NULL is returned.
 */
static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
		int level)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		/* index within the table at the current ('total') level */
		offset = address_level_offset(addr, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(*pte))
			break;
		parent = phys_to_virt(dma_pte_addr(*pte));
		total--;
	}
	return NULL;
}
366
367 /* clear one page's page table */
368 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
369 {
370         struct dma_pte *pte = NULL;
371
372         /* get last level pte */
373         pte = dma_addr_level_pte(domain, addr, 1);
374
375         if (pte) {
376                 dma_clear_pte(*pte);
377                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
378         }
379 }
380
381 /* clear last level pte, a tlb flush should be followed */
382 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
383 {
384         int addr_width = agaw_to_width(domain->agaw);
385
386         start &= (((u64)1) << addr_width) - 1;
387         end &= (((u64)1) << addr_width) - 1;
388         /* in case it's partial page */
389         start = PAGE_ALIGN_4K(start);
390         end &= PAGE_MASK_4K;
391
392         /* we don't need lock here, nobody else touches the iova range */
393         while (start < end) {
394                 dma_pte_clear_one(domain, start);
395                 start += PAGE_SIZE_4K;
396         }
397 }
398
/*
 * Free page table pages covering [start, end); the leaf ptes should
 * already be cleared (dma_pte_clear_range).
 *
 * Works bottom-up: at each level >= 2 it frees the child table pages
 * whose entire span lies within [start, end) and clears the entries
 * pointing at them.  A level is skipped entirely once no aligned span
 * of that size fits in the range.  If the range covers the whole
 * address space, the top-level pgd is freed as well.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
	u64 start, u64 end)
{
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *pte;
	int total = agaw_to_level(domain->agaw);
	int level;
	u64 tmp;

	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;

	/* we don't need lock here, nobody else touches the iova range */
	level = 2;
	while (level <= total) {
		/* first fully-contained, level-aligned span */
		tmp = align_to_level(start, level);
		if (tmp >= end || (tmp + level_size(level) > end))
			return;

		while (tmp < end) {
			pte = dma_addr_level_pte(domain, tmp, level);
			if (pte) {
				free_pgtable_page(
					phys_to_virt(dma_pte_addr(*pte)));
				dma_clear_pte(*pte);
				__iommu_flush_cache(domain->iommu,
						pte, sizeof(*pte));
			}
			tmp += level_size(level);
		}
		level++;
	}
	/* free pgd */
	if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
438
439 /* iommu handling */
440 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
441 {
442         struct root_entry *root;
443         unsigned long flags;
444
445         root = (struct root_entry *)alloc_pgtable_page();
446         if (!root)
447                 return -ENOMEM;
448
449         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
450
451         spin_lock_irqsave(&iommu->lock, flags);
452         iommu->root_entry = root;
453         spin_unlock_irqrestore(&iommu->lock, flags);
454
455         return 0;
456 }
457
/*
 * Poll an IOMMU register until @cond (an expression over @sts) holds.
 *
 * @offset: register offset within iommu->reg
 * @op:     accessor (readl / dmar_readq) used to sample the register
 * @sts:    lvalue receiving the last sampled value
 *
 * Panics if the condition is not met within DMAR_OPERATION_TIMEOUT.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement; the previous bare { } block made "IOMMU_WAIT_OP(...);"
 * in an unbraced if/else body a syntax hazard.
 */
#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
do {									\
	unsigned long start_time = jiffies;				\
	while (1) {							\
		sts = op(iommu->reg + offset);				\
		if (cond)						\
			break;						\
		if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
			panic("DMAR hardware is malfunctioning\n");	\
		cpu_relax();						\
	}								\
} while (0)
470
/*
 * Program the hardware's root-table address register with the
 * previously allocated root_entry page and issue the Set Root Table
 * Pointer command, waiting for the status bit to confirm completion.
 * Runs under register_lock.
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	void *addr;
	u32 cmd, sts;
	unsigned long flag;

	addr = iommu->root_entry;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

	cmd = iommu->gcmd | DMA_GCMD_SRTP;
	writel(cmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_RTPS), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
491
/*
 * Flush the IOMMU's internal write buffer.  No-op unless the hardware
 * reports the "required write-buffer flushing" capability.  Issues
 * the WBF command and waits for the status bit to clear, under
 * register_lock.
 */
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!cap_rwbf(iommu->cap))
		return;
	val = iommu->gcmd | DMA_GCMD_WBF;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(val, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
			readl, (!(val & DMA_GSTS_WBFS)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
510
/*
 * Invalidate the context-entry cache at global, domain or device
 * granularity (@type selects which; @did/@source_id/@function_mask
 * qualify the narrower scopes).
 *
 * Return value determines if we need a write buffer flush: returns 1
 * when no invalidation command was issued (so the caller may still
 * need one), 0 when the command completed (which implicitly flushes
 * the write buffer).
 */
static int __iommu_flush_context(struct intel_iommu *iommu,
	u16 did, u16 source_id, u8 function_mask, u64 type,
	int non_present_entry_flush)
{
	u64 val = 0;
	unsigned long flag;

	/*
	 * In the non-present entry flush case, if hardware doesn't cache
	 * non-present entry we do nothing and if hardware cache non-present
	 * entry, we flush entries of domain 0 (the domain id is used to cache
	 * any non-present entries)
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
			return 1;
		else
			did = 0;
	}

	/* build the invalidation command for the requested granularity */
	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* flush context entry will implictly flush write buffer */
	return 0;
}
560
561 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
562         int non_present_entry_flush)
563 {
564         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
565                 non_present_entry_flush);
566 }
567
568 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
569         int non_present_entry_flush)
570 {
571         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
572                 non_present_entry_flush);
573 }
574
575 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
576         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
577 {
578         return __iommu_flush_context(iommu, did, source_id, function_mask,
579                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
580 }
581
/*
 * Invalidate the IOTLB at global, domain-selective or page-selective
 * granularity (@type).  For PSI, @addr/@size_order describe the
 * naturally aligned power-of-two page range.
 *
 * Return value determines if we need a write buffer flush: 1 when no
 * invalidation was issued, 0 when the command completed (which
 * implicitly flushes the write buffer).
 */
static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int size_order, u64 type,
	int non_present_entry_flush)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	/*
	 * In the non-present entry flush case, if hardware doesn't cache
	 * non-present entry we do nothing and if hardware cache non-present
	 * entry, we flush entries of domain 0 (the domain id is used to cache
	 * any non-present entries)
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
			return 1;
		else
			did = 0;
	}

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* Note: always flush non-leaf currently */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
	/* flush context entry will implictly flush write buffer */
	return 0;
}
653
654 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
655         int non_present_entry_flush)
656 {
657         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
658                 non_present_entry_flush);
659 }
660
661 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
662         int non_present_entry_flush)
663 {
664         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
665                 non_present_entry_flush);
666 }
667
/*
 * Page-selective IOTLB invalidation of @pages pages starting at the
 * 4K-aligned @addr, for domain @did.  Falls back to a domain-
 * selective flush when the hardware lacks PSI support, or when the
 * power-of-two rounded page count exceeds the hardware's maximum
 * address mask.
 */
static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int pages, int non_present_entry_flush)
{
	unsigned int mask;

	BUG_ON(addr & (~PAGE_MASK_4K));
	BUG_ON(pages == 0);

	/* Fallback to domain selective flush if no PSI support */
	if (!cap_pgsel_inv(iommu->cap))
		return iommu_flush_iotlb_dsi(iommu, did,
			non_present_entry_flush);

	/*
	 * PSI requires page size to be 2 ^ x, and the base address is naturally
	 * aligned to the size
	 */
	mask = ilog2(__roundup_pow_of_two(pages));
	/* Fallback to domain selective flush if size is too big */
	if (mask > cap_max_amask_val(iommu->cap))
		return iommu_flush_iotlb_dsi(iommu, did,
			non_present_entry_flush);

	return __iommu_flush_iotlb(iommu, did, addr, mask,
		DMA_TLB_PSI_FLUSH, non_present_entry_flush);
}
694
/*
 * Turn DMA remapping on: issue the Translation Enable command and
 * wait for the TES status bit, then record the enable bit in the
 * cached gcmd value.  Always returns 0.
 */
static int iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	spin_lock_irqsave(&iommu->register_lock, flags);
	writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_TES), sts);

	iommu->gcmd |= DMA_GCMD_TE;
	spin_unlock_irqrestore(&iommu->register_lock, flags);
	return 0;
}
711
/*
 * Turn DMA remapping off: clear the enable bit in the cached gcmd,
 * write it out, and wait for the TES status bit to clear.  Always
 * returns 0.
 */
static int iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (!(sts & DMA_GSTS_TES)), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
	return 0;
}
728
729 /* iommu interrupt handling. Most stuff are MSI-like. */
730
/*
 * Human-readable descriptions of DMAR fault reason codes, indexed by
 * the code reported in the fault recording register.  The final
 * "Unknown" entry is the catch-all for out-of-range codes; any index
 * into this array must stay strictly below MAX_FAULT_REASON_IDX (the
 * array size).
 */
static char *fault_reason_strings[] =
{
	"Software",
	"Present bit in root entry is clear",
	"Present bit in context entry is clear",
	"Invalid context entry",
	"Access beyond MGAW",
	"PTE Write access is not set",
	"PTE Read access is not set",
	"Next page table ptr is invalid",
	"Root table address invalid",
	"Context table ptr is invalid",
	"non-zero reserved fields in RTP",
	"non-zero reserved fields in CTP",
	"non-zero reserved fields in PTE",
	"Unknown"
};
#define MAX_FAULT_REASON_IDX	ARRAY_SIZE(fault_reason_strings)
749
750 char *dmar_get_fault_reason(u8 fault_reason)
751 {
752         if (fault_reason > MAX_FAULT_REASON_IDX)
753                 return fault_reason_strings[MAX_FAULT_REASON_IDX];
754         else
755                 return fault_reason_strings[fault_reason];
756 }
757
/* irq_chip unmask hook: clear the fault-event interrupt mask bit. */
void dmar_msi_unmask(unsigned int irq)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	/* unmask it */
	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(0, iommu->reg + DMAR_FECTL_REG);
	/* Read a reg to force flush the post write */
	readl(iommu->reg + DMAR_FECTL_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
770
/* irq_chip mask hook: set the fault-event interrupt mask bit. */
void dmar_msi_mask(unsigned int irq)
{
	unsigned long flag;
	struct intel_iommu *iommu = get_irq_data(irq);

	/* mask it */
	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
	/* Read a reg to force flush the post write */
	readl(iommu->reg + DMAR_FECTL_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
783
/* Program the fault-event MSI data/address registers from @msg. */
void dmar_msi_write(int irq, struct msi_msg *msg)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
	writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
	writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
795
/* Read the fault-event MSI data/address registers back into @msg. */
void dmar_msi_read(int irq, struct msi_msg *msg)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
	msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
	msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
807
/*
 * Log one recorded DMA remapping fault: the requesting device
 * (decoded from @source_id as bus:slot.func), the faulting address
 * and the decoded reason string.  @type selects the "DMA Read" vs
 * "DMA Write" wording.  Always returns 0.
 */
static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
		u8 fault_reason, u16 source_id, u64 addr)
{
	char *reason;

	reason = dmar_get_fault_reason(fault_reason);

	printk(KERN_ERR
		"DMAR:[%s] Request device [%02x:%02x.%d] "
		"fault addr %llx \n"
		"DMAR:[fault reason %02d] %s\n",
		(type ? "DMA Read" : "DMA Write"),
		(source_id >> 8), PCI_SLOT(source_id & 0xFF),
		PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
	return 0;
}
824
825 #define PRIMARY_FAULT_REG_LEN (16)
826 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
827 {
828         struct intel_iommu *iommu = dev_id;
829         int reg, fault_index;
830         u32 fault_status;
831         unsigned long flag;
832
833         spin_lock_irqsave(&iommu->register_lock, flag);
834         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
835
836         /* TBD: ignore advanced fault log currently */
837         if (!(fault_status & DMA_FSTS_PPF))
838                 goto clear_overflow;
839
840         fault_index = dma_fsts_fault_record_index(fault_status);
841         reg = cap_fault_reg_offset(iommu->cap);
842         while (1) {
843                 u8 fault_reason;
844                 u16 source_id;
845                 u64 guest_addr;
846                 int type;
847                 u32 data;
848
849                 /* highest 32 bits */
850                 data = readl(iommu->reg + reg +
851                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
852                 if (!(data & DMA_FRCD_F))
853                         break;
854
855                 fault_reason = dma_frcd_fault_reason(data);
856                 type = dma_frcd_type(data);
857
858                 data = readl(iommu->reg + reg +
859                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
860                 source_id = dma_frcd_source_id(data);
861
862                 guest_addr = dmar_readq(iommu->reg + reg +
863                                 fault_index * PRIMARY_FAULT_REG_LEN);
864                 guest_addr = dma_frcd_page_addr(guest_addr);
865                 /* clear the fault */
866                 writel(DMA_FRCD_F, iommu->reg + reg +
867                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
868
869                 spin_unlock_irqrestore(&iommu->register_lock, flag);
870
871                 iommu_page_fault_do_one(iommu, type, fault_reason,
872                                 source_id, guest_addr);
873
874                 fault_index++;
875                 if (fault_index > cap_num_fault_regs(iommu->cap))
876                         fault_index = 0;
877                 spin_lock_irqsave(&iommu->register_lock, flag);
878         }
879 clear_overflow:
880         /* clear primary fault overflow */
881         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
882         if (fault_status & DMA_FSTS_PFO)
883                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
884
885         spin_unlock_irqrestore(&iommu->register_lock, flag);
886         return IRQ_HANDLED;
887 }
888
889 int dmar_set_interrupt(struct intel_iommu *iommu)
890 {
891         int irq, ret;
892
893         irq = create_irq();
894         if (!irq) {
895                 printk(KERN_ERR "IOMMU: no free vectors\n");
896                 return -EINVAL;
897         }
898
899         set_irq_data(irq, iommu);
900         iommu->irq = irq;
901
902         ret = arch_setup_dmar_msi(irq);
903         if (ret) {
904                 set_irq_data(irq, NULL);
905                 iommu->irq = 0;
906                 destroy_irq(irq);
907                 return 0;
908         }
909
910         /* Force fault register is cleared */
911         iommu_page_fault(irq, iommu);
912
913         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
914         if (ret)
915                 printk(KERN_ERR "IOMMU: can't request irq\n");
916         return ret;
917 }
918
919 static int iommu_init_domains(struct intel_iommu *iommu)
920 {
921         unsigned long ndomains;
922         unsigned long nlongs;
923
924         ndomains = cap_ndoms(iommu->cap);
925         pr_debug("Number of Domains supportd <%ld>\n", ndomains);
926         nlongs = BITS_TO_LONGS(ndomains);
927
928         /* TBD: there might be 64K domains,
929          * consider other allocation for future chip
930          */
931         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
932         if (!iommu->domain_ids) {
933                 printk(KERN_ERR "Allocating domain id array failed\n");
934                 return -ENOMEM;
935         }
936         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
937                         GFP_KERNEL);
938         if (!iommu->domains) {
939                 printk(KERN_ERR "Allocating domain array failed\n");
940                 kfree(iommu->domain_ids);
941                 return -ENOMEM;
942         }
943
944         /*
945          * if Caching mode is set, then invalid translations are tagged
946          * with domainid 0. Hence we need to pre-allocate it.
947          */
948         if (cap_caching_mode(iommu->cap))
949                 set_bit(0, iommu->domain_ids);
950         return 0;
951 }
952
953 static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
954 {
955         struct intel_iommu *iommu;
956         int ret;
957         int map_size;
958         u32 ver;
959
960         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
961         if (!iommu)
962                 return NULL;
963         iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
964         if (!iommu->reg) {
965                 printk(KERN_ERR "IOMMU: can't map the region\n");
966                 goto error;
967         }
968         iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
969         iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
970
971         /* the registers might be more than one page */
972         map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
973                 cap_max_fault_reg_offset(iommu->cap));
974         map_size = PAGE_ALIGN_4K(map_size);
975         if (map_size > PAGE_SIZE_4K) {
976                 iounmap(iommu->reg);
977                 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
978                 if (!iommu->reg) {
979                         printk(KERN_ERR "IOMMU: can't map the region\n");
980                         goto error;
981                 }
982         }
983
984         ver = readl(iommu->reg + DMAR_VER_REG);
985         pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
986                 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
987                 iommu->cap, iommu->ecap);
988         ret = iommu_init_domains(iommu);
989         if (ret)
990                 goto error_unmap;
991         spin_lock_init(&iommu->lock);
992         spin_lock_init(&iommu->register_lock);
993
994         drhd->iommu = iommu;
995         return iommu;
996 error_unmap:
997         iounmap(iommu->reg);
998         iommu->reg = 0;
999 error:
1000         kfree(iommu);
1001         return NULL;
1002 }
1003
1004 static void domain_exit(struct dmar_domain *domain);
/*
 * Tear down an IOMMU created by alloc_iommu(): destroy every domain
 * still bound to it, disable translation, release its fault interrupt
 * and free all per-IOMMU tables plus the mapped register window.
 * Safe to call with a NULL @iommu or one that only partially set up.
 */
static void free_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i;

	if (!iommu)
		return;

	/*
	 * Walk every allocated domain id.  The bit is cleared here (and
	 * again, harmlessly, by iommu_free_domain() inside domain_exit()),
	 * then the next set bit is looked up for the following pass.
	 */
	i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
	for (; i < cap_ndoms(iommu->cap); ) {
		domain = iommu->domains[i];
		clear_bit(i, iommu->domain_ids);
		domain_exit(domain);
		i = find_next_bit(iommu->domain_ids,
			cap_ndoms(iommu->cap), i+1);
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	if (iommu->irq) {
		set_irq_data(iommu->irq, NULL);
		/* This will mask the irq */
		free_irq(iommu->irq, iommu);
		destroy_irq(iommu->irq);
	}

	kfree(iommu->domains);
	kfree(iommu->domain_ids);

	/* free context mapping */
	free_context_table(iommu);

	/* iommu->reg may be unset if ioremap failed during alloc_iommu() */
	if (iommu->reg)
		iounmap(iommu->reg);
	kfree(iommu);
}
1042
1043 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1044 {
1045         unsigned long num;
1046         unsigned long ndomains;
1047         struct dmar_domain *domain;
1048         unsigned long flags;
1049
1050         domain = alloc_domain_mem();
1051         if (!domain)
1052                 return NULL;
1053
1054         ndomains = cap_ndoms(iommu->cap);
1055
1056         spin_lock_irqsave(&iommu->lock, flags);
1057         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1058         if (num >= ndomains) {
1059                 spin_unlock_irqrestore(&iommu->lock, flags);
1060                 free_domain_mem(domain);
1061                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1062                 return NULL;
1063         }
1064
1065         set_bit(num, iommu->domain_ids);
1066         domain->id = num;
1067         domain->iommu = iommu;
1068         iommu->domains[num] = domain;
1069         spin_unlock_irqrestore(&iommu->lock, flags);
1070
1071         return domain;
1072 }
1073
1074 static void iommu_free_domain(struct dmar_domain *domain)
1075 {
1076         unsigned long flags;
1077
1078         spin_lock_irqsave(&domain->iommu->lock, flags);
1079         clear_bit(domain->id, domain->iommu->domain_ids);
1080         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1081 }
1082
1083 static struct iova_domain reserved_iova_list;
1084
1085 static void dmar_init_reserved_ranges(void)
1086 {
1087         struct pci_dev *pdev = NULL;
1088         struct iova *iova;
1089         int i;
1090         u64 addr, size;
1091
1092         init_iova_domain(&reserved_iova_list);
1093
1094         /* IOAPIC ranges shouldn't be accessed by DMA */
1095         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1096                 IOVA_PFN(IOAPIC_RANGE_END));
1097         if (!iova)
1098                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1099
1100         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1101         for_each_pci_dev(pdev) {
1102                 struct resource *r;
1103
1104                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1105                         r = &pdev->resource[i];
1106                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1107                                 continue;
1108                         addr = r->start;
1109                         addr &= PAGE_MASK_4K;
1110                         size = r->end - addr;
1111                         size = PAGE_ALIGN_4K(size);
1112                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1113                                 IOVA_PFN(size + addr) - 1);
1114                         if (!iova)
1115                                 printk(KERN_ERR "Reserve iova failed\n");
1116                 }
1117         }
1118
1119 }
1120
/*
 * Seed a new domain's iova allocator with the globally reserved ranges
 * (IOAPIC window, PCI MMIO) collected by dmar_init_reserved_ranges(),
 * so the allocator can never hand them out for DMA.
 */
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
1125
/*
 * Round a guest address width up to the nearest width the page-table
 * layout can actually express: 12 bits of page offset plus a whole
 * number of 9-bit translation levels, capped at 64 bits.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1139
/*
 * Second-stage initialization of a domain from iommu_alloc_domain():
 * set up its iova allocator, choose an adjusted guest address width
 * (AGAW) that the hardware supports, and allocate the top-level page
 * directory.
 *
 * Returns 0 on success, -ENODEV if no supported AGAW can cover
 * @guest_width, or -ENOMEM if the top pgd cannot be allocated.
 */
static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad);
	spin_lock_init(&domain->mapping_lock);

	/* keep the IOAPIC/PCI-MMIO windows out of this domain's iova space */
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain->iommu;
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;
	INIT_LIST_HEAD(&domain->devices);

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page();
	if (!domain->pgd)
		return -ENOMEM;
	/* flush so the hardware page-walker sees the zeroed pgd */
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
	return 0;
}
1176
1177 static void domain_exit(struct dmar_domain *domain)
1178 {
1179         u64 end;
1180
1181         /* Domain 0 is reserved, so dont process it */
1182         if (!domain)
1183                 return;
1184
1185         domain_remove_dev_info(domain);
1186         /* destroy iovas */
1187         put_iova_domain(&domain->iovad);
1188         end = DOMAIN_MAX_ADDR(domain->gaw);
1189         end = end & (~PAGE_MASK_4K);
1190
1191         /* clear ptes */
1192         dma_pte_clear_range(domain, 0, end);
1193
1194         /* free page tables */
1195         dma_pte_free_pagetable(domain, 0, end);
1196
1197         iommu_free_domain(domain);
1198         free_domain_mem(domain);
1199 }
1200
/*
 * Point the context entry for (@bus, @devfn) at @domain's page tables
 * and mark it present.  An entry that is already present is left alone
 * (the device is assumed to be mapped correctly).
 *
 * Returns 0 on success (including the already-present case) or -ENOMEM
 * if the context-table page cannot be allocated.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
		u8 bus, u8 devfn)
{
	struct context_entry *context;
	struct intel_iommu *iommu = domain->iommu;
	unsigned long flags;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
	BUG_ON(!domain->pgd);
	/* may allocate the context-table page, so called outside the lock */
	context = device_to_context_entry(iommu, bus, devfn);
	if (!context)
		return -ENOMEM;
	spin_lock_irqsave(&iommu->lock, flags);
	if (context_present(*context)) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	context_set_domain_id(*context, domain->id);
	context_set_address_width(*context, domain->agaw);
	context_set_address_root(*context, virt_to_phys(domain->pgd));
	context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(*context);
	context_set_present(*context);
	/* make the new entry visible to the hardware page walker */
	__iommu_flush_cache(iommu, context, sizeof(*context));

	/* it's a non-present to present mapping */
	if (iommu_flush_context_device(iommu, domain->id,
			(((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
		iommu_flush_write_buffer(iommu);
	else
		iommu_flush_iotlb_dsi(iommu, 0, 0);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return 0;
}
1237
1238 static int
1239 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1240 {
1241         int ret;
1242         struct pci_dev *tmp, *parent;
1243
1244         ret = domain_context_mapping_one(domain, pdev->bus->number,
1245                 pdev->devfn);
1246         if (ret)
1247                 return ret;
1248
1249         /* dependent device mapping */
1250         tmp = pci_find_upstream_pcie_bridge(pdev);
1251         if (!tmp)
1252                 return 0;
1253         /* Secondary interface's bus number and devfn 0 */
1254         parent = pdev->bus->self;
1255         while (parent != tmp) {
1256                 ret = domain_context_mapping_one(domain, parent->bus->number,
1257                         parent->devfn);
1258                 if (ret)
1259                         return ret;
1260                 parent = parent->bus->self;
1261         }
1262         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1263                 return domain_context_mapping_one(domain,
1264                         tmp->subordinate->number, 0);
1265         else /* this is a legacy PCI bridge */
1266                 return domain_context_mapping_one(domain,
1267                         tmp->bus->number, tmp->devfn);
1268 }
1269
1270 static int domain_context_mapped(struct dmar_domain *domain,
1271         struct pci_dev *pdev)
1272 {
1273         int ret;
1274         struct pci_dev *tmp, *parent;
1275
1276         ret = device_context_mapped(domain->iommu,
1277                 pdev->bus->number, pdev->devfn);
1278         if (!ret)
1279                 return ret;
1280         /* dependent device mapping */
1281         tmp = pci_find_upstream_pcie_bridge(pdev);
1282         if (!tmp)
1283                 return ret;
1284         /* Secondary interface's bus number and devfn 0 */
1285         parent = pdev->bus->self;
1286         while (parent != tmp) {
1287                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1288                         parent->devfn);
1289                 if (!ret)
1290                         return ret;
1291                 parent = parent->bus->self;
1292         }
1293         if (tmp->is_pcie)
1294                 return device_context_mapped(domain->iommu,
1295                         tmp->subordinate->number, 0);
1296         else
1297                 return device_context_mapped(domain->iommu,
1298                         tmp->bus->number, tmp->devfn);
1299 }
1300
1301 static int
1302 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1303                         u64 hpa, size_t size, int prot)
1304 {
1305         u64 start_pfn, end_pfn;
1306         struct dma_pte *pte;
1307         int index;
1308
1309         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1310                 return -EINVAL;
1311         iova &= PAGE_MASK_4K;
1312         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1313         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1314         index = 0;
1315         while (start_pfn < end_pfn) {
1316                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1317                 if (!pte)
1318                         return -ENOMEM;
1319                 /* We don't need lock here, nobody else
1320                  * touches the iova range
1321                  */
1322                 BUG_ON(dma_pte_addr(*pte));
1323                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1324                 dma_set_pte_prot(*pte, prot);
1325                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1326                 start_pfn++;
1327                 index++;
1328         }
1329         return 0;
1330 }
1331
1332 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1333 {
1334         clear_context_table(domain->iommu, bus, devfn);
1335         iommu_flush_context_global(domain->iommu, 0);
1336         iommu_flush_iotlb_global(domain->iommu, 0);
1337 }
1338
/*
 * Unbind every device from @domain: remove each device_domain_info
 * from the per-domain and global lists, clear its context entry and
 * free it.  The lock is dropped around detach_domain_for_dev() because
 * it issues (potentially slow) hardware invalidations; list_empty() is
 * re-evaluated after re-acquiring the lock.
 */
static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		/* drop the domain cached in pdev->sysdata (see find_domain) */
		if (info->dev)
			info->dev->sysdata = NULL;
		spin_unlock_irqrestore(&device_domain_lock, flags);

		detach_domain_for_dev(info->domain, info->bus, info->devfn);
		free_devinfo_mem(info);

		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
1361
1362 /*
1363  * find_domain
1364  * Note: we use struct pci_dev->sysdata stores the info
1365  */
1366 struct dmar_domain *
1367 find_domain(struct pci_dev *pdev)
1368 {
1369         struct device_domain_info *info;
1370
1371         /* No lock here, assumes no domain exit in normal case */
1372         info = pdev->sysdata;
1373         if (info)
1374                 return info->domain;
1375         return NULL;
1376 }
1377
1378 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1379      struct pci_dev *dev)
1380 {
1381         int index;
1382
1383         while (dev) {
1384                 for (index = 0; index < cnt; index ++)
1385                         if (dev == devices[index])
1386                                 return 1;
1387
1388                 /* Check our parent */
1389                 dev = dev->bus->self;
1390         }
1391
1392         return 0;
1393 }
1394
1395 static struct dmar_drhd_unit *
1396 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1397 {
1398         struct dmar_drhd_unit *drhd = NULL;
1399
1400         list_for_each_entry(drhd, &dmar_drhd_units, list) {
1401                 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1402                                                 drhd->devices_cnt, dev))
1403                         return drhd;
1404         }
1405
1406         return NULL;
1407 }
1408
/* domain is initialized */
/*
 * Find or create the dmar_domain @pdev should use, with @gaw bits of
 * guest address space.
 *
 * Devices behind a PCIe-to-PCI bridge are indistinguishable to the
 * IOMMU, so they all share the bridge's domain.  The result is cached
 * in pdev->sysdata for find_domain().  Runs without a lock on the fast
 * path, so every slow-path insertion re-checks under
 * device_domain_lock for a concurrent winner.  Returns NULL on failure.
 */
static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
{
	struct dmar_domain *domain, *found = NULL;
	struct intel_iommu *iommu;
	struct dmar_drhd_unit *drhd;
	struct device_domain_info *info, *tmp;
	struct pci_dev *dev_tmp;
	unsigned long flags;
	int bus = 0, devfn = 0;

	/* fast path: domain already cached on the device */
	domain = find_domain(pdev);
	if (domain)
		return domain;

	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
	if (dev_tmp) {
		if (dev_tmp->is_pcie) {
			/* PCIe-to-PCI bridge: keyed by secondary bus, devfn 0 */
			bus = dev_tmp->subordinate->number;
			devfn = 0;
		} else {
			bus = dev_tmp->bus->number;
			devfn = dev_tmp->devfn;
		}
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(info, &device_domain_list, global) {
			if (info->bus == bus && info->devfn == devfn) {
				found = info->domain;
				break;
			}
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		/* pcie-pci bridge already has a domain, uses it */
		if (found) {
			domain = found;
			goto found_domain;
		}
	}

	/* Allocate new domain for the device */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd) {
		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
			pci_name(pdev));
		return NULL;
	}
	iommu = drhd->iommu;

	domain = iommu_alloc_domain(iommu);
	if (!domain)
		goto error;

	if (domain_init(domain, gaw)) {
		domain_exit(domain);
		goto error;
	}

	/* register pcie-to-pci device */
	if (dev_tmp) {
		info = alloc_devinfo_mem();
		if (!info) {
			domain_exit(domain);
			goto error;
		}
		info->bus = bus;
		info->devfn = devfn;
		/* the bridge entry stands for the bridge, not a real device */
		info->dev = NULL;
		info->domain = domain;
		/* This domain is shared by devices under p2p bridge */
		domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;

		/* pcie-to-pci bridge already has a domain, uses it */
		found = NULL;
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(tmp, &device_domain_list, global) {
			if (tmp->bus == bus && tmp->devfn == devfn) {
				found = tmp->domain;
				break;
			}
		}
		if (found) {
			/* lost the race: adopt the winner's domain */
			free_devinfo_mem(info);
			domain_exit(domain);
			domain = found;
		} else {
			list_add(&info->link, &domain->devices);
			list_add(&info->global, &device_domain_list);
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
	}

found_domain:
	info = alloc_devinfo_mem();
	if (!info)
		goto error;
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;
	spin_lock_irqsave(&device_domain_lock, flags);
	/* somebody is fast */
	found = find_domain(pdev);
	if (found != NULL) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		if (found != domain) {
			domain_exit(domain);
			domain = found;
		}
		free_devinfo_mem(info);
		return domain;
	}
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	/* cache the binding for future find_domain() calls */
	pdev->sysdata = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);
	return domain;
error:
	/* recheck it here, maybe others set it */
	return find_domain(pdev);
}
1529
/*
 * Set up a 1:1 (iova == physical) mapping of [@start, @end) for @pdev:
 * reserve the range in the device's domain, clear any stale PTEs (an
 * RMRR may overlap real memory), install read/write mappings and make
 * sure the context entry points at the domain.  Used for RMRRs and the
 * gfx/ISA work-arounds.  Returns 0 on success; on failure the whole
 * domain is torn down and a negative errno is returned.
 */
static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
{
	struct dmar_domain *domain;
	unsigned long size;
	u64 base;
	int ret;

	printk(KERN_INFO
		"IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
		pci_name(pdev), start, end);
	/* page table init */
	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;

	/* The address might not be aligned */
	base = start & PAGE_MASK_4K;
	size = end - base;
	size = PAGE_ALIGN_4K(size);
	if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
			IOVA_PFN(base + size) - 1)) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		ret = -ENOMEM;
		goto error;
	}

	pr_debug("Mapping reserved region %lx@%llx for %s\n",
		size, base, pci_name(pdev));
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, base, base + size);

	ret = domain_page_mapping(domain, base, base, size,
		DMA_PTE_READ|DMA_PTE_WRITE);
	if (ret)
		goto error;

	/* context entry init */
	ret = domain_context_mapping(domain, pdev);
	if (!ret)
		return 0;
error:
	domain_exit(domain);
	return ret;

}
1578
/*
 * Identity-map an RMRR (reserved memory region) for @pdev so firmware-
 * initiated DMA keeps working once translation is enabled.  Devices
 * parked on the dummy domain are not translated and need nothing.
 */
static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
	struct pci_dev *pdev)
{
	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
		return 0;
	/* RMRR end_address is inclusive; the identity map takes an
	 * exclusive end, hence the +1 */
	return iommu_prepare_identity_map(pdev, rmrr->base_address,
		rmrr->end_address + 1);
}
1587
#ifdef CONFIG_DMAR_GFX_WA
extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
/*
 * Work-around for graphics devices that DMA to physical addresses
 * without using the DMA API: identity-map every RAM range for each
 * display-class device.  RAM is enumerated with arch_get_ram_range(),
 * which returns the next slot index or a negative value when done.
 */
static void __init iommu_prepare_gfx_mapping(void)
{
	struct pci_dev *pdev = NULL;
	u64 base, size;
	int slot;
	int ret;

	for_each_pci_dev(pdev) {
		if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO ||
				!IS_GFX_DEVICE(pdev))
			continue;
		printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
			pci_name(pdev));
		slot = arch_get_ram_range(0, &base, &size);
		while (slot >= 0) {
			ret = iommu_prepare_identity_map(pdev,
					base, base + size);
			if (ret)
				goto error;
			slot = arch_get_ram_range(slot, &base, &size);
		}
		/* all RAM ranges mapped; next gfx device */
		continue;
error:
		/* report the failure but keep going with other devices */
		printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
	}
}
#endif
1617
#ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * Work-around for the x86 floppy driver, which DMAs to low memory
 * without going through the DMA API: identity-map the first 16MB for
 * the ISA/LPC bridge so the floppy keeps working behind the IOMMU.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);

	if (ret)
		/* message said "0-64M" but the map above covers 0-16M;
		 * also give the printk an explicit severity */
		printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
			"floppy might not work\n");

}
#else
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_DMAR_FLPY_WA */
1642
/*
 * Bring up every DMA-remapping hardware unit: allocate the per-IOMMU
 * state and root tables, pre-map RMRRs and the gfx/ISA work-around
 * ranges, then arm fault reporting and enable translation.  On any
 * failure every initialized IOMMU is torn down and the error returned.
 */
int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	struct pci_dev *pdev;
	struct intel_iommu *iommu;
	int ret, unit = 0;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = alloc_iommu(drhd);
		if (!iommu) {
			ret = -ENOMEM;
			goto error;
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * amoung all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret) {
			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
			goto error;
		}
	}

	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *           allocate and init context
	 *           set present in root table for this bus
	 *     init context with domain, translation etc
	 *    endfor
	 * endfor
	 */
	for_each_rmrr_units(rmrr) {
		int i;
		for (i = 0; i < rmrr->devices_cnt; i++) {
			pdev = rmrr->devices[i];
			/* some BIOS lists non-exist devices in DMAR table */
			if (!pdev)
				continue;
			/* an RMRR failure is logged but not fatal */
			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
			if (ret)
				printk(KERN_ERR
				 "IOMMU: mapping reserved region failed\n");
		}
	}

	iommu_prepare_gfx_mapping();

	iommu_prepare_isa();

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;
		sprintf (iommu->name, "dmar%d", unit++);

		iommu_flush_write_buffer(iommu);

		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto error;

		iommu_set_root_entry(iommu);

		/* start from a clean slate before enabling translation */
		iommu_flush_context_global(iommu, 0);
		iommu_flush_iotlb_global(iommu, 0);

		ret = iommu_enable_translation(iommu);
		if (ret)
			goto error;
	}

	return 0;
error:
	/* free_iommu() tolerates units that were never allocated (NULL) */
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;
		free_iommu(iommu);
	}
	return ret;
}
1749
1750 static inline u64 aligned_size(u64 host_addr, size_t size)
1751 {
1752         u64 addr;
1753         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1754         return PAGE_ALIGN_4K(addr);
1755 }
1756
1757 struct iova *
1758 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1759 {
1760         struct iova *piova;
1761
1762         /* Make sure it's in range */
1763         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1764         if (!size || (IOVA_START_ADDR + size > end))
1765                 return NULL;
1766
1767         piova = alloc_iova(&domain->iovad,
1768                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1769         return piova;
1770 }
1771
1772 static struct iova *
1773 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1774                 size_t size)
1775 {
1776         struct pci_dev *pdev = to_pci_dev(dev);
1777         struct iova *iova = NULL;
1778
1779         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1780                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1781         } else  {
1782                 /*
1783                  * First try to allocate an io virtual address in
1784                  * DMA_32BIT_MASK and if that fails then try allocating
1785                  * from higer range
1786                  */
1787                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1788                 if (!iova)
1789                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1790         }
1791
1792         if (!iova) {
1793                 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
1794                 return NULL;
1795         }
1796
1797         return iova;
1798 }
1799
1800 static struct dmar_domain *
1801 get_valid_domain_for_dev(struct pci_dev *pdev)
1802 {
1803         struct dmar_domain *domain;
1804         int ret;
1805
1806         domain = get_domain_for_dev(pdev,
1807                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1808         if (!domain) {
1809                 printk(KERN_ERR
1810                         "Allocating domain for %s failed", pci_name(pdev));
1811                 return 0;
1812         }
1813
1814         /* make sure context mapping is ok */
1815         if (unlikely(!domain_context_mapped(domain, pdev))) {
1816                 ret = domain_context_mapping(domain, pdev);
1817                 if (ret) {
1818                         printk(KERN_ERR
1819                                 "Domain context map for %s failed",
1820                                 pci_name(pdev));
1821                         return 0;
1822                 }
1823         }
1824
1825         return domain;
1826 }
1827
/*
 * Map a single contiguous kernel buffer for DMA by @hwdev.
 * Allocates an iova range covering the (page-aligned) buffer, installs
 * the page-table entries with read/write protection derived from @dir,
 * and flushes the IOTLB for the new mapping.  Returns the bus address
 * of the first byte, or 0 on failure.
 */
static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
	size_t size, int dir)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	int ret;
	struct dmar_domain *domain;
	unsigned long start_addr;
	struct iova *iova;
	int prot = 0;

	BUG_ON(dir == DMA_NONE);
	/* devices bypassing translation get an identity (phys == bus) map */
	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
		return virt_to_bus(addr);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	/* from here on, addr holds the physical address, not the virtual */
	addr = (void *)virt_to_phys(addr);
	size = aligned_size((u64)addr, size);

	/* iova is NULL here on the goto, so the error path skips the free */
	iova = __intel_alloc_iova(hwdev, domain, size);
	if (!iova)
		goto error;

	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(domain->iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * addr - (addr + size) might be partial page, we should map the whole
	 * page.  Note: if two part of one page are separately mapped, we
	 * might have two guest_addr mapping to the same host addr, but this
	 * is not a big problem
	 */
	ret = domain_page_mapping(domain, start_addr,
		((u64)addr) & PAGE_MASK_4K, size, prot);
	if (ret)
		goto error;

	pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
		pci_name(pdev), size, (u64)addr,
		size, (u64)start_addr, dir);

	/* it's a non-present to present mapping */
	ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
			start_addr, size >> PAGE_SHIFT_4K, 1);
	/* psi flush not supported by this hardware: flush write buffer instead */
	if (ret)
		iommu_flush_write_buffer(domain->iommu);

	/* bus address = iova base + offset of addr within its page */
	return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
		pci_name(pdev), size, (u64)addr, dir);
	return 0;
}
1894
/*
 * Tear down a mapping created by intel_map_single(): clear the PTEs,
 * free the page-table pages, flush the IOTLB for the range, and release
 * the iova.  A dummy (non-translated) device has nothing to undo.
 */
static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
	size_t size, int dir)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	unsigned long start_addr;
	struct iova *iova;

	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
		return;
	domain = find_domain(pdev);
	BUG_ON(!domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	/* unmapping something that was never mapped: silently ignore */
	if (!iova)
		return;

	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
	/* grow to whole pages, mirroring what intel_map_single() mapped */
	size = aligned_size((u64)dev_addr, size);

	pr_debug("Device %s unmapping: %lx@%llx\n",
		pci_name(pdev), size, (u64)start_addr);

	/*  clear the whole page */
	dma_pte_clear_range(domain, start_addr, start_addr + size);
	/* free page tables */
	dma_pte_free_pagetable(domain, start_addr, start_addr + size);

	/* psi flush unsupported: fall back to flushing the write buffer */
	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
			size >> PAGE_SHIFT_4K, 0))
		iommu_flush_write_buffer(domain->iommu);

	/* free iova */
	__free_iova(&domain->iovad, iova);
}
1930
1931 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1932                        dma_addr_t *dma_handle, gfp_t flags)
1933 {
1934         void *vaddr;
1935         int order;
1936
1937         size = PAGE_ALIGN_4K(size);
1938         order = get_order(size);
1939         flags &= ~(GFP_DMA | GFP_DMA32);
1940
1941         vaddr = (void *)__get_free_pages(flags, order);
1942         if (!vaddr)
1943                 return NULL;
1944         memset(vaddr, 0, size);
1945
1946         *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
1947         if (*dma_handle)
1948                 return vaddr;
1949         free_pages((unsigned long)vaddr, order);
1950         return NULL;
1951 }
1952
1953 static void intel_free_coherent(struct device *hwdev, size_t size,
1954         void *vaddr, dma_addr_t dma_handle)
1955 {
1956         int order;
1957
1958         size = PAGE_ALIGN_4K(size);
1959         order = get_order(size);
1960
1961         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
1962         free_pages((unsigned long)vaddr, order);
1963 }
1964
/* kernel virtual address of the start of a scatterlist element */
#define SG_ENT_VIRT_ADDRESS(sg) (page_address((sg)->page) + (sg)->offset)
/*
 * Tear down a scatterlist mapping made by intel_map_sg().  The list was
 * mapped into one contiguous iova range, so its total (page-aligned)
 * size is recomputed from the elements, then the whole range is cleared,
 * flushed, and the iova released in a single pass.
 */
static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sg,
	int nelems, int dir)
{
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_addr;
	struct iova *iova;
	size_t size = 0;
	void *addr;

	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
		return;

	/* NOTE(review): no BUG_ON(!domain) here, unlike intel_unmap_single —
	 * confirm whether a NULL domain is possible on this path */
	domain = find_domain(pdev);

	/* the first element's dma_address is the base of the whole range */
	iova = find_iova(&domain->iovad, IOVA_PFN(sg[0].dma_address));
	if (!iova)
		return;
	/* recompute the total mapped size exactly as intel_map_sg() did */
	for (i = 0; i < nelems; i++, sg++) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		size += aligned_size((u64)addr, sg->length);
	}

	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;

	/*  clear the whole page */
	dma_pte_clear_range(domain, start_addr, start_addr + size);
	/* free page tables */
	dma_pte_free_pagetable(domain, start_addr, start_addr + size);

	/* psi flush unsupported: fall back to flushing the write buffer */
	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
			size >> PAGE_SHIFT_4K, 0))
		iommu_flush_write_buffer(domain->iommu);

	/* free iova */
	__free_iova(&domain->iovad, iova);
}
2004
2005 static int intel_nontranslate_map_sg(struct device *hddev,
2006         struct scatterlist *sg, int nelems, int dir)
2007 {
2008         int i;
2009
2010         for (i = 0; i < nelems; i++) {
2011                 struct scatterlist *s = &sg[i];
2012                 BUG_ON(!s->page);
2013                 s->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(s));
2014                 s->dma_length = s->length;
2015         }
2016         return nelems;
2017 }
2018
/*
 * Map a scatterlist for DMA by @hwdev into one contiguous iova range.
 * Pass 1 sums the page-aligned sizes of all elements to size a single
 * iova allocation; pass 2 maps each element at increasing offsets and
 * fills in its dma_address/dma_length.  Returns the number of elements
 * mapped, or 0 on failure (with any partial mapping rolled back).
 */
static int intel_map_sg(struct device *hwdev, struct scatterlist *sg,
	int nelems, int dir)
{
	void *addr;
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	size_t offset = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *orig_sg = sg;
	unsigned long start_addr;

	BUG_ON(dir == DMA_NONE);
	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
		return intel_nontranslate_map_sg(hwdev, sg, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	/* pass 1: total page-aligned size of the whole list */
	for (i = 0; i < nelems; i++, sg++) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		addr = (void *)virt_to_phys(addr);
		size += aligned_size((u64)addr, sg->length);
	}

	iova = __intel_alloc_iova(hwdev, domain, size);
	if (!iova) {
		/* signal failure to callers that inspect the first element */
		orig_sg->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(domain->iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
	offset = 0;
	sg = orig_sg;
	/* pass 2: map each element at start_addr + running offset */
	for (i = 0; i < nelems; i++, sg++) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		addr = (void *)virt_to_phys(addr);
		size = aligned_size((u64)addr, sg->length);
		ret = domain_page_mapping(domain, start_addr + offset,
			((u64)addr) & PAGE_MASK_4K,
			size, prot);
		if (ret) {
			/* roll back everything mapped so far ([start, offset)) */
			/*  clear the page */
			dma_pte_clear_range(domain, start_addr,
				  start_addr + offset);
			/* free page tables */
			dma_pte_free_pagetable(domain, start_addr,
				  start_addr + offset);
			/* free iova */
			__free_iova(&domain->iovad, iova);
			return 0;
		}
		sg->dma_address = start_addr + offset +
				((u64)addr & (~PAGE_MASK_4K));
		sg->dma_length = sg->length;
		offset += size;
	}

	/* it's a non-present to present mapping */
	if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
			start_addr, offset >> PAGE_SHIFT_4K, 1))
		iommu_flush_write_buffer(domain->iommu);
	return nelems;
}
2097
/* DMA operations installed into the global dma_ops by intel_iommu_init() */
static struct dma_mapping_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_single = intel_map_single,
	.unmap_single = intel_unmap_single,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
};
2106
2107 static inline int iommu_domain_cache_init(void)
2108 {
2109         int ret = 0;
2110
2111         iommu_domain_cache = kmem_cache_create("iommu_domain",
2112                                          sizeof(struct dmar_domain),
2113                                          0,
2114                                          SLAB_HWCACHE_ALIGN,
2115
2116                                          NULL);
2117         if (!iommu_domain_cache) {
2118                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2119                 ret = -ENOMEM;
2120         }
2121
2122         return ret;
2123 }
2124
2125 static inline int iommu_devinfo_cache_init(void)
2126 {
2127         int ret = 0;
2128
2129         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2130                                          sizeof(struct device_domain_info),
2131                                          0,
2132                                          SLAB_HWCACHE_ALIGN,
2133
2134                                          NULL);
2135         if (!iommu_devinfo_cache) {
2136                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2137                 ret = -ENOMEM;
2138         }
2139
2140         return ret;
2141 }
2142
2143 static inline int iommu_iova_cache_init(void)
2144 {
2145         int ret = 0;
2146
2147         iommu_iova_cache = kmem_cache_create("iommu_iova",
2148                                          sizeof(struct iova),
2149                                          0,
2150                                          SLAB_HWCACHE_ALIGN,
2151
2152                                          NULL);
2153         if (!iommu_iova_cache) {
2154                 printk(KERN_ERR "Couldn't create iova cache\n");
2155                 ret = -ENOMEM;
2156         }
2157
2158         return ret;
2159 }
2160
2161 static int __init iommu_init_mempool(void)
2162 {
2163         int ret;
2164         ret = iommu_iova_cache_init();
2165         if (ret)
2166                 return ret;
2167
2168         ret = iommu_domain_cache_init();
2169         if (ret)
2170                 goto domain_error;
2171
2172         ret = iommu_devinfo_cache_init();
2173         if (!ret)
2174                 return ret;
2175
2176         kmem_cache_destroy(iommu_domain_cache);
2177 domain_error:
2178         kmem_cache_destroy(iommu_iova_cache);
2179
2180         return -ENOMEM;
2181 }
2182
/* Destroy the slab caches created by iommu_init_mempool(). */
static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);

}
2190
2191 void __init detect_intel_iommu(void)
2192 {
2193         if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2194                 return;
2195         if (early_dmar_detect()) {
2196                 iommu_detected = 1;
2197         }
2198 }
2199
/*
 * Mark DMAR units that need no translation as ignored.  Pass 1 ignores
 * units whose device list turned out empty.  Pass 2 (only when gfx
 * mapping is disabled) ignores units that serve graphics devices
 * exclusively, flagging those devices to bypass translation.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			int i;
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
					break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	/* gfx devices are to be translated too: nothing more to ignore */
	if (dmar_map_gfx)
		return;

	for_each_drhd_unit(drhd) {
		int i;
		if (drhd->ignored || drhd->include_all)
			continue;

		/* look for any non-graphics device behind this unit */
		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
				!IS_GFX_DEVICE(drhd->devices[i]))
				break;

		if (i < drhd->devices_cnt)
			continue;

		/* bypass IOMMU if it is just for gfx devices */
		drhd->ignored = 1;
		for (i = 0; i < drhd->devices_cnt; i++) {
			if (!drhd->devices[i])
				continue;
			drhd->devices[i]->sysdata = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
2241
2242 int __init intel_iommu_init(void)
2243 {
2244         int ret = 0;
2245
2246         if (no_iommu || swiotlb || dmar_disabled)
2247                 return -ENODEV;
2248
2249         if (dmar_table_init())
2250                 return  -ENODEV;
2251
2252         iommu_init_mempool();
2253         dmar_init_reserved_ranges();
2254
2255         init_no_remapping_devices();
2256
2257         ret = init_dmars();
2258         if (ret) {
2259                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2260                 put_iova_domain(&reserved_iova_list);
2261                 iommu_exit_mempool();
2262                 return ret;
2263         }
2264         printk(KERN_INFO
2265         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2266
2267         force_iommu = 1;
2268         dma_ops = &intel_dma_ops;
2269         return 0;
2270 }
2271