drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18  * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19  * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
20  */
21
22 #include <linux/init.h>
23 #include <linux/bitmap.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/sysdev.h>
28 #include <linux/spinlock.h>
29 #include <linux/pci.h>
30 #include <linux/dmar.h>
31 #include <linux/dma-mapping.h>
32 #include <linux/mempool.h>
33 #include "iova.h"
34 #include "intel-iommu.h"
35 #include <asm/proto.h> /* force_iommu is in this header on x86-64 */
36 #include <asm/cacheflush.h>
37 #include <asm/gart.h>
38 #include "pci.h"
39
40 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
41 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
42
43 #define IOAPIC_RANGE_START      (0xfee00000)
44 #define IOAPIC_RANGE_END        (0xfeefffff)
45 #define IOVA_START_ADDR         (0x1000)
46
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
48
49 #define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */
50
51 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
52
53 static void domain_remove_dev_info(struct dmar_domain *domain);
54
55 static int dmar_disabled;
56 static int __initdata dmar_map_gfx = 1;
57 static int dmar_forcedac;
58
59 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
60 static DEFINE_SPINLOCK(device_domain_lock);
61 static LIST_HEAD(device_domain_list);
62
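/*
 * Early parameter handler for "intel_iommu=": "off" disables DMA remapping
 * entirely, "igfx_off" leaves graphics devices unmapped, and "forcedac"
 * forces use of 64-bit (dual address cycle) DMA addresses.
 */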
63 static int __init intel_iommu_setup(char *str)
64 {
65         if (!str)
66                 return -EINVAL;
67         while (*str) {
68                 if (!strncmp(str, "off", 3)) {
69                         dmar_disabled = 1;
70                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
71                 } else if (!strncmp(str, "igfx_off", 8)) {
72                         dmar_map_gfx = 0;
73                         printk(KERN_INFO
74                                 "Intel-IOMMU: disable GFX device mapping\n");
75                 } else if (!strncmp(str, "forcedac", 8)) {
76                         printk (KERN_INFO
77                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
78                         dmar_forcedac = 1;
79                 }
80
81                 str += strcspn(str, ",");
82                 while (*str == ',')
83                         str++;
84         }
85         return 0;
86 }
87 __setup("intel_iommu=", intel_iommu_setup);
88
89 static struct kmem_cache *iommu_domain_cache;
90 static struct kmem_cache *iommu_devinfo_cache;
91 static struct kmem_cache *iommu_iova_cache;
92
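/*
 * Allocate from a kmem cache with PF_MEMALLOC temporarily set so the
 * allocation may dip into the emergency reserves; the caller's original
 * flag is restored afterwards.  GFP_ATOMIC keeps this safe in atomic
 * context.
 */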
93 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
94 {
95         unsigned int flags;
96         void *vaddr;
97
98         /* trying to avoid low memory issues */
99         flags = current->flags & PF_MEMALLOC;
100         current->flags |= PF_MEMALLOC;
101         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
102         current->flags &= (~PF_MEMALLOC | flags);
103         return vaddr;
104 }
105
106
107 static inline void *alloc_pgtable_page(void)
108 {
109         unsigned int flags;
110         void *vaddr;
111
112         /* trying to avoid low memory issues */
113         flags = current->flags & PF_MEMALLOC;
114         current->flags |= PF_MEMALLOC;
115         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
116         current->flags &= (~PF_MEMALLOC | flags);
117         return vaddr;
118 }
119
120 static inline void free_pgtable_page(void *vaddr)
121 {
122         free_page((unsigned long)vaddr);
123 }
124
125 static inline void *alloc_domain_mem(void)
126 {
127         return iommu_kmem_cache_alloc(iommu_domain_cache);
128 }
129
130 static inline void free_domain_mem(void *vaddr)
131 {
132         kmem_cache_free(iommu_domain_cache, vaddr);
133 }
134
135 static inline void * alloc_devinfo_mem(void)
136 {
137         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
138 }
139
140 static inline void free_devinfo_mem(void *vaddr)
141 {
142         kmem_cache_free(iommu_devinfo_cache, vaddr);
143 }
144
145 struct iova *alloc_iova_mem(void)
146 {
147         return iommu_kmem_cache_alloc(iommu_iova_cache);
148 }
149
150 void free_iova_mem(struct iova *iova)
151 {
152         kmem_cache_free(iommu_iova_cache, iova);
153 }
154
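/*
 * If the IOMMU does not snoop CPU caches (ecap_coherent() is false), CPU
 * writes to in-memory translation structures must be flushed with clflush
 * before the hardware walks them.
 */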
155 static inline void __iommu_flush_cache(
156         struct intel_iommu *iommu, void *addr, int size)
157 {
158         if (!ecap_coherent(iommu->ecap))
159                 clflush_cache_range(addr, size);
160 }
161
162 /* Gets context entry for a given bus and devfn */
163 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
164                 u8 bus, u8 devfn)
165 {
166         struct root_entry *root;
167         struct context_entry *context;
168         unsigned long phy_addr;
169         unsigned long flags;
170
171         spin_lock_irqsave(&iommu->lock, flags);
172         root = &iommu->root_entry[bus];
173         context = get_context_addr_from_root(root);
174         if (!context) {
175                 context = (struct context_entry *)alloc_pgtable_page();
176                 if (!context) {
177                         spin_unlock_irqrestore(&iommu->lock, flags);
178                         return NULL;
179                 }
180                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
181                 phy_addr = virt_to_phys((void *)context);
182                 set_root_value(root, phy_addr);
183                 set_root_present(root);
184                 __iommu_flush_cache(iommu, root, sizeof(*root));
185         }
186         spin_unlock_irqrestore(&iommu->lock, flags);
187         return &context[devfn];
188 }
189
190 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
191 {
192         struct root_entry *root;
193         struct context_entry *context;
194         int ret;
195         unsigned long flags;
196
197         spin_lock_irqsave(&iommu->lock, flags);
198         root = &iommu->root_entry[bus];
199         context = get_context_addr_from_root(root);
200         if (!context) {
201                 ret = 0;
202                 goto out;
203         }
204         ret = context_present(context[devfn]);
205 out:
206         spin_unlock_irqrestore(&iommu->lock, flags);
207         return ret;
208 }
209
210 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
211 {
212         struct root_entry *root;
213         struct context_entry *context;
214         unsigned long flags;
215
216         spin_lock_irqsave(&iommu->lock, flags);
217         root = &iommu->root_entry[bus];
218         context = get_context_addr_from_root(root);
219         if (context) {
220                 context_clear_entry(context[devfn]);
221                 __iommu_flush_cache(iommu, &context[devfn],
222                         sizeof(*context));
223         }
224         spin_unlock_irqrestore(&iommu->lock, flags);
225 }
226
227 static void free_context_table(struct intel_iommu *iommu)
228 {
229         struct root_entry *root;
230         int i;
231         unsigned long flags;
232         struct context_entry *context;
233
234         spin_lock_irqsave(&iommu->lock, flags);
235         if (!iommu->root_entry) {
236                 goto out;
237         }
238         for (i = 0; i < ROOT_ENTRY_NR; i++) {
239                 root = &iommu->root_entry[i];
240                 context = get_context_addr_from_root(root);
241                 if (context)
242                         free_pgtable_page(context);
243         }
244         free_pgtable_page(iommu->root_entry);
245         iommu->root_entry = NULL;
246 out:
247         spin_unlock_irqrestore(&iommu->lock, flags);
248 }
249
250 /* page table handling */
251 #define LEVEL_STRIDE            (9)
252 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
253
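/*
 * Page-table geometry: each level translates LEVEL_STRIDE (9) bits of the
 * address and a 4K page covers the low 12 bits, so an adjusted guest
 * address width (agaw) of 0 corresponds to a 30-bit, 2-level table; each
 * additional agaw step adds one level and 9 bits of width.
 */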
254 static inline int agaw_to_level(int agaw)
255 {
256         return agaw + 2;
257 }
258
259 static inline int agaw_to_width(int agaw)
260 {
261         return 30 + agaw * LEVEL_STRIDE;
262
263 }
264
265 static inline int width_to_agaw(int width)
266 {
267         return (width - 30) / LEVEL_STRIDE;
268 }
269
270 static inline unsigned int level_to_offset_bits(int level)
271 {
272         return (12 + (level - 1) * LEVEL_STRIDE);
273 }
274
275 static inline int address_level_offset(u64 addr, int level)
276 {
277         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
278 }
279
280 static inline u64 level_mask(int level)
281 {
282         return ((u64)-1 << level_to_offset_bits(level));
283 }
284
285 static inline u64 level_size(int level)
286 {
287         return ((u64)1 << level_to_offset_bits(level));
288 }
289
290 static inline u64 align_to_level(u64 addr, int level)
291 {
292         return ((addr + level_size(level) - 1) & level_mask(level));
293 }
294
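/*
 * Walk the page table for addr, allocating any missing intermediate
 * levels on the way down, and return a pointer to the last-level (4K)
 * pte.  Returns NULL if a page-table page cannot be allocated.
 */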
295 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
296 {
297         int addr_width = agaw_to_width(domain->agaw);
298         struct dma_pte *parent, *pte = NULL;
299         int level = agaw_to_level(domain->agaw);
300         int offset;
301         unsigned long flags;
302
303         BUG_ON(!domain->pgd);
304
305         addr &= (((u64)1) << addr_width) - 1;
306         parent = domain->pgd;
307
308         spin_lock_irqsave(&domain->mapping_lock, flags);
309         while (level > 0) {
310                 void *tmp_page;
311
312                 offset = address_level_offset(addr, level);
313                 pte = &parent[offset];
314                 if (level == 1)
315                         break;
316
317                 if (!dma_pte_present(*pte)) {
318                         tmp_page = alloc_pgtable_page();
319
320                         if (!tmp_page) {
321                                 spin_unlock_irqrestore(&domain->mapping_lock,
322                                         flags);
323                                 return NULL;
324                         }
325                         __iommu_flush_cache(domain->iommu, tmp_page,
326                                         PAGE_SIZE_4K);
327                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
328                         /*
329                          * upper-level tables always set r/w; the last-level
330                          * page table controls the actual read/write access
331                          */
332                         dma_set_pte_readable(*pte);
333                         dma_set_pte_writable(*pte);
334                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
335                 }
336                 parent = phys_to_virt(dma_pte_addr(*pte));
337                 level--;
338         }
339
340         spin_unlock_irqrestore(&domain->mapping_lock, flags);
341         return pte;
342 }
343
344 /* return address's pte at specific level */
345 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
346                 int level)
347 {
348         struct dma_pte *parent, *pte = NULL;
349         int total = agaw_to_level(domain->agaw);
350         int offset;
351
352         parent = domain->pgd;
353         while (level <= total) {
354                 offset = address_level_offset(addr, total);
355                 pte = &parent[offset];
356                 if (level == total)
357                         return pte;
358
359                 if (!dma_pte_present(*pte))
360                         break;
361                 parent = phys_to_virt(dma_pte_addr(*pte));
362                 total--;
363         }
364         return NULL;
365 }
366
367 /* clear one page's page table */
368 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
369 {
370         struct dma_pte *pte = NULL;
371
372         /* get last level pte */
373         pte = dma_addr_level_pte(domain, addr, 1);
374
375         if (pte) {
376                 dma_clear_pte(*pte);
377                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
378         }
379 }
380
381 /* clear the last-level ptes; a tlb flush should follow */
382 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
383 {
384         int addr_width = agaw_to_width(domain->agaw);
385
386         start &= (((u64)1) << addr_width) - 1;
387         end &= (((u64)1) << addr_width) - 1;
388         /* in case it's a partial page */
389         start = PAGE_ALIGN_4K(start);
390         end &= PAGE_MASK_4K;
391
392         /* we don't need lock here, nobody else touches the iova range */
393         while (start < end) {
394                 dma_pte_clear_one(domain, start);
395                 start += PAGE_SIZE_4K;
396         }
397 }
398
399 /* free page table pages. last level pte should already be cleared */
400 static void dma_pte_free_pagetable(struct dmar_domain *domain,
401         u64 start, u64 end)
402 {
403         int addr_width = agaw_to_width(domain->agaw);
404         struct dma_pte *pte;
405         int total = agaw_to_level(domain->agaw);
406         int level;
407         u64 tmp;
408
409         start &= (((u64)1) << addr_width) - 1;
410         end &= (((u64)1) << addr_width) - 1;
411
412         /* we don't need lock here, nobody else touches the iova range */
413         level = 2;
414         while (level <= total) {
415                 tmp = align_to_level(start, level);
416                 if (tmp >= end || (tmp + level_size(level) > end))
417                         return;
418
419                 while (tmp < end) {
420                         pte = dma_addr_level_pte(domain, tmp, level);
421                         if (pte) {
422                                 free_pgtable_page(
423                                         phys_to_virt(dma_pte_addr(*pte)));
424                                 dma_clear_pte(*pte);
425                                 __iommu_flush_cache(domain->iommu,
426                                                 pte, sizeof(*pte));
427                         }
428                         tmp += level_size(level);
429                 }
430                 level++;
431         }
432         /* free pgd */
433         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
434                 free_pgtable_page(domain->pgd);
435                 domain->pgd = NULL;
436         }
437 }
438
439 /* iommu handling */
440 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
441 {
442         struct root_entry *root;
443         unsigned long flags;
444
445         root = (struct root_entry *)alloc_pgtable_page();
446         if (!root)
447                 return -ENOMEM;
448
449         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
450
451         spin_lock_irqsave(&iommu->lock, flags);
452         iommu->root_entry = root;
453         spin_unlock_irqrestore(&iommu->lock, flags);
454
455         return 0;
456 }
457
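/*
 * Poll an IOMMU register with "op" until "cond" becomes true; if the
 * hardware does not respond within DMAR_OPERATION_TIMEOUT, assume it is
 * broken and panic.
 */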
458 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
459 {\
460         unsigned long start_time = jiffies;\
461         while (1) {\
462                 sts = op (iommu->reg + offset);\
463                 if (cond)\
464                         break;\
465                 if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
466                         panic("DMAR hardware is malfunctioning\n");\
467                 cpu_relax();\
468         }\
469 }
470
471 static void iommu_set_root_entry(struct intel_iommu *iommu)
472 {
473         void *addr;
474         u32 cmd, sts;
475         unsigned long flag;
476
477         addr = iommu->root_entry;
478
479         spin_lock_irqsave(&iommu->register_lock, flag);
480         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
481
482         cmd = iommu->gcmd | DMA_GCMD_SRTP;
483         writel(cmd, iommu->reg + DMAR_GCMD_REG);
484
485         /* Make sure hardware completes it */
486         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
487                 readl, (sts & DMA_GSTS_RTPS), sts);
488
489         spin_unlock_irqrestore(&iommu->register_lock, flag);
490 }
491
492 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
493 {
494         u32 val;
495         unsigned long flag;
496
497         if (!cap_rwbf(iommu->cap))
498                 return;
499         val = iommu->gcmd | DMA_GCMD_WBF;
500
501         spin_lock_irqsave(&iommu->register_lock, flag);
502         writel(val, iommu->reg + DMAR_GCMD_REG);
503
504         /* Make sure hardware completes it */
505         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
506                         readl, (!(val & DMA_GSTS_WBFS)), val);
507
508         spin_unlock_irqrestore(&iommu->register_lock, flag);
509 }
510
511 /* return value determines whether we need a write buffer flush */
512 static int __iommu_flush_context(struct intel_iommu *iommu,
513         u16 did, u16 source_id, u8 function_mask, u64 type,
514         int non_present_entry_flush)
515 {
516         u64 val = 0;
517         unsigned long flag;
518
519         /*
520          * In the non-present entry flush case, if hardware doesn't cache
521          * non-present entries we do nothing; if hardware does cache
522          * non-present entries, we flush entries of domain 0 (the domain
523          * id used to tag any cached non-present entries)
524          */
525         if (non_present_entry_flush) {
526                 if (!cap_caching_mode(iommu->cap))
527                         return 1;
528                 else
529                         did = 0;
530         }
531
532         switch (type) {
533         case DMA_CCMD_GLOBAL_INVL:
534                 val = DMA_CCMD_GLOBAL_INVL;
535                 break;
536         case DMA_CCMD_DOMAIN_INVL:
537                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
538                 break;
539         case DMA_CCMD_DEVICE_INVL:
540                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
541                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
542                 break;
543         default:
544                 BUG();
545         }
546         val |= DMA_CCMD_ICC;
547
548         spin_lock_irqsave(&iommu->register_lock, flag);
549         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
550
551         /* Make sure hardware completes it */
552         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
553                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
554
555         spin_unlock_irqrestore(&iommu->register_lock, flag);
556
557         /* flushing the context entry implicitly flushes the write buffer */
558         return 0;
559 }
560
561 static inline int iommu_flush_context_global(struct intel_iommu *iommu,
562         int non_present_entry_flush)
563 {
564         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
565                 non_present_entry_flush);
566 }
567
568 static inline int iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
569         int non_present_entry_flush)
570 {
571         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
572                 non_present_entry_flush);
573 }
574
575 static inline int iommu_flush_context_device(struct intel_iommu *iommu,
576         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
577 {
578         return __iommu_flush_context(iommu, did, source_id, function_mask,
579                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
580 }
581
582 /* return value determines whether we need a write buffer flush */
583 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
584         u64 addr, unsigned int size_order, u64 type,
585         int non_present_entry_flush)
586 {
587         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
588         u64 val = 0, val_iva = 0;
589         unsigned long flag;
590
591         /*
592          * In the non-present entry flush case, if hardware doesn't cache
593          * non-present entries we do nothing; if hardware does cache
594          * non-present entries, we flush entries of domain 0 (the domain
595          * id used to tag any cached non-present entries)
596          */
597         if (non_present_entry_flush) {
598                 if (!cap_caching_mode(iommu->cap))
599                         return 1;
600                 else
601                         did = 0;
602         }
603
604         switch (type) {
605         case DMA_TLB_GLOBAL_FLUSH:
606                 /* global flush doesn't need to set IVA_REG */
607                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
608                 break;
609         case DMA_TLB_DSI_FLUSH:
610                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
611                 break;
612         case DMA_TLB_PSI_FLUSH:
613                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
614                 /* Note: always flush non-leaf currently */
615                 val_iva = size_order | addr;
616                 break;
617         default:
618                 BUG();
619         }
620         /* Note: set drain read/write */
621 #if 0
622         /*
623          * This is probably only needed to be extra safe; it looks like
624          * we can skip it without any impact.
625          */
626         if (cap_read_drain(iommu->cap))
627                 val |= DMA_TLB_READ_DRAIN;
628 #endif
629         if (cap_write_drain(iommu->cap))
630                 val |= DMA_TLB_WRITE_DRAIN;
631
632         spin_lock_irqsave(&iommu->register_lock, flag);
633         /* Note: Only uses first TLB reg currently */
634         if (val_iva)
635                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
636         dmar_writeq(iommu->reg + tlb_offset + 8, val);
637
638         /* Make sure hardware completes it */
639         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
640                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
641
642         spin_unlock_irqrestore(&iommu->register_lock, flag);
643
644         /* check IOTLB invalidation granularity */
645         if (DMA_TLB_IAIG(val) == 0)
646                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
647         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
648                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
649                         DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
650         /* flushing the iotlb implicitly flushes the write buffer */
651         return 0;
652 }
653
654 static inline int iommu_flush_iotlb_global(struct intel_iommu *iommu,
655         int non_present_entry_flush)
656 {
657         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
658                 non_present_entry_flush);
659 }
660
661 static inline int iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
662         int non_present_entry_flush)
663 {
664         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
665                 non_present_entry_flush);
666 }
667
668 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
669         u64 addr, unsigned int pages, int non_present_entry_flush)
670 {
671         unsigned int mask;
672
673         BUG_ON(addr & (~PAGE_MASK_4K));
674         BUG_ON(pages == 0);
675
676         /* Fallback to domain selective flush if no PSI support */
677         if (!cap_pgsel_inv(iommu->cap))
678                 return iommu_flush_iotlb_dsi(iommu, did,
679                         non_present_entry_flush);
680
681         /*
682          * PSI requires the number of pages to be a power of 2, with the
683          * base address naturally aligned to that size
684          */
685         mask = ilog2(__roundup_pow_of_two(pages));
686         /* Fallback to domain selective flush if size is too big */
687         if (mask > cap_max_amask_val(iommu->cap))
688                 return iommu_flush_iotlb_dsi(iommu, did,
689                         non_present_entry_flush);
690
691         return __iommu_flush_iotlb(iommu, did, addr, mask,
692                 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
693 }
694
695 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
696 {
697         u32 pmen;
698         unsigned long flags;
699
700         spin_lock_irqsave(&iommu->register_lock, flags);
701         pmen = readl(iommu->reg + DMAR_PMEN_REG);
702         pmen &= ~DMA_PMEN_EPM;
703         writel(pmen, iommu->reg + DMAR_PMEN_REG);
704
705         /* wait for the protected region status bit to clear */
706         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
707                 readl, !(pmen & DMA_PMEN_PRS), pmen);
708
709         spin_unlock_irqrestore(&iommu->register_lock, flags);
710 }
711
712 static int iommu_enable_translation(struct intel_iommu *iommu)
713 {
714         u32 sts;
715         unsigned long flags;
716
717         spin_lock_irqsave(&iommu->register_lock, flags);
718         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
719
720         /* Make sure hardware completes it */
721         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
722                 readl, (sts & DMA_GSTS_TES), sts);
723
724         iommu->gcmd |= DMA_GCMD_TE;
725         spin_unlock_irqrestore(&iommu->register_lock, flags);
726         return 0;
727 }
728
729 static int iommu_disable_translation(struct intel_iommu *iommu)
730 {
731         u32 sts;
732         unsigned long flag;
733
734         spin_lock_irqsave(&iommu->register_lock, flag);
735         iommu->gcmd &= ~DMA_GCMD_TE;
736         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
737
738         /* Make sure hardware completes it */
739         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
740                 readl, (!(sts & DMA_GSTS_TES)), sts);
741
742         spin_unlock_irqrestore(&iommu->register_lock, flag);
743         return 0;
744 }
745
746 /* iommu interrupt handling. Most of it is MSI-like. */
747
748 static char *fault_reason_strings[] =
749 {
750         "Software",
751         "Present bit in root entry is clear",
752         "Present bit in context entry is clear",
753         "Invalid context entry",
754         "Access beyond MGAW",
755         "PTE Write access is not set",
756         "PTE Read access is not set",
757         "Next page table ptr is invalid",
758         "Root table address invalid",
759         "Context table ptr is invalid",
760         "non-zero reserved fields in RTP",
761         "non-zero reserved fields in CTP",
762         "non-zero reserved fields in PTE",
763         "Unknown"
764 };
765 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
766
767 char *dmar_get_fault_reason(u8 fault_reason)
768 {
769         if (fault_reason >= MAX_FAULT_REASON_IDX)
770                 return fault_reason_strings[MAX_FAULT_REASON_IDX];
771         else
772                 return fault_reason_strings[fault_reason];
773 }
774
775 void dmar_msi_unmask(unsigned int irq)
776 {
777         struct intel_iommu *iommu = get_irq_data(irq);
778         unsigned long flag;
779
780         /* unmask it */
781         spin_lock_irqsave(&iommu->register_lock, flag);
782         writel(0, iommu->reg + DMAR_FECTL_REG);
783         /* Read a reg to force flush the posted write */
784         readl(iommu->reg + DMAR_FECTL_REG);
785         spin_unlock_irqrestore(&iommu->register_lock, flag);
786 }
787
788 void dmar_msi_mask(unsigned int irq)
789 {
790         unsigned long flag;
791         struct intel_iommu *iommu = get_irq_data(irq);
792
793         /* mask it */
794         spin_lock_irqsave(&iommu->register_lock, flag);
795         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
796         /* Read a reg to force flush the posted write */
797         readl(iommu->reg + DMAR_FECTL_REG);
798         spin_unlock_irqrestore(&iommu->register_lock, flag);
799 }
800
801 void dmar_msi_write(int irq, struct msi_msg *msg)
802 {
803         struct intel_iommu *iommu = get_irq_data(irq);
804         unsigned long flag;
805
806         spin_lock_irqsave(&iommu->register_lock, flag);
807         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
808         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
809         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
810         spin_unlock_irqrestore(&iommu->register_lock, flag);
811 }
812
813 void dmar_msi_read(int irq, struct msi_msg *msg)
814 {
815         struct intel_iommu *iommu = get_irq_data(irq);
816         unsigned long flag;
817
818         spin_lock_irqsave(&iommu->register_lock, flag);
819         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
820         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
821         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
822         spin_unlock_irqrestore(&iommu->register_lock, flag);
823 }
824
825 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
826                 u8 fault_reason, u16 source_id, u64 addr)
827 {
828         char *reason;
829
830         reason = dmar_get_fault_reason(fault_reason);
831
832         printk(KERN_ERR
833                 "DMAR:[%s] Request device [%02x:%02x.%d] "
834                 "fault addr %llx \n"
835                 "DMAR:[fault reason %02d] %s\n",
836                 (type ? "DMA Read" : "DMA Write"),
837                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
838                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
839         return 0;
840 }
841
842 #define PRIMARY_FAULT_REG_LEN (16)
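/*
 * Primary fault interrupt handler: walk the fault recording registers,
 * log and clear each pending fault, then clear the fault-overflow bit
 * if it is set.
 */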
843 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
844 {
845         struct intel_iommu *iommu = dev_id;
846         int reg, fault_index;
847         u32 fault_status;
848         unsigned long flag;
849
850         spin_lock_irqsave(&iommu->register_lock, flag);
851         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
852
853         /* TBD: ignore advanced fault log currently */
854         if (!(fault_status & DMA_FSTS_PPF))
855                 goto clear_overflow;
856
857         fault_index = dma_fsts_fault_record_index(fault_status);
858         reg = cap_fault_reg_offset(iommu->cap);
859         while (1) {
860                 u8 fault_reason;
861                 u16 source_id;
862                 u64 guest_addr;
863                 int type;
864                 u32 data;
865
866                 /* highest 32 bits */
867                 data = readl(iommu->reg + reg +
868                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
869                 if (!(data & DMA_FRCD_F))
870                         break;
871
872                 fault_reason = dma_frcd_fault_reason(data);
873                 type = dma_frcd_type(data);
874
875                 data = readl(iommu->reg + reg +
876                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
877                 source_id = dma_frcd_source_id(data);
878
879                 guest_addr = dmar_readq(iommu->reg + reg +
880                                 fault_index * PRIMARY_FAULT_REG_LEN);
881                 guest_addr = dma_frcd_page_addr(guest_addr);
882                 /* clear the fault */
883                 writel(DMA_FRCD_F, iommu->reg + reg +
884                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
885
886                 spin_unlock_irqrestore(&iommu->register_lock, flag);
887
888                 iommu_page_fault_do_one(iommu, type, fault_reason,
889                                 source_id, guest_addr);
890
891                 fault_index++;
892                 if (fault_index > cap_num_fault_regs(iommu->cap))
893                         fault_index = 0;
894                 spin_lock_irqsave(&iommu->register_lock, flag);
895         }
896 clear_overflow:
897         /* clear primary fault overflow */
898         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
899         if (fault_status & DMA_FSTS_PFO)
900                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
901
902         spin_unlock_irqrestore(&iommu->register_lock, flag);
903         return IRQ_HANDLED;
904 }
905
906 int dmar_set_interrupt(struct intel_iommu *iommu)
907 {
908         int irq, ret;
909
910         irq = create_irq();
911         if (!irq) {
912                 printk(KERN_ERR "IOMMU: no free vectors\n");
913                 return -EINVAL;
914         }
915
916         set_irq_data(irq, iommu);
917         iommu->irq = irq;
918
919         ret = arch_setup_dmar_msi(irq);
920         if (ret) {
921                 set_irq_data(irq, NULL);
922                 iommu->irq = 0;
923                 destroy_irq(irq);
924                 return ret;
925         }
926
927         /* Process and clear any faults that are already pending */
928         iommu_page_fault(irq, iommu);
929
930         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
931         if (ret)
932                 printk(KERN_ERR "IOMMU: can't request irq\n");
933         return ret;
934 }
935
936 static int iommu_init_domains(struct intel_iommu *iommu)
937 {
938         unsigned long ndomains;
939         unsigned long nlongs;
940
941         ndomains = cap_ndoms(iommu->cap);
942         pr_debug("Number of Domains supported <%ld>\n", ndomains);
943         nlongs = BITS_TO_LONGS(ndomains);
944
945         /* TBD: there might be 64K domains,
946          * consider other allocation for future chip
947          */
948         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
949         if (!iommu->domain_ids) {
950                 printk(KERN_ERR "Allocating domain id array failed\n");
951                 return -ENOMEM;
952         }
953         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
954                         GFP_KERNEL);
955         if (!iommu->domains) {
956                 printk(KERN_ERR "Allocating domain array failed\n");
957                 kfree(iommu->domain_ids);
958                 return -ENOMEM;
959         }
960
961         /*
962          * if Caching mode is set, then invalid translations are tagged
963          * with domainid 0. Hence we need to pre-allocate it.
964          */
965         if (cap_caching_mode(iommu->cap))
966                 set_bit(0, iommu->domain_ids);
967         return 0;
968 }
969
970 static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
971 {
972         struct intel_iommu *iommu;
973         int ret;
974         int map_size;
975         u32 ver;
976
977         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
978         if (!iommu)
979                 return NULL;
980         iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
981         if (!iommu->reg) {
982                 printk(KERN_ERR "IOMMU: can't map the region\n");
983                 goto error;
984         }
985         iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
986         iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
987
988         /* the registers might be more than one page */
989         map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
990                 cap_max_fault_reg_offset(iommu->cap));
991         map_size = PAGE_ALIGN_4K(map_size);
992         if (map_size > PAGE_SIZE_4K) {
993                 iounmap(iommu->reg);
994                 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
995                 if (!iommu->reg) {
996                         printk(KERN_ERR "IOMMU: can't map the region\n");
997                         goto error;
998                 }
999         }
1000
1001         ver = readl(iommu->reg + DMAR_VER_REG);
1002         pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
1003                 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
1004                 iommu->cap, iommu->ecap);
1005         ret = iommu_init_domains(iommu);
1006         if (ret)
1007                 goto error_unmap;
1008         spin_lock_init(&iommu->lock);
1009         spin_lock_init(&iommu->register_lock);
1010
1011         drhd->iommu = iommu;
1012         return iommu;
1013 error_unmap:
1014         iounmap(iommu->reg);
1015 error:
1016         kfree(iommu);
1017         return NULL;
1018 }
1019
1020 static void domain_exit(struct dmar_domain *domain);
1021 static void free_iommu(struct intel_iommu *iommu)
1022 {
1023         struct dmar_domain *domain;
1024         int i;
1025
1026         if (!iommu)
1027                 return;
1028
1029         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1030         for (; i < cap_ndoms(iommu->cap); ) {
1031                 domain = iommu->domains[i];
1032                 clear_bit(i, iommu->domain_ids);
1033                 domain_exit(domain);
1034                 i = find_next_bit(iommu->domain_ids,
1035                         cap_ndoms(iommu->cap), i+1);
1036         }
1037
1038         if (iommu->gcmd & DMA_GCMD_TE)
1039                 iommu_disable_translation(iommu);
1040
1041         if (iommu->irq) {
1042                 set_irq_data(iommu->irq, NULL);
1043                 /* This will mask the irq */
1044                 free_irq(iommu->irq, iommu);
1045                 destroy_irq(iommu->irq);
1046         }
1047
1048         kfree(iommu->domains);
1049         kfree(iommu->domain_ids);
1050
1051         /* free context mapping */
1052         free_context_table(iommu);
1053
1054         if (iommu->reg)
1055                 iounmap(iommu->reg);
1056         kfree(iommu);
1057 }
1058
1059 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1060 {
1061         unsigned long num;
1062         unsigned long ndomains;
1063         struct dmar_domain *domain;
1064         unsigned long flags;
1065
1066         domain = alloc_domain_mem();
1067         if (!domain)
1068                 return NULL;
1069
1070         ndomains = cap_ndoms(iommu->cap);
1071
1072         spin_lock_irqsave(&iommu->lock, flags);
1073         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1074         if (num >= ndomains) {
1075                 spin_unlock_irqrestore(&iommu->lock, flags);
1076                 free_domain_mem(domain);
1077                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1078                 return NULL;
1079         }
1080
1081         set_bit(num, iommu->domain_ids);
1082         domain->id = num;
1083         domain->iommu = iommu;
1084         iommu->domains[num] = domain;
1085         spin_unlock_irqrestore(&iommu->lock, flags);
1086
1087         return domain;
1088 }
1089
1090 static void iommu_free_domain(struct dmar_domain *domain)
1091 {
1092         unsigned long flags;
1093
1094         spin_lock_irqsave(&domain->iommu->lock, flags);
1095         clear_bit(domain->id, domain->iommu->domain_ids);
1096         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1097 }
1098
1099 static struct iova_domain reserved_iova_list;
1100
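/*
 * Reserve IOVA ranges that must never be handed out to devices: the
 * IOAPIC MMIO window and every PCI device's MMIO resources, so that
 * DMA mappings cannot alias peer device registers.
 */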
1101 static void dmar_init_reserved_ranges(void)
1102 {
1103         struct pci_dev *pdev = NULL;
1104         struct iova *iova;
1105         int i;
1106         u64 addr, size;
1107
1108         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1109
1110         /* IOAPIC ranges shouldn't be accessed by DMA */
1111         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1112                 IOVA_PFN(IOAPIC_RANGE_END));
1113         if (!iova)
1114                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1115
1116         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1117         for_each_pci_dev(pdev) {
1118                 struct resource *r;
1119
1120                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1121                         r = &pdev->resource[i];
1122                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1123                                 continue;
1124                         addr = r->start;
1125                         addr &= PAGE_MASK_4K;
1126                         size = r->end - addr;
1127                         size = PAGE_ALIGN_4K(size);
1128                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1129                                 IOVA_PFN(size + addr) - 1);
1130                         if (!iova)
1131                                 printk(KERN_ERR "Reserve iova failed\n");
1132                 }
1133         }
1134
1135 }
1136
1137 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1138 {
1139         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1140 }
1141
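/*
 * Round the guest address width up to the nearest width the page-table
 * format can express: 12 bits of page offset plus a whole number of
 * 9-bit levels, capped at 64 bits.
 */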
1142 static inline int guestwidth_to_adjustwidth(int gaw)
1143 {
1144         int agaw;
1145         int r = (gaw - 12) % 9;
1146
1147         if (r == 0)
1148                 agaw = gaw;
1149         else
1150                 agaw = gaw + 9 - r;
1151         if (agaw > 64)
1152                 agaw = 64;
1153         return agaw;
1154 }
1155
1156 static int domain_init(struct dmar_domain *domain, int guest_width)
1157 {
1158         struct intel_iommu *iommu;
1159         int adjust_width, agaw;
1160         unsigned long sagaw;
1161
1162         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1163         spin_lock_init(&domain->mapping_lock);
1164
1165         domain_reserve_special_ranges(domain);
1166
1167         /* calculate AGAW */
1168         iommu = domain->iommu;
1169         if (guest_width > cap_mgaw(iommu->cap))
1170                 guest_width = cap_mgaw(iommu->cap);
1171         domain->gaw = guest_width;
1172         adjust_width = guestwidth_to_adjustwidth(guest_width);
1173         agaw = width_to_agaw(adjust_width);
1174         sagaw = cap_sagaw(iommu->cap);
1175         if (!test_bit(agaw, &sagaw)) {
1176                 /* hardware doesn't support it, choose a bigger one */
1177                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1178                 agaw = find_next_bit(&sagaw, 5, agaw);
1179                 if (agaw >= 5)
1180                         return -ENODEV;
1181         }
1182         domain->agaw = agaw;
1183         INIT_LIST_HEAD(&domain->devices);
1184
1185         /* always allocate the top pgd */
1186         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1187         if (!domain->pgd)
1188                 return -ENOMEM;
1189         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1190         return 0;
1191 }
1192
1193 static void domain_exit(struct dmar_domain *domain)
1194 {
1195         u64 end;
1196
1197         /* Domain 0 is reserved, so don't process it */
1198         if (!domain)
1199                 return;
1200
1201         domain_remove_dev_info(domain);
1202         /* destroy iovas */
1203         put_iova_domain(&domain->iovad);
1204         end = DOMAIN_MAX_ADDR(domain->gaw);
1205         end = end & (~PAGE_MASK_4K);
1206
1207         /* clear ptes */
1208         dma_pte_clear_range(domain, 0, end);
1209
1210         /* free page tables */
1211         dma_pte_free_pagetable(domain, 0, end);
1212
1213         iommu_free_domain(domain);
1214         free_domain_mem(domain);
1215 }
1216
1217 static int domain_context_mapping_one(struct dmar_domain *domain,
1218                 u8 bus, u8 devfn)
1219 {
1220         struct context_entry *context;
1221         struct intel_iommu *iommu = domain->iommu;
1222         unsigned long flags;
1223
1224         pr_debug("Set context mapping for %02x:%02x.%d\n",
1225                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1226         BUG_ON(!domain->pgd);
1227         context = device_to_context_entry(iommu, bus, devfn);
1228         if (!context)
1229                 return -ENOMEM;
1230         spin_lock_irqsave(&iommu->lock, flags);
1231         if (context_present(*context)) {
1232                 spin_unlock_irqrestore(&iommu->lock, flags);
1233                 return 0;
1234         }
1235
1236         context_set_domain_id(*context, domain->id);
1237         context_set_address_width(*context, domain->agaw);
1238         context_set_address_root(*context, virt_to_phys(domain->pgd));
1239         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1240         context_set_fault_enable(*context);
1241         context_set_present(*context);
1242         __iommu_flush_cache(iommu, context, sizeof(*context));
1243
1244         /* it's a non-present to present mapping */
1245         if (iommu_flush_context_device(iommu, domain->id,
1246                         (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1247                 iommu_flush_write_buffer(iommu);
1248         else
1249                 iommu_flush_iotlb_dsi(iommu, 0, 0);
1250         spin_unlock_irqrestore(&iommu->lock, flags);
1251         return 0;
1252 }
1253
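/*
 * Set up the context entry for the device itself and, if it sits behind
 * a PCIe-to-PCI bridge, for every bridge on the path as well, since DMA
 * requests forwarded from a conventional PCI bus carry the bridge's
 * source-id rather than the device's own.
 */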
1254 static int
1255 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1256 {
1257         int ret;
1258         struct pci_dev *tmp, *parent;
1259
1260         ret = domain_context_mapping_one(domain, pdev->bus->number,
1261                 pdev->devfn);
1262         if (ret)
1263                 return ret;
1264
1265         /* dependent device mapping */
1266         tmp = pci_find_upstream_pcie_bridge(pdev);
1267         if (!tmp)
1268                 return 0;
1269         /* Secondary interface's bus number and devfn 0 */
1270         parent = pdev->bus->self;
1271         while (parent != tmp) {
1272                 ret = domain_context_mapping_one(domain, parent->bus->number,
1273                         parent->devfn);
1274                 if (ret)
1275                         return ret;
1276                 parent = parent->bus->self;
1277         }
1278         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1279                 return domain_context_mapping_one(domain,
1280                         tmp->subordinate->number, 0);
1281         else /* this is a legacy PCI bridge */
1282                 return domain_context_mapping_one(domain,
1283                         tmp->bus->number, tmp->devfn);
1284 }
1285
1286 static int domain_context_mapped(struct dmar_domain *domain,
1287         struct pci_dev *pdev)
1288 {
1289         int ret;
1290         struct pci_dev *tmp, *parent;
1291
1292         ret = device_context_mapped(domain->iommu,
1293                 pdev->bus->number, pdev->devfn);
1294         if (!ret)
1295                 return ret;
1296         /* dependent device mapping */
1297         tmp = pci_find_upstream_pcie_bridge(pdev);
1298         if (!tmp)
1299                 return ret;
1300         /* Secondary interface's bus number and devfn 0 */
1301         parent = pdev->bus->self;
1302         while (parent != tmp) {
1303                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1304                         parent->devfn);
1305                 if (!ret)
1306                         return ret;
1307                 parent = parent->bus->self;
1308         }
1309         if (tmp->is_pcie)
1310                 return device_context_mapped(domain->iommu,
1311                         tmp->subordinate->number, 0);
1312         else
1313                 return device_context_mapped(domain->iommu,
1314                         tmp->bus->number, tmp->devfn);
1315 }
1316
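/*
 * Map the host physical range [hpa, hpa + size) at IO virtual address
 * iova, one 4K page at a time, with the given DMA_PTE_READ/WRITE
 * protection bits.
 */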
1317 static int
1318 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1319                         u64 hpa, size_t size, int prot)
1320 {
1321         u64 start_pfn, end_pfn;
1322         struct dma_pte *pte;
1323         int index;
1324
1325         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1326                 return -EINVAL;
1327         iova &= PAGE_MASK_4K;
1328         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1329         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1330         index = 0;
1331         while (start_pfn < end_pfn) {
1332                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1333                 if (!pte)
1334                         return -ENOMEM;
1335                 /* We don't need lock here, nobody else
1336                  * touches the iova range
1337                  */
1338                 BUG_ON(dma_pte_addr(*pte));
1339                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1340                 dma_set_pte_prot(*pte, prot);
1341                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1342                 start_pfn++;
1343                 index++;
1344         }
1345         return 0;
1346 }
1347
1348 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1349 {
1350         clear_context_table(domain->iommu, bus, devfn);
1351         iommu_flush_context_global(domain->iommu, 0);
1352         iommu_flush_iotlb_global(domain->iommu, 0);
1353 }
1354
1355 static void domain_remove_dev_info(struct dmar_domain *domain)
1356 {
1357         struct device_domain_info *info;
1358         unsigned long flags;
1359
1360         spin_lock_irqsave(&device_domain_lock, flags);
1361         while (!list_empty(&domain->devices)) {
1362                 info = list_entry(domain->devices.next,
1363                         struct device_domain_info, link);
1364                 list_del(&info->link);
1365                 list_del(&info->global);
1366                 if (info->dev)
1367                         info->dev->dev.archdata.iommu = NULL;
1368                 spin_unlock_irqrestore(&device_domain_lock, flags);
1369
1370                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1371                 free_devinfo_mem(info);
1372
1373                 spin_lock_irqsave(&device_domain_lock, flags);
1374         }
1375         spin_unlock_irqrestore(&device_domain_lock, flags);
1376 }
1377
1378 /*
1379  * find_domain
1380  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1381  */
1382 struct dmar_domain *
1383 find_domain(struct pci_dev *pdev)
1384 {
1385         struct device_domain_info *info;
1386
1387         /* No lock here, assumes no domain exit in normal case */
1388         info = pdev->dev.archdata.iommu;
1389         if (info)
1390                 return info->domain;
1391         return NULL;
1392 }
1393
1394 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1395      struct pci_dev *dev)
1396 {
1397         int index;
1398
1399         while (dev) {
1400                 for (index = 0; index < cnt; index ++)
1401                         if (dev == devices[index])
1402                                 return 1;
1403
1404                 /* Check our parent */
1405                 dev = dev->bus->self;
1406         }
1407
1408         return 0;
1409 }
1410
1411 static struct dmar_drhd_unit *
1412 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1413 {
1414         struct dmar_drhd_unit *drhd = NULL;
1415
1416         list_for_each_entry(drhd, &dmar_drhd_units, list) {
1417                 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1418                                                 drhd->devices_cnt, dev))
1419                         return drhd;
1420         }
1421
1422         return NULL;
1423 }
1424
1425 /* domain is initialized */
1426 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1427 {
1428         struct dmar_domain *domain, *found = NULL;
1429         struct intel_iommu *iommu;
1430         struct dmar_drhd_unit *drhd;
1431         struct device_domain_info *info, *tmp;
1432         struct pci_dev *dev_tmp;
1433         unsigned long flags;
1434         int bus = 0, devfn = 0;
1435
1436         domain = find_domain(pdev);
1437         if (domain)
1438                 return domain;
1439
1440         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1441         if (dev_tmp) {
1442                 if (dev_tmp->is_pcie) {
1443                         bus = dev_tmp->subordinate->number;
1444                         devfn = 0;
1445                 } else {
1446                         bus = dev_tmp->bus->number;
1447                         devfn = dev_tmp->devfn;
1448                 }
1449                 spin_lock_irqsave(&device_domain_lock, flags);
1450                 list_for_each_entry(info, &device_domain_list, global) {
1451                         if (info->bus == bus && info->devfn == devfn) {
1452                                 found = info->domain;
1453                                 break;
1454                         }
1455                 }
1456                 spin_unlock_irqrestore(&device_domain_lock, flags);
1457                 /* pcie-pci bridge already has a domain, use it */
1458                 if (found) {
1459                         domain = found;
1460                         goto found_domain;
1461                 }
1462         }
1463
1464         /* Allocate new domain for the device */
1465         drhd = dmar_find_matched_drhd_unit(pdev);
1466         if (!drhd) {
1467                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1468                         pci_name(pdev));
1469                 return NULL;
1470         }
1471         iommu = drhd->iommu;
1472
1473         domain = iommu_alloc_domain(iommu);
1474         if (!domain)
1475                 goto error;
1476
1477         if (domain_init(domain, gaw)) {
1478                 domain_exit(domain);
1479                 goto error;
1480         }
1481
1482         /* register pcie-to-pci device */
1483         if (dev_tmp) {
1484                 info = alloc_devinfo_mem();
1485                 if (!info) {
1486                         domain_exit(domain);
1487                         goto error;
1488                 }
1489                 info->bus = bus;
1490                 info->devfn = devfn;
1491                 info->dev = NULL;
1492                 info->domain = domain;
1493                 /* This domain is shared by devices under p2p bridge */
1494                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1495
1496                 /* pcie-to-pci bridge already has a domain, use it */
1497                 found = NULL;
1498                 spin_lock_irqsave(&device_domain_lock, flags);
1499                 list_for_each_entry(tmp, &device_domain_list, global) {
1500                         if (tmp->bus == bus && tmp->devfn == devfn) {
1501                                 found = tmp->domain;
1502                                 break;
1503                         }
1504                 }
1505                 if (found) {
1506                         free_devinfo_mem(info);
1507                         domain_exit(domain);
1508                         domain = found;
1509                 } else {
1510                         list_add(&info->link, &domain->devices);
1511                         list_add(&info->global, &device_domain_list);
1512                 }
1513                 spin_unlock_irqrestore(&device_domain_lock, flags);
1514         }
1515
1516 found_domain:
1517         info = alloc_devinfo_mem();
1518         if (!info)
1519                 goto error;
1520         info->bus = pdev->bus->number;
1521         info->devfn = pdev->devfn;
1522         info->dev = pdev;
1523         info->domain = domain;
1524         spin_lock_irqsave(&device_domain_lock, flags);
1525         /* somebody else may have set up the domain in the meantime */
1526         found = find_domain(pdev);
1527         if (found != NULL) {
1528                 spin_unlock_irqrestore(&device_domain_lock, flags);
1529                 if (found != domain) {
1530                         domain_exit(domain);
1531                         domain = found;
1532                 }
1533                 free_devinfo_mem(info);
1534                 return domain;
1535         }
1536         list_add(&info->link, &domain->devices);
1537         list_add(&info->global, &device_domain_list);
1538         pdev->dev.archdata.iommu = info;
1539         spin_unlock_irqrestore(&device_domain_lock, flags);
1540         return domain;
1541 error:
1542         /* recheck here, somebody else may have set it up already */
1543         return find_domain(pdev);
1544 }
1545
1546 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1547 {
1548         struct dmar_domain *domain;
1549         unsigned long size;
1550         u64 base;
1551         int ret;
1552
1553         printk(KERN_INFO
1554                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1555                 pci_name(pdev), start, end);
1556         /* page table init */
1557         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1558         if (!domain)
1559                 return -ENOMEM;
1560
1561         /* The address might not be aligned */
1562         base = start & PAGE_MASK_4K;
1563         size = end - base;
1564         size = PAGE_ALIGN_4K(size);
1565         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1566                         IOVA_PFN(base + size) - 1)) {
1567                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1568                 ret = -ENOMEM;
1569                 goto error;
1570         }
1571
1572         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1573                 size, base, pci_name(pdev));
1574         /*
1575          * RMRR range might have overlap with physical memory range,
1576          * clear it first
1577          */
1578         dma_pte_clear_range(domain, base, base + size);
1579
1580         ret = domain_page_mapping(domain, base, base, size,
1581                 DMA_PTE_READ|DMA_PTE_WRITE);
1582         if (ret)
1583                 goto error;
1584
1585         /* context entry init */
1586         ret = domain_context_mapping(domain, pdev);
1587         if (!ret)
1588                 return 0;
1589 error:
1590         domain_exit(domain);
1591         return ret;
1592
1593 }
1594
1595 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1596         struct pci_dev *pdev)
1597 {
1598         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1599                 return 0;
1600         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1601                 rmrr->end_address + 1);
1602 }
1603
1604 #ifdef CONFIG_DMAR_GFX_WA
1605 extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
1606 static void __init iommu_prepare_gfx_mapping(void)
1607 {
1608         struct pci_dev *pdev = NULL;
1609         u64 base, size;
1610         int slot;
1611         int ret;
1612
1613         for_each_pci_dev(pdev) {
1614                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1615                                 !IS_GFX_DEVICE(pdev))
1616                         continue;
1617                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1618                         pci_name(pdev));
1619                 slot = arch_get_ram_range(0, &base, &size);
1620                 while (slot >= 0) {
1621                         ret = iommu_prepare_identity_map(pdev,
1622                                         base, base + size);
1623                         if (ret)
1624                                 goto error;
1625                         slot = arch_get_ram_range(slot, &base, &size);
1626                 }
1627                 continue;
1628 error:
1629                 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1630         }
1631 }
1632 #endif
1633
1634 #ifdef CONFIG_DMAR_FLOPPY_WA
1635 static inline void iommu_prepare_isa(void)
1636 {
1637         struct pci_dev *pdev;
1638         int ret;
1639
1640         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1641         if (!pdev)
1642                 return;
1643
1644         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1645         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1646
1647         if (ret)
1648                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1649                         "floppy might not work\n");
1650
1651 }
1652 #else
1653 static inline void iommu_prepare_isa(void)
1654 {
1655         return;
1656 }
1657 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1658
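/*
 * One-time DMAR bring-up: allocate per-unit state and root tables, set up
 * the RMRR, graphics and ISA unity maps, then program the root entry,
 * set up fault reporting and enable translation on every active unit.
 */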
1659 int __init init_dmars(void)
1660 {
1661         struct dmar_drhd_unit *drhd;
1662         struct dmar_rmrr_unit *rmrr;
1663         struct pci_dev *pdev;
1664         struct intel_iommu *iommu;
1665         int ret, unit = 0;
1666
1667         /*
1668          * for each drhd
1669          *    allocate root
1670          *    initialize and program root entry to not present
1671          * endfor
1672          */
1673         for_each_drhd_unit(drhd) {
1674                 if (drhd->ignored)
1675                         continue;
1676                 iommu = alloc_iommu(drhd);
1677                 if (!iommu) {
1678                         ret = -ENOMEM;
1679                         goto error;
1680                 }
1681
1682                 /*
1683                  * TBD:
1684                  * we could share the same root & context tables
1685                  * among all IOMMUs. Need to split it later.
1686                  */
1687                 ret = iommu_alloc_root_entry(iommu);
1688                 if (ret) {
1689                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1690                         goto error;
1691                 }
1692         }
1693
1694         /*
1695          * For each rmrr
1696          *   for each dev attached to rmrr
1697          *   do
1698          *     locate drhd for dev, alloc domain for dev
1699          *     allocate free domain
1700          *     allocate page table entries for rmrr
1701          *     if context not allocated for bus
1702          *           allocate and init context
1703          *           set present in root table for this bus
1704          *     init context with domain, translation etc
1705          *    endfor
1706          * endfor
1707          */
1708         for_each_rmrr_units(rmrr) {
1709                 int i;
1710                 for (i = 0; i < rmrr->devices_cnt; i++) {
1711                         pdev = rmrr->devices[i];
1712                         /* some BIOSes list non-existent devices in the DMAR table */
1713                         if (!pdev)
1714                                 continue;
1715                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1716                         if (ret)
1717                                 printk(KERN_ERR
1718                                  "IOMMU: mapping reserved region failed\n");
1719                 }
1720         }
1721
1722         iommu_prepare_gfx_mapping();
1723
1724         iommu_prepare_isa();
1725
1726         /*
1727          * for each drhd
1728          *   enable fault log
1729          *   global invalidate context cache
1730          *   global invalidate iotlb
1731          *   enable translation
1732          */
1733         for_each_drhd_unit(drhd) {
1734                 if (drhd->ignored)
1735                         continue;
1736                 iommu = drhd->iommu;
1737                 sprintf(iommu->name, "dmar%d", unit++);
1738
1739                 iommu_flush_write_buffer(iommu);
1740
1741                 ret = dmar_set_interrupt(iommu);
1742                 if (ret)
1743                         goto error;
1744
1745                 iommu_set_root_entry(iommu);
1746
1747                 iommu_flush_context_global(iommu, 0);
1748                 iommu_flush_iotlb_global(iommu, 0);
1749
1750                 iommu_disable_protect_mem_regions(iommu);
1751
1752                 ret = iommu_enable_translation(iommu);
1753                 if (ret)
1754                         goto error;
1755         }
1756
1757         return 0;
1758 error:
1759         for_each_drhd_unit(drhd) {
1760                 if (drhd->ignored)
1761                         continue;
1762                 iommu = drhd->iommu;
1763                 free_iommu(iommu);
1764         }
1765         return ret;
1766 }
1767
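/*
 * Round a (host address, size) pair up to whole 4K pages, including the
 * sub-page offset of the start address.  Illustrative example:
 * aligned_size(0x12345678, 0x100) = PAGE_ALIGN_4K(0x678 + 0x100) = 0x1000.
 */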
1768 static inline u64 aligned_size(u64 host_addr, size_t size)
1769 {
1770         u64 addr;
1771         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1772         return PAGE_ALIGN_4K(addr);
1773 }
1774
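/*
 * Allocate an IOVA range of @size bytes for @domain, capped at the smaller
 * of @end and the highest address the domain's address width can express.
 */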
1775 struct iova *
1776 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1777 {
1778         struct iova *piova;
1779
1780         /* Make sure it's in range */
1781         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1782         if (!size || (IOVA_START_ADDR + size > end))
1783                 return NULL;
1784
1785         piova = alloc_iova(&domain->iovad,
1786                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1787         return piova;
1788 }
1789
1790 static struct iova *
1791 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1792                 size_t size)
1793 {
1794         struct pci_dev *pdev = to_pci_dev(dev);
1795         struct iova *iova = NULL;
1796
1797         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1798                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1799         } else  {
1800                 /*
1801                  * First try to allocate an io virtual address in
1802                  * DMA_32BIT_MASK and if that fails then try allocating
1803                  * from higher range
1804                  */
1805                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1806                 if (!iova)
1807                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1808         }
1809
1810         if (!iova) {
1811                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1812                 return NULL;
1813         }
1814
1815         return iova;
1816 }
1817
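/*
 * Like get_domain_for_dev(), but also makes sure the context entry for
 * @pdev is programmed, so the returned domain is ready for DMA mappings.
 */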
1818 static struct dmar_domain *
1819 get_valid_domain_for_dev(struct pci_dev *pdev)
1820 {
1821         struct dmar_domain *domain;
1822         int ret;
1823
1824         domain = get_domain_for_dev(pdev,
1825                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1826         if (!domain) {
1827                 printk(KERN_ERR
1828                         "Allocating domain for %s failed\n", pci_name(pdev));
1829                 return NULL;
1830         }
1831
1832         /* make sure context mapping is ok */
1833         if (unlikely(!domain_context_mapped(domain, pdev))) {
1834                 ret = domain_context_mapping(domain, pdev);
1835                 if (ret) {
1836                         printk(KERN_ERR
1837                                 "Domain context map for %s failed\n",
1838                                 pci_name(pdev));
1839                         return NULL;
1840                 }
1841         }
1842
1843         return domain;
1844 }
1845
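/*
 * dma_mapping_ops->map_single: return a DMA address for @addr.  Pass-through
 * devices simply get the physical address back; otherwise an IOVA range is
 * allocated, the backing pages are mapped with the access rights implied by
 * @dir, and the IOTLB is flushed for the new (non-present -> present) entry.
 */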
1846 static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
1847         size_t size, int dir)
1848 {
1849         struct pci_dev *pdev = to_pci_dev(hwdev);
1850         int ret;
1851         struct dmar_domain *domain;
1852         unsigned long start_addr;
1853         struct iova *iova;
1854         int prot = 0;
1855
1856         BUG_ON(dir == DMA_NONE);
1857         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1858                 return virt_to_bus(addr);
1859
1860         domain = get_valid_domain_for_dev(pdev);
1861         if (!domain)
1862                 return 0;
1863
1864         addr = (void *)virt_to_phys(addr);
1865         size = aligned_size((u64)addr, size);
1866
1867         iova = __intel_alloc_iova(hwdev, domain, size);
1868         if (!iova)
1869                 goto error;
1870
1871         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1872
1873         /*
1874          * Check if the DMAR unit supports zero-length reads on write-only
1875          * mappings.
1876          */
1877         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1878                         !cap_zlr(domain->iommu->cap))
1879                 prot |= DMA_PTE_READ;
1880         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1881                 prot |= DMA_PTE_WRITE;
1882         /*
1883          * The range addr..(addr + size) might cover only part of a page, but we
1884          * map the whole page.  Note: if two parts of one page are mapped
1885          * separately, we might end up with two guest addresses mapping to the
1886          * same host address, but this is not a big problem.
1887          */
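        /*
         * Worked example (illustrative numbers): for addr = 0x1234 and
         * size = 0x100, aligned_size() rounds the mapping up to the whole
         * 4K page at 0x1000, and the caller gets back
         * start_addr + (0x1234 & ~PAGE_MASK_4K) = start_addr + 0x234.
         */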
1888         ret = domain_page_mapping(domain, start_addr,
1889                 ((u64)addr) & PAGE_MASK_4K, size, prot);
1890         if (ret)
1891                 goto error;
1892
1893         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1894                 pci_name(pdev), size, (u64)addr,
1895                 size, (u64)start_addr, dir);
1896
1897         /* it's a non-present to present mapping */
1898         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1899                         start_addr, size >> PAGE_SHIFT_4K, 1);
1900         if (ret)
1901                 iommu_flush_write_buffer(domain->iommu);
1902
1903         return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));
1904
1905 error:
1906         if (iova)
1907                 __free_iova(&domain->iovad, iova);
1908         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
1909                 pci_name(pdev), size, (u64)addr, dir);
1910         return 0;
1911 }
1912
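/*
 * Reverse of intel_map_single(): look up the IOVA that covers @dev_addr,
 * clear and free its page-table range, flush the IOTLB and release the
 * IOVA.  Pass-through devices have nothing to undo.
 */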
1913 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1914         size_t size, int dir)
1915 {
1916         struct pci_dev *pdev = to_pci_dev(dev);
1917         struct dmar_domain *domain;
1918         unsigned long start_addr;
1919         struct iova *iova;
1920
1921         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1922                 return;
1923         domain = find_domain(pdev);
1924         BUG_ON(!domain);
1925
1926         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1927         if (!iova)
1928                 return;
1929
1930         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1931         size = aligned_size((u64)dev_addr, size);
1932
1933         pr_debug("Device %s unmapping: %lx@%llx\n",
1934                 pci_name(pdev), size, (u64)start_addr);
1935
1936         /*  clear the whole page */
1937         dma_pte_clear_range(domain, start_addr, start_addr + size);
1938         /* free page tables */
1939         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1940
1941         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
1942                         size >> PAGE_SHIFT_4K, 0))
1943                 iommu_flush_write_buffer(domain->iommu);
1944
1945         /* free iova */
1946         __free_iova(&domain->iovad, iova);
1947 }
1948
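/*
 * dma_mapping_ops->alloc_coherent: allocate zeroed pages (the GFP_DMA*
 * zone flags are dropped) and map them bidirectionally via
 * intel_map_single().
 */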
1949 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1950                        dma_addr_t *dma_handle, gfp_t flags)
1951 {
1952         void *vaddr;
1953         int order;
1954
1955         size = PAGE_ALIGN_4K(size);
1956         order = get_order(size);
1957         flags &= ~(GFP_DMA | GFP_DMA32);
1958
1959         vaddr = (void *)__get_free_pages(flags, order);
1960         if (!vaddr)
1961                 return NULL;
1962         memset(vaddr, 0, size);
1963
1964         *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
1965         if (*dma_handle)
1966                 return vaddr;
1967         free_pages((unsigned long)vaddr, order);
1968         return NULL;
1969 }
1970
1971 static void intel_free_coherent(struct device *hwdev, size_t size,
1972         void *vaddr, dma_addr_t dma_handle)
1973 {
1974         int order;
1975
1976         size = PAGE_ALIGN_4K(size);
1977         order = get_order(size);
1978
1979         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
1980         free_pages((unsigned long)vaddr, order);
1981 }
1982
1983 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
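/*
 * dma_mapping_ops->unmap_sg: the list was mapped into one contiguous IOVA
 * range, so recompute its total size, tear down the page tables for that
 * range, flush the IOTLB and free the IOVA.
 */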
1984 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
1985         int nelems, int dir)
1986 {
1987         int i;
1988         struct pci_dev *pdev = to_pci_dev(hwdev);
1989         struct dmar_domain *domain;
1990         unsigned long start_addr;
1991         struct iova *iova;
1992         size_t size = 0;
1993         void *addr;
1994         struct scatterlist *sg;
1995
1996         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1997                 return;
1998
1999         domain = find_domain(pdev);
2000
2001         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2002         if (!iova)
2003                 return;
2004         for_each_sg(sglist, sg, nelems, i) {
2005                 addr = SG_ENT_VIRT_ADDRESS(sg);
2006                 size += aligned_size((u64)addr, sg->length);
2007         }
2008
2009         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2010
2011         /*  clear the whole page */
2012         dma_pte_clear_range(domain, start_addr, start_addr + size);
2013         /* free page tables */
2014         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2015
2016         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2017                         size >> PAGE_SHIFT_4K, 0))
2018                 iommu_flush_write_buffer(domain->iommu);
2019
2020         /* free iova */
2021         __free_iova(&domain->iovad, iova);
2022 }
2023
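/*
 * Scatterlist mapping for devices that bypass translation: hand back the
 * bus addresses of the pages directly.
 */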
2024 static int intel_nontranslate_map_sg(struct device *hddev,
2025         struct scatterlist *sglist, int nelems, int dir)
2026 {
2027         int i;
2028         struct scatterlist *sg;
2029
2030         for_each_sg(sglist, sg, nelems, i) {
2031                 BUG_ON(!sg_page(sg));
2032                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2033                 sg->dma_length = sg->length;
2034         }
2035         return nelems;
2036 }
2037
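/*
 * dma_mapping_ops->map_sg: allocate one contiguous IOVA range large enough
 * for the whole scatterlist, map each element into it back to back, and
 * roll everything back if any single mapping fails.
 */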
2038 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2039                                 int nelems, int dir)
2040 {
2041         void *addr;
2042         int i;
2043         struct pci_dev *pdev = to_pci_dev(hwdev);
2044         struct dmar_domain *domain;
2045         size_t size = 0;
2046         int prot = 0;
2047         size_t offset = 0;
2048         struct iova *iova = NULL;
2049         int ret;
2050         struct scatterlist *sg;
2051         unsigned long start_addr;
2052
2053         BUG_ON(dir == DMA_NONE);
2054         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2055                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2056
2057         domain = get_valid_domain_for_dev(pdev);
2058         if (!domain)
2059                 return 0;
2060
2061         for_each_sg(sglist, sg, nelems, i) {
2062                 addr = SG_ENT_VIRT_ADDRESS(sg);
2063                 addr = (void *)virt_to_phys(addr);
2064                 size += aligned_size((u64)addr, sg->length);
2065         }
2066
2067         iova = __intel_alloc_iova(hwdev, domain, size);
2068         if (!iova) {
2069                 sglist->dma_length = 0;
2070                 return 0;
2071         }
2072
2073         /*
2074          * Check if the DMAR unit supports zero-length reads on write-only
2075          * mappings.
2076          */
2077         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2078                         !cap_zlr(domain->iommu->cap))
2079                 prot |= DMA_PTE_READ;
2080         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2081                 prot |= DMA_PTE_WRITE;
2082
2083         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2084         offset = 0;
2085         for_each_sg(sglist, sg, nelems, i) {
2086                 addr = SG_ENT_VIRT_ADDRESS(sg);
2087                 addr = (void *)virt_to_phys(addr);
2088                 size = aligned_size((u64)addr, sg->length);
2089                 ret = domain_page_mapping(domain, start_addr + offset,
2090                         ((u64)addr) & PAGE_MASK_4K,
2091                         size, prot);
2092                 if (ret) {
2093                         /*  clear the page */
2094                         dma_pte_clear_range(domain, start_addr,
2095                                   start_addr + offset);
2096                         /* free page tables */
2097                         dma_pte_free_pagetable(domain, start_addr,
2098                                   start_addr + offset);
2099                         /* free iova */
2100                         __free_iova(&domain->iovad, iova);
2101                         return 0;
2102                 }
2103                 sg->dma_address = start_addr + offset +
2104                                 ((u64)addr & (~PAGE_MASK_4K));
2105                 sg->dma_length = sg->length;
2106                 offset += size;
2107         }
2108
2109         /* it's a non-present to present mapping */
2110         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2111                         start_addr, offset >> PAGE_SHIFT_4K, 1))
2112                 iommu_flush_write_buffer(domain->iommu);
2113         return nelems;
2114 }
2115
2116 static struct dma_mapping_ops intel_dma_ops = {
2117         .alloc_coherent = intel_alloc_coherent,
2118         .free_coherent = intel_free_coherent,
2119         .map_single = intel_map_single,
2120         .unmap_single = intel_unmap_single,
2121         .map_sg = intel_map_sg,
2122         .unmap_sg = intel_unmap_sg,
2123 };
2124
2125 static inline int iommu_domain_cache_init(void)
2126 {
2127         int ret = 0;
2128
2129         iommu_domain_cache = kmem_cache_create("iommu_domain",
2130                                          sizeof(struct dmar_domain),
2131                                          0,
2132                                          SLAB_HWCACHE_ALIGN,
2134                                          NULL);
2135         if (!iommu_domain_cache) {
2136                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2137                 ret = -ENOMEM;
2138         }
2139
2140         return ret;
2141 }
2142
2143 static inline int iommu_devinfo_cache_init(void)
2144 {
2145         int ret = 0;
2146
2147         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2148                                          sizeof(struct device_domain_info),
2149                                          0,
2150                                          SLAB_HWCACHE_ALIGN,
2152                                          NULL);
2153         if (!iommu_devinfo_cache) {
2154                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2155                 ret = -ENOMEM;
2156         }
2157
2158         return ret;
2159 }
2160
2161 static inline int iommu_iova_cache_init(void)
2162 {
2163         int ret = 0;
2164
2165         iommu_iova_cache = kmem_cache_create("iommu_iova",
2166                                          sizeof(struct iova),
2167                                          0,
2168                                          SLAB_HWCACHE_ALIGN,
2170                                          NULL);
2171         if (!iommu_iova_cache) {
2172                 printk(KERN_ERR "Couldn't create iova cache\n");
2173                 ret = -ENOMEM;
2174         }
2175
2176         return ret;
2177 }
2178
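/* Create the slab caches used for domain, device-info and IOVA objects. */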
2179 static int __init iommu_init_mempool(void)
2180 {
2181         int ret;
2182         ret = iommu_iova_cache_init();
2183         if (ret)
2184                 return ret;
2185
2186         ret = iommu_domain_cache_init();
2187         if (ret)
2188                 goto domain_error;
2189
2190         ret = iommu_devinfo_cache_init();
2191         if (!ret)
2192                 return ret;
2193
2194         kmem_cache_destroy(iommu_domain_cache);
2195 domain_error:
2196         kmem_cache_destroy(iommu_iova_cache);
2197
2198         return -ENOMEM;
2199 }
2200
2201 static void __init iommu_exit_mempool(void)
2202 {
2203         kmem_cache_destroy(iommu_devinfo_cache);
2204         kmem_cache_destroy(iommu_domain_cache);
2205         kmem_cache_destroy(iommu_iova_cache);
2206
2207 }
2208
2209 void __init detect_intel_iommu(void)
2210 {
2211         if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2212                 return;
2213         if (early_dmar_detect()) {
2214                 iommu_detected = 1;
2215         }
2216 }
2217
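/*
 * Mark DMAR units that cover no PCI devices as ignored and, when graphics
 * mapping is disabled, bypass units that serve only graphics devices.
 */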
2218 static void __init init_no_remapping_devices(void)
2219 {
2220         struct dmar_drhd_unit *drhd;
2221
2222         for_each_drhd_unit(drhd) {
2223                 if (!drhd->include_all) {
2224                         int i;
2225                         for (i = 0; i < drhd->devices_cnt; i++)
2226                                 if (drhd->devices[i] != NULL)
2227                                         break;
2228                         /* ignore this DMAR unit if no PCI devices exist */
2229                         if (i == drhd->devices_cnt)
2230                                 drhd->ignored = 1;
2231                 }
2232         }
2233
2234         if (dmar_map_gfx)
2235                 return;
2236
2237         for_each_drhd_unit(drhd) {
2238                 int i;
2239                 if (drhd->ignored || drhd->include_all)
2240                         continue;
2241
2242                 for (i = 0; i < drhd->devices_cnt; i++)
2243                         if (drhd->devices[i] &&
2244                                 !IS_GFX_DEVICE(drhd->devices[i]))
2245                                 break;
2246
2247                 if (i < drhd->devices_cnt)
2248                         continue;
2249
2250                 /* bypass IOMMU if it is just for gfx devices */
2251                 drhd->ignored = 1;
2252                 for (i = 0; i < drhd->devices_cnt; i++) {
2253                         if (!drhd->devices[i])
2254                                 continue;
2255                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2256                 }
2257         }
2258 }
2259
2260 int __init intel_iommu_init(void)
2261 {
2262         int ret = 0;
2263
2264         if (no_iommu || swiotlb || dmar_disabled)
2265                 return -ENODEV;
2266
2267         if (dmar_table_init())
2268                 return  -ENODEV;
2269
2270         iommu_init_mempool();
2271         dmar_init_reserved_ranges();
2272
2273         init_no_remapping_devices();
2274
2275         ret = init_dmars();
2276         if (ret) {
2277                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2278                 put_iova_domain(&reserved_iova_list);
2279                 iommu_exit_mempool();
2280                 return ret;
2281         }
2282         printk(KERN_INFO
2283                 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2284
2285         force_iommu = 1;
2286         dma_ops = &intel_dma_ops;
2287         return 0;
2288 }
2289