intel-iommu: Enable DMAR on 32-bit kernel.
drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
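/*
 * Worked example (assuming 4 KiB pages, i.e. PAGE_SHIFT == 12):
 * IOVA_PFN(DMA_32BIT_MASK) == 0xffffffff >> 12 == 0xfffff, so
 * DMA_32BIT_PFN is the highest page frame reachable with a 32-bit
 * DMA address.
 */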
60
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 static int rwbf_quirk;
65
66 /*
67  * 0: Present
68  * 1-11: Reserved
69  * 12-63: Context Ptr (12 - (haw-1))
70  * 64-127: Reserved
71  */
72 struct root_entry {
73         u64     val;
74         u64     rsvd1;
75 };
76 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
77 static inline bool root_present(struct root_entry *root)
78 {
79         return (root->val & 1);
80 }
81 static inline void set_root_present(struct root_entry *root)
82 {
83         root->val |= 1;
84 }
85 static inline void set_root_value(struct root_entry *root, unsigned long value)
86 {
87         root->val |= value & VTD_PAGE_MASK;
88 }
89
90 static inline struct context_entry *
91 get_context_addr_from_root(struct root_entry *root)
92 {
93         return (struct context_entry *)
94                 (root_present(root)?phys_to_virt(
95                 root->val & VTD_PAGE_MASK) :
96                 NULL);
97 }
98
99 /*
100  * low 64 bits:
101  * 0: present
102  * 1: fault processing disable
103  * 2-3: translation type
104  * 12-63: address space root
105  * high 64 bits:
106  * 0-2: address width
107  * 3-6: avail
108  * 8-23: domain id
109  */
110 struct context_entry {
111         u64 lo;
112         u64 hi;
113 };
114
115 static inline bool context_present(struct context_entry *context)
116 {
117         return (context->lo & 1);
118 }
119 static inline void context_set_present(struct context_entry *context)
120 {
121         context->lo |= 1;
122 }
123
124 static inline void context_set_fault_enable(struct context_entry *context)
125 {
126         context->lo &= (((u64)-1) << 2) | 1;
127 }
128
129 #define CONTEXT_TT_MULTI_LEVEL 0
130
131 static inline void context_set_translation_type(struct context_entry *context,
132                                                 unsigned long value)
133 {
134         context->lo &= (((u64)-1) << 4) | 3;
135         context->lo |= (value & 3) << 2;
136 }
137
138 static inline void context_set_address_root(struct context_entry *context,
139                                             unsigned long value)
140 {
141         context->lo |= value & VTD_PAGE_MASK;
142 }
143
144 static inline void context_set_address_width(struct context_entry *context,
145                                              unsigned long value)
146 {
147         context->hi |= value & 7;
148 }
149
150 static inline void context_set_domain_id(struct context_entry *context,
151                                          unsigned long value)
152 {
153         context->hi |= (value & ((1 << 16) - 1)) << 8;
154 }
155
156 static inline void context_clear_entry(struct context_entry *context)
157 {
158         context->lo = 0;
159         context->hi = 0;
160 }
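/*
 * Illustrative use of the helpers above (ce and pgd_phys are
 * hypothetical names): programming an entry for domain id 5 with a
 * 4-level (48-bit) page table at physical address pgd_phys would do
 * roughly
 *
 *      context_set_domain_id(ce, 5);        // hi |= 5 << 8
 *      context_set_address_width(ce, 2);    // hi |= 2 (agaw 2 == 4 levels)
 *      context_set_address_root(ce, pgd_phys);
 *      context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
 *      context_set_present(ce);             // lo |= 1, entry now valid
 *
 * domain_context_mapping_one() below performs essentially this sequence.
 */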
161
162 /*
163  * 0: readable
164  * 1: writable
165  * 2-6: reserved
166  * 7: super page
167  * 8-10: available
168  * 11: snoop behavior
169  * 12-63: Host physical address
170  */
171 struct dma_pte {
172         u64 val;
173 };
174
175 static inline void dma_clear_pte(struct dma_pte *pte)
176 {
177         pte->val = 0;
178 }
179
180 static inline void dma_set_pte_readable(struct dma_pte *pte)
181 {
182         pte->val |= DMA_PTE_READ;
183 }
184
185 static inline void dma_set_pte_writable(struct dma_pte *pte)
186 {
187         pte->val |= DMA_PTE_WRITE;
188 }
189
190 static inline void dma_set_pte_snp(struct dma_pte *pte)
191 {
192         pte->val |= DMA_PTE_SNP;
193 }
194
195 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
196 {
197         pte->val = (pte->val & ~3) | (prot & 3);
198 }
199
200 static inline u64 dma_pte_addr(struct dma_pte *pte)
201 {
202         return (pte->val & VTD_PAGE_MASK);
203 }
204
205 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
206 {
207         pte->val |= (addr & VTD_PAGE_MASK);
208 }
209
210 static inline bool dma_pte_present(struct dma_pte *pte)
211 {
212         return (pte->val & 3) != 0;
213 }
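/*
 * Example (the physical address is hypothetical): starting from a
 * cleared pte, mapping a host page at 0x12345000 read/write does
 *
 *      dma_set_pte_addr(pte, 0x12345000);   // val = 0x12345000
 *      dma_set_pte_readable(pte);           // bit 0 (readable)
 *      dma_set_pte_writable(pte);           // bit 1 (writable)
 *
 * leaving pte->val == 0x12345003; dma_pte_present() is then true
 * because at least one of the low two bits is set.
 */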
214
215 /* devices under the same p2p bridge are owned in one domain */
216 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
217
218 /* domain represents a virtual machine; more than one device
219  * across iommus may be owned by one domain, e.g. a kvm guest.
220  */
221 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
222
223 struct dmar_domain {
224         int     id;                     /* domain id */
225         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
226
227         struct list_head devices;       /* all devices' list */
228         struct iova_domain iovad;       /* iova's that belong to this domain */
229
230         struct dma_pte  *pgd;           /* virtual address */
231         spinlock_t      mapping_lock;   /* page table lock */
232         int             gaw;            /* max guest address width */
233
234         /* adjusted guest address width, 0 is level 2 30-bit */
235         int             agaw;
236
237         int             flags;          /* flags to find out type of domain */
238
239         int             iommu_coherency;/* indicate coherency of iommu access */
240         int             iommu_snooping; /* indicate snooping control feature*/
241         int             iommu_count;    /* reference count of iommu */
242         spinlock_t      iommu_lock;     /* protect iommu set in domain */
243         u64             max_addr;       /* maximum mapped address */
244 };
245
246 /* PCI domain-device relationship */
247 struct device_domain_info {
248         struct list_head link;  /* link to domain siblings */
249         struct list_head global; /* link to global list */
250         u8 bus;                 /* PCI bus number */
251         u8 devfn;               /* PCI devfn number */
252         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
253         struct dmar_domain *domain; /* pointer to domain */
254 };
255
256 static void flush_unmaps_timeout(unsigned long data);
257
258 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
259
260 #define HIGH_WATER_MARK 250
261 struct deferred_flush_tables {
262         int next;
263         struct iova *iova[HIGH_WATER_MARK];
264         struct dmar_domain *domain[HIGH_WATER_MARK];
265 };
266
267 static struct deferred_flush_tables *deferred_flush;
268
269 /* number of iommus, used to bound the iommu bitmaps and g_iommus[] */
270 static int g_num_of_iommus;
271
272 static DEFINE_SPINLOCK(async_umap_flush_lock);
273 static LIST_HEAD(unmaps_to_do);
274
275 static int timer_on;
276 static long list_size;
277
278 static void domain_remove_dev_info(struct dmar_domain *domain);
279
280 #ifdef CONFIG_DMAR_DEFAULT_ON
281 int dmar_disabled = 0;
282 #else
283 int dmar_disabled = 1;
284 #endif /*CONFIG_DMAR_DEFAULT_ON*/
285
286 static int __initdata dmar_map_gfx = 1;
287 static int dmar_forcedac;
288 static int intel_iommu_strict;
289
290 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
291 static DEFINE_SPINLOCK(device_domain_lock);
292 static LIST_HEAD(device_domain_list);
293
294 static struct iommu_ops intel_iommu_ops;
295
296 static int __init intel_iommu_setup(char *str)
297 {
298         if (!str)
299                 return -EINVAL;
300         while (*str) {
301                 if (!strncmp(str, "on", 2)) {
302                         dmar_disabled = 0;
303                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
304                 } else if (!strncmp(str, "off", 3)) {
305                         dmar_disabled = 1;
306                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
307                 } else if (!strncmp(str, "igfx_off", 8)) {
308                         dmar_map_gfx = 0;
309                         printk(KERN_INFO
310                                 "Intel-IOMMU: disable GFX device mapping\n");
311                 } else if (!strncmp(str, "forcedac", 8)) {
312                         printk(KERN_INFO
313                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
314                         dmar_forcedac = 1;
315                 } else if (!strncmp(str, "strict", 6)) {
316                         printk(KERN_INFO
317                                 "Intel-IOMMU: disable batched IOTLB flush\n");
318                         intel_iommu_strict = 1;
319                 }
320
321                 str += strcspn(str, ",");
322                 while (*str == ',')
323                         str++;
324         }
325         return 0;
326 }
327 __setup("intel_iommu=", intel_iommu_setup);
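/*
 * The options above are comma separated on the kernel command line,
 * e.g. "intel_iommu=on,strict,igfx_off" enables DMAR, disables batched
 * IOTLB flushing and leaves graphics devices unmapped.
 */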
328
329 static struct kmem_cache *iommu_domain_cache;
330 static struct kmem_cache *iommu_devinfo_cache;
331 static struct kmem_cache *iommu_iova_cache;
332
333 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
334 {
335         unsigned int flags;
336         void *vaddr;
337
338         /* trying to avoid low memory issues */
339         flags = current->flags & PF_MEMALLOC;
340         current->flags |= PF_MEMALLOC;
341         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
342         current->flags &= (~PF_MEMALLOC | flags);
343         return vaddr;
344 }
345
346
347 static inline void *alloc_pgtable_page(void)
348 {
349         unsigned int flags;
350         void *vaddr;
351
352         /* trying to avoid low memory issues */
353         flags = current->flags & PF_MEMALLOC;
354         current->flags |= PF_MEMALLOC;
355         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
356         current->flags &= (~PF_MEMALLOC | flags);
357         return vaddr;
358 }
359
360 static inline void free_pgtable_page(void *vaddr)
361 {
362         free_page((unsigned long)vaddr);
363 }
364
365 static inline void *alloc_domain_mem(void)
366 {
367         return iommu_kmem_cache_alloc(iommu_domain_cache);
368 }
369
370 static void free_domain_mem(void *vaddr)
371 {
372         kmem_cache_free(iommu_domain_cache, vaddr);
373 }
374
375 static inline void * alloc_devinfo_mem(void)
376 {
377         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
378 }
379
380 static inline void free_devinfo_mem(void *vaddr)
381 {
382         kmem_cache_free(iommu_devinfo_cache, vaddr);
383 }
384
385 struct iova *alloc_iova_mem(void)
386 {
387         return iommu_kmem_cache_alloc(iommu_iova_cache);
388 }
389
390 void free_iova_mem(struct iova *iova)
391 {
392         kmem_cache_free(iommu_iova_cache, iova);
393 }
394
395
396 static inline int width_to_agaw(int width);
397
398 /* calculate agaw for each iommu.
399  * "SAGAW" may be different across iommus, use a default agaw, and
400  * fall back to a smaller supported agaw for iommus that don't support it.
401  */
402 int iommu_calculate_agaw(struct intel_iommu *iommu)
403 {
404         unsigned long sagaw;
405         int agaw = -1;
406
407         sagaw = cap_sagaw(iommu->cap);
408         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
409              agaw >= 0; agaw--) {
410                 if (test_bit(agaw, &sagaw))
411                         break;
412         }
413
414         return agaw;
415 }
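/*
 * Example: with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48, the loop starts at
 * width_to_agaw(48) == (48 - 30) / 9 == 2 (a 4-level, 48-bit table).
 * If the unit's SAGAW field only advertises 3-level support (bit 1),
 * the loop falls back to agaw 1, i.e. a 39-bit address width.
 */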
416
417 /* in native case, each domain is related to only one iommu */
418 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
419 {
420         int iommu_id;
421
422         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
423
424         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
425         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
426                 return NULL;
427
428         return g_iommus[iommu_id];
429 }
430
431 static void domain_update_iommu_coherency(struct dmar_domain *domain)
432 {
433         int i;
434
435         domain->iommu_coherency = 1;
436
437         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
438         for (; i < g_num_of_iommus; ) {
439                 if (!ecap_coherent(g_iommus[i]->ecap)) {
440                         domain->iommu_coherency = 0;
441                         break;
442                 }
443                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
444         }
445 }
446
447 static void domain_update_iommu_snooping(struct dmar_domain *domain)
448 {
449         int i;
450
451         domain->iommu_snooping = 1;
452
453         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
454         for (; i < g_num_of_iommus; ) {
455                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
456                         domain->iommu_snooping = 0;
457                         break;
458                 }
459                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
460         }
461 }
462
463 /* Some capabilities may be different across iommus */
464 static void domain_update_iommu_cap(struct dmar_domain *domain)
465 {
466         domain_update_iommu_coherency(domain);
467         domain_update_iommu_snooping(domain);
468 }
469
470 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
471 {
472         struct dmar_drhd_unit *drhd = NULL;
473         int i;
474
475         for_each_drhd_unit(drhd) {
476                 if (drhd->ignored)
477                         continue;
478
479                 for (i = 0; i < drhd->devices_cnt; i++)
480                         if (drhd->devices[i] &&
481                             drhd->devices[i]->bus->number == bus &&
482                             drhd->devices[i]->devfn == devfn)
483                                 return drhd->iommu;
484
485                 if (drhd->include_all)
486                         return drhd->iommu;
487         }
488
489         return NULL;
490 }
491
492 static void domain_flush_cache(struct dmar_domain *domain,
493                                void *addr, int size)
494 {
495         if (!domain->iommu_coherency)
496                 clflush_cache_range(addr, size);
497 }
498
499 /* Gets context entry for a given bus and devfn */
500 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
501                 u8 bus, u8 devfn)
502 {
503         struct root_entry *root;
504         struct context_entry *context;
505         unsigned long phy_addr;
506         unsigned long flags;
507
508         spin_lock_irqsave(&iommu->lock, flags);
509         root = &iommu->root_entry[bus];
510         context = get_context_addr_from_root(root);
511         if (!context) {
512                 context = (struct context_entry *)alloc_pgtable_page();
513                 if (!context) {
514                         spin_unlock_irqrestore(&iommu->lock, flags);
515                         return NULL;
516                 }
517                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
518                 phy_addr = virt_to_phys((void *)context);
519                 set_root_value(root, phy_addr);
520                 set_root_present(root);
521                 __iommu_flush_cache(iommu, root, sizeof(*root));
522         }
523         spin_unlock_irqrestore(&iommu->lock, flags);
524         return &context[devfn];
525 }
526
527 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
528 {
529         struct root_entry *root;
530         struct context_entry *context;
531         int ret;
532         unsigned long flags;
533
534         spin_lock_irqsave(&iommu->lock, flags);
535         root = &iommu->root_entry[bus];
536         context = get_context_addr_from_root(root);
537         if (!context) {
538                 ret = 0;
539                 goto out;
540         }
541         ret = context_present(&context[devfn]);
542 out:
543         spin_unlock_irqrestore(&iommu->lock, flags);
544         return ret;
545 }
546
547 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
548 {
549         struct root_entry *root;
550         struct context_entry *context;
551         unsigned long flags;
552
553         spin_lock_irqsave(&iommu->lock, flags);
554         root = &iommu->root_entry[bus];
555         context = get_context_addr_from_root(root);
556         if (context) {
557                 context_clear_entry(&context[devfn]);
558                 __iommu_flush_cache(iommu, &context[devfn], \
559                         sizeof(*context));
560         }
561         spin_unlock_irqrestore(&iommu->lock, flags);
562 }
563
564 static void free_context_table(struct intel_iommu *iommu)
565 {
566         struct root_entry *root;
567         int i;
568         unsigned long flags;
569         struct context_entry *context;
570
571         spin_lock_irqsave(&iommu->lock, flags);
572         if (!iommu->root_entry) {
573                 goto out;
574         }
575         for (i = 0; i < ROOT_ENTRY_NR; i++) {
576                 root = &iommu->root_entry[i];
577                 context = get_context_addr_from_root(root);
578                 if (context)
579                         free_pgtable_page(context);
580         }
581         free_pgtable_page(iommu->root_entry);
582         iommu->root_entry = NULL;
583 out:
584         spin_unlock_irqrestore(&iommu->lock, flags);
585 }
586
587 /* page table handling */
588 #define LEVEL_STRIDE            (9)
589 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
590
591 static inline int agaw_to_level(int agaw)
592 {
593         return agaw + 2;
594 }
595
596 static inline int agaw_to_width(int agaw)
597 {
598         return 30 + agaw * LEVEL_STRIDE;
599
600 }
601
602 static inline int width_to_agaw(int width)
603 {
604         return (width - 30) / LEVEL_STRIDE;
605 }
606
607 static inline unsigned int level_to_offset_bits(int level)
608 {
609         return (12 + (level - 1) * LEVEL_STRIDE);
610 }
611
612 static inline int address_level_offset(u64 addr, int level)
613 {
614         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
615 }
616
617 static inline u64 level_mask(int level)
618 {
619         return ((u64)-1 << level_to_offset_bits(level));
620 }
621
622 static inline u64 level_size(int level)
623 {
624         return ((u64)1 << level_to_offset_bits(level));
625 }
626
627 static inline u64 align_to_level(u64 addr, int level)
628 {
629         return ((addr + level_size(level) - 1) & level_mask(level));
630 }
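/*
 * Worked numbers for the helpers above: level 1 covers bits 12-20 of an
 * address, level 2 bits 21-29, and so on (LEVEL_STRIDE == 9), so
 * level_size(2) == 2 MiB and address_level_offset(addr, 2) picks the
 * 9-bit index ((addr >> 21) & 0x1ff) into a level-2 table.  A domain
 * with agaw 2 uses agaw_to_level(2) == 4 levels for its 48-bit space.
 */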
631
632 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
633 {
634         int addr_width = agaw_to_width(domain->agaw);
635         struct dma_pte *parent, *pte = NULL;
636         int level = agaw_to_level(domain->agaw);
637         int offset;
638         unsigned long flags;
639
640         BUG_ON(!domain->pgd);
641
642         addr &= (((u64)1) << addr_width) - 1;
643         parent = domain->pgd;
644
645         spin_lock_irqsave(&domain->mapping_lock, flags);
646         while (level > 0) {
647                 void *tmp_page;
648
649                 offset = address_level_offset(addr, level);
650                 pte = &parent[offset];
651                 if (level == 1)
652                         break;
653
654                 if (!dma_pte_present(pte)) {
655                         tmp_page = alloc_pgtable_page();
656
657                         if (!tmp_page) {
658                                 spin_unlock_irqrestore(&domain->mapping_lock,
659                                         flags);
660                                 return NULL;
661                         }
662                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
663                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
664                         /*
665                          * high level table always sets r/w, last level page
666                          * table control read/write
667                          */
668                         dma_set_pte_readable(pte);
669                         dma_set_pte_writable(pte);
670                         domain_flush_cache(domain, pte, sizeof(*pte));
671                 }
672                 parent = phys_to_virt(dma_pte_addr(pte));
673                 level--;
674         }
675
676         spin_unlock_irqrestore(&domain->mapping_lock, flags);
677         return pte;
678 }
679
680 /* return address's pte at specific level */
681 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
682                 int level)
683 {
684         struct dma_pte *parent, *pte = NULL;
685         int total = agaw_to_level(domain->agaw);
686         int offset;
687
688         parent = domain->pgd;
689         while (level <= total) {
690                 offset = address_level_offset(addr, total);
691                 pte = &parent[offset];
692                 if (level == total)
693                         return pte;
694
695                 if (!dma_pte_present(pte))
696                         break;
697                 parent = phys_to_virt(dma_pte_addr(pte));
698                 total--;
699         }
700         return NULL;
701 }
702
703 /* clear one page's page table */
704 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
705 {
706         struct dma_pte *pte = NULL;
707
708         /* get last level pte */
709         pte = dma_addr_level_pte(domain, addr, 1);
710
711         if (pte) {
712                 dma_clear_pte(pte);
713                 domain_flush_cache(domain, pte, sizeof(*pte));
714         }
715 }
716
717 /* clear last level pte, a tlb flush should follow */
718 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
719 {
720         int addr_width = agaw_to_width(domain->agaw);
721
722         start &= (((u64)1) << addr_width) - 1;
723         end &= (((u64)1) << addr_width) - 1;
724         /* in case it's partial page */
725         start = PAGE_ALIGN(start);
726         end &= PAGE_MASK;
727
728         /* we don't need lock here, nobody else touches the iova range */
729         while (start < end) {
730                 dma_pte_clear_one(domain, start);
731                 start += VTD_PAGE_SIZE;
732         }
733 }
734
735 /* free page table pages. last level pte should already be cleared */
736 static void dma_pte_free_pagetable(struct dmar_domain *domain,
737         u64 start, u64 end)
738 {
739         int addr_width = agaw_to_width(domain->agaw);
740         struct dma_pte *pte;
741         int total = agaw_to_level(domain->agaw);
742         int level;
743         u64 tmp;
744
745         start &= (((u64)1) << addr_width) - 1;
746         end &= (((u64)1) << addr_width) - 1;
747
748         /* we don't need lock here, nobody else touches the iova range */
749         level = 2;
750         while (level <= total) {
751                 tmp = align_to_level(start, level);
752                 if (tmp >= end || (tmp + level_size(level) > end))
753                         return;
754
755                 while (tmp < end) {
756                         pte = dma_addr_level_pte(domain, tmp, level);
757                         if (pte) {
758                                 free_pgtable_page(
759                                         phys_to_virt(dma_pte_addr(pte)));
760                                 dma_clear_pte(pte);
761                                 domain_flush_cache(domain, pte, sizeof(*pte));
762                         }
763                         tmp += level_size(level);
764                 }
765                 level++;
766         }
767         /* free pgd */
768         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
769                 free_pgtable_page(domain->pgd);
770                 domain->pgd = NULL;
771         }
772 }
773
774 /* iommu handling */
775 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
776 {
777         struct root_entry *root;
778         unsigned long flags;
779
780         root = (struct root_entry *)alloc_pgtable_page();
781         if (!root)
782                 return -ENOMEM;
783
784         __iommu_flush_cache(iommu, root, ROOT_SIZE);
785
786         spin_lock_irqsave(&iommu->lock, flags);
787         iommu->root_entry = root;
788         spin_unlock_irqrestore(&iommu->lock, flags);
789
790         return 0;
791 }
792
793 static void iommu_set_root_entry(struct intel_iommu *iommu)
794 {
795         void *addr;
796         u32 cmd, sts;
797         unsigned long flag;
798
799         addr = iommu->root_entry;
800
801         spin_lock_irqsave(&iommu->register_lock, flag);
802         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
803
804         cmd = iommu->gcmd | DMA_GCMD_SRTP;
805         writel(cmd, iommu->reg + DMAR_GCMD_REG);
806
807         /* Make sure hardware completes it */
808         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
809                 readl, (sts & DMA_GSTS_RTPS), sts);
810
811         spin_unlock_irqrestore(&iommu->register_lock, flag);
812 }
813
814 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
815 {
816         u32 val;
817         unsigned long flag;
818
819         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
820                 return;
821         val = iommu->gcmd | DMA_GCMD_WBF;
822
823         spin_lock_irqsave(&iommu->register_lock, flag);
824         writel(val, iommu->reg + DMAR_GCMD_REG);
825
826         /* Make sure hardware completes it */
827         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
828                         readl, (!(val & DMA_GSTS_WBFS)), val);
829
830         spin_unlock_irqrestore(&iommu->register_lock, flag);
831 }
832
833 /* return value determines whether we need a write buffer flush */
834 static int __iommu_flush_context(struct intel_iommu *iommu,
835         u16 did, u16 source_id, u8 function_mask, u64 type,
836         int non_present_entry_flush)
837 {
838         u64 val = 0;
839         unsigned long flag;
840
841         /*
842          * In the non-present entry flush case, if hardware doesn't cache
843          * non-present entries we do nothing; if hardware does cache non-present
844          * entries, we flush entries of domain 0 (the domain id used to cache
845          * any non-present entries)
846          */
847         if (non_present_entry_flush) {
848                 if (!cap_caching_mode(iommu->cap))
849                         return 1;
850                 else
851                         did = 0;
852         }
853
854         switch (type) {
855         case DMA_CCMD_GLOBAL_INVL:
856                 val = DMA_CCMD_GLOBAL_INVL;
857                 break;
858         case DMA_CCMD_DOMAIN_INVL:
859                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
860                 break;
861         case DMA_CCMD_DEVICE_INVL:
862                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
863                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
864                 break;
865         default:
866                 BUG();
867         }
868         val |= DMA_CCMD_ICC;
869
870         spin_lock_irqsave(&iommu->register_lock, flag);
871         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
872
873         /* Make sure hardware completes it */
874         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
875                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
876
877         spin_unlock_irqrestore(&iommu->register_lock, flag);
878
879         /* flush context entry will implicitly flush write buffer */
880         return 0;
881 }
882
883 /* return value determines whether we need a write buffer flush */
884 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
885         u64 addr, unsigned int size_order, u64 type,
886         int non_present_entry_flush)
887 {
888         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
889         u64 val = 0, val_iva = 0;
890         unsigned long flag;
891
892         /*
893          * In the non-present entry flush case, if hardware doesn't cache
894          * non-present entries we do nothing; if hardware does cache non-present
895          * entries, we flush entries of domain 0 (the domain id used to cache
896          * any non-present entries)
897          */
898         if (non_present_entry_flush) {
899                 if (!cap_caching_mode(iommu->cap))
900                         return 1;
901                 else
902                         did = 0;
903         }
904
905         switch (type) {
906         case DMA_TLB_GLOBAL_FLUSH:
907                 /* global flush doesn't need to set IVA_REG */
908                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
909                 break;
910         case DMA_TLB_DSI_FLUSH:
911                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
912                 break;
913         case DMA_TLB_PSI_FLUSH:
914                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
915                 /* Note: always flush non-leaf currently */
916                 val_iva = size_order | addr;
917                 break;
918         default:
919                 BUG();
920         }
921         /* Note: set drain read/write */
922 #if 0
923         /*
924          * This is probably to be super secure.. Looks like we can
925          * ignore it without any impact.
926          */
927         if (cap_read_drain(iommu->cap))
928                 val |= DMA_TLB_READ_DRAIN;
929 #endif
930         if (cap_write_drain(iommu->cap))
931                 val |= DMA_TLB_WRITE_DRAIN;
932
933         spin_lock_irqsave(&iommu->register_lock, flag);
934         /* Note: Only uses first TLB reg currently */
935         if (val_iva)
936                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
937         dmar_writeq(iommu->reg + tlb_offset + 8, val);
938
939         /* Make sure hardware completes it */
940         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
941                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
942
943         spin_unlock_irqrestore(&iommu->register_lock, flag);
944
945         /* check IOTLB invalidation granularity */
946         if (DMA_TLB_IAIG(val) == 0)
947                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
948         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
949                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
950                         (unsigned long long)DMA_TLB_IIRG(type),
951                         (unsigned long long)DMA_TLB_IAIG(val));
952         /* flush iotlb entry will implicitly flush write buffer */
953         return 0;
954 }
955
956 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
957         u64 addr, unsigned int pages, int non_present_entry_flush)
958 {
959         unsigned int mask;
960
961         BUG_ON(addr & (~VTD_PAGE_MASK));
962         BUG_ON(pages == 0);
963
964         /* Fallback to domain selective flush if no PSI support */
965         if (!cap_pgsel_inv(iommu->cap))
966                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
967                                                 DMA_TLB_DSI_FLUSH,
968                                                 non_present_entry_flush);
969
970         /*
971          * PSI requires page size to be 2 ^ x, and the base address is naturally
972          * aligned to the size
973          */
974         mask = ilog2(__roundup_pow_of_two(pages));
975         /* Fallback to domain selective flush if size is too big */
976         if (mask > cap_max_amask_val(iommu->cap))
977                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
978                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
979
980         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
981                                         DMA_TLB_PSI_FLUSH,
982                                         non_present_entry_flush);
983 }
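/*
 * Example: flushing 5 pages rounds up to 8, so mask becomes
 * ilog2(8) == 3 and the hardware invalidates a 2^3-page aligned region
 * starting at addr.  If 3 exceeded cap_max_amask_val(), the code above
 * would instead fall back to a domain-selective flush.
 */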
984
985 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
986 {
987         u32 pmen;
988         unsigned long flags;
989
990         spin_lock_irqsave(&iommu->register_lock, flags);
991         pmen = readl(iommu->reg + DMAR_PMEN_REG);
992         pmen &= ~DMA_PMEN_EPM;
993         writel(pmen, iommu->reg + DMAR_PMEN_REG);
994
995         /* wait for the protected region status bit to clear */
996         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
997                 readl, !(pmen & DMA_PMEN_PRS), pmen);
998
999         spin_unlock_irqrestore(&iommu->register_lock, flags);
1000 }
1001
1002 static int iommu_enable_translation(struct intel_iommu *iommu)
1003 {
1004         u32 sts;
1005         unsigned long flags;
1006
1007         spin_lock_irqsave(&iommu->register_lock, flags);
1008         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
1009
1010         /* Make sure hardware completes it */
1011         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1012                 readl, (sts & DMA_GSTS_TES), sts);
1013
1014         iommu->gcmd |= DMA_GCMD_TE;
1015         spin_unlock_irqrestore(&iommu->register_lock, flags);
1016         return 0;
1017 }
1018
1019 static int iommu_disable_translation(struct intel_iommu *iommu)
1020 {
1021         u32 sts;
1022         unsigned long flag;
1023
1024         spin_lock_irqsave(&iommu->register_lock, flag);
1025         iommu->gcmd &= ~DMA_GCMD_TE;
1026         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1027
1028         /* Make sure hardware completes it */
1029         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1030                 readl, (!(sts & DMA_GSTS_TES)), sts);
1031
1032         spin_unlock_irqrestore(&iommu->register_lock, flag);
1033         return 0;
1034 }
1035
1036 /* iommu interrupt handling. Most of it is MSI-like. */
1037
1038 static const char *fault_reason_strings[] =
1039 {
1040         "Software",
1041         "Present bit in root entry is clear",
1042         "Present bit in context entry is clear",
1043         "Invalid context entry",
1044         "Access beyond MGAW",
1045         "PTE Write access is not set",
1046         "PTE Read access is not set",
1047         "Next page table ptr is invalid",
1048         "Root table address invalid",
1049         "Context table ptr is invalid",
1050         "non-zero reserved fields in RTP",
1051         "non-zero reserved fields in CTP",
1052         "non-zero reserved fields in PTE",
1053 };
1054 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1055
1056 const char *dmar_get_fault_reason(u8 fault_reason)
1057 {
1058         if (fault_reason > MAX_FAULT_REASON_IDX)
1059                 return "Unknown";
1060         else
1061                 return fault_reason_strings[fault_reason];
1062 }
1063
1064 void dmar_msi_unmask(unsigned int irq)
1065 {
1066         struct intel_iommu *iommu = get_irq_data(irq);
1067         unsigned long flag;
1068
1069         /* unmask it */
1070         spin_lock_irqsave(&iommu->register_lock, flag);
1071         writel(0, iommu->reg + DMAR_FECTL_REG);
1072         /* Read a reg to force flush the posted write */
1073         readl(iommu->reg + DMAR_FECTL_REG);
1074         spin_unlock_irqrestore(&iommu->register_lock, flag);
1075 }
1076
1077 void dmar_msi_mask(unsigned int irq)
1078 {
1079         unsigned long flag;
1080         struct intel_iommu *iommu = get_irq_data(irq);
1081
1082         /* mask it */
1083         spin_lock_irqsave(&iommu->register_lock, flag);
1084         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1085         /* Read a reg to force flush the posted write */
1086         readl(iommu->reg + DMAR_FECTL_REG);
1087         spin_unlock_irqrestore(&iommu->register_lock, flag);
1088 }
1089
1090 void dmar_msi_write(int irq, struct msi_msg *msg)
1091 {
1092         struct intel_iommu *iommu = get_irq_data(irq);
1093         unsigned long flag;
1094
1095         spin_lock_irqsave(&iommu->register_lock, flag);
1096         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1097         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1098         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1099         spin_unlock_irqrestore(&iommu->register_lock, flag);
1100 }
1101
1102 void dmar_msi_read(int irq, struct msi_msg *msg)
1103 {
1104         struct intel_iommu *iommu = get_irq_data(irq);
1105         unsigned long flag;
1106
1107         spin_lock_irqsave(&iommu->register_lock, flag);
1108         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1109         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1110         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1111         spin_unlock_irqrestore(&iommu->register_lock, flag);
1112 }
1113
1114 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1115                 u8 fault_reason, u16 source_id, unsigned long long addr)
1116 {
1117         const char *reason;
1118
1119         reason = dmar_get_fault_reason(fault_reason);
1120
1121         printk(KERN_ERR
1122                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1123                 "fault addr %llx \n"
1124                 "DMAR:[fault reason %02d] %s\n",
1125                 (type ? "DMA Read" : "DMA Write"),
1126                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1127                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1128         return 0;
1129 }
1130
1131 #define PRIMARY_FAULT_REG_LEN (16)
1132 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1133 {
1134         struct intel_iommu *iommu = dev_id;
1135         int reg, fault_index;
1136         u32 fault_status;
1137         unsigned long flag;
1138
1139         spin_lock_irqsave(&iommu->register_lock, flag);
1140         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1141
1142         /* TBD: ignore advanced fault log currently */
1143         if (!(fault_status & DMA_FSTS_PPF))
1144                 goto clear_overflow;
1145
1146         fault_index = dma_fsts_fault_record_index(fault_status);
1147         reg = cap_fault_reg_offset(iommu->cap);
1148         while (1) {
1149                 u8 fault_reason;
1150                 u16 source_id;
1151                 u64 guest_addr;
1152                 int type;
1153                 u32 data;
1154
1155                 /* highest 32 bits */
1156                 data = readl(iommu->reg + reg +
1157                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1158                 if (!(data & DMA_FRCD_F))
1159                         break;
1160
1161                 fault_reason = dma_frcd_fault_reason(data);
1162                 type = dma_frcd_type(data);
1163
1164                 data = readl(iommu->reg + reg +
1165                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1166                 source_id = dma_frcd_source_id(data);
1167
1168                 guest_addr = dmar_readq(iommu->reg + reg +
1169                                 fault_index * PRIMARY_FAULT_REG_LEN);
1170                 guest_addr = dma_frcd_page_addr(guest_addr);
1171                 /* clear the fault */
1172                 writel(DMA_FRCD_F, iommu->reg + reg +
1173                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1174
1175                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1176
1177                 iommu_page_fault_do_one(iommu, type, fault_reason,
1178                                 source_id, guest_addr);
1179
1180                 fault_index++;
1181                 if (fault_index > cap_num_fault_regs(iommu->cap))
1182                         fault_index = 0;
1183                 spin_lock_irqsave(&iommu->register_lock, flag);
1184         }
1185 clear_overflow:
1186         /* clear primary fault overflow */
1187         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1188         if (fault_status & DMA_FSTS_PFO)
1189                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1190
1191         spin_unlock_irqrestore(&iommu->register_lock, flag);
1192         return IRQ_HANDLED;
1193 }
1194
1195 int dmar_set_interrupt(struct intel_iommu *iommu)
1196 {
1197         int irq, ret;
1198
1199         irq = create_irq();
1200         if (!irq) {
1201                 printk(KERN_ERR "IOMMU: no free vectors\n");
1202                 return -EINVAL;
1203         }
1204
1205         set_irq_data(irq, iommu);
1206         iommu->irq = irq;
1207
1208         ret = arch_setup_dmar_msi(irq);
1209         if (ret) {
1210                 set_irq_data(irq, NULL);
1211                 iommu->irq = 0;
1212                 destroy_irq(irq);
1213                 return ret;
1214         }
1215
1216         /* Force the fault register to be cleared */
1217         iommu_page_fault(irq, iommu);
1218
1219         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1220         if (ret)
1221                 printk(KERN_ERR "IOMMU: can't request irq\n");
1222         return ret;
1223 }
1224
1225 static int iommu_init_domains(struct intel_iommu *iommu)
1226 {
1227         unsigned long ndomains;
1228         unsigned long nlongs;
1229
1230         ndomains = cap_ndoms(iommu->cap);
1231         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1232         nlongs = BITS_TO_LONGS(ndomains);
1233
1234         /* TBD: there might be 64K domains,
1235          * consider other allocation for future chip
1236          */
1237         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1238         if (!iommu->domain_ids) {
1239                 printk(KERN_ERR "Allocating domain id array failed\n");
1240                 return -ENOMEM;
1241         }
1242         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1243                         GFP_KERNEL);
1244         if (!iommu->domains) {
1245                 printk(KERN_ERR "Allocating domain array failed\n");
1246                 kfree(iommu->domain_ids);
1247                 return -ENOMEM;
1248         }
1249
1250         spin_lock_init(&iommu->lock);
1251
1252         /*
1253          * if Caching mode is set, then invalid translations are tagged
1254          * with domain id 0. Hence we need to pre-allocate it.
1255          */
1256         if (cap_caching_mode(iommu->cap))
1257                 set_bit(0, iommu->domain_ids);
1258         return 0;
1259 }
1260
1261
1262 static void domain_exit(struct dmar_domain *domain);
1263 static void vm_domain_exit(struct dmar_domain *domain);
1264
1265 void free_dmar_iommu(struct intel_iommu *iommu)
1266 {
1267         struct dmar_domain *domain;
1268         int i;
1269         unsigned long flags;
1270
1271         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1272         for (; i < cap_ndoms(iommu->cap); ) {
1273                 domain = iommu->domains[i];
1274                 clear_bit(i, iommu->domain_ids);
1275
1276                 spin_lock_irqsave(&domain->iommu_lock, flags);
1277                 if (--domain->iommu_count == 0) {
1278                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1279                                 vm_domain_exit(domain);
1280                         else
1281                                 domain_exit(domain);
1282                 }
1283                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1284
1285                 i = find_next_bit(iommu->domain_ids,
1286                         cap_ndoms(iommu->cap), i+1);
1287         }
1288
1289         if (iommu->gcmd & DMA_GCMD_TE)
1290                 iommu_disable_translation(iommu);
1291
1292         if (iommu->irq) {
1293                 set_irq_data(iommu->irq, NULL);
1294                 /* This will mask the irq */
1295                 free_irq(iommu->irq, iommu);
1296                 destroy_irq(iommu->irq);
1297         }
1298
1299         kfree(iommu->domains);
1300         kfree(iommu->domain_ids);
1301
1302         g_iommus[iommu->seq_id] = NULL;
1303
1304         /* if all iommus are freed, free g_iommus */
1305         for (i = 0; i < g_num_of_iommus; i++) {
1306                 if (g_iommus[i])
1307                         break;
1308         }
1309
1310         if (i == g_num_of_iommus)
1311                 kfree(g_iommus);
1312
1313         /* free context mapping */
1314         free_context_table(iommu);
1315 }
1316
1317 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1318 {
1319         unsigned long num;
1320         unsigned long ndomains;
1321         struct dmar_domain *domain;
1322         unsigned long flags;
1323
1324         domain = alloc_domain_mem();
1325         if (!domain)
1326                 return NULL;
1327
1328         ndomains = cap_ndoms(iommu->cap);
1329
1330         spin_lock_irqsave(&iommu->lock, flags);
1331         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1332         if (num >= ndomains) {
1333                 spin_unlock_irqrestore(&iommu->lock, flags);
1334                 free_domain_mem(domain);
1335                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1336                 return NULL;
1337         }
1338
1339         set_bit(num, iommu->domain_ids);
1340         domain->id = num;
1341         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1342         set_bit(iommu->seq_id, &domain->iommu_bmp);
1343         domain->flags = 0;
1344         iommu->domains[num] = domain;
1345         spin_unlock_irqrestore(&iommu->lock, flags);
1346
1347         return domain;
1348 }
1349
1350 static void iommu_free_domain(struct dmar_domain *domain)
1351 {
1352         unsigned long flags;
1353         struct intel_iommu *iommu;
1354
1355         iommu = domain_get_iommu(domain);
1356
1357         spin_lock_irqsave(&iommu->lock, flags);
1358         clear_bit(domain->id, iommu->domain_ids);
1359         spin_unlock_irqrestore(&iommu->lock, flags);
1360 }
1361
1362 static struct iova_domain reserved_iova_list;
1363 static struct lock_class_key reserved_alloc_key;
1364 static struct lock_class_key reserved_rbtree_key;
1365
1366 static void dmar_init_reserved_ranges(void)
1367 {
1368         struct pci_dev *pdev = NULL;
1369         struct iova *iova;
1370         int i;
1371         u64 addr, size;
1372
1373         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1374
1375         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1376                 &reserved_alloc_key);
1377         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1378                 &reserved_rbtree_key);
1379
1380         /* IOAPIC ranges shouldn't be accessed by DMA */
1381         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1382                 IOVA_PFN(IOAPIC_RANGE_END));
1383         if (!iova)
1384                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1385
1386         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1387         for_each_pci_dev(pdev) {
1388                 struct resource *r;
1389
1390                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1391                         r = &pdev->resource[i];
1392                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1393                                 continue;
1394                         addr = r->start;
1395                         addr &= PAGE_MASK;
1396                         size = r->end - addr;
1397                         size = PAGE_ALIGN(size);
1398                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1399                                 IOVA_PFN(size + addr) - 1);
1400                         if (!iova)
1401                                 printk(KERN_ERR "Reserve iova failed\n");
1402                 }
1403         }
1404
1405 }
1406
1407 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1408 {
1409         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1410 }
1411
1412 static inline int guestwidth_to_adjustwidth(int gaw)
1413 {
1414         int agaw;
1415         int r = (gaw - 12) % 9;
1416
1417         if (r == 0)
1418                 agaw = gaw;
1419         else
1420                 agaw = gaw + 9 - r;
1421         if (agaw > 64)
1422                 agaw = 64;
1423         return agaw;
1424 }
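/*
 * Example: a guest width of 36 bits gives r == (36 - 12) % 9 == 6, so
 * the width is rounded up to 36 + 9 - 6 == 39 bits, the next size a
 * whole page-table level can express; widths of 39 and 48 come back
 * unchanged.
 */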
1425
1426 static int domain_init(struct dmar_domain *domain, int guest_width)
1427 {
1428         struct intel_iommu *iommu;
1429         int adjust_width, agaw;
1430         unsigned long sagaw;
1431
1432         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1433         spin_lock_init(&domain->mapping_lock);
1434         spin_lock_init(&domain->iommu_lock);
1435
1436         domain_reserve_special_ranges(domain);
1437
1438         /* calculate AGAW */
1439         iommu = domain_get_iommu(domain);
1440         if (guest_width > cap_mgaw(iommu->cap))
1441                 guest_width = cap_mgaw(iommu->cap);
1442         domain->gaw = guest_width;
1443         adjust_width = guestwidth_to_adjustwidth(guest_width);
1444         agaw = width_to_agaw(adjust_width);
1445         sagaw = cap_sagaw(iommu->cap);
1446         if (!test_bit(agaw, &sagaw)) {
1447                 /* hardware doesn't support it, choose a bigger one */
1448                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1449                 agaw = find_next_bit(&sagaw, 5, agaw);
1450                 if (agaw >= 5)
1451                         return -ENODEV;
1452         }
1453         domain->agaw = agaw;
1454         INIT_LIST_HEAD(&domain->devices);
1455
1456         if (ecap_coherent(iommu->ecap))
1457                 domain->iommu_coherency = 1;
1458         else
1459                 domain->iommu_coherency = 0;
1460
1461         if (ecap_sc_support(iommu->ecap))
1462                 domain->iommu_snooping = 1;
1463         else
1464                 domain->iommu_snooping = 0;
1465
1466         domain->iommu_count = 1;
1467
1468         /* always allocate the top pgd */
1469         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1470         if (!domain->pgd)
1471                 return -ENOMEM;
1472         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1473         return 0;
1474 }
1475
1476 static void domain_exit(struct dmar_domain *domain)
1477 {
1478         u64 end;
1479
1480         /* Domain 0 is reserved, so don't process it */
1481         if (!domain)
1482                 return;
1483
1484         domain_remove_dev_info(domain);
1485         /* destroy iovas */
1486         put_iova_domain(&domain->iovad);
1487         end = DOMAIN_MAX_ADDR(domain->gaw);
1488         end = end & (~PAGE_MASK);
1489
1490         /* clear ptes */
1491         dma_pte_clear_range(domain, 0, end);
1492
1493         /* free page tables */
1494         dma_pte_free_pagetable(domain, 0, end);
1495
1496         iommu_free_domain(domain);
1497         free_domain_mem(domain);
1498 }
1499
1500 static int domain_context_mapping_one(struct dmar_domain *domain,
1501                 u8 bus, u8 devfn)
1502 {
1503         struct context_entry *context;
1504         unsigned long flags;
1505         struct intel_iommu *iommu;
1506         struct dma_pte *pgd;
1507         unsigned long num;
1508         unsigned long ndomains;
1509         int id;
1510         int agaw;
1511
1512         pr_debug("Set context mapping for %02x:%02x.%d\n",
1513                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1514         BUG_ON(!domain->pgd);
1515
1516         iommu = device_to_iommu(bus, devfn);
1517         if (!iommu)
1518                 return -ENODEV;
1519
1520         context = device_to_context_entry(iommu, bus, devfn);
1521         if (!context)
1522                 return -ENOMEM;
1523         spin_lock_irqsave(&iommu->lock, flags);
1524         if (context_present(context)) {
1525                 spin_unlock_irqrestore(&iommu->lock, flags);
1526                 return 0;
1527         }
1528
1529         id = domain->id;
1530         pgd = domain->pgd;
1531
1532         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1533                 int found = 0;
1534
1535                 /* find an available domain id for this device in iommu */
1536                 ndomains = cap_ndoms(iommu->cap);
1537                 num = find_first_bit(iommu->domain_ids, ndomains);
1538                 for (; num < ndomains; ) {
1539                         if (iommu->domains[num] == domain) {
1540                                 id = num;
1541                                 found = 1;
1542                                 break;
1543                         }
1544                         num = find_next_bit(iommu->domain_ids,
1545                                             cap_ndoms(iommu->cap), num+1);
1546                 }
1547
1548                 if (found == 0) {
1549                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1550                         if (num >= ndomains) {
1551                                 spin_unlock_irqrestore(&iommu->lock, flags);
1552                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1553                                 return -EFAULT;
1554                         }
1555
1556                         set_bit(num, iommu->domain_ids);
1557                         iommu->domains[num] = domain;
1558                         id = num;
1559                 }
1560
1561                 /* Skip top levels of page tables for
1562                  * iommu which has less agaw than default.
1563                  */
1564                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1565                         pgd = phys_to_virt(dma_pte_addr(pgd));
1566                         if (!dma_pte_present(pgd)) {
1567                                 spin_unlock_irqrestore(&iommu->lock, flags);
1568                                 return -ENOMEM;
1569                         }
1570                 }
1571         }
1572
1573         context_set_domain_id(context, id);
1574         context_set_address_width(context, iommu->agaw);
1575         context_set_address_root(context, virt_to_phys(pgd));
1576         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1577         context_set_fault_enable(context);
1578         context_set_present(context);
1579         domain_flush_cache(domain, context, sizeof(*context));
1580
1581         /* it's a non-present to present mapping */
1582         if (iommu->flush.flush_context(iommu, domain->id,
1583                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1584                 DMA_CCMD_DEVICE_INVL, 1))
1585                 iommu_flush_write_buffer(iommu);
1586         else
1587                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1588
1589         spin_unlock_irqrestore(&iommu->lock, flags);
1590
1591         spin_lock_irqsave(&domain->iommu_lock, flags);
1592         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1593                 domain->iommu_count++;
1594                 domain_update_iommu_cap(domain);
1595         }
1596         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1597         return 0;
1598 }
1599
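/*
 * Set up a context entry for @pdev itself and for every bridge between it
 * and its upstream PCIe-to-PCI bridge, so that DMA issued by devices
 * behind the bridge is translated through this domain as well.
 */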
1600 static int
1601 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1602 {
1603         int ret;
1604         struct pci_dev *tmp, *parent;
1605
1606         ret = domain_context_mapping_one(domain, pdev->bus->number,
1607                 pdev->devfn);
1608         if (ret)
1609                 return ret;
1610
1611         /* dependent device mapping */
1612         tmp = pci_find_upstream_pcie_bridge(pdev);
1613         if (!tmp)
1614                 return 0;
1615         /* Secondary interface's bus number and devfn 0 */
1616         parent = pdev->bus->self;
1617         while (parent != tmp) {
1618                 ret = domain_context_mapping_one(domain, parent->bus->number,
1619                         parent->devfn);
1620                 if (ret)
1621                         return ret;
1622                 parent = parent->bus->self;
1623         }
1624         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1625                 return domain_context_mapping_one(domain,
1626                         tmp->subordinate->number, 0);
1627         else /* this is a legacy PCI bridge */
1628                 return domain_context_mapping_one(domain,
1629                         tmp->bus->number, tmp->devfn);
1630 }
1631
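/*
 * Check whether @pdev already has a context entry programmed; for devices
 * behind a PCIe-to-PCI bridge the whole bridge chain is checked too, and
 * zero is returned as soon as any hop is found unmapped.
 */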
1632 static int domain_context_mapped(struct pci_dev *pdev)
1633 {
1634         int ret;
1635         struct pci_dev *tmp, *parent;
1636         struct intel_iommu *iommu;
1637
1638         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1639         if (!iommu)
1640                 return -ENODEV;
1641
1642         ret = device_context_mapped(iommu,
1643                 pdev->bus->number, pdev->devfn);
1644         if (!ret)
1645                 return ret;
1646         /* dependent device mapping */
1647         tmp = pci_find_upstream_pcie_bridge(pdev);
1648         if (!tmp)
1649                 return ret;
1650         /* Secondary interface's bus number and devfn 0 */
1651         parent = pdev->bus->self;
1652         while (parent != tmp) {
1653                 ret = device_context_mapped(iommu, parent->bus->number,
1654                         parent->devfn);
1655                 if (!ret)
1656                         return ret;
1657                 parent = parent->bus->self;
1658         }
1659         if (tmp->is_pcie)
1660                 return device_context_mapped(iommu,
1661                         tmp->subordinate->number, 0);
1662         else
1663                 return device_context_mapped(iommu,
1664                         tmp->bus->number, tmp->devfn);
1665 }
1666
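/*
 * Map the physical range [hpa, hpa + size) at IO virtual address @iova,
 * one VTD_PAGE_SIZE page at a time.  The target PTEs must not already be
 * present; callers are expected to clear any stale mappings first.
 */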
1667 static int
1668 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1669                         u64 hpa, size_t size, int prot)
1670 {
1671         u64 start_pfn, end_pfn;
1672         struct dma_pte *pte;
1673         int index;
1674         int addr_width = agaw_to_width(domain->agaw);
1675
1676         hpa &= (((u64)1) << addr_width) - 1;
1677
1678         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1679                 return -EINVAL;
1680         iova &= PAGE_MASK;
1681         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1682         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1683         index = 0;
1684         while (start_pfn < end_pfn) {
1685                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1686                 if (!pte)
1687                         return -ENOMEM;
1688                 /* We don't need lock here, nobody else
1689                  * touches the iova range
1690                  */
1691                 BUG_ON(dma_pte_addr(pte));
1692                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1693                 dma_set_pte_prot(pte, prot);
1694                 if (prot & DMA_PTE_SNP)
1695                         dma_set_pte_snp(pte);
1696                 domain_flush_cache(domain, pte, sizeof(*pte));
1697                 start_pfn++;
1698                 index++;
1699         }
1700         return 0;
1701 }
1702
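/*
 * Clear the context entry for (bus, devfn) and do a global context-cache
 * and IOTLB invalidation on @iommu.
 */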
1703 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1704 {
1705         if (!iommu)
1706                 return;
1707
1708         clear_context_table(iommu, bus, devfn);
1709         iommu->flush.flush_context(iommu, 0, 0, 0,
1710                                            DMA_CCMD_GLOBAL_INVL, 0);
1711         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1712                                          DMA_TLB_GLOBAL_FLUSH, 0);
1713 }
1714
1715 static void domain_remove_dev_info(struct dmar_domain *domain)
1716 {
1717         struct device_domain_info *info;
1718         unsigned long flags;
1719         struct intel_iommu *iommu;
1720
1721         spin_lock_irqsave(&device_domain_lock, flags);
1722         while (!list_empty(&domain->devices)) {
1723                 info = list_entry(domain->devices.next,
1724                         struct device_domain_info, link);
1725                 list_del(&info->link);
1726                 list_del(&info->global);
1727                 if (info->dev)
1728                         info->dev->dev.archdata.iommu = NULL;
1729                 spin_unlock_irqrestore(&device_domain_lock, flags);
1730
1731                 iommu = device_to_iommu(info->bus, info->devfn);
1732                 iommu_detach_dev(iommu, info->bus, info->devfn);
1733                 free_devinfo_mem(info);
1734
1735                 spin_lock_irqsave(&device_domain_lock, flags);
1736         }
1737         spin_unlock_irqrestore(&device_domain_lock, flags);
1738 }
1739
1740 /*
1741  * find_domain
1742  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1743  */
1744 static struct dmar_domain *
1745 find_domain(struct pci_dev *pdev)
1746 {
1747         struct device_domain_info *info;
1748
1749         /* No lock here, assumes no domain exit in normal case */
1750         info = pdev->dev.archdata.iommu;
1751         if (info)
1752                 return info->domain;
1753         return NULL;
1754 }
1755
1756 /* domain is initialized */
1757 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1758 {
1759         struct dmar_domain *domain, *found = NULL;
1760         struct intel_iommu *iommu;
1761         struct dmar_drhd_unit *drhd;
1762         struct device_domain_info *info, *tmp;
1763         struct pci_dev *dev_tmp;
1764         unsigned long flags;
1765         int bus = 0, devfn = 0;
1766
1767         domain = find_domain(pdev);
1768         if (domain)
1769                 return domain;
1770
1771         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1772         if (dev_tmp) {
1773                 if (dev_tmp->is_pcie) {
1774                         bus = dev_tmp->subordinate->number;
1775                         devfn = 0;
1776                 } else {
1777                         bus = dev_tmp->bus->number;
1778                         devfn = dev_tmp->devfn;
1779                 }
1780                 spin_lock_irqsave(&device_domain_lock, flags);
1781                 list_for_each_entry(info, &device_domain_list, global) {
1782                         if (info->bus == bus && info->devfn == devfn) {
1783                                 found = info->domain;
1784                                 break;
1785                         }
1786                 }
1787                 spin_unlock_irqrestore(&device_domain_lock, flags);
1788                 /* the pcie-pci bridge already has a domain, use it */
1789                 if (found) {
1790                         domain = found;
1791                         goto found_domain;
1792                 }
1793         }
1794
1795         /* Allocate new domain for the device */
1796         drhd = dmar_find_matched_drhd_unit(pdev);
1797         if (!drhd) {
1798                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1799                         pci_name(pdev));
1800                 return NULL;
1801         }
1802         iommu = drhd->iommu;
1803
1804         domain = iommu_alloc_domain(iommu);
1805         if (!domain)
1806                 goto error;
1807
1808         if (domain_init(domain, gaw)) {
1809                 domain_exit(domain);
1810                 goto error;
1811         }
1812
1813         /* register pcie-to-pci device */
1814         if (dev_tmp) {
1815                 info = alloc_devinfo_mem();
1816                 if (!info) {
1817                         domain_exit(domain);
1818                         goto error;
1819                 }
1820                 info->bus = bus;
1821                 info->devfn = devfn;
1822                 info->dev = NULL;
1823                 info->domain = domain;
1824                 /* This domain is shared by devices under p2p bridge */
1825                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1826
1827                 /* the pcie-to-pci bridge already has a domain, use it */
1828                 found = NULL;
1829                 spin_lock_irqsave(&device_domain_lock, flags);
1830                 list_for_each_entry(tmp, &device_domain_list, global) {
1831                         if (tmp->bus == bus && tmp->devfn == devfn) {
1832                                 found = tmp->domain;
1833                                 break;
1834                         }
1835                 }
1836                 if (found) {
1837                         free_devinfo_mem(info);
1838                         domain_exit(domain);
1839                         domain = found;
1840                 } else {
1841                         list_add(&info->link, &domain->devices);
1842                         list_add(&info->global, &device_domain_list);
1843                 }
1844                 spin_unlock_irqrestore(&device_domain_lock, flags);
1845         }
1846
1847 found_domain:
1848         info = alloc_devinfo_mem();
1849         if (!info)
1850                 goto error;
1851         info->bus = pdev->bus->number;
1852         info->devfn = pdev->devfn;
1853         info->dev = pdev;
1854         info->domain = domain;
1855         spin_lock_irqsave(&device_domain_lock, flags);
1856         /* somebody else was faster and already set up the domain */
1857         found = find_domain(pdev);
1858         if (found != NULL) {
1859                 spin_unlock_irqrestore(&device_domain_lock, flags);
1860                 if (found != domain) {
1861                         domain_exit(domain);
1862                         domain = found;
1863                 }
1864                 free_devinfo_mem(info);
1865                 return domain;
1866         }
1867         list_add(&info->link, &domain->devices);
1868         list_add(&info->global, &device_domain_list);
1869         pdev->dev.archdata.iommu = info;
1870         spin_unlock_irqrestore(&device_domain_lock, flags);
1871         return domain;
1872 error:
1873         /* recheck it here, maybe others set it */
1874         return find_domain(pdev);
1875 }
1876
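/*
 * Set up a 1:1 (identity) mapping of [start, end) for @pdev: reserve the
 * matching IOVA range, clear any old PTEs, install read/write mappings and
 * program the context entry.  Used for RMRR regions and for the graphics
 * and ISA/floppy work-arounds below.
 */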
1877 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1878                                       unsigned long long start,
1879                                       unsigned long long end)
1880 {
1881         struct dmar_domain *domain;
1882         unsigned long size;
1883         unsigned long long base;
1884         int ret;
1885
1886         printk(KERN_INFO
1887                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1888                 pci_name(pdev), start, end);
1889         /* page table init */
1890         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1891         if (!domain)
1892                 return -ENOMEM;
1893
1894         /* The address might not be aligned */
1895         base = start & PAGE_MASK;
1896         size = end - base;
1897         size = PAGE_ALIGN(size);
1898         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1899                         IOVA_PFN(base + size) - 1)) {
1900                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1901                 ret = -ENOMEM;
1902                 goto error;
1903         }
1904
1905         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1906                 size, base, pci_name(pdev));
1907         /*
1908          * RMRR range might have overlap with physical memory range,
1909          * clear it first
1910          */
1911         dma_pte_clear_range(domain, base, base + size);
1912
1913         ret = domain_page_mapping(domain, base, base, size,
1914                 DMA_PTE_READ|DMA_PTE_WRITE);
1915         if (ret)
1916                 goto error;
1917
1918         /* context entry init */
1919         ret = domain_context_mapping(domain, pdev);
1920         if (!ret)
1921                 return 0;
1922 error:
1923         domain_exit(domain);
1924         return ret;
1925
1926 }
1927
1928 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1929         struct pci_dev *pdev)
1930 {
1931         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1932                 return 0;
1933         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1934                 rmrr->end_address + 1);
1935 }
1936
1937 #ifdef CONFIG_DMAR_GFX_WA
1938 struct iommu_prepare_data {
1939         struct pci_dev *pdev;
1940         int ret;
1941 };
1942
1943 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1944                                          unsigned long end_pfn, void *datax)
1945 {
1946         struct iommu_prepare_data *data;
1947
1948         data = (struct iommu_prepare_data *)datax;
1949
1950         data->ret = iommu_prepare_identity_map(data->pdev,
1951                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1952         return data->ret;
1953
1954 }
1955
1956 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1957 {
1958         int nid;
1959         struct iommu_prepare_data data;
1960
1961         data.pdev = pdev;
1962         data.ret = 0;
1963
1964         for_each_online_node(nid) {
1965                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1966                 if (data.ret)
1967                         return data.ret;
1968         }
1969         return data.ret;
1970 }
1971
1972 static void __init iommu_prepare_gfx_mapping(void)
1973 {
1974         struct pci_dev *pdev = NULL;
1975         int ret;
1976
1977         for_each_pci_dev(pdev) {
1978                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1979                                 !IS_GFX_DEVICE(pdev))
1980                         continue;
1981                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1982                         pci_name(pdev));
1983                 ret = iommu_prepare_with_active_regions(pdev);
1984                 if (ret)
1985                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1986         }
1987 }
1988 #else /* !CONFIG_DMAR_GFX_WA */
1989 static inline void iommu_prepare_gfx_mapping(void)
1990 {
1991         return;
1992 }
1993 #endif
1994
1995 #ifdef CONFIG_DMAR_FLOPPY_WA
1996 static inline void iommu_prepare_isa(void)
1997 {
1998         struct pci_dev *pdev;
1999         int ret;
2000
2001         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2002         if (!pdev)
2003                 return;
2004
2005         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
2006         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2007
2008         if (ret)
2009                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
2010                         "floppy might not work\n");
2011
2012 }
2013 #else
2014 static inline void iommu_prepare_isa(void)
2015 {
2016         return;
2017 }
2018 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2019
2020 static int __init init_dmars(void)
2021 {
2022         struct dmar_drhd_unit *drhd;
2023         struct dmar_rmrr_unit *rmrr;
2024         struct pci_dev *pdev;
2025         struct intel_iommu *iommu;
2026         int i, ret, unit = 0;
2027
2028         /*
2029          * for each drhd
2030          *    allocate root
2031          *    initialize and program root entry to not present
2032          * endfor
2033          */
2034         for_each_drhd_unit(drhd) {
2035                 g_num_of_iommus++;
2036                 /*
2037                  * no lock needed: this is only incremented in the
2038                  * single-threaded kernel __init code path; all other
2039                  * accesses are read only
2040                  */
2041         }
2042
2043         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2044                         GFP_KERNEL);
2045         if (!g_iommus) {
2046                 printk(KERN_ERR "Allocating global iommu array failed\n");
2047                 ret = -ENOMEM;
2048                 goto error;
2049         }
2050
2051         deferred_flush = kzalloc(g_num_of_iommus *
2052                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2053         if (!deferred_flush) {
2054                 kfree(g_iommus);
2055                 ret = -ENOMEM;
2056                 goto error;
2057         }
2058
2059         for_each_drhd_unit(drhd) {
2060                 if (drhd->ignored)
2061                         continue;
2062
2063                 iommu = drhd->iommu;
2064                 g_iommus[iommu->seq_id] = iommu;
2065
2066                 ret = iommu_init_domains(iommu);
2067                 if (ret)
2068                         goto error;
2069
2070                 /*
2071                  * TBD:
2072                  * we could share the same root & context tables
2073                  * among all IOMMUs. Need to split it later.
2074                  */
2075                 ret = iommu_alloc_root_entry(iommu);
2076                 if (ret) {
2077                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2078                         goto error;
2079                 }
2080         }
2081
2082         for_each_drhd_unit(drhd) {
2083                 if (drhd->ignored)
2084                         continue;
2085
2086                 iommu = drhd->iommu;
2087                 if (dmar_enable_qi(iommu)) {
2088                         /*
2089                          * Queued Invalidate not enabled, use Register Based
2090                          * Invalidate
2091                          */
2092                         iommu->flush.flush_context = __iommu_flush_context;
2093                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2094                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2095                                "invalidation\n",
2096                                (unsigned long long)drhd->reg_base_addr);
2097                 } else {
2098                         iommu->flush.flush_context = qi_flush_context;
2099                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2100                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2101                                "invalidation\n",
2102                                (unsigned long long)drhd->reg_base_addr);
2103                 }
2104         }
2105
2106         /*
2107          * For each rmrr
2108          *   for each dev attached to rmrr
2109          *   do
2110          *     locate drhd for dev, alloc domain for dev
2111          *     allocate free domain
2112          *     allocate page table entries for rmrr
2113          *     if context not allocated for bus
2114          *           allocate and init context
2115          *           set present in root table for this bus
2116          *     init context with domain, translation etc
2117          *    endfor
2118          * endfor
2119          */
2120         for_each_rmrr_units(rmrr) {
2121                 for (i = 0; i < rmrr->devices_cnt; i++) {
2122                         pdev = rmrr->devices[i];
2123                         /* some BIOSes list non-existent devices in the DMAR table */
2124                         if (!pdev)
2125                                 continue;
2126                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2127                         if (ret)
2128                                 printk(KERN_ERR
2129                                  "IOMMU: mapping reserved region failed\n");
2130                 }
2131         }
2132
2133         iommu_prepare_gfx_mapping();
2134
2135         iommu_prepare_isa();
2136
2137         /*
2138          * for each drhd
2139          *   enable fault log
2140          *   global invalidate context cache
2141          *   global invalidate iotlb
2142          *   enable translation
2143          */
2144         for_each_drhd_unit(drhd) {
2145                 if (drhd->ignored)
2146                         continue;
2147                 iommu = drhd->iommu;
2148                 sprintf(iommu->name, "dmar%d", unit++);
2149
2150                 iommu_flush_write_buffer(iommu);
2151
2152                 ret = dmar_set_interrupt(iommu);
2153                 if (ret)
2154                         goto error;
2155
2156                 iommu_set_root_entry(iommu);
2157
2158                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2159                                            0);
2160                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2161                                          0);
2162                 iommu_disable_protect_mem_regions(iommu);
2163
2164                 ret = iommu_enable_translation(iommu);
2165                 if (ret)
2166                         goto error;
2167         }
2168
2169         return 0;
2170 error:
2171         for_each_drhd_unit(drhd) {
2172                 if (drhd->ignored)
2173                         continue;
2174                 iommu = drhd->iommu;
2175                 free_iommu(iommu);
2176         }
2177         kfree(g_iommus);
2178         return ret;
2179 }
2180
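/*
 * Number of bytes that must be mapped to cover [host_addr, host_addr + size):
 * the offset into the first page plus the size, rounded up to a whole page.
 * For example, with 4K pages aligned_size(0x1ff0, 0x20) returns 0x2000,
 * since the 0x20 bytes straddle a page boundary.
 */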
2181 static inline u64 aligned_size(u64 host_addr, size_t size)
2182 {
2183         u64 addr;
2184         addr = (host_addr & (~PAGE_MASK)) + size;
2185         return PAGE_ALIGN(addr);
2186 }
2187
2188 struct iova *
2189 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2190 {
2191         struct iova *piova;
2192
2193         /* Make sure it's in range */
2194         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2195         if (!size || (IOVA_START_ADDR + size > end))
2196                 return NULL;
2197
2198         piova = alloc_iova(&domain->iovad,
2199                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2200         return piova;
2201 }
2202
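/*
 * Allocate an IOVA range of @size bytes for @dev.  Unless the device can
 * only address 32 bits anyway (or dmar_forcedac is set), an address below
 * 4GB is tried first and the full @dma_mask range is only used as a
 * fallback.
 */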
2203 static struct iova *
2204 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2205                    size_t size, u64 dma_mask)
2206 {
2207         struct pci_dev *pdev = to_pci_dev(dev);
2208         struct iova *iova = NULL;
2209
2210         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2211                 iova = iommu_alloc_iova(domain, size, dma_mask);
2212         else {
2213                 /*
2214                  * First try to allocate an io virtual address in
2215                  * DMA_32BIT_MASK and if that fails then try allocating
2216                  * from higher range
2217                  */
2218                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2219                 if (!iova)
2220                         iova = iommu_alloc_iova(domain, size, dma_mask);
2221         }
2222
2223         if (!iova) {
2224                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2225                 return NULL;
2226         }
2227
2228         return iova;
2229 }
2230
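/*
 * Return the DMA remapping domain for @pdev, allocating the domain and
 * programming its context entry on first use.  Called from the map paths.
 */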
2231 static struct dmar_domain *
2232 get_valid_domain_for_dev(struct pci_dev *pdev)
2233 {
2234         struct dmar_domain *domain;
2235         int ret;
2236
2237         domain = get_domain_for_dev(pdev,
2238                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2239         if (!domain) {
2240                 printk(KERN_ERR
2241                         "Allocating domain for %s failed\n", pci_name(pdev));
2242                 return NULL;
2243         }
2244
2245         /* make sure context mapping is ok */
2246         if (unlikely(!domain_context_mapped(pdev))) {
2247                 ret = domain_context_mapping(domain, pdev);
2248                 if (ret) {
2249                         printk(KERN_ERR
2250                                 "Domain context map for %s failed\n",
2251                                 pci_name(pdev));
2252                         return NULL;
2253                 }
2254         }
2255
2256         return domain;
2257 }
2258
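/*
 * Common single-buffer map path: allocate an IOVA, install page table
 * entries with the read/write permissions implied by @dir, and flush the
 * IOTLB or the write buffer so the new (previously non-present) mapping
 * becomes visible.  Returns 0 on failure.
 */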
2259 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2260                                      size_t size, int dir, u64 dma_mask)
2261 {
2262         struct pci_dev *pdev = to_pci_dev(hwdev);
2263         struct dmar_domain *domain;
2264         phys_addr_t start_paddr;
2265         struct iova *iova;
2266         int prot = 0;
2267         int ret;
2268         struct intel_iommu *iommu;
2269
2270         BUG_ON(dir == DMA_NONE);
2271         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2272                 return paddr;
2273
2274         domain = get_valid_domain_for_dev(pdev);
2275         if (!domain)
2276                 return 0;
2277
2278         iommu = domain_get_iommu(domain);
2279         size = aligned_size((u64)paddr, size);
2280
2281         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2282         if (!iova)
2283                 goto error;
2284
2285         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2286
2287         /*
2288          * Check if DMAR supports zero-length reads on write only
2289          * mappings..
2290          */
2291         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2292                         !cap_zlr(iommu->cap))
2293                 prot |= DMA_PTE_READ;
2294         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2295                 prot |= DMA_PTE_WRITE;
2296         /*
2297          * paddr through (paddr + size) might cover only part of a page, but we
2298          * should map the whole page.  Note: if two parts of one page are
2299          * mapped separately, we might end up with two guest addresses mapping
2300          * to the same host paddr, but this is not a big problem
2301          */
2302         ret = domain_page_mapping(domain, start_paddr,
2303                 ((u64)paddr) & PAGE_MASK, size, prot);
2304         if (ret)
2305                 goto error;
2306
2307         /* it's a non-present to present mapping */
2308         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2309                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2310         if (ret)
2311                 iommu_flush_write_buffer(iommu);
2312
2313         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2314
2315 error:
2316         if (iova)
2317                 __free_iova(&domain->iovad, iova);
2318         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2319                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2320         return 0;
2321 }
2322
2323 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2324                             size_t size, int dir)
2325 {
2326         return __intel_map_single(hwdev, paddr, size, dir,
2327                                   to_pci_dev(hwdev)->dma_mask);
2328 }
2329
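/*
 * Flush the IOTLB of every IOMMU that has deferred unmaps queued and free
 * the corresponding IOVAs.  Called with async_umap_flush_lock held.
 */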
2330 static void flush_unmaps(void)
2331 {
2332         int i, j;
2333
2334         timer_on = 0;
2335
2336         /* just flush them all */
2337         for (i = 0; i < g_num_of_iommus; i++) {
2338                 struct intel_iommu *iommu = g_iommus[i];
2339                 if (!iommu)
2340                         continue;
2341
2342                 if (deferred_flush[i].next) {
2343                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2344                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2345                         for (j = 0; j < deferred_flush[i].next; j++) {
2346                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2347                                                 deferred_flush[i].iova[j]);
2348                         }
2349                         deferred_flush[i].next = 0;
2350                 }
2351         }
2352
2353         list_size = 0;
2354 }
2355
2356 static void flush_unmaps_timeout(unsigned long data)
2357 {
2358         unsigned long flags;
2359
2360         spin_lock_irqsave(&async_umap_flush_lock, flags);
2361         flush_unmaps();
2362         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2363 }
2364
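/*
 * Queue an IOVA for deferred freeing in the per-IOMMU deferred_flush table.
 * The batch is drained either by the 10ms unmap_timer or immediately once
 * HIGH_WATER_MARK entries have accumulated.
 */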
2365 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2366 {
2367         unsigned long flags;
2368         int next, iommu_id;
2369         struct intel_iommu *iommu;
2370
2371         spin_lock_irqsave(&async_umap_flush_lock, flags);
2372         if (list_size == HIGH_WATER_MARK)
2373                 flush_unmaps();
2374
2375         iommu = domain_get_iommu(dom);
2376         iommu_id = iommu->seq_id;
2377
2378         next = deferred_flush[iommu_id].next;
2379         deferred_flush[iommu_id].domain[next] = dom;
2380         deferred_flush[iommu_id].iova[next] = iova;
2381         deferred_flush[iommu_id].next++;
2382
2383         if (!timer_on) {
2384                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2385                 timer_on = 1;
2386         }
2387         list_size++;
2388         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2389 }
2390
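/*
 * Tear down a single mapping: clear the PTEs and free the page tables for
 * the IOVA range, then either flush the IOTLB and release the IOVA right
 * away (intel_iommu_strict) or defer the release via add_unmap().
 */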
2391 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2392                         int dir)
2393 {
2394         struct pci_dev *pdev = to_pci_dev(dev);
2395         struct dmar_domain *domain;
2396         unsigned long start_addr;
2397         struct iova *iova;
2398         struct intel_iommu *iommu;
2399
2400         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2401                 return;
2402         domain = find_domain(pdev);
2403         BUG_ON(!domain);
2404
2405         iommu = domain_get_iommu(domain);
2406
2407         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2408         if (!iova)
2409                 return;
2410
2411         start_addr = iova->pfn_lo << PAGE_SHIFT;
2412         size = aligned_size((u64)dev_addr, size);
2413
2414         pr_debug("Device %s unmapping: %zx@%llx\n",
2415                 pci_name(pdev), size, (unsigned long long)start_addr);
2416
2417         /*  clear the whole page */
2418         dma_pte_clear_range(domain, start_addr, start_addr + size);
2419         /* free page tables */
2420         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2421         if (intel_iommu_strict) {
2422                 if (iommu_flush_iotlb_psi(iommu,
2423                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2424                         iommu_flush_write_buffer(iommu);
2425                 /* free iova */
2426                 __free_iova(&domain->iovad, iova);
2427         } else {
2428                 add_unmap(domain, iova);
2429                 /*
2430                  * queue up the release of the unmap to save roughly 1/6th of
2431                  * the CPU time used up by the iotlb flush operation...
2432                  */
2433         }
2434 }
2435
2436 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2437                            dma_addr_t *dma_handle, gfp_t flags)
2438 {
2439         void *vaddr;
2440         int order;
2441
2442         size = PAGE_ALIGN(size);
2443         order = get_order(size);
2444         flags &= ~(GFP_DMA | GFP_DMA32);
2445
2446         vaddr = (void *)__get_free_pages(flags, order);
2447         if (!vaddr)
2448                 return NULL;
2449         memset(vaddr, 0, size);
2450
2451         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2452                                          DMA_BIDIRECTIONAL,
2453                                          hwdev->coherent_dma_mask);
2454         if (*dma_handle)
2455                 return vaddr;
2456         free_pages((unsigned long)vaddr, order);
2457         return NULL;
2458 }
2459
2460 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2461                          dma_addr_t dma_handle)
2462 {
2463         int order;
2464
2465         size = PAGE_ALIGN(size);
2466         order = get_order(size);
2467
2468         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2469         free_pages((unsigned long)vaddr, order);
2470 }
2471
2472 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2473                     int nelems, int dir)
2474 {
2475         int i;
2476         struct pci_dev *pdev = to_pci_dev(hwdev);
2477         struct dmar_domain *domain;
2478         unsigned long start_addr;
2479         struct iova *iova;
2480         size_t size = 0;
2481         phys_addr_t addr;
2482         struct scatterlist *sg;
2483         struct intel_iommu *iommu;
2484
2485         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2486                 return;
2487
2488         domain = find_domain(pdev);
2489         BUG_ON(!domain);
2490
2491         iommu = domain_get_iommu(domain);
2492
2493         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2494         if (!iova)
2495                 return;
2496         for_each_sg(sglist, sg, nelems, i) {
2497                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2498                 size += aligned_size((u64)addr, sg->length);
2499         }
2500
2501         start_addr = iova->pfn_lo << PAGE_SHIFT;
2502
2503         /*  clear the whole page */
2504         dma_pte_clear_range(domain, start_addr, start_addr + size);
2505         /* free page tables */
2506         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2507
2508         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2509                         size >> VTD_PAGE_SHIFT, 0))
2510                 iommu_flush_write_buffer(iommu);
2511
2512         /* free iova */
2513         __free_iova(&domain->iovad, iova);
2514 }
2515
2516 static int intel_nontranslate_map_sg(struct device *hddev,
2517         struct scatterlist *sglist, int nelems, int dir)
2518 {
2519         int i;
2520         struct scatterlist *sg;
2521
2522         for_each_sg(sglist, sg, nelems, i) {
2523                 BUG_ON(!sg_page(sg));
2524                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2525                 sg->dma_length = sg->length;
2526         }
2527         return nelems;
2528 }
2529
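/*
 * Map a scatterlist into one contiguous IOVA range: size up the whole list,
 * allocate a single IOVA, then map each segment at an increasing offset.
 * If any segment fails, everything mapped so far is torn down and 0 is
 * returned.
 */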
2530 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2531                  int dir)
2532 {
2533         phys_addr_t addr;
2534         int i;
2535         struct pci_dev *pdev = to_pci_dev(hwdev);
2536         struct dmar_domain *domain;
2537         size_t size = 0;
2538         int prot = 0;
2539         size_t offset = 0;
2540         struct iova *iova = NULL;
2541         int ret;
2542         struct scatterlist *sg;
2543         unsigned long start_addr;
2544         struct intel_iommu *iommu;
2545
2546         BUG_ON(dir == DMA_NONE);
2547         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2548                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2549
2550         domain = get_valid_domain_for_dev(pdev);
2551         if (!domain)
2552                 return 0;
2553
2554         iommu = domain_get_iommu(domain);
2555
2556         for_each_sg(sglist, sg, nelems, i) {
2557                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2558                 size += aligned_size((u64)addr, sg->length);
2559         }
2560
2561         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2562         if (!iova) {
2563                 sglist->dma_length = 0;
2564                 return 0;
2565         }
2566
2567         /*
2568          * Check if DMAR supports zero-length reads on write only
2569          * mappings..
2570          */
2571         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2572                         !cap_zlr(iommu->cap))
2573                 prot |= DMA_PTE_READ;
2574         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2575                 prot |= DMA_PTE_WRITE;
2576
2577         start_addr = iova->pfn_lo << PAGE_SHIFT;
2578         offset = 0;
2579         for_each_sg(sglist, sg, nelems, i) {
2580                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2581                 size = aligned_size((u64)addr, sg->length);
2582                 ret = domain_page_mapping(domain, start_addr + offset,
2583                         ((u64)addr) & PAGE_MASK,
2584                         size, prot);
2585                 if (ret) {
2586                         /*  clear the page */
2587                         dma_pte_clear_range(domain, start_addr,
2588                                   start_addr + offset);
2589                         /* free page tables */
2590                         dma_pte_free_pagetable(domain, start_addr,
2591                                   start_addr + offset);
2592                         /* free iova */
2593                         __free_iova(&domain->iovad, iova);
2594                         return 0;
2595                 }
2596                 sg->dma_address = start_addr + offset +
2597                                 ((u64)addr & (~PAGE_MASK));
2598                 sg->dma_length = sg->length;
2599                 offset += size;
2600         }
2601
2602         /* it's a non-present to present mapping */
2603         if (iommu_flush_iotlb_psi(iommu, domain->id,
2604                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2605                 iommu_flush_write_buffer(iommu);
2606         return nelems;
2607 }
2608
2609 static struct dma_mapping_ops intel_dma_ops = {
2610         .alloc_coherent = intel_alloc_coherent,
2611         .free_coherent = intel_free_coherent,
2612         .map_single = intel_map_single,
2613         .unmap_single = intel_unmap_single,
2614         .map_sg = intel_map_sg,
2615         .unmap_sg = intel_unmap_sg,
2616 };
2617
2618 static inline int iommu_domain_cache_init(void)
2619 {
2620         int ret = 0;
2621
2622         iommu_domain_cache = kmem_cache_create("iommu_domain",
2623                                          sizeof(struct dmar_domain),
2624                                          0,
2625                                          SLAB_HWCACHE_ALIGN,
2627                                          NULL);
2628         if (!iommu_domain_cache) {
2629                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2630                 ret = -ENOMEM;
2631         }
2632
2633         return ret;
2634 }
2635
2636 static inline int iommu_devinfo_cache_init(void)
2637 {
2638         int ret = 0;
2639
2640         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2641                                          sizeof(struct device_domain_info),
2642                                          0,
2643                                          SLAB_HWCACHE_ALIGN,
2644                                          NULL);
2645         if (!iommu_devinfo_cache) {
2646                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2647                 ret = -ENOMEM;
2648         }
2649
2650         return ret;
2651 }
2652
2653 static inline int iommu_iova_cache_init(void)
2654 {
2655         int ret = 0;
2656
2657         iommu_iova_cache = kmem_cache_create("iommu_iova",
2658                                          sizeof(struct iova),
2659                                          0,
2660                                          SLAB_HWCACHE_ALIGN,
2661                                          NULL);
2662         if (!iommu_iova_cache) {
2663                 printk(KERN_ERR "Couldn't create iova cache\n");
2664                 ret = -ENOMEM;
2665         }
2666
2667         return ret;
2668 }
2669
2670 static int __init iommu_init_mempool(void)
2671 {
2672         int ret;
2673         ret = iommu_iova_cache_init();
2674         if (ret)
2675                 return ret;
2676
2677         ret = iommu_domain_cache_init();
2678         if (ret)
2679                 goto domain_error;
2680
2681         ret = iommu_devinfo_cache_init();
2682         if (!ret)
2683                 return ret;
2684
2685         kmem_cache_destroy(iommu_domain_cache);
2686 domain_error:
2687         kmem_cache_destroy(iommu_iova_cache);
2688
2689         return -ENOMEM;
2690 }
2691
2692 static void __init iommu_exit_mempool(void)
2693 {
2694         kmem_cache_destroy(iommu_devinfo_cache);
2695         kmem_cache_destroy(iommu_domain_cache);
2696         kmem_cache_destroy(iommu_iova_cache);
2697
2698 }
2699
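/*
 * Mark DRHD units that have no PCI devices attached as ignored, and, when
 * gfx mapping is disabled, units that only cover graphics devices.  When a
 * unit is ignored because it is gfx-only, its devices are tagged with
 * DUMMY_DEVICE_DOMAIN_INFO so the DMA ops pass them through untranslated.
 */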
2700 static void __init init_no_remapping_devices(void)
2701 {
2702         struct dmar_drhd_unit *drhd;
2703
2704         for_each_drhd_unit(drhd) {
2705                 if (!drhd->include_all) {
2706                         int i;
2707                         for (i = 0; i < drhd->devices_cnt; i++)
2708                                 if (drhd->devices[i] != NULL)
2709                                         break;
2710                         /* ignore DMAR unit if no pci devices exist */
2711                         if (i == drhd->devices_cnt)
2712                                 drhd->ignored = 1;
2713                 }
2714         }
2715
2716         if (dmar_map_gfx)
2717                 return;
2718
2719         for_each_drhd_unit(drhd) {
2720                 int i;
2721                 if (drhd->ignored || drhd->include_all)
2722                         continue;
2723
2724                 for (i = 0; i < drhd->devices_cnt; i++)
2725                         if (drhd->devices[i] &&
2726                                 !IS_GFX_DEVICE(drhd->devices[i]))
2727                                 break;
2728
2729                 if (i < drhd->devices_cnt)
2730                         continue;
2731
2732                 /* bypass IOMMU if it is just for gfx devices */
2733                 drhd->ignored = 1;
2734                 for (i = 0; i < drhd->devices_cnt; i++) {
2735                         if (!drhd->devices[i])
2736                                 continue;
2737                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2738                 }
2739         }
2740 }
2741
2742 int __init intel_iommu_init(void)
2743 {
2744         int ret = 0;
2745
2746         if (dmar_table_init())
2747                 return  -ENODEV;
2748
2749         if (dmar_dev_scope_init())
2750                 return  -ENODEV;
2751
2752         /*
2753          * Check the need for DMA-remapping initialization now.
2754          * Above initialization will also be used by Interrupt-remapping.
2755          */
2756         if (no_iommu || swiotlb || dmar_disabled)
2757                 return -ENODEV;
2758
2759         iommu_init_mempool();
2760         dmar_init_reserved_ranges();
2761
2762         init_no_remapping_devices();
2763
2764         ret = init_dmars();
2765         if (ret) {
2766                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2767                 put_iova_domain(&reserved_iova_list);
2768                 iommu_exit_mempool();
2769                 return ret;
2770         }
2771         printk(KERN_INFO
2772         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2773
2774         init_timer(&unmap_timer);
2775         force_iommu = 1;
2776         dma_ops = &intel_dma_ops;
2777
2778         register_iommu(&intel_iommu_ops);
2779
2780         return 0;
2781 }
2782
2783 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2784                                   struct pci_dev *pdev)
2785 {
2786         struct device_domain_info *info;
2787         unsigned long flags;
2788
2789         info = alloc_devinfo_mem();
2790         if (!info)
2791                 return -ENOMEM;
2792
2793         info->bus = pdev->bus->number;
2794         info->devfn = pdev->devfn;
2795         info->dev = pdev;
2796         info->domain = domain;
2797
2798         spin_lock_irqsave(&device_domain_lock, flags);
2799         list_add(&info->link, &domain->devices);
2800         list_add(&info->global, &device_domain_list);
2801         pdev->dev.archdata.iommu = info;
2802         spin_unlock_irqrestore(&device_domain_lock, flags);
2803
2804         return 0;
2805 }
2806
2807 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
2808                                            struct pci_dev *pdev)
2809 {
2810         struct pci_dev *tmp, *parent;
2811
2812         if (!iommu || !pdev)
2813                 return;
2814
2815         /* dependent device detach */
2816         tmp = pci_find_upstream_pcie_bridge(pdev);
2817         /* Secondary interface's bus number and devfn 0 */
2818         if (tmp) {
2819                 parent = pdev->bus->self;
2820                 while (parent != tmp) {
2821                         iommu_detach_dev(iommu, parent->bus->number,
2822                                 parent->devfn);
2823                         parent = parent->bus->self;
2824                 }
2825                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
2826                         iommu_detach_dev(iommu,
2827                                 tmp->subordinate->number, 0);
2828                 else /* this is a legacy PCI bridge */
2829                         iommu_detach_dev(iommu,
2830                                 tmp->bus->number, tmp->devfn);
2831         }
2832 }
2833
2834 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2835                                           struct pci_dev *pdev)
2836 {
2837         struct device_domain_info *info;
2838         struct intel_iommu *iommu;
2839         unsigned long flags;
2840         int found = 0;
2841         struct list_head *entry, *tmp;
2842
2843         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2844         if (!iommu)
2845                 return;
2846
2847         spin_lock_irqsave(&device_domain_lock, flags);
2848         list_for_each_safe(entry, tmp, &domain->devices) {
2849                 info = list_entry(entry, struct device_domain_info, link);
2850                 if (info->bus == pdev->bus->number &&
2851                     info->devfn == pdev->devfn) {
2852                         list_del(&info->link);
2853                         list_del(&info->global);
2854                         if (info->dev)
2855                                 info->dev->dev.archdata.iommu = NULL;
2856                         spin_unlock_irqrestore(&device_domain_lock, flags);
2857
2858                         iommu_detach_dev(iommu, info->bus, info->devfn);
2859                         iommu_detach_dependent_devices(iommu, pdev);
2860                         free_devinfo_mem(info);
2861
2862                         spin_lock_irqsave(&device_domain_lock, flags);
2863
2864                         if (found)
2865                                 break;
2866                         else
2867                                 continue;
2868                 }
2869
2870                 /* if there are no other devices under the same iommu
2871                  * owned by this domain, clear this iommu in iommu_bmp,
2872                  * update the iommu count and coherency
2873                  */
2874                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2875                         found = 1;
2876         }
2877
2878         if (found == 0) {
2879                 unsigned long tmp_flags;
2880                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2881                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2882                 domain->iommu_count--;
2883                 domain_update_iommu_cap(domain);
2884                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2885         }
2886
2887         spin_unlock_irqrestore(&device_domain_lock, flags);
2888 }
2889
2890 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2891 {
2892         struct device_domain_info *info;
2893         struct intel_iommu *iommu;
2894         unsigned long flags1, flags2;
2895
2896         spin_lock_irqsave(&device_domain_lock, flags1);
2897         while (!list_empty(&domain->devices)) {
2898                 info = list_entry(domain->devices.next,
2899                         struct device_domain_info, link);
2900                 list_del(&info->link);
2901                 list_del(&info->global);
2902                 if (info->dev)
2903                         info->dev->dev.archdata.iommu = NULL;
2904
2905                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2906
2907                 iommu = device_to_iommu(info->bus, info->devfn);
2908                 iommu_detach_dev(iommu, info->bus, info->devfn);
2909                 iommu_detach_dependent_devices(iommu, info->dev);
2910
2911                 /* clear this iommu in iommu_bmp, update iommu count
2912                  * and capabilities
2913                  */
2914                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2915                 if (test_and_clear_bit(iommu->seq_id,
2916                                        &domain->iommu_bmp)) {
2917                         domain->iommu_count--;
2918                         domain_update_iommu_cap(domain);
2919                 }
2920                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2921
2922                 free_devinfo_mem(info);
2923                 spin_lock_irqsave(&device_domain_lock, flags1);
2924         }
2925         spin_unlock_irqrestore(&device_domain_lock, flags1);
2926 }
2927
2928 /* domain id for virtual machines; it won't be set in the context entry */
2929 static unsigned long vm_domid;
2930
2931 static int vm_domain_min_agaw(struct dmar_domain *domain)
2932 {
2933         int i;
2934         int min_agaw = domain->agaw;
2935
2936         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2937         for (; i < g_num_of_iommus; ) {
2938                 if (min_agaw > g_iommus[i]->agaw)
2939                         min_agaw = g_iommus[i]->agaw;
2940
2941                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2942         }
2943
2944         return min_agaw;
2945 }
2946
2947 static struct dmar_domain *iommu_alloc_vm_domain(void)
2948 {
2949         struct dmar_domain *domain;
2950
2951         domain = alloc_domain_mem();
2952         if (!domain)
2953                 return NULL;
2954
2955         domain->id = vm_domid++;
2956         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2957         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2958
2959         return domain;
2960 }
2961
2962 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2963 {
2964         int adjust_width;
2965
2966         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2967         spin_lock_init(&domain->mapping_lock);
2968         spin_lock_init(&domain->iommu_lock);
2969
2970         domain_reserve_special_ranges(domain);
2971
2972         /* calculate AGAW */
2973         domain->gaw = guest_width;
2974         adjust_width = guestwidth_to_adjustwidth(guest_width);
2975         domain->agaw = width_to_agaw(adjust_width);
2976
2977         INIT_LIST_HEAD(&domain->devices);
2978
2979         domain->iommu_count = 0;
2980         domain->iommu_coherency = 0;
2981         domain->max_addr = 0;
2982
2983         /* always allocate the top pgd */
2984         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2985         if (!domain->pgd)
2986                 return -ENOMEM;
2987         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2988         return 0;
2989 }
2990
2991 static void iommu_free_vm_domain(struct dmar_domain *domain)
2992 {
2993         unsigned long flags;
2994         struct dmar_drhd_unit *drhd;
2995         struct intel_iommu *iommu;
2996         unsigned long i;
2997         unsigned long ndomains;
2998
2999         for_each_drhd_unit(drhd) {
3000                 if (drhd->ignored)
3001                         continue;
3002                 iommu = drhd->iommu;
3003
3004                 ndomains = cap_ndoms(iommu->cap);
3005                 i = find_first_bit(iommu->domain_ids, ndomains);
3006                 for (; i < ndomains; ) {
3007                         if (iommu->domains[i] == domain) {
3008                                 spin_lock_irqsave(&iommu->lock, flags);
3009                                 clear_bit(i, iommu->domain_ids);
3010                                 iommu->domains[i] = NULL;
3011                                 spin_unlock_irqrestore(&iommu->lock, flags);
3012                                 break;
3013                         }
3014                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3015                 }
3016         }
3017 }
3018
3019 static void vm_domain_exit(struct dmar_domain *domain)
3020 {
3021         u64 end;
3022
3023         /* Domain 0 is reserved, so don't process it */
3024         if (!domain)
3025                 return;
3026
3027         vm_domain_remove_all_dev_info(domain);
3028         /* destroy iovas */
3029         put_iova_domain(&domain->iovad);
3030         end = DOMAIN_MAX_ADDR(domain->gaw);
3031         end = end & (~VTD_PAGE_MASK);
3032
3033         /* clear ptes */
3034         dma_pte_clear_range(domain, 0, end);
3035
3036         /* free page tables */
3037         dma_pte_free_pagetable(domain, 0, end);
3038
3039         iommu_free_vm_domain(domain);
3040         free_domain_mem(domain);
3041 }
3042
3043 static int intel_iommu_domain_init(struct iommu_domain *domain)
3044 {
3045         struct dmar_domain *dmar_domain;
3046
3047         dmar_domain = iommu_alloc_vm_domain();
3048         if (!dmar_domain) {
3049                 printk(KERN_ERR
3050                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3051                 return -ENOMEM;
3052         }
3053         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3054                 printk(KERN_ERR
3055                         "intel_iommu_domain_init() failed\n");
3056                 vm_domain_exit(dmar_domain);
3057                 return -ENOMEM;
3058         }
3059         domain->priv = dmar_domain;
3060
3061         return 0;
3062 }
3063
3064 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3065 {
3066         struct dmar_domain *dmar_domain = domain->priv;
3067
3068         domain->priv = NULL;
3069         vm_domain_exit(dmar_domain);
3070 }
3071
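/*
 * iommu_ops attach callback: detach @dev from any previous domain, check
 * that this IOMMU's agaw can address everything already mapped in the VM
 * domain, then context-map the device and add it to the domain's device
 * list.
 */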
3072 static int intel_iommu_attach_device(struct iommu_domain *domain,
3073                                      struct device *dev)
3074 {
3075         struct dmar_domain *dmar_domain = domain->priv;
3076         struct pci_dev *pdev = to_pci_dev(dev);
3077         struct intel_iommu *iommu;
3078         int addr_width;
3079         u64 end;
3080         int ret;
3081
3082         /* normally pdev is not mapped */
3083         if (unlikely(domain_context_mapped(pdev))) {
3084                 struct dmar_domain *old_domain;
3085
3086                 old_domain = find_domain(pdev);
3087                 if (old_domain) {
3088                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3089                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3090                         else
3091                                 domain_remove_dev_info(old_domain);
3092                 }
3093         }
3094
3095         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3096         if (!iommu)
3097                 return -ENODEV;
3098
3099         /* check if this iommu agaw is sufficient for max mapped address */
3100         addr_width = agaw_to_width(iommu->agaw);
3101         end = DOMAIN_MAX_ADDR(addr_width);
3102         end = end & VTD_PAGE_MASK;
3103         if (end < dmar_domain->max_addr) {
3104                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3105                        "sufficient for the mapped address (%llx)\n",
3106                        __func__, iommu->agaw, dmar_domain->max_addr);
3107                 return -EFAULT;
3108         }
3109
3110         ret = domain_context_mapping(dmar_domain, pdev);
3111         if (ret)
3112                 return ret;
3113
3114         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3115         return ret;
3116 }
3117
3118 static void intel_iommu_detach_device(struct iommu_domain *domain,
3119                                       struct device *dev)
3120 {
3121         struct dmar_domain *dmar_domain = domain->priv;
3122         struct pci_dev *pdev = to_pci_dev(dev);
3123
3124         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3125 }
3126
3127 static int intel_iommu_map_range(struct iommu_domain *domain,
3128                                  unsigned long iova, phys_addr_t hpa,
3129                                  size_t size, int iommu_prot)
3130 {
3131         struct dmar_domain *dmar_domain = domain->priv;
3132         u64 max_addr;
3133         int addr_width;
3134         int prot = 0;
3135         int ret;
3136
3137         if (iommu_prot & IOMMU_READ)
3138                 prot |= DMA_PTE_READ;
3139         if (iommu_prot & IOMMU_WRITE)
3140                 prot |= DMA_PTE_WRITE;
3141         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3142                 prot |= DMA_PTE_SNP;
3143
3144         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3145         if (dmar_domain->max_addr < max_addr) {
3146                 int min_agaw;
3147                 u64 end;
3148
3149                 /* check if minimum agaw is sufficient for mapped address */
3150                 min_agaw = vm_domain_min_agaw(dmar_domain);
3151                 addr_width = agaw_to_width(min_agaw);
3152                 end = DOMAIN_MAX_ADDR(addr_width);
3153                 end = end & VTD_PAGE_MASK;
3154                 if (end < max_addr) {
3155                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3156                                "sufficient for the mapped address (%llx)\n",
3157                                __func__, min_agaw, max_addr);
3158                         return -EFAULT;
3159                 }
3160                 dmar_domain->max_addr = max_addr;
3161         }
3162
3163         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3164         return ret;
3165 }
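
/*
 * Worked example of the mapping path above: a request for size 8192 at
 * iova 0x100000 with IOMMU_READ | IOMMU_WRITE turns into
 * prot = DMA_PTE_READ | DMA_PTE_WRITE and max_addr = 0x100000 + 0x2000 =
 * 0x102000.  DMA_PTE_SNP is added only when the caller asks for
 * IOMMU_CACHE and the domain's iommu_snooping flag is set.
 */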
3166
3167 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3168                                     unsigned long iova, size_t size)
3169 {
3170         struct dmar_domain *dmar_domain = domain->priv;
3171         dma_addr_t base;
3172
3173         /* The address might not be aligned */
3174         base = iova & VTD_PAGE_MASK;
3175         size = VTD_PAGE_ALIGN(size);
3176         dma_pte_clear_range(dmar_domain, base, base + size);
3177
3178         if (dmar_domain->max_addr == base + size)
3179                 dmar_domain->max_addr = base;
3180 }
3181
3182 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3183                                             unsigned long iova)
3184 {
3185         struct dmar_domain *dmar_domain = domain->priv;
3186         struct dma_pte *pte;
3187         u64 phys = 0;
3188
3189         pte = addr_to_dma_pte(dmar_domain, iova);
3190         if (pte)
3191                 phys = dma_pte_addr(pte);
3192
3193         return phys;
3194 }
3195
3196 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3197                                       unsigned long cap)
3198 {
3199         struct dmar_domain *dmar_domain = domain->priv;
3200
3201         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3202                 return dmar_domain->iommu_snooping;
3203
3204         return 0;
3205 }
3206
3207 static struct iommu_ops intel_iommu_ops = {
3208         .domain_init    = intel_iommu_domain_init,
3209         .domain_destroy = intel_iommu_domain_destroy,
3210         .attach_dev     = intel_iommu_attach_device,
3211         .detach_dev     = intel_iommu_detach_device,
3212         .map            = intel_iommu_map_range,
3213         .unmap          = intel_iommu_unmap_range,
3214         .iova_to_phys   = intel_iommu_iova_to_phys,
3215         .domain_has_cap = intel_iommu_domain_has_cap,
3216 };
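
/*
 * Illustrative sketch of how a consumer (e.g. KVM device assignment) would
 * exercise the callbacks above through the generic linux/iommu.h API of
 * this kernel.  The function below is hypothetical and kept out of the
 * build with #if 0; the IOVA and size are arbitrary example values and
 * guest_page is assumed to be page aligned.
 */
#if 0
static int example_assign_device(struct pci_dev *pdev, phys_addr_t guest_page)
{
        struct iommu_domain *domain;
        int ret;

        domain = iommu_domain_alloc();          /* -> intel_iommu_domain_init */
        if (!domain)
                return -ENOMEM;

        /* -> intel_iommu_attach_device */
        ret = iommu_attach_device(domain, &pdev->dev);
        if (ret)
                goto out_free;

        /* map one 4KiB page at IOVA 0x100000 -> intel_iommu_map_range */
        ret = iommu_map_range(domain, 0x100000, guest_page, VTD_PAGE_SIZE,
                              IOMMU_READ | IOMMU_WRITE);
        if (ret)
                goto out_detach;

        /* look the translation up again -> intel_iommu_iova_to_phys */
        WARN_ON(iommu_iova_to_phys(domain, 0x100000) != guest_page);

        /* tear everything down again -> intel_iommu_unmap_range */
        iommu_unmap_range(domain, 0x100000, VTD_PAGE_SIZE);
out_detach:
        iommu_detach_device(domain, &pdev->dev);        /* -> detach_device */
out_free:
        iommu_domain_free(domain);                      /* -> domain_destroy */
        return ret;
}
#endif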
3217
3218 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3219 {
3220         /*
3221          * Mobile 4 Series Chipset neglects to set RWBF capability,
3222          * but needs it:
3223          */
3224         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3225         rwbf_quirk = 1;
3226 }
3227
3228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
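
/*
 * A note on how the quirk takes effect: rwbf_quirk is checked alongside
 * the RWBF bit of the capability register when the driver decides whether
 * a write-buffer flush is needed, roughly (sketch of the check done by
 * iommu_flush_write_buffer() earlier in this file):
 *
 *	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 *		return;		// no flush required
 *
 * Setting the flag here therefore forces the flush path on the Mobile 4
 * Series Chipset even though it does not advertise the capability.
 */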