intel-iommu: Fix device-to-iommu mapping for PCI-PCI bridges.
drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <asm/cacheflush.h>
41 #include <asm/iommu.h>
42 #include "pci.h"
43
44 #define ROOT_SIZE               VTD_PAGE_SIZE
45 #define CONTEXT_SIZE            VTD_PAGE_SIZE
46
47 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
48 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
49
50 #define IOAPIC_RANGE_START      (0xfee00000)
51 #define IOAPIC_RANGE_END        (0xfeefffff)
52 #define IOVA_START_ADDR         (0x1000)
53
54 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
55
56 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
57
58 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
59 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
60 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
61
62 /* global iommu list, set NULL for ignored DMAR units */
63 static struct intel_iommu **g_iommus;
64
65 static int rwbf_quirk;
66
67 /*
68  * 0: Present
69  * 1-11: Reserved
70  * 12-63: Context Ptr (12 - (haw-1))
71  * 64-127: Reserved
72  */
73 struct root_entry {
74         u64     val;
75         u64     rsvd1;
76 };
77 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
78 static inline bool root_present(struct root_entry *root)
79 {
80         return (root->val & 1);
81 }
82 static inline void set_root_present(struct root_entry *root)
83 {
84         root->val |= 1;
85 }
86 static inline void set_root_value(struct root_entry *root, unsigned long value)
87 {
88         root->val |= value & VTD_PAGE_MASK;
89 }
90
91 static inline struct context_entry *
92 get_context_addr_from_root(struct root_entry *root)
93 {
94         return (struct context_entry *)
95                 (root_present(root)?phys_to_virt(
96                 root->val & VTD_PAGE_MASK) :
97                 NULL);
98 }
99
100 /*
101  * low 64 bits:
102  * 0: present
103  * 1: fault processing disable
104  * 2-3: translation type
105  * 12-63: address space root
106  * high 64 bits:
107  * 0-2: address width
108  * 3-6: avail
109  * 8-23: domain id
110  */
111 struct context_entry {
112         u64 lo;
113         u64 hi;
114 };
115
116 static inline bool context_present(struct context_entry *context)
117 {
118         return (context->lo & 1);
119 }
120 static inline void context_set_present(struct context_entry *context)
121 {
122         context->lo |= 1;
123 }
124
125 static inline void context_set_fault_enable(struct context_entry *context)
126 {
127         context->lo &= (((u64)-1) << 2) | 1;
128 }
129
130 #define CONTEXT_TT_MULTI_LEVEL 0
131
132 static inline void context_set_translation_type(struct context_entry *context,
133                                                 unsigned long value)
134 {
135         context->lo &= (((u64)-1) << 4) | 3;
136         context->lo |= (value & 3) << 2;
137 }
138
139 static inline void context_set_address_root(struct context_entry *context,
140                                             unsigned long value)
141 {
142         context->lo |= value & VTD_PAGE_MASK;
143 }
144
145 static inline void context_set_address_width(struct context_entry *context,
146                                              unsigned long value)
147 {
148         context->hi |= value & 7;
149 }
150
151 static inline void context_set_domain_id(struct context_entry *context,
152                                          unsigned long value)
153 {
154         context->hi |= (value & ((1 << 16) - 1)) << 8;
155 }
156
157 static inline void context_clear_entry(struct context_entry *context)
158 {
159         context->lo = 0;
160         context->hi = 0;
161 }
162
163 /*
164  * 0: readable
165  * 1: writable
166  * 2-6: reserved
167  * 7: super page
168  * 8-10: available
169  * 11: snoop behavior
170  * 12-63: Host physical address
171  */
172 struct dma_pte {
173         u64 val;
174 };
175
176 static inline void dma_clear_pte(struct dma_pte *pte)
177 {
178         pte->val = 0;
179 }
180
181 static inline void dma_set_pte_readable(struct dma_pte *pte)
182 {
183         pte->val |= DMA_PTE_READ;
184 }
185
186 static inline void dma_set_pte_writable(struct dma_pte *pte)
187 {
188         pte->val |= DMA_PTE_WRITE;
189 }
190
191 static inline void dma_set_pte_snp(struct dma_pte *pte)
192 {
193         pte->val |= DMA_PTE_SNP;
194 }
195
196 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
197 {
198         pte->val = (pte->val & ~3) | (prot & 3);
199 }
200
201 static inline u64 dma_pte_addr(struct dma_pte *pte)
202 {
203         return (pte->val & VTD_PAGE_MASK);
204 }
205
206 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
207 {
208         pte->val |= (addr & VTD_PAGE_MASK);
209 }
210
211 static inline bool dma_pte_present(struct dma_pte *pte)
212 {
213         return (pte->val & 3) != 0;
214 }
215
216 /* devices under the same p2p bridge are owned in one domain */
217 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
218
219 /* domain represents a virtual machine; more than one device
220  * across iommus may be owned by one domain, e.g. a kvm guest.
221  */
222 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
223
224 struct dmar_domain {
225         int     id;                     /* domain id */
226         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
227
228         struct list_head devices;       /* all devices' list */
229         struct iova_domain iovad;       /* iova's that belong to this domain */
230
231         struct dma_pte  *pgd;           /* virtual address */
232         spinlock_t      mapping_lock;   /* page table lock */
233         int             gaw;            /* max guest address width */
234
235         /* adjusted guest address width, 0 is level 2 30-bit */
236         int             agaw;
237
238         int             flags;          /* flags to find out type of domain */
239
240         int             iommu_coherency;/* indicate coherency of iommu access */
241         int             iommu_snooping; /* indicate snooping control feature*/
242         int             iommu_count;    /* reference count of iommu */
243         spinlock_t      iommu_lock;     /* protect iommu set in domain */
244         u64             max_addr;       /* maximum mapped address */
245 };
246
247 /* PCI domain-device relationship */
248 struct device_domain_info {
249         struct list_head link;  /* link to domain siblings */
250         struct list_head global; /* link to global list */
251         u8 bus;                 /* PCI bus number */
252         u8 devfn;               /* PCI devfn number */
253         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
254         struct dmar_domain *domain; /* pointer to domain */
255 };
256
257 static void flush_unmaps_timeout(unsigned long data);
258
259 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
260
261 #define HIGH_WATER_MARK 250
262 struct deferred_flush_tables {
263         int next;
264         struct iova *iova[HIGH_WATER_MARK];
265         struct dmar_domain *domain[HIGH_WATER_MARK];
266 };
267
268 static struct deferred_flush_tables *deferred_flush;
269
270 /* number of registered IOMMU units; also the size of the g_iommus array */
271 static int g_num_of_iommus;
272
273 static DEFINE_SPINLOCK(async_umap_flush_lock);
274 static LIST_HEAD(unmaps_to_do);
275
276 static int timer_on;
277 static long list_size;
278
279 static void domain_remove_dev_info(struct dmar_domain *domain);
280
281 #ifdef CONFIG_DMAR_DEFAULT_ON
282 int dmar_disabled = 0;
283 #else
284 int dmar_disabled = 1;
285 #endif /*CONFIG_DMAR_DEFAULT_ON*/
286
287 static int __initdata dmar_map_gfx = 1;
288 static int dmar_forcedac;
289 static int intel_iommu_strict;
290
291 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
292 static DEFINE_SPINLOCK(device_domain_lock);
293 static LIST_HEAD(device_domain_list);
294
295 static struct iommu_ops intel_iommu_ops;
296
297 static int __init intel_iommu_setup(char *str)
298 {
299         if (!str)
300                 return -EINVAL;
301         while (*str) {
302                 if (!strncmp(str, "on", 2)) {
303                         dmar_disabled = 0;
304                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
305                 } else if (!strncmp(str, "off", 3)) {
306                         dmar_disabled = 1;
307                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
308                 } else if (!strncmp(str, "igfx_off", 8)) {
309                         dmar_map_gfx = 0;
310                         printk(KERN_INFO
311                                 "Intel-IOMMU: disable GFX device mapping\n");
312                 } else if (!strncmp(str, "forcedac", 8)) {
313                         printk(KERN_INFO
314                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
315                         dmar_forcedac = 1;
316                 } else if (!strncmp(str, "strict", 6)) {
317                         printk(KERN_INFO
318                                 "Intel-IOMMU: disable batched IOTLB flush\n");
319                         intel_iommu_strict = 1;
320                 }
321
322                 str += strcspn(str, ",");
323                 while (*str == ',')
324                         str++;
325         }
326         return 0;
327 }
328 __setup("intel_iommu=", intel_iommu_setup);
329
330 static struct kmem_cache *iommu_domain_cache;
331 static struct kmem_cache *iommu_devinfo_cache;
332 static struct kmem_cache *iommu_iova_cache;
333
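/*
 * Allocate from a kmem cache with PF_MEMALLOC temporarily set, so the
 * GFP_ATOMIC allocation can dip into reserves instead of failing under
 * memory pressure; the task's original PF_MEMALLOC state is restored
 * before returning.
 */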
334 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
335 {
336         unsigned int flags;
337         void *vaddr;
338
339         /* trying to avoid low memory issues */
340         flags = current->flags & PF_MEMALLOC;
341         current->flags |= PF_MEMALLOC;
342         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
343         current->flags &= (~PF_MEMALLOC | flags);
344         return vaddr;
345 }
346
347
348 static inline void *alloc_pgtable_page(void)
349 {
350         unsigned int flags;
351         void *vaddr;
352
353         /* trying to avoid low memory issues */
354         flags = current->flags & PF_MEMALLOC;
355         current->flags |= PF_MEMALLOC;
356         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
357         current->flags &= (~PF_MEMALLOC | flags);
358         return vaddr;
359 }
360
361 static inline void free_pgtable_page(void *vaddr)
362 {
363         free_page((unsigned long)vaddr);
364 }
365
366 static inline void *alloc_domain_mem(void)
367 {
368         return iommu_kmem_cache_alloc(iommu_domain_cache);
369 }
370
371 static void free_domain_mem(void *vaddr)
372 {
373         kmem_cache_free(iommu_domain_cache, vaddr);
374 }
375
376 static inline void * alloc_devinfo_mem(void)
377 {
378         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
379 }
380
381 static inline void free_devinfo_mem(void *vaddr)
382 {
383         kmem_cache_free(iommu_devinfo_cache, vaddr);
384 }
385
386 struct iova *alloc_iova_mem(void)
387 {
388         return iommu_kmem_cache_alloc(iommu_iova_cache);
389 }
390
391 void free_iova_mem(struct iova *iova)
392 {
393         kmem_cache_free(iommu_iova_cache, iova);
394 }
395
396
397 static inline int width_to_agaw(int width);
398
399 /* calculate agaw for each iommu.
400  * "SAGAW" may differ across iommus: start from the default agaw and
401  * fall back to a smaller supported agaw for iommus that don't support it.
402  */
403 int iommu_calculate_agaw(struct intel_iommu *iommu)
404 {
405         unsigned long sagaw;
406         int agaw = -1;
407
408         sagaw = cap_sagaw(iommu->cap);
409         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
410              agaw >= 0; agaw--) {
411                 if (test_bit(agaw, &sagaw))
412                         break;
413         }
414
415         return agaw;
416 }
417
418 /* in the native case, each domain is related to only one iommu */
419 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
420 {
421         int iommu_id;
422
423         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
424
425         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
426         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
427                 return NULL;
428
429         return g_iommus[iommu_id];
430 }
431
432 static void domain_update_iommu_coherency(struct dmar_domain *domain)
433 {
434         int i;
435
436         domain->iommu_coherency = 1;
437
438         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
439         for (; i < g_num_of_iommus; ) {
440                 if (!ecap_coherent(g_iommus[i]->ecap)) {
441                         domain->iommu_coherency = 0;
442                         break;
443                 }
444                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
445         }
446 }
447
448 static void domain_update_iommu_snooping(struct dmar_domain *domain)
449 {
450         int i;
451
452         domain->iommu_snooping = 1;
453
454         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
455         for (; i < g_num_of_iommus; ) {
456                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
457                         domain->iommu_snooping = 0;
458                         break;
459                 }
460                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
461         }
462 }
463
464 /* Some capabilities may be different across iommus */
465 static void domain_update_iommu_cap(struct dmar_domain *domain)
466 {
467         domain_update_iommu_coherency(domain);
468         domain_update_iommu_snooping(domain);
469 }
470
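/*
 * Map a PCI bus/devfn to the IOMMU (DRHD unit) that covers it: either the
 * device is listed in the DRHD device scope, or it sits behind a listed
 * bridge whose secondary..subordinate bus-number range contains 'bus', or
 * the unit is marked INCLUDE_ALL.
 */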
471 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
472 {
473         struct dmar_drhd_unit *drhd = NULL;
474         int i;
475
476         for_each_drhd_unit(drhd) {
477                 if (drhd->ignored)
478                         continue;
479
480                 for (i = 0; i < drhd->devices_cnt; i++) {
481                         if (drhd->devices[i] &&
482                             drhd->devices[i]->bus->number == bus &&
483                             drhd->devices[i]->devfn == devfn)
484                                 return drhd->iommu;
485                         if (drhd->devices[i] && drhd->devices[i]->subordinate &&
486                             drhd->devices[i]->subordinate->number <= bus &&
487                             drhd->devices[i]->subordinate->subordinate >= bus)
488                                 return drhd->iommu;
489                 }
490
491                 if (drhd->include_all)
492                         return drhd->iommu;
493         }
494
495         return NULL;
496 }
497
498 static void domain_flush_cache(struct dmar_domain *domain,
499                                void *addr, int size)
500 {
501         if (!domain->iommu_coherency)
502                 clflush_cache_range(addr, size);
503 }
504
505 /* Gets context entry for a given bus and devfn */
506 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
507                 u8 bus, u8 devfn)
508 {
509         struct root_entry *root;
510         struct context_entry *context;
511         unsigned long phy_addr;
512         unsigned long flags;
513
514         spin_lock_irqsave(&iommu->lock, flags);
515         root = &iommu->root_entry[bus];
516         context = get_context_addr_from_root(root);
517         if (!context) {
518                 context = (struct context_entry *)alloc_pgtable_page();
519                 if (!context) {
520                         spin_unlock_irqrestore(&iommu->lock, flags);
521                         return NULL;
522                 }
523                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
524                 phy_addr = virt_to_phys((void *)context);
525                 set_root_value(root, phy_addr);
526                 set_root_present(root);
527                 __iommu_flush_cache(iommu, root, sizeof(*root));
528         }
529         spin_unlock_irqrestore(&iommu->lock, flags);
530         return &context[devfn];
531 }
532
533 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
534 {
535         struct root_entry *root;
536         struct context_entry *context;
537         int ret;
538         unsigned long flags;
539
540         spin_lock_irqsave(&iommu->lock, flags);
541         root = &iommu->root_entry[bus];
542         context = get_context_addr_from_root(root);
543         if (!context) {
544                 ret = 0;
545                 goto out;
546         }
547         ret = context_present(&context[devfn]);
548 out:
549         spin_unlock_irqrestore(&iommu->lock, flags);
550         return ret;
551 }
552
553 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
554 {
555         struct root_entry *root;
556         struct context_entry *context;
557         unsigned long flags;
558
559         spin_lock_irqsave(&iommu->lock, flags);
560         root = &iommu->root_entry[bus];
561         context = get_context_addr_from_root(root);
562         if (context) {
563                 context_clear_entry(&context[devfn]);
564                 __iommu_flush_cache(iommu, &context[devfn],
565                         sizeof(*context));
566         }
567         spin_unlock_irqrestore(&iommu->lock, flags);
568 }
569
570 static void free_context_table(struct intel_iommu *iommu)
571 {
572         struct root_entry *root;
573         int i;
574         unsigned long flags;
575         struct context_entry *context;
576
577         spin_lock_irqsave(&iommu->lock, flags);
578         if (!iommu->root_entry) {
579                 goto out;
580         }
581         for (i = 0; i < ROOT_ENTRY_NR; i++) {
582                 root = &iommu->root_entry[i];
583                 context = get_context_addr_from_root(root);
584                 if (context)
585                         free_pgtable_page(context);
586         }
587         free_pgtable_page(iommu->root_entry);
588         iommu->root_entry = NULL;
589 out:
590         spin_unlock_irqrestore(&iommu->lock, flags);
591 }
592
593 /* page table handling */
594 #define LEVEL_STRIDE            (9)
595 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
596
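/*
 * agaw 0 corresponds to a 2-level table covering a 30-bit address space;
 * each additional agaw step adds one level and LEVEL_STRIDE (9) bits.
 */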
597 static inline int agaw_to_level(int agaw)
598 {
599         return agaw + 2;
600 }
601
602 static inline int agaw_to_width(int agaw)
603 {
604         return 30 + agaw * LEVEL_STRIDE;
605
606 }
607
608 static inline int width_to_agaw(int width)
609 {
610         return (width - 30) / LEVEL_STRIDE;
611 }
612
613 static inline unsigned int level_to_offset_bits(int level)
614 {
615         return (12 + (level - 1) * LEVEL_STRIDE);
616 }
617
618 static inline int address_level_offset(u64 addr, int level)
619 {
620         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
621 }
622
623 static inline u64 level_mask(int level)
624 {
625         return ((u64)-1 << level_to_offset_bits(level));
626 }
627
628 static inline u64 level_size(int level)
629 {
630         return ((u64)1 << level_to_offset_bits(level));
631 }
632
633 static inline u64 align_to_level(u64 addr, int level)
634 {
635         return ((addr + level_size(level) - 1) & level_mask(level));
636 }
637
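/*
 * Walk the page table for 'addr', allocating intermediate page-table pages
 * as needed, and return a pointer to the last-level pte for that address.
 */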
638 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
639 {
640         int addr_width = agaw_to_width(domain->agaw);
641         struct dma_pte *parent, *pte = NULL;
642         int level = agaw_to_level(domain->agaw);
643         int offset;
644         unsigned long flags;
645
646         BUG_ON(!domain->pgd);
647
648         addr &= (((u64)1) << addr_width) - 1;
649         parent = domain->pgd;
650
651         spin_lock_irqsave(&domain->mapping_lock, flags);
652         while (level > 0) {
653                 void *tmp_page;
654
655                 offset = address_level_offset(addr, level);
656                 pte = &parent[offset];
657                 if (level == 1)
658                         break;
659
660                 if (!dma_pte_present(pte)) {
661                         tmp_page = alloc_pgtable_page();
662
663                         if (!tmp_page) {
664                                 spin_unlock_irqrestore(&domain->mapping_lock,
665                                         flags);
666                                 return NULL;
667                         }
668                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
669                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
670                         /*
671                          * high level table always sets r/w, last level page
672                          * table control read/write
673                          */
674                         dma_set_pte_readable(pte);
675                         dma_set_pte_writable(pte);
676                         domain_flush_cache(domain, pte, sizeof(*pte));
677                 }
678                 parent = phys_to_virt(dma_pte_addr(pte));
679                 level--;
680         }
681
682         spin_unlock_irqrestore(&domain->mapping_lock, flags);
683         return pte;
684 }
685
686 /* return address's pte at specific level */
687 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
688                 int level)
689 {
690         struct dma_pte *parent, *pte = NULL;
691         int total = agaw_to_level(domain->agaw);
692         int offset;
693
694         parent = domain->pgd;
695         while (level <= total) {
696                 offset = address_level_offset(addr, total);
697                 pte = &parent[offset];
698                 if (level == total)
699                         return pte;
700
701                 if (!dma_pte_present(pte))
702                         break;
703                 parent = phys_to_virt(dma_pte_addr(pte));
704                 total--;
705         }
706         return NULL;
707 }
708
709 /* clear one page's page table */
710 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
711 {
712         struct dma_pte *pte = NULL;
713
714         /* get last level pte */
715         pte = dma_addr_level_pte(domain, addr, 1);
716
717         if (pte) {
718                 dma_clear_pte(pte);
719                 domain_flush_cache(domain, pte, sizeof(*pte));
720         }
721 }
722
723 /* clear last-level ptes; a tlb flush should follow */
724 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
725 {
726         int addr_width = agaw_to_width(domain->agaw);
727         int npages;
728
729         start &= (((u64)1) << addr_width) - 1;
730         end &= (((u64)1) << addr_width) - 1;
731         /* in case it's partial page */
732         start = PAGE_ALIGN(start);
733         end &= PAGE_MASK;
734         npages = (end - start) / VTD_PAGE_SIZE;
735
736         /* we don't need lock here, nobody else touches the iova range */
737         while (npages--) {
738                 dma_pte_clear_one(domain, start);
739                 start += VTD_PAGE_SIZE;
740         }
741 }
742
743 /* free page table pages. last level pte should already be cleared */
744 static void dma_pte_free_pagetable(struct dmar_domain *domain,
745         u64 start, u64 end)
746 {
747         int addr_width = agaw_to_width(domain->agaw);
748         struct dma_pte *pte;
749         int total = agaw_to_level(domain->agaw);
750         int level;
751         u64 tmp;
752
753         start &= (((u64)1) << addr_width) - 1;
754         end &= (((u64)1) << addr_width) - 1;
755
756         /* we don't need lock here, nobody else touches the iova range */
757         level = 2;
758         while (level <= total) {
759                 tmp = align_to_level(start, level);
760                 if (tmp >= end || (tmp + level_size(level) > end))
761                         return;
762
763                 while (tmp < end) {
764                         pte = dma_addr_level_pte(domain, tmp, level);
765                         if (pte) {
766                                 free_pgtable_page(
767                                         phys_to_virt(dma_pte_addr(pte)));
768                                 dma_clear_pte(pte);
769                                 domain_flush_cache(domain, pte, sizeof(*pte));
770                         }
771                         tmp += level_size(level);
772                 }
773                 level++;
774         }
775         /* free pgd */
776         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
777                 free_pgtable_page(domain->pgd);
778                 domain->pgd = NULL;
779         }
780 }
781
782 /* iommu handling */
783 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
784 {
785         struct root_entry *root;
786         unsigned long flags;
787
788         root = (struct root_entry *)alloc_pgtable_page();
789         if (!root)
790                 return -ENOMEM;
791
792         __iommu_flush_cache(iommu, root, ROOT_SIZE);
793
794         spin_lock_irqsave(&iommu->lock, flags);
795         iommu->root_entry = root;
796         spin_unlock_irqrestore(&iommu->lock, flags);
797
798         return 0;
799 }
800
801 static void iommu_set_root_entry(struct intel_iommu *iommu)
802 {
803         void *addr;
804         u32 cmd, sts;
805         unsigned long flag;
806
807         addr = iommu->root_entry;
808
809         spin_lock_irqsave(&iommu->register_lock, flag);
810         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
811
812         cmd = iommu->gcmd | DMA_GCMD_SRTP;
813         writel(cmd, iommu->reg + DMAR_GCMD_REG);
814
815         /* Make sure hardware completes it */
816         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
817                 readl, (sts & DMA_GSTS_RTPS), sts);
818
819         spin_unlock_irqrestore(&iommu->register_lock, flag);
820 }
821
822 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
823 {
824         u32 val;
825         unsigned long flag;
826
827         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
828                 return;
829         val = iommu->gcmd | DMA_GCMD_WBF;
830
831         spin_lock_irqsave(&iommu->register_lock, flag);
832         writel(val, iommu->reg + DMAR_GCMD_REG);
833
834         /* Make sure hardware completes it */
835         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
836                         readl, (!(val & DMA_GSTS_WBFS)), val);
837
838         spin_unlock_irqrestore(&iommu->register_lock, flag);
839 }
840
841 /* return value determines whether we need a write buffer flush */
842 static int __iommu_flush_context(struct intel_iommu *iommu,
843         u16 did, u16 source_id, u8 function_mask, u64 type,
844         int non_present_entry_flush)
845 {
846         u64 val = 0;
847         unsigned long flag;
848
849         /*
850          * In the non-present entry flush case: if the hardware doesn't cache
851          * non-present entries we do nothing; if it does, we flush the entries
852          * of domain 0 (the domain id that is used to cache
853          * any non-present entries)
854          */
855         if (non_present_entry_flush) {
856                 if (!cap_caching_mode(iommu->cap))
857                         return 1;
858                 else
859                         did = 0;
860         }
861
862         switch (type) {
863         case DMA_CCMD_GLOBAL_INVL:
864                 val = DMA_CCMD_GLOBAL_INVL;
865                 break;
866         case DMA_CCMD_DOMAIN_INVL:
867                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
868                 break;
869         case DMA_CCMD_DEVICE_INVL:
870                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
871                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
872                 break;
873         default:
874                 BUG();
875         }
876         val |= DMA_CCMD_ICC;
877
878         spin_lock_irqsave(&iommu->register_lock, flag);
879         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
880
881         /* Make sure hardware completes it */
882         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
883                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
884
885         spin_unlock_irqrestore(&iommu->register_lock, flag);
886
887         /* flush context entry will implicitly flush write buffer */
888         return 0;
889 }
890
891 /* return value determines whether we need a write buffer flush */
892 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
893         u64 addr, unsigned int size_order, u64 type,
894         int non_present_entry_flush)
895 {
896         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
897         u64 val = 0, val_iva = 0;
898         unsigned long flag;
899
900         /*
901          * In the non-present entry flush case: if the hardware doesn't cache
902          * non-present entries we do nothing; if it does, we flush the entries
903          * of domain 0 (the domain id that is used to cache
904          * any non-present entries)
905          */
906         if (non_present_entry_flush) {
907                 if (!cap_caching_mode(iommu->cap))
908                         return 1;
909                 else
910                         did = 0;
911         }
912
913         switch (type) {
914         case DMA_TLB_GLOBAL_FLUSH:
915                 /* global flush doesn't need to set IVA_REG */
916                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
917                 break;
918         case DMA_TLB_DSI_FLUSH:
919                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
920                 break;
921         case DMA_TLB_PSI_FLUSH:
922                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
923                 /* Note: always flush non-leaf currently */
924                 val_iva = size_order | addr;
925                 break;
926         default:
927                 BUG();
928         }
929         /* Note: set drain read/write */
930 #if 0
931         /*
932          * This is probably to be super secure.. Looks like we can
933          * This is probably only here to be extra safe; it looks like we
934          * can ignore it without any impact.
935         if (cap_read_drain(iommu->cap))
936                 val |= DMA_TLB_READ_DRAIN;
937 #endif
938         if (cap_write_drain(iommu->cap))
939                 val |= DMA_TLB_WRITE_DRAIN;
940
941         spin_lock_irqsave(&iommu->register_lock, flag);
942         /* Note: Only uses first TLB reg currently */
943         if (val_iva)
944                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
945         dmar_writeq(iommu->reg + tlb_offset + 8, val);
946
947         /* Make sure hardware completes it */
948         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
949                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
950
951         spin_unlock_irqrestore(&iommu->register_lock, flag);
952
953         /* check IOTLB invalidation granularity */
954         if (DMA_TLB_IAIG(val) == 0)
955                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
956         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
957                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
958                         (unsigned long long)DMA_TLB_IIRG(type),
959                         (unsigned long long)DMA_TLB_IAIG(val));
960         /* flush iotlb entry will implicitly flush write buffer */
961         return 0;
962 }
963
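/*
 * Page-selective IOTLB invalidation of 'pages' pages starting at 'addr',
 * falling back to a domain-selective flush when PSI is not supported or
 * the range exceeds the hardware's maximum address mask.
 */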
964 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
965         u64 addr, unsigned int pages, int non_present_entry_flush)
966 {
967         unsigned int mask;
968
969         BUG_ON(addr & (~VTD_PAGE_MASK));
970         BUG_ON(pages == 0);
971
972         /* Fallback to domain selective flush if no PSI support */
973         if (!cap_pgsel_inv(iommu->cap))
974                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
975                                                 DMA_TLB_DSI_FLUSH,
976                                                 non_present_entry_flush);
977
978         /*
979          * PSI requires page size to be 2 ^ x, and the base address is naturally
980          * aligned to the size
981          */
982         mask = ilog2(__roundup_pow_of_two(pages));
983         /* Fallback to domain selective flush if size is too big */
984         if (mask > cap_max_amask_val(iommu->cap))
985                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
986                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
987
988         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
989                                         DMA_TLB_PSI_FLUSH,
990                                         non_present_entry_flush);
991 }
992
993 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
994 {
995         u32 pmen;
996         unsigned long flags;
997
998         spin_lock_irqsave(&iommu->register_lock, flags);
999         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1000         pmen &= ~DMA_PMEN_EPM;
1001         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1002
1003         /* wait for the protected region status bit to clear */
1004         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1005                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1006
1007         spin_unlock_irqrestore(&iommu->register_lock, flags);
1008 }
1009
1010 static int iommu_enable_translation(struct intel_iommu *iommu)
1011 {
1012         u32 sts;
1013         unsigned long flags;
1014
1015         spin_lock_irqsave(&iommu->register_lock, flags);
1016         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
1017
1018         /* Make sure hardware completes it */
1019         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1020                 readl, (sts & DMA_GSTS_TES), sts);
1021
1022         iommu->gcmd |= DMA_GCMD_TE;
1023         spin_unlock_irqrestore(&iommu->register_lock, flags);
1024         return 0;
1025 }
1026
1027 static int iommu_disable_translation(struct intel_iommu *iommu)
1028 {
1029         u32 sts;
1030         unsigned long flag;
1031
1032         spin_lock_irqsave(&iommu->register_lock, flag);
1033         iommu->gcmd &= ~DMA_GCMD_TE;
1034         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1035
1036         /* Make sure hardware completes it */
1037         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1038                 readl, (!(sts & DMA_GSTS_TES)), sts);
1039
1040         spin_unlock_irqrestore(&iommu->register_lock, flag);
1041         return 0;
1042 }
1043
1044
1045 static int iommu_init_domains(struct intel_iommu *iommu)
1046 {
1047         unsigned long ndomains;
1048         unsigned long nlongs;
1049
1050         ndomains = cap_ndoms(iommu->cap);
1051         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1052         nlongs = BITS_TO_LONGS(ndomains);
1053
1054         /* TBD: there might be 64K domains,
1055          * consider other allocation for future chip
1056          */
1057         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1058         if (!iommu->domain_ids) {
1059                 printk(KERN_ERR "Allocating domain id array failed\n");
1060                 return -ENOMEM;
1061         }
1062         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1063                         GFP_KERNEL);
1064         if (!iommu->domains) {
1065                 printk(KERN_ERR "Allocating domain array failed\n");
1066                 kfree(iommu->domain_ids);
1067                 return -ENOMEM;
1068         }
1069
1070         spin_lock_init(&iommu->lock);
1071
1072         /*
1073          * if Caching mode is set, then invalid translations are tagged
1074          * with domainid 0. Hence we need to pre-allocate it.
1075          */
1076         if (cap_caching_mode(iommu->cap))
1077                 set_bit(0, iommu->domain_ids);
1078         return 0;
1079 }
1080
1081
1082 static void domain_exit(struct dmar_domain *domain);
1083 static void vm_domain_exit(struct dmar_domain *domain);
1084
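/*
 * Tear down one IOMMU: drop its reference on every domain it owns (exiting
 * the domain when the last reference goes), disable translation, release
 * its interrupt and domain bookkeeping, clear its g_iommus slot (freeing
 * the array once the last unit is gone) and free the context table.
 */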
1085 void free_dmar_iommu(struct intel_iommu *iommu)
1086 {
1087         struct dmar_domain *domain;
1088         int i;
1089         unsigned long flags;
1090
1091         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1092         for (; i < cap_ndoms(iommu->cap); ) {
1093                 domain = iommu->domains[i];
1094                 clear_bit(i, iommu->domain_ids);
1095
1096                 spin_lock_irqsave(&domain->iommu_lock, flags);
1097                 if (--domain->iommu_count == 0) {
1098                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1099                                 vm_domain_exit(domain);
1100                         else
1101                                 domain_exit(domain);
1102                 }
1103                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1104
1105                 i = find_next_bit(iommu->domain_ids,
1106                         cap_ndoms(iommu->cap), i+1);
1107         }
1108
1109         if (iommu->gcmd & DMA_GCMD_TE)
1110                 iommu_disable_translation(iommu);
1111
1112         if (iommu->irq) {
1113                 set_irq_data(iommu->irq, NULL);
1114                 /* This will mask the irq */
1115                 free_irq(iommu->irq, iommu);
1116                 destroy_irq(iommu->irq);
1117         }
1118
1119         kfree(iommu->domains);
1120         kfree(iommu->domain_ids);
1121
1122         g_iommus[iommu->seq_id] = NULL;
1123
1124         /* if all iommus are freed, free g_iommus */
1125         for (i = 0; i < g_num_of_iommus; i++) {
1126                 if (g_iommus[i])
1127                         break;
1128         }
1129
1130         if (i == g_num_of_iommus)
1131                 kfree(g_iommus);
1132
1133         /* free context mapping */
1134         free_context_table(iommu);
1135 }
1136
1137 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1138 {
1139         unsigned long num;
1140         unsigned long ndomains;
1141         struct dmar_domain *domain;
1142         unsigned long flags;
1143
1144         domain = alloc_domain_mem();
1145         if (!domain)
1146                 return NULL;
1147
1148         ndomains = cap_ndoms(iommu->cap);
1149
1150         spin_lock_irqsave(&iommu->lock, flags);
1151         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1152         if (num >= ndomains) {
1153                 spin_unlock_irqrestore(&iommu->lock, flags);
1154                 free_domain_mem(domain);
1155                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1156                 return NULL;
1157         }
1158
1159         set_bit(num, iommu->domain_ids);
1160         domain->id = num;
1161         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1162         set_bit(iommu->seq_id, &domain->iommu_bmp);
1163         domain->flags = 0;
1164         iommu->domains[num] = domain;
1165         spin_unlock_irqrestore(&iommu->lock, flags);
1166
1167         return domain;
1168 }
1169
1170 static void iommu_free_domain(struct dmar_domain *domain)
1171 {
1172         unsigned long flags;
1173         struct intel_iommu *iommu;
1174
1175         iommu = domain_get_iommu(domain);
1176
1177         spin_lock_irqsave(&iommu->lock, flags);
1178         clear_bit(domain->id, iommu->domain_ids);
1179         spin_unlock_irqrestore(&iommu->lock, flags);
1180 }
1181
1182 static struct iova_domain reserved_iova_list;
1183 static struct lock_class_key reserved_alloc_key;
1184 static struct lock_class_key reserved_rbtree_key;
1185
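/*
 * Reserve IOVA ranges that must never be handed out for DMA: the IOAPIC
 * MMIO window and every PCI device's MMIO resources, so translated DMA
 * never collides with peer-to-peer address space.
 */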
1186 static void dmar_init_reserved_ranges(void)
1187 {
1188         struct pci_dev *pdev = NULL;
1189         struct iova *iova;
1190         int i;
1191         u64 addr, size;
1192
1193         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1194
1195         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1196                 &reserved_alloc_key);
1197         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1198                 &reserved_rbtree_key);
1199
1200         /* IOAPIC ranges shouldn't be accessed by DMA */
1201         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1202                 IOVA_PFN(IOAPIC_RANGE_END));
1203         if (!iova)
1204                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1205
1206         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1207         for_each_pci_dev(pdev) {
1208                 struct resource *r;
1209
1210                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1211                         r = &pdev->resource[i];
1212                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1213                                 continue;
1214                         addr = r->start;
1215                         addr &= PAGE_MASK;
1216                         size = r->end - addr;
1217                         size = PAGE_ALIGN(size);
1218                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1219                                 IOVA_PFN(size + addr) - 1);
1220                         if (!iova)
1221                                 printk(KERN_ERR "Reserve iova failed\n");
1222                 }
1223         }
1224
1225 }
1226
1227 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1228 {
1229         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1230 }
1231
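/*
 * Round a guest address width up so that (width - 12) is a multiple of the
 * 9-bit level stride, i.e. to the next width that maps onto a whole number
 * of page-table levels, capped at 64 bits.
 */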
1232 static inline int guestwidth_to_adjustwidth(int gaw)
1233 {
1234         int agaw;
1235         int r = (gaw - 12) % 9;
1236
1237         if (r == 0)
1238                 agaw = gaw;
1239         else
1240                 agaw = gaw + 9 - r;
1241         if (agaw > 64)
1242                 agaw = 64;
1243         return agaw;
1244 }
1245
1246 static int domain_init(struct dmar_domain *domain, int guest_width)
1247 {
1248         struct intel_iommu *iommu;
1249         int adjust_width, agaw;
1250         unsigned long sagaw;
1251
1252         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1253         spin_lock_init(&domain->mapping_lock);
1254         spin_lock_init(&domain->iommu_lock);
1255
1256         domain_reserve_special_ranges(domain);
1257
1258         /* calculate AGAW */
1259         iommu = domain_get_iommu(domain);
1260         if (guest_width > cap_mgaw(iommu->cap))
1261                 guest_width = cap_mgaw(iommu->cap);
1262         domain->gaw = guest_width;
1263         adjust_width = guestwidth_to_adjustwidth(guest_width);
1264         agaw = width_to_agaw(adjust_width);
1265         sagaw = cap_sagaw(iommu->cap);
1266         if (!test_bit(agaw, &sagaw)) {
1267                 /* hardware doesn't support it, choose a bigger one */
1268                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1269                 agaw = find_next_bit(&sagaw, 5, agaw);
1270                 if (agaw >= 5)
1271                         return -ENODEV;
1272         }
1273         domain->agaw = agaw;
1274         INIT_LIST_HEAD(&domain->devices);
1275
1276         if (ecap_coherent(iommu->ecap))
1277                 domain->iommu_coherency = 1;
1278         else
1279                 domain->iommu_coherency = 0;
1280
1281         if (ecap_sc_support(iommu->ecap))
1282                 domain->iommu_snooping = 1;
1283         else
1284                 domain->iommu_snooping = 0;
1285
1286         domain->iommu_count = 1;
1287
1288         /* always allocate the top pgd */
1289         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1290         if (!domain->pgd)
1291                 return -ENOMEM;
1292         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1293         return 0;
1294 }
1295
1296 static void domain_exit(struct dmar_domain *domain)
1297 {
1298         u64 end;
1299
1300         /* Domain 0 is reserved, so don't process it */
1301         if (!domain)
1302                 return;
1303
1304         domain_remove_dev_info(domain);
1305         /* destroy iovas */
1306         put_iova_domain(&domain->iovad);
1307         end = DOMAIN_MAX_ADDR(domain->gaw);
1308         end = end & (~PAGE_MASK);
1309
1310         /* clear ptes */
1311         dma_pte_clear_range(domain, 0, end);
1312
1313         /* free page tables */
1314         dma_pte_free_pagetable(domain, 0, end);
1315
1316         iommu_free_domain(domain);
1317         free_domain_mem(domain);
1318 }
1319
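/*
 * Install the context entry for bus/devfn pointing at this domain's page
 * tables on the IOMMU that covers the device.  This is a non-present to
 * present change, so caching-mode hardware gets a context-cache/IOTLB
 * flush while other hardware only needs a write-buffer flush.
 */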
1320 static int domain_context_mapping_one(struct dmar_domain *domain,
1321                 u8 bus, u8 devfn)
1322 {
1323         struct context_entry *context;
1324         unsigned long flags;
1325         struct intel_iommu *iommu;
1326         struct dma_pte *pgd;
1327         unsigned long num;
1328         unsigned long ndomains;
1329         int id;
1330         int agaw;
1331
1332         pr_debug("Set context mapping for %02x:%02x.%d\n",
1333                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1334         BUG_ON(!domain->pgd);
1335
1336         iommu = device_to_iommu(bus, devfn);
1337         if (!iommu)
1338                 return -ENODEV;
1339
1340         context = device_to_context_entry(iommu, bus, devfn);
1341         if (!context)
1342                 return -ENOMEM;
1343         spin_lock_irqsave(&iommu->lock, flags);
1344         if (context_present(context)) {
1345                 spin_unlock_irqrestore(&iommu->lock, flags);
1346                 return 0;
1347         }
1348
1349         id = domain->id;
1350         pgd = domain->pgd;
1351
1352         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1353                 int found = 0;
1354
1355                 /* find an available domain id for this device in iommu */
1356                 ndomains = cap_ndoms(iommu->cap);
1357                 num = find_first_bit(iommu->domain_ids, ndomains);
1358                 for (; num < ndomains; ) {
1359                         if (iommu->domains[num] == domain) {
1360                                 id = num;
1361                                 found = 1;
1362                                 break;
1363                         }
1364                         num = find_next_bit(iommu->domain_ids,
1365                                             cap_ndoms(iommu->cap), num+1);
1366                 }
1367
1368                 if (found == 0) {
1369                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1370                         if (num >= ndomains) {
1371                                 spin_unlock_irqrestore(&iommu->lock, flags);
1372                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1373                                 return -EFAULT;
1374                         }
1375
1376                         set_bit(num, iommu->domain_ids);
1377                         iommu->domains[num] = domain;
1378                         id = num;
1379                 }
1380
1381                 /* Skip top levels of page tables for
1382                  * iommu which has less agaw than default.
1383                  */
1384                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1385                         pgd = phys_to_virt(dma_pte_addr(pgd));
1386                         if (!dma_pte_present(pgd)) {
1387                                 spin_unlock_irqrestore(&iommu->lock, flags);
1388                                 return -ENOMEM;
1389                         }
1390                 }
1391         }
1392
1393         context_set_domain_id(context, id);
1394         context_set_address_width(context, iommu->agaw);
1395         context_set_address_root(context, virt_to_phys(pgd));
1396         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1397         context_set_fault_enable(context);
1398         context_set_present(context);
1399         domain_flush_cache(domain, context, sizeof(*context));
1400
1401         /* it's a non-present to present mapping */
1402         if (iommu->flush.flush_context(iommu, domain->id,
1403                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1404                 DMA_CCMD_DEVICE_INVL, 1))
1405                 iommu_flush_write_buffer(iommu);
1406         else
1407                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1408
1409         spin_unlock_irqrestore(&iommu->lock, flags);
1410
1411         spin_lock_irqsave(&domain->iommu_lock, flags);
1412         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1413                 domain->iommu_count++;
1414                 domain_update_iommu_cap(domain);
1415         }
1416         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1417         return 0;
1418 }
1419
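/*
 * Set up context entries for the device and, when it sits behind a
 * PCIe-to-PCI bridge, for every bridge on the path up to and including
 * that bridge, so requests carrying the bridge's source-id are translated
 * through the same domain.
 */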
1420 static int
1421 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1422 {
1423         int ret;
1424         struct pci_dev *tmp, *parent;
1425
1426         ret = domain_context_mapping_one(domain, pdev->bus->number,
1427                 pdev->devfn);
1428         if (ret)
1429                 return ret;
1430
1431         /* dependent device mapping */
1432         tmp = pci_find_upstream_pcie_bridge(pdev);
1433         if (!tmp)
1434                 return 0;
1435         /* Secondary interface's bus number and devfn 0 */
1436         parent = pdev->bus->self;
1437         while (parent != tmp) {
1438                 ret = domain_context_mapping_one(domain, parent->bus->number,
1439                         parent->devfn);
1440                 if (ret)
1441                         return ret;
1442                 parent = parent->bus->self;
1443         }
1444         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1445                 return domain_context_mapping_one(domain,
1446                         tmp->subordinate->number, 0);
1447         else /* this is a legacy PCI bridge */
1448                 return domain_context_mapping_one(domain,
1449                         tmp->bus->number, tmp->devfn);
1450 }
1451
1452 static int domain_context_mapped(struct pci_dev *pdev)
1453 {
1454         int ret;
1455         struct pci_dev *tmp, *parent;
1456         struct intel_iommu *iommu;
1457
1458         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1459         if (!iommu)
1460                 return -ENODEV;
1461
1462         ret = device_context_mapped(iommu,
1463                 pdev->bus->number, pdev->devfn);
1464         if (!ret)
1465                 return ret;
1466         /* dependent device mapping */
1467         tmp = pci_find_upstream_pcie_bridge(pdev);
1468         if (!tmp)
1469                 return ret;
1470         /* Secondary interface's bus number and devfn 0 */
1471         parent = pdev->bus->self;
1472         while (parent != tmp) {
1473                 ret = device_context_mapped(iommu, parent->bus->number,
1474                         parent->devfn);
1475                 if (!ret)
1476                         return ret;
1477                 parent = parent->bus->self;
1478         }
1479         if (tmp->is_pcie)
1480                 return device_context_mapped(iommu,
1481                         tmp->subordinate->number, 0);
1482         else
1483                 return device_context_mapped(iommu,
1484                         tmp->bus->number, tmp->devfn);
1485 }
1486
1487 static int
1488 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1489                         u64 hpa, size_t size, int prot)
1490 {
1491         u64 start_pfn, end_pfn;
1492         struct dma_pte *pte;
1493         int index;
1494         int addr_width = agaw_to_width(domain->agaw);
1495
1496         hpa &= (((u64)1) << addr_width) - 1;
1497
1498         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1499                 return -EINVAL;
1500         iova &= PAGE_MASK;
1501         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1502         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1503         index = 0;
1504         while (start_pfn < end_pfn) {
1505                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1506                 if (!pte)
1507                         return -ENOMEM;
1508                 /* We don't need lock here, nobody else
1509                  * touches the iova range
1510                  */
1511                 BUG_ON(dma_pte_addr(pte));
1512                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1513                 dma_set_pte_prot(pte, prot);
1514                 if (prot & DMA_PTE_SNP)
1515                         dma_set_pte_snp(pte);
1516                 domain_flush_cache(domain, pte, sizeof(*pte));
1517                 start_pfn++;
1518                 index++;
1519         }
1520         return 0;
1521 }
1522
1523 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1524 {
1525         if (!iommu)
1526                 return;
1527
1528         clear_context_table(iommu, bus, devfn);
1529         iommu->flush.flush_context(iommu, 0, 0, 0,
1530                                            DMA_CCMD_GLOBAL_INVL, 0);
1531         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1532                                          DMA_TLB_GLOBAL_FLUSH, 0);
1533 }
1534
1535 static void domain_remove_dev_info(struct dmar_domain *domain)
1536 {
1537         struct device_domain_info *info;
1538         unsigned long flags;
1539         struct intel_iommu *iommu;
1540
1541         spin_lock_irqsave(&device_domain_lock, flags);
1542         while (!list_empty(&domain->devices)) {
1543                 info = list_entry(domain->devices.next,
1544                         struct device_domain_info, link);
1545                 list_del(&info->link);
1546                 list_del(&info->global);
1547                 if (info->dev)
1548                         info->dev->dev.archdata.iommu = NULL;
1549                 spin_unlock_irqrestore(&device_domain_lock, flags);
1550
1551                 iommu = device_to_iommu(info->bus, info->devfn);
1552                 iommu_detach_dev(iommu, info->bus, info->devfn);
1553                 free_devinfo_mem(info);
1554
1555                 spin_lock_irqsave(&device_domain_lock, flags);
1556         }
1557         spin_unlock_irqrestore(&device_domain_lock, flags);
1558 }
1559
1560 /*
1561  * find_domain
1562  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1563  */
1564 static struct dmar_domain *
1565 find_domain(struct pci_dev *pdev)
1566 {
1567         struct device_domain_info *info;
1568
1569         /* No lock here, assumes no domain exit in normal case */
1570         info = pdev->dev.archdata.iommu;
1571         if (info)
1572                 return info->domain;
1573         return NULL;
1574 }
1575
1576 /* domain is initialized */
1577 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1578 {
1579         struct dmar_domain *domain, *found = NULL;
1580         struct intel_iommu *iommu;
1581         struct dmar_drhd_unit *drhd;
1582         struct device_domain_info *info, *tmp;
1583         struct pci_dev *dev_tmp;
1584         unsigned long flags;
1585         int bus = 0, devfn = 0;
1586
1587         domain = find_domain(pdev);
1588         if (domain)
1589                 return domain;
1590
1591         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1592         if (dev_tmp) {
1593                 if (dev_tmp->is_pcie) {
1594                         bus = dev_tmp->subordinate->number;
1595                         devfn = 0;
1596                 } else {
1597                         bus = dev_tmp->bus->number;
1598                         devfn = dev_tmp->devfn;
1599                 }
1600                 spin_lock_irqsave(&device_domain_lock, flags);
1601                 list_for_each_entry(info, &device_domain_list, global) {
1602                         if (info->bus == bus && info->devfn == devfn) {
1603                                 found = info->domain;
1604                                 break;
1605                         }
1606                 }
1607                 spin_unlock_irqrestore(&device_domain_lock, flags);
1608                 /* the PCIe-to-PCI bridge already has a domain, use it */
1609                 if (found) {
1610                         domain = found;
1611                         goto found_domain;
1612                 }
1613         }
1614
1615         /* Allocate new domain for the device */
1616         drhd = dmar_find_matched_drhd_unit(pdev);
1617         if (!drhd) {
1618                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1619                         pci_name(pdev));
1620                 return NULL;
1621         }
1622         iommu = drhd->iommu;
1623
1624         domain = iommu_alloc_domain(iommu);
1625         if (!domain)
1626                 goto error;
1627
1628         if (domain_init(domain, gaw)) {
1629                 domain_exit(domain);
1630                 goto error;
1631         }
1632
1633         /* register the PCIe-to-PCI bridge */
1634         if (dev_tmp) {
1635                 info = alloc_devinfo_mem();
1636                 if (!info) {
1637                         domain_exit(domain);
1638                         goto error;
1639                 }
1640                 info->bus = bus;
1641                 info->devfn = devfn;
1642                 info->dev = NULL;
1643                 info->domain = domain;
1644                 /* This domain is shared by devices under p2p bridge */
1645                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1646
1647                 /* if the PCIe-to-PCI bridge already has a domain, use it */
1648                 found = NULL;
1649                 spin_lock_irqsave(&device_domain_lock, flags);
1650                 list_for_each_entry(tmp, &device_domain_list, global) {
1651                         if (tmp->bus == bus && tmp->devfn == devfn) {
1652                                 found = tmp->domain;
1653                                 break;
1654                         }
1655                 }
1656                 if (found) {
1657                         free_devinfo_mem(info);
1658                         domain_exit(domain);
1659                         domain = found;
1660                 } else {
1661                         list_add(&info->link, &domain->devices);
1662                         list_add(&info->global, &device_domain_list);
1663                 }
1664                 spin_unlock_irqrestore(&device_domain_lock, flags);
1665         }
1666
1667 found_domain:
1668         info = alloc_devinfo_mem();
1669         if (!info)
1670                 goto error;
1671         info->bus = pdev->bus->number;
1672         info->devfn = pdev->devfn;
1673         info->dev = pdev;
1674         info->domain = domain;
1675         spin_lock_irqsave(&device_domain_lock, flags);
1676         /* somebody else was faster and has already set it up */
1677         found = find_domain(pdev);
1678         if (found != NULL) {
1679                 spin_unlock_irqrestore(&device_domain_lock, flags);
1680                 if (found != domain) {
1681                         domain_exit(domain);
1682                         domain = found;
1683                 }
1684                 free_devinfo_mem(info);
1685                 return domain;
1686         }
1687         list_add(&info->link, &domain->devices);
1688         list_add(&info->global, &device_domain_list);
1689         pdev->dev.archdata.iommu = info;
1690         spin_unlock_irqrestore(&device_domain_lock, flags);
1691         return domain;
1692 error:
1693         /* recheck here; another thread may have set the domain meanwhile */
1694         return find_domain(pdev);
1695 }
1696
1697 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1698                                       unsigned long long start,
1699                                       unsigned long long end)
1700 {
1701         struct dmar_domain *domain;
1702         unsigned long size;
1703         unsigned long long base;
1704         int ret;
1705
1706         printk(KERN_INFO
1707                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1708                 pci_name(pdev), start, end);
1709         /* page table init */
1710         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1711         if (!domain)
1712                 return -ENOMEM;
1713
1714         /* The address might not be aligned */
1715         base = start & PAGE_MASK;
1716         size = end - base;
1717         size = PAGE_ALIGN(size);
1718         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1719                         IOVA_PFN(base + size) - 1)) {
1720                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1721                 ret = -ENOMEM;
1722                 goto error;
1723         }
1724
1725         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1726                 size, base, pci_name(pdev));
1727         /*
1728          * RMRR range might have overlap with physical memory range,
1729          * clear it first
1730          */
1731         dma_pte_clear_range(domain, base, base + size);
1732
1733         ret = domain_page_mapping(domain, base, base, size,
1734                 DMA_PTE_READ|DMA_PTE_WRITE);
1735         if (ret)
1736                 goto error;
1737
1738         /* context entry init */
1739         ret = domain_context_mapping(domain, pdev);
1740         if (!ret)
1741                 return 0;
1742 error:
1743         domain_exit(domain);
1744         return ret;
1745
1746 }
1747
1748 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1749         struct pci_dev *pdev)
1750 {
1751         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1752                 return 0;
1753         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1754                 rmrr->end_address + 1);
1755 }
1756
1757 #ifdef CONFIG_DMAR_GFX_WA
1758 struct iommu_prepare_data {
1759         struct pci_dev *pdev;
1760         int ret;
1761 };
1762
1763 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1764                                          unsigned long end_pfn, void *datax)
1765 {
1766         struct iommu_prepare_data *data;
1767
1768         data = (struct iommu_prepare_data *)datax;
1769
1770         data->ret = iommu_prepare_identity_map(data->pdev,
1771                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1772         return data->ret;
1773
1774 }
1775
1776 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1777 {
1778         int nid;
1779         struct iommu_prepare_data data;
1780
1781         data.pdev = pdev;
1782         data.ret = 0;
1783
1784         for_each_online_node(nid) {
1785                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1786                 if (data.ret)
1787                         return data.ret;
1788         }
1789         return data.ret;
1790 }
1791
1792 static void __init iommu_prepare_gfx_mapping(void)
1793 {
1794         struct pci_dev *pdev = NULL;
1795         int ret;
1796
1797         for_each_pci_dev(pdev) {
1798                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1799                                 !IS_GFX_DEVICE(pdev))
1800                         continue;
1801                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1802                         pci_name(pdev));
1803                 ret = iommu_prepare_with_active_regions(pdev);
1804                 if (ret)
1805                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1806         }
1807 }
1808 #else /* !CONFIG_DMAR_GFX_WA */
1809 static inline void iommu_prepare_gfx_mapping(void)
1810 {
1811         return;
1812 }
1813 #endif
1814
1815 #ifdef CONFIG_DMAR_FLOPPY_WA
1816 static inline void iommu_prepare_isa(void)
1817 {
1818         struct pci_dev *pdev;
1819         int ret;
1820
1821         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1822         if (!pdev)
1823                 return;
1824
1825         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1826         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1827
1828         if (ret)
1829                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1830                         "floppy might not work\n");
1831
1832 }
1833 #else
1834 static inline void iommu_prepare_isa(void)
1835 {
1836         return;
1837 }
1838 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1839
1840 static int __init init_dmars(void)
1841 {
1842         struct dmar_drhd_unit *drhd;
1843         struct dmar_rmrr_unit *rmrr;
1844         struct pci_dev *pdev;
1845         struct intel_iommu *iommu;
1846         int i, ret;
1847
1848         /*
1849          * for each drhd
1850          *    allocate root
1851          *    initialize and program root entry to not present
1852          * endfor
1853          */
1854         for_each_drhd_unit(drhd) {
1855                 g_num_of_iommus++;
1856                 /*
1857                  * lock not needed as this is only incremented in the
1858                  * single-threaded kernel __init code path; all other
1859                  * accesses are read only
1860                  */
1861         }
1862
1863         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1864                         GFP_KERNEL);
1865         if (!g_iommus) {
1866                 printk(KERN_ERR "Allocating global iommu array failed\n");
1867                 ret = -ENOMEM;
1868                 goto error;
1869         }
1870
1871         deferred_flush = kzalloc(g_num_of_iommus *
1872                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1873         if (!deferred_flush) {
1874                 kfree(g_iommus);
1875                 ret = -ENOMEM;
1876                 goto error;
1877         }
1878
1879         for_each_drhd_unit(drhd) {
1880                 if (drhd->ignored)
1881                         continue;
1882
1883                 iommu = drhd->iommu;
1884                 g_iommus[iommu->seq_id] = iommu;
1885
1886                 ret = iommu_init_domains(iommu);
1887                 if (ret)
1888                         goto error;
1889
1890                 /*
1891                  * TBD:
1892                  * we could share the same root & context tables
1893                  * among all IOMMUs.  Needs to be split out later.
1894                  */
1895                 ret = iommu_alloc_root_entry(iommu);
1896                 if (ret) {
1897                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1898                         goto error;
1899                 }
1900         }
1901
1902         /*
1903          * Start from a sane IOMMU hardware state.
1904          */
1905         for_each_drhd_unit(drhd) {
1906                 if (drhd->ignored)
1907                         continue;
1908
1909                 iommu = drhd->iommu;
1910
1911                 /*
1912                  * If the queued invalidation is already initialized by us
1913                  * (for example, while enabling interrupt-remapping) then
1914                  * we already have things rolling from a sane state.
1915                  */
1916                 if (iommu->qi)
1917                         continue;
1918
1919                 /*
1920                  * Clear any previous faults.
1921                  */
1922                 dmar_fault(-1, iommu);
1923                 /*
1924                  * Disable queued invalidation if supported and already enabled
1925                  * before OS handover.
1926                  */
1927                 dmar_disable_qi(iommu);
1928         }
1929
1930         for_each_drhd_unit(drhd) {
1931                 if (drhd->ignored)
1932                         continue;
1933
1934                 iommu = drhd->iommu;
1935
1936                 if (dmar_enable_qi(iommu)) {
1937                         /*
1938                          * Queued Invalidate not enabled, use Register Based
1939                          * Invalidate
1940                          */
1941                         iommu->flush.flush_context = __iommu_flush_context;
1942                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1943                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1944                                "invalidation\n",
1945                                (unsigned long long)drhd->reg_base_addr);
1946                 } else {
1947                         iommu->flush.flush_context = qi_flush_context;
1948                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1949                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1950                                "invalidation\n",
1951                                (unsigned long long)drhd->reg_base_addr);
1952                 }
1953         }
1954
1955 #ifdef CONFIG_INTR_REMAP
1956         if (!intr_remapping_enabled) {
1957                 ret = enable_intr_remapping(0);
1958                 if (ret)
1959                         printk(KERN_ERR
1960                                "IOMMU: enable interrupt remapping failed\n");
1961         }
1962 #endif
1963
1964         /*
1965          * For each rmrr
1966          *   for each dev attached to rmrr
1967          *   do
1968          *     locate drhd for dev, alloc domain for dev
1969          *     allocate free domain
1970          *     allocate page table entries for rmrr
1971          *     if context not allocated for bus
1972          *           allocate and init context
1973          *           set present in root table for this bus
1974          *     init context with domain, translation etc
1975          *    endfor
1976          * endfor
1977          */
1978         for_each_rmrr_units(rmrr) {
1979                 for (i = 0; i < rmrr->devices_cnt; i++) {
1980                         pdev = rmrr->devices[i];
1981                         /* some BIOSes list non-existent devices in the DMAR table */
1982                         if (!pdev)
1983                                 continue;
1984                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1985                         if (ret)
1986                                 printk(KERN_ERR
1987                                  "IOMMU: mapping reserved region failed\n");
1988                 }
1989         }
1990
1991         iommu_prepare_gfx_mapping();
1992
1993         iommu_prepare_isa();
1994
1995         /*
1996          * for each drhd
1997          *   enable fault log
1998          *   global invalidate context cache
1999          *   global invalidate iotlb
2000          *   enable translation
2001          */
2002         for_each_drhd_unit(drhd) {
2003                 if (drhd->ignored)
2004                         continue;
2005                 iommu = drhd->iommu;
2006
2007                 iommu_flush_write_buffer(iommu);
2008
2009                 ret = dmar_set_interrupt(iommu);
2010                 if (ret)
2011                         goto error;
2012
2013                 iommu_set_root_entry(iommu);
2014
2015                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2016                                            0);
2017                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2018                                          0);
2019                 iommu_disable_protect_mem_regions(iommu);
2020
2021                 ret = iommu_enable_translation(iommu);
2022                 if (ret)
2023                         goto error;
2024         }
2025
2026         return 0;
2027 error:
2028         for_each_drhd_unit(drhd) {
2029                 if (drhd->ignored)
2030                         continue;
2031                 iommu = drhd->iommu;
2032                 free_iommu(iommu);
2033         }
2034         kfree(g_iommus);
2035         return ret;
2036 }
2037
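/*
 * Bytes needed to map @size bytes starting at @host_addr, rounded up to whole
 * pages (the offset of host_addr within its first page is included).
 */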
2038 static inline u64 aligned_size(u64 host_addr, size_t size)
2039 {
2040         u64 addr;
2041         addr = (host_addr & (~PAGE_MASK)) + size;
2042         return PAGE_ALIGN(addr);
2043 }
2044
2045 struct iova *
2046 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2047 {
2048         struct iova *piova;
2049
2050         /* Make sure it's in range */
2051         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2052         if (!size || (IOVA_START_ADDR + size > end))
2053                 return NULL;
2054
2055         piova = alloc_iova(&domain->iovad,
2056                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2057         return piova;
2058 }
2059
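/*
 * Allocate an IOVA range of @size bytes for @dev.  Unless the device is
 * 32-bit only (or forcedac is set), try the 32-bit space first and fall back
 * to the full @dma_mask range.
 */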
2060 static struct iova *
2061 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2062                    size_t size, u64 dma_mask)
2063 {
2064         struct pci_dev *pdev = to_pci_dev(dev);
2065         struct iova *iova = NULL;
2066
2067         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2068                 iova = iommu_alloc_iova(domain, size, dma_mask);
2069         else {
2070                 /*
2071                  * First try to allocate an io virtual address in
2072                  * DMA_32BIT_MASK and if that fails then try allocating
2073                  * from higher range
2074                  */
2075                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2076                 if (!iova)
2077                         iova = iommu_alloc_iova(domain, size, dma_mask);
2078         }
2079
2080         if (!iova) {
2081                 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
2082                 return NULL;
2083         }
2084
2085         return iova;
2086 }
2087
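/*
 * Return the domain for @pdev, allocating one and establishing the context
 * mapping if the device does not have them yet.
 */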
2088 static struct dmar_domain *
2089 get_valid_domain_for_dev(struct pci_dev *pdev)
2090 {
2091         struct dmar_domain *domain;
2092         int ret;
2093
2094         domain = get_domain_for_dev(pdev,
2095                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2096         if (!domain) {
2097                 printk(KERN_ERR
2098                         "Allocating domain for %s failed", pci_name(pdev));
2099                 return NULL;
2100         }
2101
2102         /* make sure context mapping is ok */
2103         if (unlikely(!domain_context_mapped(pdev))) {
2104                 ret = domain_context_mapping(domain, pdev);
2105                 if (ret) {
2106                         printk(KERN_ERR
2107                                 "Domain context map for %s failed",
2108                                 pci_name(pdev));
2109                         return NULL;
2110                 }
2111         }
2112
2113         return domain;
2114 }
2115
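/*
 * Core mapping path: allocate an IOVA range, install the page-table entries
 * for the physical range and flush the IOTLB (or the write buffer) before
 * returning the bus address.  Pass-through devices get @paddr back unchanged.
 */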
2116 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2117                                      size_t size, int dir, u64 dma_mask)
2118 {
2119         struct pci_dev *pdev = to_pci_dev(hwdev);
2120         struct dmar_domain *domain;
2121         phys_addr_t start_paddr;
2122         struct iova *iova;
2123         int prot = 0;
2124         int ret;
2125         struct intel_iommu *iommu;
2126
2127         BUG_ON(dir == DMA_NONE);
2128         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2129                 return paddr;
2130
2131         domain = get_valid_domain_for_dev(pdev);
2132         if (!domain)
2133                 return 0;
2134
2135         iommu = domain_get_iommu(domain);
2136         size = aligned_size((u64)paddr, size);
2137
2138         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2139         if (!iova)
2140                 goto error;
2141
2142         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2143
2144         /*
2145          * Check if DMAR supports zero-length reads on write-only
2146          * mappings.
2147          */
2148         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2149                         !cap_zlr(iommu->cap))
2150                 prot |= DMA_PTE_READ;
2151         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2152                 prot |= DMA_PTE_WRITE;
2153         /*
2154          * The range paddr .. paddr + size might cover only part of a page, so
2155          * map the whole page.  Note: if two parts of one page are mapped
2156          * separately, we might end up with two guest addresses mapping to the
2157          * same host paddr, but this is not a big problem.
2158          */
2159         ret = domain_page_mapping(domain, start_paddr,
2160                 ((u64)paddr) & PAGE_MASK, size, prot);
2161         if (ret)
2162                 goto error;
2163
2164         /* it's a non-present to present mapping */
2165         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2166                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2167         if (ret)
2168                 iommu_flush_write_buffer(iommu);
2169
2170         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2171
2172 error:
2173         if (iova)
2174                 __free_iova(&domain->iovad, iova);
2175         printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2176                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2177         return 0;
2178 }
2179
2180 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2181                                  unsigned long offset, size_t size,
2182                                  enum dma_data_direction dir,
2183                                  struct dma_attrs *attrs)
2184 {
2185         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2186                                   dir, to_pci_dev(dev)->dma_mask);
2187 }
2188
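/*
 * Drain the deferred-unmap tables: one global IOTLB flush per IOMMU, then
 * free the queued IOVAs.  Called with async_umap_flush_lock held.
 */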
2189 static void flush_unmaps(void)
2190 {
2191         int i, j;
2192
2193         timer_on = 0;
2194
2195         /* just flush them all */
2196         for (i = 0; i < g_num_of_iommus; i++) {
2197                 struct intel_iommu *iommu = g_iommus[i];
2198                 if (!iommu)
2199                         continue;
2200
2201                 if (deferred_flush[i].next) {
2202                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2203                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2204                         for (j = 0; j < deferred_flush[i].next; j++) {
2205                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2206                                                 deferred_flush[i].iova[j]);
2207                         }
2208                         deferred_flush[i].next = 0;
2209                 }
2210         }
2211
2212         list_size = 0;
2213 }
2214
2215 static void flush_unmaps_timeout(unsigned long data)
2216 {
2217         unsigned long flags;
2218
2219         spin_lock_irqsave(&async_umap_flush_lock, flags);
2220         flush_unmaps();
2221         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2222 }
2223
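/*
 * Queue an IOVA for deferred freeing on its IOMMU.  Flush immediately once
 * HIGH_WATER_MARK entries are pending, otherwise arm a 10ms timer.
 */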
2224 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2225 {
2226         unsigned long flags;
2227         int next, iommu_id;
2228         struct intel_iommu *iommu;
2229
2230         spin_lock_irqsave(&async_umap_flush_lock, flags);
2231         if (list_size == HIGH_WATER_MARK)
2232                 flush_unmaps();
2233
2234         iommu = domain_get_iommu(dom);
2235         iommu_id = iommu->seq_id;
2236
2237         next = deferred_flush[iommu_id].next;
2238         deferred_flush[iommu_id].domain[next] = dom;
2239         deferred_flush[iommu_id].iova[next] = iova;
2240         deferred_flush[iommu_id].next++;
2241
2242         if (!timer_on) {
2243                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2244                 timer_on = 1;
2245         }
2246         list_size++;
2247         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2248 }
2249
2250 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2251                              size_t size, enum dma_data_direction dir,
2252                              struct dma_attrs *attrs)
2253 {
2254         struct pci_dev *pdev = to_pci_dev(dev);
2255         struct dmar_domain *domain;
2256         unsigned long start_addr;
2257         struct iova *iova;
2258         struct intel_iommu *iommu;
2259
2260         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2261                 return;
2262         domain = find_domain(pdev);
2263         BUG_ON(!domain);
2264
2265         iommu = domain_get_iommu(domain);
2266
2267         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2268         if (!iova)
2269                 return;
2270
2271         start_addr = iova->pfn_lo << PAGE_SHIFT;
2272         size = aligned_size((u64)dev_addr, size);
2273
2274         pr_debug("Device %s unmapping: %zx@%llx\n",
2275                 pci_name(pdev), size, (unsigned long long)start_addr);
2276
2277         /*  clear the whole page */
2278         dma_pte_clear_range(domain, start_addr, start_addr + size);
2279         /* free page tables */
2280         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2281         if (intel_iommu_strict) {
2282                 if (iommu_flush_iotlb_psi(iommu,
2283                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2284                         iommu_flush_write_buffer(iommu);
2285                 /* free iova */
2286                 __free_iova(&domain->iovad, iova);
2287         } else {
2288                 add_unmap(domain, iova);
2289                 /*
2290                  * queue up the release of the unmap to save the 1/6th of
2291                  * the CPU time used up by the iotlb flush operation...
2292                  */
2293         }
2294 }
2295
2296 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2297                                int dir)
2298 {
2299         intel_unmap_page(dev, dev_addr, size, dir, NULL);
2300 }
2301
2302 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2303                                   dma_addr_t *dma_handle, gfp_t flags)
2304 {
2305         void *vaddr;
2306         int order;
2307
2308         size = PAGE_ALIGN(size);
2309         order = get_order(size);
2310         flags &= ~(GFP_DMA | GFP_DMA32);
2311
2312         vaddr = (void *)__get_free_pages(flags, order);
2313         if (!vaddr)
2314                 return NULL;
2315         memset(vaddr, 0, size);
2316
2317         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2318                                          DMA_BIDIRECTIONAL,
2319                                          hwdev->coherent_dma_mask);
2320         if (*dma_handle)
2321                 return vaddr;
2322         free_pages((unsigned long)vaddr, order);
2323         return NULL;
2324 }
2325
2326 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2327                                 dma_addr_t dma_handle)
2328 {
2329         int order;
2330
2331         size = PAGE_ALIGN(size);
2332         order = get_order(size);
2333
2334         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2335         free_pages((unsigned long)vaddr, order);
2336 }
2337
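/*
 * Unmap a scatterlist: clear and free the page tables covering the whole
 * contiguous IOVA range, flush the IOTLB and release the IOVA.
 */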
2338 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2339                            int nelems, enum dma_data_direction dir,
2340                            struct dma_attrs *attrs)
2341 {
2342         int i;
2343         struct pci_dev *pdev = to_pci_dev(hwdev);
2344         struct dmar_domain *domain;
2345         unsigned long start_addr;
2346         struct iova *iova;
2347         size_t size = 0;
2348         phys_addr_t addr;
2349         struct scatterlist *sg;
2350         struct intel_iommu *iommu;
2351
2352         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2353                 return;
2354
2355         domain = find_domain(pdev);
2356         BUG_ON(!domain);
2357
2358         iommu = domain_get_iommu(domain);
2359
2360         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2361         if (!iova)
2362                 return;
2363         for_each_sg(sglist, sg, nelems, i) {
2364                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2365                 size += aligned_size((u64)addr, sg->length);
2366         }
2367
2368         start_addr = iova->pfn_lo << PAGE_SHIFT;
2369
2370         /*  clear the whole page */
2371         dma_pte_clear_range(domain, start_addr, start_addr + size);
2372         /* free page tables */
2373         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2374
2375         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2376                         size >> VTD_PAGE_SHIFT, 0))
2377                 iommu_flush_write_buffer(iommu);
2378
2379         /* free iova */
2380         __free_iova(&domain->iovad, iova);
2381 }
2382
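/*
 * Scatterlist "mapping" for devices that bypass the IOMMU: the DMA address
 * is simply the physical address of each segment.
 */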
2383 static int intel_nontranslate_map_sg(struct device *hddev,
2384         struct scatterlist *sglist, int nelems, int dir)
2385 {
2386         int i;
2387         struct scatterlist *sg;
2388
2389         for_each_sg(sglist, sg, nelems, i) {
2390                 BUG_ON(!sg_page(sg));
2391                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2392                 sg->dma_length = sg->length;
2393         }
2394         return nelems;
2395 }
2396
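/*
 * Map a scatterlist into one contiguous IOVA range.  On failure the portion
 * mapped so far is torn down and 0 is returned.
 */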
2397 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2398                         enum dma_data_direction dir, struct dma_attrs *attrs)
2399 {
2400         phys_addr_t addr;
2401         int i;
2402         struct pci_dev *pdev = to_pci_dev(hwdev);
2403         struct dmar_domain *domain;
2404         size_t size = 0;
2405         int prot = 0;
2406         size_t offset = 0;
2407         struct iova *iova = NULL;
2408         int ret;
2409         struct scatterlist *sg;
2410         unsigned long start_addr;
2411         struct intel_iommu *iommu;
2412
2413         BUG_ON(dir == DMA_NONE);
2414         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2415                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2416
2417         domain = get_valid_domain_for_dev(pdev);
2418         if (!domain)
2419                 return 0;
2420
2421         iommu = domain_get_iommu(domain);
2422
2423         for_each_sg(sglist, sg, nelems, i) {
2424                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2425                 size += aligned_size((u64)addr, sg->length);
2426         }
2427
2428         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2429         if (!iova) {
2430                 sglist->dma_length = 0;
2431                 return 0;
2432         }
2433
2434         /*
2435          * Check if DMAR supports zero-length reads on write-only
2436          * mappings.
2437          */
2438         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2439                         !cap_zlr(iommu->cap))
2440                 prot |= DMA_PTE_READ;
2441         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2442                 prot |= DMA_PTE_WRITE;
2443
2444         start_addr = iova->pfn_lo << PAGE_SHIFT;
2445         offset = 0;
2446         for_each_sg(sglist, sg, nelems, i) {
2447                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2448                 size = aligned_size((u64)addr, sg->length);
2449                 ret = domain_page_mapping(domain, start_addr + offset,
2450                         ((u64)addr) & PAGE_MASK,
2451                         size, prot);
2452                 if (ret) {
2453                         /*  clear the page */
2454                         dma_pte_clear_range(domain, start_addr,
2455                                   start_addr + offset);
2456                         /* free page tables */
2457                         dma_pte_free_pagetable(domain, start_addr,
2458                                   start_addr + offset);
2459                         /* free iova */
2460                         __free_iova(&domain->iovad, iova);
2461                         return 0;
2462                 }
2463                 sg->dma_address = start_addr + offset +
2464                                 ((u64)addr & (~PAGE_MASK));
2465                 sg->dma_length = sg->length;
2466                 offset += size;
2467         }
2468
2469         /* it's a non-present to present mapping */
2470         if (iommu_flush_iotlb_psi(iommu, domain->id,
2471                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2472                 iommu_flush_write_buffer(iommu);
2473         return nelems;
2474 }
2475
2476 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2477 {
2478         return !dma_addr;
2479 }
2480
2481 struct dma_map_ops intel_dma_ops = {
2482         .alloc_coherent = intel_alloc_coherent,
2483         .free_coherent = intel_free_coherent,
2484         .map_sg = intel_map_sg,
2485         .unmap_sg = intel_unmap_sg,
2486         .map_page = intel_map_page,
2487         .unmap_page = intel_unmap_page,
2488         .mapping_error = intel_mapping_error,
2489 };
2490
2491 static inline int iommu_domain_cache_init(void)
2492 {
2493         int ret = 0;
2494
2495         iommu_domain_cache = kmem_cache_create("iommu_domain",
2496                                          sizeof(struct dmar_domain),
2497                                          0,
2498                                          SLAB_HWCACHE_ALIGN,
2500                                          NULL);
2501         if (!iommu_domain_cache) {
2502                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2503                 ret = -ENOMEM;
2504         }
2505
2506         return ret;
2507 }
2508
2509 static inline int iommu_devinfo_cache_init(void)
2510 {
2511         int ret = 0;
2512
2513         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2514                                          sizeof(struct device_domain_info),
2515                                          0,
2516                                          SLAB_HWCACHE_ALIGN,
2517                                          NULL);
2518         if (!iommu_devinfo_cache) {
2519                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2520                 ret = -ENOMEM;
2521         }
2522
2523         return ret;
2524 }
2525
2526 static inline int iommu_iova_cache_init(void)
2527 {
2528         int ret = 0;
2529
2530         iommu_iova_cache = kmem_cache_create("iommu_iova",
2531                                          sizeof(struct iova),
2532                                          0,
2533                                          SLAB_HWCACHE_ALIGN,
2534                                          NULL);
2535         if (!iommu_iova_cache) {
2536                 printk(KERN_ERR "Couldn't create iova cache\n");
2537                 ret = -ENOMEM;
2538         }
2539
2540         return ret;
2541 }
2542
2543 static int __init iommu_init_mempool(void)
2544 {
2545         int ret;
2546         ret = iommu_iova_cache_init();
2547         if (ret)
2548                 return ret;
2549
2550         ret = iommu_domain_cache_init();
2551         if (ret)
2552                 goto domain_error;
2553
2554         ret = iommu_devinfo_cache_init();
2555         if (!ret)
2556                 return ret;
2557
2558         kmem_cache_destroy(iommu_domain_cache);
2559 domain_error:
2560         kmem_cache_destroy(iommu_iova_cache);
2561
2562         return -ENOMEM;
2563 }
2564
2565 static void __init iommu_exit_mempool(void)
2566 {
2567         kmem_cache_destroy(iommu_devinfo_cache);
2568         kmem_cache_destroy(iommu_domain_cache);
2569         kmem_cache_destroy(iommu_iova_cache);
2570
2571 }
2572
2573 static void __init init_no_remapping_devices(void)
2574 {
2575         struct dmar_drhd_unit *drhd;
2576
2577         for_each_drhd_unit(drhd) {
2578                 if (!drhd->include_all) {
2579                         int i;
2580                         for (i = 0; i < drhd->devices_cnt; i++)
2581                                 if (drhd->devices[i] != NULL)
2582                                         break;
2583                         /* ignore DMAR unit if no pci devices exist */
2584                         if (i == drhd->devices_cnt)
2585                                 drhd->ignored = 1;
2586                 }
2587         }
2588
2589         if (dmar_map_gfx)
2590                 return;
2591
2592         for_each_drhd_unit(drhd) {
2593                 int i;
2594                 if (drhd->ignored || drhd->include_all)
2595                         continue;
2596
2597                 for (i = 0; i < drhd->devices_cnt; i++)
2598                         if (drhd->devices[i] &&
2599                                 !IS_GFX_DEVICE(drhd->devices[i]))
2600                                 break;
2601
2602                 if (i < drhd->devices_cnt)
2603                         continue;
2604
2605                 /* bypass IOMMU if it is just for gfx devices */
2606                 drhd->ignored = 1;
2607                 for (i = 0; i < drhd->devices_cnt; i++) {
2608                         if (!drhd->devices[i])
2609                                 continue;
2610                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2611                 }
2612         }
2613 }
2614
2615 #ifdef CONFIG_SUSPEND
2616 static int init_iommu_hw(void)
2617 {
2618         struct dmar_drhd_unit *drhd;
2619         struct intel_iommu *iommu = NULL;
2620
2621         for_each_active_iommu(iommu, drhd)
2622                 if (iommu->qi)
2623                         dmar_reenable_qi(iommu);
2624
2625         for_each_active_iommu(iommu, drhd) {
2626                 iommu_flush_write_buffer(iommu);
2627
2628                 iommu_set_root_entry(iommu);
2629
2630                 iommu->flush.flush_context(iommu, 0, 0, 0,
2631                                                 DMA_CCMD_GLOBAL_INVL, 0);
2632                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2633                                                 DMA_TLB_GLOBAL_FLUSH, 0);
2634                 iommu_disable_protect_mem_regions(iommu);
2635                 iommu_enable_translation(iommu);
2636         }
2637
2638         return 0;
2639 }
2640
2641 static void iommu_flush_all(void)
2642 {
2643         struct dmar_drhd_unit *drhd;
2644         struct intel_iommu *iommu;
2645
2646         for_each_active_iommu(iommu, drhd) {
2647                 iommu->flush.flush_context(iommu, 0, 0, 0,
2648                                                 DMA_CCMD_GLOBAL_INVL, 0);
2649                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2650                                                 DMA_TLB_GLOBAL_FLUSH, 0);
2651         }
2652 }
2653
2654 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
2655 {
2656         struct dmar_drhd_unit *drhd;
2657         struct intel_iommu *iommu = NULL;
2658         unsigned long flag;
2659
2660         for_each_active_iommu(iommu, drhd) {
2661                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
2662                                                  GFP_ATOMIC);
2663                 if (!iommu->iommu_state)
2664                         goto nomem;
2665         }
2666
2667         iommu_flush_all();
2668
2669         for_each_active_iommu(iommu, drhd) {
2670                 iommu_disable_translation(iommu);
2671
2672                 spin_lock_irqsave(&iommu->register_lock, flag);
2673
2674                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2675                         readl(iommu->reg + DMAR_FECTL_REG);
2676                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2677                         readl(iommu->reg + DMAR_FEDATA_REG);
2678                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2679                         readl(iommu->reg + DMAR_FEADDR_REG);
2680                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2681                         readl(iommu->reg + DMAR_FEUADDR_REG);
2682
2683                 spin_unlock_irqrestore(&iommu->register_lock, flag);
2684         }
2685         return 0;
2686
2687 nomem:
2688         for_each_active_iommu(iommu, drhd)
2689                 kfree(iommu->iommu_state);
2690
2691         return -ENOMEM;
2692 }
2693
2694 static int iommu_resume(struct sys_device *dev)
2695 {
2696         struct dmar_drhd_unit *drhd;
2697         struct intel_iommu *iommu = NULL;
2698         unsigned long flag;
2699
2700         if (init_iommu_hw()) {
2701                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2702                 return -EIO;
2703         }
2704
2705         for_each_active_iommu(iommu, drhd) {
2706
2707                 spin_lock_irqsave(&iommu->register_lock, flag);
2708
2709                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2710                         iommu->reg + DMAR_FECTL_REG);
2711                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2712                         iommu->reg + DMAR_FEDATA_REG);
2713                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2714                         iommu->reg + DMAR_FEADDR_REG);
2715                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2716                         iommu->reg + DMAR_FEUADDR_REG);
2717
2718                 spin_unlock_irqrestore(&iommu->register_lock, flag);
2719         }
2720
2721         for_each_active_iommu(iommu, drhd)
2722                 kfree(iommu->iommu_state);
2723
2724         return 0;
2725 }
2726
2727 static struct sysdev_class iommu_sysclass = {
2728         .name           = "iommu",
2729         .resume         = iommu_resume,
2730         .suspend        = iommu_suspend,
2731 };
2732
2733 static struct sys_device device_iommu = {
2734         .cls    = &iommu_sysclass,
2735 };
2736
2737 static int __init init_iommu_sysfs(void)
2738 {
2739         int error;
2740
2741         error = sysdev_class_register(&iommu_sysclass);
2742         if (error)
2743                 return error;
2744
2745         error = sysdev_register(&device_iommu);
2746         if (error)
2747                 sysdev_class_unregister(&iommu_sysclass);
2748
2749         return error;
2750 }
2751
2752 #else
2753 static int __init init_iommu_sysfs(void)
2754 {
2755         return 0;
2756 }
2757 #endif  /* CONFIG_SUSPEND */
2758
2759 int __init intel_iommu_init(void)
2760 {
2761         int ret = 0;
2762
2763         if (dmar_table_init())
2764                 return  -ENODEV;
2765
2766         if (dmar_dev_scope_init())
2767                 return  -ENODEV;
2768
2769         /*
2770          * Check the need for DMA-remapping initialization now.
2771          * Above initialization will also be used by Interrupt-remapping.
2772          */
2773         if (no_iommu || swiotlb || dmar_disabled)
2774                 return -ENODEV;
2775
2776         iommu_init_mempool();
2777         dmar_init_reserved_ranges();
2778
2779         init_no_remapping_devices();
2780
2781         ret = init_dmars();
2782         if (ret) {
2783                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2784                 put_iova_domain(&reserved_iova_list);
2785                 iommu_exit_mempool();
2786                 return ret;
2787         }
2788         printk(KERN_INFO
2789         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2790
2791         init_timer(&unmap_timer);
2792         force_iommu = 1;
2793         dma_ops = &intel_dma_ops;
2794         init_iommu_sysfs();
2795
2796         register_iommu(&intel_iommu_ops);
2797
2798         return 0;
2799 }
2800
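/*
 * Attach @pdev to @domain: allocate a device_domain_info, link it into the
 * domain and global lists and record it in pdev->dev.archdata.iommu.
 */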
2801 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2802                                   struct pci_dev *pdev)
2803 {
2804         struct device_domain_info *info;
2805         unsigned long flags;
2806
2807         info = alloc_devinfo_mem();
2808         if (!info)
2809                 return -ENOMEM;
2810
2811         info->bus = pdev->bus->number;
2812         info->devfn = pdev->devfn;
2813         info->dev = pdev;
2814         info->domain = domain;
2815
2816         spin_lock_irqsave(&device_domain_lock, flags);
2817         list_add(&info->link, &domain->devices);
2818         list_add(&info->global, &device_domain_list);
2819         pdev->dev.archdata.iommu = info;
2820         spin_unlock_irqrestore(&device_domain_lock, flags);
2821
2822         return 0;
2823 }
2824
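/*
 * Devices behind a PCIe-to-PCI bridge are addressed through the bridge, so
 * also detach every intermediate bridge and the PCIe-to-PCI bridge itself.
 */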
2825 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
2826                                            struct pci_dev *pdev)
2827 {
2828         struct pci_dev *tmp, *parent;
2829
2830         if (!iommu || !pdev)
2831                 return;
2832
2833         /* dependent device detach */
2834         tmp = pci_find_upstream_pcie_bridge(pdev);
2835         /* a PCIe-to-PCI bridge is addressed by its secondary bus number and devfn 0 */
2836         if (tmp) {
2837                 parent = pdev->bus->self;
2838                 while (parent != tmp) {
2839                         iommu_detach_dev(iommu, parent->bus->number,
2840                                 parent->devfn);
2841                         parent = parent->bus->self;
2842                 }
2843                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
2844                         iommu_detach_dev(iommu,
2845                                 tmp->subordinate->number, 0);
2846                 else /* this is a legacy PCI bridge */
2847                         iommu_detach_dev(iommu,
2848                                 tmp->bus->number, tmp->devfn);
2849         }
2850 }
2851
2852 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2853                                           struct pci_dev *pdev)
2854 {
2855         struct device_domain_info *info;
2856         struct intel_iommu *iommu;
2857         unsigned long flags;
2858         int found = 0;
2859         struct list_head *entry, *tmp;
2860
2861         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2862         if (!iommu)
2863                 return;
2864
2865         spin_lock_irqsave(&device_domain_lock, flags);
2866         list_for_each_safe(entry, tmp, &domain->devices) {
2867                 info = list_entry(entry, struct device_domain_info, link);
2868                 if (info->bus == pdev->bus->number &&
2869                     info->devfn == pdev->devfn) {
2870                         list_del(&info->link);
2871                         list_del(&info->global);
2872                         if (info->dev)
2873                                 info->dev->dev.archdata.iommu = NULL;
2874                         spin_unlock_irqrestore(&device_domain_lock, flags);
2875
2876                         iommu_detach_dev(iommu, info->bus, info->devfn);
2877                         iommu_detach_dependent_devices(iommu, pdev);
2878                         free_devinfo_mem(info);
2879
2880                         spin_lock_irqsave(&device_domain_lock, flags);
2881
2882                         if (found)
2883                                 break;
2884                         else
2885                                 continue;
2886                 }
2887
2888                 /* if there are no other devices under the same iommu
2889                  * owned by this domain, clear this iommu from iommu_bmp,
2890                  * update the iommu count and coherency
2891                  */
2892                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2893                         found = 1;
2894         }
2895
2896         if (found == 0) {
2897                 unsigned long tmp_flags;
2898                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2899                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2900                 domain->iommu_count--;
2901                 domain_update_iommu_cap(domain);
2902                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2903         }
2904
2905         spin_unlock_irqrestore(&device_domain_lock, flags);
2906 }
2907
2908 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2909 {
2910         struct device_domain_info *info;
2911         struct intel_iommu *iommu;
2912         unsigned long flags1, flags2;
2913
2914         spin_lock_irqsave(&device_domain_lock, flags1);
2915         while (!list_empty(&domain->devices)) {
2916                 info = list_entry(domain->devices.next,
2917                         struct device_domain_info, link);
2918                 list_del(&info->link);
2919                 list_del(&info->global);
2920                 if (info->dev)
2921                         info->dev->dev.archdata.iommu = NULL;
2922
2923                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2924
2925                 iommu = device_to_iommu(info->bus, info->devfn);
2926                 iommu_detach_dev(iommu, info->bus, info->devfn);
2927                 iommu_detach_dependent_devices(iommu, info->dev);
2928
2929                 /* clear this iommu in iommu_bmp, update iommu count
2930                  * and capabilities
2931                  */
2932                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2933                 if (test_and_clear_bit(iommu->seq_id,
2934                                        &domain->iommu_bmp)) {
2935                         domain->iommu_count--;
2936                         domain_update_iommu_cap(domain);
2937                 }
2938                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2939
2940                 free_devinfo_mem(info);
2941                 spin_lock_irqsave(&device_domain_lock, flags1);
2942         }
2943         spin_unlock_irqrestore(&device_domain_lock, flags1);
2944 }
2945
2946 /* domain id for virtual machine; it won't be set in the context entry */
2947 static unsigned long vm_domid;
2948
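/*
 * Smallest AGAW among all IOMMUs serving this domain; it bounds the address
 * width the domain can safely map.
 */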
2949 static int vm_domain_min_agaw(struct dmar_domain *domain)
2950 {
2951         int i;
2952         int min_agaw = domain->agaw;
2953
2954         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2955         for (; i < g_num_of_iommus; ) {
2956                 if (min_agaw > g_iommus[i]->agaw)
2957                         min_agaw = g_iommus[i]->agaw;
2958
2959                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2960         }
2961
2962         return min_agaw;
2963 }
2964
2965 static struct dmar_domain *iommu_alloc_vm_domain(void)
2966 {
2967         struct dmar_domain *domain;
2968
2969         domain = alloc_domain_mem();
2970         if (!domain)
2971                 return NULL;
2972
2973         domain->id = vm_domid++;
2974         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2975         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2976
2977         return domain;
2978 }
2979
2980 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2981 {
2982         int adjust_width;
2983
2984         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2985         spin_lock_init(&domain->mapping_lock);
2986         spin_lock_init(&domain->iommu_lock);
2987
2988         domain_reserve_special_ranges(domain);
2989
2990         /* calculate AGAW */
2991         domain->gaw = guest_width;
2992         adjust_width = guestwidth_to_adjustwidth(guest_width);
2993         domain->agaw = width_to_agaw(adjust_width);
2994
2995         INIT_LIST_HEAD(&domain->devices);
2996
2997         domain->iommu_count = 0;
2998         domain->iommu_coherency = 0;
2999         domain->max_addr = 0;
3000
3001         /* always allocate the top pgd */
3002         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
3003         if (!domain->pgd)
3004                 return -ENOMEM;
3005         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3006         return 0;
3007 }
3008
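/*
 * Release the VM domain's ids: clear its bit in each IOMMU's domain_ids
 * bitmap and drop it from the IOMMU's domains[] array.
 */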
3009 static void iommu_free_vm_domain(struct dmar_domain *domain)
3010 {
3011         unsigned long flags;
3012         struct dmar_drhd_unit *drhd;
3013         struct intel_iommu *iommu;
3014         unsigned long i;
3015         unsigned long ndomains;
3016
3017         for_each_drhd_unit(drhd) {
3018                 if (drhd->ignored)
3019                         continue;
3020                 iommu = drhd->iommu;
3021
3022                 ndomains = cap_ndoms(iommu->cap);
3023                 i = find_first_bit(iommu->domain_ids, ndomains);
3024                 for (; i < ndomains; ) {
3025                         if (iommu->domains[i] == domain) {
3026                                 spin_lock_irqsave(&iommu->lock, flags);
3027                                 clear_bit(i, iommu->domain_ids);
3028                                 iommu->domains[i] = NULL;
3029                                 spin_unlock_irqrestore(&iommu->lock, flags);
3030                                 break;
3031                         }
3032                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3033                 }
3034         }
3035 }
3036
3037 static void vm_domain_exit(struct dmar_domain *domain)
3038 {
3039         u64 end;
3040
3041         /* Domain 0 is reserved, so don't process it */
3042         if (!domain)
3043                 return;
3044
3045         vm_domain_remove_all_dev_info(domain);
3046         /* destroy iovas */
3047         put_iova_domain(&domain->iovad);
3048         end = DOMAIN_MAX_ADDR(domain->gaw);
3049         end = end & (~VTD_PAGE_MASK);
3050
3051         /* clear ptes */
3052         dma_pte_clear_range(domain, 0, end);
3053
3054         /* free page tables */
3055         dma_pte_free_pagetable(domain, 0, end);
3056
3057         iommu_free_vm_domain(domain);
3058         free_domain_mem(domain);
3059 }
3060
3061 static int intel_iommu_domain_init(struct iommu_domain *domain)
3062 {
3063         struct dmar_domain *dmar_domain;
3064
3065         dmar_domain = iommu_alloc_vm_domain();
3066         if (!dmar_domain) {
3067                 printk(KERN_ERR
3068                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3069                 return -ENOMEM;
3070         }
3071         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3072                 printk(KERN_ERR
3073                         "intel_iommu_domain_init() failed\n");
3074                 vm_domain_exit(dmar_domain);
3075                 return -ENOMEM;
3076         }
3077         domain->priv = dmar_domain;
3078
3079         return 0;
3080 }
3081
3082 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3083 {
3084         struct dmar_domain *dmar_domain = domain->priv;
3085
3086         domain->priv = NULL;
3087         vm_domain_exit(dmar_domain);
3088 }
3089
3090 static int intel_iommu_attach_device(struct iommu_domain *domain,
3091                                      struct device *dev)
3092 {
3093         struct dmar_domain *dmar_domain = domain->priv;
3094         struct pci_dev *pdev = to_pci_dev(dev);
3095         struct intel_iommu *iommu;
3096         int addr_width;
3097         u64 end;
3098         int ret;
3099
3100         /* normally pdev is not mapped */
3101         if (unlikely(domain_context_mapped(pdev))) {
3102                 struct dmar_domain *old_domain;
3103
3104                 old_domain = find_domain(pdev);
3105                 if (old_domain) {
3106                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3107                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3108                         else
3109                                 domain_remove_dev_info(old_domain);
3110                 }
3111         }
3112
3113         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3114         if (!iommu)
3115                 return -ENODEV;
3116
3117         /* check if this iommu agaw is sufficient for max mapped address */
3118         addr_width = agaw_to_width(iommu->agaw);
3119         end = DOMAIN_MAX_ADDR(addr_width);
3120         end = end & VTD_PAGE_MASK;
3121         if (end < dmar_domain->max_addr) {
3122                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3123                        "sufficient for the mapped address (%llx)\n",
3124                        __func__, iommu->agaw, dmar_domain->max_addr);
3125                 return -EFAULT;
3126         }
3127
3128         ret = domain_context_mapping(dmar_domain, pdev);
3129         if (ret)
3130                 return ret;
3131
3132         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3133         return ret;
3134 }
3135
3136 static void intel_iommu_detach_device(struct iommu_domain *domain,
3137                                       struct device *dev)
3138 {
3139         struct dmar_domain *dmar_domain = domain->priv;
3140         struct pci_dev *pdev = to_pci_dev(dev);
3141
3142         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3143 }
3144
3145 static int intel_iommu_map_range(struct iommu_domain *domain,
3146                                  unsigned long iova, phys_addr_t hpa,
3147                                  size_t size, int iommu_prot)
3148 {
3149         struct dmar_domain *dmar_domain = domain->priv;
3150         u64 max_addr;
3151         int addr_width;
3152         int prot = 0;
3153         int ret;
3154
3155         if (iommu_prot & IOMMU_READ)
3156                 prot |= DMA_PTE_READ;
3157         if (iommu_prot & IOMMU_WRITE)
3158                 prot |= DMA_PTE_WRITE;
3159         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3160                 prot |= DMA_PTE_SNP;
3161
3162         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
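        /*
         * Example of the rounding above (illustrative numbers only): for
         * iova == 0x100500 and size == 0x1800, the base rounds down to
         * 0x100000 and the size rounds up to 0x2000, so max_addr becomes
         * 0x102000, one byte past the last page touched by this mapping.
         */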
3163         if (dmar_domain->max_addr < max_addr) {
3164                 int min_agaw;
3165                 u64 end;
3166
3167                 /* check if minimum agaw is sufficient for mapped address */
3168                 min_agaw = vm_domain_min_agaw(dmar_domain);
3169                 addr_width = agaw_to_width(min_agaw);
3170                 end = DOMAIN_MAX_ADDR(addr_width);
3171                 end = end & VTD_PAGE_MASK;
3172                 if (end < max_addr) {
3173                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3174                                "sufficient for the mapped address (%llx)\n",
3175                                __func__, min_agaw, max_addr);
3176                         return -EFAULT;
3177                 }
3178                 dmar_domain->max_addr = max_addr;
3179         }
3180
3181         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3182         return ret;
3183 }
3184
3185 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3186                                     unsigned long iova, size_t size)
3187 {
3188         struct dmar_domain *dmar_domain = domain->priv;
3189         dma_addr_t base;
3190
3191         /* The address might not be aligned */
3192         base = iova & VTD_PAGE_MASK;
3193         size = VTD_PAGE_ALIGN(size);
3194         dma_pte_clear_range(dmar_domain, base, base + size);
3195
3196         if (dmar_domain->max_addr == base + size)
3197                 dmar_domain->max_addr = base;
3198 }
3199
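/*
 * Look up the host physical address that the domain's page tables record for
 * @iova.  A return value of 0 means no PTE is present, so callers cannot
 * distinguish "not mapped" from a mapping at physical address 0.
 */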
3200 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3201                                             unsigned long iova)
3202 {
3203         struct dmar_domain *dmar_domain = domain->priv;
3204         struct dma_pte *pte;
3205         u64 phys = 0;
3206
3207         pte = addr_to_dma_pte(dmar_domain, iova);
3208         if (pte)
3209                 phys = dma_pte_addr(pte);
3210
3211         return phys;
3212 }
3213
3214 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3215                                       unsigned long cap)
3216 {
3217         struct dmar_domain *dmar_domain = domain->priv;
3218
3219         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3220                 return dmar_domain->iommu_snooping;
3221
3222         return 0;
3223 }
3224
3225 static struct iommu_ops intel_iommu_ops = {
3226         .domain_init    = intel_iommu_domain_init,
3227         .domain_destroy = intel_iommu_domain_destroy,
3228         .attach_dev     = intel_iommu_attach_device,
3229         .detach_dev     = intel_iommu_detach_device,
3230         .map            = intel_iommu_map_range,
3231         .unmap          = intel_iommu_unmap_range,
3232         .iova_to_phys   = intel_iommu_iova_to_phys,
3233         .domain_has_cap = intel_iommu_domain_has_cap,
3234 };
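/*
 * Hedged usage sketch, not part of this driver: it shows how a consumer such
 * as KVM device assignment would reach the callbacks above through the
 * generic IOMMU layer once this table has been registered (register_iommu()).
 * The wrapper names are assumed to be the <linux/iommu.h> API of this kernel
 * generation; example_assign_one_page(), and the device, page and IOVA it
 * uses, are illustrative placeholders only.
 */
#if 0   /* example only, never built */
static int example_assign_one_page(struct device *dev, struct page *page)
{
        struct iommu_domain *domain;
        unsigned long iova = 0;                 /* arbitrary example IOVA */
        int prot = IOMMU_READ | IOMMU_WRITE;
        int ret;

        domain = iommu_domain_alloc();          /* -> intel_iommu_domain_init() */
        if (!domain)
                return -ENOMEM;

        ret = iommu_attach_device(domain, dev); /* -> intel_iommu_attach_device() */
        if (ret)
                goto out_free;

        /* Request a snooped mapping only if the hardware can honour it. */
        if (iommu_domain_has_cap(domain, IOMMU_CAP_CACHE_COHERENCY))
                prot |= IOMMU_CACHE;

        ret = iommu_map_range(domain, iova, page_to_phys(page),
                              PAGE_SIZE, prot); /* -> intel_iommu_map_range() */
        if (ret)
                goto out_detach;

        /* ... device DMA to/from iova would happen here ... */

        iommu_unmap_range(domain, iova, PAGE_SIZE);
out_detach:
        iommu_detach_device(domain, dev);       /* -> intel_iommu_detach_device() */
out_free:
        iommu_domain_free(domain);              /* -> intel_iommu_domain_destroy() */
        return ret;
}
#endif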
3235
3236 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3237 {
3238         /*
3239          * Mobile 4 Series Chipset neglects to set RWBF capability,
3240          * but needs it:
3241          */
3242         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3243         rwbf_quirk = 1;
3244 }
3245
3246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
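/*
 * Note on the quirk above: with rwbf_quirk set, the write-buffer flush path
 * earlier in this file behaves as if the capability register advertised RWBF,
 * so the flush is still issued on this chipset even though the hardware
 * fails to report the capability.
 */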