intel-iommu: Clean up handling of "caching mode" vs. IOTLB flushing.
drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <asm/cacheflush.h>
41 #include <asm/iommu.h>
42 #include "pci.h"
43
44 #define ROOT_SIZE               VTD_PAGE_SIZE
45 #define CONTEXT_SIZE            VTD_PAGE_SIZE
46
47 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
48 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
49
50 #define IOAPIC_RANGE_START      (0xfee00000)
51 #define IOAPIC_RANGE_END        (0xfeefffff)
52 #define IOVA_START_ADDR         (0x1000)
53
54 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
55
56 #define MAX_AGAW_WIDTH 64
57
58 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
59
60 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
61 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
62 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
63
64 /* global iommu list, set NULL for ignored DMAR units */
65 static struct intel_iommu **g_iommus;
66
67 static int rwbf_quirk;
68
69 /*
70  * 0: Present
71  * 1-11: Reserved
72  * 12-63: Context Ptr (12 - (haw-1))
73  * 64-127: Reserved
74  */
75 struct root_entry {
76         u64     val;
77         u64     rsvd1;
78 };
79 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
80 static inline bool root_present(struct root_entry *root)
81 {
82         return (root->val & 1);
83 }
84 static inline void set_root_present(struct root_entry *root)
85 {
86         root->val |= 1;
87 }
88 static inline void set_root_value(struct root_entry *root, unsigned long value)
89 {
90         root->val |= value & VTD_PAGE_MASK;
91 }
92
93 static inline struct context_entry *
94 get_context_addr_from_root(struct root_entry *root)
95 {
96         return (struct context_entry *)
97                 (root_present(root)?phys_to_virt(
98                 root->val & VTD_PAGE_MASK) :
99                 NULL);
100 }
101
102 /*
103  * low 64 bits:
104  * 0: present
105  * 1: fault processing disable
106  * 2-3: translation type
107  * 12-63: address space root
108  * high 64 bits:
109  * 0-2: address width
110  * 3-6: avail
111  * 8-23: domain id
112  */
113 struct context_entry {
114         u64 lo;
115         u64 hi;
116 };
117
118 static inline bool context_present(struct context_entry *context)
119 {
120         return (context->lo & 1);
121 }
122 static inline void context_set_present(struct context_entry *context)
123 {
124         context->lo |= 1;
125 }
126
127 static inline void context_set_fault_enable(struct context_entry *context)
128 {
129         context->lo &= (((u64)-1) << 2) | 1;
130 }
131
132 static inline void context_set_translation_type(struct context_entry *context,
133                                                 unsigned long value)
134 {
135         context->lo &= (((u64)-1) << 4) | 3;
136         context->lo |= (value & 3) << 2;
137 }
138
139 static inline void context_set_address_root(struct context_entry *context,
140                                             unsigned long value)
141 {
142         context->lo |= value & VTD_PAGE_MASK;
143 }
144
145 static inline void context_set_address_width(struct context_entry *context,
146                                              unsigned long value)
147 {
148         context->hi |= value & 7;
149 }
150
151 static inline void context_set_domain_id(struct context_entry *context,
152                                          unsigned long value)
153 {
154         context->hi |= (value & ((1 << 16) - 1)) << 8;
155 }
156
157 static inline void context_clear_entry(struct context_entry *context)
158 {
159         context->lo = 0;
160         context->hi = 0;
161 }
162
163 /*
164  * 0: readable
165  * 1: writable
166  * 2-6: reserved
167  * 7: super page
168  * 8-10: available
169  * 11: snoop behavior
170  * 12-63: Host physical address
171  */
172 struct dma_pte {
173         u64 val;
174 };
175
176 static inline void dma_clear_pte(struct dma_pte *pte)
177 {
178         pte->val = 0;
179 }
180
181 static inline void dma_set_pte_readable(struct dma_pte *pte)
182 {
183         pte->val |= DMA_PTE_READ;
184 }
185
186 static inline void dma_set_pte_writable(struct dma_pte *pte)
187 {
188         pte->val |= DMA_PTE_WRITE;
189 }
190
191 static inline void dma_set_pte_snp(struct dma_pte *pte)
192 {
193         pte->val |= DMA_PTE_SNP;
194 }
195
196 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
197 {
198         pte->val = (pte->val & ~3) | (prot & 3);
199 }
200
201 static inline u64 dma_pte_addr(struct dma_pte *pte)
202 {
203         return (pte->val & VTD_PAGE_MASK);
204 }
205
206 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
207 {
208         pte->val |= (addr & VTD_PAGE_MASK);
209 }
210
211 static inline bool dma_pte_present(struct dma_pte *pte)
212 {
213         return (pte->val & 3) != 0;
214 }
215
216 /* devices under the same p2p bridge are owned by one domain */
217 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
218
219 /* domain represents a virtual machine; more than one device
220  * across iommus may be owned by one domain, e.g. a kvm guest.
221  */
222 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
223
224 struct dmar_domain {
225         int     id;                     /* domain id */
226         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
227
228         struct list_head devices;       /* all devices' list */
229         struct iova_domain iovad;       /* iova's that belong to this domain */
230
231         struct dma_pte  *pgd;           /* virtual address */
232         spinlock_t      mapping_lock;   /* page table lock */
233         int             gaw;            /* max guest address width */
234
235         /* adjusted guest address width, 0 is level 2 30-bit */
236         int             agaw;
237
238         int             flags;          /* flags to find out type of domain */
239
240         int             iommu_coherency;/* indicate coherency of iommu access */
241         int             iommu_snooping; /* indicate snooping control feature*/
242         int             iommu_count;    /* reference count of iommu */
243         spinlock_t      iommu_lock;     /* protect iommu set in domain */
244         u64             max_addr;       /* maximum mapped address */
245 };
246
247 /* PCI domain-device relationship */
248 struct device_domain_info {
249         struct list_head link;  /* link to domain siblings */
250         struct list_head global; /* link to global list */
251         int segment;            /* PCI domain */
252         u8 bus;                 /* PCI bus number */
253         u8 devfn;               /* PCI devfn number */
254         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
255         struct dmar_domain *domain; /* pointer to domain */
256 };
257
258 static void flush_unmaps_timeout(unsigned long data);
259
260 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
261
262 #define HIGH_WATER_MARK 250
263 struct deferred_flush_tables {
264         int next;
265         struct iova *iova[HIGH_WATER_MARK];
266         struct dmar_domain *domain[HIGH_WATER_MARK];
267 };
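/*
 * Deferred IOTLB flushing: unmapped IOVAs are queued in these per-iommu
 * tables and invalidated in batches, either when unmap_timer fires
 * (flush_unmaps_timeout) or when a table reaches HIGH_WATER_MARK entries.
 * Booting with "intel_iommu=strict" (parsed in intel_iommu_setup() below)
 * disables the batching and flushes on every unmap instead.
 */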
268
269 static struct deferred_flush_tables *deferred_flush;
270
271 /* number of iommus; used to size g_iommus and each domain's iommu bitmap */
272 static int g_num_of_iommus;
273
274 static DEFINE_SPINLOCK(async_umap_flush_lock);
275 static LIST_HEAD(unmaps_to_do);
276
277 static int timer_on;
278 static long list_size;
279
280 static void domain_remove_dev_info(struct dmar_domain *domain);
281
282 #ifdef CONFIG_DMAR_DEFAULT_ON
283 int dmar_disabled = 0;
284 #else
285 int dmar_disabled = 1;
286 #endif /*CONFIG_DMAR_DEFAULT_ON*/
287
288 static int __initdata dmar_map_gfx = 1;
289 static int dmar_forcedac;
290 static int intel_iommu_strict;
291
292 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
293 static DEFINE_SPINLOCK(device_domain_lock);
294 static LIST_HEAD(device_domain_list);
295
296 static struct iommu_ops intel_iommu_ops;
297
298 static int __init intel_iommu_setup(char *str)
299 {
300         if (!str)
301                 return -EINVAL;
302         while (*str) {
303                 if (!strncmp(str, "on", 2)) {
304                         dmar_disabled = 0;
305                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
306                 } else if (!strncmp(str, "off", 3)) {
307                         dmar_disabled = 1;
308                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
309                 } else if (!strncmp(str, "igfx_off", 8)) {
310                         dmar_map_gfx = 0;
311                         printk(KERN_INFO
312                                 "Intel-IOMMU: disable GFX device mapping\n");
313                 } else if (!strncmp(str, "forcedac", 8)) {
314                         printk(KERN_INFO
315                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
316                         dmar_forcedac = 1;
317                 } else if (!strncmp(str, "strict", 6)) {
318                         printk(KERN_INFO
319                                 "Intel-IOMMU: disable batched IOTLB flush\n");
320                         intel_iommu_strict = 1;
321                 }
322
323                 str += strcspn(str, ",");
324                 while (*str == ',')
325                         str++;
326         }
327         return 0;
328 }
329 __setup("intel_iommu=", intel_iommu_setup);
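/*
 * Example boot-command-line usage of the parser above; options are
 * comma-separated and may be combined, e.g.:
 *
 *      intel_iommu=on,igfx_off,strict
 */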
330
331 static struct kmem_cache *iommu_domain_cache;
332 static struct kmem_cache *iommu_devinfo_cache;
333 static struct kmem_cache *iommu_iova_cache;
334
335 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
336 {
337         unsigned int flags;
338         void *vaddr;
339
340         /* trying to avoid low memory issues */
341         flags = current->flags & PF_MEMALLOC;
342         current->flags |= PF_MEMALLOC;
343         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
344         current->flags &= (~PF_MEMALLOC | flags);
345         return vaddr;
346 }
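/*
 * The PF_MEMALLOC handling above (repeated in alloc_pgtable_page() below):
 * the task's original PF_MEMALLOC bit is saved in 'flags', the flag is set
 * so the GFP_ATOMIC allocation may dip into the emergency reserves, and the
 * final '&=' restores the bit to its previous state, clearing it only if it
 * was not already set.
 */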
347
348
349 static inline void *alloc_pgtable_page(void)
350 {
351         unsigned int flags;
352         void *vaddr;
353
354         /* trying to avoid low memory issues */
355         flags = current->flags & PF_MEMALLOC;
356         current->flags |= PF_MEMALLOC;
357         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
358         current->flags &= (~PF_MEMALLOC | flags);
359         return vaddr;
360 }
361
362 static inline void free_pgtable_page(void *vaddr)
363 {
364         free_page((unsigned long)vaddr);
365 }
366
367 static inline void *alloc_domain_mem(void)
368 {
369         return iommu_kmem_cache_alloc(iommu_domain_cache);
370 }
371
372 static void free_domain_mem(void *vaddr)
373 {
374         kmem_cache_free(iommu_domain_cache, vaddr);
375 }
376
377 static inline void * alloc_devinfo_mem(void)
378 {
379         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
380 }
381
382 static inline void free_devinfo_mem(void *vaddr)
383 {
384         kmem_cache_free(iommu_devinfo_cache, vaddr);
385 }
386
387 struct iova *alloc_iova_mem(void)
388 {
389         return iommu_kmem_cache_alloc(iommu_iova_cache);
390 }
391
392 void free_iova_mem(struct iova *iova)
393 {
394         kmem_cache_free(iommu_iova_cache, iova);
395 }
396
397
398 static inline int width_to_agaw(int width);
399
400 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
401 {
402         unsigned long sagaw;
403         int agaw = -1;
404
405         sagaw = cap_sagaw(iommu->cap);
406         for (agaw = width_to_agaw(max_gaw);
407              agaw >= 0; agaw--) {
408                 if (test_bit(agaw, &sagaw))
409                         break;
410         }
411
412         return agaw;
413 }
414
415 /*
416  * Calculate max SAGAW for each iommu.
417  */
418 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
419 {
420         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
421 }
422
423 /*
424  * calculate agaw for each iommu.
425  * "SAGAW" may be different across iommus, use a default agaw, and
426  * get a supported less agaw for iommus that don't support the default agaw.
427  */
428 int iommu_calculate_agaw(struct intel_iommu *iommu)
429 {
430         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
431 }
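/*
 * Worked example: for DEFAULT_DOMAIN_ADDRESS_WIDTH of 48 bits,
 * width_to_agaw(48) = (48 - 30) / 9 = 2, i.e. a 4-level page table
 * (see agaw_to_level() below).  If bit 2 is not set in the iommu's SAGAW
 * capability field, __iommu_calculate_agaw() steps down until it finds a
 * smaller supported value, or returns -1 if none exists.
 */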
432
433 /* in the native case, each domain is related to only one iommu */
434 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
435 {
436         int iommu_id;
437
438         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
439
440         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
441         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
442                 return NULL;
443
444         return g_iommus[iommu_id];
445 }
446
447 static void domain_update_iommu_coherency(struct dmar_domain *domain)
448 {
449         int i;
450
451         domain->iommu_coherency = 1;
452
453         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
454         for (; i < g_num_of_iommus; ) {
455                 if (!ecap_coherent(g_iommus[i]->ecap)) {
456                         domain->iommu_coherency = 0;
457                         break;
458                 }
459                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
460         }
461 }
462
463 static void domain_update_iommu_snooping(struct dmar_domain *domain)
464 {
465         int i;
466
467         domain->iommu_snooping = 1;
468
469         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
470         for (; i < g_num_of_iommus; ) {
471                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
472                         domain->iommu_snooping = 0;
473                         break;
474                 }
475                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
476         }
477 }
478
479 /* Some capabilities may be different across iommus */
480 static void domain_update_iommu_cap(struct dmar_domain *domain)
481 {
482         domain_update_iommu_coherency(domain);
483         domain_update_iommu_snooping(domain);
484 }
485
486 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
487 {
488         struct dmar_drhd_unit *drhd = NULL;
489         int i;
490
491         for_each_drhd_unit(drhd) {
492                 if (drhd->ignored)
493                         continue;
494                 if (segment != drhd->segment)
495                         continue;
496
497                 for (i = 0; i < drhd->devices_cnt; i++) {
498                         if (drhd->devices[i] &&
499                             drhd->devices[i]->bus->number == bus &&
500                             drhd->devices[i]->devfn == devfn)
501                                 return drhd->iommu;
502                         if (drhd->devices[i] &&
503                             drhd->devices[i]->subordinate &&
504                             drhd->devices[i]->subordinate->number <= bus &&
505                             drhd->devices[i]->subordinate->subordinate >= bus)
506                                 return drhd->iommu;
507                 }
508
509                 if (drhd->include_all)
510                         return drhd->iommu;
511         }
512
513         return NULL;
514 }
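/*
 * Resolution order in device_to_iommu() above: an exact segment/bus/devfn
 * match in a DRHD's device scope wins first; next, a device whose bus
 * number falls within the secondary..subordinate range of a listed bridge;
 * finally, the catch-all DRHD flagged include_all.
 */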
515
516 static void domain_flush_cache(struct dmar_domain *domain,
517                                void *addr, int size)
518 {
519         if (!domain->iommu_coherency)
520                 clflush_cache_range(addr, size);
521 }
522
523 /* Gets context entry for a given bus and devfn */
524 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
525                 u8 bus, u8 devfn)
526 {
527         struct root_entry *root;
528         struct context_entry *context;
529         unsigned long phy_addr;
530         unsigned long flags;
531
532         spin_lock_irqsave(&iommu->lock, flags);
533         root = &iommu->root_entry[bus];
534         context = get_context_addr_from_root(root);
535         if (!context) {
536                 context = (struct context_entry *)alloc_pgtable_page();
537                 if (!context) {
538                         spin_unlock_irqrestore(&iommu->lock, flags);
539                         return NULL;
540                 }
541                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
542                 phy_addr = virt_to_phys((void *)context);
543                 set_root_value(root, phy_addr);
544                 set_root_present(root);
545                 __iommu_flush_cache(iommu, root, sizeof(*root));
546         }
547         spin_unlock_irqrestore(&iommu->lock, flags);
548         return &context[devfn];
549 }
550
551 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
552 {
553         struct root_entry *root;
554         struct context_entry *context;
555         int ret;
556         unsigned long flags;
557
558         spin_lock_irqsave(&iommu->lock, flags);
559         root = &iommu->root_entry[bus];
560         context = get_context_addr_from_root(root);
561         if (!context) {
562                 ret = 0;
563                 goto out;
564         }
565         ret = context_present(&context[devfn]);
566 out:
567         spin_unlock_irqrestore(&iommu->lock, flags);
568         return ret;
569 }
570
571 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
572 {
573         struct root_entry *root;
574         struct context_entry *context;
575         unsigned long flags;
576
577         spin_lock_irqsave(&iommu->lock, flags);
578         root = &iommu->root_entry[bus];
579         context = get_context_addr_from_root(root);
580         if (context) {
581                 context_clear_entry(&context[devfn]);
582                 __iommu_flush_cache(iommu, &context[devfn], \
583                         sizeof(*context));
584         }
585         spin_unlock_irqrestore(&iommu->lock, flags);
586 }
587
588 static void free_context_table(struct intel_iommu *iommu)
589 {
590         struct root_entry *root;
591         int i;
592         unsigned long flags;
593         struct context_entry *context;
594
595         spin_lock_irqsave(&iommu->lock, flags);
596         if (!iommu->root_entry) {
597                 goto out;
598         }
599         for (i = 0; i < ROOT_ENTRY_NR; i++) {
600                 root = &iommu->root_entry[i];
601                 context = get_context_addr_from_root(root);
602                 if (context)
603                         free_pgtable_page(context);
604         }
605         free_pgtable_page(iommu->root_entry);
606         iommu->root_entry = NULL;
607 out:
608         spin_unlock_irqrestore(&iommu->lock, flags);
609 }
610
611 /* page table handling */
612 #define LEVEL_STRIDE            (9)
613 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
614
615 static inline int agaw_to_level(int agaw)
616 {
617         return agaw + 2;
618 }
619
620 static inline int agaw_to_width(int agaw)
621 {
622         return 30 + agaw * LEVEL_STRIDE;
623
624 }
625
626 static inline int width_to_agaw(int width)
627 {
628         return (width - 30) / LEVEL_STRIDE;
629 }
630
631 static inline unsigned int level_to_offset_bits(int level)
632 {
633         return (12 + (level - 1) * LEVEL_STRIDE);
634 }
635
636 static inline int address_level_offset(u64 addr, int level)
637 {
638         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
639 }
640
641 static inline u64 level_mask(int level)
642 {
643         return ((u64)-1 << level_to_offset_bits(level));
644 }
645
646 static inline u64 level_size(int level)
647 {
648         return ((u64)1 << level_to_offset_bits(level));
649 }
650
651 static inline u64 align_to_level(u64 addr, int level)
652 {
653         return ((addr + level_size(level) - 1) & level_mask(level));
654 }
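/*
 * A concrete reading of the helpers above for agaw = 2 (the common 48-bit
 * case): agaw_to_level(2) = 4 levels, agaw_to_width(2) = 48 bits, and each
 * level indexes LEVEL_STRIDE = 9 address bits:
 *
 *      level 1: bits 12-20 (4KB leaf pages)
 *      level 2: bits 21-29
 *      level 3: bits 30-38
 *      level 4: bits 39-47
 *
 * level_to_offset_bits() gives the low bit of each slice, and
 * address_level_offset() extracts the 9-bit table index that
 * addr_to_dma_pte() uses while walking the tree.
 */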
655
656 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
657 {
658         int addr_width = agaw_to_width(domain->agaw);
659         struct dma_pte *parent, *pte = NULL;
660         int level = agaw_to_level(domain->agaw);
661         int offset;
662         unsigned long flags;
663
664         BUG_ON(!domain->pgd);
665
666         addr &= (((u64)1) << addr_width) - 1;
667         parent = domain->pgd;
668
669         spin_lock_irqsave(&domain->mapping_lock, flags);
670         while (level > 0) {
671                 void *tmp_page;
672
673                 offset = address_level_offset(addr, level);
674                 pte = &parent[offset];
675                 if (level == 1)
676                         break;
677
678                 if (!dma_pte_present(pte)) {
679                         tmp_page = alloc_pgtable_page();
680
681                         if (!tmp_page) {
682                                 spin_unlock_irqrestore(&domain->mapping_lock,
683                                         flags);
684                                 return NULL;
685                         }
686                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
687                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
688                         /*
689                          * high level table always sets r/w, last level page
690                          * table control read/write
691                          */
692                         dma_set_pte_readable(pte);
693                         dma_set_pte_writable(pte);
694                         domain_flush_cache(domain, pte, sizeof(*pte));
695                 }
696                 parent = phys_to_virt(dma_pte_addr(pte));
697                 level--;
698         }
699
700         spin_unlock_irqrestore(&domain->mapping_lock, flags);
701         return pte;
702 }
703
704 /* return address's pte at specific level */
705 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
706                 int level)
707 {
708         struct dma_pte *parent, *pte = NULL;
709         int total = agaw_to_level(domain->agaw);
710         int offset;
711
712         parent = domain->pgd;
713         while (level <= total) {
714                 offset = address_level_offset(addr, total);
715                 pte = &parent[offset];
716                 if (level == total)
717                         return pte;
718
719                 if (!dma_pte_present(pte))
720                         break;
721                 parent = phys_to_virt(dma_pte_addr(pte));
722                 total--;
723         }
724         return NULL;
725 }
726
727 /* clear one page's page table */
728 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
729 {
730         struct dma_pte *pte = NULL;
731
732         /* get last level pte */
733         pte = dma_addr_level_pte(domain, addr, 1);
734
735         if (pte) {
736                 dma_clear_pte(pte);
737                 domain_flush_cache(domain, pte, sizeof(*pte));
738         }
739 }
740
741 /* clear the last level pte; a tlb flush should follow */
742 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
743 {
744         int addr_width = agaw_to_width(domain->agaw);
745         int npages;
746
747         start &= (((u64)1) << addr_width) - 1;
748         end &= (((u64)1) << addr_width) - 1;
749         /* in case it's a partial page */
750         start &= PAGE_MASK;
751         end = PAGE_ALIGN(end);
752         npages = (end - start) / VTD_PAGE_SIZE;
753
754         /* we don't need lock here, nobody else touches the iova range */
755         while (npages--) {
756                 dma_pte_clear_one(domain, start);
757                 start += VTD_PAGE_SIZE;
758         }
759 }
760
761 /* free page table pages. last level pte should already be cleared */
762 static void dma_pte_free_pagetable(struct dmar_domain *domain,
763         u64 start, u64 end)
764 {
765         int addr_width = agaw_to_width(domain->agaw);
766         struct dma_pte *pte;
767         int total = agaw_to_level(domain->agaw);
768         int level;
769         u64 tmp;
770
771         start &= (((u64)1) << addr_width) - 1;
772         end &= (((u64)1) << addr_width) - 1;
773
774         /* we don't need lock here, nobody else touches the iova range */
775         level = 2;
776         while (level <= total) {
777                 tmp = align_to_level(start, level);
778                 if (tmp >= end || (tmp + level_size(level) > end))
779                         return;
780
781                 while (tmp < end) {
782                         pte = dma_addr_level_pte(domain, tmp, level);
783                         if (pte) {
784                                 free_pgtable_page(
785                                         phys_to_virt(dma_pte_addr(pte)));
786                                 dma_clear_pte(pte);
787                                 domain_flush_cache(domain, pte, sizeof(*pte));
788                         }
789                         tmp += level_size(level);
790                 }
791                 level++;
792         }
793         /* free pgd */
794         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
795                 free_pgtable_page(domain->pgd);
796                 domain->pgd = NULL;
797         }
798 }
799
800 /* iommu handling */
801 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
802 {
803         struct root_entry *root;
804         unsigned long flags;
805
806         root = (struct root_entry *)alloc_pgtable_page();
807         if (!root)
808                 return -ENOMEM;
809
810         __iommu_flush_cache(iommu, root, ROOT_SIZE);
811
812         spin_lock_irqsave(&iommu->lock, flags);
813         iommu->root_entry = root;
814         spin_unlock_irqrestore(&iommu->lock, flags);
815
816         return 0;
817 }
818
819 static void iommu_set_root_entry(struct intel_iommu *iommu)
820 {
821         void *addr;
822         u32 cmd, sts;
823         unsigned long flag;
824
825         addr = iommu->root_entry;
826
827         spin_lock_irqsave(&iommu->register_lock, flag);
828         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
829
830         cmd = iommu->gcmd | DMA_GCMD_SRTP;
831         writel(cmd, iommu->reg + DMAR_GCMD_REG);
832
833         /* Make sure hardware completes it */
834         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
835                 readl, (sts & DMA_GSTS_RTPS), sts);
836
837         spin_unlock_irqrestore(&iommu->register_lock, flag);
838 }
839
840 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
841 {
842         u32 val;
843         unsigned long flag;
844
845         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
846                 return;
847         val = iommu->gcmd | DMA_GCMD_WBF;
848
849         spin_lock_irqsave(&iommu->register_lock, flag);
850         writel(val, iommu->reg + DMAR_GCMD_REG);
851
852         /* Make sure hardware completes it */
853         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
854                         readl, (!(val & DMA_GSTS_WBFS)), val);
855
856         spin_unlock_irqrestore(&iommu->register_lock, flag);
857 }
858
859 /* invalidate context-cache entries; any write buffer flush is left to the caller */
860 static void __iommu_flush_context(struct intel_iommu *iommu,
861                                   u16 did, u16 source_id, u8 function_mask,
862                                   u64 type)
863 {
864         u64 val = 0;
865         unsigned long flag;
866
867         switch (type) {
868         case DMA_CCMD_GLOBAL_INVL:
869                 val = DMA_CCMD_GLOBAL_INVL;
870                 break;
871         case DMA_CCMD_DOMAIN_INVL:
872                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
873                 break;
874         case DMA_CCMD_DEVICE_INVL:
875                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
876                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
877                 break;
878         default:
879                 BUG();
880         }
881         val |= DMA_CCMD_ICC;
882
883         spin_lock_irqsave(&iommu->register_lock, flag);
884         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
885
886         /* Make sure hardware completes it */
887         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
888                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
889
890         spin_unlock_irqrestore(&iommu->register_lock, flag);
891 }
892
893 /* invalidate IOTLB entries; any write buffer flush is left to the caller */
894 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
895                                 u64 addr, unsigned int size_order, u64 type)
896 {
897         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
898         u64 val = 0, val_iva = 0;
899         unsigned long flag;
900
901         switch (type) {
902         case DMA_TLB_GLOBAL_FLUSH:
903                 /* global flush doesn't need to set IVA_REG */
904                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
905                 break;
906         case DMA_TLB_DSI_FLUSH:
907                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
908                 break;
909         case DMA_TLB_PSI_FLUSH:
910                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
911                 /* Note: always flush non-leaf currently */
912                 val_iva = size_order | addr;
913                 break;
914         default:
915                 BUG();
916         }
917         /* Note: set drain read/write */
918 #if 0
919         /*
920          * This is probably only needed to be extra safe; it looks like
921          * we can ignore it without any impact.
922          */
923         if (cap_read_drain(iommu->cap))
924                 val |= DMA_TLB_READ_DRAIN;
925 #endif
926         if (cap_write_drain(iommu->cap))
927                 val |= DMA_TLB_WRITE_DRAIN;
928
929         spin_lock_irqsave(&iommu->register_lock, flag);
930         /* Note: Only uses first TLB reg currently */
931         if (val_iva)
932                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
933         dmar_writeq(iommu->reg + tlb_offset + 8, val);
934
935         /* Make sure hardware completes it */
936         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
937                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
938
939         spin_unlock_irqrestore(&iommu->register_lock, flag);
940
941         /* check IOTLB invalidation granularity */
942         if (DMA_TLB_IAIG(val) == 0)
943                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
944         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
945                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
946                         (unsigned long long)DMA_TLB_IIRG(type),
947                         (unsigned long long)DMA_TLB_IAIG(val));
948 }
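/*
 * After the wait loop above, DMA_TLB_IAIG() reports the granularity at
 * which the hardware actually performed the invalidation: zero is treated
 * as a failed flush (hence the KERN_ERR), while a granularity coarser than
 * the one requested is legal and only rates a debug message.
 */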
949
950 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
951                                   u64 addr, unsigned int pages)
952 {
953         unsigned int mask;
954
955         BUG_ON(addr & (~VTD_PAGE_MASK));
956         BUG_ON(pages == 0);
957
958         /* Fallback to domain selective flush if no PSI support */
959         if (!cap_pgsel_inv(iommu->cap))
960                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
961                                                 DMA_TLB_DSI_FLUSH);
962
963         /*
964          * PSI requires page size to be 2 ^ x, and the base address is naturally
965          * aligned to the size
966          */
967         mask = ilog2(__roundup_pow_of_two(pages));
968         /* Fallback to domain selective flush if size is too big */
969         if (mask > cap_max_amask_val(iommu->cap))
970                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
971                                                 DMA_TLB_DSI_FLUSH);
972
973         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
974                                         DMA_TLB_PSI_FLUSH);
975 }
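/*
 * Example of the mask calculation above: flushing 5 pages gives
 * mask = ilog2(__roundup_pow_of_two(5)) = ilog2(8) = 3, so the hardware is
 * asked to invalidate a naturally aligned block of 2^3 = 8 pages covering
 * the requested range.
 */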
976
977 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
978 {
979         u32 pmen;
980         unsigned long flags;
981
982         spin_lock_irqsave(&iommu->register_lock, flags);
983         pmen = readl(iommu->reg + DMAR_PMEN_REG);
984         pmen &= ~DMA_PMEN_EPM;
985         writel(pmen, iommu->reg + DMAR_PMEN_REG);
986
987         /* wait for the protected region status bit to clear */
988         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
989                 readl, !(pmen & DMA_PMEN_PRS), pmen);
990
991         spin_unlock_irqrestore(&iommu->register_lock, flags);
992 }
993
994 static int iommu_enable_translation(struct intel_iommu *iommu)
995 {
996         u32 sts;
997         unsigned long flags;
998
999         spin_lock_irqsave(&iommu->register_lock, flags);
1000         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
1001
1002         /* Make sure hardware completes it */
1003         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1004                 readl, (sts & DMA_GSTS_TES), sts);
1005
1006         iommu->gcmd |= DMA_GCMD_TE;
1007         spin_unlock_irqrestore(&iommu->register_lock, flags);
1008         return 0;
1009 }
1010
1011 static int iommu_disable_translation(struct intel_iommu *iommu)
1012 {
1013         u32 sts;
1014         unsigned long flag;
1015
1016         spin_lock_irqsave(&iommu->register_lock, flag);
1017         iommu->gcmd &= ~DMA_GCMD_TE;
1018         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1019
1020         /* Make sure hardware completes it */
1021         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1022                 readl, (!(sts & DMA_GSTS_TES)), sts);
1023
1024         spin_unlock_irqrestore(&iommu->register_lock, flag);
1025         return 0;
1026 }
1027
1028
1029 static int iommu_init_domains(struct intel_iommu *iommu)
1030 {
1031         unsigned long ndomains;
1032         unsigned long nlongs;
1033
1034         ndomains = cap_ndoms(iommu->cap);
1035         pr_debug("Number of Domains supportd <%ld>\n", ndomains);
1036         nlongs = BITS_TO_LONGS(ndomains);
1037
1038         /* TBD: there might be 64K domains,
1039          * consider other allocation schemes for future chips
1040          */
1041         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1042         if (!iommu->domain_ids) {
1043                 printk(KERN_ERR "Allocating domain id array failed\n");
1044                 return -ENOMEM;
1045         }
1046         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1047                         GFP_KERNEL);
1048         if (!iommu->domains) {
1049                 printk(KERN_ERR "Allocating domain array failed\n");
1050                 kfree(iommu->domain_ids);
1051                 return -ENOMEM;
1052         }
1053
1054         spin_lock_init(&iommu->lock);
1055
1056         /*
1057          * If caching mode is set, then invalid translations are tagged
1058          * with domain id 0, hence we need to pre-allocate it.
1059          */
1060         if (cap_caching_mode(iommu->cap))
1061                 set_bit(0, iommu->domain_ids);
1062         return 0;
1063 }
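/*
 * Note on "caching mode" (the subject of this cleanup): when the CM bit is
 * set in the capability register -- typically on virtualized or emulated
 * VT-d implementations -- the IOMMU may also cache not-present entries,
 * tagged with domain id 0.  That is why domain id 0 is reserved above, and
 * why code that turns a not-present entry into a present one must issue an
 * explicit context/IOTLB invalidation instead of a mere write-buffer
 * flush; see the cap_caching_mode() check in domain_context_mapping_one().
 */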
1064
1065
1066 static void domain_exit(struct dmar_domain *domain);
1067 static void vm_domain_exit(struct dmar_domain *domain);
1068
1069 void free_dmar_iommu(struct intel_iommu *iommu)
1070 {
1071         struct dmar_domain *domain;
1072         int i;
1073         unsigned long flags;
1074
1075         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1076         for (; i < cap_ndoms(iommu->cap); ) {
1077                 domain = iommu->domains[i];
1078                 clear_bit(i, iommu->domain_ids);
1079
1080                 spin_lock_irqsave(&domain->iommu_lock, flags);
1081                 if (--domain->iommu_count == 0) {
1082                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1083                                 vm_domain_exit(domain);
1084                         else
1085                                 domain_exit(domain);
1086                 }
1087                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1088
1089                 i = find_next_bit(iommu->domain_ids,
1090                         cap_ndoms(iommu->cap), i+1);
1091         }
1092
1093         if (iommu->gcmd & DMA_GCMD_TE)
1094                 iommu_disable_translation(iommu);
1095
1096         if (iommu->irq) {
1097                 set_irq_data(iommu->irq, NULL);
1098                 /* This will mask the irq */
1099                 free_irq(iommu->irq, iommu);
1100                 destroy_irq(iommu->irq);
1101         }
1102
1103         kfree(iommu->domains);
1104         kfree(iommu->domain_ids);
1105
1106         g_iommus[iommu->seq_id] = NULL;
1107
1108         /* if all iommus are freed, free g_iommus */
1109         for (i = 0; i < g_num_of_iommus; i++) {
1110                 if (g_iommus[i])
1111                         break;
1112         }
1113
1114         if (i == g_num_of_iommus)
1115                 kfree(g_iommus);
1116
1117         /* free context mapping */
1118         free_context_table(iommu);
1119 }
1120
1121 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1122 {
1123         unsigned long num;
1124         unsigned long ndomains;
1125         struct dmar_domain *domain;
1126         unsigned long flags;
1127
1128         domain = alloc_domain_mem();
1129         if (!domain)
1130                 return NULL;
1131
1132         ndomains = cap_ndoms(iommu->cap);
1133
1134         spin_lock_irqsave(&iommu->lock, flags);
1135         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1136         if (num >= ndomains) {
1137                 spin_unlock_irqrestore(&iommu->lock, flags);
1138                 free_domain_mem(domain);
1139                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1140                 return NULL;
1141         }
1142
1143         set_bit(num, iommu->domain_ids);
1144         domain->id = num;
1145         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1146         set_bit(iommu->seq_id, &domain->iommu_bmp);
1147         domain->flags = 0;
1148         iommu->domains[num] = domain;
1149         spin_unlock_irqrestore(&iommu->lock, flags);
1150
1151         return domain;
1152 }
1153
1154 static void iommu_free_domain(struct dmar_domain *domain)
1155 {
1156         unsigned long flags;
1157         struct intel_iommu *iommu;
1158
1159         iommu = domain_get_iommu(domain);
1160
1161         spin_lock_irqsave(&iommu->lock, flags);
1162         clear_bit(domain->id, iommu->domain_ids);
1163         spin_unlock_irqrestore(&iommu->lock, flags);
1164 }
1165
1166 static struct iova_domain reserved_iova_list;
1167 static struct lock_class_key reserved_alloc_key;
1168 static struct lock_class_key reserved_rbtree_key;
1169
1170 static void dmar_init_reserved_ranges(void)
1171 {
1172         struct pci_dev *pdev = NULL;
1173         struct iova *iova;
1174         int i;
1175         u64 addr, size;
1176
1177         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1178
1179         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1180                 &reserved_alloc_key);
1181         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1182                 &reserved_rbtree_key);
1183
1184         /* IOAPIC ranges shouldn't be accessed by DMA */
1185         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1186                 IOVA_PFN(IOAPIC_RANGE_END));
1187         if (!iova)
1188                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1189
1190         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1191         for_each_pci_dev(pdev) {
1192                 struct resource *r;
1193
1194                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1195                         r = &pdev->resource[i];
1196                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1197                                 continue;
1198                         addr = r->start;
1199                         addr &= PAGE_MASK;
1200                         size = r->end - addr;
1201                         size = PAGE_ALIGN(size);
1202                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1203                                 IOVA_PFN(size + addr) - 1);
1204                         if (!iova)
1205                                 printk(KERN_ERR "Reserve iova failed\n");
1206                 }
1207         }
1208
1209 }
1210
1211 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1212 {
1213         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1214 }
1215
1216 static inline int guestwidth_to_adjustwidth(int gaw)
1217 {
1218         int agaw;
1219         int r = (gaw - 12) % 9;
1220
1221         if (r == 0)
1222                 agaw = gaw;
1223         else
1224                 agaw = gaw + 9 - r;
1225         if (agaw > 64)
1226                 agaw = 64;
1227         return agaw;
1228 }
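/*
 * Examples: a guest width of 48 bits already sits on a page-table level
 * boundary ((48 - 12) % 9 == 0), so the adjusted width stays 48.  A guest
 * width of 36 bits does not ((36 - 12) % 9 == 6), so it is rounded up to
 * 36 + 9 - 6 = 39 bits, the next width the page-table format can express.
 */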
1229
1230 static int domain_init(struct dmar_domain *domain, int guest_width)
1231 {
1232         struct intel_iommu *iommu;
1233         int adjust_width, agaw;
1234         unsigned long sagaw;
1235
1236         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1237         spin_lock_init(&domain->mapping_lock);
1238         spin_lock_init(&domain->iommu_lock);
1239
1240         domain_reserve_special_ranges(domain);
1241
1242         /* calculate AGAW */
1243         iommu = domain_get_iommu(domain);
1244         if (guest_width > cap_mgaw(iommu->cap))
1245                 guest_width = cap_mgaw(iommu->cap);
1246         domain->gaw = guest_width;
1247         adjust_width = guestwidth_to_adjustwidth(guest_width);
1248         agaw = width_to_agaw(adjust_width);
1249         sagaw = cap_sagaw(iommu->cap);
1250         if (!test_bit(agaw, &sagaw)) {
1251                 /* hardware doesn't support it, choose a bigger one */
1252                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1253                 agaw = find_next_bit(&sagaw, 5, agaw);
1254                 if (agaw >= 5)
1255                         return -ENODEV;
1256         }
1257         domain->agaw = agaw;
1258         INIT_LIST_HEAD(&domain->devices);
1259
1260         if (ecap_coherent(iommu->ecap))
1261                 domain->iommu_coherency = 1;
1262         else
1263                 domain->iommu_coherency = 0;
1264
1265         if (ecap_sc_support(iommu->ecap))
1266                 domain->iommu_snooping = 1;
1267         else
1268                 domain->iommu_snooping = 0;
1269
1270         domain->iommu_count = 1;
1271
1272         /* always allocate the top pgd */
1273         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1274         if (!domain->pgd)
1275                 return -ENOMEM;
1276         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1277         return 0;
1278 }
1279
1280 static void domain_exit(struct dmar_domain *domain)
1281 {
1282         u64 end;
1283
1284         /* Domain 0 is reserved, so don't process it */
1285         if (!domain)
1286                 return;
1287
1288         domain_remove_dev_info(domain);
1289         /* destroy iovas */
1290         put_iova_domain(&domain->iovad);
1291         end = DOMAIN_MAX_ADDR(domain->gaw);
1292         end = end & (~PAGE_MASK);
1293
1294         /* clear ptes */
1295         dma_pte_clear_range(domain, 0, end);
1296
1297         /* free page tables */
1298         dma_pte_free_pagetable(domain, 0, end);
1299
1300         iommu_free_domain(domain);
1301         free_domain_mem(domain);
1302 }
1303
1304 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1305                                  u8 bus, u8 devfn, int translation)
1306 {
1307         struct context_entry *context;
1308         unsigned long flags;
1309         struct intel_iommu *iommu;
1310         struct dma_pte *pgd;
1311         unsigned long num;
1312         unsigned long ndomains;
1313         int id;
1314         int agaw;
1315
1316         pr_debug("Set context mapping for %02x:%02x.%d\n",
1317                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1318
1319         BUG_ON(!domain->pgd);
1320         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1321                translation != CONTEXT_TT_MULTI_LEVEL);
1322
1323         iommu = device_to_iommu(segment, bus, devfn);
1324         if (!iommu)
1325                 return -ENODEV;
1326
1327         context = device_to_context_entry(iommu, bus, devfn);
1328         if (!context)
1329                 return -ENOMEM;
1330         spin_lock_irqsave(&iommu->lock, flags);
1331         if (context_present(context)) {
1332                 spin_unlock_irqrestore(&iommu->lock, flags);
1333                 return 0;
1334         }
1335
1336         id = domain->id;
1337         pgd = domain->pgd;
1338
1339         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1340                 int found = 0;
1341
1342                 /* find an available domain id for this device in iommu */
1343                 ndomains = cap_ndoms(iommu->cap);
1344                 num = find_first_bit(iommu->domain_ids, ndomains);
1345                 for (; num < ndomains; ) {
1346                         if (iommu->domains[num] == domain) {
1347                                 id = num;
1348                                 found = 1;
1349                                 break;
1350                         }
1351                         num = find_next_bit(iommu->domain_ids,
1352                                             cap_ndoms(iommu->cap), num+1);
1353                 }
1354
1355                 if (found == 0) {
1356                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1357                         if (num >= ndomains) {
1358                                 spin_unlock_irqrestore(&iommu->lock, flags);
1359                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1360                                 return -EFAULT;
1361                         }
1362
1363                         set_bit(num, iommu->domain_ids);
1364                         iommu->domains[num] = domain;
1365                         id = num;
1366                 }
1367
1368                 /* Skip top levels of page tables for
1369                  * an iommu which has less agaw than the default.
1370                  */
1371                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1372                         pgd = phys_to_virt(dma_pte_addr(pgd));
1373                         if (!dma_pte_present(pgd)) {
1374                                 spin_unlock_irqrestore(&iommu->lock, flags);
1375                                 return -ENOMEM;
1376                         }
1377                 }
1378         }
1379
1380         context_set_domain_id(context, id);
1381
1382         /*
1383          * In pass through mode, AW must be programmed to indicate the largest
1384          * AGAW value supported by hardware. And ASR is ignored by hardware.
1385          */
1386         if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) {
1387                 context_set_address_width(context, iommu->agaw);
1388                 context_set_address_root(context, virt_to_phys(pgd));
1389         } else
1390                 context_set_address_width(context, iommu->msagaw);
1391
1392         context_set_translation_type(context, translation);
1393         context_set_fault_enable(context);
1394         context_set_present(context);
1395         domain_flush_cache(domain, context, sizeof(*context));
1396
1397         /*
1398          * It's a non-present to present mapping. If the hardware doesn't
1399          * cache non-present entries, we only need to flush the write-buffer.
1400          * If it _does_ cache non-present entries, then it does so in the special
1401          * domain #0, which we have to flush:
1402          */
1403         if (cap_caching_mode(iommu->cap)) {
1404                 iommu->flush.flush_context(iommu, 0,
1405                                            (((u16)bus) << 8) | devfn,
1406                                            DMA_CCMD_MASK_NOBIT,
1407                                            DMA_CCMD_DEVICE_INVL);
1408                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
1409         } else {
1410                 iommu_flush_write_buffer(iommu);
1411         }
1412         spin_unlock_irqrestore(&iommu->lock, flags);
1413
1414         spin_lock_irqsave(&domain->iommu_lock, flags);
1415         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1416                 domain->iommu_count++;
1417                 domain_update_iommu_cap(domain);
1418         }
1419         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1420         return 0;
1421 }
1422
1423 static int
1424 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1425                         int translation)
1426 {
1427         int ret;
1428         struct pci_dev *tmp, *parent;
1429
1430         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1431                                          pdev->bus->number, pdev->devfn,
1432                                          translation);
1433         if (ret)
1434                 return ret;
1435
1436         /* dependent device mapping */
1437         tmp = pci_find_upstream_pcie_bridge(pdev);
1438         if (!tmp)
1439                 return 0;
1440         /* Secondary interface's bus number and devfn 0 */
1441         parent = pdev->bus->self;
1442         while (parent != tmp) {
1443                 ret = domain_context_mapping_one(domain,
1444                                                  pci_domain_nr(parent->bus),
1445                                                  parent->bus->number,
1446                                                  parent->devfn, translation);
1447                 if (ret)
1448                         return ret;
1449                 parent = parent->bus->self;
1450         }
1451         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1452                 return domain_context_mapping_one(domain,
1453                                         pci_domain_nr(tmp->subordinate),
1454                                         tmp->subordinate->number, 0,
1455                                         translation);
1456         else /* this is a legacy PCI bridge */
1457                 return domain_context_mapping_one(domain,
1458                                                   pci_domain_nr(tmp->bus),
1459                                                   tmp->bus->number,
1460                                                   tmp->devfn,
1461                                                   translation);
1462 }
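/*
 * Why domain_context_mapping() walks upstream: DMA from a device behind a
 * PCIe-to-PCI(-X) bridge may arrive tagged with the bridge's requester ID
 * rather than the device's own.  The context entry is therefore programmed
 * for the device itself, for every bridge on the path up to the PCIe
 * bridge, and for that bridge (secondary bus with devfn 0 for a PCIe
 * bridge, its own bus/devfn for a legacy PCI bridge).
 */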
1463
1464 static int domain_context_mapped(struct pci_dev *pdev)
1465 {
1466         int ret;
1467         struct pci_dev *tmp, *parent;
1468         struct intel_iommu *iommu;
1469
1470         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1471                                 pdev->devfn);
1472         if (!iommu)
1473                 return -ENODEV;
1474
1475         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1476         if (!ret)
1477                 return ret;
1478         /* dependent device mapping */
1479         tmp = pci_find_upstream_pcie_bridge(pdev);
1480         if (!tmp)
1481                 return ret;
1482         /* Secondary interface's bus number and devfn 0 */
1483         parent = pdev->bus->self;
1484         while (parent != tmp) {
1485                 ret = device_context_mapped(iommu, parent->bus->number,
1486                                             parent->devfn);
1487                 if (!ret)
1488                         return ret;
1489                 parent = parent->bus->self;
1490         }
1491         if (tmp->is_pcie)
1492                 return device_context_mapped(iommu, tmp->subordinate->number,
1493                                              0);
1494         else
1495                 return device_context_mapped(iommu, tmp->bus->number,
1496                                              tmp->devfn);
1497 }
1498
1499 static int
1500 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1501                         u64 hpa, size_t size, int prot)
1502 {
1503         u64 start_pfn, end_pfn;
1504         struct dma_pte *pte;
1505         int index;
1506         int addr_width = agaw_to_width(domain->agaw);
1507
1508         hpa &= (((u64)1) << addr_width) - 1;
1509
1510         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1511                 return -EINVAL;
1512         iova &= PAGE_MASK;
1513         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1514         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1515         index = 0;
1516         while (start_pfn < end_pfn) {
1517                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1518                 if (!pte)
1519                         return -ENOMEM;
1520                 /* We don't need lock here, nobody else
1521                  * touches the iova range
1522                  */
1523                 BUG_ON(dma_pte_addr(pte));
1524                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1525                 dma_set_pte_prot(pte, prot);
1526                 if (prot & DMA_PTE_SNP)
1527                         dma_set_pte_snp(pte);
1528                 domain_flush_cache(domain, pte, sizeof(*pte));
1529                 start_pfn++;
1530                 index++;
1531         }
1532         return 0;
1533 }
1534
1535 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1536 {
1537         if (!iommu)
1538                 return;
1539
1540         clear_context_table(iommu, bus, devfn);
1541         iommu->flush.flush_context(iommu, 0, 0, 0,
1542                                            DMA_CCMD_GLOBAL_INVL);
1543         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1544 }
1545
1546 static void domain_remove_dev_info(struct dmar_domain *domain)
1547 {
1548         struct device_domain_info *info;
1549         unsigned long flags;
1550         struct intel_iommu *iommu;
1551
1552         spin_lock_irqsave(&device_domain_lock, flags);
1553         while (!list_empty(&domain->devices)) {
1554                 info = list_entry(domain->devices.next,
1555                         struct device_domain_info, link);
1556                 list_del(&info->link);
1557                 list_del(&info->global);
1558                 if (info->dev)
1559                         info->dev->dev.archdata.iommu = NULL;
1560                 spin_unlock_irqrestore(&device_domain_lock, flags);
1561
1562                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1563                 iommu_detach_dev(iommu, info->bus, info->devfn);
1564                 free_devinfo_mem(info);
1565
1566                 spin_lock_irqsave(&device_domain_lock, flags);
1567         }
1568         spin_unlock_irqrestore(&device_domain_lock, flags);
1569 }
1570
1571 /*
1572  * find_domain
1573  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1574  */
1575 static struct dmar_domain *
1576 find_domain(struct pci_dev *pdev)
1577 {
1578         struct device_domain_info *info;
1579
1580         /* No lock here, assumes no domain exit in normal case */
1581         info = pdev->dev.archdata.iommu;
1582         if (info)
1583                 return info->domain;
1584         return NULL;
1585 }
1586
1587 /* Find or allocate the device's domain; the returned domain is initialized. */
1588 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1589 {
1590         struct dmar_domain *domain, *found = NULL;
1591         struct intel_iommu *iommu;
1592         struct dmar_drhd_unit *drhd;
1593         struct device_domain_info *info, *tmp;
1594         struct pci_dev *dev_tmp;
1595         unsigned long flags;
1596         int bus = 0, devfn = 0;
1597         int segment;
1598
1599         domain = find_domain(pdev);
1600         if (domain)
1601                 return domain;
1602
1603         segment = pci_domain_nr(pdev->bus);
1604
1605         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1606         if (dev_tmp) {
1607                 if (dev_tmp->is_pcie) {
1608                         bus = dev_tmp->subordinate->number;
1609                         devfn = 0;
1610                 } else {
1611                         bus = dev_tmp->bus->number;
1612                         devfn = dev_tmp->devfn;
1613                 }
1614                 spin_lock_irqsave(&device_domain_lock, flags);
1615                 list_for_each_entry(info, &device_domain_list, global) {
1616                         if (info->segment == segment &&
1617                             info->bus == bus && info->devfn == devfn) {
1618                                 found = info->domain;
1619                                 break;
1620                         }
1621                 }
1622                 spin_unlock_irqrestore(&device_domain_lock, flags);
1623                 /* pcie-pci bridge already has a domain, use it */
1624                 if (found) {
1625                         domain = found;
1626                         goto found_domain;
1627                 }
1628         }
1629
1630         /* Allocate new domain for the device */
1631         drhd = dmar_find_matched_drhd_unit(pdev);
1632         if (!drhd) {
1633                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1634                         pci_name(pdev));
1635                 return NULL;
1636         }
1637         iommu = drhd->iommu;
1638
1639         domain = iommu_alloc_domain(iommu);
1640         if (!domain)
1641                 goto error;
1642
1643         if (domain_init(domain, gaw)) {
1644                 domain_exit(domain);
1645                 goto error;
1646         }
1647
1648         /* register pcie-to-pci device */
1649         if (dev_tmp) {
1650                 info = alloc_devinfo_mem();
1651                 if (!info) {
1652                         domain_exit(domain);
1653                         goto error;
1654                 }
1655                 info->segment = segment;
1656                 info->bus = bus;
1657                 info->devfn = devfn;
1658                 info->dev = NULL;
1659                 info->domain = domain;
1660                 /* This domain is shared by devices under p2p bridge */
1661                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1662
1663                 /* pcie-to-pci bridge already has a domain, use it */
1664                 found = NULL;
1665                 spin_lock_irqsave(&device_domain_lock, flags);
1666                 list_for_each_entry(tmp, &device_domain_list, global) {
1667                         if (tmp->segment == segment &&
1668                             tmp->bus == bus && tmp->devfn == devfn) {
1669                                 found = tmp->domain;
1670                                 break;
1671                         }
1672                 }
1673                 if (found) {
1674                         free_devinfo_mem(info);
1675                         domain_exit(domain);
1676                         domain = found;
1677                 } else {
1678                         list_add(&info->link, &domain->devices);
1679                         list_add(&info->global, &device_domain_list);
1680                 }
1681                 spin_unlock_irqrestore(&device_domain_lock, flags);
1682         }
1683
1684 found_domain:
1685         info = alloc_devinfo_mem();
1686         if (!info)
1687                 goto error;
1688         info->segment = segment;
1689         info->bus = pdev->bus->number;
1690         info->devfn = pdev->devfn;
1691         info->dev = pdev;
1692         info->domain = domain;
1693         spin_lock_irqsave(&device_domain_lock, flags);
1694         /* somebody else may have set up the domain already */
1695         found = find_domain(pdev);
1696         if (found != NULL) {
1697                 spin_unlock_irqrestore(&device_domain_lock, flags);
1698                 if (found != domain) {
1699                         domain_exit(domain);
1700                         domain = found;
1701                 }
1702                 free_devinfo_mem(info);
1703                 return domain;
1704         }
1705         list_add(&info->link, &domain->devices);
1706         list_add(&info->global, &device_domain_list);
1707         pdev->dev.archdata.iommu = info;
1708         spin_unlock_irqrestore(&device_domain_lock, flags);
1709         return domain;
1710 error:
1711         /* recheck here; another thread may have set it up meanwhile */
1712         return find_domain(pdev);
1713 }
1714
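/*
 * iommu_prepare_identity_map - set up a 1:1 mapping of [@start, @end) for
 * @pdev: reserve the matching IOVA range in the device's domain, map it
 * read/write, and install the context entry for the device.
 */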
1715 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1716                                       unsigned long long start,
1717                                       unsigned long long end)
1718 {
1719         struct dmar_domain *domain;
1720         unsigned long size;
1721         unsigned long long base;
1722         int ret;
1723
1724         printk(KERN_INFO
1725                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1726                 pci_name(pdev), start, end);
1727         /* page table init */
1728         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1729         if (!domain)
1730                 return -ENOMEM;
1731
1732         /* The address might not be aligned */
1733         base = start & PAGE_MASK;
1734         size = end - base;
1735         size = PAGE_ALIGN(size);
1736         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1737                         IOVA_PFN(base + size) - 1)) {
1738                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1739                 ret = -ENOMEM;
1740                 goto error;
1741         }
1742
1743         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1744                 size, base, pci_name(pdev));
1745         /*
1746          * RMRR range might have overlap with physical memory range,
1747          * clear it first
1748          */
1749         dma_pte_clear_range(domain, base, base + size);
1750
1751         ret = domain_page_mapping(domain, base, base, size,
1752                 DMA_PTE_READ|DMA_PTE_WRITE);
1753         if (ret)
1754                 goto error;
1755
1756         /* context entry init */
1757         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
1758         if (!ret)
1759                 return 0;
1760 error:
1761         domain_exit(domain);
1762         return ret;
1763
1764 }
1765
1766 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1767         struct pci_dev *pdev)
1768 {
1769         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1770                 return 0;
1771         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1772                 rmrr->end_address + 1);
1773 }
1774
1775 #ifdef CONFIG_DMAR_GFX_WA
1776 struct iommu_prepare_data {
1777         struct pci_dev *pdev;
1778         int ret;
1779 };
1780
1781 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1782                                          unsigned long end_pfn, void *datax)
1783 {
1784         struct iommu_prepare_data *data;
1785
1786         data = (struct iommu_prepare_data *)datax;
1787
1788         data->ret = iommu_prepare_identity_map(data->pdev,
1789                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1790         return data->ret;
1791
1792 }
1793
1794 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1795 {
1796         int nid;
1797         struct iommu_prepare_data data;
1798
1799         data.pdev = pdev;
1800         data.ret = 0;
1801
1802         for_each_online_node(nid) {
1803                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1804                 if (data.ret)
1805                         return data.ret;
1806         }
1807         return data.ret;
1808 }
1809
1810 static void __init iommu_prepare_gfx_mapping(void)
1811 {
1812         struct pci_dev *pdev = NULL;
1813         int ret;
1814
1815         for_each_pci_dev(pdev) {
1816                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1817                                 !IS_GFX_DEVICE(pdev))
1818                         continue;
1819                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1820                         pci_name(pdev));
1821                 ret = iommu_prepare_with_active_regions(pdev);
1822                 if (ret)
1823                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1824         }
1825 }
1826 #else /* !CONFIG_DMAR_GFX_WA */
1827 static inline void iommu_prepare_gfx_mapping(void)
1828 {
1829         return;
1830 }
1831 #endif
1832
1833 #ifdef CONFIG_DMAR_FLOPPY_WA
1834 static inline void iommu_prepare_isa(void)
1835 {
1836         struct pci_dev *pdev;
1837         int ret;
1838
1839         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1840         if (!pdev)
1841                 return;
1842
1843         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1844         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1845
1846         if (ret)
1847                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1848                         "floppy might not work\n");
1849
1850 }
1851 #else
1852 static inline void iommu_prepare_isa(void)
1853 {
1854         return;
1855 }
1856 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1857
1858 /* Initialize each context entry as pass through. */
1859 static int __init init_context_pass_through(void)
1860 {
1861         struct pci_dev *pdev = NULL;
1862         struct dmar_domain *domain;
1863         int ret;
1864
1865         for_each_pci_dev(pdev) {
1866                 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1867                 ret = domain_context_mapping(domain, pdev,
1868                                              CONTEXT_TT_PASS_THROUGH);
1869                 if (ret)
1870                         return ret;
1871         }
1872         return 0;
1873 }
1874
1875 static int __init init_dmars(void)
1876 {
1877         struct dmar_drhd_unit *drhd;
1878         struct dmar_rmrr_unit *rmrr;
1879         struct pci_dev *pdev;
1880         struct intel_iommu *iommu;
1881         int i, ret;
1882         int pass_through = 1;
1883
1884         /*
1885          * for each drhd
1886          *    allocate root
1887          *    initialize and program root entry to not present
1888          * endfor
1889          */
1890         for_each_drhd_unit(drhd) {
1891                 g_num_of_iommus++;
1892                 /*
1893                  * lock not needed as this is only incremented in the
1894                  * single-threaded kernel __init code path; all other
1895                  * accesses are read only
1896                  */
1897         }
1898
1899         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1900                         GFP_KERNEL);
1901         if (!g_iommus) {
1902                 printk(KERN_ERR "Allocating global iommu array failed\n");
1903                 ret = -ENOMEM;
1904                 goto error;
1905         }
1906
1907         deferred_flush = kzalloc(g_num_of_iommus *
1908                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1909         if (!deferred_flush) {
1910                 kfree(g_iommus);
1911                 ret = -ENOMEM;
1912                 goto error;
1913         }
1914
1915         for_each_drhd_unit(drhd) {
1916                 if (drhd->ignored)
1917                         continue;
1918
1919                 iommu = drhd->iommu;
1920                 g_iommus[iommu->seq_id] = iommu;
1921
1922                 ret = iommu_init_domains(iommu);
1923                 if (ret)
1924                         goto error;
1925
1926                 /*
1927                  * TBD:
1928                  * we could share the same root & context tables
1929                  * among all IOMMUs. Need to split it later.
1930                  */
1931                 ret = iommu_alloc_root_entry(iommu);
1932                 if (ret) {
1933                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1934                         goto error;
1935                 }
1936                 if (!ecap_pass_through(iommu->ecap))
1937                         pass_through = 0;
1938         }
1939         if (iommu_pass_through)
1940                 if (!pass_through) {
1941                         printk(KERN_INFO
1942                                "Pass Through is not supported by hardware.\n");
1943                         iommu_pass_through = 0;
1944                 }
1945
1946         /*
1947          * Start from a sane iommu hardware state.
1948          */
1949         for_each_drhd_unit(drhd) {
1950                 if (drhd->ignored)
1951                         continue;
1952
1953                 iommu = drhd->iommu;
1954
1955                 /*
1956                  * If the queued invalidation is already initialized by us
1957                  * (for example, while enabling interrupt-remapping) then
1958                  * things are already rolling from a sane state.
1959                  */
1960                 if (iommu->qi)
1961                         continue;
1962
1963                 /*
1964                  * Clear any previous faults.
1965                  */
1966                 dmar_fault(-1, iommu);
1967                 /*
1968                  * Disable queued invalidation if supported and already enabled
1969                  * before OS handover.
1970                  */
1971                 dmar_disable_qi(iommu);
1972         }
1973
1974         for_each_drhd_unit(drhd) {
1975                 if (drhd->ignored)
1976                         continue;
1977
1978                 iommu = drhd->iommu;
1979
1980                 if (dmar_enable_qi(iommu)) {
1981                         /*
1982                          * Queued Invalidate not enabled, use Register Based
1983                          * Invalidate
1984                          */
1985                         iommu->flush.flush_context = __iommu_flush_context;
1986                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1987                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1988                                "invalidation\n",
1989                                (unsigned long long)drhd->reg_base_addr);
1990                 } else {
1991                         iommu->flush.flush_context = qi_flush_context;
1992                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1993                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1994                                "invalidation\n",
1995                                (unsigned long long)drhd->reg_base_addr);
1996                 }
1997         }
1998
1999 #ifdef CONFIG_INTR_REMAP
2000         if (!intr_remapping_enabled) {
2001                 ret = enable_intr_remapping(0);
2002                 if (ret)
2003                         printk(KERN_ERR
2004                                "IOMMU: enable interrupt remapping failed\n");
2005         }
2006 #endif
2007         /*
2008          * If pass through is set and enabled, context entries of all pci
2009          * devices are initialized with the pass through translation type.
2010          */
2011         if (iommu_pass_through) {
2012                 ret = init_context_pass_through();
2013                 if (ret) {
2014                         printk(KERN_ERR "IOMMU: Pass through init failed.\n");
2015                         iommu_pass_through = 0;
2016                 }
2017         }
2018
2019         /*
2020          * If pass through is not set or not enabled, setup context entries for
2021          * identity mappings for rmrr, gfx, and isa.
2022          */
2023         if (!iommu_pass_through) {
2024                 /*
2025                  * For each rmrr
2026                  *   for each dev attached to rmrr
2027                  *   do
2028                  *     locate drhd for dev, alloc domain for dev
2029                  *     allocate free domain
2030                  *     allocate page table entries for rmrr
2031                  *     if context not allocated for bus
2032                  *           allocate and init context
2033                  *           set present in root table for this bus
2034                  *     init context with domain, translation etc
2035                  *    endfor
2036                  * endfor
2037                  */
2038                 for_each_rmrr_units(rmrr) {
2039                         for (i = 0; i < rmrr->devices_cnt; i++) {
2040                                 pdev = rmrr->devices[i];
2041                                 /*
2042                                  * some BIOSes list non-existent devices in
2043                                  * the DMAR table.
2044                                  */
2045                                 if (!pdev)
2046                                         continue;
2047                                 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2048                                 if (ret)
2049                                         printk(KERN_ERR
2050                                  "IOMMU: mapping reserved region failed\n");
2051                         }
2052                 }
2053
2054                 iommu_prepare_gfx_mapping();
2055
2056                 iommu_prepare_isa();
2057         }
2058
2059         /*
2060          * for each drhd
2061          *   enable fault log
2062          *   global invalidate context cache
2063          *   global invalidate iotlb
2064          *   enable translation
2065          */
2066         for_each_drhd_unit(drhd) {
2067                 if (drhd->ignored)
2068                         continue;
2069                 iommu = drhd->iommu;
2070
2071                 iommu_flush_write_buffer(iommu);
2072
2073                 ret = dmar_set_interrupt(iommu);
2074                 if (ret)
2075                         goto error;
2076
2077                 iommu_set_root_entry(iommu);
2078
2079                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2080                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2081                 iommu_disable_protect_mem_regions(iommu);
2082
2083                 ret = iommu_enable_translation(iommu);
2084                 if (ret)
2085                         goto error;
2086         }
2087
2088         return 0;
2089 error:
2090         for_each_drhd_unit(drhd) {
2091                 if (drhd->ignored)
2092                         continue;
2093                 iommu = drhd->iommu;
2094                 free_iommu(iommu);
2095         }
2096         kfree(g_iommus);
2097         return ret;
2098 }
2099
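/*
 * aligned_size - size in bytes of the whole pages needed to cover
 * [host_addr, host_addr + size).  For example, with 4 KiB pages,
 * host_addr = 0x1234 and size = 0x100 fit within one page and yield
 * 0x1000, while host_addr = 0xff0 and size = 0x100 straddle a page
 * boundary and yield 0x2000.
 */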
2100 static inline u64 aligned_size(u64 host_addr, size_t size)
2101 {
2102         u64 addr;
2103         addr = (host_addr & (~PAGE_MASK)) + size;
2104         return PAGE_ALIGN(addr);
2105 }
2106
2107 struct iova *
2108 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2109 {
2110         struct iova *piova;
2111
2112         /* Make sure it's in range */
2113         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2114         if (!size || (IOVA_START_ADDR + size > end))
2115                 return NULL;
2116
2117         piova = alloc_iova(&domain->iovad,
2118                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2119         return piova;
2120 }
2121
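/*
 * __intel_alloc_iova - allocate an IOVA range of @size bytes for @dev.
 * Devices limited to 32-bit DMA (or all devices when dmar_forcedac is
 * set) allocate directly against @dma_mask; otherwise the 32-bit space
 * is tried first and the full mask is used only as a fallback.
 */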
2122 static struct iova *
2123 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2124                    size_t size, u64 dma_mask)
2125 {
2126         struct pci_dev *pdev = to_pci_dev(dev);
2127         struct iova *iova = NULL;
2128
2129         if (dma_mask <= DMA_BIT_MASK(32) || dmar_forcedac)
2130                 iova = iommu_alloc_iova(domain, size, dma_mask);
2131         else {
2132                 /*
2133                  * First try to allocate an io virtual address in
2134                  * DMA_BIT_MASK(32) and if that fails then try allocating
2135                  * from higher range
2136                  */
2137                 iova = iommu_alloc_iova(domain, size, DMA_BIT_MASK(32));
2138                 if (!iova)
2139                         iova = iommu_alloc_iova(domain, size, dma_mask);
2140         }
2141
2142         if (!iova) {
2143                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2144                 return NULL;
2145         }
2146
2147         return iova;
2148 }
2149
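/*
 * get_valid_domain_for_dev - return the device's DMA-remapping domain,
 * allocating it and installing the context mapping on first use.
 */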
2150 static struct dmar_domain *
2151 get_valid_domain_for_dev(struct pci_dev *pdev)
2152 {
2153         struct dmar_domain *domain;
2154         int ret;
2155
2156         domain = get_domain_for_dev(pdev,
2157                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2158         if (!domain) {
2159                 printk(KERN_ERR
2160                         "Allocating domain for %s failed\n", pci_name(pdev));
2161                 return NULL;
2162         }
2163
2164         /* make sure context mapping is ok */
2165         if (unlikely(!domain_context_mapped(pdev))) {
2166                 ret = domain_context_mapping(domain, pdev,
2167                                              CONTEXT_TT_MULTI_LEVEL);
2168                 if (ret) {
2169                         printk(KERN_ERR
2170                                 "Domain context map for %s failed\n",
2171                                 pci_name(pdev));
2172                         return NULL;
2173                 }
2174         }
2175
2176         return domain;
2177 }
2178
2179 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2180                                      size_t size, int dir, u64 dma_mask)
2181 {
2182         struct pci_dev *pdev = to_pci_dev(hwdev);
2183         struct dmar_domain *domain;
2184         phys_addr_t start_paddr;
2185         struct iova *iova;
2186         int prot = 0;
2187         int ret;
2188         struct intel_iommu *iommu;
2189
2190         BUG_ON(dir == DMA_NONE);
2191         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2192                 return paddr;
2193
2194         domain = get_valid_domain_for_dev(pdev);
2195         if (!domain)
2196                 return 0;
2197
2198         iommu = domain_get_iommu(domain);
2199         size = aligned_size((u64)paddr, size);
2200
2201         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2202         if (!iova)
2203                 goto error;
2204
2205         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2206
2207         /*
2208          * Check if DMAR supports zero-length reads on write only
2209          * mappings.
2210          */
2211         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2212                         !cap_zlr(iommu->cap))
2213                 prot |= DMA_PTE_READ;
2214         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2215                 prot |= DMA_PTE_WRITE;
2216         /*
2217          * The range paddr .. paddr + size might cover only part of a page,
2218          * but we must map whole pages.  Note: if two parts of one page are
2219          * mapped separately, we might end up with two guest addresses
2220          * mapping to the same host paddr, but this is not a big problem
2221          */
2222         ret = domain_page_mapping(domain, start_paddr,
2223                 ((u64)paddr) & PAGE_MASK, size, prot);
2224         if (ret)
2225                 goto error;
2226
2227         /* it's a non-present to present mapping; only flush the IOTLB if the IOMMU is in caching mode */
2228         if (cap_caching_mode(iommu->cap))
2229                 iommu_flush_iotlb_psi(iommu, 0, start_paddr,
2230                                       size >> VTD_PAGE_SHIFT);
2231         else
2232                 iommu_flush_write_buffer(iommu);
2233
2234         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2235
2236 error:
2237         if (iova)
2238                 __free_iova(&domain->iovad, iova);
2239         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2240                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2241         return 0;
2242 }
2243
2244 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2245                                  unsigned long offset, size_t size,
2246                                  enum dma_data_direction dir,
2247                                  struct dma_attrs *attrs)
2248 {
2249         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2250                                   dir, to_pci_dev(dev)->dma_mask);
2251 }
2252
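/*
 * flush_unmaps - drain the deferred-unmap queues: issue one global IOTLB
 * flush per IOMMU that has pending entries, then free the queued IOVAs.
 * Called with async_umap_flush_lock held.
 */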
2253 static void flush_unmaps(void)
2254 {
2255         int i, j;
2256
2257         timer_on = 0;
2258
2259         /* just flush them all */
2260         for (i = 0; i < g_num_of_iommus; i++) {
2261                 struct intel_iommu *iommu = g_iommus[i];
2262                 if (!iommu)
2263                         continue;
2264
2265                 if (deferred_flush[i].next) {
2266                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2267                                                  DMA_TLB_GLOBAL_FLUSH);
2268                         for (j = 0; j < deferred_flush[i].next; j++) {
2269                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2270                                                 deferred_flush[i].iova[j]);
2271                         }
2272                         deferred_flush[i].next = 0;
2273                 }
2274         }
2275
2276         list_size = 0;
2277 }
2278
2279 static void flush_unmaps_timeout(unsigned long data)
2280 {
2281         unsigned long flags;
2282
2283         spin_lock_irqsave(&async_umap_flush_lock, flags);
2284         flush_unmaps();
2285         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2286 }
2287
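/*
 * add_unmap - queue an IOVA for deferred freeing on its IOMMU's list.
 * The queue is drained when the unmap timer (armed here for ~10ms) fires,
 * or immediately once HIGH_WATER_MARK entries have accumulated.
 */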
2288 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2289 {
2290         unsigned long flags;
2291         int next, iommu_id;
2292         struct intel_iommu *iommu;
2293
2294         spin_lock_irqsave(&async_umap_flush_lock, flags);
2295         if (list_size == HIGH_WATER_MARK)
2296                 flush_unmaps();
2297
2298         iommu = domain_get_iommu(dom);
2299         iommu_id = iommu->seq_id;
2300
2301         next = deferred_flush[iommu_id].next;
2302         deferred_flush[iommu_id].domain[next] = dom;
2303         deferred_flush[iommu_id].iova[next] = iova;
2304         deferred_flush[iommu_id].next++;
2305
2306         if (!timer_on) {
2307                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2308                 timer_on = 1;
2309         }
2310         list_size++;
2311         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2312 }
2313
2314 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2315                              size_t size, enum dma_data_direction dir,
2316                              struct dma_attrs *attrs)
2317 {
2318         struct pci_dev *pdev = to_pci_dev(dev);
2319         struct dmar_domain *domain;
2320         unsigned long start_addr;
2321         struct iova *iova;
2322         struct intel_iommu *iommu;
2323
2324         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2325                 return;
2326         domain = find_domain(pdev);
2327         BUG_ON(!domain);
2328
2329         iommu = domain_get_iommu(domain);
2330
2331         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2332         if (!iova)
2333                 return;
2334
2335         start_addr = iova->pfn_lo << PAGE_SHIFT;
2336         size = aligned_size((u64)dev_addr, size);
2337
2338         pr_debug("Device %s unmapping: %zx@%llx\n",
2339                 pci_name(pdev), size, (unsigned long long)start_addr);
2340
2341         /*  clear the whole page */
2342         dma_pte_clear_range(domain, start_addr, start_addr + size);
2343         /* free page tables */
2344         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2345         if (intel_iommu_strict) {
2346                 iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2347                                       size >> VTD_PAGE_SHIFT);
2348                 /* free iova */
2349                 __free_iova(&domain->iovad, iova);
2350         } else {
2351                 add_unmap(domain, iova);
2352                 /*
2353                  * queue up the release of the unmap to save roughly 1/6th of
2354                  * the cpu time otherwise spent on the iotlb flush operation...
2355                  */
2356         }
2357 }
2358
2359 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2360                                int dir)
2361 {
2362         intel_unmap_page(dev, dev_addr, size, dir, NULL);
2363 }
2364
2365 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2366                                   dma_addr_t *dma_handle, gfp_t flags)
2367 {
2368         void *vaddr;
2369         int order;
2370
2371         size = PAGE_ALIGN(size);
2372         order = get_order(size);
2373         flags &= ~(GFP_DMA | GFP_DMA32);
2374
2375         vaddr = (void *)__get_free_pages(flags, order);
2376         if (!vaddr)
2377                 return NULL;
2378         memset(vaddr, 0, size);
2379
2380         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2381                                          DMA_BIDIRECTIONAL,
2382                                          hwdev->coherent_dma_mask);
2383         if (*dma_handle)
2384                 return vaddr;
2385         free_pages((unsigned long)vaddr, order);
2386         return NULL;
2387 }
2388
2389 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2390                                 dma_addr_t dma_handle)
2391 {
2392         int order;
2393
2394         size = PAGE_ALIGN(size);
2395         order = get_order(size);
2396
2397         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2398         free_pages((unsigned long)vaddr, order);
2399 }
2400
2401 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2402                            int nelems, enum dma_data_direction dir,
2403                            struct dma_attrs *attrs)
2404 {
2405         int i;
2406         struct pci_dev *pdev = to_pci_dev(hwdev);
2407         struct dmar_domain *domain;
2408         unsigned long start_addr;
2409         struct iova *iova;
2410         size_t size = 0;
2411         phys_addr_t addr;
2412         struct scatterlist *sg;
2413         struct intel_iommu *iommu;
2414
2415         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2416                 return;
2417
2418         domain = find_domain(pdev);
2419         BUG_ON(!domain);
2420
2421         iommu = domain_get_iommu(domain);
2422
2423         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2424         if (!iova)
2425                 return;
2426         for_each_sg(sglist, sg, nelems, i) {
2427                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2428                 size += aligned_size((u64)addr, sg->length);
2429         }
2430
2431         start_addr = iova->pfn_lo << PAGE_SHIFT;
2432
2433         /*  clear the whole page */
2434         dma_pte_clear_range(domain, start_addr, start_addr + size);
2435         /* free page tables */
2436         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2437
2438         iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2439                               size >> VTD_PAGE_SHIFT);
2440
2441         /* free iova */
2442         __free_iova(&domain->iovad, iova);
2443 }
2444
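/*
 * For devices that bypass DMA remapping (dummy domain info), hand back
 * the physical addresses of the scatterlist pages unchanged.
 */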
2445 static int intel_nontranslate_map_sg(struct device *hddev,
2446         struct scatterlist *sglist, int nelems, int dir)
2447 {
2448         int i;
2449         struct scatterlist *sg;
2450
2451         for_each_sg(sglist, sg, nelems, i) {
2452                 BUG_ON(!sg_page(sg));
2453                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2454                 sg->dma_length = sg->length;
2455         }
2456         return nelems;
2457 }
2458
2459 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2460                         enum dma_data_direction dir, struct dma_attrs *attrs)
2461 {
2462         phys_addr_t addr;
2463         int i;
2464         struct pci_dev *pdev = to_pci_dev(hwdev);
2465         struct dmar_domain *domain;
2466         size_t size = 0;
2467         int prot = 0;
2468         size_t offset = 0;
2469         struct iova *iova = NULL;
2470         int ret;
2471         struct scatterlist *sg;
2472         unsigned long start_addr;
2473         struct intel_iommu *iommu;
2474
2475         BUG_ON(dir == DMA_NONE);
2476         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2477                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2478
2479         domain = get_valid_domain_for_dev(pdev);
2480         if (!domain)
2481                 return 0;
2482
2483         iommu = domain_get_iommu(domain);
2484
2485         for_each_sg(sglist, sg, nelems, i) {
2486                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2487                 size += aligned_size((u64)addr, sg->length);
2488         }
2489
2490         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2491         if (!iova) {
2492                 sglist->dma_length = 0;
2493                 return 0;
2494         }
2495
2496         /*
2497          * Check if DMAR supports zero-length reads on write only
2498          * mappings..
2499          * mappings.
2500         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2501                         !cap_zlr(iommu->cap))
2502                 prot |= DMA_PTE_READ;
2503         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2504                 prot |= DMA_PTE_WRITE;
2505
2506         start_addr = iova->pfn_lo << PAGE_SHIFT;
2507         offset = 0;
2508         for_each_sg(sglist, sg, nelems, i) {
2509                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2510                 size = aligned_size((u64)addr, sg->length);
2511                 ret = domain_page_mapping(domain, start_addr + offset,
2512                         ((u64)addr) & PAGE_MASK,
2513                         size, prot);
2514                 if (ret) {
2515                         /*  clear the page */
2516                         dma_pte_clear_range(domain, start_addr,
2517                                   start_addr + offset);
2518                         /* free page tables */
2519                         dma_pte_free_pagetable(domain, start_addr,
2520                                   start_addr + offset);
2521                         /* free iova */
2522                         __free_iova(&domain->iovad, iova);
2523                         return 0;
2524                 }
2525                 sg->dma_address = start_addr + offset +
2526                                 ((u64)addr & (~PAGE_MASK));
2527                 sg->dma_length = sg->length;
2528                 offset += size;
2529         }
2530
2531         /* it's a non-present to present mapping; only flush the IOTLB if the IOMMU is in caching mode */
2532         if (cap_caching_mode(iommu->cap))
2533                 iommu_flush_iotlb_psi(iommu, 0, start_addr,
2534                                       offset >> VTD_PAGE_SHIFT);
2535         else
2536                 iommu_flush_write_buffer(iommu);
2537
2538         return nelems;
2539 }
2540
2541 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2542 {
2543         return !dma_addr;
2544 }
2545
2546 struct dma_map_ops intel_dma_ops = {
2547         .alloc_coherent = intel_alloc_coherent,
2548         .free_coherent = intel_free_coherent,
2549         .map_sg = intel_map_sg,
2550         .unmap_sg = intel_unmap_sg,
2551         .map_page = intel_map_page,
2552         .unmap_page = intel_unmap_page,
2553         .mapping_error = intel_mapping_error,
2554 };
2555
2556 static inline int iommu_domain_cache_init(void)
2557 {
2558         int ret = 0;
2559
2560         iommu_domain_cache = kmem_cache_create("iommu_domain",
2561                                          sizeof(struct dmar_domain),
2562                                          0,
2563                                          SLAB_HWCACHE_ALIGN,
2564                                          NULL);
2566         if (!iommu_domain_cache) {
2567                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2568                 ret = -ENOMEM;
2569         }
2570
2571         return ret;
2572 }
2573
2574 static inline int iommu_devinfo_cache_init(void)
2575 {
2576         int ret = 0;
2577
2578         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2579                                          sizeof(struct device_domain_info),
2580                                          0,
2581                                          SLAB_HWCACHE_ALIGN,
2582                                          NULL);
2583         if (!iommu_devinfo_cache) {
2584                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2585                 ret = -ENOMEM;
2586         }
2587
2588         return ret;
2589 }
2590
2591 static inline int iommu_iova_cache_init(void)
2592 {
2593         int ret = 0;
2594
2595         iommu_iova_cache = kmem_cache_create("iommu_iova",
2596                                          sizeof(struct iova),
2597                                          0,
2598                                          SLAB_HWCACHE_ALIGN,
2599                                          NULL);
2600         if (!iommu_iova_cache) {
2601                 printk(KERN_ERR "Couldn't create iova cache\n");
2602                 ret = -ENOMEM;
2603         }
2604
2605         return ret;
2606 }
2607
2608 static int __init iommu_init_mempool(void)
2609 {
2610         int ret;
2611         ret = iommu_iova_cache_init();
2612         if (ret)
2613                 return ret;
2614
2615         ret = iommu_domain_cache_init();
2616         if (ret)
2617                 goto domain_error;
2618
2619         ret = iommu_devinfo_cache_init();
2620         if (!ret)
2621                 return ret;
2622
2623         kmem_cache_destroy(iommu_domain_cache);
2624 domain_error:
2625         kmem_cache_destroy(iommu_iova_cache);
2626
2627         return -ENOMEM;
2628 }
2629
2630 static void __init iommu_exit_mempool(void)
2631 {
2632         kmem_cache_destroy(iommu_devinfo_cache);
2633         kmem_cache_destroy(iommu_domain_cache);
2634         kmem_cache_destroy(iommu_iova_cache);
2635
2636 }
2637
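/*
 * init_no_remapping_devices - mark DRHD units that cover no PCI devices
 * as ignored; when gfx mapping is disabled, also ignore units that cover
 * only graphics devices and make those devices bypass DMA remapping via
 * DUMMY_DEVICE_DOMAIN_INFO.
 */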
2638 static void __init init_no_remapping_devices(void)
2639 {
2640         struct dmar_drhd_unit *drhd;
2641
2642         for_each_drhd_unit(drhd) {
2643                 if (!drhd->include_all) {
2644                         int i;
2645                         for (i = 0; i < drhd->devices_cnt; i++)
2646                                 if (drhd->devices[i] != NULL)
2647                                         break;
2648                         /* ignore DMAR unit if no pci devices exist */
2649                         if (i == drhd->devices_cnt)
2650                                 drhd->ignored = 1;
2651                 }
2652         }
2653
2654         if (dmar_map_gfx)
2655                 return;
2656
2657         for_each_drhd_unit(drhd) {
2658                 int i;
2659                 if (drhd->ignored || drhd->include_all)
2660                         continue;
2661
2662                 for (i = 0; i < drhd->devices_cnt; i++)
2663                         if (drhd->devices[i] &&
2664                                 !IS_GFX_DEVICE(drhd->devices[i]))
2665                                 break;
2666
2667                 if (i < drhd->devices_cnt)
2668                         continue;
2669
2670                 /* bypass IOMMU if it is just for gfx devices */
2671                 drhd->ignored = 1;
2672                 for (i = 0; i < drhd->devices_cnt; i++) {
2673                         if (!drhd->devices[i])
2674                                 continue;
2675                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2676                 }
2677         }
2678 }
2679
2680 #ifdef CONFIG_SUSPEND
2681 static int init_iommu_hw(void)
2682 {
2683         struct dmar_drhd_unit *drhd;
2684         struct intel_iommu *iommu = NULL;
2685
2686         for_each_active_iommu(iommu, drhd)
2687                 if (iommu->qi)
2688                         dmar_reenable_qi(iommu);
2689
2690         for_each_active_iommu(iommu, drhd) {
2691                 iommu_flush_write_buffer(iommu);
2692
2693                 iommu_set_root_entry(iommu);
2694
2695                 iommu->flush.flush_context(iommu, 0, 0, 0,
2696                                            DMA_CCMD_GLOBAL_INVL);
2697                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2698                                          DMA_TLB_GLOBAL_FLUSH);
2699                 iommu_disable_protect_mem_regions(iommu);
2700                 iommu_enable_translation(iommu);
2701         }
2702
2703         return 0;
2704 }
2705
2706 static void iommu_flush_all(void)
2707 {
2708         struct dmar_drhd_unit *drhd;
2709         struct intel_iommu *iommu;
2710
2711         for_each_active_iommu(iommu, drhd) {
2712                 iommu->flush.flush_context(iommu, 0, 0, 0,
2713                                            DMA_CCMD_GLOBAL_INVL);
2714                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2715                                          DMA_TLB_GLOBAL_FLUSH);
2716         }
2717 }
2718
2719 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
2720 {
2721         struct dmar_drhd_unit *drhd;
2722         struct intel_iommu *iommu = NULL;
2723         unsigned long flag;
2724
2725         for_each_active_iommu(iommu, drhd) {
2726                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
2727                                                  GFP_ATOMIC);
2728                 if (!iommu->iommu_state)
2729                         goto nomem;
2730         }
2731
2732         iommu_flush_all();
2733
2734         for_each_active_iommu(iommu, drhd) {
2735                 iommu_disable_translation(iommu);
2736
2737                 spin_lock_irqsave(&iommu->register_lock, flag);
2738
2739                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2740                         readl(iommu->reg + DMAR_FECTL_REG);
2741                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2742                         readl(iommu->reg + DMAR_FEDATA_REG);
2743                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2744                         readl(iommu->reg + DMAR_FEADDR_REG);
2745                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2746                         readl(iommu->reg + DMAR_FEUADDR_REG);
2747
2748                 spin_unlock_irqrestore(&iommu->register_lock, flag);
2749         }
2750         return 0;
2751
2752 nomem:
2753         for_each_active_iommu(iommu, drhd)
2754                 kfree(iommu->iommu_state);
2755
2756         return -ENOMEM;
2757 }
2758
2759 static int iommu_resume(struct sys_device *dev)
2760 {
2761         struct dmar_drhd_unit *drhd;
2762         struct intel_iommu *iommu = NULL;
2763         unsigned long flag;
2764
2765         if (init_iommu_hw()) {
2766                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2767                 return -EIO;
2768         }
2769
2770         for_each_active_iommu(iommu, drhd) {
2771
2772                 spin_lock_irqsave(&iommu->register_lock, flag);
2773
2774                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2775                         iommu->reg + DMAR_FECTL_REG);
2776                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2777                         iommu->reg + DMAR_FEDATA_REG);
2778                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2779                         iommu->reg + DMAR_FEADDR_REG);
2780                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2781                         iommu->reg + DMAR_FEUADDR_REG);
2782
2783                 spin_unlock_irqrestore(&iommu->register_lock, flag);
2784         }
2785
2786         for_each_active_iommu(iommu, drhd)
2787                 kfree(iommu->iommu_state);
2788
2789         return 0;
2790 }
2791
2792 static struct sysdev_class iommu_sysclass = {
2793         .name           = "iommu",
2794         .resume         = iommu_resume,
2795         .suspend        = iommu_suspend,
2796 };
2797
2798 static struct sys_device device_iommu = {
2799         .cls    = &iommu_sysclass,
2800 };
2801
2802 static int __init init_iommu_sysfs(void)
2803 {
2804         int error;
2805
2806         error = sysdev_class_register(&iommu_sysclass);
2807         if (error)
2808                 return error;
2809
2810         error = sysdev_register(&device_iommu);
2811         if (error)
2812                 sysdev_class_unregister(&iommu_sysclass);
2813
2814         return error;
2815 }
2816
2817 #else
2818 static int __init init_iommu_sysfs(void)
2819 {
2820         return 0;
2821 }
2822 #endif  /* CONFIG_SUSPEND */
2823
2824 int __init intel_iommu_init(void)
2825 {
2826         int ret = 0;
2827
2828         if (dmar_table_init())
2829                 return  -ENODEV;
2830
2831         if (dmar_dev_scope_init())
2832                 return  -ENODEV;
2833
2834         /*
2835          * Check the need for DMA-remapping initialization now.
2836          * Above initialization will also be used by Interrupt-remapping.
2837          */
2838         if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled)
2839                 return -ENODEV;
2840
2841         iommu_init_mempool();
2842         dmar_init_reserved_ranges();
2843
2844         init_no_remapping_devices();
2845
2846         ret = init_dmars();
2847         if (ret) {
2848                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2849                 put_iova_domain(&reserved_iova_list);
2850                 iommu_exit_mempool();
2851                 return ret;
2852         }
2853         printk(KERN_INFO
2854         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2855
2856         init_timer(&unmap_timer);
2857         force_iommu = 1;
2858
2859         if (!iommu_pass_through) {
2860                 printk(KERN_INFO
2861                        "Multi-level page-table translation for DMAR.\n");
2862                 dma_ops = &intel_dma_ops;
2863         } else
2864                 printk(KERN_INFO
2865                        "DMAR: Pass through translation for DMAR.\n");
2866
2867         init_iommu_sysfs();
2868
2869         register_iommu(&intel_iommu_ops);
2870
2871         return 0;
2872 }
2873
2874 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2875                                   struct pci_dev *pdev)
2876 {
2877         struct device_domain_info *info;
2878         unsigned long flags;
2879
2880         info = alloc_devinfo_mem();
2881         if (!info)
2882                 return -ENOMEM;
2883
2884         info->segment = pci_domain_nr(pdev->bus);
2885         info->bus = pdev->bus->number;
2886         info->devfn = pdev->devfn;
2887         info->dev = pdev;
2888         info->domain = domain;
2889
2890         spin_lock_irqsave(&device_domain_lock, flags);
2891         list_add(&info->link, &domain->devices);
2892         list_add(&info->global, &device_domain_list);
2893         pdev->dev.archdata.iommu = info;
2894         spin_unlock_irqrestore(&device_domain_lock, flags);
2895
2896         return 0;
2897 }
2898
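/*
 * iommu_detach_dependent_devices - for a device behind a PCIe-to-PCI
 * bridge, also clear the context entries of every bridge on the path up
 * to, and including, the upstream bridge itself.
 */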
2899 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
2900                                            struct pci_dev *pdev)
2901 {
2902         struct pci_dev *tmp, *parent;
2903
2904         if (!iommu || !pdev)
2905                 return;
2906
2907         /* dependent device detach */
2908         tmp = pci_find_upstream_pcie_bridge(pdev);
2909         /* Secondary interface's bus number and devfn 0 */
2910         if (tmp) {
2911                 parent = pdev->bus->self;
2912                 while (parent != tmp) {
2913                         iommu_detach_dev(iommu, parent->bus->number,
2914                                          parent->devfn);
2915                         parent = parent->bus->self;
2916                 }
2917                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
2918                         iommu_detach_dev(iommu,
2919                                 tmp->subordinate->number, 0);
2920                 else /* this is a legacy PCI bridge */
2921                         iommu_detach_dev(iommu, tmp->bus->number,
2922                                          tmp->devfn);
2923         }
2924 }
2925
2926 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2927                                           struct pci_dev *pdev)
2928 {
2929         struct device_domain_info *info;
2930         struct intel_iommu *iommu;
2931         unsigned long flags;
2932         int found = 0;
2933         struct list_head *entry, *tmp;
2934
2935         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
2936                                 pdev->devfn);
2937         if (!iommu)
2938                 return;
2939
2940         spin_lock_irqsave(&device_domain_lock, flags);
2941         list_for_each_safe(entry, tmp, &domain->devices) {
2942                 info = list_entry(entry, struct device_domain_info, link);
2943                 /* No need to compare PCI domain; it has to be the same */
2944                 if (info->bus == pdev->bus->number &&
2945                     info->devfn == pdev->devfn) {
2946                         list_del(&info->link);
2947                         list_del(&info->global);
2948                         if (info->dev)
2949                                 info->dev->dev.archdata.iommu = NULL;
2950                         spin_unlock_irqrestore(&device_domain_lock, flags);
2951
2952                         iommu_detach_dev(iommu, info->bus, info->devfn);
2953                         iommu_detach_dependent_devices(iommu, pdev);
2954                         free_devinfo_mem(info);
2955
2956                         spin_lock_irqsave(&device_domain_lock, flags);
2957
2958                         if (found)
2959                                 break;
2960                         else
2961                                 continue;
2962                 }
2963
2964                 /* if there are no other devices under the same iommu
2965                  * owned by this domain, clear this iommu in iommu_bmp
2966                  * and update the iommu count and coherency
2967                  */
2968                 if (iommu == device_to_iommu(info->segment, info->bus,
2969                                             info->devfn))
2970                         found = 1;
2971         }
2972
2973         if (found == 0) {
2974                 unsigned long tmp_flags;
2975                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2976                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2977                 domain->iommu_count--;
2978                 domain_update_iommu_cap(domain);
2979                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2980         }
2981
2982         spin_unlock_irqrestore(&device_domain_lock, flags);
2983 }
2984
2985 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2986 {
2987         struct device_domain_info *info;
2988         struct intel_iommu *iommu;
2989         unsigned long flags1, flags2;
2990
2991         spin_lock_irqsave(&device_domain_lock, flags1);
2992         while (!list_empty(&domain->devices)) {
2993                 info = list_entry(domain->devices.next,
2994                         struct device_domain_info, link);
2995                 list_del(&info->link);
2996                 list_del(&info->global);
2997                 if (info->dev)
2998                         info->dev->dev.archdata.iommu = NULL;
2999
3000                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3001
3002                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3003                 iommu_detach_dev(iommu, info->bus, info->devfn);
3004                 iommu_detach_dependent_devices(iommu, info->dev);
3005
3006                 /* clear this iommu in iommu_bmp, update iommu count
3007                  * and capabilities
3008                  */
3009                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3010                 if (test_and_clear_bit(iommu->seq_id,
3011                                        &domain->iommu_bmp)) {
3012                         domain->iommu_count--;
3013                         domain_update_iommu_cap(domain);
3014                 }
3015                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3016
3017                 free_devinfo_mem(info);
3018                 spin_lock_irqsave(&device_domain_lock, flags1);
3019         }
3020         spin_unlock_irqrestore(&device_domain_lock, flags1);
3021 }
3022
3023 /* domain id for virtual machine, it won't be set in context */
3024 static unsigned long vm_domid;
3025
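/*
 * vm_domain_min_agaw - return the smallest adjusted guest address width
 * among all IOMMUs this domain is currently attached to.
 */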
3026 static int vm_domain_min_agaw(struct dmar_domain *domain)
3027 {
3028         int i;
3029         int min_agaw = domain->agaw;
3030
3031         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
3032         for (; i < g_num_of_iommus; ) {
3033                 if (min_agaw > g_iommus[i]->agaw)
3034                         min_agaw = g_iommus[i]->agaw;
3035
3036                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
3037         }
3038
3039         return min_agaw;
3040 }
3041
3042 static struct dmar_domain *iommu_alloc_vm_domain(void)
3043 {
3044         struct dmar_domain *domain;
3045
3046         domain = alloc_domain_mem();
3047         if (!domain)
3048                 return NULL;
3049
3050         domain->id = vm_domid++;
3051         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3052         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3053
3054         return domain;
3055 }
3056
3057 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
3058 {
3059         int adjust_width;
3060
3061         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3062         spin_lock_init(&domain->mapping_lock);
3063         spin_lock_init(&domain->iommu_lock);
3064
3065         domain_reserve_special_ranges(domain);
3066
3067         /* calculate AGAW */
3068         domain->gaw = guest_width;
3069         adjust_width = guestwidth_to_adjustwidth(guest_width);
3070         domain->agaw = width_to_agaw(adjust_width);
3071
3072         INIT_LIST_HEAD(&domain->devices);
3073
3074         domain->iommu_count = 0;
3075         domain->iommu_coherency = 0;
3076         domain->max_addr = 0;
3077
3078         /* always allocate the top pgd */
3079         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
3080         if (!domain->pgd)
3081                 return -ENOMEM;
3082         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3083         return 0;
3084 }
3085
3086 static void iommu_free_vm_domain(struct dmar_domain *domain)
3087 {
3088         unsigned long flags;
3089         struct dmar_drhd_unit *drhd;
3090         struct intel_iommu *iommu;
3091         unsigned long i;
3092         unsigned long ndomains;
3093
3094         for_each_drhd_unit(drhd) {
3095                 if (drhd->ignored)
3096                         continue;
3097                 iommu = drhd->iommu;
3098
3099                 ndomains = cap_ndoms(iommu->cap);
3100                 i = find_first_bit(iommu->domain_ids, ndomains);
3101                 for (; i < ndomains; ) {
3102                         if (iommu->domains[i] == domain) {
3103                                 spin_lock_irqsave(&iommu->lock, flags);
3104                                 clear_bit(i, iommu->domain_ids);
3105                                 iommu->domains[i] = NULL;
3106                                 spin_unlock_irqrestore(&iommu->lock, flags);
3107                                 break;
3108                         }
3109                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3110                 }
3111         }
3112 }
3113
3114 static void vm_domain_exit(struct dmar_domain *domain)
3115 {
3116         u64 end;
3117
3118         /* Domain 0 is reserved, so don't process it */
3119         if (!domain)
3120                 return;
3121
3122         vm_domain_remove_all_dev_info(domain);
3123         /* destroy iovas */
3124         put_iova_domain(&domain->iovad);
3125         end = DOMAIN_MAX_ADDR(domain->gaw);
3126         end = end & VTD_PAGE_MASK;
3127
3128         /* clear ptes */
3129         dma_pte_clear_range(domain, 0, end);
3130
3131         /* free page tables */
3132         dma_pte_free_pagetable(domain, 0, end);
3133
3134         iommu_free_vm_domain(domain);
3135         free_domain_mem(domain);
3136 }
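/*
 * Note the teardown order above: device/context state first
 * (vm_domain_remove_all_dev_info), then the iova allocator, then the leaf
 * PTEs, then the page-table pages themselves, and finally the per-IOMMU
 * domain ids before the domain structure is freed.
 */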
3137
3138 static int intel_iommu_domain_init(struct iommu_domain *domain)
3139 {
3140         struct dmar_domain *dmar_domain;
3141
3142         dmar_domain = iommu_alloc_vm_domain();
3143         if (!dmar_domain) {
3144                 printk(KERN_ERR
3145                         "intel_iommu_domain_init: failed to allocate dmar_domain\n");
3146                 return -ENOMEM;
3147         }
3148         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3149                 printk(KERN_ERR
3150                         "intel_iommu_domain_init: vm_domain_init failed\n");
3151                 vm_domain_exit(dmar_domain);
3152                 return -ENOMEM;
3153         }
3154         domain->priv = dmar_domain;
3155
3156         return 0;
3157 }
3158
3159 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3160 {
3161         struct dmar_domain *dmar_domain = domain->priv;
3162
3163         domain->priv = NULL;
3164         vm_domain_exit(dmar_domain);
3165 }
3166
3167 static int intel_iommu_attach_device(struct iommu_domain *domain,
3168                                      struct device *dev)
3169 {
3170         struct dmar_domain *dmar_domain = domain->priv;
3171         struct pci_dev *pdev = to_pci_dev(dev);
3172         struct intel_iommu *iommu;
3173         int addr_width;
3174         u64 end;
3175         int ret;
3176
3177         /* normally pdev is not yet context-mapped to any domain */
3178         if (unlikely(domain_context_mapped(pdev))) {
3179                 struct dmar_domain *old_domain;
3180
3181                 old_domain = find_domain(pdev);
3182                 if (old_domain) {
3183                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3184                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3185                         else
3186                                 domain_remove_dev_info(old_domain);
3187                 }
3188         }
3189
3190         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3191                                 pdev->devfn);
3192         if (!iommu)
3193                 return -ENODEV;
3194
3195         /* check if this iommu agaw is sufficient for max mapped address */
3196         addr_width = agaw_to_width(iommu->agaw);
3197         end = DOMAIN_MAX_ADDR(addr_width);
3198         end = end & VTD_PAGE_MASK;
3199         if (end < dmar_domain->max_addr) {
3200                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3201                        "sufficient for the mapped address (%llx)\n",
3202                        __func__, iommu->agaw, dmar_domain->max_addr);
3203                 return -EFAULT;
3204         }
3205
3206         ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3207         if (ret)
3208                 return ret;
3209
3210         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3211         return ret;
3212 }
3213
3214 static void intel_iommu_detach_device(struct iommu_domain *domain,
3215                                       struct device *dev)
3216 {
3217         struct dmar_domain *dmar_domain = domain->priv;
3218         struct pci_dev *pdev = to_pci_dev(dev);
3219
3220         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3221 }
3222
3223 static int intel_iommu_map_range(struct iommu_domain *domain,
3224                                  unsigned long iova, phys_addr_t hpa,
3225                                  size_t size, int iommu_prot)
3226 {
3227         struct dmar_domain *dmar_domain = domain->priv;
3228         u64 max_addr;
3229         int addr_width;
3230         int prot = 0;
3231         int ret;
3232
3233         if (iommu_prot & IOMMU_READ)
3234                 prot |= DMA_PTE_READ;
3235         if (iommu_prot & IOMMU_WRITE)
3236                 prot |= DMA_PTE_WRITE;
3237         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3238                 prot |= DMA_PTE_SNP;
3239
3240         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3241         if (dmar_domain->max_addr < max_addr) {
3242                 int min_agaw;
3243                 u64 end;
3244
3245                 /* check if minimum agaw is sufficient for mapped address */
3246                 min_agaw = vm_domain_min_agaw(dmar_domain);
3247                 addr_width = agaw_to_width(min_agaw);
3248                 end = DOMAIN_MAX_ADDR(addr_width);
3249                 end = end & VTD_PAGE_MASK;
3250                 if (end < max_addr) {
3251                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3252                                "sufficient for the mapped address (%llx)\n",
3253                                __func__, min_agaw, max_addr);
3254                         return -EFAULT;
3255                 }
3256                 dmar_domain->max_addr = max_addr;
3257         }
3258
3259         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3260         return ret;
3261 }
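/*
 * Worked example (illustrative): a request to map size 0x1800 at iova
 * 0x10000000 with IOMMU_READ|IOMMU_WRITE becomes DMA_PTE_READ|DMA_PTE_WRITE
 * on the PTEs and, assuming VTD_PAGE_ALIGN() rounds up to the 4KiB VT-d page
 * size, grows dmar_domain->max_addr to 0x10002000.  DMA_PTE_SNP is added only
 * when the caller asks for IOMMU_CACHE *and* dmar_domain->iommu_snooping is
 * set (typically meaning every IOMMU backing the domain supports snoop
 * control).
 */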
3262
3263 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3264                                     unsigned long iova, size_t size)
3265 {
3266         struct dmar_domain *dmar_domain = domain->priv;
3267         dma_addr_t base;
3268
3269         /* The address might not be aligned */
3270         base = iova & VTD_PAGE_MASK;
3271         size = VTD_PAGE_ALIGN(size);
3272         dma_pte_clear_range(dmar_domain, base, base + size);
3273
3274         if (dmar_domain->max_addr == base + size)
3275                 dmar_domain->max_addr = base;
3276 }
3277
3278 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3279                                             unsigned long iova)
3280 {
3281         struct dmar_domain *dmar_domain = domain->priv;
3282         struct dma_pte *pte;
3283         u64 phys = 0;
3284
3285         pte = addr_to_dma_pte(dmar_domain, iova);
3286         if (pte)
3287                 phys = dma_pte_addr(pte);
3288
3289         return phys;
3290 }
3291
3292 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3293                                       unsigned long cap)
3294 {
3295         struct dmar_domain *dmar_domain = domain->priv;
3296
3297         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3298                 return dmar_domain->iommu_snooping;
3299
3300         return 0;
3301 }
3302
3303 static struct iommu_ops intel_iommu_ops = {
3304         .domain_init    = intel_iommu_domain_init,
3305         .domain_destroy = intel_iommu_domain_destroy,
3306         .attach_dev     = intel_iommu_attach_device,
3307         .detach_dev     = intel_iommu_detach_device,
3308         .map            = intel_iommu_map_range,
3309         .unmap          = intel_iommu_unmap_range,
3310         .iova_to_phys   = intel_iommu_iova_to_phys,
3311         .domain_has_cap = intel_iommu_domain_has_cap,
3312 };
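/*
 * Minimal usage sketch, kept under #if 0 and not compiled.  It assumes the
 * generic layer in <linux/iommu.h> of this kernel exports iommu_domain_alloc(),
 * iommu_attach_device(), iommu_domain_has_cap(), iommu_map_range(),
 * iommu_iova_to_phys(), iommu_detach_device() and iommu_domain_free(), which
 * dispatch to the callbacks above once register_iommu(&intel_iommu_ops) has
 * run.  example_assign_device() is hypothetical; KVM device assignment follows
 * essentially this pattern.
 */
#if 0
static int example_assign_device(struct pci_dev *pdev, unsigned long iova,
				 phys_addr_t hpa)
{
	struct iommu_domain *dom;
	int prot = IOMMU_READ | IOMMU_WRITE;
	int ret;

	/* iova and hpa are assumed to be VT-d page aligned */
	dom = iommu_domain_alloc();		/* -> intel_iommu_domain_init() */
	if (!dom)
		return -ENOMEM;

	ret = iommu_attach_device(dom, &pdev->dev); /* -> intel_iommu_attach_device() */
	if (ret)
		goto out_free;

	/* only ask for a snooped mapping if the hardware can honour it */
	if (iommu_domain_has_cap(dom, IOMMU_CAP_CACHE_COHERENCY))
		prot |= IOMMU_CACHE;

	ret = iommu_map_range(dom, iova, hpa, VTD_PAGE_SIZE, prot); /* -> intel_iommu_map_range() */
	if (ret)
		goto out_detach;

	/* the translation should now resolve back to the host physical page */
	WARN_ON(iommu_iova_to_phys(dom, iova) != hpa);
	return 0;

out_detach:
	iommu_detach_device(dom, &pdev->dev);
out_free:
	iommu_domain_free(dom);
	return ret;
}
#endif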
3313
3314 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3315 {
3316         /*
3317          * Mobile 4 Series Chipset neglects to set RWBF capability,
3318          * but needs it:
3319          */
3320         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3321         rwbf_quirk = 1;
3322 }
3323
3324 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
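/*
 * The fixup above only sets rwbf_quirk; the flag is presumably consulted
 * alongside the cap_rwbf() capability test wherever this driver decides
 * whether an explicit write-buffer flush is required, roughly:
 *
 *	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 *		return;		(no write-buffer flush needed)
 */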