VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps
drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <asm/cacheflush.h>
41 #include <asm/iommu.h>
42 #include "pci.h"
43
44 #define ROOT_SIZE               VTD_PAGE_SIZE
45 #define CONTEXT_SIZE            VTD_PAGE_SIZE
46
47 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
48 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
49
50 #define IOAPIC_RANGE_START      (0xfee00000)
51 #define IOAPIC_RANGE_END        (0xfeefffff)
52 #define IOVA_START_ADDR         (0x1000)
53
54 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
55
56 #define MAX_AGAW_WIDTH 64
57
58 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
59
60 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
61 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
62 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
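
/*
 * e.g. with 4KiB pages (PAGE_SHIFT == 12):
 *      IOVA_PFN(DMA_BIT_MASK(32)) == 0xfffff
 * so DMA_32BIT_PFN names the last page frame below 4GiB.
 */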
63
64 /* global iommu list, set NULL for ignored DMAR units */
65 static struct intel_iommu **g_iommus;
66
67 static int rwbf_quirk;
68
69 /*
70  * 0: Present
71  * 1-11: Reserved
72  * 12-63: Context Ptr (12 - (haw-1))
73  * 64-127: Reserved
74  */
75 struct root_entry {
76         u64     val;
77         u64     rsvd1;
78 };
79 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
80 static inline bool root_present(struct root_entry *root)
81 {
82         return (root->val & 1);
83 }
84 static inline void set_root_present(struct root_entry *root)
85 {
86         root->val |= 1;
87 }
88 static inline void set_root_value(struct root_entry *root, unsigned long value)
89 {
90         root->val |= value & VTD_PAGE_MASK;
91 }
92
93 static inline struct context_entry *
94 get_context_addr_from_root(struct root_entry *root)
95 {
96         return (struct context_entry *)
97                 (root_present(root)?phys_to_virt(
98                 root->val & VTD_PAGE_MASK) :
99                 NULL);
100 }
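
/*
 * e.g. for a context table at physical address 0x12345000 (an arbitrary
 * example address), set_root_value() stores the page-aligned address and
 * set_root_present() sets bit 0, giving root->val == 0x12345001.
 */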
101
102 /*
103  * low 64 bits:
104  * 0: present
105  * 1: fault processing disable
106  * 2-3: translation type
107  * 12-63: address space root
108  * high 64 bits:
109  * 0-2: address width
110  * 3-6: avail
111  * 8-23: domain id
112  */
113 struct context_entry {
114         u64 lo;
115         u64 hi;
116 };
117
118 static inline bool context_present(struct context_entry *context)
119 {
120         return (context->lo & 1);
121 }
122 static inline void context_set_present(struct context_entry *context)
123 {
124         context->lo |= 1;
125 }
126
127 static inline void context_set_fault_enable(struct context_entry *context)
128 {
129         context->lo &= (((u64)-1) << 2) | 1;
130 }
131
132 static inline void context_set_translation_type(struct context_entry *context,
133                                                 unsigned long value)
134 {
135         context->lo &= (((u64)-1) << 4) | 3;
136         context->lo |= (value & 3) << 2;
137 }
138
139 static inline void context_set_address_root(struct context_entry *context,
140                                             unsigned long value)
141 {
142         context->lo |= value & VTD_PAGE_MASK;
143 }
144
145 static inline void context_set_address_width(struct context_entry *context,
146                                              unsigned long value)
147 {
148         context->hi |= value & 7;
149 }
150
151 static inline void context_set_domain_id(struct context_entry *context,
152                                          unsigned long value)
153 {
154         context->hi |= (value & ((1 << 16) - 1)) << 8;
155 }
156
157 static inline void context_clear_entry(struct context_entry *context)
158 {
159         context->lo = 0;
160         context->hi = 0;
161 }
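
/*
 * e.g. context_set_address_width(c, 2) sets bits 0-2 of c->hi to 2 and
 * context_set_domain_id(c, 5) sets bits 8-23 to 5, so c->hi == 0x502
 * (the address root and present bit live in c->lo).
 */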
162
163 /*
164  * 0: readable
165  * 1: writable
166  * 2-6: reserved
167  * 7: super page
168  * 8-10: available
169  * 11: snoop behavior
170  * 12-63: Host physical address
171  */
172 struct dma_pte {
173         u64 val;
174 };
175
176 static inline void dma_clear_pte(struct dma_pte *pte)
177 {
178         pte->val = 0;
179 }
180
181 static inline void dma_set_pte_readable(struct dma_pte *pte)
182 {
183         pte->val |= DMA_PTE_READ;
184 }
185
186 static inline void dma_set_pte_writable(struct dma_pte *pte)
187 {
188         pte->val |= DMA_PTE_WRITE;
189 }
190
191 static inline void dma_set_pte_snp(struct dma_pte *pte)
192 {
193         pte->val |= DMA_PTE_SNP;
194 }
195
196 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
197 {
198         pte->val = (pte->val & ~3) | (prot & 3);
199 }
200
201 static inline u64 dma_pte_addr(struct dma_pte *pte)
202 {
203         return (pte->val & VTD_PAGE_MASK);
204 }
205
206 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
207 {
208         pte->val |= (addr & VTD_PAGE_MASK);
209 }
210
211 static inline bool dma_pte_present(struct dma_pte *pte)
212 {
213         return (pte->val & 3) != 0;
214 }
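
/*
 * e.g. a leaf PTE mapping the host page at 0x1000 with read, write and
 * snoop set (bits 0, 1 and 11 per the layout above) has
 * pte->val == 0x1000 | DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP == 0x1803.
 */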
215
216 /* devices under the same p2p bridge are owned by one domain */
217 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
218
219 /* domain represents a virtual machine; more than one device
220  * across iommus may be owned by one domain, e.g. a kvm guest.
221  */
222 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
223
224 struct dmar_domain {
225         int     id;                     /* domain id */
226         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
227
228         struct list_head devices;       /* all devices' list */
229         struct iova_domain iovad;       /* iova's that belong to this domain */
230
231         struct dma_pte  *pgd;           /* virtual address */
232         spinlock_t      mapping_lock;   /* page table lock */
233         int             gaw;            /* max guest address width */
234
235         /* adjusted guest address width, 0 is level 2 30-bit */
236         int             agaw;
237
238         int             flags;          /* flags to find out type of domain */
239
240         int             iommu_coherency;/* indicate coherency of iommu access */
241         int             iommu_snooping; /* indicate snooping control feature*/
242         int             iommu_count;    /* reference count of iommu */
243         spinlock_t      iommu_lock;     /* protect iommu set in domain */
244         u64             max_addr;       /* maximum mapped address */
245 };
246
247 /* PCI domain-device relationship */
248 struct device_domain_info {
249         struct list_head link;  /* link to domain siblings */
250         struct list_head global; /* link to global list */
251         int segment;            /* PCI domain */
252         u8 bus;                 /* PCI bus number */
253         u8 devfn;               /* PCI devfn number */
254         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
255         struct dmar_domain *domain; /* pointer to domain */
256 };
257
258 static void flush_unmaps_timeout(unsigned long data);
259
260 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
261
262 #define HIGH_WATER_MARK 250
263 struct deferred_flush_tables {
264         int next;
265         struct iova *iova[HIGH_WATER_MARK];
266         struct dmar_domain *domain[HIGH_WATER_MARK];
267 };
268
269 static struct deferred_flush_tables *deferred_flush;
270
271 /* number of intel_iommus; used to size g_iommus and bound iommu bitmap searches */
272 static int g_num_of_iommus;
273
274 static DEFINE_SPINLOCK(async_umap_flush_lock);
275 static LIST_HEAD(unmaps_to_do);
276
277 static int timer_on;
278 static long list_size;
279
280 static void domain_remove_dev_info(struct dmar_domain *domain);
281
282 #ifdef CONFIG_DMAR_DEFAULT_ON
283 int dmar_disabled = 0;
284 #else
285 int dmar_disabled = 1;
286 #endif /*CONFIG_DMAR_DEFAULT_ON*/
287
288 static int __initdata dmar_map_gfx = 1;
289 static int dmar_forcedac;
290 static int intel_iommu_strict;
291
292 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
293 static DEFINE_SPINLOCK(device_domain_lock);
294 static LIST_HEAD(device_domain_list);
295
296 static struct iommu_ops intel_iommu_ops;
297
298 static int __init intel_iommu_setup(char *str)
299 {
300         if (!str)
301                 return -EINVAL;
302         while (*str) {
303                 if (!strncmp(str, "on", 2)) {
304                         dmar_disabled = 0;
305                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
306                 } else if (!strncmp(str, "off", 3)) {
307                         dmar_disabled = 1;
308                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
309                 } else if (!strncmp(str, "igfx_off", 8)) {
310                         dmar_map_gfx = 0;
311                         printk(KERN_INFO
312                                 "Intel-IOMMU: disable GFX device mapping\n");
313                 } else if (!strncmp(str, "forcedac", 8)) {
314                         printk(KERN_INFO
315                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
316                         dmar_forcedac = 1;
317                 } else if (!strncmp(str, "strict", 6)) {
318                         printk(KERN_INFO
319                                 "Intel-IOMMU: disable batched IOTLB flush\n");
320                         intel_iommu_strict = 1;
321                 }
322
323                 str += strcspn(str, ",");
324                 while (*str == ',')
325                         str++;
326         }
327         return 0;
328 }
329 __setup("intel_iommu=", intel_iommu_setup);
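
/*
 * The options above are comma separated on the kernel command line, e.g.
 * "intel_iommu=on,igfx_off,strict" enables DMAR, skips mapping of the
 * graphics device and disables the batched (deferred) IOTLB flush path.
 */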
330
331 static struct kmem_cache *iommu_domain_cache;
332 static struct kmem_cache *iommu_devinfo_cache;
333 static struct kmem_cache *iommu_iova_cache;
334
335 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
336 {
337         unsigned int flags;
338         void *vaddr;
339
340         /* trying to avoid low memory issues */
341         flags = current->flags & PF_MEMALLOC;
342         current->flags |= PF_MEMALLOC;
343         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
344         current->flags &= (~PF_MEMALLOC | flags);
345         return vaddr;
346 }
347
348
349 static inline void *alloc_pgtable_page(void)
350 {
351         unsigned int flags;
352         void *vaddr;
353
354         /* trying to avoid low memory issues */
355         flags = current->flags & PF_MEMALLOC;
356         current->flags |= PF_MEMALLOC;
357         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
358         current->flags &= (~PF_MEMALLOC | flags);
359         return vaddr;
360 }
361
362 static inline void free_pgtable_page(void *vaddr)
363 {
364         free_page((unsigned long)vaddr);
365 }
366
367 static inline void *alloc_domain_mem(void)
368 {
369         return iommu_kmem_cache_alloc(iommu_domain_cache);
370 }
371
372 static void free_domain_mem(void *vaddr)
373 {
374         kmem_cache_free(iommu_domain_cache, vaddr);
375 }
376
377 static inline void * alloc_devinfo_mem(void)
378 {
379         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
380 }
381
382 static inline void free_devinfo_mem(void *vaddr)
383 {
384         kmem_cache_free(iommu_devinfo_cache, vaddr);
385 }
386
387 struct iova *alloc_iova_mem(void)
388 {
389         return iommu_kmem_cache_alloc(iommu_iova_cache);
390 }
391
392 void free_iova_mem(struct iova *iova)
393 {
394         kmem_cache_free(iommu_iova_cache, iova);
395 }
396
397
398 static inline int width_to_agaw(int width);
399
400 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
401 {
402         unsigned long sagaw;
403         int agaw = -1;
404
405         sagaw = cap_sagaw(iommu->cap);
406         for (agaw = width_to_agaw(max_gaw);
407              agaw >= 0; agaw--) {
408                 if (test_bit(agaw, &sagaw))
409                         break;
410         }
411
412         return agaw;
413 }
414
415 /*
416  * Calculate max SAGAW for each iommu.
417  */
418 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
419 {
420         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
421 }
422
423 /*
424  * Calculate agaw for each iommu.
425  * "SAGAW" may be different across iommus; use a default agaw, and fall
426  * back to a smaller supported agaw for iommus that don't support it.
427  */
428 int iommu_calculate_agaw(struct intel_iommu *iommu)
429 {
430         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
431 }
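
/*
 * e.g. DEFAULT_DOMAIN_ADDRESS_WIDTH (48) gives width_to_agaw(48) == 2; if
 * SAGAW does not advertise bit 2, the loop above falls back to the next
 * lower supported agaw, such as agaw 1 (39-bit, 3-level page tables).
 */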
432
433 /* in native case, each domain is related to only one iommu */
434 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
435 {
436         int iommu_id;
437
438         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
439
440         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
441         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
442                 return NULL;
443
444         return g_iommus[iommu_id];
445 }
446
447 static void domain_update_iommu_coherency(struct dmar_domain *domain)
448 {
449         int i;
450
451         domain->iommu_coherency = 1;
452
453         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
454         for (; i < g_num_of_iommus; ) {
455                 if (!ecap_coherent(g_iommus[i]->ecap)) {
456                         domain->iommu_coherency = 0;
457                         break;
458                 }
459                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
460         }
461 }
462
463 static void domain_update_iommu_snooping(struct dmar_domain *domain)
464 {
465         int i;
466
467         domain->iommu_snooping = 1;
468
469         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
470         for (; i < g_num_of_iommus; ) {
471                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
472                         domain->iommu_snooping = 0;
473                         break;
474                 }
475                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
476         }
477 }
478
479 /* Some capabilities may be different across iommus */
480 static void domain_update_iommu_cap(struct dmar_domain *domain)
481 {
482         domain_update_iommu_coherency(domain);
483         domain_update_iommu_snooping(domain);
484 }
485
486 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
487 {
488         struct dmar_drhd_unit *drhd = NULL;
489         int i;
490
491         for_each_drhd_unit(drhd) {
492                 if (drhd->ignored)
493                         continue;
494                 if (segment != drhd->segment)
495                         continue;
496
497                 for (i = 0; i < drhd->devices_cnt; i++) {
498                         if (drhd->devices[i] &&
499                             drhd->devices[i]->bus->number == bus &&
500                             drhd->devices[i]->devfn == devfn)
501                                 return drhd->iommu;
502                         if (drhd->devices[i] &&
503                             drhd->devices[i]->subordinate &&
504                             drhd->devices[i]->subordinate->number <= bus &&
505                             drhd->devices[i]->subordinate->subordinate >= bus)
506                                 return drhd->iommu;
507                 }
508
509                 if (drhd->include_all)
510                         return drhd->iommu;
511         }
512
513         return NULL;
514 }
515
516 static void domain_flush_cache(struct dmar_domain *domain,
517                                void *addr, int size)
518 {
519         if (!domain->iommu_coherency)
520                 clflush_cache_range(addr, size);
521 }
522
523 /* Gets context entry for a given bus and devfn */
524 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
525                 u8 bus, u8 devfn)
526 {
527         struct root_entry *root;
528         struct context_entry *context;
529         unsigned long phy_addr;
530         unsigned long flags;
531
532         spin_lock_irqsave(&iommu->lock, flags);
533         root = &iommu->root_entry[bus];
534         context = get_context_addr_from_root(root);
535         if (!context) {
536                 context = (struct context_entry *)alloc_pgtable_page();
537                 if (!context) {
538                         spin_unlock_irqrestore(&iommu->lock, flags);
539                         return NULL;
540                 }
541                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
542                 phy_addr = virt_to_phys((void *)context);
543                 set_root_value(root, phy_addr);
544                 set_root_present(root);
545                 __iommu_flush_cache(iommu, root, sizeof(*root));
546         }
547         spin_unlock_irqrestore(&iommu->lock, flags);
548         return &context[devfn];
549 }
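
/*
 * With 4KiB VT-d pages the root table holds ROOT_ENTRY_NR == 256 entries
 * (one per bus), and each context table allocated above is one page of
 * 256 16-byte context entries indexed by devfn.
 */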
550
551 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
552 {
553         struct root_entry *root;
554         struct context_entry *context;
555         int ret;
556         unsigned long flags;
557
558         spin_lock_irqsave(&iommu->lock, flags);
559         root = &iommu->root_entry[bus];
560         context = get_context_addr_from_root(root);
561         if (!context) {
562                 ret = 0;
563                 goto out;
564         }
565         ret = context_present(&context[devfn]);
566 out:
567         spin_unlock_irqrestore(&iommu->lock, flags);
568         return ret;
569 }
570
571 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
572 {
573         struct root_entry *root;
574         struct context_entry *context;
575         unsigned long flags;
576
577         spin_lock_irqsave(&iommu->lock, flags);
578         root = &iommu->root_entry[bus];
579         context = get_context_addr_from_root(root);
580         if (context) {
581                 context_clear_entry(&context[devfn]);
582                 __iommu_flush_cache(iommu, &context[devfn], \
583                         sizeof(*context));
584         }
585         spin_unlock_irqrestore(&iommu->lock, flags);
586 }
587
588 static void free_context_table(struct intel_iommu *iommu)
589 {
590         struct root_entry *root;
591         int i;
592         unsigned long flags;
593         struct context_entry *context;
594
595         spin_lock_irqsave(&iommu->lock, flags);
596         if (!iommu->root_entry) {
597                 goto out;
598         }
599         for (i = 0; i < ROOT_ENTRY_NR; i++) {
600                 root = &iommu->root_entry[i];
601                 context = get_context_addr_from_root(root);
602                 if (context)
603                         free_pgtable_page(context);
604         }
605         free_pgtable_page(iommu->root_entry);
606         iommu->root_entry = NULL;
607 out:
608         spin_unlock_irqrestore(&iommu->lock, flags);
609 }
610
611 /* page table handling */
612 #define LEVEL_STRIDE            (9)
613 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
614
615 static inline int agaw_to_level(int agaw)
616 {
617         return agaw + 2;
618 }
619
620 static inline int agaw_to_width(int agaw)
621 {
622         return 30 + agaw * LEVEL_STRIDE;
623
624 }
625
626 static inline int width_to_agaw(int width)
627 {
628         return (width - 30) / LEVEL_STRIDE;
629 }
630
631 static inline unsigned int level_to_offset_bits(int level)
632 {
633         return (12 + (level - 1) * LEVEL_STRIDE);
634 }
635
636 static inline int address_level_offset(u64 addr, int level)
637 {
638         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
639 }
640
641 static inline u64 level_mask(int level)
642 {
643         return ((u64)-1 << level_to_offset_bits(level));
644 }
645
646 static inline u64 level_size(int level)
647 {
648         return ((u64)1 << level_to_offset_bits(level));
649 }
650
651 static inline u64 align_to_level(u64 addr, int level)
652 {
653         return ((addr + level_size(level) - 1) & level_mask(level));
654 }
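
/*
 * e.g. level 1 covers address bits 12-20, level 2 bits 21-29, and so on in
 * 9-bit strides: level_size(2) == 1 << 21 (2MiB), and agaw 2 means a
 * 4-level, 48-bit page table (agaw_to_level(2) == 4, agaw_to_width(2) == 48).
 */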
655
656 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
657 {
658         int addr_width = agaw_to_width(domain->agaw);
659         struct dma_pte *parent, *pte = NULL;
660         int level = agaw_to_level(domain->agaw);
661         int offset;
662         unsigned long flags;
663
664         BUG_ON(!domain->pgd);
665
666         addr &= (((u64)1) << addr_width) - 1;
667         parent = domain->pgd;
668
669         spin_lock_irqsave(&domain->mapping_lock, flags);
670         while (level > 0) {
671                 void *tmp_page;
672
673                 offset = address_level_offset(addr, level);
674                 pte = &parent[offset];
675                 if (level == 1)
676                         break;
677
678                 if (!dma_pte_present(pte)) {
679                         tmp_page = alloc_pgtable_page();
680
681                         if (!tmp_page) {
682                                 spin_unlock_irqrestore(&domain->mapping_lock,
683                                         flags);
684                                 return NULL;
685                         }
686                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
687                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
688                         /*
689                          * high level table always sets r/w, last level page
690                          * table control read/write
691                          */
692                         dma_set_pte_readable(pte);
693                         dma_set_pte_writable(pte);
694                         domain_flush_cache(domain, pte, sizeof(*pte));
695                 }
696                 parent = phys_to_virt(dma_pte_addr(pte));
697                 level--;
698         }
699
700         spin_unlock_irqrestore(&domain->mapping_lock, flags);
701         return pte;
702 }
703
704 /* return address's pte at specific level */
705 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
706                 int level)
707 {
708         struct dma_pte *parent, *pte = NULL;
709         int total = agaw_to_level(domain->agaw);
710         int offset;
711
712         parent = domain->pgd;
713         while (level <= total) {
714                 offset = address_level_offset(addr, total);
715                 pte = &parent[offset];
716                 if (level == total)
717                         return pte;
718
719                 if (!dma_pte_present(pte))
720                         break;
721                 parent = phys_to_virt(dma_pte_addr(pte));
722                 total--;
723         }
724         return NULL;
725 }
726
727 /* clear one page's page table */
728 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
729 {
730         struct dma_pte *pte = NULL;
731
732         /* get last level pte */
733         pte = dma_addr_level_pte(domain, addr, 1);
734
735         if (pte) {
736                 dma_clear_pte(pte);
737                 domain_flush_cache(domain, pte, sizeof(*pte));
738         }
739 }
740
741 /* clear last level pte; a tlb flush should follow */
742 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
743 {
744         int addr_width = agaw_to_width(domain->agaw);
745         int npages;
746
747         start &= (((u64)1) << addr_width) - 1;
748         end &= (((u64)1) << addr_width) - 1;
749         /* in case it's a partial page */
750         start &= PAGE_MASK;
751         end = PAGE_ALIGN(end);
752         npages = (end - start) / VTD_PAGE_SIZE;
753
754         /* we don't need lock here, nobody else touches the iova range */
755         while (npages--) {
756                 dma_pte_clear_one(domain, start);
757                 start += VTD_PAGE_SIZE;
758         }
759 }
760
761 /* free page table pages. last level pte should already be cleared */
762 static void dma_pte_free_pagetable(struct dmar_domain *domain,
763         u64 start, u64 end)
764 {
765         int addr_width = agaw_to_width(domain->agaw);
766         struct dma_pte *pte;
767         int total = agaw_to_level(domain->agaw);
768         int level;
769         u64 tmp;
770
771         start &= (((u64)1) << addr_width) - 1;
772         end &= (((u64)1) << addr_width) - 1;
773
774         /* we don't need lock here, nobody else touches the iova range */
775         level = 2;
776         while (level <= total) {
777                 tmp = align_to_level(start, level);
778                 if (tmp >= end || (tmp + level_size(level) > end))
779                         return;
780
781                 while (tmp < end) {
782                         pte = dma_addr_level_pte(domain, tmp, level);
783                         if (pte) {
784                                 free_pgtable_page(
785                                         phys_to_virt(dma_pte_addr(pte)));
786                                 dma_clear_pte(pte);
787                                 domain_flush_cache(domain, pte, sizeof(*pte));
788                         }
789                         tmp += level_size(level);
790                 }
791                 level++;
792         }
793         /* free pgd */
794         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
795                 free_pgtable_page(domain->pgd);
796                 domain->pgd = NULL;
797         }
798 }
799
800 /* iommu handling */
801 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
802 {
803         struct root_entry *root;
804         unsigned long flags;
805
806         root = (struct root_entry *)alloc_pgtable_page();
807         if (!root)
808                 return -ENOMEM;
809
810         __iommu_flush_cache(iommu, root, ROOT_SIZE);
811
812         spin_lock_irqsave(&iommu->lock, flags);
813         iommu->root_entry = root;
814         spin_unlock_irqrestore(&iommu->lock, flags);
815
816         return 0;
817 }
818
819 static void iommu_set_root_entry(struct intel_iommu *iommu)
820 {
821         void *addr;
822         u32 sts;
823         unsigned long flag;
824
825         addr = iommu->root_entry;
826
827         spin_lock_irqsave(&iommu->register_lock, flag);
828         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
829
830         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
831
832         /* Make sure the hardware completes it */
833         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
834                       readl, (sts & DMA_GSTS_RTPS), sts);
835
836         spin_unlock_irqrestore(&iommu->register_lock, flag);
837 }
838
839 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
840 {
841         u32 val;
842         unsigned long flag;
843
844         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
845                 return;
846
847         spin_lock_irqsave(&iommu->register_lock, flag);
848         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
849
850         /* Make sure the hardware completes it */
851         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
852                       readl, (!(val & DMA_GSTS_WBFS)), val);
853
854         spin_unlock_irqrestore(&iommu->register_lock, flag);
855 }
856
857 /* invalidate the context cache; wait for hardware to complete the flush */
858 static void __iommu_flush_context(struct intel_iommu *iommu,
859                                   u16 did, u16 source_id, u8 function_mask,
860                                   u64 type)
861 {
862         u64 val = 0;
863         unsigned long flag;
864
865         switch (type) {
866         case DMA_CCMD_GLOBAL_INVL:
867                 val = DMA_CCMD_GLOBAL_INVL;
868                 break;
869         case DMA_CCMD_DOMAIN_INVL:
870                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
871                 break;
872         case DMA_CCMD_DEVICE_INVL:
873                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
874                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
875                 break;
876         default:
877                 BUG();
878         }
879         val |= DMA_CCMD_ICC;
880
881         spin_lock_irqsave(&iommu->register_lock, flag);
882         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
883
884         /* Make sure the hardware completes it */
885         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
886                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
887
888         spin_unlock_irqrestore(&iommu->register_lock, flag);
889 }
890
891 /* invalidate the IOTLB; wait for hardware to complete the flush */
892 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
893                                 u64 addr, unsigned int size_order, u64 type)
894 {
895         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
896         u64 val = 0, val_iva = 0;
897         unsigned long flag;
898
899         switch (type) {
900         case DMA_TLB_GLOBAL_FLUSH:
901                 /* global flush doesn't need to set IVA_REG */
902                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
903                 break;
904         case DMA_TLB_DSI_FLUSH:
905                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
906                 break;
907         case DMA_TLB_PSI_FLUSH:
908                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
909                 /* Note: always flush non-leaf currently */
910                 val_iva = size_order | addr;
911                 break;
912         default:
913                 BUG();
914         }
915         /* Note: set drain read/write */
916 #if 0
917         /*
918          * This is probably meant to be extra safe.  It looks like we can
919          * ignore it without any impact.
920          */
921         if (cap_read_drain(iommu->cap))
922                 val |= DMA_TLB_READ_DRAIN;
923 #endif
924         if (cap_write_drain(iommu->cap))
925                 val |= DMA_TLB_WRITE_DRAIN;
926
927         spin_lock_irqsave(&iommu->register_lock, flag);
928         /* Note: Only uses first TLB reg currently */
929         if (val_iva)
930                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
931         dmar_writeq(iommu->reg + tlb_offset + 8, val);
932
933         /* Make sure the hardware completes it */
934         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
935                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
936
937         spin_unlock_irqrestore(&iommu->register_lock, flag);
938
939         /* check IOTLB invalidation granularity */
940         if (DMA_TLB_IAIG(val) == 0)
941                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
942         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
943                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
944                         (unsigned long long)DMA_TLB_IIRG(type),
945                         (unsigned long long)DMA_TLB_IAIG(val));
946 }
947
948 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
949                                   u64 addr, unsigned int pages)
950 {
951         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
952
953         BUG_ON(addr & (~VTD_PAGE_MASK));
954         BUG_ON(pages == 0);
955
956         /*
957          * Fall back to a domain-selective flush if there is no PSI support
958          * or the size is too big.
959          * PSI requires the size to be a power-of-two number of pages, and the
960          * base address to be naturally aligned to that size.
961          */
962         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
963                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
964                                                 DMA_TLB_DSI_FLUSH);
965         else
966                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
967                                                 DMA_TLB_PSI_FLUSH);
968 }
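
/*
 * e.g. flushing 5 pages rounds up to 8, so mask == 3 and hardware
 * invalidates a naturally aligned 8-page (32KiB with 4KiB pages) region;
 * if mask exceeds cap_max_amask_val() the code above falls back to a
 * domain-selective flush.
 */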
969
970 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
971 {
972         u32 pmen;
973         unsigned long flags;
974
975         spin_lock_irqsave(&iommu->register_lock, flags);
976         pmen = readl(iommu->reg + DMAR_PMEN_REG);
977         pmen &= ~DMA_PMEN_EPM;
978         writel(pmen, iommu->reg + DMAR_PMEN_REG);
979
980         /* wait for the protected region status bit to clear */
981         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
982                 readl, !(pmen & DMA_PMEN_PRS), pmen);
983
984         spin_unlock_irqrestore(&iommu->register_lock, flags);
985 }
986
987 static int iommu_enable_translation(struct intel_iommu *iommu)
988 {
989         u32 sts;
990         unsigned long flags;
991
992         spin_lock_irqsave(&iommu->register_lock, flags);
993         iommu->gcmd |= DMA_GCMD_TE;
994         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
995
996         /* Make sure the hardware completes it */
997         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
998                       readl, (sts & DMA_GSTS_TES), sts);
999
1000         spin_unlock_irqrestore(&iommu->register_lock, flags);
1001         return 0;
1002 }
1003
1004 static int iommu_disable_translation(struct intel_iommu *iommu)
1005 {
1006         u32 sts;
1007         unsigned long flag;
1008
1009         spin_lock_irqsave(&iommu->register_lock, flag);
1010         iommu->gcmd &= ~DMA_GCMD_TE;
1011         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1012
1013         /* Make sure the hardware completes it */
1014         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1015                       readl, (!(sts & DMA_GSTS_TES)), sts);
1016
1017         spin_unlock_irqrestore(&iommu->register_lock, flag);
1018         return 0;
1019 }
1020
1021
1022 static int iommu_init_domains(struct intel_iommu *iommu)
1023 {
1024         unsigned long ndomains;
1025         unsigned long nlongs;
1026
1027         ndomains = cap_ndoms(iommu->cap);
1028         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1029         nlongs = BITS_TO_LONGS(ndomains);
1030
1031         /* TBD: there might be 64K domains,
1032          * consider a different allocation scheme for future chips
1033          */
1034         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1035         if (!iommu->domain_ids) {
1036                 printk(KERN_ERR "Allocating domain id array failed\n");
1037                 return -ENOMEM;
1038         }
1039         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1040                         GFP_KERNEL);
1041         if (!iommu->domains) {
1042                 printk(KERN_ERR "Allocating domain array failed\n");
1043                 kfree(iommu->domain_ids);
1044                 return -ENOMEM;
1045         }
1046
1047         spin_lock_init(&iommu->lock);
1048
1049         /*
1050          * If Caching mode is set, then invalid translations are tagged
1051          * with domain id 0. Hence we need to pre-allocate it.
1052          */
1053         if (cap_caching_mode(iommu->cap))
1054                 set_bit(0, iommu->domain_ids);
1055         return 0;
1056 }
1057
1058
1059 static void domain_exit(struct dmar_domain *domain);
1060 static void vm_domain_exit(struct dmar_domain *domain);
1061
1062 void free_dmar_iommu(struct intel_iommu *iommu)
1063 {
1064         struct dmar_domain *domain;
1065         int i;
1066         unsigned long flags;
1067
1068         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1069         for (; i < cap_ndoms(iommu->cap); ) {
1070                 domain = iommu->domains[i];
1071                 clear_bit(i, iommu->domain_ids);
1072
1073                 spin_lock_irqsave(&domain->iommu_lock, flags);
1074                 if (--domain->iommu_count == 0) {
1075                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1076                                 vm_domain_exit(domain);
1077                         else
1078                                 domain_exit(domain);
1079                 }
1080                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1081
1082                 i = find_next_bit(iommu->domain_ids,
1083                         cap_ndoms(iommu->cap), i+1);
1084         }
1085
1086         if (iommu->gcmd & DMA_GCMD_TE)
1087                 iommu_disable_translation(iommu);
1088
1089         if (iommu->irq) {
1090                 set_irq_data(iommu->irq, NULL);
1091                 /* This will mask the irq */
1092                 free_irq(iommu->irq, iommu);
1093                 destroy_irq(iommu->irq);
1094         }
1095
1096         kfree(iommu->domains);
1097         kfree(iommu->domain_ids);
1098
1099         g_iommus[iommu->seq_id] = NULL;
1100
1101         /* if all iommus are freed, free g_iommus */
1102         for (i = 0; i < g_num_of_iommus; i++) {
1103                 if (g_iommus[i])
1104                         break;
1105         }
1106
1107         if (i == g_num_of_iommus)
1108                 kfree(g_iommus);
1109
1110         /* free context mapping */
1111         free_context_table(iommu);
1112 }
1113
1114 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1115 {
1116         unsigned long num;
1117         unsigned long ndomains;
1118         struct dmar_domain *domain;
1119         unsigned long flags;
1120
1121         domain = alloc_domain_mem();
1122         if (!domain)
1123                 return NULL;
1124
1125         ndomains = cap_ndoms(iommu->cap);
1126
1127         spin_lock_irqsave(&iommu->lock, flags);
1128         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1129         if (num >= ndomains) {
1130                 spin_unlock_irqrestore(&iommu->lock, flags);
1131                 free_domain_mem(domain);
1132                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1133                 return NULL;
1134         }
1135
1136         set_bit(num, iommu->domain_ids);
1137         domain->id = num;
1138         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1139         set_bit(iommu->seq_id, &domain->iommu_bmp);
1140         domain->flags = 0;
1141         iommu->domains[num] = domain;
1142         spin_unlock_irqrestore(&iommu->lock, flags);
1143
1144         return domain;
1145 }
1146
1147 static void iommu_free_domain(struct dmar_domain *domain)
1148 {
1149         unsigned long flags;
1150         struct intel_iommu *iommu;
1151
1152         iommu = domain_get_iommu(domain);
1153
1154         spin_lock_irqsave(&iommu->lock, flags);
1155         clear_bit(domain->id, iommu->domain_ids);
1156         spin_unlock_irqrestore(&iommu->lock, flags);
1157 }
1158
1159 static struct iova_domain reserved_iova_list;
1160 static struct lock_class_key reserved_alloc_key;
1161 static struct lock_class_key reserved_rbtree_key;
1162
1163 static void dmar_init_reserved_ranges(void)
1164 {
1165         struct pci_dev *pdev = NULL;
1166         struct iova *iova;
1167         int i;
1168         u64 addr, size;
1169
1170         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1171
1172         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1173                 &reserved_alloc_key);
1174         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1175                 &reserved_rbtree_key);
1176
1177         /* IOAPIC ranges shouldn't be accessed by DMA */
1178         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1179                 IOVA_PFN(IOAPIC_RANGE_END));
1180         if (!iova)
1181                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1182
1183         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1184         for_each_pci_dev(pdev) {
1185                 struct resource *r;
1186
1187                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1188                         r = &pdev->resource[i];
1189                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1190                                 continue;
1191                         addr = r->start;
1192                         addr &= PAGE_MASK;
1193                         size = r->end - addr;
1194                         size = PAGE_ALIGN(size);
1195                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1196                                 IOVA_PFN(size + addr) - 1);
1197                         if (!iova)
1198                                 printk(KERN_ERR "Reserve iova failed\n");
1199                 }
1200         }
1201
1202 }
1203
1204 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1205 {
1206         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1207 }
1208
1209 static inline int guestwidth_to_adjustwidth(int gaw)
1210 {
1211         int agaw;
1212         int r = (gaw - 12) % 9;
1213
1214         if (r == 0)
1215                 agaw = gaw;
1216         else
1217                 agaw = gaw + 9 - r;
1218         if (agaw > 64)
1219                 agaw = 64;
1220         return agaw;
1221 }
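
/*
 * e.g. a guest width of 36 bits gives r == (36 - 12) % 9 == 6, so the
 * adjusted width is rounded up to 39; widths of 39 and 48 bits already
 * satisfy (gaw - 12) % 9 == 0 and are returned unchanged.
 */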
1222
1223 static int domain_init(struct dmar_domain *domain, int guest_width)
1224 {
1225         struct intel_iommu *iommu;
1226         int adjust_width, agaw;
1227         unsigned long sagaw;
1228
1229         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1230         spin_lock_init(&domain->mapping_lock);
1231         spin_lock_init(&domain->iommu_lock);
1232
1233         domain_reserve_special_ranges(domain);
1234
1235         /* calculate AGAW */
1236         iommu = domain_get_iommu(domain);
1237         if (guest_width > cap_mgaw(iommu->cap))
1238                 guest_width = cap_mgaw(iommu->cap);
1239         domain->gaw = guest_width;
1240         adjust_width = guestwidth_to_adjustwidth(guest_width);
1241         agaw = width_to_agaw(adjust_width);
1242         sagaw = cap_sagaw(iommu->cap);
1243         if (!test_bit(agaw, &sagaw)) {
1244                 /* hardware doesn't support it, choose a bigger one */
1245                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1246                 agaw = find_next_bit(&sagaw, 5, agaw);
1247                 if (agaw >= 5)
1248                         return -ENODEV;
1249         }
1250         domain->agaw = agaw;
1251         INIT_LIST_HEAD(&domain->devices);
1252
1253         if (ecap_coherent(iommu->ecap))
1254                 domain->iommu_coherency = 1;
1255         else
1256                 domain->iommu_coherency = 0;
1257
1258         if (ecap_sc_support(iommu->ecap))
1259                 domain->iommu_snooping = 1;
1260         else
1261                 domain->iommu_snooping = 0;
1262
1263         domain->iommu_count = 1;
1264
1265         /* always allocate the top pgd */
1266         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1267         if (!domain->pgd)
1268                 return -ENOMEM;
1269         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1270         return 0;
1271 }
1272
1273 static void domain_exit(struct dmar_domain *domain)
1274 {
1275         u64 end;
1276
1277         /* Domain 0 is reserved, so don't process it */
1278         if (!domain)
1279                 return;
1280
1281         domain_remove_dev_info(domain);
1282         /* destroy iovas */
1283         put_iova_domain(&domain->iovad);
1284         end = DOMAIN_MAX_ADDR(domain->gaw);
1285         end = end & (~PAGE_MASK);
1286
1287         /* clear ptes */
1288         dma_pte_clear_range(domain, 0, end);
1289
1290         /* free page tables */
1291         dma_pte_free_pagetable(domain, 0, end);
1292
1293         iommu_free_domain(domain);
1294         free_domain_mem(domain);
1295 }
1296
1297 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1298                                  u8 bus, u8 devfn, int translation)
1299 {
1300         struct context_entry *context;
1301         unsigned long flags;
1302         struct intel_iommu *iommu;
1303         struct dma_pte *pgd;
1304         unsigned long num;
1305         unsigned long ndomains;
1306         int id;
1307         int agaw;
1308
1309         pr_debug("Set context mapping for %02x:%02x.%d\n",
1310                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1311
1312         BUG_ON(!domain->pgd);
1313         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1314                translation != CONTEXT_TT_MULTI_LEVEL);
1315
1316         iommu = device_to_iommu(segment, bus, devfn);
1317         if (!iommu)
1318                 return -ENODEV;
1319
1320         context = device_to_context_entry(iommu, bus, devfn);
1321         if (!context)
1322                 return -ENOMEM;
1323         spin_lock_irqsave(&iommu->lock, flags);
1324         if (context_present(context)) {
1325                 spin_unlock_irqrestore(&iommu->lock, flags);
1326                 return 0;
1327         }
1328
1329         id = domain->id;
1330         pgd = domain->pgd;
1331
1332         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1333                 int found = 0;
1334
1335                 /* find an available domain id for this device in iommu */
1336                 ndomains = cap_ndoms(iommu->cap);
1337                 num = find_first_bit(iommu->domain_ids, ndomains);
1338                 for (; num < ndomains; ) {
1339                         if (iommu->domains[num] == domain) {
1340                                 id = num;
1341                                 found = 1;
1342                                 break;
1343                         }
1344                         num = find_next_bit(iommu->domain_ids,
1345                                             cap_ndoms(iommu->cap), num+1);
1346                 }
1347
1348                 if (found == 0) {
1349                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1350                         if (num >= ndomains) {
1351                                 spin_unlock_irqrestore(&iommu->lock, flags);
1352                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1353                                 return -EFAULT;
1354                         }
1355
1356                         set_bit(num, iommu->domain_ids);
1357                         iommu->domains[num] = domain;
1358                         id = num;
1359                 }
1360
1361                 /* Skip top levels of the page table for an iommu
1362                  * that has a smaller agaw than the default.
1363                  */
1364                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1365                         pgd = phys_to_virt(dma_pte_addr(pgd));
1366                         if (!dma_pte_present(pgd)) {
1367                                 spin_unlock_irqrestore(&iommu->lock, flags);
1368                                 return -ENOMEM;
1369                         }
1370                 }
1371         }
1372
1373         context_set_domain_id(context, id);
1374
1375         /*
1376          * In pass through mode, AW must be programmed to indicate the largest
1377          * AGAW value supported by hardware. And ASR is ignored by hardware.
1378          */
1379         if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) {
1380                 context_set_address_width(context, iommu->agaw);
1381                 context_set_address_root(context, virt_to_phys(pgd));
1382         } else
1383                 context_set_address_width(context, iommu->msagaw);
1384
1385         context_set_translation_type(context, translation);
1386         context_set_fault_enable(context);
1387         context_set_present(context);
1388         domain_flush_cache(domain, context, sizeof(*context));
1389
1390         /*
1391          * It's a non-present to present mapping. If hardware doesn't cache
1392          * non-present entries we only need to flush the write-buffer. If it
1393          * _does_ cache non-present entries, then it does so in the special
1394          * domain #0, which we have to flush:
1395          */
1396         if (cap_caching_mode(iommu->cap)) {
1397                 iommu->flush.flush_context(iommu, 0,
1398                                            (((u16)bus) << 8) | devfn,
1399                                            DMA_CCMD_MASK_NOBIT,
1400                                            DMA_CCMD_DEVICE_INVL);
1401                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
1402         } else {
1403                 iommu_flush_write_buffer(iommu);
1404         }
1405         spin_unlock_irqrestore(&iommu->lock, flags);
1406
1407         spin_lock_irqsave(&domain->iommu_lock, flags);
1408         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1409                 domain->iommu_count++;
1410                 domain_update_iommu_cap(domain);
1411         }
1412         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1413         return 0;
1414 }
1415
1416 static int
1417 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1418                         int translation)
1419 {
1420         int ret;
1421         struct pci_dev *tmp, *parent;
1422
1423         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1424                                          pdev->bus->number, pdev->devfn,
1425                                          translation);
1426         if (ret)
1427                 return ret;
1428
1429         /* dependent device mapping */
1430         tmp = pci_find_upstream_pcie_bridge(pdev);
1431         if (!tmp)
1432                 return 0;
1433         /* Secondary interface's bus number and devfn 0 */
1434         parent = pdev->bus->self;
1435         while (parent != tmp) {
1436                 ret = domain_context_mapping_one(domain,
1437                                                  pci_domain_nr(parent->bus),
1438                                                  parent->bus->number,
1439                                                  parent->devfn, translation);
1440                 if (ret)
1441                         return ret;
1442                 parent = parent->bus->self;
1443         }
1444         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1445                 return domain_context_mapping_one(domain,
1446                                         pci_domain_nr(tmp->subordinate),
1447                                         tmp->subordinate->number, 0,
1448                                         translation);
1449         else /* this is a legacy PCI bridge */
1450                 return domain_context_mapping_one(domain,
1451                                                   pci_domain_nr(tmp->bus),
1452                                                   tmp->bus->number,
1453                                                   tmp->devfn,
1454                                                   translation);
1455 }
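
/*
 * e.g. for a device behind a PCIe-to-PCI bridge, the code above programs a
 * context entry for the device itself, for every intermediate bridge up to
 * the upstream PCIe bridge, and finally for that bridge's secondary bus at
 * devfn 0 (or for the bridge itself if it is a legacy PCI bridge).
 */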
1456
1457 static int domain_context_mapped(struct pci_dev *pdev)
1458 {
1459         int ret;
1460         struct pci_dev *tmp, *parent;
1461         struct intel_iommu *iommu;
1462
1463         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1464                                 pdev->devfn);
1465         if (!iommu)
1466                 return -ENODEV;
1467
1468         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1469         if (!ret)
1470                 return ret;
1471         /* dependent device mapping */
1472         tmp = pci_find_upstream_pcie_bridge(pdev);
1473         if (!tmp)
1474                 return ret;
1475         /* Secondary interface's bus number and devfn 0 */
1476         parent = pdev->bus->self;
1477         while (parent != tmp) {
1478                 ret = device_context_mapped(iommu, parent->bus->number,
1479                                             parent->devfn);
1480                 if (!ret)
1481                         return ret;
1482                 parent = parent->bus->self;
1483         }
1484         if (tmp->is_pcie)
1485                 return device_context_mapped(iommu, tmp->subordinate->number,
1486                                              0);
1487         else
1488                 return device_context_mapped(iommu, tmp->bus->number,
1489                                              tmp->devfn);
1490 }
1491
1492 static int
1493 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1494                         u64 hpa, size_t size, int prot)
1495 {
1496         u64 start_pfn, end_pfn;
1497         struct dma_pte *pte;
1498         int index;
1499         int addr_width = agaw_to_width(domain->agaw);
1500
1501         hpa &= (((u64)1) << addr_width) - 1;
1502
1503         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1504                 return -EINVAL;
1505         iova &= PAGE_MASK;
1506         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1507         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1508         index = 0;
1509         while (start_pfn < end_pfn) {
1510                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1511                 if (!pte)
1512                         return -ENOMEM;
1513                 /* We don't need lock here, nobody else
1514                  * touches the iova range
1515                  */
1516                 BUG_ON(dma_pte_addr(pte));
1517                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1518                 dma_set_pte_prot(pte, prot);
1519                 if (prot & DMA_PTE_SNP)
1520                         dma_set_pte_snp(pte);
1521                 domain_flush_cache(domain, pte, sizeof(*pte));
1522                 start_pfn++;
1523                 index++;
1524         }
1525         return 0;
1526 }
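
/*
 * e.g. mapping hpa == 0x1000 with size == 0x3000 (arbitrary example values)
 * gives start_pfn == 1 and end_pfn == 4, so three PTEs are written for
 * iova, iova + 0x1000 and iova + 0x2000.
 */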
1527
1528 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1529 {
1530         if (!iommu)
1531                 return;
1532
1533         clear_context_table(iommu, bus, devfn);
1534         iommu->flush.flush_context(iommu, 0, 0, 0,
1535                                            DMA_CCMD_GLOBAL_INVL);
1536         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1537 }
1538
1539 static void domain_remove_dev_info(struct dmar_domain *domain)
1540 {
1541         struct device_domain_info *info;
1542         unsigned long flags;
1543         struct intel_iommu *iommu;
1544
1545         spin_lock_irqsave(&device_domain_lock, flags);
1546         while (!list_empty(&domain->devices)) {
1547                 info = list_entry(domain->devices.next,
1548                         struct device_domain_info, link);
1549                 list_del(&info->link);
1550                 list_del(&info->global);
1551                 if (info->dev)
1552                         info->dev->dev.archdata.iommu = NULL;
1553                 spin_unlock_irqrestore(&device_domain_lock, flags);
1554
1555                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1556                 iommu_detach_dev(iommu, info->bus, info->devfn);
1557                 free_devinfo_mem(info);
1558
1559                 spin_lock_irqsave(&device_domain_lock, flags);
1560         }
1561         spin_unlock_irqrestore(&device_domain_lock, flags);
1562 }
1563
1564 /*
1565  * find_domain
1566  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1567  */
1568 static struct dmar_domain *
1569 find_domain(struct pci_dev *pdev)
1570 {
1571         struct device_domain_info *info;
1572
1573         /* No lock here, assumes no domain exit in normal case */
1574         info = pdev->dev.archdata.iommu;
1575         if (info)
1576                 return info->domain;
1577         return NULL;
1578 }
1579
1580 /* domain is initialized */
1581 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1582 {
1583         struct dmar_domain *domain, *found = NULL;
1584         struct intel_iommu *iommu;
1585         struct dmar_drhd_unit *drhd;
1586         struct device_domain_info *info, *tmp;
1587         struct pci_dev *dev_tmp;
1588         unsigned long flags;
1589         int bus = 0, devfn = 0;
1590         int segment;
1591
1592         domain = find_domain(pdev);
1593         if (domain)
1594                 return domain;
1595
1596         segment = pci_domain_nr(pdev->bus);
1597
1598         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1599         if (dev_tmp) {
1600                 if (dev_tmp->is_pcie) {
1601                         bus = dev_tmp->subordinate->number;
1602                         devfn = 0;
1603                 } else {
1604                         bus = dev_tmp->bus->number;
1605                         devfn = dev_tmp->devfn;
1606                 }
1607                 spin_lock_irqsave(&device_domain_lock, flags);
1608                 list_for_each_entry(info, &device_domain_list, global) {
1609                         if (info->segment == segment &&
1610                             info->bus == bus && info->devfn == devfn) {
1611                                 found = info->domain;
1612                                 break;
1613                         }
1614                 }
1615                 spin_unlock_irqrestore(&device_domain_lock, flags);
1616                 /* the PCIe-to-PCI bridge already has a domain, use it */
1617                 if (found) {
1618                         domain = found;
1619                         goto found_domain;
1620                 }
1621         }
1622
1623         /* Allocate new domain for the device */
1624         drhd = dmar_find_matched_drhd_unit(pdev);
1625         if (!drhd) {
1626                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1627                         pci_name(pdev));
1628                 return NULL;
1629         }
1630         iommu = drhd->iommu;
1631
1632         domain = iommu_alloc_domain(iommu);
1633         if (!domain)
1634                 goto error;
1635
1636         if (domain_init(domain, gaw)) {
1637                 domain_exit(domain);
1638                 goto error;
1639         }
1640
1641         /* register pcie-to-pci device */
1642         if (dev_tmp) {
1643                 info = alloc_devinfo_mem();
1644                 if (!info) {
1645                         domain_exit(domain);
1646                         goto error;
1647                 }
1648                 info->segment = segment;
1649                 info->bus = bus;
1650                 info->devfn = devfn;
1651                 info->dev = NULL;
1652                 info->domain = domain;
1653                 /* This domain is shared by devices under p2p bridge */
1654                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1655
1656                 /* the PCIe-to-PCI bridge already has a domain, use it */
1657                 found = NULL;
1658                 spin_lock_irqsave(&device_domain_lock, flags);
1659                 list_for_each_entry(tmp, &device_domain_list, global) {
1660                         if (tmp->segment == segment &&
1661                             tmp->bus == bus && tmp->devfn == devfn) {
1662                                 found = tmp->domain;
1663                                 break;
1664                         }
1665                 }
1666                 if (found) {
1667                         free_devinfo_mem(info);
1668                         domain_exit(domain);
1669                         domain = found;
1670                 } else {
1671                         list_add(&info->link, &domain->devices);
1672                         list_add(&info->global, &device_domain_list);
1673                 }
1674                 spin_unlock_irqrestore(&device_domain_lock, flags);
1675         }
1676
1677 found_domain:
1678         info = alloc_devinfo_mem();
1679         if (!info)
1680                 goto error;
1681         info->segment = segment;
1682         info->bus = pdev->bus->number;
1683         info->devfn = pdev->devfn;
1684         info->dev = pdev;
1685         info->domain = domain;
1686         spin_lock_irqsave(&device_domain_lock, flags);
1687         /* somebody else was faster and already set up the domain */
1688         found = find_domain(pdev);
1689         if (found != NULL) {
1690                 spin_unlock_irqrestore(&device_domain_lock, flags);
1691                 if (found != domain) {
1692                         domain_exit(domain);
1693                         domain = found;
1694                 }
1695                 free_devinfo_mem(info);
1696                 return domain;
1697         }
1698         list_add(&info->link, &domain->devices);
1699         list_add(&info->global, &device_domain_list);
1700         pdev->dev.archdata.iommu = info;
1701         spin_unlock_irqrestore(&device_domain_lock, flags);
1702         return domain;
1703 error:
1704         /* recheck here; another thread may have set it meanwhile */
1705         return find_domain(pdev);
1706 }
1707
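/*
 * Descriptive comment (added): create a 1:1 mapping of [start, end) for this
 * device: reserve the matching IOVA range in its domain, map the pages
 * read/write and install the context entry.
 */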
1708 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1709                                       unsigned long long start,
1710                                       unsigned long long end)
1711 {
1712         struct dmar_domain *domain;
1713         unsigned long size;
1714         unsigned long long base;
1715         int ret;
1716
1717         printk(KERN_INFO
1718                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1719                 pci_name(pdev), start, end);
1720         /* page table init */
1721         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1722         if (!domain)
1723                 return -ENOMEM;
1724
1725         /* The address might not be aligned */
1726         base = start & PAGE_MASK;
1727         size = end - base;
1728         size = PAGE_ALIGN(size);
1729         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1730                         IOVA_PFN(base + size) - 1)) {
1731                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1732                 ret = -ENOMEM;
1733                 goto error;
1734         }
1735
1736         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1737                 size, base, pci_name(pdev));
1738         /*
1739          * RMRR range might have overlap with physical memory range,
1740          * clear it first
1741          */
1742         dma_pte_clear_range(domain, base, base + size);
1743
1744         ret = domain_page_mapping(domain, base, base, size,
1745                 DMA_PTE_READ|DMA_PTE_WRITE);
1746         if (ret)
1747                 goto error;
1748
1749         /* context entry init */
1750         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
1751         if (!ret)
1752                 return 0;
1753 error:
1754         domain_exit(domain);
1755         return ret;
1756
1757 }
1758
1759 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1760         struct pci_dev *pdev)
1761 {
1762         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1763                 return 0;
1764         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1765                 rmrr->end_address + 1);
1766 }
1767
1768 #ifdef CONFIG_DMAR_GFX_WA
1769 struct iommu_prepare_data {
1770         struct pci_dev *pdev;
1771         int ret;
1772 };
1773
1774 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1775                                          unsigned long end_pfn, void *datax)
1776 {
1777         struct iommu_prepare_data *data;
1778
1779         data = (struct iommu_prepare_data *)datax;
1780
1781         data->ret = iommu_prepare_identity_map(data->pdev,
1782                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1783         return data->ret;
1784
1785 }
1786
1787 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1788 {
1789         int nid;
1790         struct iommu_prepare_data data;
1791
1792         data.pdev = pdev;
1793         data.ret = 0;
1794
1795         for_each_online_node(nid) {
1796                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1797                 if (data.ret)
1798                         return data.ret;
1799         }
1800         return data.ret;
1801 }
1802
1803 static void __init iommu_prepare_gfx_mapping(void)
1804 {
1805         struct pci_dev *pdev = NULL;
1806         int ret;
1807
1808         for_each_pci_dev(pdev) {
1809                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1810                                 !IS_GFX_DEVICE(pdev))
1811                         continue;
1812                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1813                         pci_name(pdev));
1814                 ret = iommu_prepare_with_active_regions(pdev);
1815                 if (ret)
1816                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1817         }
1818 }
1819 #else /* !CONFIG_DMAR_GFX_WA */
1820 static inline void iommu_prepare_gfx_mapping(void)
1821 {
1822         return;
1823 }
1824 #endif
1825
1826 #ifdef CONFIG_DMAR_FLOPPY_WA
1827 static inline void iommu_prepare_isa(void)
1828 {
1829         struct pci_dev *pdev;
1830         int ret;
1831
1832         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1833         if (!pdev)
1834                 return;
1835
1836         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1837         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1838
1839         if (ret)
1840                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map, "
1841                         "floppy might not work\n");
1842
1843 }
1844 #else
1845 static inline void iommu_prepare_isa(void)
1846 {
1847         return;
1848 }
1849 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1850
1851 /* Initialize each context entry as pass-through. */
1852 static int __init init_context_pass_through(void)
1853 {
1854         struct pci_dev *pdev = NULL;
1855         struct dmar_domain *domain;
1856         int ret;
1857
1858         for_each_pci_dev(pdev) {
1859                 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1860                 ret = domain_context_mapping(domain, pdev,
1861                                              CONTEXT_TT_PASS_THROUGH);
1862                 if (ret)
1863                         return ret;
1864         }
1865         return 0;
1866 }
1867
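/*
 * Descriptive comment (added): one-time __init setup of DMA remapping —
 * allocate the global iommu and deferred-flush arrays, initialize each DRHD
 * (domains, root entry, invalidation method), set up pass-through or the
 * RMRR/gfx/ISA identity maps, then program the root entries and enable
 * translation.
 */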
1868 static int __init init_dmars(void)
1869 {
1870         struct dmar_drhd_unit *drhd;
1871         struct dmar_rmrr_unit *rmrr;
1872         struct pci_dev *pdev;
1873         struct intel_iommu *iommu;
1874         int i, ret;
1875         int pass_through = 1;
1876
1877         /*
1878          * for each drhd
1879          *    allocate root
1880          *    initialize and program root entry to not present
1881          * endfor
1882          */
1883         for_each_drhd_unit(drhd) {
1884                 g_num_of_iommus++;
1885                 /*
1886                  * No lock needed: this is only incremented in the
1887                  * single-threaded kernel __init code path; all other
1888                  * accesses are read-only.
1889                  */
1890         }
1891
1892         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1893                         GFP_KERNEL);
1894         if (!g_iommus) {
1895                 printk(KERN_ERR "Allocating global iommu array failed\n");
1896                 ret = -ENOMEM;
1897                 goto error;
1898         }
1899
1900         deferred_flush = kzalloc(g_num_of_iommus *
1901                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1902         if (!deferred_flush) {
1903                 kfree(g_iommus);
1904                 ret = -ENOMEM;
1905                 goto error;
1906         }
1907
1908         for_each_drhd_unit(drhd) {
1909                 if (drhd->ignored)
1910                         continue;
1911
1912                 iommu = drhd->iommu;
1913                 g_iommus[iommu->seq_id] = iommu;
1914
1915                 ret = iommu_init_domains(iommu);
1916                 if (ret)
1917                         goto error;
1918
1919                 /*
1920                  * TBD:
1921                  * we could share the same root & context tables
1922                  * among all IOMMUs; need to split that out later.
1923                  */
1924                 ret = iommu_alloc_root_entry(iommu);
1925                 if (ret) {
1926                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1927                         goto error;
1928                 }
1929                 if (!ecap_pass_through(iommu->ecap))
1930                         pass_through = 0;
1931         }
1932         if (iommu_pass_through)
1933                 if (!pass_through) {
1934                         printk(KERN_INFO
1935                                "Pass Through is not supported by hardware.\n");
1936                         iommu_pass_through = 0;
1937                 }
1938
1939         /*
1940          * Start from a sane iommu hardware state.
1941          */
1942         for_each_drhd_unit(drhd) {
1943                 if (drhd->ignored)
1944                         continue;
1945
1946                 iommu = drhd->iommu;
1947
1948                 /*
1949                  * If the queued invalidation is already initialized by us
1950                  * (for example, while enabling interrupt-remapping) then
1951                  * things are already rolling from a sane state.
1952                  */
1953                 if (iommu->qi)
1954                         continue;
1955
1956                 /*
1957                  * Clear any previous faults.
1958                  */
1959                 dmar_fault(-1, iommu);
1960                 /*
1961                  * Disable queued invalidation if supported and already enabled
1962                  * before OS handover.
1963                  */
1964                 dmar_disable_qi(iommu);
1965         }
1966
1967         for_each_drhd_unit(drhd) {
1968                 if (drhd->ignored)
1969                         continue;
1970
1971                 iommu = drhd->iommu;
1972
1973                 if (dmar_enable_qi(iommu)) {
1974                         /*
1975                          * Queued Invalidate not enabled, use Register Based
1976                          * Invalidate
1977                          */
1978                         iommu->flush.flush_context = __iommu_flush_context;
1979                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1980                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1981                                "invalidation\n",
1982                                (unsigned long long)drhd->reg_base_addr);
1983                 } else {
1984                         iommu->flush.flush_context = qi_flush_context;
1985                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1986                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1987                                "invalidation\n",
1988                                (unsigned long long)drhd->reg_base_addr);
1989                 }
1990         }
1991
1992 #ifdef CONFIG_INTR_REMAP
1993         if (!intr_remapping_enabled) {
1994                 ret = enable_intr_remapping(0);
1995                 if (ret)
1996                         printk(KERN_ERR
1997                                "IOMMU: enable interrupt remapping failed\n");
1998         }
1999 #endif
2000         /*
2001          * If pass through is set and enabled, the context entries of all
2002          * pci devices are initialized with the pass through translation type.
2003          */
2004         if (iommu_pass_through) {
2005                 ret = init_context_pass_through();
2006                 if (ret) {
2007                         printk(KERN_ERR "IOMMU: Pass through init failed.\n");
2008                         iommu_pass_through = 0;
2009                 }
2010         }
2011
2012         /*
2013          * If pass through is not set or not enabled, set up context entries for
2014          * identity mappings for rmrr, gfx, and isa.
2015          */
2016         if (!iommu_pass_through) {
2017                 /*
2018                  * For each rmrr
2019                  *   for each dev attached to rmrr
2020                  *   do
2021                  *     locate drhd for dev, alloc domain for dev
2022                  *     allocate free domain
2023                  *     allocate page table entries for rmrr
2024                  *     if context not allocated for bus
2025                  *           allocate and init context
2026                  *           set present in root table for this bus
2027                  *     init context with domain, translation etc
2028                  *    endfor
2029                  * endfor
2030                  */
2031                 for_each_rmrr_units(rmrr) {
2032                         for (i = 0; i < rmrr->devices_cnt; i++) {
2033                                 pdev = rmrr->devices[i];
2034                                 /*
2035                                  * some BIOSes list non-existent devices in
2036                                  * the DMAR table.
2037                                  */
2038                                 if (!pdev)
2039                                         continue;
2040                                 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2041                                 if (ret)
2042                                         printk(KERN_ERR
2043                                  "IOMMU: mapping reserved region failed\n");
2044                         }
2045                 }
2046
2047                 iommu_prepare_gfx_mapping();
2048
2049                 iommu_prepare_isa();
2050         }
2051
2052         /*
2053          * for each drhd
2054          *   enable fault log
2055          *   global invalidate context cache
2056          *   global invalidate iotlb
2057          *   enable translation
2058          */
2059         for_each_drhd_unit(drhd) {
2060                 if (drhd->ignored)
2061                         continue;
2062                 iommu = drhd->iommu;
2063
2064                 iommu_flush_write_buffer(iommu);
2065
2066                 ret = dmar_set_interrupt(iommu);
2067                 if (ret)
2068                         goto error;
2069
2070                 iommu_set_root_entry(iommu);
2071
2072                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2073                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2074                 iommu_disable_protect_mem_regions(iommu);
2075
2076                 ret = iommu_enable_translation(iommu);
2077                 if (ret)
2078                         goto error;
2079         }
2080
2081         return 0;
2082 error:
2083         for_each_drhd_unit(drhd) {
2084                 if (drhd->ignored)
2085                         continue;
2086                 iommu = drhd->iommu;
2087                 free_iommu(iommu);
2088         }
2089         kfree(g_iommus);
2090         return ret;
2091 }
2092
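/*
 * Descriptive comment (added): number of bytes of whole pages needed to cover
 * [host_addr, host_addr + size).  For example (assuming 4KiB pages),
 * aligned_size(0x1001, 0x100) returns PAGE_SIZE, since the 0x100 bytes
 * starting at offset 1 fit in a single page.
 */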
2093 static inline u64 aligned_size(u64 host_addr, size_t size)
2094 {
2095         u64 addr;
2096         addr = (host_addr & (~PAGE_MASK)) + size;
2097         return PAGE_ALIGN(addr);
2098 }
2099
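/*
 * Descriptive comment (added): allocate an IOVA range of 'size' bytes below
 * 'end', with 'end' clamped to the domain's maximum guest address.
 */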
2100 struct iova *
2101 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2102 {
2103         struct iova *piova;
2104
2105         /* Make sure it's in range */
2106         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2107         if (!size || (IOVA_START_ADDR + size > end))
2108                 return NULL;
2109
2110         piova = alloc_iova(&domain->iovad,
2111                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2112         return piova;
2113 }
2114
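/*
 * Descriptive comment (added): pick an IOVA for the device.  Devices limited
 * to 32-bit DMA (or when forcedac is set) allocate straight from their
 * dma_mask; otherwise try the 32-bit range first and fall back to the full
 * dma_mask range.
 */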
2115 static struct iova *
2116 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2117                    size_t size, u64 dma_mask)
2118 {
2119         struct pci_dev *pdev = to_pci_dev(dev);
2120         struct iova *iova = NULL;
2121
2122         if (dma_mask <= DMA_BIT_MASK(32) || dmar_forcedac)
2123                 iova = iommu_alloc_iova(domain, size, dma_mask);
2124         else {
2125                 /*
2126                  * First try to allocate an io virtual address in
2127                  * DMA_BIT_MASK(32) and if that fails then try allocating
2128                  * from higher range
2129                  */
2130                 iova = iommu_alloc_iova(domain, size, DMA_BIT_MASK(32));
2131                 if (!iova)
2132                         iova = iommu_alloc_iova(domain, size, dma_mask);
2133         }
2134
2135         if (!iova) {
2136                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2137                 return NULL;
2138         }
2139
2140         return iova;
2141 }
2142
2143 static struct dmar_domain *
2144 get_valid_domain_for_dev(struct pci_dev *pdev)
2145 {
2146         struct dmar_domain *domain;
2147         int ret;
2148
2149         domain = get_domain_for_dev(pdev,
2150                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2151         if (!domain) {
2152                 printk(KERN_ERR
2153                         "Allocating domain for %s failed\n", pci_name(pdev));
2154                 return NULL;
2155         }
2156
2157         /* make sure context mapping is ok */
2158         if (unlikely(!domain_context_mapped(pdev))) {
2159                 ret = domain_context_mapping(domain, pdev,
2160                                              CONTEXT_TT_MULTI_LEVEL);
2161                 if (ret) {
2162                         printk(KERN_ERR
2163                                 "Domain context map for %s failed\n",
2164                                 pci_name(pdev));
2165                         return NULL;
2166                 }
2167         }
2168
2169         return domain;
2170 }
2171
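/*
 * Descriptive comment (added): core mapping routine for the DMA API.
 * Bypassed devices get their physical address back unchanged; otherwise
 * allocate an IOVA, install page table entries for the page-aligned range
 * and flush the IOTLB (caching mode) or the write buffer, returning the
 * resulting bus address.
 */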
2172 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2173                                      size_t size, int dir, u64 dma_mask)
2174 {
2175         struct pci_dev *pdev = to_pci_dev(hwdev);
2176         struct dmar_domain *domain;
2177         phys_addr_t start_paddr;
2178         struct iova *iova;
2179         int prot = 0;
2180         int ret;
2181         struct intel_iommu *iommu;
2182
2183         BUG_ON(dir == DMA_NONE);
2184         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2185                 return paddr;
2186
2187         domain = get_valid_domain_for_dev(pdev);
2188         if (!domain)
2189                 return 0;
2190
2191         iommu = domain_get_iommu(domain);
2192         size = aligned_size((u64)paddr, size);
2193
2194         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2195         if (!iova)
2196                 goto error;
2197
2198         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2199
2200         /*
2201          * Check if DMAR supports zero-length reads on write-only
2202          * mappings.
2203          */
2204         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2205                         !cap_zlr(iommu->cap))
2206                 prot |= DMA_PTE_READ;
2207         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2208                 prot |= DMA_PTE_WRITE;
2209         /*
2210          * [paddr, paddr + size) might cover only part of a page, so we map the
2211          * whole page.  Note: if two parts of one page are mapped separately, we
2212          * might end up with two guest addresses mapping to the same host paddr,
2213          * but this is not a big problem.
2214          */
2215         ret = domain_page_mapping(domain, start_paddr,
2216                 ((u64)paddr) & PAGE_MASK, size, prot);
2217         if (ret)
2218                 goto error;
2219
2220         /* it's a non-present to present mapping. Only flush if caching mode */
2221         if (cap_caching_mode(iommu->cap))
2222                 iommu_flush_iotlb_psi(iommu, 0, start_paddr,
2223                                       size >> VTD_PAGE_SHIFT);
2224         else
2225                 iommu_flush_write_buffer(iommu);
2226
2227         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2228
2229 error:
2230         if (iova)
2231                 __free_iova(&domain->iovad, iova);
2232         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2233                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2234         return 0;
2235 }
2236
2237 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2238                                  unsigned long offset, size_t size,
2239                                  enum dma_data_direction dir,
2240                                  struct dma_attrs *attrs)
2241 {
2242         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2243                                   dir, to_pci_dev(dev)->dma_mask);
2244 }
2245
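/*
 * Descriptive comment (added): drain the deferred-unmap queues — one global
 * IOTLB flush per iommu that has pending entries, then free the queued
 * IOVAs.  Callers hold async_umap_flush_lock.
 */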
2246 static void flush_unmaps(void)
2247 {
2248         int i, j;
2249
2250         timer_on = 0;
2251
2252         /* just flush them all */
2253         for (i = 0; i < g_num_of_iommus; i++) {
2254                 struct intel_iommu *iommu = g_iommus[i];
2255                 if (!iommu)
2256                         continue;
2257
2258                 if (!deferred_flush[i].next)
2259                         continue;
2260
2261                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2262                                          DMA_TLB_GLOBAL_FLUSH);
2263                 for (j = 0; j < deferred_flush[i].next; j++) {
2264                         __free_iova(&deferred_flush[i].domain[j]->iovad,
2265                                         deferred_flush[i].iova[j]);
2266                 }
2267                 deferred_flush[i].next = 0;
2268         }
2269
2270         list_size = 0;
2271 }
2272
2273 static void flush_unmaps_timeout(unsigned long data)
2274 {
2275         unsigned long flags;
2276
2277         spin_lock_irqsave(&async_umap_flush_lock, flags);
2278         flush_unmaps();
2279         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2280 }
2281
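/*
 * Descriptive comment (added): queue an IOVA for deferred freeing on its
 * iommu's deferred_flush list.  The queue is drained by the 10ms unmap
 * timer, or immediately once HIGH_WATER_MARK entries have accumulated.
 */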
2282 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2283 {
2284         unsigned long flags;
2285         int next, iommu_id;
2286         struct intel_iommu *iommu;
2287
2288         spin_lock_irqsave(&async_umap_flush_lock, flags);
2289         if (list_size == HIGH_WATER_MARK)
2290                 flush_unmaps();
2291
2292         iommu = domain_get_iommu(dom);
2293         iommu_id = iommu->seq_id;
2294
2295         next = deferred_flush[iommu_id].next;
2296         deferred_flush[iommu_id].domain[next] = dom;
2297         deferred_flush[iommu_id].iova[next] = iova;
2298         deferred_flush[iommu_id].next++;
2299
2300         if (!timer_on) {
2301                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2302                 timer_on = 1;
2303         }
2304         list_size++;
2305         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2306 }
2307
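/*
 * Descriptive comment (added): tear down a single DMA mapping — clear the
 * PTEs and free the page tables for the range, then either flush the IOTLB
 * and release the IOVA immediately (intel_iommu_strict) or defer the
 * release via add_unmap().
 */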
2308 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2309                              size_t size, enum dma_data_direction dir,
2310                              struct dma_attrs *attrs)
2311 {
2312         struct pci_dev *pdev = to_pci_dev(dev);
2313         struct dmar_domain *domain;
2314         unsigned long start_addr;
2315         struct iova *iova;
2316         struct intel_iommu *iommu;
2317
2318         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2319                 return;
2320         domain = find_domain(pdev);
2321         BUG_ON(!domain);
2322
2323         iommu = domain_get_iommu(domain);
2324
2325         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2326         if (!iova)
2327                 return;
2328
2329         start_addr = iova->pfn_lo << PAGE_SHIFT;
2330         size = aligned_size((u64)dev_addr, size);
2331
2332         pr_debug("Device %s unmapping: %zx@%llx\n",
2333                 pci_name(pdev), size, (unsigned long long)start_addr);
2334
2335         /*  clear the whole page */
2336         dma_pte_clear_range(domain, start_addr, start_addr + size);
2337         /* free page tables */
2338         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2339         if (intel_iommu_strict) {
2340                 iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2341                                       size >> VTD_PAGE_SHIFT);
2342                 /* free iova */
2343                 __free_iova(&domain->iovad, iova);
2344         } else {
2345                 add_unmap(domain, iova);
2346                 /*
2347                  * queue up the release of the unmap to save the ~1/6th of the
2348                  * cpu time otherwise used up by the iotlb flush operation...
2349                  */
2350         }
2351 }
2352
2353 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2354                                int dir)
2355 {
2356         intel_unmap_page(dev, dev_addr, size, dir, NULL);
2357 }
2358
2359 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2360                                   dma_addr_t *dma_handle, gfp_t flags)
2361 {
2362         void *vaddr;
2363         int order;
2364
2365         size = PAGE_ALIGN(size);
2366         order = get_order(size);
2367         flags &= ~(GFP_DMA | GFP_DMA32);
2368
2369         vaddr = (void *)__get_free_pages(flags, order);
2370         if (!vaddr)
2371                 return NULL;
2372         memset(vaddr, 0, size);
2373
2374         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2375                                          DMA_BIDIRECTIONAL,
2376                                          hwdev->coherent_dma_mask);
2377         if (*dma_handle)
2378                 return vaddr;
2379         free_pages((unsigned long)vaddr, order);
2380         return NULL;
2381 }
2382
2383 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2384                                 dma_addr_t dma_handle)
2385 {
2386         int order;
2387
2388         size = PAGE_ALIGN(size);
2389         order = get_order(size);
2390
2391         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2392         free_pages((unsigned long)vaddr, order);
2393 }
2394
2395 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2396                            int nelems, enum dma_data_direction dir,
2397                            struct dma_attrs *attrs)
2398 {
2399         int i;
2400         struct pci_dev *pdev = to_pci_dev(hwdev);
2401         struct dmar_domain *domain;
2402         unsigned long start_addr;
2403         struct iova *iova;
2404         size_t size = 0;
2405         phys_addr_t addr;
2406         struct scatterlist *sg;
2407         struct intel_iommu *iommu;
2408
2409         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2410                 return;
2411
2412         domain = find_domain(pdev);
2413         BUG_ON(!domain);
2414
2415         iommu = domain_get_iommu(domain);
2416
2417         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2418         if (!iova)
2419                 return;
2420         for_each_sg(sglist, sg, nelems, i) {
2421                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2422                 size += aligned_size((u64)addr, sg->length);
2423         }
2424
2425         start_addr = iova->pfn_lo << PAGE_SHIFT;
2426
2427         /*  clear the whole page */
2428         dma_pte_clear_range(domain, start_addr, start_addr + size);
2429         /* free page tables */
2430         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2431
2432         iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2433                               size >> VTD_PAGE_SHIFT);
2434
2435         /* free iova */
2436         __free_iova(&domain->iovad, iova);
2437 }
2438
2439 static int intel_nontranslate_map_sg(struct device *hddev,
2440         struct scatterlist *sglist, int nelems, int dir)
2441 {
2442         int i;
2443         struct scatterlist *sg;
2444
2445         for_each_sg(sglist, sg, nelems, i) {
2446                 BUG_ON(!sg_page(sg));
2447                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2448                 sg->dma_length = sg->length;
2449         }
2450         return nelems;
2451 }
2452
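/*
 * Descriptive comment (added): map a scatterlist — allocate one IOVA range
 * big enough for all segments, map each segment into it consecutively and
 * fill in dma_address/dma_length, rolling everything back if any
 * per-segment mapping fails.
 */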
2453 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2454                         enum dma_data_direction dir, struct dma_attrs *attrs)
2455 {
2456         phys_addr_t addr;
2457         int i;
2458         struct pci_dev *pdev = to_pci_dev(hwdev);
2459         struct dmar_domain *domain;
2460         size_t size = 0;
2461         int prot = 0;
2462         size_t offset = 0;
2463         struct iova *iova = NULL;
2464         int ret;
2465         struct scatterlist *sg;
2466         unsigned long start_addr;
2467         struct intel_iommu *iommu;
2468
2469         BUG_ON(dir == DMA_NONE);
2470         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2471                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2472
2473         domain = get_valid_domain_for_dev(pdev);
2474         if (!domain)
2475                 return 0;
2476
2477         iommu = domain_get_iommu(domain);
2478
2479         for_each_sg(sglist, sg, nelems, i) {
2480                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2481                 size += aligned_size((u64)addr, sg->length);
2482         }
2483
2484         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2485         if (!iova) {
2486                 sglist->dma_length = 0;
2487                 return 0;
2488         }
2489
2490         /*
2491          * Check if DMAR supports zero-length reads on write-only
2492          * mappings.
2493          */
2494         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2495                         !cap_zlr(iommu->cap))
2496                 prot |= DMA_PTE_READ;
2497         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2498                 prot |= DMA_PTE_WRITE;
2499
2500         start_addr = iova->pfn_lo << PAGE_SHIFT;
2501         offset = 0;
2502         for_each_sg(sglist, sg, nelems, i) {
2503                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2504                 size = aligned_size((u64)addr, sg->length);
2505                 ret = domain_page_mapping(domain, start_addr + offset,
2506                         ((u64)addr) & PAGE_MASK,
2507                         size, prot);
2508                 if (ret) {
2509                         /*  clear the page */
2510                         dma_pte_clear_range(domain, start_addr,
2511                                   start_addr + offset);
2512                         /* free page tables */
2513                         dma_pte_free_pagetable(domain, start_addr,
2514                                   start_addr + offset);
2515                         /* free iova */
2516                         __free_iova(&domain->iovad, iova);
2517                         return 0;
2518                 }
2519                 sg->dma_address = start_addr + offset +
2520                                 ((u64)addr & (~PAGE_MASK));
2521                 sg->dma_length = sg->length;
2522                 offset += size;
2523         }
2524
2525         /* it's a non-present to present mapping. Only flush if caching mode */
2526         if (cap_caching_mode(iommu->cap))
2527                 iommu_flush_iotlb_psi(iommu, 0, start_addr,
2528                                       offset >> VTD_PAGE_SHIFT);
2529         else
2530                 iommu_flush_write_buffer(iommu);
2531
2532         return nelems;
2533 }
2534
2535 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2536 {
2537         return !dma_addr;
2538 }
2539
2540 struct dma_map_ops intel_dma_ops = {
2541         .alloc_coherent = intel_alloc_coherent,
2542         .free_coherent = intel_free_coherent,
2543         .map_sg = intel_map_sg,
2544         .unmap_sg = intel_unmap_sg,
2545         .map_page = intel_map_page,
2546         .unmap_page = intel_unmap_page,
2547         .mapping_error = intel_mapping_error,
2548 };
2549
2550 static inline int iommu_domain_cache_init(void)
2551 {
2552         int ret = 0;
2553
2554         iommu_domain_cache = kmem_cache_create("iommu_domain",
2555                                          sizeof(struct dmar_domain),
2556                                          0,
2557                                          SLAB_HWCACHE_ALIGN,
2559                                          NULL);
2560         if (!iommu_domain_cache) {
2561                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2562                 ret = -ENOMEM;
2563         }
2564
2565         return ret;
2566 }
2567
2568 static inline int iommu_devinfo_cache_init(void)
2569 {
2570         int ret = 0;
2571
2572         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2573                                          sizeof(struct device_domain_info),
2574                                          0,
2575                                          SLAB_HWCACHE_ALIGN,
2576                                          NULL);
2577         if (!iommu_devinfo_cache) {
2578                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2579                 ret = -ENOMEM;
2580         }
2581
2582         return ret;
2583 }
2584
2585 static inline int iommu_iova_cache_init(void)
2586 {
2587         int ret = 0;
2588
2589         iommu_iova_cache = kmem_cache_create("iommu_iova",
2590                                          sizeof(struct iova),
2591                                          0,
2592                                          SLAB_HWCACHE_ALIGN,
2593                                          NULL);
2594         if (!iommu_iova_cache) {
2595                 printk(KERN_ERR "Couldn't create iova cache\n");
2596                 ret = -ENOMEM;
2597         }
2598
2599         return ret;
2600 }
2601
2602 static int __init iommu_init_mempool(void)
2603 {
2604         int ret;
2605         ret = iommu_iova_cache_init();
2606         if (ret)
2607                 return ret;
2608
2609         ret = iommu_domain_cache_init();
2610         if (ret)
2611                 goto domain_error;
2612
2613         ret = iommu_devinfo_cache_init();
2614         if (!ret)
2615                 return ret;
2616
2617         kmem_cache_destroy(iommu_domain_cache);
2618 domain_error:
2619         kmem_cache_destroy(iommu_iova_cache);
2620
2621         return -ENOMEM;
2622 }
2623
2624 static void __init iommu_exit_mempool(void)
2625 {
2626         kmem_cache_destroy(iommu_devinfo_cache);
2627         kmem_cache_destroy(iommu_domain_cache);
2628         kmem_cache_destroy(iommu_iova_cache);
2629
2630 }
2631
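/*
 * Descriptive comment (added): mark DMAR units with no PCI devices as
 * ignored and, unless dmar_map_gfx is set, also bypass units that only
 * cover graphics devices by giving those devices DUMMY_DEVICE_DOMAIN_INFO.
 */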
2632 static void __init init_no_remapping_devices(void)
2633 {
2634         struct dmar_drhd_unit *drhd;
2635
2636         for_each_drhd_unit(drhd) {
2637                 if (!drhd->include_all) {
2638                         int i;
2639                         for (i = 0; i < drhd->devices_cnt; i++)
2640                                 if (drhd->devices[i] != NULL)
2641                                         break;
2642                         /* ignore DMAR unit if no pci devices exist */
2643                         if (i == drhd->devices_cnt)
2644                                 drhd->ignored = 1;
2645                 }
2646         }
2647
2648         if (dmar_map_gfx)
2649                 return;
2650
2651         for_each_drhd_unit(drhd) {
2652                 int i;
2653                 if (drhd->ignored || drhd->include_all)
2654                         continue;
2655
2656                 for (i = 0; i < drhd->devices_cnt; i++)
2657                         if (drhd->devices[i] &&
2658                                 !IS_GFX_DEVICE(drhd->devices[i]))
2659                                 break;
2660
2661                 if (i < drhd->devices_cnt)
2662                         continue;
2663
2664                 /* bypass IOMMU if it is just for gfx devices */
2665                 drhd->ignored = 1;
2666                 for (i = 0; i < drhd->devices_cnt; i++) {
2667                         if (!drhd->devices[i])
2668                                 continue;
2669                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2670                 }
2671         }
2672 }
2673
2674 #ifdef CONFIG_SUSPEND
2675 static int init_iommu_hw(void)
2676 {
2677         struct dmar_drhd_unit *drhd;
2678         struct intel_iommu *iommu = NULL;
2679
2680         for_each_active_iommu(iommu, drhd)
2681                 if (iommu->qi)
2682                         dmar_reenable_qi(iommu);
2683
2684         for_each_active_iommu(iommu, drhd) {
2685                 iommu_flush_write_buffer(iommu);
2686
2687                 iommu_set_root_entry(iommu);
2688
2689                 iommu->flush.flush_context(iommu, 0, 0, 0,
2690                                            DMA_CCMD_GLOBAL_INVL);
2691                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2692                                          DMA_TLB_GLOBAL_FLUSH);
2693                 iommu_disable_protect_mem_regions(iommu);
2694                 iommu_enable_translation(iommu);
2695         }
2696
2697         return 0;
2698 }
2699
2700 static void iommu_flush_all(void)
2701 {
2702         struct dmar_drhd_unit *drhd;
2703         struct intel_iommu *iommu;
2704
2705         for_each_active_iommu(iommu, drhd) {
2706                 iommu->flush.flush_context(iommu, 0, 0, 0,
2707                                            DMA_CCMD_GLOBAL_INVL);
2708                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2709                                          DMA_TLB_GLOBAL_FLUSH);
2710         }
2711 }
2712
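/*
 * Descriptive comment (added): suspend handler — allocate the save areas,
 * flush all caches, disable translation and save the fault-event registers
 * so iommu_resume() can restore them.
 */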
2713 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
2714 {
2715         struct dmar_drhd_unit *drhd;
2716         struct intel_iommu *iommu = NULL;
2717         unsigned long flag;
2718
2719         for_each_active_iommu(iommu, drhd) {
2720                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
2721                                                  GFP_ATOMIC);
2722                 if (!iommu->iommu_state)
2723                         goto nomem;
2724         }
2725
2726         iommu_flush_all();
2727
2728         for_each_active_iommu(iommu, drhd) {
2729                 iommu_disable_translation(iommu);
2730
2731                 spin_lock_irqsave(&iommu->register_lock, flag);
2732
2733                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2734                         readl(iommu->reg + DMAR_FECTL_REG);
2735                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2736                         readl(iommu->reg + DMAR_FEDATA_REG);
2737                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2738                         readl(iommu->reg + DMAR_FEADDR_REG);
2739                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2740                         readl(iommu->reg + DMAR_FEUADDR_REG);
2741
2742                 spin_unlock_irqrestore(&iommu->register_lock, flag);
2743         }
2744         return 0;
2745
2746 nomem:
2747         for_each_active_iommu(iommu, drhd)
2748                 kfree(iommu->iommu_state);
2749
2750         return -ENOMEM;
2751 }
2752
2753 static int iommu_resume(struct sys_device *dev)
2754 {
2755         struct dmar_drhd_unit *drhd;
2756         struct intel_iommu *iommu = NULL;
2757         unsigned long flag;
2758
2759         if (init_iommu_hw()) {
2760                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2761                 return -EIO;
2762         }
2763
2764         for_each_active_iommu(iommu, drhd) {
2765
2766                 spin_lock_irqsave(&iommu->register_lock, flag);
2767
2768                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2769                         iommu->reg + DMAR_FECTL_REG);
2770                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2771                         iommu->reg + DMAR_FEDATA_REG);
2772                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2773                         iommu->reg + DMAR_FEADDR_REG);
2774                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2775                         iommu->reg + DMAR_FEUADDR_REG);
2776
2777                 spin_unlock_irqrestore(&iommu->register_lock, flag);
2778         }
2779
2780         for_each_active_iommu(iommu, drhd)
2781                 kfree(iommu->iommu_state);
2782
2783         return 0;
2784 }
2785
2786 static struct sysdev_class iommu_sysclass = {
2787         .name           = "iommu",
2788         .resume         = iommu_resume,
2789         .suspend        = iommu_suspend,
2790 };
2791
2792 static struct sys_device device_iommu = {
2793         .cls    = &iommu_sysclass,
2794 };
2795
2796 static int __init init_iommu_sysfs(void)
2797 {
2798         int error;
2799
2800         error = sysdev_class_register(&iommu_sysclass);
2801         if (error)
2802                 return error;
2803
2804         error = sysdev_register(&device_iommu);
2805         if (error)
2806                 sysdev_class_unregister(&iommu_sysclass);
2807
2808         return error;
2809 }
2810
2811 #else
2812 static int __init init_iommu_sysfs(void)
2813 {
2814         return 0;
2815 }
2816 #endif  /* CONFIG_SUSPEND */
2817
2818 int __init intel_iommu_init(void)
2819 {
2820         int ret = 0;
2821
2822         if (dmar_table_init())
2823                 return  -ENODEV;
2824
2825         if (dmar_dev_scope_init())
2826                 return  -ENODEV;
2827
2828         /*
2829          * Check the need for DMA-remapping initialization now.
2830          * Above initialization will also be used by Interrupt-remapping.
2831          */
2832         if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled)
2833                 return -ENODEV;
2834
2835         iommu_init_mempool();
2836         dmar_init_reserved_ranges();
2837
2838         init_no_remapping_devices();
2839
2840         ret = init_dmars();
2841         if (ret) {
2842                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2843                 put_iova_domain(&reserved_iova_list);
2844                 iommu_exit_mempool();
2845                 return ret;
2846         }
2847         printk(KERN_INFO
2848         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2849
2850         init_timer(&unmap_timer);
2851         force_iommu = 1;
2852
2853         if (!iommu_pass_through) {
2854                 printk(KERN_INFO
2855                        "Multi-level page-table translation for DMAR.\n");
2856                 dma_ops = &intel_dma_ops;
2857         } else
2858                 printk(KERN_INFO
2859                        "DMAR: Pass through translation for DMAR.\n");
2860
2861         init_iommu_sysfs();
2862
2863         register_iommu(&intel_iommu_ops);
2864
2865         return 0;
2866 }
2867
2868 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2869                                   struct pci_dev *pdev)
2870 {
2871         struct device_domain_info *info;
2872         unsigned long flags;
2873
2874         info = alloc_devinfo_mem();
2875         if (!info)
2876                 return -ENOMEM;
2877
2878         info->segment = pci_domain_nr(pdev->bus);
2879         info->bus = pdev->bus->number;
2880         info->devfn = pdev->devfn;
2881         info->dev = pdev;
2882         info->domain = domain;
2883
2884         spin_lock_irqsave(&device_domain_lock, flags);
2885         list_add(&info->link, &domain->devices);
2886         list_add(&info->global, &device_domain_list);
2887         pdev->dev.archdata.iommu = info;
2888         spin_unlock_irqrestore(&device_domain_lock, flags);
2889
2890         return 0;
2891 }
2892
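/*
 * Descriptive comment (added): for a device behind a PCIe-to-PCI bridge,
 * also detach every bridge on the path up to (and including) that bridge —
 * the secondary bus with devfn 0 for a PCIe bridge, or the bridge's own
 * bus/devfn for legacy PCI.
 */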
2893 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
2894                                            struct pci_dev *pdev)
2895 {
2896         struct pci_dev *tmp, *parent;
2897
2898         if (!iommu || !pdev)
2899                 return;
2900
2901         /* dependent device detach */
2902         tmp = pci_find_upstream_pcie_bridge(pdev);
2903         /* Secondary interface's bus number and devfn 0 */
2904         if (tmp) {
2905                 parent = pdev->bus->self;
2906                 while (parent != tmp) {
2907                         iommu_detach_dev(iommu, parent->bus->number,
2908                                          parent->devfn);
2909                         parent = parent->bus->self;
2910                 }
2911                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
2912                         iommu_detach_dev(iommu,
2913                                 tmp->subordinate->number, 0);
2914                 else /* this is a legacy PCI bridge */
2915                         iommu_detach_dev(iommu, tmp->bus->number,
2916                                          tmp->devfn);
2917         }
2918 }
2919
2920 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2921                                           struct pci_dev *pdev)
2922 {
2923         struct device_domain_info *info;
2924         struct intel_iommu *iommu;
2925         unsigned long flags;
2926         int found = 0;
2927         struct list_head *entry, *tmp;
2928
2929         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
2930                                 pdev->devfn);
2931         if (!iommu)
2932                 return;
2933
2934         spin_lock_irqsave(&device_domain_lock, flags);
2935         list_for_each_safe(entry, tmp, &domain->devices) {
2936                 info = list_entry(entry, struct device_domain_info, link);
2937                 /* No need to compare PCI domain; it has to be the same */
2938                 if (info->bus == pdev->bus->number &&
2939                     info->devfn == pdev->devfn) {
2940                         list_del(&info->link);
2941                         list_del(&info->global);
2942                         if (info->dev)
2943                                 info->dev->dev.archdata.iommu = NULL;
2944                         spin_unlock_irqrestore(&device_domain_lock, flags);
2945
2946                         iommu_detach_dev(iommu, info->bus, info->devfn);
2947                         iommu_detach_dependent_devices(iommu, pdev);
2948                         free_devinfo_mem(info);
2949
2950                         spin_lock_irqsave(&device_domain_lock, flags);
2951
2952                         if (found)
2953                                 break;
2954                         else
2955                                 continue;
2956                 }
2957
2958                 /* if there are no other devices under the same iommu
2959                  * owned by this domain, clear this iommu in iommu_bmp and
2960                  * update the iommu count and coherency
2961                  */
2962                 if (iommu == device_to_iommu(info->segment, info->bus,
2963                                             info->devfn))
2964                         found = 1;
2965         }
2966
2967         if (found == 0) {
2968                 unsigned long tmp_flags;
2969                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2970                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2971                 domain->iommu_count--;
2972                 domain_update_iommu_cap(domain);
2973                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2974         }
2975
2976         spin_unlock_irqrestore(&device_domain_lock, flags);
2977 }
2978
2979 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2980 {
2981         struct device_domain_info *info;
2982         struct intel_iommu *iommu;
2983         unsigned long flags1, flags2;
2984
2985         spin_lock_irqsave(&device_domain_lock, flags1);
2986         while (!list_empty(&domain->devices)) {
2987                 info = list_entry(domain->devices.next,
2988                         struct device_domain_info, link);
2989                 list_del(&info->link);
2990                 list_del(&info->global);
2991                 if (info->dev)
2992                         info->dev->dev.archdata.iommu = NULL;
2993
2994                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2995
2996                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
2997                 iommu_detach_dev(iommu, info->bus, info->devfn);
2998                 iommu_detach_dependent_devices(iommu, info->dev);
2999
3000                 /* clear this iommu in iommu_bmp, update iommu count
3001                  * and capabilities
3002                  */
3003                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3004                 if (test_and_clear_bit(iommu->seq_id,
3005                                        &domain->iommu_bmp)) {
3006                         domain->iommu_count--;
3007                         domain_update_iommu_cap(domain);
3008                 }
3009                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3010
3011                 free_devinfo_mem(info);
3012                 spin_lock_irqsave(&device_domain_lock, flags1);
3013         }
3014         spin_unlock_irqrestore(&device_domain_lock, flags1);
3015 }
3016
3017 /* domain id allocator for virtual machine domains; never set in a context entry */
3018 static unsigned long vm_domid;
3019
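/*
 * Descriptive comment (added): smallest adjusted guest address width among
 * this domain's own agaw and the agaw of every iommu the domain is
 * currently attached to.
 */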
3020 static int vm_domain_min_agaw(struct dmar_domain *domain)
3021 {
3022         int i;
3023         int min_agaw = domain->agaw;
3024
3025         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
3026         for (; i < g_num_of_iommus; ) {
3027                 if (min_agaw > g_iommus[i]->agaw)
3028                         min_agaw = g_iommus[i]->agaw;
3029
3030                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
3031         }
3032
3033         return min_agaw;
3034 }
3035
3036 static struct dmar_domain *iommu_alloc_vm_domain(void)
3037 {
3038         struct dmar_domain *domain;
3039
3040         domain = alloc_domain_mem();
3041         if (!domain)
3042                 return NULL;
3043
3044         domain->id = vm_domid++;
3045         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3046         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3047
3048         return domain;
3049 }
3050
3051 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
3052 {
3053         int adjust_width;
3054
3055         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3056         spin_lock_init(&domain->mapping_lock);
3057         spin_lock_init(&domain->iommu_lock);
3058
3059         domain_reserve_special_ranges(domain);
3060
3061         /* calculate AGAW */
3062         domain->gaw = guest_width;
3063         adjust_width = guestwidth_to_adjustwidth(guest_width);
3064         domain->agaw = width_to_agaw(adjust_width);
3065
3066         INIT_LIST_HEAD(&domain->devices);
3067
3068         domain->iommu_count = 0;
3069         domain->iommu_coherency = 0;
3070         domain->max_addr = 0;
3071
3072         /* always allocate the top pgd */
3073         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
3074         if (!domain->pgd)
3075                 return -ENOMEM;
3076         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3077         return 0;
3078 }
3079
3080 static void iommu_free_vm_domain(struct dmar_domain *domain)
3081 {
3082         unsigned long flags;
3083         struct dmar_drhd_unit *drhd;
3084         struct intel_iommu *iommu;
3085         unsigned long i;
3086         unsigned long ndomains;
3087
3088         for_each_drhd_unit(drhd) {
3089                 if (drhd->ignored)
3090                         continue;
3091                 iommu = drhd->iommu;
3092
3093                 ndomains = cap_ndoms(iommu->cap);
3094                 i = find_first_bit(iommu->domain_ids, ndomains);
3095                 for (; i < ndomains; ) {
3096                         if (iommu->domains[i] == domain) {
3097                                 spin_lock_irqsave(&iommu->lock, flags);
3098                                 clear_bit(i, iommu->domain_ids);
3099                                 iommu->domains[i] = NULL;
3100                                 spin_unlock_irqrestore(&iommu->lock, flags);
3101                                 break;
3102                         }
3103                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3104                 }
3105         }
3106 }
3107
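     /*
      * Tear down a VM domain: detach all devices, free its IOVA space,
      * clear and free its page tables, and release its domain ids.
      */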
3108 static void vm_domain_exit(struct dmar_domain *domain)
3109 {
3110         u64 end;
3111
3112         /* Domain 0 is reserved, so don't process it */
3113         if (!domain)
3114                 return;
3115
3116         vm_domain_remove_all_dev_info(domain);
3117         /* destroy iovas */
3118         put_iova_domain(&domain->iovad);
3119         end = DOMAIN_MAX_ADDR(domain->gaw);
3120         end = end & VTD_PAGE_MASK;
3121
3122         /* clear ptes */
3123         dma_pte_clear_range(domain, 0, end);
3124
3125         /* free page tables */
3126         dma_pte_free_pagetable(domain, 0, end);
3127
3128         iommu_free_vm_domain(domain);
3129         free_domain_mem(domain);
3130 }
3131
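     /* iommu_ops callback: attach a newly allocated VM dmar_domain to an iommu_domain */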
3132 static int intel_iommu_domain_init(struct iommu_domain *domain)
3133 {
3134         struct dmar_domain *dmar_domain;
3135
3136         dmar_domain = iommu_alloc_vm_domain();
3137         if (!dmar_domain) {
3138                 printk(KERN_ERR
3139                         "intel_iommu_domain_init: failed to allocate dmar_domain\n");
3140                 return -ENOMEM;
3141         }
3142         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3143                 printk(KERN_ERR
3144                         "intel_iommu_domain_init: vm_domain_init() failed\n");
3145                 vm_domain_exit(dmar_domain);
3146                 return -ENOMEM;
3147         }
3148         domain->priv = dmar_domain;
3149
3150         return 0;
3151 }
3152
3153 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3154 {
3155         struct dmar_domain *dmar_domain = domain->priv;
3156
3157         domain->priv = NULL;
3158         vm_domain_exit(dmar_domain);
3159 }
3160
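     /*
      * iommu_ops callback: attach a PCI device to a VM domain.  Any existing
      * context mapping is torn down first, and the target IOMMU's AGAW must
      * cover the domain's highest mapped address.
      */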
3161 static int intel_iommu_attach_device(struct iommu_domain *domain,
3162                                      struct device *dev)
3163 {
3164         struct dmar_domain *dmar_domain = domain->priv;
3165         struct pci_dev *pdev = to_pci_dev(dev);
3166         struct intel_iommu *iommu;
3167         int addr_width;
3168         u64 end;
3169         int ret;
3170
3171         /* normally pdev is not mapped; if it is, detach it from its old domain */
3172         if (unlikely(domain_context_mapped(pdev))) {
3173                 struct dmar_domain *old_domain;
3174
3175                 old_domain = find_domain(pdev);
3176                 if (old_domain) {
3177                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3178                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3179                         else
3180                                 domain_remove_dev_info(old_domain);
3181                 }
3182         }
3183
3184         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3185                                 pdev->devfn);
3186         if (!iommu)
3187                 return -ENODEV;
3188
3189         /* check if this iommu agaw is sufficient for max mapped address */
3190         addr_width = agaw_to_width(iommu->agaw);
3191         end = DOMAIN_MAX_ADDR(addr_width);
3192         end = end & VTD_PAGE_MASK;
3193         if (end < dmar_domain->max_addr) {
3194                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3195                        "sufficient for the mapped address (%llx)\n",
3196                        __func__, iommu->agaw, dmar_domain->max_addr);
3197                 return -EFAULT;
3198         }
3199
3200         ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3201         if (ret)
3202                 return ret;
3203
3204         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3205         return ret;
3206 }
3207
3208 static void intel_iommu_detach_device(struct iommu_domain *domain,
3209                                       struct device *dev)
3210 {
3211         struct dmar_domain *dmar_domain = domain->priv;
3212         struct pci_dev *pdev = to_pci_dev(dev);
3213
3214         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3215 }
3216
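     /*
      * iommu_ops callback: map [iova, iova + size) to hpa.  Growing the
      * domain's max_addr is refused if the smallest AGAW of the attached
      * IOMMUs cannot reach the new top address.
      */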
3217 static int intel_iommu_map_range(struct iommu_domain *domain,
3218                                  unsigned long iova, phys_addr_t hpa,
3219                                  size_t size, int iommu_prot)
3220 {
3221         struct dmar_domain *dmar_domain = domain->priv;
3222         u64 max_addr;
3223         int addr_width;
3224         int prot = 0;
3225         int ret;
3226
3227         if (iommu_prot & IOMMU_READ)
3228                 prot |= DMA_PTE_READ;
3229         if (iommu_prot & IOMMU_WRITE)
3230                 prot |= DMA_PTE_WRITE;
3231         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3232                 prot |= DMA_PTE_SNP;
3233
3234         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3235         if (dmar_domain->max_addr < max_addr) {
3236                 int min_agaw;
3237                 u64 end;
3238
3239                 /* check if minimum agaw is sufficient for mapped address */
3240                 min_agaw = vm_domain_min_agaw(dmar_domain);
3241                 addr_width = agaw_to_width(min_agaw);
3242                 end = DOMAIN_MAX_ADDR(addr_width);
3243                 end = end & VTD_PAGE_MASK;
3244                 if (end < max_addr) {
3245                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3246                                "sufficient for the mapped address (%llx)\n",
3247                                __func__, min_agaw, max_addr);
3248                         return -EFAULT;
3249                 }
3250                 dmar_domain->max_addr = max_addr;
3251         }
3252
3253         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3254         return ret;
3255 }
3256
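     /*
      * iommu_ops callback: clear the PTEs covering [iova, iova + size) and
      * shrink max_addr when the topmost mapping is removed.
      */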
3257 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3258                                     unsigned long iova, size_t size)
3259 {
3260         struct dmar_domain *dmar_domain = domain->priv;
3261         dma_addr_t base;
3262
3263         /* The address might not be aligned */
3264         base = iova & VTD_PAGE_MASK;
3265         size = VTD_PAGE_ALIGN(size);
3266         dma_pte_clear_range(dmar_domain, base, base + size);
3267
3268         if (dmar_domain->max_addr == base + size)
3269                 dmar_domain->max_addr = base;
3270 }
3271
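     /* iommu_ops callback: look up the physical address an IOVA is mapped to */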
3272 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3273                                             unsigned long iova)
3274 {
3275         struct dmar_domain *dmar_domain = domain->priv;
3276         struct dma_pte *pte;
3277         u64 phys = 0;
3278
3279         pte = addr_to_dma_pte(dmar_domain, iova);
3280         if (pte)
3281                 phys = dma_pte_addr(pte);
3282
3283         return phys;
3284 }
3285
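     /* iommu_ops callback: report cache-coherency (snooping) capability */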
3286 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3287                                       unsigned long cap)
3288 {
3289         struct dmar_domain *dmar_domain = domain->priv;
3290
3291         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3292                 return dmar_domain->iommu_snooping;
3293
3294         return 0;
3295 }
3296
3297 static struct iommu_ops intel_iommu_ops = {
3298         .domain_init    = intel_iommu_domain_init,
3299         .domain_destroy = intel_iommu_domain_destroy,
3300         .attach_dev     = intel_iommu_attach_device,
3301         .detach_dev     = intel_iommu_detach_device,
3302         .map            = intel_iommu_map_range,
3303         .unmap          = intel_iommu_unmap_range,
3304         .iova_to_phys   = intel_iommu_iova_to_phys,
3305         .domain_has_cap = intel_iommu_domain_has_cap,
3306 };
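     /*
      * Illustrative sketch only (not part of this driver): roughly how a
      * caller such as KVM device assignment might exercise these ops via the
      * generic IOMMU API of this kernel generation, assuming the
      * iommu_domain_alloc()/iommu_attach_device()/iommu_map_range() wrappers
      * declared in <linux/iommu.h>:
      *
      *     struct iommu_domain *dom = iommu_domain_alloc();
      *     int prot = IOMMU_READ | IOMMU_WRITE;
      *
      *     if (!dom)
      *             return -ENOMEM;
      *     if (iommu_attach_device(dom, &pdev->dev))
      *             goto out_free;
      *     if (iommu_domain_has_cap(dom, IOMMU_CAP_CACHE_COHERENCY))
      *             prot |= IOMMU_CACHE;
      *     ret = iommu_map_range(dom, gpa, hpa, size, prot);
      *     ...
      *     iommu_unmap_range(dom, gpa, size);
      *     iommu_detach_device(dom, &pdev->dev);
      *     iommu_domain_free(dom);
      *
      * Here pdev, gpa, hpa and size stand in for the caller's device and its
      * guest-physical to host-physical mapping.
      */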
3307
3308 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3309 {
3310         /*
3311          * Mobile 4 Series Chipset neglects to set RWBF capability,
3312          * but needs it:
3313          */
3314         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3315         rwbf_quirk = 1;
3316 }
3317
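     /* 0x2a40 is the Mobile 4 Series Chipset memory controller hub */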
3318 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);