drivers/pci/intel-iommu.c  (linux-2.6, commit bc99b1e47fbc4b869eeafcb3b175737d764b841a)
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <asm/cacheflush.h>
41 #include <asm/iommu.h>
42 #include "pci.h"
43
44 #define ROOT_SIZE               VTD_PAGE_SIZE
45 #define CONTEXT_SIZE            VTD_PAGE_SIZE
46
47 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
48 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
49
50 #define IOAPIC_RANGE_START      (0xfee00000)
51 #define IOAPIC_RANGE_END        (0xfeefffff)
52 #define IOVA_START_ADDR         (0x1000)
53
54 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
55
56 #define MAX_AGAW_WIDTH 64
57
58 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
59
60 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
61 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
62 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
63
64 /* global iommu list, set NULL for ignored DMAR units */
65 static struct intel_iommu **g_iommus;
66
67 static int rwbf_quirk;
68
69 /*
70  * 0: Present
71  * 1-11: Reserved
72  * 12-63: Context Ptr (12 - (haw-1))
73  * 64-127: Reserved
74  */
75 struct root_entry {
76         u64     val;
77         u64     rsvd1;
78 };
79 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
80 static inline bool root_present(struct root_entry *root)
81 {
82         return (root->val & 1);
83 }
84 static inline void set_root_present(struct root_entry *root)
85 {
86         root->val |= 1;
87 }
88 static inline void set_root_value(struct root_entry *root, unsigned long value)
89 {
90         root->val |= value & VTD_PAGE_MASK;
91 }
92
93 static inline struct context_entry *
94 get_context_addr_from_root(struct root_entry *root)
95 {
96         return (struct context_entry *)
97                 (root_present(root)?phys_to_virt(
98                 root->val & VTD_PAGE_MASK) :
99                 NULL);
100 }
101
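/*
 * Illustrative sketch, not part of the original driver: how the root-entry
 * helpers above are typically combined when a freshly allocated context
 * table page is installed for a bus, mirroring what
 * device_to_context_entry() does further down.  The function name is
 * hypothetical.
 */
static inline void example_install_context_table(struct root_entry *root,
                                                 void *ctx_table_page)
{
        /* point the root entry at the physical address of the table ... */
        set_root_value(root, virt_to_phys(ctx_table_page));
        /* ... and only then mark it present */
        set_root_present(root);
}
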
102 /*
103  * low 64 bits:
104  * 0: present
105  * 1: fault processing disable
106  * 2-3: translation type
107  * 12-63: address space root
108  * high 64 bits:
109  * 0-2: address width
110  * 3-6: available
111  * 8-23: domain id
112  */
113 struct context_entry {
114         u64 lo;
115         u64 hi;
116 };
117
118 static inline bool context_present(struct context_entry *context)
119 {
120         return (context->lo & 1);
121 }
122 static inline void context_set_present(struct context_entry *context)
123 {
124         context->lo |= 1;
125 }
126
127 static inline void context_set_fault_enable(struct context_entry *context)
128 {
129         context->lo &= (((u64)-1) << 2) | 1;
130 }
131
132 static inline void context_set_translation_type(struct context_entry *context,
133                                                 unsigned long value)
134 {
135         context->lo &= (((u64)-1) << 4) | 3;
136         context->lo |= (value & 3) << 2;
137 }
138
139 static inline void context_set_address_root(struct context_entry *context,
140                                             unsigned long value)
141 {
142         context->lo |= value & VTD_PAGE_MASK;
143 }
144
145 static inline void context_set_address_width(struct context_entry *context,
146                                              unsigned long value)
147 {
148         context->hi |= value & 7;
149 }
150
151 static inline void context_set_domain_id(struct context_entry *context,
152                                          unsigned long value)
153 {
154         context->hi |= (value & ((1 << 16) - 1)) << 8;
155 }
156
157 static inline void context_clear_entry(struct context_entry *context)
158 {
159         context->lo = 0;
160         context->hi = 0;
161 }
162
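/*
 * Illustrative sketch, not part of the original driver: the order in which
 * the setters above are used to build a present, multi-level context entry,
 * along the lines of domain_context_mapping_one() below.  The function name
 * and its arguments are hypothetical.
 */
static inline void example_fill_context(struct context_entry *ce, u16 did,
                                        int agaw, unsigned long pgd_phys)
{
        context_set_domain_id(ce, did);
        context_set_address_width(ce, agaw);
        context_set_address_root(ce, pgd_phys);
        context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
        context_set_fault_enable(ce);   /* clears the FPD bit */
        context_set_present(ce);
}
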
163 /*
164  * 0: readable
165  * 1: writable
166  * 2-6: reserved
167  * 7: super page
168  * 8-10: available
169  * 11: snoop behavior
170  * 12-63: Host physical address
171  */
172 struct dma_pte {
173         u64 val;
174 };
175
176 static inline void dma_clear_pte(struct dma_pte *pte)
177 {
178         pte->val = 0;
179 }
180
181 static inline void dma_set_pte_readable(struct dma_pte *pte)
182 {
183         pte->val |= DMA_PTE_READ;
184 }
185
186 static inline void dma_set_pte_writable(struct dma_pte *pte)
187 {
188         pte->val |= DMA_PTE_WRITE;
189 }
190
191 static inline void dma_set_pte_snp(struct dma_pte *pte)
192 {
193         pte->val |= DMA_PTE_SNP;
194 }
195
196 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
197 {
198         pte->val = (pte->val & ~3) | (prot & 3);
199 }
200
201 static inline u64 dma_pte_addr(struct dma_pte *pte)
202 {
203         return (pte->val & VTD_PAGE_MASK);
204 }
205
206 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
207 {
208         pte->val |= (addr & VTD_PAGE_MASK);
209 }
210
211 static inline bool dma_pte_present(struct dma_pte *pte)
212 {
213         return (pte->val & 3) != 0;
214 }
215
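/*
 * Illustrative sketch, not part of the original driver: composing a leaf PTE
 * for a VTD_PAGE_SIZE-aligned host physical address with read/write
 * permission, using the helpers above.  The function name is hypothetical.
 */
static inline void example_make_rw_pte(struct dma_pte *pte, u64 hpa)
{
        dma_clear_pte(pte);
        dma_set_pte_addr(pte, hpa & VTD_PAGE_MASK);
        dma_set_pte_prot(pte, DMA_PTE_READ | DMA_PTE_WRITE);
}
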
216 /* devices under the same p2p bridge are owned in one domain */
217 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
218
219 /* the domain represents a virtual machine; more than one device
220  * across iommus may be owned by one domain, e.g. a kvm guest.
221  */
222 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
223
224 struct dmar_domain {
225         int     id;                     /* domain id */
226         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
227
228         struct list_head devices;       /* all devices' list */
229         struct iova_domain iovad;       /* iova's that belong to this domain */
230
231         struct dma_pte  *pgd;           /* virtual address */
232         spinlock_t      mapping_lock;   /* page table lock */
233         int             gaw;            /* max guest address width */
234
235         /* adjusted guest address width, 0 is level 2 30-bit */
236         int             agaw;
237
238         int             flags;          /* flags to find out type of domain */
239
240         int             iommu_coherency;/* indicate coherency of iommu access */
241         int             iommu_snooping; /* indicate snooping control feature*/
242         int             iommu_count;    /* reference count of iommu */
243         spinlock_t      iommu_lock;     /* protect iommu set in domain */
244         u64             max_addr;       /* maximum mapped address */
245 };
246
247 /* PCI domain-device relationship */
248 struct device_domain_info {
249         struct list_head link;  /* link to domain siblings */
250         struct list_head global; /* link to global list */
251         int segment;            /* PCI domain */
252         u8 bus;                 /* PCI bus number */
253         u8 devfn;               /* PCI devfn number */
254         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
255         struct dmar_domain *domain; /* pointer to domain */
256 };
257
258 static void flush_unmaps_timeout(unsigned long data);
259
260 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
261
262 #define HIGH_WATER_MARK 250
263 struct deferred_flush_tables {
264         int next;
265         struct iova *iova[HIGH_WATER_MARK];
266         struct dmar_domain *domain[HIGH_WATER_MARK];
267 };
268
269 static struct deferred_flush_tables *deferred_flush;
270
271 /* number of registered intel_iommus; sizes g_iommus and the per-domain iommu bitmaps */
272 static int g_num_of_iommus;
273
274 static DEFINE_SPINLOCK(async_umap_flush_lock);
275 static LIST_HEAD(unmaps_to_do);
276
277 static int timer_on;
278 static long list_size;
279
280 static void domain_remove_dev_info(struct dmar_domain *domain);
281
282 #ifdef CONFIG_DMAR_DEFAULT_ON
283 int dmar_disabled = 0;
284 #else
285 int dmar_disabled = 1;
286 #endif /*CONFIG_DMAR_DEFAULT_ON*/
287
288 static int __initdata dmar_map_gfx = 1;
289 static int dmar_forcedac;
290 static int intel_iommu_strict;
291
292 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
293 static DEFINE_SPINLOCK(device_domain_lock);
294 static LIST_HEAD(device_domain_list);
295
296 static struct iommu_ops intel_iommu_ops;
297
298 static int __init intel_iommu_setup(char *str)
299 {
300         if (!str)
301                 return -EINVAL;
302         while (*str) {
303                 if (!strncmp(str, "on", 2)) {
304                         dmar_disabled = 0;
305                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
306                 } else if (!strncmp(str, "off", 3)) {
307                         dmar_disabled = 1;
308                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
309                 } else if (!strncmp(str, "igfx_off", 8)) {
310                         dmar_map_gfx = 0;
311                         printk(KERN_INFO
312                                 "Intel-IOMMU: disable GFX device mapping\n");
313                 } else if (!strncmp(str, "forcedac", 8)) {
314                         printk(KERN_INFO
315                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
316                         dmar_forcedac = 1;
317                 } else if (!strncmp(str, "strict", 6)) {
318                         printk(KERN_INFO
319                                 "Intel-IOMMU: disable batched IOTLB flush\n");
320                         intel_iommu_strict = 1;
321                 }
322
323                 str += strcspn(str, ",");
324                 while (*str == ',')
325                         str++;
326         }
327         return 0;
328 }
329 __setup("intel_iommu=", intel_iommu_setup);
330
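/*
 * Usage note, not part of the original file: the parser above accepts a
 * comma-separated list on the kernel command line, e.g.
 *
 *	intel_iommu=on,strict,igfx_off
 *
 * which enables the IOMMU, disables batched (deferred) IOTLB flushing and
 * skips mapping of graphics devices.
 */
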
331 static struct kmem_cache *iommu_domain_cache;
332 static struct kmem_cache *iommu_devinfo_cache;
333 static struct kmem_cache *iommu_iova_cache;
334
335 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
336 {
337         unsigned int flags;
338         void *vaddr;
339
340         /* trying to avoid low memory issues */
341         flags = current->flags & PF_MEMALLOC;
342         current->flags |= PF_MEMALLOC;
343         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
344         current->flags &= (~PF_MEMALLOC | flags);
345         return vaddr;
346 }
347
348
349 static inline void *alloc_pgtable_page(void)
350 {
351         unsigned int flags;
352         void *vaddr;
353
354         /* trying to avoid low memory issues */
355         flags = current->flags & PF_MEMALLOC;
356         current->flags |= PF_MEMALLOC;
357         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
358         current->flags &= (~PF_MEMALLOC | flags);
359         return vaddr;
360 }
361
362 static inline void free_pgtable_page(void *vaddr)
363 {
364         free_page((unsigned long)vaddr);
365 }
366
367 static inline void *alloc_domain_mem(void)
368 {
369         return iommu_kmem_cache_alloc(iommu_domain_cache);
370 }
371
372 static void free_domain_mem(void *vaddr)
373 {
374         kmem_cache_free(iommu_domain_cache, vaddr);
375 }
376
377 static inline void * alloc_devinfo_mem(void)
378 {
379         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
380 }
381
382 static inline void free_devinfo_mem(void *vaddr)
383 {
384         kmem_cache_free(iommu_devinfo_cache, vaddr);
385 }
386
387 struct iova *alloc_iova_mem(void)
388 {
389         return iommu_kmem_cache_alloc(iommu_iova_cache);
390 }
391
392 void free_iova_mem(struct iova *iova)
393 {
394         kmem_cache_free(iommu_iova_cache, iova);
395 }
396
397
398 static inline int width_to_agaw(int width);
399
400 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
401 {
402         unsigned long sagaw;
403         int agaw = -1;
404
405         sagaw = cap_sagaw(iommu->cap);
406         for (agaw = width_to_agaw(max_gaw);
407              agaw >= 0; agaw--) {
408                 if (test_bit(agaw, &sagaw))
409                         break;
410         }
411
412         return agaw;
413 }
414
415 /*
416  * Calculate max SAGAW for each iommu.
417  */
418 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
419 {
420         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
421 }
422
423 /*
424  * Calculate agaw for each iommu.
425  * "SAGAW" may be different across iommus; use a default agaw, and
426  * fall back to a smaller supported agaw for iommus that don't support the default.
427  */
428 int iommu_calculate_agaw(struct intel_iommu *iommu)
429 {
430         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
431 }
432
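/*
 * Worked example, not part of the original file: with the default 48-bit
 * domain address width, width_to_agaw(48) == (48 - 30) / 9 == 2, so an
 * iommu whose SAGAW capability has bit 2 set gets agaw 2 (a 4-level page
 * table); if only bit 1 is set, __iommu_calculate_agaw() falls back to
 * agaw 1, i.e. a 39-bit, 3-level table.
 */
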
433 /* in the native (non-VM) case, each domain is associated with exactly one iommu */
434 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
435 {
436         int iommu_id;
437
438         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
439
440         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
441         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
442                 return NULL;
443
444         return g_iommus[iommu_id];
445 }
446
447 static void domain_update_iommu_coherency(struct dmar_domain *domain)
448 {
449         int i;
450
451         domain->iommu_coherency = 1;
452
453         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
454         for (; i < g_num_of_iommus; ) {
455                 if (!ecap_coherent(g_iommus[i]->ecap)) {
456                         domain->iommu_coherency = 0;
457                         break;
458                 }
459                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
460         }
461 }
462
463 static void domain_update_iommu_snooping(struct dmar_domain *domain)
464 {
465         int i;
466
467         domain->iommu_snooping = 1;
468
469         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
470         for (; i < g_num_of_iommus; ) {
471                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
472                         domain->iommu_snooping = 0;
473                         break;
474                 }
475                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
476         }
477 }
478
479 /* Some capabilities may be different across iommus */
480 static void domain_update_iommu_cap(struct dmar_domain *domain)
481 {
482         domain_update_iommu_coherency(domain);
483         domain_update_iommu_snooping(domain);
484 }
485
486 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
487 {
488         struct dmar_drhd_unit *drhd = NULL;
489         int i;
490
491         for_each_drhd_unit(drhd) {
492                 if (drhd->ignored)
493                         continue;
494                 if (segment != drhd->segment)
495                         continue;
496
497                 for (i = 0; i < drhd->devices_cnt; i++) {
498                         if (drhd->devices[i] &&
499                             drhd->devices[i]->bus->number == bus &&
500                             drhd->devices[i]->devfn == devfn)
501                                 return drhd->iommu;
502                         if (drhd->devices[i] &&
503                             drhd->devices[i]->subordinate &&
504                             drhd->devices[i]->subordinate->number <= bus &&
505                             drhd->devices[i]->subordinate->subordinate >= bus)
506                                 return drhd->iommu;
507                 }
508
509                 if (drhd->include_all)
510                         return drhd->iommu;
511         }
512
513         return NULL;
514 }
515
516 static void domain_flush_cache(struct dmar_domain *domain,
517                                void *addr, int size)
518 {
519         if (!domain->iommu_coherency)
520                 clflush_cache_range(addr, size);
521 }
522
523 /* Gets context entry for a given bus and devfn */
524 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
525                 u8 bus, u8 devfn)
526 {
527         struct root_entry *root;
528         struct context_entry *context;
529         unsigned long phy_addr;
530         unsigned long flags;
531
532         spin_lock_irqsave(&iommu->lock, flags);
533         root = &iommu->root_entry[bus];
534         context = get_context_addr_from_root(root);
535         if (!context) {
536                 context = (struct context_entry *)alloc_pgtable_page();
537                 if (!context) {
538                         spin_unlock_irqrestore(&iommu->lock, flags);
539                         return NULL;
540                 }
541                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
542                 phy_addr = virt_to_phys((void *)context);
543                 set_root_value(root, phy_addr);
544                 set_root_present(root);
545                 __iommu_flush_cache(iommu, root, sizeof(*root));
546         }
547         spin_unlock_irqrestore(&iommu->lock, flags);
548         return &context[devfn];
549 }
550
551 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
552 {
553         struct root_entry *root;
554         struct context_entry *context;
555         int ret;
556         unsigned long flags;
557
558         spin_lock_irqsave(&iommu->lock, flags);
559         root = &iommu->root_entry[bus];
560         context = get_context_addr_from_root(root);
561         if (!context) {
562                 ret = 0;
563                 goto out;
564         }
565         ret = context_present(&context[devfn]);
566 out:
567         spin_unlock_irqrestore(&iommu->lock, flags);
568         return ret;
569 }
570
571 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
572 {
573         struct root_entry *root;
574         struct context_entry *context;
575         unsigned long flags;
576
577         spin_lock_irqsave(&iommu->lock, flags);
578         root = &iommu->root_entry[bus];
579         context = get_context_addr_from_root(root);
580         if (context) {
581                 context_clear_entry(&context[devfn]);
582                 __iommu_flush_cache(iommu, &context[devfn], \
583                         sizeof(*context));
584         }
585         spin_unlock_irqrestore(&iommu->lock, flags);
586 }
587
588 static void free_context_table(struct intel_iommu *iommu)
589 {
590         struct root_entry *root;
591         int i;
592         unsigned long flags;
593         struct context_entry *context;
594
595         spin_lock_irqsave(&iommu->lock, flags);
596         if (!iommu->root_entry) {
597                 goto out;
598         }
599         for (i = 0; i < ROOT_ENTRY_NR; i++) {
600                 root = &iommu->root_entry[i];
601                 context = get_context_addr_from_root(root);
602                 if (context)
603                         free_pgtable_page(context);
604         }
605         free_pgtable_page(iommu->root_entry);
606         iommu->root_entry = NULL;
607 out:
608         spin_unlock_irqrestore(&iommu->lock, flags);
609 }
610
611 /* page table handling */
612 #define LEVEL_STRIDE            (9)
613 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
614
615 static inline int agaw_to_level(int agaw)
616 {
617         return agaw + 2;
618 }
619
620 static inline int agaw_to_width(int agaw)
621 {
622         return 30 + agaw * LEVEL_STRIDE;
623
624 }
625
626 static inline int width_to_agaw(int width)
627 {
628         return (width - 30) / LEVEL_STRIDE;
629 }
630
631 static inline unsigned int level_to_offset_bits(int level)
632 {
633         return (12 + (level - 1) * LEVEL_STRIDE);
634 }
635
636 static inline int address_level_offset(u64 addr, int level)
637 {
638         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
639 }
640
641 static inline u64 level_mask(int level)
642 {
643         return ((u64)-1 << level_to_offset_bits(level));
644 }
645
646 static inline u64 level_size(int level)
647 {
648         return ((u64)1 << level_to_offset_bits(level));
649 }
650
651 static inline u64 align_to_level(u64 addr, int level)
652 {
653         return ((addr + level_size(level) - 1) & level_mask(level));
654 }
655
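/*
 * Worked example, not part of the original file: each level adds
 * LEVEL_STRIDE (9) index bits on top of the 12-bit page offset, so a
 * level-1 entry maps 4KiB (level_size(1) == 1 << 12), a level-2 entry
 * covers 2MiB (1 << 21) and a level-3 entry covers 1GiB (1 << 30);
 * address_level_offset() extracts the corresponding 9-bit index from an
 * address.
 */
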
656 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
657 {
658         int addr_width = agaw_to_width(domain->agaw);
659         struct dma_pte *parent, *pte = NULL;
660         int level = agaw_to_level(domain->agaw);
661         int offset;
662         unsigned long flags;
663
664         BUG_ON(!domain->pgd);
665
666         addr &= (((u64)1) << addr_width) - 1;
667         parent = domain->pgd;
668
669         spin_lock_irqsave(&domain->mapping_lock, flags);
670         while (level > 0) {
671                 void *tmp_page;
672
673                 offset = address_level_offset(addr, level);
674                 pte = &parent[offset];
675                 if (level == 1)
676                         break;
677
678                 if (!dma_pte_present(pte)) {
679                         tmp_page = alloc_pgtable_page();
680
681                         if (!tmp_page) {
682                                 spin_unlock_irqrestore(&domain->mapping_lock,
683                                         flags);
684                                 return NULL;
685                         }
686                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
687                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
688                         /*
689                          * higher level tables always set r/w; the last
690                          * level page table controls read/write
691                          */
692                         dma_set_pte_readable(pte);
693                         dma_set_pte_writable(pte);
694                         domain_flush_cache(domain, pte, sizeof(*pte));
695                 }
696                 parent = phys_to_virt(dma_pte_addr(pte));
697                 level--;
698         }
699
700         spin_unlock_irqrestore(&domain->mapping_lock, flags);
701         return pte;
702 }
703
704 /* return address's pte at specific level */
705 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
706                 int level)
707 {
708         struct dma_pte *parent, *pte = NULL;
709         int total = agaw_to_level(domain->agaw);
710         int offset;
711
712         parent = domain->pgd;
713         while (level <= total) {
714                 offset = address_level_offset(addr, total);
715                 pte = &parent[offset];
716                 if (level == total)
717                         return pte;
718
719                 if (!dma_pte_present(pte))
720                         break;
721                 parent = phys_to_virt(dma_pte_addr(pte));
722                 total--;
723         }
724         return NULL;
725 }
726
727 /* clear the last-level pte for one page */
728 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
729 {
730         struct dma_pte *pte = NULL;
731
732         /* get last level pte */
733         pte = dma_addr_level_pte(domain, addr, 1);
734
735         if (pte) {
736                 dma_clear_pte(pte);
737                 domain_flush_cache(domain, pte, sizeof(*pte));
738         }
739 }
740
741 /* clear last level ptes; a tlb flush should follow */
742 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
743 {
744         int addr_width = agaw_to_width(domain->agaw);
745         int npages;
746
747         start &= (((u64)1) << addr_width) - 1;
748         end &= (((u64)1) << addr_width) - 1;
749         /* in case it's a partial page */
750         start &= PAGE_MASK;
751         end = PAGE_ALIGN(end);
752         npages = (end - start) / VTD_PAGE_SIZE;
753
754         /* we don't need lock here, nobody else touches the iova range */
755         while (npages--) {
756                 dma_pte_clear_one(domain, start);
757                 start += VTD_PAGE_SIZE;
758         }
759 }
760
761 /* free page table pages. last level pte should already be cleared */
762 static void dma_pte_free_pagetable(struct dmar_domain *domain,
763         u64 start, u64 end)
764 {
765         int addr_width = agaw_to_width(domain->agaw);
766         struct dma_pte *pte;
767         int total = agaw_to_level(domain->agaw);
768         int level;
769         u64 tmp;
770
771         start &= (((u64)1) << addr_width) - 1;
772         end &= (((u64)1) << addr_width) - 1;
773
774         /* we don't need lock here, nobody else touches the iova range */
775         level = 2;
776         while (level <= total) {
777                 tmp = align_to_level(start, level);
778                 if (tmp >= end || (tmp + level_size(level) > end))
779                         return;
780
781                 while (tmp < end) {
782                         pte = dma_addr_level_pte(domain, tmp, level);
783                         if (pte) {
784                                 free_pgtable_page(
785                                         phys_to_virt(dma_pte_addr(pte)));
786                                 dma_clear_pte(pte);
787                                 domain_flush_cache(domain, pte, sizeof(*pte));
788                         }
789                         tmp += level_size(level);
790                 }
791                 level++;
792         }
793         /* free pgd */
794         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
795                 free_pgtable_page(domain->pgd);
796                 domain->pgd = NULL;
797         }
798 }
799
800 /* iommu handling */
801 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
802 {
803         struct root_entry *root;
804         unsigned long flags;
805
806         root = (struct root_entry *)alloc_pgtable_page();
807         if (!root)
808                 return -ENOMEM;
809
810         __iommu_flush_cache(iommu, root, ROOT_SIZE);
811
812         spin_lock_irqsave(&iommu->lock, flags);
813         iommu->root_entry = root;
814         spin_unlock_irqrestore(&iommu->lock, flags);
815
816         return 0;
817 }
818
819 static void iommu_set_root_entry(struct intel_iommu *iommu)
820 {
821         void *addr;
822         u32 sts;
823         unsigned long flag;
824
825         addr = iommu->root_entry;
826
827         spin_lock_irqsave(&iommu->register_lock, flag);
828         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
829
830         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
831
832         /* Make sure hardware completes it */
833         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
834                       readl, (sts & DMA_GSTS_RTPS), sts);
835
836         spin_unlock_irqrestore(&iommu->register_lock, flag);
837 }
838
839 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
840 {
841         u32 val;
842         unsigned long flag;
843
844         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
845                 return;
846
847         spin_lock_irqsave(&iommu->register_lock, flag);
848         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
849
850         /* Make sure hardware completes it */
851         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
852                       readl, (!(val & DMA_GSTS_WBFS)), val);
853
854         spin_unlock_irqrestore(&iommu->register_lock, flag);
855 }
856
857 /* invalidate context-cache entries; the granularity is selected by 'type' */
858 static void __iommu_flush_context(struct intel_iommu *iommu,
859                                   u16 did, u16 source_id, u8 function_mask,
860                                   u64 type)
861 {
862         u64 val = 0;
863         unsigned long flag;
864
865         switch (type) {
866         case DMA_CCMD_GLOBAL_INVL:
867                 val = DMA_CCMD_GLOBAL_INVL;
868                 break;
869         case DMA_CCMD_DOMAIN_INVL:
870                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
871                 break;
872         case DMA_CCMD_DEVICE_INVL:
873                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
874                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
875                 break;
876         default:
877                 BUG();
878         }
879         val |= DMA_CCMD_ICC;
880
881         spin_lock_irqsave(&iommu->register_lock, flag);
882         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
883
884         /* Make sure hardware completes it */
885         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
886                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
887
888         spin_unlock_irqrestore(&iommu->register_lock, flag);
889 }
890
891 /* invalidate IOTLB entries; the granularity is selected by 'type' */
892 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
893                                 u64 addr, unsigned int size_order, u64 type)
894 {
895         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
896         u64 val = 0, val_iva = 0;
897         unsigned long flag;
898
899         switch (type) {
900         case DMA_TLB_GLOBAL_FLUSH:
901                 /* a global flush doesn't need to set IVA_REG */
902                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
903                 break;
904         case DMA_TLB_DSI_FLUSH:
905                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
906                 break;
907         case DMA_TLB_PSI_FLUSH:
908                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
909                 /* Note: always flush non-leaf currently */
910                 val_iva = size_order | addr;
911                 break;
912         default:
913                 BUG();
914         }
915         /* Note: set drain read/write */
916 #if 0
917         /*
918          * This is probably only needed to be extra safe; it looks like
919          * we can omit it without any impact.
920          */
921         if (cap_read_drain(iommu->cap))
922                 val |= DMA_TLB_READ_DRAIN;
923 #endif
924         if (cap_write_drain(iommu->cap))
925                 val |= DMA_TLB_WRITE_DRAIN;
926
927         spin_lock_irqsave(&iommu->register_lock, flag);
928         /* Note: Only uses first TLB reg currently */
929         if (val_iva)
930                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
931         dmar_writeq(iommu->reg + tlb_offset + 8, val);
932
933         /* Make sure hardware completes it */
934         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
935                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
936
937         spin_unlock_irqrestore(&iommu->register_lock, flag);
938
939         /* check IOTLB invalidation granularity */
940         if (DMA_TLB_IAIG(val) == 0)
941                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
942         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
943                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
944                         (unsigned long long)DMA_TLB_IIRG(type),
945                         (unsigned long long)DMA_TLB_IAIG(val));
946 }
947
948 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
949                                   u64 addr, unsigned int pages)
950 {
951         unsigned int mask;
952
953         BUG_ON(addr & (~VTD_PAGE_MASK));
954         BUG_ON(pages == 0);
955
956         /* Fallback to domain selective flush if no PSI support */
957         if (!cap_pgsel_inv(iommu->cap))
958                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
959                                                 DMA_TLB_DSI_FLUSH);
960
961         /*
962          * PSI requires page size to be 2 ^ x, and the base address is naturally
963          * aligned to the size
964          */
965         mask = ilog2(__roundup_pow_of_two(pages));
966         /* Fallback to domain selective flush if size is too big */
967         if (mask > cap_max_amask_val(iommu->cap))
968                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
969                                                 DMA_TLB_DSI_FLUSH);
970
971         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
972                                         DMA_TLB_PSI_FLUSH);
973 }
974
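/*
 * Worked example, not part of the original file: a request to invalidate
 * 3 pages is rounded up to 4, so mask == ilog2(4) == 2 and the hardware
 * invalidates a naturally aligned 4-page (16KiB) region containing 'addr';
 * if mask exceeded cap_max_amask_val(), iommu_flush_iotlb_psi() would fall
 * back to a domain-selective flush instead.
 */
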
975 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
976 {
977         u32 pmen;
978         unsigned long flags;
979
980         spin_lock_irqsave(&iommu->register_lock, flags);
981         pmen = readl(iommu->reg + DMAR_PMEN_REG);
982         pmen &= ~DMA_PMEN_EPM;
983         writel(pmen, iommu->reg + DMAR_PMEN_REG);
984
985         /* wait for the protected region status bit to clear */
986         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
987                 readl, !(pmen & DMA_PMEN_PRS), pmen);
988
989         spin_unlock_irqrestore(&iommu->register_lock, flags);
990 }
991
992 static int iommu_enable_translation(struct intel_iommu *iommu)
993 {
994         u32 sts;
995         unsigned long flags;
996
997         spin_lock_irqsave(&iommu->register_lock, flags);
998         iommu->gcmd |= DMA_GCMD_TE;
999         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1000
1001         /* Make sure hardware completes it */
1002         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1003                       readl, (sts & DMA_GSTS_TES), sts);
1004
1005         spin_unlock_irqrestore(&iommu->register_lock, flags);
1006         return 0;
1007 }
1008
1009 static int iommu_disable_translation(struct intel_iommu *iommu)
1010 {
1011         u32 sts;
1012         unsigned long flag;
1013
1014         spin_lock_irqsave(&iommu->register_lock, flag);
1015         iommu->gcmd &= ~DMA_GCMD_TE;
1016         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1017
1018         /* Make sure hardware completes it */
1019         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1020                       readl, (!(sts & DMA_GSTS_TES)), sts);
1021
1022         spin_unlock_irqrestore(&iommu->register_lock, flag);
1023         return 0;
1024 }
1025
1026
1027 static int iommu_init_domains(struct intel_iommu *iommu)
1028 {
1029         unsigned long ndomains;
1030         unsigned long nlongs;
1031
1032         ndomains = cap_ndoms(iommu->cap);
1033         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1034         nlongs = BITS_TO_LONGS(ndomains);
1035
1036         /* TBD: there might be 64K domains,
1037          * consider a different allocation scheme for future chips
1038          */
1039         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1040         if (!iommu->domain_ids) {
1041                 printk(KERN_ERR "Allocating domain id array failed\n");
1042                 return -ENOMEM;
1043         }
1044         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1045                         GFP_KERNEL);
1046         if (!iommu->domains) {
1047                 printk(KERN_ERR "Allocating domain array failed\n");
1048                 kfree(iommu->domain_ids);
1049                 return -ENOMEM;
1050         }
1051
1052         spin_lock_init(&iommu->lock);
1053
1054         /*
1055          * If caching mode is set, then invalid translations are tagged
1056          * with domain id 0, hence we need to pre-allocate it.
1057          */
1058         if (cap_caching_mode(iommu->cap))
1059                 set_bit(0, iommu->domain_ids);
1060         return 0;
1061 }
1062
1063
1064 static void domain_exit(struct dmar_domain *domain);
1065 static void vm_domain_exit(struct dmar_domain *domain);
1066
1067 void free_dmar_iommu(struct intel_iommu *iommu)
1068 {
1069         struct dmar_domain *domain;
1070         int i;
1071         unsigned long flags;
1072
1073         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1074         for (; i < cap_ndoms(iommu->cap); ) {
1075                 domain = iommu->domains[i];
1076                 clear_bit(i, iommu->domain_ids);
1077
1078                 spin_lock_irqsave(&domain->iommu_lock, flags);
1079                 if (--domain->iommu_count == 0) {
1080                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1081                                 vm_domain_exit(domain);
1082                         else
1083                                 domain_exit(domain);
1084                 }
1085                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1086
1087                 i = find_next_bit(iommu->domain_ids,
1088                         cap_ndoms(iommu->cap), i+1);
1089         }
1090
1091         if (iommu->gcmd & DMA_GCMD_TE)
1092                 iommu_disable_translation(iommu);
1093
1094         if (iommu->irq) {
1095                 set_irq_data(iommu->irq, NULL);
1096                 /* This will mask the irq */
1097                 free_irq(iommu->irq, iommu);
1098                 destroy_irq(iommu->irq);
1099         }
1100
1101         kfree(iommu->domains);
1102         kfree(iommu->domain_ids);
1103
1104         g_iommus[iommu->seq_id] = NULL;
1105
1106         /* if all iommus are freed, free g_iommus */
1107         for (i = 0; i < g_num_of_iommus; i++) {
1108                 if (g_iommus[i])
1109                         break;
1110         }
1111
1112         if (i == g_num_of_iommus)
1113                 kfree(g_iommus);
1114
1115         /* free context mapping */
1116         free_context_table(iommu);
1117 }
1118
1119 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1120 {
1121         unsigned long num;
1122         unsigned long ndomains;
1123         struct dmar_domain *domain;
1124         unsigned long flags;
1125
1126         domain = alloc_domain_mem();
1127         if (!domain)
1128                 return NULL;
1129
1130         ndomains = cap_ndoms(iommu->cap);
1131
1132         spin_lock_irqsave(&iommu->lock, flags);
1133         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1134         if (num >= ndomains) {
1135                 spin_unlock_irqrestore(&iommu->lock, flags);
1136                 free_domain_mem(domain);
1137                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1138                 return NULL;
1139         }
1140
1141         set_bit(num, iommu->domain_ids);
1142         domain->id = num;
1143         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1144         set_bit(iommu->seq_id, &domain->iommu_bmp);
1145         domain->flags = 0;
1146         iommu->domains[num] = domain;
1147         spin_unlock_irqrestore(&iommu->lock, flags);
1148
1149         return domain;
1150 }
1151
1152 static void iommu_free_domain(struct dmar_domain *domain)
1153 {
1154         unsigned long flags;
1155         struct intel_iommu *iommu;
1156
1157         iommu = domain_get_iommu(domain);
1158
1159         spin_lock_irqsave(&iommu->lock, flags);
1160         clear_bit(domain->id, iommu->domain_ids);
1161         spin_unlock_irqrestore(&iommu->lock, flags);
1162 }
1163
1164 static struct iova_domain reserved_iova_list;
1165 static struct lock_class_key reserved_alloc_key;
1166 static struct lock_class_key reserved_rbtree_key;
1167
1168 static void dmar_init_reserved_ranges(void)
1169 {
1170         struct pci_dev *pdev = NULL;
1171         struct iova *iova;
1172         int i;
1173         u64 addr, size;
1174
1175         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1176
1177         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1178                 &reserved_alloc_key);
1179         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1180                 &reserved_rbtree_key);
1181
1182         /* IOAPIC ranges shouldn't be accessed by DMA */
1183         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1184                 IOVA_PFN(IOAPIC_RANGE_END));
1185         if (!iova)
1186                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1187
1188         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1189         for_each_pci_dev(pdev) {
1190                 struct resource *r;
1191
1192                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1193                         r = &pdev->resource[i];
1194                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1195                                 continue;
1196                         addr = r->start;
1197                         addr &= PAGE_MASK;
1198                         size = r->end - addr;
1199                         size = PAGE_ALIGN(size);
1200                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1201                                 IOVA_PFN(size + addr) - 1);
1202                         if (!iova)
1203                                 printk(KERN_ERR "Reserve iova failed\n");
1204                 }
1205         }
1206
1207 }
1208
1209 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1210 {
1211         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1212 }
1213
1214 static inline int guestwidth_to_adjustwidth(int gaw)
1215 {
1216         int agaw;
1217         int r = (gaw - 12) % 9;
1218
1219         if (r == 0)
1220                 agaw = gaw;
1221         else
1222                 agaw = gaw + 9 - r;
1223         if (agaw > 64)
1224                 agaw = 64;
1225         return agaw;
1226 }
1227
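/*
 * Worked example, not part of the original file: a 48-bit guest width
 * already fits the 9-bits-per-level layout ((48 - 12) % 9 == 0) and is
 * returned unchanged, while a 40-bit guest width is rounded up to
 * 40 + 9 - 1 == 48.
 */
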
1228 static int domain_init(struct dmar_domain *domain, int guest_width)
1229 {
1230         struct intel_iommu *iommu;
1231         int adjust_width, agaw;
1232         unsigned long sagaw;
1233
1234         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1235         spin_lock_init(&domain->mapping_lock);
1236         spin_lock_init(&domain->iommu_lock);
1237
1238         domain_reserve_special_ranges(domain);
1239
1240         /* calculate AGAW */
1241         iommu = domain_get_iommu(domain);
1242         if (guest_width > cap_mgaw(iommu->cap))
1243                 guest_width = cap_mgaw(iommu->cap);
1244         domain->gaw = guest_width;
1245         adjust_width = guestwidth_to_adjustwidth(guest_width);
1246         agaw = width_to_agaw(adjust_width);
1247         sagaw = cap_sagaw(iommu->cap);
1248         if (!test_bit(agaw, &sagaw)) {
1249                 /* hardware doesn't support it, choose a bigger one */
1250                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1251                 agaw = find_next_bit(&sagaw, 5, agaw);
1252                 if (agaw >= 5)
1253                         return -ENODEV;
1254         }
1255         domain->agaw = agaw;
1256         INIT_LIST_HEAD(&domain->devices);
1257
1258         if (ecap_coherent(iommu->ecap))
1259                 domain->iommu_coherency = 1;
1260         else
1261                 domain->iommu_coherency = 0;
1262
1263         if (ecap_sc_support(iommu->ecap))
1264                 domain->iommu_snooping = 1;
1265         else
1266                 domain->iommu_snooping = 0;
1267
1268         domain->iommu_count = 1;
1269
1270         /* always allocate the top pgd */
1271         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1272         if (!domain->pgd)
1273                 return -ENOMEM;
1274         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1275         return 0;
1276 }
1277
1278 static void domain_exit(struct dmar_domain *domain)
1279 {
1280         u64 end;
1281
1282         /* Domain 0 is reserved, so don't process it */
1283         if (!domain)
1284                 return;
1285
1286         domain_remove_dev_info(domain);
1287         /* destroy iovas */
1288         put_iova_domain(&domain->iovad);
1289         end = DOMAIN_MAX_ADDR(domain->gaw);
1290         end = end & (~PAGE_MASK);
1291
1292         /* clear ptes */
1293         dma_pte_clear_range(domain, 0, end);
1294
1295         /* free page tables */
1296         dma_pte_free_pagetable(domain, 0, end);
1297
1298         iommu_free_domain(domain);
1299         free_domain_mem(domain);
1300 }
1301
1302 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1303                                  u8 bus, u8 devfn, int translation)
1304 {
1305         struct context_entry *context;
1306         unsigned long flags;
1307         struct intel_iommu *iommu;
1308         struct dma_pte *pgd;
1309         unsigned long num;
1310         unsigned long ndomains;
1311         int id;
1312         int agaw;
1313
1314         pr_debug("Set context mapping for %02x:%02x.%d\n",
1315                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1316
1317         BUG_ON(!domain->pgd);
1318         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1319                translation != CONTEXT_TT_MULTI_LEVEL);
1320
1321         iommu = device_to_iommu(segment, bus, devfn);
1322         if (!iommu)
1323                 return -ENODEV;
1324
1325         context = device_to_context_entry(iommu, bus, devfn);
1326         if (!context)
1327                 return -ENOMEM;
1328         spin_lock_irqsave(&iommu->lock, flags);
1329         if (context_present(context)) {
1330                 spin_unlock_irqrestore(&iommu->lock, flags);
1331                 return 0;
1332         }
1333
1334         id = domain->id;
1335         pgd = domain->pgd;
1336
1337         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1338                 int found = 0;
1339
1340                 /* find an available domain id for this device in iommu */
1341                 ndomains = cap_ndoms(iommu->cap);
1342                 num = find_first_bit(iommu->domain_ids, ndomains);
1343                 for (; num < ndomains; ) {
1344                         if (iommu->domains[num] == domain) {
1345                                 id = num;
1346                                 found = 1;
1347                                 break;
1348                         }
1349                         num = find_next_bit(iommu->domain_ids,
1350                                             cap_ndoms(iommu->cap), num+1);
1351                 }
1352
1353                 if (found == 0) {
1354                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1355                         if (num >= ndomains) {
1356                                 spin_unlock_irqrestore(&iommu->lock, flags);
1357                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1358                                 return -EFAULT;
1359                         }
1360
1361                         set_bit(num, iommu->domain_ids);
1362                         iommu->domains[num] = domain;
1363                         id = num;
1364                 }
1365
1366                 /* Skip top levels of page tables for
1367                  * iommus which have a smaller agaw than the default.
1368                  */
1369                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1370                         pgd = phys_to_virt(dma_pte_addr(pgd));
1371                         if (!dma_pte_present(pgd)) {
1372                                 spin_unlock_irqrestore(&iommu->lock, flags);
1373                                 return -ENOMEM;
1374                         }
1375                 }
1376         }
1377
1378         context_set_domain_id(context, id);
1379
1380         /*
1381          * In pass through mode, AW must be programmed to indicate the largest
1382          * AGAW value supported by hardware, and ASR is ignored by hardware.
1383          */
1384         if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) {
1385                 context_set_address_width(context, iommu->agaw);
1386                 context_set_address_root(context, virt_to_phys(pgd));
1387         } else
1388                 context_set_address_width(context, iommu->msagaw);
1389
1390         context_set_translation_type(context, translation);
1391         context_set_fault_enable(context);
1392         context_set_present(context);
1393         domain_flush_cache(domain, context, sizeof(*context));
1394
1395         /*
1396          * It's a non-present to present mapping. If hardware doesn't cache
1397          * non-present entries we only need to flush the write-buffer. If it
1398          * _does_ cache non-present entries, then it does so in the special
1399          * domain #0, which we have to flush:
1400          */
1401         if (cap_caching_mode(iommu->cap)) {
1402                 iommu->flush.flush_context(iommu, 0,
1403                                            (((u16)bus) << 8) | devfn,
1404                                            DMA_CCMD_MASK_NOBIT,
1405                                            DMA_CCMD_DEVICE_INVL);
1406                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
1407         } else {
1408                 iommu_flush_write_buffer(iommu);
1409         }
1410         spin_unlock_irqrestore(&iommu->lock, flags);
1411
1412         spin_lock_irqsave(&domain->iommu_lock, flags);
1413         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1414                 domain->iommu_count++;
1415                 domain_update_iommu_cap(domain);
1416         }
1417         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1418         return 0;
1419 }
1420
1421 static int
1422 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1423                         int translation)
1424 {
1425         int ret;
1426         struct pci_dev *tmp, *parent;
1427
1428         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1429                                          pdev->bus->number, pdev->devfn,
1430                                          translation);
1431         if (ret)
1432                 return ret;
1433
1434         /* dependent device mapping */
1435         tmp = pci_find_upstream_pcie_bridge(pdev);
1436         if (!tmp)
1437                 return 0;
1438         /* Secondary interface's bus number and devfn 0 */
1439         parent = pdev->bus->self;
1440         while (parent != tmp) {
1441                 ret = domain_context_mapping_one(domain,
1442                                                  pci_domain_nr(parent->bus),
1443                                                  parent->bus->number,
1444                                                  parent->devfn, translation);
1445                 if (ret)
1446                         return ret;
1447                 parent = parent->bus->self;
1448         }
1449         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1450                 return domain_context_mapping_one(domain,
1451                                         pci_domain_nr(tmp->subordinate),
1452                                         tmp->subordinate->number, 0,
1453                                         translation);
1454         else /* this is a legacy PCI bridge */
1455                 return domain_context_mapping_one(domain,
1456                                                   pci_domain_nr(tmp->bus),
1457                                                   tmp->bus->number,
1458                                                   tmp->devfn,
1459                                                   translation);
1460 }
1461
1462 static int domain_context_mapped(struct pci_dev *pdev)
1463 {
1464         int ret;
1465         struct pci_dev *tmp, *parent;
1466         struct intel_iommu *iommu;
1467
1468         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1469                                 pdev->devfn);
1470         if (!iommu)
1471                 return -ENODEV;
1472
1473         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1474         if (!ret)
1475                 return ret;
1476         /* dependent device mapping */
1477         tmp = pci_find_upstream_pcie_bridge(pdev);
1478         if (!tmp)
1479                 return ret;
1480         /* Secondary interface's bus number and devfn 0 */
1481         parent = pdev->bus->self;
1482         while (parent != tmp) {
1483                 ret = device_context_mapped(iommu, parent->bus->number,
1484                                             parent->devfn);
1485                 if (!ret)
1486                         return ret;
1487                 parent = parent->bus->self;
1488         }
1489         if (tmp->is_pcie)
1490                 return device_context_mapped(iommu, tmp->subordinate->number,
1491                                              0);
1492         else
1493                 return device_context_mapped(iommu, tmp->bus->number,
1494                                              tmp->devfn);
1495 }
1496
1497 static int
1498 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1499                         u64 hpa, size_t size, int prot)
1500 {
1501         u64 start_pfn, end_pfn;
1502         struct dma_pte *pte;
1503         int index;
1504         int addr_width = agaw_to_width(domain->agaw);
1505
1506         hpa &= (((u64)1) << addr_width) - 1;
1507
1508         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1509                 return -EINVAL;
1510         iova &= PAGE_MASK;
1511         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1512         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1513         index = 0;
1514         while (start_pfn < end_pfn) {
1515                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1516                 if (!pte)
1517                         return -ENOMEM;
1518                 /* We don't need lock here, nobody else
1519                  * touches the iova range
1520                  */
1521                 BUG_ON(dma_pte_addr(pte));
1522                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1523                 dma_set_pte_prot(pte, prot);
1524                 if (prot & DMA_PTE_SNP)
1525                         dma_set_pte_snp(pte);
1526                 domain_flush_cache(domain, pte, sizeof(*pte));
1527                 start_pfn++;
1528                 index++;
1529         }
1530         return 0;
1531 }
1532
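/*
 * Illustrative sketch, not part of the original driver: mapping a single
 * page at a fixed IOVA with read/write permission via domain_page_mapping()
 * above.  The function name and its callers are hypothetical; a real caller
 * also allocates the IOVA first and flushes the IOTLB or write buffer
 * afterwards, as the DMA-map path in this driver does.
 */
static inline int example_map_one_page(struct dmar_domain *domain,
                                       dma_addr_t iova, u64 hpa)
{
        return domain_page_mapping(domain, iova, hpa, VTD_PAGE_SIZE,
                                   DMA_PTE_READ | DMA_PTE_WRITE);
}
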
1533 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1534 {
1535         if (!iommu)
1536                 return;
1537
1538         clear_context_table(iommu, bus, devfn);
1539         iommu->flush.flush_context(iommu, 0, 0, 0,
1540                                            DMA_CCMD_GLOBAL_INVL);
1541         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1542 }
1543
1544 static void domain_remove_dev_info(struct dmar_domain *domain)
1545 {
1546         struct device_domain_info *info;
1547         unsigned long flags;
1548         struct intel_iommu *iommu;
1549
1550         spin_lock_irqsave(&device_domain_lock, flags);
1551         while (!list_empty(&domain->devices)) {
1552                 info = list_entry(domain->devices.next,
1553                         struct device_domain_info, link);
1554                 list_del(&info->link);
1555                 list_del(&info->global);
1556                 if (info->dev)
1557                         info->dev->dev.archdata.iommu = NULL;
1558                 spin_unlock_irqrestore(&device_domain_lock, flags);
1559
1560                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1561                 iommu_detach_dev(iommu, info->bus, info->devfn);
1562                 free_devinfo_mem(info);
1563
1564                 spin_lock_irqsave(&device_domain_lock, flags);
1565         }
1566         spin_unlock_irqrestore(&device_domain_lock, flags);
1567 }
1568
1569 /*
1570  * find_domain
1571  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1572  */
1573 static struct dmar_domain *
1574 find_domain(struct pci_dev *pdev)
1575 {
1576         struct device_domain_info *info;
1577
1578         /* No lock here, assumes no domain exit in normal case */
1579         info = pdev->dev.archdata.iommu;
1580         if (info)
1581                 return info->domain;
1582         return NULL;
1583 }
1584
1585 /* Find an existing domain for the device, or allocate and initialize one. */
1586 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1587 {
1588         struct dmar_domain *domain, *found = NULL;
1589         struct intel_iommu *iommu;
1590         struct dmar_drhd_unit *drhd;
1591         struct device_domain_info *info, *tmp;
1592         struct pci_dev *dev_tmp;
1593         unsigned long flags;
1594         int bus = 0, devfn = 0;
1595         int segment;
1596
1597         domain = find_domain(pdev);
1598         if (domain)
1599                 return domain;
1600
1601         segment = pci_domain_nr(pdev->bus);
1602
1603         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1604         if (dev_tmp) {
1605                 if (dev_tmp->is_pcie) {
1606                         bus = dev_tmp->subordinate->number;
1607                         devfn = 0;
1608                 } else {
1609                         bus = dev_tmp->bus->number;
1610                         devfn = dev_tmp->devfn;
1611                 }
1612                 spin_lock_irqsave(&device_domain_lock, flags);
1613                 list_for_each_entry(info, &device_domain_list, global) {
1614                         if (info->segment == segment &&
1615                             info->bus == bus && info->devfn == devfn) {
1616                                 found = info->domain;
1617                                 break;
1618                         }
1619                 }
1620                 spin_unlock_irqrestore(&device_domain_lock, flags);
1621                 /* the pcie-pci bridge already has a domain; use it */
1622                 if (found) {
1623                         domain = found;
1624                         goto found_domain;
1625                 }
1626         }
1627
1628         /* Allocate new domain for the device */
1629         drhd = dmar_find_matched_drhd_unit(pdev);
1630         if (!drhd) {
1631                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1632                         pci_name(pdev));
1633                 return NULL;
1634         }
1635         iommu = drhd->iommu;
1636
1637         domain = iommu_alloc_domain(iommu);
1638         if (!domain)
1639                 goto error;
1640
1641         if (domain_init(domain, gaw)) {
1642                 domain_exit(domain);
1643                 goto error;
1644         }
1645
1646         /* register pcie-to-pci device */
1647         if (dev_tmp) {
1648                 info = alloc_devinfo_mem();
1649                 if (!info) {
1650                         domain_exit(domain);
1651                         goto error;
1652                 }
1653                 info->segment = segment;
1654                 info->bus = bus;
1655                 info->devfn = devfn;
1656                 info->dev = NULL;
1657                 info->domain = domain;
1658                 /* This domain is shared by devices under p2p bridge */
1659                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1660
1661                 /* the pcie-to-pci bridge may have been registered meanwhile; recheck */
1662                 found = NULL;
1663                 spin_lock_irqsave(&device_domain_lock, flags);
1664                 list_for_each_entry(tmp, &device_domain_list, global) {
1665                         if (tmp->segment == segment &&
1666                             tmp->bus == bus && tmp->devfn == devfn) {
1667                                 found = tmp->domain;
1668                                 break;
1669                         }
1670                 }
1671                 if (found) {
1672                         free_devinfo_mem(info);
1673                         domain_exit(domain);
1674                         domain = found;
1675                 } else {
1676                         list_add(&info->link, &domain->devices);
1677                         list_add(&info->global, &device_domain_list);
1678                 }
1679                 spin_unlock_irqrestore(&device_domain_lock, flags);
1680         }
1681
1682 found_domain:
1683         info = alloc_devinfo_mem();
1684         if (!info)
1685                 goto error;
1686         info->segment = segment;
1687         info->bus = pdev->bus->number;
1688         info->devfn = pdev->devfn;
1689         info->dev = pdev;
1690         info->domain = domain;
1691         spin_lock_irqsave(&device_domain_lock, flags);
1692         /* somebody else was faster and already set up the domain */
1693         found = find_domain(pdev);
1694         if (found != NULL) {
1695                 spin_unlock_irqrestore(&device_domain_lock, flags);
1696                 if (found != domain) {
1697                         domain_exit(domain);
1698                         domain = found;
1699                 }
1700                 free_devinfo_mem(info);
1701                 return domain;
1702         }
1703         list_add(&info->link, &domain->devices);
1704         list_add(&info->global, &device_domain_list);
1705         pdev->dev.archdata.iommu = info;
1706         spin_unlock_irqrestore(&device_domain_lock, flags);
1707         return domain;
1708 error:
1709         /* recheck here; another path may have set it up in the meantime */
1710         return find_domain(pdev);
1711 }
1712
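/*
 * Set up a 1:1 (identity) mapping of [start, end) for the device:
 * reserve the iova range, install the page-table entries and make sure
 * the device's context entry points at the domain.
 */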
1713 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1714                                       unsigned long long start,
1715                                       unsigned long long end)
1716 {
1717         struct dmar_domain *domain;
1718         unsigned long size;
1719         unsigned long long base;
1720         int ret;
1721
1722         printk(KERN_INFO
1723                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1724                 pci_name(pdev), start, end);
1725         /* page table init */
1726         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1727         if (!domain)
1728                 return -ENOMEM;
1729
1730         /* The address might not be aligned */
1731         base = start & PAGE_MASK;
1732         size = end - base;
1733         size = PAGE_ALIGN(size);
1734         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1735                         IOVA_PFN(base + size) - 1)) {
1736                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1737                 ret = -ENOMEM;
1738                 goto error;
1739         }
1740
1741         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1742                 size, base, pci_name(pdev));
1743         /*
1744          * The RMRR range might overlap a physical memory range;
1745          * clear it first.
1746          */
1747         dma_pte_clear_range(domain, base, base + size);
1748
1749         ret = domain_page_mapping(domain, base, base, size,
1750                 DMA_PTE_READ|DMA_PTE_WRITE);
1751         if (ret)
1752                 goto error;
1753
1754         /* context entry init */
1755         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
1756         if (!ret)
1757                 return 0;
1758 error:
1759         domain_exit(domain);
1760         return ret;
1761
1762 }
1763
1764 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1765         struct pci_dev *pdev)
1766 {
1767         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1768                 return 0;
1769         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1770                 rmrr->end_address + 1);
1771 }
1772
1773 #ifdef CONFIG_DMAR_GFX_WA
1774 struct iommu_prepare_data {
1775         struct pci_dev *pdev;
1776         int ret;
1777 };
1778
1779 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1780                                          unsigned long end_pfn, void *datax)
1781 {
1782         struct iommu_prepare_data *data;
1783
1784         data = (struct iommu_prepare_data *)datax;
1785
1786         data->ret = iommu_prepare_identity_map(data->pdev,
1787                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1788         return data->ret;
1789
1790 }
1791
1792 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1793 {
1794         int nid;
1795         struct iommu_prepare_data data;
1796
1797         data.pdev = pdev;
1798         data.ret = 0;
1799
1800         for_each_online_node(nid) {
1801                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1802                 if (data.ret)
1803                         return data.ret;
1804         }
1805         return data.ret;
1806 }
1807
1808 static void __init iommu_prepare_gfx_mapping(void)
1809 {
1810         struct pci_dev *pdev = NULL;
1811         int ret;
1812
1813         for_each_pci_dev(pdev) {
1814                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1815                                 !IS_GFX_DEVICE(pdev))
1816                         continue;
1817                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1818                         pci_name(pdev));
1819                 ret = iommu_prepare_with_active_regions(pdev);
1820                 if (ret)
1821                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1822         }
1823 }
1824 #else /* !CONFIG_DMAR_GFX_WA */
1825 static inline void iommu_prepare_gfx_mapping(void)
1826 {
1827         return;
1828 }
1829 #endif
1830
1831 #ifdef CONFIG_DMAR_FLOPPY_WA
1832 static inline void iommu_prepare_isa(void)
1833 {
1834         struct pci_dev *pdev;
1835         int ret;
1836
1837         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1838         if (!pdev)
1839                 return;
1840
1841         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1842         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1843
1844         if (ret)
1845                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1846                         "floppy might not work\n");
1847
1848 }
1849 #else
1850 static inline void iommu_prepare_isa(void)
1851 {
1852         return;
1853 }
1854 #endif /* CONFIG_DMAR_FLOPPY_WA */
1855
1856 /* Initialize each context entry as pass-through. */
1857 static int __init init_context_pass_through(void)
1858 {
1859         struct pci_dev *pdev = NULL;
1860         struct dmar_domain *domain;
1861         int ret;
1862
1863         for_each_pci_dev(pdev) {
1864                 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
                     /* get_domain_for_dev() returns NULL on allocation failure */
                     if (!domain)
                             return -ENOMEM;
1865                 ret = domain_context_mapping(domain, pdev,
1866                                              CONTEXT_TT_PASS_THROUGH);
1867                 if (ret)
1868                         return ret;
1869         }
1870         return 0;
1871 }
1872
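/*
 * One-time DMA-remapping bring-up: allocate the per-iommu root/context
 * tables and domains, pick queued vs. register-based invalidation, set
 * up RMRR/gfx/ISA identity maps (or pass-through), then enable fault
 * reporting and translation on every DRHD unit.
 */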
1873 static int __init init_dmars(void)
1874 {
1875         struct dmar_drhd_unit *drhd;
1876         struct dmar_rmrr_unit *rmrr;
1877         struct pci_dev *pdev;
1878         struct intel_iommu *iommu;
1879         int i, ret;
1880         int pass_through = 1;
1881
1882         /*
1883          * for each drhd
1884          *    allocate root
1885          *    initialize and program root entry to not present
1886          * endfor
1887          */
1888         for_each_drhd_unit(drhd) {
1889                 g_num_of_iommus++;
1890                 /*
1891                  * No lock needed: this is only incremented in the
1892                  * single-threaded kernel __init code path; all other
1893                  * accesses are read-only.
1894                  */
1895         }
1896
1897         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1898                         GFP_KERNEL);
1899         if (!g_iommus) {
1900                 printk(KERN_ERR "Allocating global iommu array failed\n");
1901                 ret = -ENOMEM;
1902                 goto error;
1903         }
1904
1905         deferred_flush = kzalloc(g_num_of_iommus *
1906                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1907         if (!deferred_flush) {
1908                 kfree(g_iommus);
1909                 ret = -ENOMEM;
1910                 goto error;
1911         }
1912
1913         for_each_drhd_unit(drhd) {
1914                 if (drhd->ignored)
1915                         continue;
1916
1917                 iommu = drhd->iommu;
1918                 g_iommus[iommu->seq_id] = iommu;
1919
1920                 ret = iommu_init_domains(iommu);
1921                 if (ret)
1922                         goto error;
1923
1924                 /*
1925                  * TBD:
1926                  * we could share the same root & context tables
1927                  * among all IOMMUs; this needs to be split out later.
1928                  */
1929                 ret = iommu_alloc_root_entry(iommu);
1930                 if (ret) {
1931                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1932                         goto error;
1933                 }
1934                 if (!ecap_pass_through(iommu->ecap))
1935                         pass_through = 0;
1936         }
1937         if (iommu_pass_through)
1938                 if (!pass_through) {
1939                         printk(KERN_INFO
1940                                "Pass Through is not supported by hardware.\n");
1941                         iommu_pass_through = 0;
1942                 }
1943
1944         /*
1945          * Start from a sane IOMMU hardware state.
1946          */
1947         for_each_drhd_unit(drhd) {
1948                 if (drhd->ignored)
1949                         continue;
1950
1951                 iommu = drhd->iommu;
1952
1953                 /*
1954                  * If queued invalidation was already initialized by us
1955                  * (for example, while enabling interrupt remapping), then
1956                  * things are already rolling from a sane state.
1957                  */
1958                 if (iommu->qi)
1959                         continue;
1960
1961                 /*
1962                  * Clear any previous faults.
1963                  */
1964                 dmar_fault(-1, iommu);
1965                 /*
1966                  * Disable queued invalidation if supported and already enabled
1967                  * before OS handover.
1968                  */
1969                 dmar_disable_qi(iommu);
1970         }
1971
1972         for_each_drhd_unit(drhd) {
1973                 if (drhd->ignored)
1974                         continue;
1975
1976                 iommu = drhd->iommu;
1977
1978                 if (dmar_enable_qi(iommu)) {
1979                         /*
1980                          * Queued Invalidate not enabled, use Register Based
1981                          * Invalidate
1982                          */
1983                         iommu->flush.flush_context = __iommu_flush_context;
1984                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1985                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1986                                "invalidation\n",
1987                                (unsigned long long)drhd->reg_base_addr);
1988                 } else {
1989                         iommu->flush.flush_context = qi_flush_context;
1990                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1991                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1992                                "invalidation\n",
1993                                (unsigned long long)drhd->reg_base_addr);
1994                 }
1995         }
1996
1997 #ifdef CONFIG_INTR_REMAP
1998         if (!intr_remapping_enabled) {
1999                 ret = enable_intr_remapping(0);
2000                 if (ret)
2001                         printk(KERN_ERR
2002                                "IOMMU: enable interrupt remapping failed\n");
2003         }
2004 #endif
2005         /*
2006          * If pass through is set and enabled, the context entries of all
2007          * PCI devices are initialized with the pass-through translation type.
2008          */
2009         if (iommu_pass_through) {
2010                 ret = init_context_pass_through();
2011                 if (ret) {
2012                         printk(KERN_ERR "IOMMU: Pass through init failed.\n");
2013                         iommu_pass_through = 0;
2014                 }
2015         }
2016
2017         /*
2018          * If pass through is not set or not enabled, set up context entries
2019          * with identity mappings for RMRR, gfx, and ISA devices.
2020          */
2021         if (!iommu_pass_through) {
2022                 /*
2023                  * For each rmrr
2024                  *   for each dev attached to rmrr
2025                  *   do
2026                  *     locate drhd for dev, alloc domain for dev
2027                  *     allocate free domain
2028                  *     allocate page table entries for rmrr
2029                  *     if context not allocated for bus
2030                  *           allocate and init context
2031                  *           set present in root table for this bus
2032                  *     init context with domain, translation etc
2033                  *    endfor
2034                  * endfor
2035                  */
2036                 for_each_rmrr_units(rmrr) {
2037                         for (i = 0; i < rmrr->devices_cnt; i++) {
2038                                 pdev = rmrr->devices[i];
2039                                 /*
2040                                  * some BIOSes list non-existent devices
2041                                  * in the DMAR table.
2042                                  */
2043                                 if (!pdev)
2044                                         continue;
2045                                 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2046                                 if (ret)
2047                                         printk(KERN_ERR
2048                                  "IOMMU: mapping reserved region failed\n");
2049                         }
2050                 }
2051
2052                 iommu_prepare_gfx_mapping();
2053
2054                 iommu_prepare_isa();
2055         }
2056
2057         /*
2058          * for each drhd
2059          *   enable fault log
2060          *   global invalidate context cache
2061          *   global invalidate iotlb
2062          *   enable translation
2063          */
2064         for_each_drhd_unit(drhd) {
2065                 if (drhd->ignored)
2066                         continue;
2067                 iommu = drhd->iommu;
2068
2069                 iommu_flush_write_buffer(iommu);
2070
2071                 ret = dmar_set_interrupt(iommu);
2072                 if (ret)
2073                         goto error;
2074
2075                 iommu_set_root_entry(iommu);
2076
2077                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2078                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2079                 iommu_disable_protect_mem_regions(iommu);
2080
2081                 ret = iommu_enable_translation(iommu);
2082                 if (ret)
2083                         goto error;
2084         }
2085
2086         return 0;
2087 error:
2088         for_each_drhd_unit(drhd) {
2089                 if (drhd->ignored)
2090                         continue;
2091                 iommu = drhd->iommu;
2092                 free_iommu(iommu);
2093         }
2094         kfree(g_iommus);
2095         return ret;
2096 }
2097
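/*
 * Round 'size' up so that [host_addr, host_addr + size) covers whole
 * pages, accounting for the offset of host_addr within its page.
 */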
2098 static inline u64 aligned_size(u64 host_addr, size_t size)
2099 {
2100         u64 addr;
2101         addr = (host_addr & (~PAGE_MASK)) + size;
2102         return PAGE_ALIGN(addr);
2103 }
2104
2105 struct iova *
2106 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2107 {
2108         struct iova *piova;
2109
2110         /* Make sure it's in range */
2111         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2112         if (!size || (IOVA_START_ADDR + size > end))
2113                 return NULL;
2114
2115         piova = alloc_iova(&domain->iovad,
2116                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2117         return piova;
2118 }
2119
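/*
 * Allocate an iova for the device.  For 64-bit capable devices (unless
 * forcedac is set) prefer the 32-bit range first and fall back to the
 * full DMA mask only if that range is exhausted.
 */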
2120 static struct iova *
2121 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2122                    size_t size, u64 dma_mask)
2123 {
2124         struct pci_dev *pdev = to_pci_dev(dev);
2125         struct iova *iova = NULL;
2126
2127         if (dma_mask <= DMA_BIT_MASK(32) || dmar_forcedac)
2128                 iova = iommu_alloc_iova(domain, size, dma_mask);
2129         else {
2130                 /*
2131                  * First try to allocate an io virtual address in
2132                  * DMA_BIT_MASK(32) and if that fails then try allocating
2133                  * from higher range
2134                  */
2135                 iova = iommu_alloc_iova(domain, size, DMA_BIT_MASK(32));
2136                 if (!iova)
2137                         iova = iommu_alloc_iova(domain, size, dma_mask);
2138         }
2139
2140         if (!iova) {
2141                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2142                 return NULL;
2143         }
2144
2145         return iova;
2146 }
2147
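/*
 * Return the device's domain, allocating one and installing the context
 * mapping on demand.
 */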
2148 static struct dmar_domain *
2149 get_valid_domain_for_dev(struct pci_dev *pdev)
2150 {
2151         struct dmar_domain *domain;
2152         int ret;
2153
2154         domain = get_domain_for_dev(pdev,
2155                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2156         if (!domain) {
2157                 printk(KERN_ERR
2158                         "Allocating domain for %s failed\n", pci_name(pdev));
2159                 return NULL;
2160         }
2161
2162         /* make sure context mapping is ok */
2163         if (unlikely(!domain_context_mapped(pdev))) {
2164                 ret = domain_context_mapping(domain, pdev,
2165                                              CONTEXT_TT_MULTI_LEVEL);
2166                 if (ret) {
2167                         printk(KERN_ERR
2168                                 "Domain context map for %s failed\n",
2169                                 pci_name(pdev));
2170                         return NULL;
2171                 }
2172         }
2173
2174         return domain;
2175 }
2176
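/*
 * Core DMA-map path: find (or create) the device's domain, allocate an
 * iova covering the buffer, install the page-table entries and return
 * the bus address to hand to the device.  Returns 0 on failure.
 */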
2177 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2178                                      size_t size, int dir, u64 dma_mask)
2179 {
2180         struct pci_dev *pdev = to_pci_dev(hwdev);
2181         struct dmar_domain *domain;
2182         phys_addr_t start_paddr;
2183         struct iova *iova;
2184         int prot = 0;
2185         int ret;
2186         struct intel_iommu *iommu;
2187
2188         BUG_ON(dir == DMA_NONE);
2189         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2190                 return paddr;
2191
2192         domain = get_valid_domain_for_dev(pdev);
2193         if (!domain)
2194                 return 0;
2195
2196         iommu = domain_get_iommu(domain);
2197         size = aligned_size((u64)paddr, size);
2198
2199         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2200         if (!iova)
2201                 goto error;
2202
2203         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2204
2205         /*
2206          * Check if DMAR supports zero-length reads on write-only
2207          * mappings.
2208          */
2209         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2210                         !cap_zlr(iommu->cap))
2211                 prot |= DMA_PTE_READ;
2212         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2213                 prot |= DMA_PTE_WRITE;
2214         /*
2215          * paddr - (paddr + size) might span a partial page, so map the
2216          * whole page.  Note: if two parts of one page are mapped separately,
2217          * we might end up with two guest addresses mapping to the same host
2218          * paddr, but that is not a big problem.
2219          */
2220         ret = domain_page_mapping(domain, start_paddr,
2221                 ((u64)paddr) & PAGE_MASK, size, prot);
2222         if (ret)
2223                 goto error;
2224
2225         /* it's a non-present-to-present mapping; only flush if caching mode is set */
2226         if (cap_caching_mode(iommu->cap))
2227                 iommu_flush_iotlb_psi(iommu, 0, start_paddr,
2228                                       size >> VTD_PAGE_SHIFT);
2229         else
2230                 iommu_flush_write_buffer(iommu);
2231
2232         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2233
2234 error:
2235         if (iova)
2236                 __free_iova(&domain->iovad, iova);
2237         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2238                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2239         return 0;
2240 }
2241
2242 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2243                                  unsigned long offset, size_t size,
2244                                  enum dma_data_direction dir,
2245                                  struct dma_attrs *attrs)
2246 {
2247         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2248                                   dir, to_pci_dev(dev)->dma_mask);
2249 }
2250
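/*
 * Flush the IOTLB of every iommu that has deferred unmaps pending and
 * then release the queued iovas.  Called with async_umap_flush_lock held.
 */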
2251 static void flush_unmaps(void)
2252 {
2253         int i, j;
2254
2255         timer_on = 0;
2256
2257         /* just flush them all */
2258         for (i = 0; i < g_num_of_iommus; i++) {
2259                 struct intel_iommu *iommu = g_iommus[i];
2260                 if (!iommu)
2261                         continue;
2262
2263                 if (deferred_flush[i].next) {
2264                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2265                                                  DMA_TLB_GLOBAL_FLUSH);
2266                         for (j = 0; j < deferred_flush[i].next; j++) {
2267                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2268                                                 deferred_flush[i].iova[j]);
2269                         }
2270                         deferred_flush[i].next = 0;
2271                 }
2272         }
2273
2274         list_size = 0;
2275 }
2276
2277 static void flush_unmaps_timeout(unsigned long data)
2278 {
2279         unsigned long flags;
2280
2281         spin_lock_irqsave(&async_umap_flush_lock, flags);
2282         flush_unmaps();
2283         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2284 }
2285
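/*
 * Queue an iova for deferred freeing instead of flushing the IOTLB on
 * every unmap; the batch is flushed from a timer or once HIGH_WATER_MARK
 * entries have accumulated.
 */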
2286 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2287 {
2288         unsigned long flags;
2289         int next, iommu_id;
2290         struct intel_iommu *iommu;
2291
2292         spin_lock_irqsave(&async_umap_flush_lock, flags);
2293         if (list_size == HIGH_WATER_MARK)
2294                 flush_unmaps();
2295
2296         iommu = domain_get_iommu(dom);
2297         iommu_id = iommu->seq_id;
2298
2299         next = deferred_flush[iommu_id].next;
2300         deferred_flush[iommu_id].domain[next] = dom;
2301         deferred_flush[iommu_id].iova[next] = iova;
2302         deferred_flush[iommu_id].next++;
2303
2304         if (!timer_on) {
2305                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2306                 timer_on = 1;
2307         }
2308         list_size++;
2309         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2310 }
2311
2312 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2313                              size_t size, enum dma_data_direction dir,
2314                              struct dma_attrs *attrs)
2315 {
2316         struct pci_dev *pdev = to_pci_dev(dev);
2317         struct dmar_domain *domain;
2318         unsigned long start_addr;
2319         struct iova *iova;
2320         struct intel_iommu *iommu;
2321
2322         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2323                 return;
2324         domain = find_domain(pdev);
2325         BUG_ON(!domain);
2326
2327         iommu = domain_get_iommu(domain);
2328
2329         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2330         if (!iova)
2331                 return;
2332
2333         start_addr = iova->pfn_lo << PAGE_SHIFT;
2334         size = aligned_size((u64)dev_addr, size);
2335
2336         pr_debug("Device %s unmapping: %zx@%llx\n",
2337                 pci_name(pdev), size, (unsigned long long)start_addr);
2338
2339         /*  clear the whole page */
2340         dma_pte_clear_range(domain, start_addr, start_addr + size);
2341         /* free page tables */
2342         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2343         if (intel_iommu_strict) {
2344                 iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2345                                       size >> VTD_PAGE_SHIFT);
2346                 /* free iova */
2347                 __free_iova(&domain->iovad, iova);
2348         } else {
2349                 add_unmap(domain, iova);
2350                 /*
2351                  * queue up the release of the unmap to save roughly 1/6th of
2352                  * the CPU time otherwise spent on the iotlb flush operation...
2353                  */
2354         }
2355 }
2356
2357 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2358                                int dir)
2359 {
2360         intel_unmap_page(dev, dev_addr, size, dir, NULL);
2361 }
2362
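/*
 * Allocate zeroed pages and map them bidirectionally under the device's
 * coherent DMA mask.
 */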
2363 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2364                                   dma_addr_t *dma_handle, gfp_t flags)
2365 {
2366         void *vaddr;
2367         int order;
2368
2369         size = PAGE_ALIGN(size);
2370         order = get_order(size);
2371         flags &= ~(GFP_DMA | GFP_DMA32);
2372
2373         vaddr = (void *)__get_free_pages(flags, order);
2374         if (!vaddr)
2375                 return NULL;
2376         memset(vaddr, 0, size);
2377
2378         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2379                                          DMA_BIDIRECTIONAL,
2380                                          hwdev->coherent_dma_mask);
2381         if (*dma_handle)
2382                 return vaddr;
2383         free_pages((unsigned long)vaddr, order);
2384         return NULL;
2385 }
2386
2387 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2388                                 dma_addr_t dma_handle)
2389 {
2390         int order;
2391
2392         size = PAGE_ALIGN(size);
2393         order = get_order(size);
2394
2395         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2396         free_pages((unsigned long)vaddr, order);
2397 }
2398
2399 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2400                            int nelems, enum dma_data_direction dir,
2401                            struct dma_attrs *attrs)
2402 {
2403         int i;
2404         struct pci_dev *pdev = to_pci_dev(hwdev);
2405         struct dmar_domain *domain;
2406         unsigned long start_addr;
2407         struct iova *iova;
2408         size_t size = 0;
2409         phys_addr_t addr;
2410         struct scatterlist *sg;
2411         struct intel_iommu *iommu;
2412
2413         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2414                 return;
2415
2416         domain = find_domain(pdev);
2417         BUG_ON(!domain);
2418
2419         iommu = domain_get_iommu(domain);
2420
2421         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2422         if (!iova)
2423                 return;
2424         for_each_sg(sglist, sg, nelems, i) {
2425                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2426                 size += aligned_size((u64)addr, sg->length);
2427         }
2428
2429         start_addr = iova->pfn_lo << PAGE_SHIFT;
2430
2431         /*  clear the whole page */
2432         dma_pte_clear_range(domain, start_addr, start_addr + size);
2433         /* free page tables */
2434         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2435
2436         iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2437                               size >> VTD_PAGE_SHIFT);
2438
2439         /* free iova */
2440         __free_iova(&domain->iovad, iova);
2441 }
2442
2443 static int intel_nontranslate_map_sg(struct device *hddev,
2444         struct scatterlist *sglist, int nelems, int dir)
2445 {
2446         int i;
2447         struct scatterlist *sg;
2448
2449         for_each_sg(sglist, sg, nelems, i) {
2450                 BUG_ON(!sg_page(sg));
2451                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2452                 sg->dma_length = sg->length;
2453         }
2454         return nelems;
2455 }
2456
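/*
 * Map a scatterlist: allocate one iova region large enough for all
 * segments, then map each segment contiguously within it and fill in
 * the per-segment dma_address/dma_length.
 */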
2457 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2458                         enum dma_data_direction dir, struct dma_attrs *attrs)
2459 {
2460         phys_addr_t addr;
2461         int i;
2462         struct pci_dev *pdev = to_pci_dev(hwdev);
2463         struct dmar_domain *domain;
2464         size_t size = 0;
2465         int prot = 0;
2466         size_t offset = 0;
2467         struct iova *iova = NULL;
2468         int ret;
2469         struct scatterlist *sg;
2470         unsigned long start_addr;
2471         struct intel_iommu *iommu;
2472
2473         BUG_ON(dir == DMA_NONE);
2474         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2475                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2476
2477         domain = get_valid_domain_for_dev(pdev);
2478         if (!domain)
2479                 return 0;
2480
2481         iommu = domain_get_iommu(domain);
2482
2483         for_each_sg(sglist, sg, nelems, i) {
2484                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2485                 size += aligned_size((u64)addr, sg->length);
2486         }
2487
2488         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2489         if (!iova) {
2490                 sglist->dma_length = 0;
2491                 return 0;
2492         }
2493
2494         /*
2495          * Check if DMAR supports zero-length reads on write-only
2496          * mappings.
2497          */
2498         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2499                         !cap_zlr(iommu->cap))
2500                 prot |= DMA_PTE_READ;
2501         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2502                 prot |= DMA_PTE_WRITE;
2503
2504         start_addr = iova->pfn_lo << PAGE_SHIFT;
2505         offset = 0;
2506         for_each_sg(sglist, sg, nelems, i) {
2507                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2508                 size = aligned_size((u64)addr, sg->length);
2509                 ret = domain_page_mapping(domain, start_addr + offset,
2510                         ((u64)addr) & PAGE_MASK,
2511                         size, prot);
2512                 if (ret) {
2513                         /*  clear the page */
2514                         dma_pte_clear_range(domain, start_addr,
2515                                   start_addr + offset);
2516                         /* free page tables */
2517                         dma_pte_free_pagetable(domain, start_addr,
2518                                   start_addr + offset);
2519                         /* free iova */
2520                         __free_iova(&domain->iovad, iova);
2521                         return 0;
2522                 }
2523                 sg->dma_address = start_addr + offset +
2524                                 ((u64)addr & (~PAGE_MASK));
2525                 sg->dma_length = sg->length;
2526                 offset += size;
2527         }
2528
2529         /* it's a non-present-to-present mapping; only flush if caching mode is set */
2530         if (cap_caching_mode(iommu->cap))
2531                 iommu_flush_iotlb_psi(iommu, 0, start_addr,
2532                                       offset >> VTD_PAGE_SHIFT);
2533         else
2534                 iommu_flush_write_buffer(iommu);
2535
2536         return nelems;
2537 }
2538
2539 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2540 {
2541         return !dma_addr;
2542 }
2543
2544 struct dma_map_ops intel_dma_ops = {
2545         .alloc_coherent = intel_alloc_coherent,
2546         .free_coherent = intel_free_coherent,
2547         .map_sg = intel_map_sg,
2548         .unmap_sg = intel_unmap_sg,
2549         .map_page = intel_map_page,
2550         .unmap_page = intel_unmap_page,
2551         .mapping_error = intel_mapping_error,
2552 };
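
/*
 * Illustrative only (not part of this driver): with intel_dma_ops installed
 * as dma_ops, an ordinary PCI driver reaches the paths above through the
 * generic DMA API, for example:
 *
 *	dma_addr_t handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *	...
 *	dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
 *
 * where pdev, buf and len are the caller's own device and buffer.
 */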
2553
2554 static inline int iommu_domain_cache_init(void)
2555 {
2556         int ret = 0;
2557
2558         iommu_domain_cache = kmem_cache_create("iommu_domain",
2559                                          sizeof(struct dmar_domain),
2560                                          0,
2561                                          SLAB_HWCACHE_ALIGN,
2562                                          NULL);
2564         if (!iommu_domain_cache) {
2565                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2566                 ret = -ENOMEM;
2567         }
2568
2569         return ret;
2570 }
2571
2572 static inline int iommu_devinfo_cache_init(void)
2573 {
2574         int ret = 0;
2575
2576         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2577                                          sizeof(struct device_domain_info),
2578                                          0,
2579                                          SLAB_HWCACHE_ALIGN,
2580                                          NULL);
2581         if (!iommu_devinfo_cache) {
2582                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2583                 ret = -ENOMEM;
2584         }
2585
2586         return ret;
2587 }
2588
2589 static inline int iommu_iova_cache_init(void)
2590 {
2591         int ret = 0;
2592
2593         iommu_iova_cache = kmem_cache_create("iommu_iova",
2594                                          sizeof(struct iova),
2595                                          0,
2596                                          SLAB_HWCACHE_ALIGN,
2597                                          NULL);
2598         if (!iommu_iova_cache) {
2599                 printk(KERN_ERR "Couldn't create iova cache\n");
2600                 ret = -ENOMEM;
2601         }
2602
2603         return ret;
2604 }
2605
2606 static int __init iommu_init_mempool(void)
2607 {
2608         int ret;
2609         ret = iommu_iova_cache_init();
2610         if (ret)
2611                 return ret;
2612
2613         ret = iommu_domain_cache_init();
2614         if (ret)
2615                 goto domain_error;
2616
2617         ret = iommu_devinfo_cache_init();
2618         if (!ret)
2619                 return ret;
2620
2621         kmem_cache_destroy(iommu_domain_cache);
2622 domain_error:
2623         kmem_cache_destroy(iommu_iova_cache);
2624
2625         return -ENOMEM;
2626 }
2627
2628 static void __init iommu_exit_mempool(void)
2629 {
2630         kmem_cache_destroy(iommu_devinfo_cache);
2631         kmem_cache_destroy(iommu_domain_cache);
2632         kmem_cache_destroy(iommu_iova_cache);
2633
2634 }
2635
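/*
 * Mark DRHD units that cover no PCI devices as ignored, and (when gfx
 * mapping is disabled) bypass units that only cover graphics devices by
 * tagging their devices with DUMMY_DEVICE_DOMAIN_INFO.
 */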
2636 static void __init init_no_remapping_devices(void)
2637 {
2638         struct dmar_drhd_unit *drhd;
2639
2640         for_each_drhd_unit(drhd) {
2641                 if (!drhd->include_all) {
2642                         int i;
2643                         for (i = 0; i < drhd->devices_cnt; i++)
2644                                 if (drhd->devices[i] != NULL)
2645                                         break;
2646                         /* ignore the DMAR unit if no PCI devices exist */
2647                         if (i == drhd->devices_cnt)
2648                                 drhd->ignored = 1;
2649                 }
2650         }
2651
2652         if (dmar_map_gfx)
2653                 return;
2654
2655         for_each_drhd_unit(drhd) {
2656                 int i;
2657                 if (drhd->ignored || drhd->include_all)
2658                         continue;
2659
2660                 for (i = 0; i < drhd->devices_cnt; i++)
2661                         if (drhd->devices[i] &&
2662                                 !IS_GFX_DEVICE(drhd->devices[i]))
2663                                 break;
2664
2665                 if (i < drhd->devices_cnt)
2666                         continue;
2667
2668                 /* bypass IOMMU if it is just for gfx devices */
2669                 drhd->ignored = 1;
2670                 for (i = 0; i < drhd->devices_cnt; i++) {
2671                         if (!drhd->devices[i])
2672                                 continue;
2673                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2674                 }
2675         }
2676 }
2677
2678 #ifdef CONFIG_SUSPEND
2679 static int init_iommu_hw(void)
2680 {
2681         struct dmar_drhd_unit *drhd;
2682         struct intel_iommu *iommu = NULL;
2683
2684         for_each_active_iommu(iommu, drhd)
2685                 if (iommu->qi)
2686                         dmar_reenable_qi(iommu);
2687
2688         for_each_active_iommu(iommu, drhd) {
2689                 iommu_flush_write_buffer(iommu);
2690
2691                 iommu_set_root_entry(iommu);
2692
2693                 iommu->flush.flush_context(iommu, 0, 0, 0,
2694                                            DMA_CCMD_GLOBAL_INVL);
2695                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2696                                          DMA_TLB_GLOBAL_FLUSH);
2697                 iommu_disable_protect_mem_regions(iommu);
2698                 iommu_enable_translation(iommu);
2699         }
2700
2701         return 0;
2702 }
2703
2704 static void iommu_flush_all(void)
2705 {
2706         struct dmar_drhd_unit *drhd;
2707         struct intel_iommu *iommu;
2708
2709         for_each_active_iommu(iommu, drhd) {
2710                 iommu->flush.flush_context(iommu, 0, 0, 0,
2711                                            DMA_CCMD_GLOBAL_INVL);
2712                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2713                                          DMA_TLB_GLOBAL_FLUSH);
2714         }
2715 }
2716
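/*
 * Flush everything, disable translation and save the fault-event
 * registers of every active iommu; iommu_resume() restores them and
 * re-enables translation via init_iommu_hw().
 */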
2717 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
2718 {
2719         struct dmar_drhd_unit *drhd;
2720         struct intel_iommu *iommu = NULL;
2721         unsigned long flag;
2722
2723         for_each_active_iommu(iommu, drhd) {
2724                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
2725                                                  GFP_ATOMIC);
2726                 if (!iommu->iommu_state)
2727                         goto nomem;
2728         }
2729
2730         iommu_flush_all();
2731
2732         for_each_active_iommu(iommu, drhd) {
2733                 iommu_disable_translation(iommu);
2734
2735                 spin_lock_irqsave(&iommu->register_lock, flag);
2736
2737                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2738                         readl(iommu->reg + DMAR_FECTL_REG);
2739                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2740                         readl(iommu->reg + DMAR_FEDATA_REG);
2741                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2742                         readl(iommu->reg + DMAR_FEADDR_REG);
2743                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2744                         readl(iommu->reg + DMAR_FEUADDR_REG);
2745
2746                 spin_unlock_irqrestore(&iommu->register_lock, flag);
2747         }
2748         return 0;
2749
2750 nomem:
2751         for_each_active_iommu(iommu, drhd)
2752                 kfree(iommu->iommu_state);
2753
2754         return -ENOMEM;
2755 }
2756
2757 static int iommu_resume(struct sys_device *dev)
2758 {
2759         struct dmar_drhd_unit *drhd;
2760         struct intel_iommu *iommu = NULL;
2761         unsigned long flag;
2762
2763         if (init_iommu_hw()) {
2764                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2765                 return -EIO;
2766         }
2767
2768         for_each_active_iommu(iommu, drhd) {
2769
2770                 spin_lock_irqsave(&iommu->register_lock, flag);
2771
2772                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2773                         iommu->reg + DMAR_FECTL_REG);
2774                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2775                         iommu->reg + DMAR_FEDATA_REG);
2776                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2777                         iommu->reg + DMAR_FEADDR_REG);
2778                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2779                         iommu->reg + DMAR_FEUADDR_REG);
2780
2781                 spin_unlock_irqrestore(&iommu->register_lock, flag);
2782         }
2783
2784         for_each_active_iommu(iommu, drhd)
2785                 kfree(iommu->iommu_state);
2786
2787         return 0;
2788 }
2789
2790 static struct sysdev_class iommu_sysclass = {
2791         .name           = "iommu",
2792         .resume         = iommu_resume,
2793         .suspend        = iommu_suspend,
2794 };
2795
2796 static struct sys_device device_iommu = {
2797         .cls    = &iommu_sysclass,
2798 };
2799
2800 static int __init init_iommu_sysfs(void)
2801 {
2802         int error;
2803
2804         error = sysdev_class_register(&iommu_sysclass);
2805         if (error)
2806                 return error;
2807
2808         error = sysdev_register(&device_iommu);
2809         if (error)
2810                 sysdev_class_unregister(&iommu_sysclass);
2811
2812         return error;
2813 }
2814
2815 #else
2816 static int __init init_iommu_sysfs(void)
2817 {
2818         return 0;
2819 }
2820 #endif /* CONFIG_SUSPEND */
2821
2822 int __init intel_iommu_init(void)
2823 {
2824         int ret = 0;
2825
2826         if (dmar_table_init())
2827                 return  -ENODEV;
2828
2829         if (dmar_dev_scope_init())
2830                 return  -ENODEV;
2831
2832         /*
2833          * Check the need for DMA-remapping initialization now.
2834          * Above initialization will also be used by Interrupt-remapping.
2835          */
2836         if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled)
2837                 return -ENODEV;
2838
2839         iommu_init_mempool();
2840         dmar_init_reserved_ranges();
2841
2842         init_no_remapping_devices();
2843
2844         ret = init_dmars();
2845         if (ret) {
2846                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2847                 put_iova_domain(&reserved_iova_list);
2848                 iommu_exit_mempool();
2849                 return ret;
2850         }
2851         printk(KERN_INFO
2852         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2853
2854         init_timer(&unmap_timer);
2855         force_iommu = 1;
2856
2857         if (!iommu_pass_through) {
2858                 printk(KERN_INFO
2859                        "Multi-level page-table translation for DMAR.\n");
2860                 dma_ops = &intel_dma_ops;
2861         } else
2862                 printk(KERN_INFO
2863                        "DMAR: Pass through translation for DMAR.\n");
2864
2865         init_iommu_sysfs();
2866
2867         register_iommu(&intel_iommu_ops);
2868
2869         return 0;
2870 }
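
/*
 * Illustrative only: once intel_iommu_ops is registered above, a client
 * such as KVM device assignment drives it through the generic IOMMU API,
 * roughly:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map_range(dom, iova, paddr, size, IOMMU_READ | IOMMU_WRITE);
 *
 * where pdev, iova, paddr and size are supplied by the caller.
 */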
2871
2872 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2873                                   struct pci_dev *pdev)
2874 {
2875         struct device_domain_info *info;
2876         unsigned long flags;
2877
2878         info = alloc_devinfo_mem();
2879         if (!info)
2880                 return -ENOMEM;
2881
2882         info->segment = pci_domain_nr(pdev->bus);
2883         info->bus = pdev->bus->number;
2884         info->devfn = pdev->devfn;
2885         info->dev = pdev;
2886         info->domain = domain;
2887
2888         spin_lock_irqsave(&device_domain_lock, flags);
2889         list_add(&info->link, &domain->devices);
2890         list_add(&info->global, &device_domain_list);
2891         pdev->dev.archdata.iommu = info;
2892         spin_unlock_irqrestore(&device_domain_lock, flags);
2893
2894         return 0;
2895 }
2896
2897 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
2898                                            struct pci_dev *pdev)
2899 {
2900         struct pci_dev *tmp, *parent;
2901
2902         if (!iommu || !pdev)
2903                 return;
2904
2905         /* dependent device detach */
2906         tmp = pci_find_upstream_pcie_bridge(pdev);
2907         /* a PCIe-to-PCI bridge is addressed by its secondary bus number and devfn 0 */
2908         if (tmp) {
2909                 parent = pdev->bus->self;
2910                 while (parent != tmp) {
2911                         iommu_detach_dev(iommu, parent->bus->number,
2912                                          parent->devfn);
2913                         parent = parent->bus->self;
2914                 }
2915                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
2916                         iommu_detach_dev(iommu,
2917                                 tmp->subordinate->number, 0);
2918                 else /* this is a legacy PCI bridge */
2919                         iommu_detach_dev(iommu, tmp->bus->number,
2920                                          tmp->devfn);
2921         }
2922 }
2923
2924 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2925                                           struct pci_dev *pdev)
2926 {
2927         struct device_domain_info *info;
2928         struct intel_iommu *iommu;
2929         unsigned long flags;
2930         int found = 0;
2931         struct list_head *entry, *tmp;
2932
2933         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
2934                                 pdev->devfn);
2935         if (!iommu)
2936                 return;
2937
2938         spin_lock_irqsave(&device_domain_lock, flags);
2939         list_for_each_safe(entry, tmp, &domain->devices) {
2940                 info = list_entry(entry, struct device_domain_info, link);
2941                 /* No need to compare PCI domain; it has to be the same */
2942                 if (info->bus == pdev->bus->number &&
2943                     info->devfn == pdev->devfn) {
2944                         list_del(&info->link);
2945                         list_del(&info->global);
2946                         if (info->dev)
2947                                 info->dev->dev.archdata.iommu = NULL;
2948                         spin_unlock_irqrestore(&device_domain_lock, flags);
2949
2950                         iommu_detach_dev(iommu, info->bus, info->devfn);
2951                         iommu_detach_dependent_devices(iommu, pdev);
2952                         free_devinfo_mem(info);
2953
2954                         spin_lock_irqsave(&device_domain_lock, flags);
2955
2956                         if (found)
2957                                 break;
2958                         else
2959                                 continue;
2960                 }
2961
2962                 /* if there are no other devices under the same iommu
2963                  * owned by this domain, clear this iommu from iommu_bmp and
2964                  * update the iommu count and coherency
2965                  */
2966                 if (iommu == device_to_iommu(info->segment, info->bus,
2967                                             info->devfn))
2968                         found = 1;
2969         }
2970
2971         if (found == 0) {
2972                 unsigned long tmp_flags;
2973                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2974                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2975                 domain->iommu_count--;
2976                 domain_update_iommu_cap(domain);
2977                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2978         }
2979
2980         spin_unlock_irqrestore(&device_domain_lock, flags);
2981 }
2982
2983 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2984 {
2985         struct device_domain_info *info;
2986         struct intel_iommu *iommu;
2987         unsigned long flags1, flags2;
2988
2989         spin_lock_irqsave(&device_domain_lock, flags1);
2990         while (!list_empty(&domain->devices)) {
2991                 info = list_entry(domain->devices.next,
2992                         struct device_domain_info, link);
2993                 list_del(&info->link);
2994                 list_del(&info->global);
2995                 if (info->dev)
2996                         info->dev->dev.archdata.iommu = NULL;
2997
2998                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2999
3000                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3001                 iommu_detach_dev(iommu, info->bus, info->devfn);
3002                 iommu_detach_dependent_devices(iommu, info->dev);
3003
3004                 /* clear this iommu in iommu_bmp, update iommu count
3005                  * and capabilities
3006                  */
3007                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3008                 if (test_and_clear_bit(iommu->seq_id,
3009                                        &domain->iommu_bmp)) {
3010                         domain->iommu_count--;
3011                         domain_update_iommu_cap(domain);
3012                 }
3013                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3014
3015                 free_devinfo_mem(info);
3016                 spin_lock_irqsave(&device_domain_lock, flags1);
3017         }
3018         spin_unlock_irqrestore(&device_domain_lock, flags1);
3019 }
3020
3021 /* domain id for virtual machines; it is never written into a context entry */
3022 static unsigned long vm_domid;
3023
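/*
 * Return the smallest AGAW supported by any iommu this domain currently
 * spans, so that mappings stay within what every unit can translate.
 */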
3024 static int vm_domain_min_agaw(struct dmar_domain *domain)
3025 {
3026         int i;
3027         int min_agaw = domain->agaw;
3028
3029         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
3030         for (; i < g_num_of_iommus; ) {
3031                 if (min_agaw > g_iommus[i]->agaw)
3032                         min_agaw = g_iommus[i]->agaw;
3033
3034                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
3035         }
3036
3037         return min_agaw;
3038 }
3039
3040 static struct dmar_domain *iommu_alloc_vm_domain(void)
3041 {
3042         struct dmar_domain *domain;
3043
3044         domain = alloc_domain_mem();
3045         if (!domain)
3046                 return NULL;
3047
3048         domain->id = vm_domid++;
3049         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3050         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3051
3052         return domain;
3053 }
3054
3055 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
3056 {
3057         int adjust_width;
3058
3059         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3060         spin_lock_init(&domain->mapping_lock);
3061         spin_lock_init(&domain->iommu_lock);
3062
3063         domain_reserve_special_ranges(domain);
3064
3065         /* calculate AGAW */
3066         domain->gaw = guest_width;
3067         adjust_width = guestwidth_to_adjustwidth(guest_width);
3068         domain->agaw = width_to_agaw(adjust_width);
3069
3070         INIT_LIST_HEAD(&domain->devices);
3071
3072         domain->iommu_count = 0;
3073         domain->iommu_coherency = 0;
3074         domain->max_addr = 0;
3075
3076         /* always allocate the top pgd */
3077         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
3078         if (!domain->pgd)
3079                 return -ENOMEM;
3080         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3081         return 0;
3082 }
3083
3084 static void iommu_free_vm_domain(struct dmar_domain *domain)
3085 {
3086         unsigned long flags;
3087         struct dmar_drhd_unit *drhd;
3088         struct intel_iommu *iommu;
3089         unsigned long i;
3090         unsigned long ndomains;
3091
3092         for_each_drhd_unit(drhd) {
3093                 if (drhd->ignored)
3094                         continue;
3095                 iommu = drhd->iommu;
3096
3097                 ndomains = cap_ndoms(iommu->cap);
3098                 i = find_first_bit(iommu->domain_ids, ndomains);
3099                 for (; i < ndomains; ) {
3100                         if (iommu->domains[i] == domain) {
3101                                 spin_lock_irqsave(&iommu->lock, flags);
3102                                 clear_bit(i, iommu->domain_ids);
3103                                 iommu->domains[i] = NULL;
3104                                 spin_unlock_irqrestore(&iommu->lock, flags);
3105                                 break;
3106                         }
3107                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3108                 }
3109         }
3110 }
3111
3112 static void vm_domain_exit(struct dmar_domain *domain)
3113 {
3114         u64 end;
3115
3116         /* Domain 0 is reserved, so don't process it */
3117         if (!domain)
3118                 return;
3119
3120         vm_domain_remove_all_dev_info(domain);
3121         /* destroy iovas */
3122         put_iova_domain(&domain->iovad);
3123         end = DOMAIN_MAX_ADDR(domain->gaw);
3124         end = end & VTD_PAGE_MASK;
3125
3126         /* clear ptes */
3127         dma_pte_clear_range(domain, 0, end);
3128
3129         /* free page tables */
3130         dma_pte_free_pagetable(domain, 0, end);
3131
3132         iommu_free_vm_domain(domain);
3133         free_domain_mem(domain);
3134 }
3135
3136 static int intel_iommu_domain_init(struct iommu_domain *domain)
3137 {
3138         struct dmar_domain *dmar_domain;
3139
3140         dmar_domain = iommu_alloc_vm_domain();
3141         if (!dmar_domain) {
3142                 printk(KERN_ERR
3143                         "intel_iommu_domain_init: can't allocate dmar_domain\n");
3144                 return -ENOMEM;
3145         }
3146         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3147                 printk(KERN_ERR
3148                         "intel_iommu_domain_init: vm_domain_init() failed\n");
3149                 vm_domain_exit(dmar_domain);
3150                 return -ENOMEM;
3151         }
3152         domain->priv = dmar_domain;
3153
3154         return 0;
3155 }
3156
3157 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3158 {
3159         struct dmar_domain *dmar_domain = domain->priv;
3160
3161         domain->priv = NULL;
3162         vm_domain_exit(dmar_domain);
3163 }
3164
3165 static int intel_iommu_attach_device(struct iommu_domain *domain,
3166                                      struct device *dev)
3167 {
3168         struct dmar_domain *dmar_domain = domain->priv;
3169         struct pci_dev *pdev = to_pci_dev(dev);
3170         struct intel_iommu *iommu;
3171         int addr_width;
3172         u64 end;
3173         int ret;
3174
3175         /* normally pdev is not yet mapped; tear down any old mapping first */
3176         if (unlikely(domain_context_mapped(pdev))) {
3177                 struct dmar_domain *old_domain;
3178
3179                 old_domain = find_domain(pdev);
3180                 if (old_domain) {
3181                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3182                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3183                         else
3184                                 domain_remove_dev_info(old_domain);
3185                 }
3186         }
3187
3188         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3189                                 pdev->devfn);
3190         if (!iommu)
3191                 return -ENODEV;
3192
3193         /* check if this iommu agaw is sufficient for max mapped address */
3194         addr_width = agaw_to_width(iommu->agaw);
3195         end = DOMAIN_MAX_ADDR(addr_width);
3196         end = end & VTD_PAGE_MASK;
3197         if (end < dmar_domain->max_addr) {
3198                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3199                        "sufficient for the mapped address (%llx)\n",
3200                        __func__, iommu->agaw, dmar_domain->max_addr);
3201                 return -EFAULT;
3202         }
3203
3204         ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3205         if (ret)
3206                 return ret;
3207
3208         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3209         return ret;
3210 }
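
/*
 * Example of the agaw check above (illustrative numbers): a domain whose
 * max_addr has already grown beyond 2^39 cannot gain a device behind a
 * 39-bit (agaw 1) IOMMU; end is DOMAIN_MAX_ADDR(39) rounded down to a
 * page boundary (0x7ffffff000), so the attach fails with -EFAULT.
 */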
3211
3212 static void intel_iommu_detach_device(struct iommu_domain *domain,
3213                                       struct device *dev)
3214 {
3215         struct dmar_domain *dmar_domain = domain->priv;
3216         struct pci_dev *pdev = to_pci_dev(dev);
3217
3218         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3219 }
3220
3221 static int intel_iommu_map_range(struct iommu_domain *domain,
3222                                  unsigned long iova, phys_addr_t hpa,
3223                                  size_t size, int iommu_prot)
3224 {
3225         struct dmar_domain *dmar_domain = domain->priv;
3226         u64 max_addr;
3227         int addr_width;
3228         int prot = 0;
3229         int ret;
3230
3231         if (iommu_prot & IOMMU_READ)
3232                 prot |= DMA_PTE_READ;
3233         if (iommu_prot & IOMMU_WRITE)
3234                 prot |= DMA_PTE_WRITE;
3235         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3236                 prot |= DMA_PTE_SNP;
3237
3238         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3239         if (dmar_domain->max_addr < max_addr) {
3240                 int min_agaw;
3241                 u64 end;
3242
3243                 /* check if minimum agaw is sufficient for mapped address */
3244                 min_agaw = vm_domain_min_agaw(dmar_domain);
3245                 addr_width = agaw_to_width(min_agaw);
3246                 end = DOMAIN_MAX_ADDR(addr_width);
3247                 end = end & VTD_PAGE_MASK;
3248                 if (end < max_addr) {
3249                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3250                                "sufficient for the mapped address (%llx)\n",
3251                                __func__, min_agaw, max_addr);
3252                         return -EFAULT;
3253                 }
3254                 dmar_domain->max_addr = max_addr;
3255         }
3256
3257         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3258         return ret;
3259 }
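
/*
 * Worked example (illustrative values): a call with iova = 0x1000,
 * size = 0x1800 and IOMMU_READ | IOMMU_WRITE becomes DMA_PTE_READ |
 * DMA_PTE_WRITE, the size is rounded up to 0x2000, and max_addr may be
 * raised to 0x3000 before the range is mapped.  DMA_PTE_SNP is added
 * only when the caller requests IOMMU_CACHE and every IOMMU backing the
 * domain supports snoop control (dmar_domain->iommu_snooping).
 */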
3260
3261 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3262                                     unsigned long iova, size_t size)
3263 {
3264         struct dmar_domain *dmar_domain = domain->priv;
3265         dma_addr_t base;
3266
3267         /* The address might not be aligned */
3268         base = iova & VTD_PAGE_MASK;
3269         size = VTD_PAGE_ALIGN(size);
3270         dma_pte_clear_range(dmar_domain, base, base + size);
3271
3272         if (dmar_domain->max_addr == base + size)
3273                 dmar_domain->max_addr = base;
3274 }
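
/*
 * Worked example (illustrative values): unmapping iova = 0x1234 with
 * size = 0x800 clears the whole page range 0x1000-0x2000, since the base
 * is rounded down and the size rounded up to the 4KiB VT-d page size;
 * max_addr is pulled back only when the cleared range ends exactly at it.
 */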
3275
3276 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3277                                             unsigned long iova)
3278 {
3279         struct dmar_domain *dmar_domain = domain->priv;
3280         struct dma_pte *pte;
3281         u64 phys = 0;
3282
3283         pte = addr_to_dma_pte(dmar_domain, iova);
3284         if (pte)
3285                 phys = dma_pte_addr(pte);
3286
3287         return phys;
3288 }
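
/*
 * Illustrative note: the generic iommu_iova_to_phys() wrapper feeds
 * straight through to this helper, so callers get back the page-aligned
 * address recorded in the PTE, or 0 when the iova has never been mapped.
 */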
3289
3290 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3291                                       unsigned long cap)
3292 {
3293         struct dmar_domain *dmar_domain = domain->priv;
3294
3295         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3296                 return dmar_domain->iommu_snooping;
3297
3298         return 0;
3299 }
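
/*
 * Illustrative use (sketch, not part of this driver): a caller such as
 * KVM can probe this capability to decide whether IOMMU_CACHE mappings
 * are worthwhile, e.g.:
 *
 *	if (iommu_domain_has_cap(domain, IOMMU_CAP_CACHE_COHERENCY))
 *		prot |= IOMMU_CACHE;
 */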
3300
3301 static struct iommu_ops intel_iommu_ops = {
3302         .domain_init    = intel_iommu_domain_init,
3303         .domain_destroy = intel_iommu_domain_destroy,
3304         .attach_dev     = intel_iommu_attach_device,
3305         .detach_dev     = intel_iommu_detach_device,
3306         .map            = intel_iommu_map_range,
3307         .unmap          = intel_iommu_unmap_range,
3308         .iova_to_phys   = intel_iommu_iova_to_phys,
3309         .domain_has_cap = intel_iommu_domain_has_cap,
3310 };
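
/*
 * Illustrative sketch, not part of this driver: consumers such as KVM
 * device assignment reach the ops above through the generic IOMMU API
 * (iommu_domain_alloc(), iommu_attach_device(), iommu_map_range(), ...),
 * which dispatches to this table once it has been registered.  The
 * function below is hypothetical and only shows the expected call order.
 */
static inline int example_assign_device(struct device *dev,
					unsigned long iova,
					phys_addr_t paddr, size_t size)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_domain_alloc();		/* -> intel_iommu_domain_init */
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);	/* -> intel_iommu_attach_device */
	if (ret)
		goto out_free;

	/* map [iova, iova + size) to host physical paddr, read/write */
	ret = iommu_map_range(domain, iova, paddr, size,
			      IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	return 0;

out_detach:
	iommu_detach_device(domain, dev);	/* -> intel_iommu_detach_device */
out_free:
	iommu_domain_free(domain);		/* -> intel_iommu_domain_destroy */
	return ret;
}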
3311
3312 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3313 {
3314         /*
3315          * Mobile 4 Series Chipset neglects to set RWBF capability,
3316          * but needs it:
3317          */
3318         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3319         rwbf_quirk = 1;
3320 }
3321
3322 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);