intel-iommu: Handle PCI domains appropriately.
drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <asm/cacheflush.h>
41 #include <asm/iommu.h>
42 #include "pci.h"
43
44 #define ROOT_SIZE               VTD_PAGE_SIZE
45 #define CONTEXT_SIZE            VTD_PAGE_SIZE
46
47 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
48 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
49
50 #define IOAPIC_RANGE_START      (0xfee00000)
51 #define IOAPIC_RANGE_END        (0xfeefffff)
52 #define IOVA_START_ADDR         (0x1000)
53
54 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
55
56 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
57
58 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
59 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
60 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
61
62 /* global iommu list, set NULL for ignored DMAR units */
63 static struct intel_iommu **g_iommus;
64
65 static int rwbf_quirk;
66
67 /*
68  * 0: Present
69  * 1-11: Reserved
70  * 12-63: Context Ptr (12 - (haw-1))
71  * 64-127: Reserved
72  */
73 struct root_entry {
74         u64     val;
75         u64     rsvd1;
76 };
77 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
78 static inline bool root_present(struct root_entry *root)
79 {
80         return (root->val & 1);
81 }
82 static inline void set_root_present(struct root_entry *root)
83 {
84         root->val |= 1;
85 }
86 static inline void set_root_value(struct root_entry *root, unsigned long value)
87 {
88         root->val |= value & VTD_PAGE_MASK;
89 }
90
91 static inline struct context_entry *
92 get_context_addr_from_root(struct root_entry *root)
93 {
94         return (struct context_entry *)
95                 (root_present(root)?phys_to_virt(
96                 root->val & VTD_PAGE_MASK) :
97                 NULL);
98 }
99
100 /*
101  * low 64 bits:
102  * 0: present
103  * 1: fault processing disable
104  * 2-3: translation type
105  * 12-63: address space root
106  * high 64 bits:
107  * 0-2: address width
108  * 3-6: avail
109  * 8-23: domain id
110  */
111 struct context_entry {
112         u64 lo;
113         u64 hi;
114 };
115
116 static inline bool context_present(struct context_entry *context)
117 {
118         return (context->lo & 1);
119 }
120 static inline void context_set_present(struct context_entry *context)
121 {
122         context->lo |= 1;
123 }
124
125 static inline void context_set_fault_enable(struct context_entry *context)
126 {
127         context->lo &= (((u64)-1) << 2) | 1;
128 }
129
130 #define CONTEXT_TT_MULTI_LEVEL 0
131
132 static inline void context_set_translation_type(struct context_entry *context,
133                                                 unsigned long value)
134 {
135         context->lo &= (((u64)-1) << 4) | 3;
136         context->lo |= (value & 3) << 2;
137 }
138
139 static inline void context_set_address_root(struct context_entry *context,
140                                             unsigned long value)
141 {
142         context->lo |= value & VTD_PAGE_MASK;
143 }
144
145 static inline void context_set_address_width(struct context_entry *context,
146                                              unsigned long value)
147 {
148         context->hi |= value & 7;
149 }
150
151 static inline void context_set_domain_id(struct context_entry *context,
152                                          unsigned long value)
153 {
154         context->hi |= (value & ((1 << 16) - 1)) << 8;
155 }
156
157 static inline void context_clear_entry(struct context_entry *context)
158 {
159         context->lo = 0;
160         context->hi = 0;
161 }
162
163 /*
164  * 0: readable
165  * 1: writable
166  * 2-6: reserved
167  * 7: super page
168  * 8-10: available
169  * 11: snoop behavior
170  * 12-63: Host physical address
171  */
172 struct dma_pte {
173         u64 val;
174 };
175
176 static inline void dma_clear_pte(struct dma_pte *pte)
177 {
178         pte->val = 0;
179 }
180
181 static inline void dma_set_pte_readable(struct dma_pte *pte)
182 {
183         pte->val |= DMA_PTE_READ;
184 }
185
186 static inline void dma_set_pte_writable(struct dma_pte *pte)
187 {
188         pte->val |= DMA_PTE_WRITE;
189 }
190
191 static inline void dma_set_pte_snp(struct dma_pte *pte)
192 {
193         pte->val |= DMA_PTE_SNP;
194 }
195
196 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
197 {
198         pte->val = (pte->val & ~3) | (prot & 3);
199 }
200
201 static inline u64 dma_pte_addr(struct dma_pte *pte)
202 {
203         return (pte->val & VTD_PAGE_MASK);
204 }
205
206 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
207 {
208         pte->val |= (addr & VTD_PAGE_MASK);
209 }
210
211 static inline bool dma_pte_present(struct dma_pte *pte)
212 {
213         return (pte->val & 3) != 0;
214 }
215
216 /* devices under the same p2p bridge are owned in one domain */
217 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
218
219 /* domain represents a virtual machine; more than one device
220  * across iommus may be owned by one domain, e.g. a kvm guest.
221  */
222 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
223
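/*
 * A DMA remapping domain: one set of page tables shared by all devices
 * attached to it.  In the native case it is backed by a single iommu;
 * a DOMAIN_FLAG_VIRTUAL_MACHINE domain may span several iommus (see
 * iommu_bmp below).
 */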
224 struct dmar_domain {
225         int     id;                     /* domain id */
226         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
227
228         struct list_head devices;       /* all devices' list */
229         struct iova_domain iovad;       /* iova's that belong to this domain */
230
231         struct dma_pte  *pgd;           /* virtual address */
232         spinlock_t      mapping_lock;   /* page table lock */
233         int             gaw;            /* max guest address width */
234
235         /* adjusted guest address width, 0 is level 2 30-bit */
236         int             agaw;
237
238         int             flags;          /* flags to find out type of domain */
239
240         int             iommu_coherency;/* indicate coherency of iommu access */
241         int             iommu_snooping; /* indicate snooping control feature*/
242         int             iommu_count;    /* reference count of iommu */
243         spinlock_t      iommu_lock;     /* protect iommu set in domain */
244         u64             max_addr;       /* maximum mapped address */
245 };
246
247 /* PCI domain-device relationship */
248 struct device_domain_info {
249         struct list_head link;  /* link to domain siblings */
250         struct list_head global; /* link to global list */
251         int segment;            /* PCI domain */
252         u8 bus;                 /* PCI bus number */
253         u8 devfn;               /* PCI devfn number */
254         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
255         struct dmar_domain *domain; /* pointer to domain */
256 };
257
258 static void flush_unmaps_timeout(unsigned long data);
259
260 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
261
262 #define HIGH_WATER_MARK 250
263 struct deferred_flush_tables {
264         int next;
265         struct iova *iova[HIGH_WATER_MARK];
266         struct dmar_domain *domain[HIGH_WATER_MARK];
267 };
268
269 static struct deferred_flush_tables *deferred_flush;
270
271 /* number of iommus, used to size the bitmaps indexing intel_iommus */
272 static int g_num_of_iommus;
273
274 static DEFINE_SPINLOCK(async_umap_flush_lock);
275 static LIST_HEAD(unmaps_to_do);
276
277 static int timer_on;
278 static long list_size;
279
280 static void domain_remove_dev_info(struct dmar_domain *domain);
281
282 #ifdef CONFIG_DMAR_DEFAULT_ON
283 int dmar_disabled = 0;
284 #else
285 int dmar_disabled = 1;
286 #endif /*CONFIG_DMAR_DEFAULT_ON*/
287
288 static int __initdata dmar_map_gfx = 1;
289 static int dmar_forcedac;
290 static int intel_iommu_strict;
291
292 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
293 static DEFINE_SPINLOCK(device_domain_lock);
294 static LIST_HEAD(device_domain_list);
295
296 static struct iommu_ops intel_iommu_ops;
297
298 static int __init intel_iommu_setup(char *str)
299 {
300         if (!str)
301                 return -EINVAL;
302         while (*str) {
303                 if (!strncmp(str, "on", 2)) {
304                         dmar_disabled = 0;
305                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
306                 } else if (!strncmp(str, "off", 3)) {
307                         dmar_disabled = 1;
308                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
309                 } else if (!strncmp(str, "igfx_off", 8)) {
310                         dmar_map_gfx = 0;
311                         printk(KERN_INFO
312                                 "Intel-IOMMU: disable GFX device mapping\n");
313                 } else if (!strncmp(str, "forcedac", 8)) {
314                         printk(KERN_INFO
315                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
316                         dmar_forcedac = 1;
317                 } else if (!strncmp(str, "strict", 6)) {
318                         printk(KERN_INFO
319                                 "Intel-IOMMU: disable batched IOTLB flush\n");
320                         intel_iommu_strict = 1;
321                 }
322
323                 str += strcspn(str, ",");
324                 while (*str == ',')
325                         str++;
326         }
327         return 0;
328 }
329 __setup("intel_iommu=", intel_iommu_setup);
330
331 static struct kmem_cache *iommu_domain_cache;
332 static struct kmem_cache *iommu_devinfo_cache;
333 static struct kmem_cache *iommu_iova_cache;
334
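/*
 * Allocate from one of the caches above with PF_MEMALLOC temporarily
 * set, so that mapping setup can dip into reserves under memory
 * pressure instead of failing outright.
 */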
335 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
336 {
337         unsigned int flags;
338         void *vaddr;
339
340         /* trying to avoid low memory issues */
341         flags = current->flags & PF_MEMALLOC;
342         current->flags |= PF_MEMALLOC;
343         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
344         current->flags &= (~PF_MEMALLOC | flags);
345         return vaddr;
346 }
347
348
349 static inline void *alloc_pgtable_page(void)
350 {
351         unsigned int flags;
352         void *vaddr;
353
354         /* trying to avoid low memory issues */
355         flags = current->flags & PF_MEMALLOC;
356         current->flags |= PF_MEMALLOC;
357         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
358         current->flags &= (~PF_MEMALLOC | flags);
359         return vaddr;
360 }
361
362 static inline void free_pgtable_page(void *vaddr)
363 {
364         free_page((unsigned long)vaddr);
365 }
366
367 static inline void *alloc_domain_mem(void)
368 {
369         return iommu_kmem_cache_alloc(iommu_domain_cache);
370 }
371
372 static void free_domain_mem(void *vaddr)
373 {
374         kmem_cache_free(iommu_domain_cache, vaddr);
375 }
376
377 static inline void * alloc_devinfo_mem(void)
378 {
379         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
380 }
381
382 static inline void free_devinfo_mem(void *vaddr)
383 {
384         kmem_cache_free(iommu_devinfo_cache, vaddr);
385 }
386
387 struct iova *alloc_iova_mem(void)
388 {
389         return iommu_kmem_cache_alloc(iommu_iova_cache);
390 }
391
392 void free_iova_mem(struct iova *iova)
393 {
394         kmem_cache_free(iommu_iova_cache, iova);
395 }
396
397
398 static inline int width_to_agaw(int width);
399
400 /* calculate agaw for each iommu.
401  * "SAGAW" may be different across iommus; start from the default agaw
402  * and fall back to a smaller supported agaw for iommus that don't support it.
403  */
404 int iommu_calculate_agaw(struct intel_iommu *iommu)
405 {
406         unsigned long sagaw;
407         int agaw = -1;
408
409         sagaw = cap_sagaw(iommu->cap);
410         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
411              agaw >= 0; agaw--) {
412                 if (test_bit(agaw, &sagaw))
413                         break;
414         }
415
416         return agaw;
417 }
418
419 /* in native case, each domain is related to only one iommu */
420 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
421 {
422         int iommu_id;
423
424         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
425
426         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
427         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
428                 return NULL;
429
430         return g_iommus[iommu_id];
431 }
432
433 static void domain_update_iommu_coherency(struct dmar_domain *domain)
434 {
435         int i;
436
437         domain->iommu_coherency = 1;
438
439         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
440         for (; i < g_num_of_iommus; ) {
441                 if (!ecap_coherent(g_iommus[i]->ecap)) {
442                         domain->iommu_coherency = 0;
443                         break;
444                 }
445                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
446         }
447 }
448
449 static void domain_update_iommu_snooping(struct dmar_domain *domain)
450 {
451         int i;
452
453         domain->iommu_snooping = 1;
454
455         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
456         for (; i < g_num_of_iommus; ) {
457                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
458                         domain->iommu_snooping = 0;
459                         break;
460                 }
461                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
462         }
463 }
464
465 /* Some capabilities may be different across iommus */
466 static void domain_update_iommu_cap(struct dmar_domain *domain)
467 {
468         domain_update_iommu_coherency(domain);
469         domain_update_iommu_snooping(domain);
470 }
471
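/*
 * Find the iommu (DRHD unit) responsible for the device identified by
 * PCI segment (domain), bus and devfn.  Bridges match by the bus range
 * behind them, and an include-all unit catches any remaining devices.
 */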
472 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
473 {
474         struct dmar_drhd_unit *drhd = NULL;
475         int i;
476
477         for_each_drhd_unit(drhd) {
478                 if (drhd->ignored)
479                         continue;
480                 if (segment != drhd->segment)
481                         continue;
482
483                 for (i = 0; i < drhd->devices_cnt; i++) {
484                         if (drhd->devices[i] &&
485                             drhd->devices[i]->bus->number == bus &&
486                             drhd->devices[i]->devfn == devfn)
487                                 return drhd->iommu;
488                         if (drhd->devices[i] && drhd->devices[i]->subordinate &&
489                             drhd->devices[i]->subordinate->number <= bus &&
490                             drhd->devices[i]->subordinate->subordinate >= bus)
491                                 return drhd->iommu;
492                 }
493
494                 if (drhd->include_all)
495                         return drhd->iommu;
496         }
497
498         return NULL;
499 }
500
501 static void domain_flush_cache(struct dmar_domain *domain,
502                                void *addr, int size)
503 {
504         if (!domain->iommu_coherency)
505                 clflush_cache_range(addr, size);
506 }
507
508 /* Gets context entry for a given bus and devfn */
509 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
510                 u8 bus, u8 devfn)
511 {
512         struct root_entry *root;
513         struct context_entry *context;
514         unsigned long phy_addr;
515         unsigned long flags;
516
517         spin_lock_irqsave(&iommu->lock, flags);
518         root = &iommu->root_entry[bus];
519         context = get_context_addr_from_root(root);
520         if (!context) {
521                 context = (struct context_entry *)alloc_pgtable_page();
522                 if (!context) {
523                         spin_unlock_irqrestore(&iommu->lock, flags);
524                         return NULL;
525                 }
526                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
527                 phy_addr = virt_to_phys((void *)context);
528                 set_root_value(root, phy_addr);
529                 set_root_present(root);
530                 __iommu_flush_cache(iommu, root, sizeof(*root));
531         }
532         spin_unlock_irqrestore(&iommu->lock, flags);
533         return &context[devfn];
534 }
535
536 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
537 {
538         struct root_entry *root;
539         struct context_entry *context;
540         int ret;
541         unsigned long flags;
542
543         spin_lock_irqsave(&iommu->lock, flags);
544         root = &iommu->root_entry[bus];
545         context = get_context_addr_from_root(root);
546         if (!context) {
547                 ret = 0;
548                 goto out;
549         }
550         ret = context_present(&context[devfn]);
551 out:
552         spin_unlock_irqrestore(&iommu->lock, flags);
553         return ret;
554 }
555
556 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
557 {
558         struct root_entry *root;
559         struct context_entry *context;
560         unsigned long flags;
561
562         spin_lock_irqsave(&iommu->lock, flags);
563         root = &iommu->root_entry[bus];
564         context = get_context_addr_from_root(root);
565         if (context) {
566                 context_clear_entry(&context[devfn]);
567                 __iommu_flush_cache(iommu, &context[devfn], \
568                         sizeof(*context));
569         }
570         spin_unlock_irqrestore(&iommu->lock, flags);
571 }
572
573 static void free_context_table(struct intel_iommu *iommu)
574 {
575         struct root_entry *root;
576         int i;
577         unsigned long flags;
578         struct context_entry *context;
579
580         spin_lock_irqsave(&iommu->lock, flags);
581         if (!iommu->root_entry) {
582                 goto out;
583         }
584         for (i = 0; i < ROOT_ENTRY_NR; i++) {
585                 root = &iommu->root_entry[i];
586                 context = get_context_addr_from_root(root);
587                 if (context)
588                         free_pgtable_page(context);
589         }
590         free_pgtable_page(iommu->root_entry);
591         iommu->root_entry = NULL;
592 out:
593         spin_unlock_irqrestore(&iommu->lock, flags);
594 }
595
596 /* page table handling */
597 #define LEVEL_STRIDE            (9)
598 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
599
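/*
 * agaw arithmetic: every page-table level resolves LEVEL_STRIDE (9)
 * bits on top of the 12-bit page offset, so width = 30 + agaw * 9 and
 * level = agaw + 2.  For example, a 48-bit address width gives agaw 2,
 * i.e. a 4-level page table.
 */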
600 static inline int agaw_to_level(int agaw)
601 {
602         return agaw + 2;
603 }
604
605 static inline int agaw_to_width(int agaw)
606 {
607         return 30 + agaw * LEVEL_STRIDE;
608
609 }
610
611 static inline int width_to_agaw(int width)
612 {
613         return (width - 30) / LEVEL_STRIDE;
614 }
615
616 static inline unsigned int level_to_offset_bits(int level)
617 {
618         return (12 + (level - 1) * LEVEL_STRIDE);
619 }
620
621 static inline int address_level_offset(u64 addr, int level)
622 {
623         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
624 }
625
626 static inline u64 level_mask(int level)
627 {
628         return ((u64)-1 << level_to_offset_bits(level));
629 }
630
631 static inline u64 level_size(int level)
632 {
633         return ((u64)1 << level_to_offset_bits(level));
634 }
635
636 static inline u64 align_to_level(u64 addr, int level)
637 {
638         return ((addr + level_size(level) - 1) & level_mask(level));
639 }
640
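/*
 * Walk the domain's page table down to the leaf (4KB) entry for addr,
 * allocating intermediate page-table pages along the way as needed.
 */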
641 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
642 {
643         int addr_width = agaw_to_width(domain->agaw);
644         struct dma_pte *parent, *pte = NULL;
645         int level = agaw_to_level(domain->agaw);
646         int offset;
647         unsigned long flags;
648
649         BUG_ON(!domain->pgd);
650
651         addr &= (((u64)1) << addr_width) - 1;
652         parent = domain->pgd;
653
654         spin_lock_irqsave(&domain->mapping_lock, flags);
655         while (level > 0) {
656                 void *tmp_page;
657
658                 offset = address_level_offset(addr, level);
659                 pte = &parent[offset];
660                 if (level == 1)
661                         break;
662
663                 if (!dma_pte_present(pte)) {
664                         tmp_page = alloc_pgtable_page();
665
666                         if (!tmp_page) {
667                                 spin_unlock_irqrestore(&domain->mapping_lock,
668                                         flags);
669                                 return NULL;
670                         }
671                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
672                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
673                         /*
674                          * high level table always sets r/w, last level page
675                          * table control read/write
676                          */
677                         dma_set_pte_readable(pte);
678                         dma_set_pte_writable(pte);
679                         domain_flush_cache(domain, pte, sizeof(*pte));
680                 }
681                 parent = phys_to_virt(dma_pte_addr(pte));
682                 level--;
683         }
684
685         spin_unlock_irqrestore(&domain->mapping_lock, flags);
686         return pte;
687 }
688
689 /* return address's pte at specific level */
690 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
691                 int level)
692 {
693         struct dma_pte *parent, *pte = NULL;
694         int total = agaw_to_level(domain->agaw);
695         int offset;
696
697         parent = domain->pgd;
698         while (level <= total) {
699                 offset = address_level_offset(addr, total);
700                 pte = &parent[offset];
701                 if (level == total)
702                         return pte;
703
704                 if (!dma_pte_present(pte))
705                         break;
706                 parent = phys_to_virt(dma_pte_addr(pte));
707                 total--;
708         }
709         return NULL;
710 }
711
712 /* clear one page's page table */
713 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
714 {
715         struct dma_pte *pte = NULL;
716
717         /* get last level pte */
718         pte = dma_addr_level_pte(domain, addr, 1);
719
720         if (pte) {
721                 dma_clear_pte(pte);
722                 domain_flush_cache(domain, pte, sizeof(*pte));
723         }
724 }
725
726 /* clear last level pte, a tlb flush should be followed */
727 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
728 {
729         int addr_width = agaw_to_width(domain->agaw);
730         int npages;
731
732         start &= (((u64)1) << addr_width) - 1;
733         end &= (((u64)1) << addr_width) - 1;
734         /* in case it's a partial page */
735         start = PAGE_ALIGN(start);
736         end &= PAGE_MASK;
737         npages = (end - start) / VTD_PAGE_SIZE;
738
739         /* we don't need lock here, nobody else touches the iova range */
740         while (npages--) {
741                 dma_pte_clear_one(domain, start);
742                 start += VTD_PAGE_SIZE;
743         }
744 }
745
746 /* free page table pages. last level pte should already be cleared */
747 static void dma_pte_free_pagetable(struct dmar_domain *domain,
748         u64 start, u64 end)
749 {
750         int addr_width = agaw_to_width(domain->agaw);
751         struct dma_pte *pte;
752         int total = agaw_to_level(domain->agaw);
753         int level;
754         u64 tmp;
755
756         start &= (((u64)1) << addr_width) - 1;
757         end &= (((u64)1) << addr_width) - 1;
758
759         /* we don't need lock here, nobody else touches the iova range */
760         level = 2;
761         while (level <= total) {
762                 tmp = align_to_level(start, level);
763                 if (tmp >= end || (tmp + level_size(level) > end))
764                         return;
765
766                 while (tmp < end) {
767                         pte = dma_addr_level_pte(domain, tmp, level);
768                         if (pte) {
769                                 free_pgtable_page(
770                                         phys_to_virt(dma_pte_addr(pte)));
771                                 dma_clear_pte(pte);
772                                 domain_flush_cache(domain, pte, sizeof(*pte));
773                         }
774                         tmp += level_size(level);
775                 }
776                 level++;
777         }
778         /* free pgd */
779         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
780                 free_pgtable_page(domain->pgd);
781                 domain->pgd = NULL;
782         }
783 }
784
785 /* iommu handling */
786 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
787 {
788         struct root_entry *root;
789         unsigned long flags;
790
791         root = (struct root_entry *)alloc_pgtable_page();
792         if (!root)
793                 return -ENOMEM;
794
795         __iommu_flush_cache(iommu, root, ROOT_SIZE);
796
797         spin_lock_irqsave(&iommu->lock, flags);
798         iommu->root_entry = root;
799         spin_unlock_irqrestore(&iommu->lock, flags);
800
801         return 0;
802 }
803
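/*
 * Point the hardware at the root table: write its physical address to
 * DMAR_RTADDR_REG, issue a Set Root Table Pointer command and wait for
 * the status register to acknowledge it.
 */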
804 static void iommu_set_root_entry(struct intel_iommu *iommu)
805 {
806         void *addr;
807         u32 cmd, sts;
808         unsigned long flag;
809
810         addr = iommu->root_entry;
811
812         spin_lock_irqsave(&iommu->register_lock, flag);
813         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
814
815         cmd = iommu->gcmd | DMA_GCMD_SRTP;
816         writel(cmd, iommu->reg + DMAR_GCMD_REG);
817
818         /* Make sure hardware completes it */
819         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
820                 readl, (sts & DMA_GSTS_RTPS), sts);
821
822         spin_unlock_irqrestore(&iommu->register_lock, flag);
823 }
824
825 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
826 {
827         u32 val;
828         unsigned long flag;
829
830         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
831                 return;
832         val = iommu->gcmd | DMA_GCMD_WBF;
833
834         spin_lock_irqsave(&iommu->register_lock, flag);
835         writel(val, iommu->reg + DMAR_GCMD_REG);
836
837         /* Make sure hardware completes it */
838         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
839                         readl, (!(val & DMA_GSTS_WBFS)), val);
840
841         spin_unlock_irqrestore(&iommu->register_lock, flag);
842 }
843
844 /* return value determines whether we need a write buffer flush */
845 static int __iommu_flush_context(struct intel_iommu *iommu,
846         u16 did, u16 source_id, u8 function_mask, u64 type,
847         int non_present_entry_flush)
848 {
849         u64 val = 0;
850         unsigned long flag;
851
852         /*
853          * In the non-present entry flush case, if the hardware doesn't cache
854          * non-present entries we do nothing; if it does cache non-present
855          * entries, we flush entries of domain 0 (that domain id is used to
856          * cache any non-present entries)
857          */
858         if (non_present_entry_flush) {
859                 if (!cap_caching_mode(iommu->cap))
860                         return 1;
861                 else
862                         did = 0;
863         }
864
865         switch (type) {
866         case DMA_CCMD_GLOBAL_INVL:
867                 val = DMA_CCMD_GLOBAL_INVL;
868                 break;
869         case DMA_CCMD_DOMAIN_INVL:
870                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
871                 break;
872         case DMA_CCMD_DEVICE_INVL:
873                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
874                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
875                 break;
876         default:
877                 BUG();
878         }
879         val |= DMA_CCMD_ICC;
880
881         spin_lock_irqsave(&iommu->register_lock, flag);
882         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
883
884         /* Make sure hardware completes it */
885         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
886                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
887
888         spin_unlock_irqrestore(&iommu->register_lock, flag);
889
890         /* flush context entry will implicitly flush write buffer */
891         return 0;
892 }
893
894 /* return value determines whether we need a write buffer flush */
895 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
896         u64 addr, unsigned int size_order, u64 type,
897         int non_present_entry_flush)
898 {
899         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
900         u64 val = 0, val_iva = 0;
901         unsigned long flag;
902
903         /*
904          * In the non-present entry flush case, if the hardware doesn't cache
905          * non-present entries we do nothing; if it does cache non-present
906          * entries, we flush entries of domain 0 (that domain id is used to
907          * cache any non-present entries)
908          */
909         if (non_present_entry_flush) {
910                 if (!cap_caching_mode(iommu->cap))
911                         return 1;
912                 else
913                         did = 0;
914         }
915
916         switch (type) {
917         case DMA_TLB_GLOBAL_FLUSH:
918                 /* global flush doesn't need set IVA_REG */
919                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
920                 break;
921         case DMA_TLB_DSI_FLUSH:
922                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
923                 break;
924         case DMA_TLB_PSI_FLUSH:
925                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
926                 /* Note: always flush non-leaf currently */
927                 val_iva = size_order | addr;
928                 break;
929         default:
930                 BUG();
931         }
932         /* Note: set drain read/write */
933 #if 0
934         /*
935          * This is probably only here to be extra safe; it looks like we
936          * can ignore it without any impact.
937          */
938         if (cap_read_drain(iommu->cap))
939                 val |= DMA_TLB_READ_DRAIN;
940 #endif
941         if (cap_write_drain(iommu->cap))
942                 val |= DMA_TLB_WRITE_DRAIN;
943
944         spin_lock_irqsave(&iommu->register_lock, flag);
945         /* Note: Only uses first TLB reg currently */
946         if (val_iva)
947                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
948         dmar_writeq(iommu->reg + tlb_offset + 8, val);
949
950         /* Make sure hardware completes it */
951         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
952                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
953
954         spin_unlock_irqrestore(&iommu->register_lock, flag);
955
956         /* check IOTLB invalidation granularity */
957         if (DMA_TLB_IAIG(val) == 0)
958                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
959         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
960                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
961                         (unsigned long long)DMA_TLB_IIRG(type),
962                         (unsigned long long)DMA_TLB_IAIG(val));
963         /* flush iotlb entry will implicitly flush write buffer */
964         return 0;
965 }
966
967 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
968         u64 addr, unsigned int pages, int non_present_entry_flush)
969 {
970         unsigned int mask;
971
972         BUG_ON(addr & (~VTD_PAGE_MASK));
973         BUG_ON(pages == 0);
974
975         /* Fallback to domain selective flush if no PSI support */
976         if (!cap_pgsel_inv(iommu->cap))
977                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
978                                                 DMA_TLB_DSI_FLUSH,
979                                                 non_present_entry_flush);
980
981         /*
982          * PSI requires page size to be 2 ^ x, and the base address is naturally
983          * aligned to the size
984          */
985         mask = ilog2(__roundup_pow_of_two(pages));
986         /* Fallback to domain selective flush if size is too big */
987         if (mask > cap_max_amask_val(iommu->cap))
988                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
989                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
990
991         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
992                                         DMA_TLB_PSI_FLUSH,
993                                         non_present_entry_flush);
994 }
995
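/*
 * Turn off the DMA protected memory regions: clear the EPM bit in
 * DMAR_PMEN_REG and wait until the protected region status bit clears.
 */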
996 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
997 {
998         u32 pmen;
999         unsigned long flags;
1000
1001         spin_lock_irqsave(&iommu->register_lock, flags);
1002         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1003         pmen &= ~DMA_PMEN_EPM;
1004         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1005
1006         /* wait for the protected region status bit to clear */
1007         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1008                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1009
1010         spin_unlock_irqrestore(&iommu->register_lock, flags);
1011 }
1012
1013 static int iommu_enable_translation(struct intel_iommu *iommu)
1014 {
1015         u32 sts;
1016         unsigned long flags;
1017
1018         spin_lock_irqsave(&iommu->register_lock, flags);
1019         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
1020
1021         /* Make sure hardware completes it */
1022         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1023                 readl, (sts & DMA_GSTS_TES), sts);
1024
1025         iommu->gcmd |= DMA_GCMD_TE;
1026         spin_unlock_irqrestore(&iommu->register_lock, flags);
1027         return 0;
1028 }
1029
1030 static int iommu_disable_translation(struct intel_iommu *iommu)
1031 {
1032         u32 sts;
1033         unsigned long flag;
1034
1035         spin_lock_irqsave(&iommu->register_lock, flag);
1036         iommu->gcmd &= ~DMA_GCMD_TE;
1037         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1038
1039         /* Make sure hardware completes it */
1040         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1041                 readl, (!(sts & DMA_GSTS_TES)), sts);
1042
1043         spin_unlock_irqrestore(&iommu->register_lock, flag);
1044         return 0;
1045 }
1046
1047
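/*
 * Allocate the per-iommu domain id bitmap and the array of domain
 * pointers.  With caching mode, domain id 0 is reserved because the
 * hardware tags cached non-present entries with it.
 */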
1048 static int iommu_init_domains(struct intel_iommu *iommu)
1049 {
1050         unsigned long ndomains;
1051         unsigned long nlongs;
1052
1053         ndomains = cap_ndoms(iommu->cap);
1054         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1055         nlongs = BITS_TO_LONGS(ndomains);
1056
1057         /* TBD: there might be 64K domains,
1058          * consider other allocation for future chip
1059          */
1060         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1061         if (!iommu->domain_ids) {
1062                 printk(KERN_ERR "Allocating domain id array failed\n");
1063                 return -ENOMEM;
1064         }
1065         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1066                         GFP_KERNEL);
1067         if (!iommu->domains) {
1068                 printk(KERN_ERR "Allocating domain array failed\n");
1069                 kfree(iommu->domain_ids);
1070                 return -ENOMEM;
1071         }
1072
1073         spin_lock_init(&iommu->lock);
1074
1075         /*
1076          * if Caching mode is set, then invalid translations are tagged
1077          * with domainid 0. Hence we need to pre-allocate it.
1078          */
1079         if (cap_caching_mode(iommu->cap))
1080                 set_bit(0, iommu->domain_ids);
1081         return 0;
1082 }
1083
1084
1085 static void domain_exit(struct dmar_domain *domain);
1086 static void vm_domain_exit(struct dmar_domain *domain);
1087
1088 void free_dmar_iommu(struct intel_iommu *iommu)
1089 {
1090         struct dmar_domain *domain;
1091         int i;
1092         unsigned long flags;
1093
1094         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1095         for (; i < cap_ndoms(iommu->cap); ) {
1096                 domain = iommu->domains[i];
1097                 clear_bit(i, iommu->domain_ids);
1098
1099                 spin_lock_irqsave(&domain->iommu_lock, flags);
1100                 if (--domain->iommu_count == 0) {
1101                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1102                                 vm_domain_exit(domain);
1103                         else
1104                                 domain_exit(domain);
1105                 }
1106                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1107
1108                 i = find_next_bit(iommu->domain_ids,
1109                         cap_ndoms(iommu->cap), i+1);
1110         }
1111
1112         if (iommu->gcmd & DMA_GCMD_TE)
1113                 iommu_disable_translation(iommu);
1114
1115         if (iommu->irq) {
1116                 set_irq_data(iommu->irq, NULL);
1117                 /* This will mask the irq */
1118                 free_irq(iommu->irq, iommu);
1119                 destroy_irq(iommu->irq);
1120         }
1121
1122         kfree(iommu->domains);
1123         kfree(iommu->domain_ids);
1124
1125         g_iommus[iommu->seq_id] = NULL;
1126
1127         /* if all iommus are freed, free g_iommus */
1128         for (i = 0; i < g_num_of_iommus; i++) {
1129                 if (g_iommus[i])
1130                         break;
1131         }
1132
1133         if (i == g_num_of_iommus)
1134                 kfree(g_iommus);
1135
1136         /* free context mapping */
1137         free_context_table(iommu);
1138 }
1139
1140 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1141 {
1142         unsigned long num;
1143         unsigned long ndomains;
1144         struct dmar_domain *domain;
1145         unsigned long flags;
1146
1147         domain = alloc_domain_mem();
1148         if (!domain)
1149                 return NULL;
1150
1151         ndomains = cap_ndoms(iommu->cap);
1152
1153         spin_lock_irqsave(&iommu->lock, flags);
1154         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1155         if (num >= ndomains) {
1156                 spin_unlock_irqrestore(&iommu->lock, flags);
1157                 free_domain_mem(domain);
1158                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1159                 return NULL;
1160         }
1161
1162         set_bit(num, iommu->domain_ids);
1163         domain->id = num;
1164         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1165         set_bit(iommu->seq_id, &domain->iommu_bmp);
1166         domain->flags = 0;
1167         iommu->domains[num] = domain;
1168         spin_unlock_irqrestore(&iommu->lock, flags);
1169
1170         return domain;
1171 }
1172
1173 static void iommu_free_domain(struct dmar_domain *domain)
1174 {
1175         unsigned long flags;
1176         struct intel_iommu *iommu;
1177
1178         iommu = domain_get_iommu(domain);
1179
1180         spin_lock_irqsave(&iommu->lock, flags);
1181         clear_bit(domain->id, iommu->domain_ids);
1182         spin_unlock_irqrestore(&iommu->lock, flags);
1183 }
1184
1185 static struct iova_domain reserved_iova_list;
1186 static struct lock_class_key reserved_alloc_key;
1187 static struct lock_class_key reserved_rbtree_key;
1188
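/*
 * Build the global list of IOVA ranges that must never be handed out
 * for DMA: the IOAPIC window and every PCI device's MMIO resources, so
 * that mappings never alias peer-to-peer MMIO addresses.
 */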
1189 static void dmar_init_reserved_ranges(void)
1190 {
1191         struct pci_dev *pdev = NULL;
1192         struct iova *iova;
1193         int i;
1194         u64 addr, size;
1195
1196         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1197
1198         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1199                 &reserved_alloc_key);
1200         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1201                 &reserved_rbtree_key);
1202
1203         /* IOAPIC ranges shouldn't be accessed by DMA */
1204         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1205                 IOVA_PFN(IOAPIC_RANGE_END));
1206         if (!iova)
1207                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1208
1209         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1210         for_each_pci_dev(pdev) {
1211                 struct resource *r;
1212
1213                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1214                         r = &pdev->resource[i];
1215                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1216                                 continue;
1217                         addr = r->start;
1218                         addr &= PAGE_MASK;
1219                         size = r->end - addr;
1220                         size = PAGE_ALIGN(size);
1221                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1222                                 IOVA_PFN(size + addr) - 1);
1223                         if (!iova)
1224                                 printk(KERN_ERR "Reserve iova failed\n");
1225                 }
1226         }
1227
1228 }
1229
1230 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1231 {
1232         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1233 }
1234
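/*
 * Round a guest address width up to the nearest width the page tables
 * can express (12 + 9 * level bits); e.g. a 36-bit request becomes 39.
 */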
1235 static inline int guestwidth_to_adjustwidth(int gaw)
1236 {
1237         int agaw;
1238         int r = (gaw - 12) % 9;
1239
1240         if (r == 0)
1241                 agaw = gaw;
1242         else
1243                 agaw = gaw + 9 - r;
1244         if (agaw > 64)
1245                 agaw = 64;
1246         return agaw;
1247 }
1248
1249 static int domain_init(struct dmar_domain *domain, int guest_width)
1250 {
1251         struct intel_iommu *iommu;
1252         int adjust_width, agaw;
1253         unsigned long sagaw;
1254
1255         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1256         spin_lock_init(&domain->mapping_lock);
1257         spin_lock_init(&domain->iommu_lock);
1258
1259         domain_reserve_special_ranges(domain);
1260
1261         /* calculate AGAW */
1262         iommu = domain_get_iommu(domain);
1263         if (guest_width > cap_mgaw(iommu->cap))
1264                 guest_width = cap_mgaw(iommu->cap);
1265         domain->gaw = guest_width;
1266         adjust_width = guestwidth_to_adjustwidth(guest_width);
1267         agaw = width_to_agaw(adjust_width);
1268         sagaw = cap_sagaw(iommu->cap);
1269         if (!test_bit(agaw, &sagaw)) {
1270                 /* hardware doesn't support it, choose a bigger one */
1271                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1272                 agaw = find_next_bit(&sagaw, 5, agaw);
1273                 if (agaw >= 5)
1274                         return -ENODEV;
1275         }
1276         domain->agaw = agaw;
1277         INIT_LIST_HEAD(&domain->devices);
1278
1279         if (ecap_coherent(iommu->ecap))
1280                 domain->iommu_coherency = 1;
1281         else
1282                 domain->iommu_coherency = 0;
1283
1284         if (ecap_sc_support(iommu->ecap))
1285                 domain->iommu_snooping = 1;
1286         else
1287                 domain->iommu_snooping = 0;
1288
1289         domain->iommu_count = 1;
1290
1291         /* always allocate the top pgd */
1292         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1293         if (!domain->pgd)
1294                 return -ENOMEM;
1295         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1296         return 0;
1297 }
1298
1299 static void domain_exit(struct dmar_domain *domain)
1300 {
1301         u64 end;
1302
1303         /* Domain 0 is reserved, so don't process it */
1304         if (!domain)
1305                 return;
1306
1307         domain_remove_dev_info(domain);
1308         /* destroy iovas */
1309         put_iova_domain(&domain->iovad);
1310         end = DOMAIN_MAX_ADDR(domain->gaw);
1311         end = end & (~PAGE_MASK);
1312
1313         /* clear ptes */
1314         dma_pte_clear_range(domain, 0, end);
1315
1316         /* free page tables */
1317         dma_pte_free_pagetable(domain, 0, end);
1318
1319         iommu_free_domain(domain);
1320         free_domain_mem(domain);
1321 }
1322
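/*
 * Install a context entry so the device at (segment, bus, devfn) uses
 * this domain's page tables.  Virtual-machine domains (re)use a domain
 * id local to this iommu, and top page-table levels are skipped when
 * the iommu supports a smaller agaw than the domain's.
 */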
1323 static int domain_context_mapping_one(struct dmar_domain *domain,
1324                                       int segment, u8 bus, u8 devfn)
1325 {
1326         struct context_entry *context;
1327         unsigned long flags;
1328         struct intel_iommu *iommu;
1329         struct dma_pte *pgd;
1330         unsigned long num;
1331         unsigned long ndomains;
1332         int id;
1333         int agaw;
1334
1335         pr_debug("Set context mapping for %02x:%02x.%d\n",
1336                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1337         BUG_ON(!domain->pgd);
1338
1339         iommu = device_to_iommu(segment, bus, devfn);
1340         if (!iommu)
1341                 return -ENODEV;
1342
1343         context = device_to_context_entry(iommu, bus, devfn);
1344         if (!context)
1345                 return -ENOMEM;
1346         spin_lock_irqsave(&iommu->lock, flags);
1347         if (context_present(context)) {
1348                 spin_unlock_irqrestore(&iommu->lock, flags);
1349                 return 0;
1350         }
1351
1352         id = domain->id;
1353         pgd = domain->pgd;
1354
1355         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1356                 int found = 0;
1357
1358                 /* find an available domain id for this device in iommu */
1359                 ndomains = cap_ndoms(iommu->cap);
1360                 num = find_first_bit(iommu->domain_ids, ndomains);
1361                 for (; num < ndomains; ) {
1362                         if (iommu->domains[num] == domain) {
1363                                 id = num;
1364                                 found = 1;
1365                                 break;
1366                         }
1367                         num = find_next_bit(iommu->domain_ids,
1368                                             cap_ndoms(iommu->cap), num+1);
1369                 }
1370
1371                 if (found == 0) {
1372                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1373                         if (num >= ndomains) {
1374                                 spin_unlock_irqrestore(&iommu->lock, flags);
1375                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1376                                 return -EFAULT;
1377                         }
1378
1379                         set_bit(num, iommu->domain_ids);
1380                         iommu->domains[num] = domain;
1381                         id = num;
1382                 }
1383
1384                 /* Skip top levels of page tables for
1385                  * an iommu which has a smaller agaw than the default.
1386                  */
1387                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1388                         pgd = phys_to_virt(dma_pte_addr(pgd));
1389                         if (!dma_pte_present(pgd)) {
1390                                 spin_unlock_irqrestore(&iommu->lock, flags);
1391                                 return -ENOMEM;
1392                         }
1393                 }
1394         }
1395
1396         context_set_domain_id(context, id);
1397         context_set_address_width(context, iommu->agaw);
1398         context_set_address_root(context, virt_to_phys(pgd));
1399         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1400         context_set_fault_enable(context);
1401         context_set_present(context);
1402         domain_flush_cache(domain, context, sizeof(*context));
1403
1404         /* it's a non-present to present mapping */
1405         if (iommu->flush.flush_context(iommu, domain->id,
1406                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1407                 DMA_CCMD_DEVICE_INVL, 1))
1408                 iommu_flush_write_buffer(iommu);
1409         else
1410                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1411
1412         spin_unlock_irqrestore(&iommu->lock, flags);
1413
1414         spin_lock_irqsave(&domain->iommu_lock, flags);
1415         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1416                 domain->iommu_count++;
1417                 domain_update_iommu_cap(domain);
1418         }
1419         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1420         return 0;
1421 }
1422
1423 static int
1424 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1425 {
1426         int ret;
1427         struct pci_dev *tmp, *parent;
1428
1429         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1430                                          pdev->bus->number, pdev->devfn);
1431         if (ret)
1432                 return ret;
1433
1434         /* dependent device mapping */
1435         tmp = pci_find_upstream_pcie_bridge(pdev);
1436         if (!tmp)
1437                 return 0;
1438         /* Secondary interface's bus number and devfn 0 */
1439         parent = pdev->bus->self;
1440         while (parent != tmp) {
1441                 ret = domain_context_mapping_one(domain,
1442                                                  pci_domain_nr(parent->bus),
1443                                                  parent->bus->number,
1444                                                  parent->devfn);
1445                 if (ret)
1446                         return ret;
1447                 parent = parent->bus->self;
1448         }
1449         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1450                 return domain_context_mapping_one(domain,
1451                                         pci_domain_nr(tmp->subordinate),
1452                                         tmp->subordinate->number, 0);
1453         else /* this is a legacy PCI bridge */
1454                 return domain_context_mapping_one(domain,
1455                                                   pci_domain_nr(tmp->bus),
1456                                                   tmp->bus->number,
1457                                                   tmp->devfn);
1458 }
1459
1460 static int domain_context_mapped(struct pci_dev *pdev)
1461 {
1462         int ret;
1463         struct pci_dev *tmp, *parent;
1464         struct intel_iommu *iommu;
1465
1466         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1467                                 pdev->devfn);
1468         if (!iommu)
1469                 return -ENODEV;
1470
1471         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1472         if (!ret)
1473                 return ret;
1474         /* dependent device mapping */
1475         tmp = pci_find_upstream_pcie_bridge(pdev);
1476         if (!tmp)
1477                 return ret;
1478         /* Secondary interface's bus number and devfn 0 */
1479         parent = pdev->bus->self;
1480         while (parent != tmp) {
1481                 ret = device_context_mapped(iommu, parent->bus->number,
1482                                             parent->devfn);
1483                 if (!ret)
1484                         return ret;
1485                 parent = parent->bus->self;
1486         }
1487         if (tmp->is_pcie)
1488                 return device_context_mapped(iommu, tmp->subordinate->number,
1489                                              0);
1490         else
1491                 return device_context_mapped(iommu, tmp->bus->number,
1492                                              tmp->devfn);
1493 }
1494
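/*
 * Map a contiguous range of host physical memory at the given IOVA,
 * one 4KB page at a time, with the requested read/write/snoop bits.
 */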
1495 static int
1496 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1497                         u64 hpa, size_t size, int prot)
1498 {
1499         u64 start_pfn, end_pfn;
1500         struct dma_pte *pte;
1501         int index;
1502         int addr_width = agaw_to_width(domain->agaw);
1503
1504         hpa &= (((u64)1) << addr_width) - 1;
1505
1506         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1507                 return -EINVAL;
1508         iova &= PAGE_MASK;
1509         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1510         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1511         index = 0;
1512         while (start_pfn < end_pfn) {
1513                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1514                 if (!pte)
1515                         return -ENOMEM;
1516                 /* We don't need lock here, nobody else
1517                  * touches the iova range
1518                  */
1519                 BUG_ON(dma_pte_addr(pte));
1520                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1521                 dma_set_pte_prot(pte, prot);
1522                 if (prot & DMA_PTE_SNP)
1523                         dma_set_pte_snp(pte);
1524                 domain_flush_cache(domain, pte, sizeof(*pte));
1525                 start_pfn++;
1526                 index++;
1527         }
1528         return 0;
1529 }
1530
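/*
 * Tear down the context entry for one device and flush the context and
 * IOTLB caches so the hardware drops any stale translations for it.
 */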
1531 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1532 {
1533         if (!iommu)
1534                 return;
1535
1536         clear_context_table(iommu, bus, devfn);
1537         iommu->flush.flush_context(iommu, 0, 0, 0,
1538                                            DMA_CCMD_GLOBAL_INVL, 0);
1539         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1540                                          DMA_TLB_GLOBAL_FLUSH, 0);
1541 }
1542
1543 static void domain_remove_dev_info(struct dmar_domain *domain)
1544 {
1545         struct device_domain_info *info;
1546         unsigned long flags;
1547         struct intel_iommu *iommu;
1548
1549         spin_lock_irqsave(&device_domain_lock, flags);
1550         while (!list_empty(&domain->devices)) {
1551                 info = list_entry(domain->devices.next,
1552                         struct device_domain_info, link);
1553                 list_del(&info->link);
1554                 list_del(&info->global);
1555                 if (info->dev)
1556                         info->dev->dev.archdata.iommu = NULL;
1557                 spin_unlock_irqrestore(&device_domain_lock, flags);
1558
1559                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1560                 iommu_detach_dev(iommu, info->bus, info->devfn);
1561                 free_devinfo_mem(info);
1562
1563                 spin_lock_irqsave(&device_domain_lock, flags);
1564         }
1565         spin_unlock_irqrestore(&device_domain_lock, flags);
1566 }
1567
1568 /*
1569  * find_domain
1570  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1571  */
1572 static struct dmar_domain *
1573 find_domain(struct pci_dev *pdev)
1574 {
1575         struct device_domain_info *info;
1576
1577         /* No lock here, assumes no domain exit in normal case */
1578         info = pdev->dev.archdata.iommu;
1579         if (info)
1580                 return info->domain;
1581         return NULL;
1582 }
1583
1584 /* The returned domain is fully initialized; devices behind a PCIe-to-PCI bridge share the bridge's domain */
1585 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1586 {
1587         struct dmar_domain *domain, *found = NULL;
1588         struct intel_iommu *iommu;
1589         struct dmar_drhd_unit *drhd;
1590         struct device_domain_info *info, *tmp;
1591         struct pci_dev *dev_tmp;
1592         unsigned long flags;
1593         int bus = 0, devfn = 0;
1594         int segment;
1595
1596         domain = find_domain(pdev);
1597         if (domain)
1598                 return domain;
1599
1600         segment = pci_domain_nr(pdev->bus);
1601
1602         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1603         if (dev_tmp) {
1604                 if (dev_tmp->is_pcie) {
1605                         bus = dev_tmp->subordinate->number;
1606                         devfn = 0;
1607                 } else {
1608                         bus = dev_tmp->bus->number;
1609                         devfn = dev_tmp->devfn;
1610                 }
1611                 spin_lock_irqsave(&device_domain_lock, flags);
1612                 list_for_each_entry(info, &device_domain_list, global) {
1613                         if (info->segment == segment &&
1614                             info->bus == bus && info->devfn == devfn) {
1615                                 found = info->domain;
1616                                 break;
1617                         }
1618                 }
1619                 spin_unlock_irqrestore(&device_domain_lock, flags);
1620                 /* the PCIe-to-PCI bridge already has a domain; use it */
1621                 if (found) {
1622                         domain = found;
1623                         goto found_domain;
1624                 }
1625         }
1626
1627         /* Allocate new domain for the device */
1628         drhd = dmar_find_matched_drhd_unit(pdev);
1629         if (!drhd) {
1630                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1631                         pci_name(pdev));
1632                 return NULL;
1633         }
1634         iommu = drhd->iommu;
1635
1636         domain = iommu_alloc_domain(iommu);
1637         if (!domain)
1638                 goto error;
1639
1640         if (domain_init(domain, gaw)) {
1641                 domain_exit(domain);
1642                 goto error;
1643         }
1644
1645         /* register pcie-to-pci device */
1646         if (dev_tmp) {
1647                 info = alloc_devinfo_mem();
1648                 if (!info) {
1649                         domain_exit(domain);
1650                         goto error;
1651                 }
1652                 info->segment = segment;
1653                 info->bus = bus;
1654                 info->devfn = devfn;
1655                 info->dev = NULL;
1656                 info->domain = domain;
1657                 /* This domain is shared by devices under p2p bridge */
1658                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1659
1660                 /* re-check under the lock: the bridge may have a domain by now; use it */
1661                 found = NULL;
1662                 spin_lock_irqsave(&device_domain_lock, flags);
1663                 list_for_each_entry(tmp, &device_domain_list, global) {
1664                         if (tmp->segment == segment &&
1665                             tmp->bus == bus && tmp->devfn == devfn) {
1666                                 found = tmp->domain;
1667                                 break;
1668                         }
1669                 }
1670                 if (found) {
1671                         free_devinfo_mem(info);
1672                         domain_exit(domain);
1673                         domain = found;
1674                 } else {
1675                         list_add(&info->link, &domain->devices);
1676                         list_add(&info->global, &device_domain_list);
1677                 }
1678                 spin_unlock_irqrestore(&device_domain_lock, flags);
1679         }
1680
1681 found_domain:
1682         info = alloc_devinfo_mem();
1683         if (!info)
1684                 goto error;
1685         info->segment = segment;
1686         info->bus = pdev->bus->number;
1687         info->devfn = pdev->devfn;
1688         info->dev = pdev;
1689         info->domain = domain;
1690         spin_lock_irqsave(&device_domain_lock, flags);
1691         /* somebody was faster: another thread may have set up the domain already */
1692         found = find_domain(pdev);
1693         if (found != NULL) {
1694                 spin_unlock_irqrestore(&device_domain_lock, flags);
1695                 if (found != domain) {
1696                         domain_exit(domain);
1697                         domain = found;
1698                 }
1699                 free_devinfo_mem(info);
1700                 return domain;
1701         }
1702         list_add(&info->link, &domain->devices);
1703         list_add(&info->global, &device_domain_list);
1704         pdev->dev.archdata.iommu = info;
1705         spin_unlock_irqrestore(&device_domain_lock, flags);
1706         return domain;
1707 error:
1708         /* re-check here; another thread may have set up the domain */
1709         return find_domain(pdev);
1710 }
1711
1712 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1713                                       unsigned long long start,
1714                                       unsigned long long end)
1715 {
1716         struct dmar_domain *domain;
1717         unsigned long size;
1718         unsigned long long base;
1719         int ret;
1720
1721         printk(KERN_INFO
1722                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1723                 pci_name(pdev), start, end);
1724         /* page table init */
1725         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1726         if (!domain)
1727                 return -ENOMEM;
1728
1729         /* The address might not be aligned */
1730         base = start & PAGE_MASK;
1731         size = end - base;
1732         size = PAGE_ALIGN(size);
1733         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1734                         IOVA_PFN(base + size) - 1)) {
1735                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1736                 ret = -ENOMEM;
1737                 goto error;
1738         }
1739
1740         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1741                 size, base, pci_name(pdev));
1742         /*
1743          * The RMRR range might overlap a physical memory range;
1744          * clear it first.
1745          */
1746         dma_pte_clear_range(domain, base, base + size);
1747
1748         ret = domain_page_mapping(domain, base, base, size,
1749                 DMA_PTE_READ|DMA_PTE_WRITE);
1750         if (ret)
1751                 goto error;
1752
1753         /* context entry init */
1754         ret = domain_context_mapping(domain, pdev);
1755         if (!ret)
1756                 return 0;
1757 error:
1758         domain_exit(domain);
1759         return ret;
1760
1761 }
1762
1763 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1764         struct pci_dev *pdev)
1765 {
1766         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1767                 return 0;
1768         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1769                 rmrr->end_address + 1);
1770 }
1771
1772 #ifdef CONFIG_DMAR_GFX_WA
1773 struct iommu_prepare_data {
1774         struct pci_dev *pdev;
1775         int ret;
1776 };
1777
1778 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1779                                          unsigned long end_pfn, void *datax)
1780 {
1781         struct iommu_prepare_data *data;
1782
1783         data = (struct iommu_prepare_data *)datax;
1784
1785         data->ret = iommu_prepare_identity_map(data->pdev,
1786                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1787         return data->ret;
1788
1789 }
1790
1791 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1792 {
1793         int nid;
1794         struct iommu_prepare_data data;
1795
1796         data.pdev = pdev;
1797         data.ret = 0;
1798
1799         for_each_online_node(nid) {
1800                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1801                 if (data.ret)
1802                         return data.ret;
1803         }
1804         return data.ret;
1805 }
1806
1807 static void __init iommu_prepare_gfx_mapping(void)
1808 {
1809         struct pci_dev *pdev = NULL;
1810         int ret;
1811
1812         for_each_pci_dev(pdev) {
1813                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1814                                 !IS_GFX_DEVICE(pdev))
1815                         continue;
1816                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1817                         pci_name(pdev));
1818                 ret = iommu_prepare_with_active_regions(pdev);
1819                 if (ret)
1820                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1821         }
1822 }
1823 #else /* !CONFIG_DMAR_GFX_WA */
1824 static inline void iommu_prepare_gfx_mapping(void)
1825 {
1826         return;
1827 }
1828 #endif
1829
1830 #ifdef CONFIG_DMAR_FLOPPY_WA
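/*
 * Identity-map the first 16MB for the ISA/LPC bridge so that legacy floppy
 * DMA keeps working once translation is enabled.
 */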
1831 static inline void iommu_prepare_isa(void)
1832 {
1833         struct pci_dev *pdev;
1834         int ret;
1835
1836         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1837         if (!pdev)
1838                 return;
1839
1840         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1841         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1842
1843         if (ret)
1844                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1845                         "floppy might not work\n");
1846
1847 }
1848 #else
1849 static inline void iommu_prepare_isa(void)
1850 {
1851         return;
1852 }
1853 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1854
1855 static int __init init_dmars(void)
1856 {
1857         struct dmar_drhd_unit *drhd;
1858         struct dmar_rmrr_unit *rmrr;
1859         struct pci_dev *pdev;
1860         struct intel_iommu *iommu;
1861         int i, ret;
1862
1863         /*
1864          * for each drhd
1865          *    allocate root
1866          *    initialize and program root entry to not present
1867          * endfor
1868          */
1869         for_each_drhd_unit(drhd) {
1870                 g_num_of_iommus++;
1871                 /*
1872                  * No lock needed: this is only incremented in the
1873                  * single-threaded kernel __init code path; all other
1874                  * accesses are read-only.
1875                  */
1876         }
1877
1878         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1879                         GFP_KERNEL);
1880         if (!g_iommus) {
1881                 printk(KERN_ERR "Allocating global iommu array failed\n");
1882                 ret = -ENOMEM;
1883                 goto error;
1884         }
1885
1886         deferred_flush = kzalloc(g_num_of_iommus *
1887                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1888         if (!deferred_flush) {
1889                 kfree(g_iommus);
1890                 ret = -ENOMEM;
1891                 goto error;
1892         }
1893
1894         for_each_drhd_unit(drhd) {
1895                 if (drhd->ignored)
1896                         continue;
1897
1898                 iommu = drhd->iommu;
1899                 g_iommus[iommu->seq_id] = iommu;
1900
1901                 ret = iommu_init_domains(iommu);
1902                 if (ret)
1903                         goto error;
1904
1905                 /*
1906                  * TBD:
1907                  * we could share the same root & context tables
1908                  * among all IOMMUs; split this out later.
1909                  */
1910                 ret = iommu_alloc_root_entry(iommu);
1911                 if (ret) {
1912                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1913                         goto error;
1914                 }
1915         }
1916
1917         /*
1918          * Start from a sane IOMMU hardware state.
1919          */
1920         for_each_drhd_unit(drhd) {
1921                 if (drhd->ignored)
1922                         continue;
1923
1924                 iommu = drhd->iommu;
1925
1926                 /*
1927                  * If queued invalidation was already initialized by us
1928                  * (for example, while enabling interrupt remapping), then
1929                  * things are already rolling from a sane state.
1930                  */
1931                 if (iommu->qi)
1932                         continue;
1933
1934                 /*
1935                  * Clear any previous faults.
1936                  */
1937                 dmar_fault(-1, iommu);
1938                 /*
1939                  * Disable queued invalidation if supported and already enabled
1940                  * before OS handover.
1941                  */
1942                 dmar_disable_qi(iommu);
1943         }
1944
1945         for_each_drhd_unit(drhd) {
1946                 if (drhd->ignored)
1947                         continue;
1948
1949                 iommu = drhd->iommu;
1950
1951                 if (dmar_enable_qi(iommu)) {
1952                         /*
1953                          * Queued Invalidate not enabled, use Register Based
1954                          * Invalidate
1955                          */
1956                         iommu->flush.flush_context = __iommu_flush_context;
1957                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1958                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1959                                "invalidation\n",
1960                                (unsigned long long)drhd->reg_base_addr);
1961                 } else {
1962                         iommu->flush.flush_context = qi_flush_context;
1963                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1964                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1965                                "invalidation\n",
1966                                (unsigned long long)drhd->reg_base_addr);
1967                 }
1968         }
1969
1970 #ifdef CONFIG_INTR_REMAP
1971         if (!intr_remapping_enabled) {
1972                 ret = enable_intr_remapping(0);
1973                 if (ret)
1974                         printk(KERN_ERR
1975                                "IOMMU: enable interrupt remapping failed\n");
1976         }
1977 #endif
1978
1979         /*
1980          * For each rmrr
1981          *   for each dev attached to rmrr
1982          *   do
1983          *     locate drhd for dev, alloc domain for dev
1984          *     allocate free domain
1985          *     allocate page table entries for rmrr
1986          *     if context not allocated for bus
1987          *           allocate and init context
1988          *           set present in root table for this bus
1989          *     init context with domain, translation etc
1990          *    endfor
1991          * endfor
1992          */
1993         for_each_rmrr_units(rmrr) {
1994                 for (i = 0; i < rmrr->devices_cnt; i++) {
1995                         pdev = rmrr->devices[i];
1996                         /* some BIOSes list non-existent devices in the DMAR table */
1997                         if (!pdev)
1998                                 continue;
1999                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2000                         if (ret)
2001                                 printk(KERN_ERR
2002                                  "IOMMU: mapping reserved region failed\n");
2003                 }
2004         }
2005
2006         iommu_prepare_gfx_mapping();
2007
2008         iommu_prepare_isa();
2009
2010         /*
2011          * for each drhd
2012          *   enable fault log
2013          *   global invalidate context cache
2014          *   global invalidate iotlb
2015          *   enable translation
2016          */
2017         for_each_drhd_unit(drhd) {
2018                 if (drhd->ignored)
2019                         continue;
2020                 iommu = drhd->iommu;
2021
2022                 iommu_flush_write_buffer(iommu);
2023
2024                 ret = dmar_set_interrupt(iommu);
2025                 if (ret)
2026                         goto error;
2027
2028                 iommu_set_root_entry(iommu);
2029
2030                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2031                                            0);
2032                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2033                                          0);
2034                 iommu_disable_protect_mem_regions(iommu);
2035
2036                 ret = iommu_enable_translation(iommu);
2037                 if (ret)
2038                         goto error;
2039         }
2040
2041         return 0;
2042 error:
2043         for_each_drhd_unit(drhd) {
2044                 if (drhd->ignored)
2045                         continue;
2046                 iommu = drhd->iommu;
2047                 free_iommu(iommu);
2048         }
2049         kfree(g_iommus);
2050         return ret;
2051 }
2052
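/*
 * aligned_size() returns the number of bytes of whole pages needed to cover
 * [host_addr, host_addr + size).  A quick worked example (assuming 4KiB
 * pages): aligned_size(0x1001, 0x2000) = PAGE_ALIGN(0x001 + 0x2000) = 0x3000,
 * i.e. three pages, since the buffer straddles a page boundary at each end.
 */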
2053 static inline u64 aligned_size(u64 host_addr, size_t size)
2054 {
2055         u64 addr;
2056         addr = (host_addr & (~PAGE_MASK)) + size;
2057         return PAGE_ALIGN(addr);
2058 }
2059
2060 struct iova *
2061 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2062 {
2063         struct iova *piova;
2064
2065         /* Make sure it's in range */
2066         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2067         if (!size || (IOVA_START_ADDR + size > end))
2068                 return NULL;
2069
2070         piova = alloc_iova(&domain->iovad,
2071                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2072         return piova;
2073 }
2074
2075 static struct iova *
2076 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2077                    size_t size, u64 dma_mask)
2078 {
2079         struct pci_dev *pdev = to_pci_dev(dev);
2080         struct iova *iova = NULL;
2081
2082         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2083                 iova = iommu_alloc_iova(domain, size, dma_mask);
2084         else {
2085                 /*
2086                  * First try to allocate an io virtual address in
2087                  * DMA_32BIT_MASK and if that fails then try allocating
2088                  * from higher range
2089                  */
2090                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2091                 if (!iova)
2092                         iova = iommu_alloc_iova(domain, size, dma_mask);
2093         }
2094
2095         if (!iova) {
2096                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2097                 return NULL;
2098         }
2099
2100         return iova;
2101 }
2102
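/*
 * Like get_domain_for_dev(), but also makes sure the device's context entry
 * is programmed before the domain is handed back to the DMA-mapping paths.
 */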
2103 static struct dmar_domain *
2104 get_valid_domain_for_dev(struct pci_dev *pdev)
2105 {
2106         struct dmar_domain *domain;
2107         int ret;
2108
2109         domain = get_domain_for_dev(pdev,
2110                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2111         if (!domain) {
2112                 printk(KERN_ERR
2113                         "Allocating domain for %s failed\n", pci_name(pdev));
2114                 return NULL;
2115         }
2116
2117         /* make sure context mapping is ok */
2118         if (unlikely(!domain_context_mapped(pdev))) {
2119                 ret = domain_context_mapping(domain, pdev);
2120                 if (ret) {
2121                         printk(KERN_ERR
2122                                 "Domain context map for %s failed\n",
2123                                 pci_name(pdev));
2124                         return NULL;
2125                 }
2126         }
2127
2128         return domain;
2129 }
2130
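/*
 * Core DMA-map path: look up (or create) the device's domain, allocate an
 * iova range large enough for the page-aligned request, install the PTEs
 * and flush the IOTLB for the new mapping.  Returns the bus address with
 * the original page offset preserved, or 0 on failure.
 */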
2131 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2132                                      size_t size, int dir, u64 dma_mask)
2133 {
2134         struct pci_dev *pdev = to_pci_dev(hwdev);
2135         struct dmar_domain *domain;
2136         phys_addr_t start_paddr;
2137         struct iova *iova;
2138         int prot = 0;
2139         int ret;
2140         struct intel_iommu *iommu;
2141
2142         BUG_ON(dir == DMA_NONE);
2143         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2144                 return paddr;
2145
2146         domain = get_valid_domain_for_dev(pdev);
2147         if (!domain)
2148                 return 0;
2149
2150         iommu = domain_get_iommu(domain);
2151         size = aligned_size((u64)paddr, size);
2152
2153         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2154         if (!iova)
2155                 goto error;
2156
2157         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2158
2159         /*
2160          * Check if DMAR supports zero-length reads on write only
2161          * mappings..
2162          */
2163         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2164                         !cap_zlr(iommu->cap))
2165                 prot |= DMA_PTE_READ;
2166         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2167                 prot |= DMA_PTE_WRITE;
2168         /*
2169          * paddr..(paddr + size) may cover only part of a page, but we map
2170          * whole pages.  Note: if two parts of one page are mapped
2171          * separately, two guest addresses may map to the same host paddr;
2172          * this is not a big problem.
2173          */
2174         ret = domain_page_mapping(domain, start_paddr,
2175                 ((u64)paddr) & PAGE_MASK, size, prot);
2176         if (ret)
2177                 goto error;
2178
2179         /* it's a non-present to present mapping */
2180         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2181                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2182         if (ret)
2183                 iommu_flush_write_buffer(iommu);
2184
2185         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2186
2187 error:
2188         if (iova)
2189                 __free_iova(&domain->iovad, iova);
2190         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2191                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2192         return 0;
2193 }
2194
2195 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2196                                  unsigned long offset, size_t size,
2197                                  enum dma_data_direction dir,
2198                                  struct dma_attrs *attrs)
2199 {
2200         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2201                                   dir, to_pci_dev(dev)->dma_mask);
2202 }
2203
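/*
 * Called with async_umap_flush_lock held: for every IOMMU that has deferred
 * unmaps queued, do one global IOTLB flush and then release the queued iovas.
 */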
2204 static void flush_unmaps(void)
2205 {
2206         int i, j;
2207
2208         timer_on = 0;
2209
2210         /* just flush them all */
2211         for (i = 0; i < g_num_of_iommus; i++) {
2212                 struct intel_iommu *iommu = g_iommus[i];
2213                 if (!iommu)
2214                         continue;
2215
2216                 if (deferred_flush[i].next) {
2217                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2218                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2219                         for (j = 0; j < deferred_flush[i].next; j++) {
2220                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2221                                                 deferred_flush[i].iova[j]);
2222                         }
2223                         deferred_flush[i].next = 0;
2224                 }
2225         }
2226
2227         list_size = 0;
2228 }
2229
2230 static void flush_unmaps_timeout(unsigned long data)
2231 {
2232         unsigned long flags;
2233
2234         spin_lock_irqsave(&async_umap_flush_lock, flags);
2235         flush_unmaps();
2236         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2237 }
2238
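/*
 * Queue an iova for deferred freeing on its IOMMU's list.  Once
 * HIGH_WATER_MARK entries are pending the queue is drained synchronously;
 * otherwise a 10ms timer batches the IOTLB flushes.
 */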
2239 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2240 {
2241         unsigned long flags;
2242         int next, iommu_id;
2243         struct intel_iommu *iommu;
2244
2245         spin_lock_irqsave(&async_umap_flush_lock, flags);
2246         if (list_size == HIGH_WATER_MARK)
2247                 flush_unmaps();
2248
2249         iommu = domain_get_iommu(dom);
2250         iommu_id = iommu->seq_id;
2251
2252         next = deferred_flush[iommu_id].next;
2253         deferred_flush[iommu_id].domain[next] = dom;
2254         deferred_flush[iommu_id].iova[next] = iova;
2255         deferred_flush[iommu_id].next++;
2256
2257         if (!timer_on) {
2258                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2259                 timer_on = 1;
2260         }
2261         list_size++;
2262         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2263 }
2264
2265 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2266                              size_t size, enum dma_data_direction dir,
2267                              struct dma_attrs *attrs)
2268 {
2269         struct pci_dev *pdev = to_pci_dev(dev);
2270         struct dmar_domain *domain;
2271         unsigned long start_addr;
2272         struct iova *iova;
2273         struct intel_iommu *iommu;
2274
2275         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2276                 return;
2277         domain = find_domain(pdev);
2278         BUG_ON(!domain);
2279
2280         iommu = domain_get_iommu(domain);
2281
2282         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2283         if (!iova)
2284                 return;
2285
2286         start_addr = iova->pfn_lo << PAGE_SHIFT;
2287         size = aligned_size((u64)dev_addr, size);
2288
2289         pr_debug("Device %s unmapping: %zx@%llx\n",
2290                 pci_name(pdev), size, (unsigned long long)start_addr);
2291
2292         /*  clear the whole page */
2293         dma_pte_clear_range(domain, start_addr, start_addr + size);
2294         /* free page tables */
2295         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2296         if (intel_iommu_strict) {
2297                 if (iommu_flush_iotlb_psi(iommu,
2298                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2299                         iommu_flush_write_buffer(iommu);
2300                 /* free iova */
2301                 __free_iova(&domain->iovad, iova);
2302         } else {
2303                 add_unmap(domain, iova);
2304                 /*
2305                  * Queue up the release of the unmap to save roughly 1/6th
2306                  * of the CPU time used by the iotlb flush operation.
2307                  */
2308         }
2309 }
2310
2311 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2312                                int dir)
2313 {
2314         intel_unmap_page(dev, dev_addr, size, dir, NULL);
2315 }
2316
2317 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2318                                   dma_addr_t *dma_handle, gfp_t flags)
2319 {
2320         void *vaddr;
2321         int order;
2322
2323         size = PAGE_ALIGN(size);
2324         order = get_order(size);
2325         flags &= ~(GFP_DMA | GFP_DMA32);
2326
2327         vaddr = (void *)__get_free_pages(flags, order);
2328         if (!vaddr)
2329                 return NULL;
2330         memset(vaddr, 0, size);
2331
2332         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2333                                          DMA_BIDIRECTIONAL,
2334                                          hwdev->coherent_dma_mask);
2335         if (*dma_handle)
2336                 return vaddr;
2337         free_pages((unsigned long)vaddr, order);
2338         return NULL;
2339 }
2340
2341 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2342                                 dma_addr_t dma_handle)
2343 {
2344         int order;
2345
2346         size = PAGE_ALIGN(size);
2347         order = get_order(size);
2348
2349         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2350         free_pages((unsigned long)vaddr, order);
2351 }
2352
2353 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2354                            int nelems, enum dma_data_direction dir,
2355                            struct dma_attrs *attrs)
2356 {
2357         int i;
2358         struct pci_dev *pdev = to_pci_dev(hwdev);
2359         struct dmar_domain *domain;
2360         unsigned long start_addr;
2361         struct iova *iova;
2362         size_t size = 0;
2363         phys_addr_t addr;
2364         struct scatterlist *sg;
2365         struct intel_iommu *iommu;
2366
2367         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2368                 return;
2369
2370         domain = find_domain(pdev);
2371         BUG_ON(!domain);
2372
2373         iommu = domain_get_iommu(domain);
2374
2375         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2376         if (!iova)
2377                 return;
2378         for_each_sg(sglist, sg, nelems, i) {
2379                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2380                 size += aligned_size((u64)addr, sg->length);
2381         }
2382
2383         start_addr = iova->pfn_lo << PAGE_SHIFT;
2384
2385         /*  clear the whole page */
2386         dma_pte_clear_range(domain, start_addr, start_addr + size);
2387         /* free page tables */
2388         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2389
2390         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2391                         size >> VTD_PAGE_SHIFT, 0))
2392                 iommu_flush_write_buffer(iommu);
2393
2394         /* free iova */
2395         __free_iova(&domain->iovad, iova);
2396 }
2397
2398 static int intel_nontranslate_map_sg(struct device *hwdev,
2399         struct scatterlist *sglist, int nelems, int dir)
2400 {
2401         int i;
2402         struct scatterlist *sg;
2403
2404         for_each_sg(sglist, sg, nelems, i) {
2405                 BUG_ON(!sg_page(sg));
2406                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2407                 sg->dma_length = sg->length;
2408         }
2409         return nelems;
2410 }
2411
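/*
 * Map a scatterlist: one iova range covering the page-aligned total size is
 * allocated and the elements are mapped back to back into it; on any mapping
 * failure everything done so far is torn down and 0 is returned.
 */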
2412 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2413                         enum dma_data_direction dir, struct dma_attrs *attrs)
2414 {
2415         phys_addr_t addr;
2416         int i;
2417         struct pci_dev *pdev = to_pci_dev(hwdev);
2418         struct dmar_domain *domain;
2419         size_t size = 0;
2420         int prot = 0;
2421         size_t offset = 0;
2422         struct iova *iova = NULL;
2423         int ret;
2424         struct scatterlist *sg;
2425         unsigned long start_addr;
2426         struct intel_iommu *iommu;
2427
2428         BUG_ON(dir == DMA_NONE);
2429         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2430                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2431
2432         domain = get_valid_domain_for_dev(pdev);
2433         if (!domain)
2434                 return 0;
2435
2436         iommu = domain_get_iommu(domain);
2437
2438         for_each_sg(sglist, sg, nelems, i) {
2439                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2440                 size += aligned_size((u64)addr, sg->length);
2441         }
2442
2443         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2444         if (!iova) {
2445                 sglist->dma_length = 0;
2446                 return 0;
2447         }
2448
2449         /*
2450          * Check if DMAR supports zero-length reads on write only
2451          * mappings..
2452          */
2453         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2454                         !cap_zlr(iommu->cap))
2455                 prot |= DMA_PTE_READ;
2456         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2457                 prot |= DMA_PTE_WRITE;
2458
2459         start_addr = iova->pfn_lo << PAGE_SHIFT;
2460         offset = 0;
2461         for_each_sg(sglist, sg, nelems, i) {
2462                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2463                 size = aligned_size((u64)addr, sg->length);
2464                 ret = domain_page_mapping(domain, start_addr + offset,
2465                         ((u64)addr) & PAGE_MASK,
2466                         size, prot);
2467                 if (ret) {
2468                         /*  clear the page */
2469                         dma_pte_clear_range(domain, start_addr,
2470                                   start_addr + offset);
2471                         /* free page tables */
2472                         dma_pte_free_pagetable(domain, start_addr,
2473                                   start_addr + offset);
2474                         /* free iova */
2475                         __free_iova(&domain->iovad, iova);
2476                         return 0;
2477                 }
2478                 sg->dma_address = start_addr + offset +
2479                                 ((u64)addr & (~PAGE_MASK));
2480                 sg->dma_length = sg->length;
2481                 offset += size;
2482         }
2483
2484         /* it's a non-present to present mapping */
2485         if (iommu_flush_iotlb_psi(iommu, domain->id,
2486                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2487                 iommu_flush_write_buffer(iommu);
2488         return nelems;
2489 }
2490
2491 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2492 {
2493         return !dma_addr;
2494 }
2495
2496 struct dma_map_ops intel_dma_ops = {
2497         .alloc_coherent = intel_alloc_coherent,
2498         .free_coherent = intel_free_coherent,
2499         .map_sg = intel_map_sg,
2500         .unmap_sg = intel_unmap_sg,
2501         .map_page = intel_map_page,
2502         .unmap_page = intel_unmap_page,
2503         .mapping_error = intel_mapping_error,
2504 };
2505
2506 static inline int iommu_domain_cache_init(void)
2507 {
2508         int ret = 0;
2509
2510         iommu_domain_cache = kmem_cache_create("iommu_domain",
2511                                          sizeof(struct dmar_domain),
2512                                          0,
2513                                          SLAB_HWCACHE_ALIGN,
2515                                          NULL);
2516         if (!iommu_domain_cache) {
2517                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2518                 ret = -ENOMEM;
2519         }
2520
2521         return ret;
2522 }
2523
2524 static inline int iommu_devinfo_cache_init(void)
2525 {
2526         int ret = 0;
2527
2528         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2529                                          sizeof(struct device_domain_info),
2530                                          0,
2531                                          SLAB_HWCACHE_ALIGN,
2532                                          NULL);
2533         if (!iommu_devinfo_cache) {
2534                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2535                 ret = -ENOMEM;
2536         }
2537
2538         return ret;
2539 }
2540
2541 static inline int iommu_iova_cache_init(void)
2542 {
2543         int ret = 0;
2544
2545         iommu_iova_cache = kmem_cache_create("iommu_iova",
2546                                          sizeof(struct iova),
2547                                          0,
2548                                          SLAB_HWCACHE_ALIGN,
2549                                          NULL);
2550         if (!iommu_iova_cache) {
2551                 printk(KERN_ERR "Couldn't create iova cache\n");
2552                 ret = -ENOMEM;
2553         }
2554
2555         return ret;
2556 }
2557
2558 static int __init iommu_init_mempool(void)
2559 {
2560         int ret;
2561         ret = iommu_iova_cache_init();
2562         if (ret)
2563                 return ret;
2564
2565         ret = iommu_domain_cache_init();
2566         if (ret)
2567                 goto domain_error;
2568
2569         ret = iommu_devinfo_cache_init();
2570         if (!ret)
2571                 return ret;
2572
2573         kmem_cache_destroy(iommu_domain_cache);
2574 domain_error:
2575         kmem_cache_destroy(iommu_iova_cache);
2576
2577         return -ENOMEM;
2578 }
2579
2580 static void __init iommu_exit_mempool(void)
2581 {
2582         kmem_cache_destroy(iommu_devinfo_cache);
2583         kmem_cache_destroy(iommu_domain_cache);
2584         kmem_cache_destroy(iommu_iova_cache);
2585
2586 }
2587
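/*
 * Mark DRHD units that have no PCI devices attached as ignored.  If gfx
 * mapping is disabled (dmar_map_gfx == 0), also ignore units that cover only
 * graphics devices and tag those devices with DUMMY_DEVICE_DOMAIN_INFO so the
 * DMA paths bypass translation for them.
 */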
2588 static void __init init_no_remapping_devices(void)
2589 {
2590         struct dmar_drhd_unit *drhd;
2591
2592         for_each_drhd_unit(drhd) {
2593                 if (!drhd->include_all) {
2594                         int i;
2595                         for (i = 0; i < drhd->devices_cnt; i++)
2596                                 if (drhd->devices[i] != NULL)
2597                                         break;
2598                         /* ignore DMAR unit if no pci devices exist */
2599                         if (i == drhd->devices_cnt)
2600                                 drhd->ignored = 1;
2601                 }
2602         }
2603
2604         if (dmar_map_gfx)
2605                 return;
2606
2607         for_each_drhd_unit(drhd) {
2608                 int i;
2609                 if (drhd->ignored || drhd->include_all)
2610                         continue;
2611
2612                 for (i = 0; i < drhd->devices_cnt; i++)
2613                         if (drhd->devices[i] &&
2614                                 !IS_GFX_DEVICE(drhd->devices[i]))
2615                                 break;
2616
2617                 if (i < drhd->devices_cnt)
2618                         continue;
2619
2620                 /* bypass IOMMU if it is just for gfx devices */
2621                 drhd->ignored = 1;
2622                 for (i = 0; i < drhd->devices_cnt; i++) {
2623                         if (!drhd->devices[i])
2624                                 continue;
2625                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2626                 }
2627         }
2628 }
2629
2630 #ifdef CONFIG_SUSPEND
2631 static int init_iommu_hw(void)
2632 {
2633         struct dmar_drhd_unit *drhd;
2634         struct intel_iommu *iommu = NULL;
2635
2636         for_each_active_iommu(iommu, drhd)
2637                 if (iommu->qi)
2638                         dmar_reenable_qi(iommu);
2639
2640         for_each_active_iommu(iommu, drhd) {
2641                 iommu_flush_write_buffer(iommu);
2642
2643                 iommu_set_root_entry(iommu);
2644
2645                 iommu->flush.flush_context(iommu, 0, 0, 0,
2646                                                 DMA_CCMD_GLOBAL_INVL, 0);
2647                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2648                                                 DMA_TLB_GLOBAL_FLUSH, 0);
2649                 iommu_disable_protect_mem_regions(iommu);
2650                 iommu_enable_translation(iommu);
2651         }
2652
2653         return 0;
2654 }
2655
2656 static void iommu_flush_all(void)
2657 {
2658         struct dmar_drhd_unit *drhd;
2659         struct intel_iommu *iommu;
2660
2661         for_each_active_iommu(iommu, drhd) {
2662                 iommu->flush.flush_context(iommu, 0, 0, 0,
2663                                                 DMA_CCMD_GLOBAL_INVL, 0);
2664                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2665                                                 DMA_TLB_GLOBAL_FLUSH, 0);
2666         }
2667 }
2668
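/*
 * Suspend: flush all caches, disable translation and save the fault-event
 * registers of each active IOMMU; iommu_resume() below re-programs the
 * hardware via init_iommu_hw() and then restores these registers.
 */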
2669 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
2670 {
2671         struct dmar_drhd_unit *drhd;
2672         struct intel_iommu *iommu = NULL;
2673         unsigned long flag;
2674
2675         for_each_active_iommu(iommu, drhd) {
2676                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
2677                                                  GFP_ATOMIC);
2678                 if (!iommu->iommu_state)
2679                         goto nomem;
2680         }
2681
2682         iommu_flush_all();
2683
2684         for_each_active_iommu(iommu, drhd) {
2685                 iommu_disable_translation(iommu);
2686
2687                 spin_lock_irqsave(&iommu->register_lock, flag);
2688
2689                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2690                         readl(iommu->reg + DMAR_FECTL_REG);
2691                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2692                         readl(iommu->reg + DMAR_FEDATA_REG);
2693                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2694                         readl(iommu->reg + DMAR_FEADDR_REG);
2695                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2696                         readl(iommu->reg + DMAR_FEUADDR_REG);
2697
2698                 spin_unlock_irqrestore(&iommu->register_lock, flag);
2699         }
2700         return 0;
2701
2702 nomem:
2703         for_each_active_iommu(iommu, drhd)
2704                 kfree(iommu->iommu_state);
2705
2706         return -ENOMEM;
2707 }
2708
2709 static int iommu_resume(struct sys_device *dev)
2710 {
2711         struct dmar_drhd_unit *drhd;
2712         struct intel_iommu *iommu = NULL;
2713         unsigned long flag;
2714
2715         if (init_iommu_hw()) {
2716                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2717                 return -EIO;
2718         }
2719
2720         for_each_active_iommu(iommu, drhd) {
2721
2722                 spin_lock_irqsave(&iommu->register_lock, flag);
2723
2724                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2725                         iommu->reg + DMAR_FECTL_REG);
2726                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2727                         iommu->reg + DMAR_FEDATA_REG);
2728                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2729                         iommu->reg + DMAR_FEADDR_REG);
2730                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2731                         iommu->reg + DMAR_FEUADDR_REG);
2732
2733                 spin_unlock_irqrestore(&iommu->register_lock, flag);
2734         }
2735
2736         for_each_active_iommu(iommu, drhd)
2737                 kfree(iommu->iommu_state);
2738
2739         return 0;
2740 }
2741
2742 static struct sysdev_class iommu_sysclass = {
2743         .name           = "iommu",
2744         .resume         = iommu_resume,
2745         .suspend        = iommu_suspend,
2746 };
2747
2748 static struct sys_device device_iommu = {
2749         .cls    = &iommu_sysclass,
2750 };
2751
2752 static int __init init_iommu_sysfs(void)
2753 {
2754         int error;
2755
2756         error = sysdev_class_register(&iommu_sysclass);
2757         if (error)
2758                 return error;
2759
2760         error = sysdev_register(&device_iommu);
2761         if (error)
2762                 sysdev_class_unregister(&iommu_sysclass);
2763
2764         return error;
2765 }
2766
2767 #else
2768 static int __init init_iommu_sysfs(void)
2769 {
2770         return 0;
2771 }
2772 #endif  /* CONFIG_SUSPEND */
2773
2774 int __init intel_iommu_init(void)
2775 {
2776         int ret = 0;
2777
2778         if (dmar_table_init())
2779                 return  -ENODEV;
2780
2781         if (dmar_dev_scope_init())
2782                 return  -ENODEV;
2783
2784         /*
2785          * Check the need for DMA-remapping initialization now.
2786          * The initialization above is also used by interrupt remapping.
2787          */
2788         if (no_iommu || swiotlb || dmar_disabled)
2789                 return -ENODEV;
2790
2791         iommu_init_mempool();
2792         dmar_init_reserved_ranges();
2793
2794         init_no_remapping_devices();
2795
2796         ret = init_dmars();
2797         if (ret) {
2798                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2799                 put_iova_domain(&reserved_iova_list);
2800                 iommu_exit_mempool();
2801                 return ret;
2802         }
2803         printk(KERN_INFO
2804         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2805
2806         init_timer(&unmap_timer);
2807         force_iommu = 1;
2808         dma_ops = &intel_dma_ops;
2809         init_iommu_sysfs();
2810
2811         register_iommu(&intel_iommu_ops);
2812
2813         return 0;
2814 }
2815
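/*
 * Attach 'pdev' to a VM domain: allocate a device_domain_info, link it into
 * the domain's device list and the global list, and point the device's
 * archdata at it.
 */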
2816 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2817                                   struct pci_dev *pdev)
2818 {
2819         struct device_domain_info *info;
2820         unsigned long flags;
2821
2822         info = alloc_devinfo_mem();
2823         if (!info)
2824                 return -ENOMEM;
2825
2826         info->segment = pci_domain_nr(pdev->bus);
2827         info->bus = pdev->bus->number;
2828         info->devfn = pdev->devfn;
2829         info->dev = pdev;
2830         info->domain = domain;
2831
2832         spin_lock_irqsave(&device_domain_lock, flags);
2833         list_add(&info->link, &domain->devices);
2834         list_add(&info->global, &device_domain_list);
2835         pdev->dev.archdata.iommu = info;
2836         spin_unlock_irqrestore(&device_domain_lock, flags);
2837
2838         return 0;
2839 }
2840
2841 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
2842                                            struct pci_dev *pdev)
2843 {
2844         struct pci_dev *tmp, *parent;
2845
2846         if (!iommu || !pdev)
2847                 return;
2848
2849         /* dependent device detach */
2850         tmp = pci_find_upstream_pcie_bridge(pdev);
2851         /* Secondary interface's bus number and devfn 0 */
2852         if (tmp) {
2853                 parent = pdev->bus->self;
2854                 while (parent != tmp) {
2855                         iommu_detach_dev(iommu, parent->bus->number,
2856                                          parent->devfn);
2857                         parent = parent->bus->self;
2858                 }
2859                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
2860                         iommu_detach_dev(iommu,
2861                                 tmp->subordinate->number, 0);
2862                 else /* this is a legacy PCI bridge */
2863                         iommu_detach_dev(iommu, tmp->bus->number,
2864                                          tmp->devfn);
2865         }
2866 }
2867
2868 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2869                                           struct pci_dev *pdev)
2870 {
2871         struct device_domain_info *info;
2872         struct intel_iommu *iommu;
2873         unsigned long flags;
2874         int found = 0;
2875         struct list_head *entry, *tmp;
2876
2877         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
2878                                 pdev->devfn);
2879         if (!iommu)
2880                 return;
2881
2882         spin_lock_irqsave(&device_domain_lock, flags);
2883         list_for_each_safe(entry, tmp, &domain->devices) {
2884                 info = list_entry(entry, struct device_domain_info, link);
2885                 /* No need to compare PCI domain; it has to be the same */
2886                 if (info->bus == pdev->bus->number &&
2887                     info->devfn == pdev->devfn) {
2888                         list_del(&info->link);
2889                         list_del(&info->global);
2890                         if (info->dev)
2891                                 info->dev->dev.archdata.iommu = NULL;
2892                         spin_unlock_irqrestore(&device_domain_lock, flags);
2893
2894                         iommu_detach_dev(iommu, info->bus, info->devfn);
2895                         iommu_detach_dependent_devices(iommu, pdev);
2896                         free_devinfo_mem(info);
2897
2898                         spin_lock_irqsave(&device_domain_lock, flags);
2899
2900                         if (found)
2901                                 break;
2902                         else
2903                                 continue;
2904                 }
2905
2906                 /* if there are no other devices under the same iommu
2907                  * owned by this domain, clear this iommu in iommu_bmp and
2908                  * update the iommu count and coherency
2909                  */
2910                 if (iommu == device_to_iommu(info->segment, info->bus,
2911                                             info->devfn))
2912                         found = 1;
2913         }
2914
2915         if (found == 0) {
2916                 unsigned long tmp_flags;
2917                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2918                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2919                 domain->iommu_count--;
2920                 domain_update_iommu_cap(domain);
2921                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2922         }
2923
2924         spin_unlock_irqrestore(&device_domain_lock, flags);
2925 }
2926
2927 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2928 {
2929         struct device_domain_info *info;
2930         struct intel_iommu *iommu;
2931         unsigned long flags1, flags2;
2932
2933         spin_lock_irqsave(&device_domain_lock, flags1);
2934         while (!list_empty(&domain->devices)) {
2935                 info = list_entry(domain->devices.next,
2936                         struct device_domain_info, link);
2937                 list_del(&info->link);
2938                 list_del(&info->global);
2939                 if (info->dev)
2940                         info->dev->dev.archdata.iommu = NULL;
2941
2942                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2943
2944                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
2945                 iommu_detach_dev(iommu, info->bus, info->devfn);
2946                 iommu_detach_dependent_devices(iommu, info->dev);
2947
2948                 /* clear this iommu in iommu_bmp, update iommu count
2949                  * and capabilities
2950                  */
2951                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2952                 if (test_and_clear_bit(iommu->seq_id,
2953                                        &domain->iommu_bmp)) {
2954                         domain->iommu_count--;
2955                         domain_update_iommu_cap(domain);
2956                 }
2957                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2958
2959                 free_devinfo_mem(info);
2960                 spin_lock_irqsave(&device_domain_lock, flags1);
2961         }
2962         spin_unlock_irqrestore(&device_domain_lock, flags1);
2963 }
2964
2965 /* domain id for virtual machine, it won't be set in context */
2966 static unsigned long vm_domid;
2967
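/*
 * Return the smallest adjusted guest address width among the domain itself
 * and every IOMMU it spans, i.e. the most restrictive width a mapping must
 * fit within.
 */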
2968 static int vm_domain_min_agaw(struct dmar_domain *domain)
2969 {
2970         int i;
2971         int min_agaw = domain->agaw;
2972
2973         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2974         for (; i < g_num_of_iommus; ) {
2975                 if (min_agaw > g_iommus[i]->agaw)
2976                         min_agaw = g_iommus[i]->agaw;
2977
2978                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2979         }
2980
2981         return min_agaw;
2982 }
2983
2984 static struct dmar_domain *iommu_alloc_vm_domain(void)
2985 {
2986         struct dmar_domain *domain;
2987
2988         domain = alloc_domain_mem();
2989         if (!domain)
2990                 return NULL;
2991
2992         domain->id = vm_domid++;
2993         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2994         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2995
2996         return domain;
2997 }
2998
2999 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
3000 {
3001         int adjust_width;
3002
3003         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3004         spin_lock_init(&domain->mapping_lock);
3005         spin_lock_init(&domain->iommu_lock);
3006
3007         domain_reserve_special_ranges(domain);
3008
3009         /* calculate AGAW */
3010         domain->gaw = guest_width;
3011         adjust_width = guestwidth_to_adjustwidth(guest_width);
3012         domain->agaw = width_to_agaw(adjust_width);
3013
3014         INIT_LIST_HEAD(&domain->devices);
3015
3016         domain->iommu_count = 0;
3017         domain->iommu_coherency = 0;
3018         domain->max_addr = 0;
3019
3020         /* always allocate the top pgd */
3021         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
3022         if (!domain->pgd)
3023                 return -ENOMEM;
3024         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3025         return 0;
3026 }
3027
3028 static void iommu_free_vm_domain(struct dmar_domain *domain)
3029 {
3030         unsigned long flags;
3031         struct dmar_drhd_unit *drhd;
3032         struct intel_iommu *iommu;
3033         unsigned long i;
3034         unsigned long ndomains;
3035
3036         for_each_drhd_unit(drhd) {
3037                 if (drhd->ignored)
3038                         continue;
3039                 iommu = drhd->iommu;
3040
3041                 ndomains = cap_ndoms(iommu->cap);
3042                 i = find_first_bit(iommu->domain_ids, ndomains);
3043                 for (; i < ndomains; ) {
3044                         if (iommu->domains[i] == domain) {
3045                                 spin_lock_irqsave(&iommu->lock, flags);
3046                                 clear_bit(i, iommu->domain_ids);
3047                                 iommu->domains[i] = NULL;
3048                                 spin_unlock_irqrestore(&iommu->lock, flags);
3049                                 break;
3050                         }
3051                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3052                 }
3053         }
3054 }
3055
3056 static void vm_domain_exit(struct dmar_domain *domain)
3057 {
3058         u64 end;
3059
3060         /* Domain 0 is reserved, so don't process it */
3061         if (!domain)
3062                 return;
3063
3064         vm_domain_remove_all_dev_info(domain);
3065         /* destroy iovas */
3066         put_iova_domain(&domain->iovad);
3067         end = DOMAIN_MAX_ADDR(domain->gaw);
3068         end = end & (~VTD_PAGE_MASK);
3069
3070         /* clear ptes */
3071         dma_pte_clear_range(domain, 0, end);
3072
3073         /* free page tables */
3074         dma_pte_free_pagetable(domain, 0, end);
3075
3076         iommu_free_vm_domain(domain);
3077         free_domain_mem(domain);
3078 }
3079
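/*
 * The functions below back the generic iommu API registered via
 * register_iommu(&intel_iommu_ops) in intel_iommu_init().  A rough usage
 * sketch from a consumer's point of view (the entry-point names and exact
 * signatures of the generic layer are assumed here, not verified against
 * this tree):
 *
 *	domain = iommu_domain_alloc();
 *	iommu_attach_device(domain, &pdev->dev);
 *	iommu_map_range(domain, iova, hpa, size, IOMMU_READ | IOMMU_WRITE);
 */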
3080 static int intel_iommu_domain_init(struct iommu_domain *domain)
3081 {
3082         struct dmar_domain *dmar_domain;
3083
3084         dmar_domain = iommu_alloc_vm_domain();
3085         if (!dmar_domain) {
3086                 printk(KERN_ERR
3087                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3088                 return -ENOMEM;
3089         }
3090         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3091                 printk(KERN_ERR
3092                         "intel_iommu_domain_init() failed\n");
3093                 vm_domain_exit(dmar_domain);
3094                 return -ENOMEM;
3095         }
3096         domain->priv = dmar_domain;
3097
3098         return 0;
3099 }
3100
3101 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3102 {
3103         struct dmar_domain *dmar_domain = domain->priv;
3104
3105         domain->priv = NULL;
3106         vm_domain_exit(dmar_domain);
3107 }
3108
3109 static int intel_iommu_attach_device(struct iommu_domain *domain,
3110                                      struct device *dev)
3111 {
3112         struct dmar_domain *dmar_domain = domain->priv;
3113         struct pci_dev *pdev = to_pci_dev(dev);
3114         struct intel_iommu *iommu;
3115         int addr_width;
3116         u64 end;
3117         int ret;
3118
3119         /* normally pdev is not mapped */
3120         if (unlikely(domain_context_mapped(pdev))) {
3121                 struct dmar_domain *old_domain;
3122
3123                 old_domain = find_domain(pdev);
3124                 if (old_domain) {
3125                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3126                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3127                         else
3128                                 domain_remove_dev_info(old_domain);
3129                 }
3130         }
3131
3132         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3133                                 pdev->devfn);
3134         if (!iommu)
3135                 return -ENODEV;
3136
3137         /* check if this iommu agaw is sufficient for max mapped address */
3138         addr_width = agaw_to_width(iommu->agaw);
3139         end = DOMAIN_MAX_ADDR(addr_width);
3140         end = end & VTD_PAGE_MASK;
3141         if (end < dmar_domain->max_addr) {
3142                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3143                        "sufficient for the mapped address (%llx)\n",
3144                        __func__, iommu->agaw, dmar_domain->max_addr);
3145                 return -EFAULT;
3146         }
3147
3148         ret = domain_context_mapping(dmar_domain, pdev);
3149         if (ret)
3150                 return ret;
3151
3152         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3153         return ret;
3154 }
3155
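/* Generic IOMMU API: detach a PCI device and clear its context mapping. */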
3156 static void intel_iommu_detach_device(struct iommu_domain *domain,
3157                                       struct device *dev)
3158 {
3159         struct dmar_domain *dmar_domain = domain->priv;
3160         struct pci_dev *pdev = to_pci_dev(dev);
3161
3162         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3163 }
3164
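/*
 * Generic IOMMU API: map the physical range starting at hpa to the given
 * IOVA with the requested protection.  If the mapping raises the domain's
 * highest address, verify it still fits within the narrowest attached
 * IOMMU's address width before creating the page-table entries.
 */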
3165 static int intel_iommu_map_range(struct iommu_domain *domain,
3166                                  unsigned long iova, phys_addr_t hpa,
3167                                  size_t size, int iommu_prot)
3168 {
3169         struct dmar_domain *dmar_domain = domain->priv;
3170         u64 max_addr;
3171         int addr_width;
3172         int prot = 0;
3173         int ret;
3174
3175         if (iommu_prot & IOMMU_READ)
3176                 prot |= DMA_PTE_READ;
3177         if (iommu_prot & IOMMU_WRITE)
3178                 prot |= DMA_PTE_WRITE;
3179         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3180                 prot |= DMA_PTE_SNP;
3181
3182         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3183         if (dmar_domain->max_addr < max_addr) {
3184                 int min_agaw;
3185                 u64 end;
3186
3187                 /* check if minimum agaw is sufficient for mapped address */
3188                 min_agaw = vm_domain_min_agaw(dmar_domain);
3189                 addr_width = agaw_to_width(min_agaw);
3190                 end = DOMAIN_MAX_ADDR(addr_width);
3191                 end = end & VTD_PAGE_MASK;
3192                 if (end < max_addr) {
3193                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3194                                "sufficient for the mapped address (%llx)\n",
3195                                __func__, min_agaw, max_addr);
3196                         return -EFAULT;
3197                 }
3198                 dmar_domain->max_addr = max_addr;
3199         }
3200
3201         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3202         return ret;
3203 }
3204
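/*
 * Generic IOMMU API: clear the PTEs covering the (page-aligned) range and
 * pull the domain's max_addr back if the top of the mapped area was just
 * removed.
 */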
3205 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3206                                     unsigned long iova, size_t size)
3207 {
3208         struct dmar_domain *dmar_domain = domain->priv;
3209         dma_addr_t base;
3210
3211         /* The address might not be aligned */
3212         base = iova & VTD_PAGE_MASK;
3213         size = VTD_PAGE_ALIGN(size);
3214         dma_pte_clear_range(dmar_domain, base, base + size);
3215
3216         if (dmar_domain->max_addr == base + size)
3217                 dmar_domain->max_addr = base;
3218 }
3219
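/*
 * Generic IOMMU API: return the host physical address that an IOVA
 * translates to in this domain, or 0 if nothing is mapped there.
 */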
3220 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3221                                             unsigned long iova)
3222 {
3223         struct dmar_domain *dmar_domain = domain->priv;
3224         struct dma_pte *pte;
3225         u64 phys = 0;
3226
3227         pte = addr_to_dma_pte(dmar_domain, iova);
3228         if (pte)
3229                 phys = dma_pte_addr(pte);
3230
3231         return phys;
3232 }
3233
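/*
 * Generic IOMMU API: report per-domain capabilities; only snoop-control
 * based cache coherency is advertised here.
 */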
3234 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3235                                       unsigned long cap)
3236 {
3237         struct dmar_domain *dmar_domain = domain->priv;
3238
3239         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3240                 return dmar_domain->iommu_snooping;
3241
3242         return 0;
3243 }
3244
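/*
 * Callback table handed to the generic IOMMU layer (linux/iommu.h); it is
 * registered with register_iommu() during driver initialisation.
 */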
3245 static struct iommu_ops intel_iommu_ops = {
3246         .domain_init    = intel_iommu_domain_init,
3247         .domain_destroy = intel_iommu_domain_destroy,
3248         .attach_dev     = intel_iommu_attach_device,
3249         .detach_dev     = intel_iommu_detach_device,
3250         .map            = intel_iommu_map_range,
3251         .unmap          = intel_iommu_unmap_range,
3252         .iova_to_phys   = intel_iommu_iova_to_phys,
3253         .domain_has_cap = intel_iommu_domain_has_cap,
3254 };
3255
3256 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3257 {
3258         /*
3259          * Mobile 4 Series Chipset neglects to set RWBF capability,
3260          * but needs it:
3261          */
3262         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3263         rwbf_quirk = 1;
3264 }
3265
3266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);