Change Intel IOMMU APIs of virtual machine domain
drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
59
60 /* global iommu list, set NULL for ignored DMAR units */
61 static struct intel_iommu **g_iommus;
62
63 /*
64  * 0: Present
65  * 1-11: Reserved
66  * 12-63: Context Ptr (12 - (haw-1))
67  * 64-127: Reserved
68  */
69 struct root_entry {
70         u64     val;
71         u64     rsvd1;
72 };
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
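/*
 * Editor's note (illustrative arithmetic): with VTD_PAGE_SIZE = 4096 and
 * sizeof(struct root_entry) = 16, ROOT_ENTRY_NR works out to 256 -- one
 * root entry per possible PCI bus number, so indexing root_entry[bus]
 * below is always in range for an 8-bit bus.
 */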
74 static inline bool root_present(struct root_entry *root)
75 {
76         return (root->val & 1);
77 }
78 static inline void set_root_present(struct root_entry *root)
79 {
80         root->val |= 1;
81 }
82 static inline void set_root_value(struct root_entry *root, unsigned long value)
83 {
84         root->val |= value & VTD_PAGE_MASK;
85 }
86
87 static inline struct context_entry *
88 get_context_addr_from_root(struct root_entry *root)
89 {
90         return (struct context_entry *)
91                 (root_present(root)?phys_to_virt(
92                 root->val & VTD_PAGE_MASK) :
93                 NULL);
94 }
95
96 /*
97  * low 64 bits:
98  * 0: present
99  * 1: fault processing disable
100  * 2-3: translation type
101  * 12-63: address space root
102  * high 64 bits:
103  * 0-2: address width
104  * 3-6: aval
105  * 8-23: domain id
106  */
107 struct context_entry {
108         u64 lo;
109         u64 hi;
110 };
111
112 static inline bool context_present(struct context_entry *context)
113 {
114         return (context->lo & 1);
115 }
116 static inline void context_set_present(struct context_entry *context)
117 {
118         context->lo |= 1;
119 }
120
121 static inline void context_set_fault_enable(struct context_entry *context)
122 {
123         context->lo &= (((u64)-1) << 2) | 1;
124 }
125
126 #define CONTEXT_TT_MULTI_LEVEL 0
127
128 static inline void context_set_translation_type(struct context_entry *context,
129                                                 unsigned long value)
130 {
131         context->lo &= (((u64)-1) << 4) | 3;
132         context->lo |= (value & 3) << 2;
133 }
134
135 static inline void context_set_address_root(struct context_entry *context,
136                                             unsigned long value)
137 {
138         context->lo |= value & VTD_PAGE_MASK;
139 }
140
141 static inline void context_set_address_width(struct context_entry *context,
142                                              unsigned long value)
143 {
144         context->hi |= value & 7;
145 }
146
147 static inline void context_set_domain_id(struct context_entry *context,
148                                          unsigned long value)
149 {
150         context->hi |= (value & ((1 << 16) - 1)) << 8;
151 }
152
153 static inline void context_clear_entry(struct context_entry *context)
154 {
155         context->lo = 0;
156         context->hi = 0;
157 }
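/*
 * Illustrative sketch (not driver code): how the helpers above compose
 * into a live context entry, mirroring what domain_context_mapping_one()
 * does later in this file.  The function name and parameters here are
 * hypothetical examples only.
 */
static inline void example_fill_context(struct context_entry *ce,
					unsigned long pgd_phys,
					unsigned long did,
					unsigned long agaw)
{
	context_clear_entry(ce);
	context_set_domain_id(ce, did);
	context_set_address_width(ce, agaw);
	context_set_address_root(ce, pgd_phys);
	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(ce);
	context_set_present(ce);	/* last: marks the entry valid */
}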
158
159 /*
160  * 0: readable
161  * 1: writable
162  * 2-6: reserved
163  * 7: super page
164  * 8-11: available
165  * 12-63: Host physical address
166  */
167 struct dma_pte {
168         u64 val;
169 };
170
171 static inline void dma_clear_pte(struct dma_pte *pte)
172 {
173         pte->val = 0;
174 }
175
176 static inline void dma_set_pte_readable(struct dma_pte *pte)
177 {
178         pte->val |= DMA_PTE_READ;
179 }
180
181 static inline void dma_set_pte_writable(struct dma_pte *pte)
182 {
183         pte->val |= DMA_PTE_WRITE;
184 }
185
186 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
187 {
188         pte->val = (pte->val & ~3) | (prot & 3);
189 }
190
191 static inline u64 dma_pte_addr(struct dma_pte *pte)
192 {
193         return (pte->val & VTD_PAGE_MASK);
194 }
195
196 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
197 {
198         pte->val |= (addr & VTD_PAGE_MASK);
199 }
200
201 static inline bool dma_pte_present(struct dma_pte *pte)
202 {
203         return (pte->val & 3) != 0;
204 }
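/*
 * Illustrative sketch (not driver code): building one leaf PTE with the
 * helpers above, the way domain_page_mapping() does further down.  The
 * function name and parameters are hypothetical examples only.
 */
static inline void example_fill_pte(struct dma_pte *pte, u64 host_pfn)
{
	dma_clear_pte(pte);
	dma_set_pte_addr(pte, host_pfn << VTD_PAGE_SHIFT);
	dma_set_pte_prot(pte, DMA_PTE_READ | DMA_PTE_WRITE);
	/* dma_pte_present() is now true since bits 0-1 are set */
}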
205
206 /* devices under the same p2p bridge are owned in one domain */
207 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
208
209 /* domain represents a virtual machine; more than one device
210  * across iommus may be owned by one domain, e.g. a kvm guest.
211  */
212 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
213
214 struct dmar_domain {
215         int     id;                     /* domain id */
216         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
217
218         struct list_head devices;       /* all devices' list */
219         struct iova_domain iovad;       /* iova's that belong to this domain */
220
221         struct dma_pte  *pgd;           /* virtual address */
222         spinlock_t      mapping_lock;   /* page table lock */
223         int             gaw;            /* max guest address width */
224
225         /* adjusted guest address width, 0 is level 2 30-bit */
226         int             agaw;
227
228         int             flags;          /* flags to find out type of domain */
229
230         int             iommu_coherency;/* indicate coherency of iommu access */
231         int             iommu_count;    /* reference count of iommu */
232         spinlock_t      iommu_lock;     /* protect iommu set in domain */
233 };
234
235 /* PCI domain-device relationship */
236 struct device_domain_info {
237         struct list_head link;  /* link to domain siblings */
238         struct list_head global; /* link to global list */
239         u8 bus;                 /* PCI bus number */
240         u8 devfn;               /* PCI devfn number */
241         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
242         struct dmar_domain *domain; /* pointer to domain */
243 };
244
245 static void flush_unmaps_timeout(unsigned long data);
246
247 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
248
249 #define HIGH_WATER_MARK 250
250 struct deferred_flush_tables {
251         int next;
252         struct iova *iova[HIGH_WATER_MARK];
253         struct dmar_domain *domain[HIGH_WATER_MARK];
254 };
255
256 static struct deferred_flush_tables *deferred_flush;
257
258 /* number of iommus, used to size g_iommus and bound iommu bitmap searches */
259 static int g_num_of_iommus;
260
261 static DEFINE_SPINLOCK(async_umap_flush_lock);
262 static LIST_HEAD(unmaps_to_do);
263
264 static int timer_on;
265 static long list_size;
266
267 static void domain_remove_dev_info(struct dmar_domain *domain);
268
269 int dmar_disabled;
270 static int __initdata dmar_map_gfx = 1;
271 static int dmar_forcedac;
272 static int intel_iommu_strict;
273
274 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
275 static DEFINE_SPINLOCK(device_domain_lock);
276 static LIST_HEAD(device_domain_list);
277
278 static int __init intel_iommu_setup(char *str)
279 {
280         if (!str)
281                 return -EINVAL;
282         while (*str) {
283                 if (!strncmp(str, "off", 3)) {
284                         dmar_disabled = 1;
285                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
286                 } else if (!strncmp(str, "igfx_off", 8)) {
287                         dmar_map_gfx = 0;
288                         printk(KERN_INFO
289                                 "Intel-IOMMU: disable GFX device mapping\n");
290                 } else if (!strncmp(str, "forcedac", 8)) {
291                         printk(KERN_INFO
292                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
293                         dmar_forcedac = 1;
294                 } else if (!strncmp(str, "strict", 6)) {
295                         printk(KERN_INFO
296                                 "Intel-IOMMU: disable batched IOTLB flush\n");
297                         intel_iommu_strict = 1;
298                 }
299
300                 str += strcspn(str, ",");
301                 while (*str == ',')
302                         str++;
303         }
304         return 0;
305 }
306 __setup("intel_iommu=", intel_iommu_setup);
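/*
 * Editor's note: intel_iommu_setup() above parses a comma-separated
 * option list from the kernel command line, e.g.
 *
 *	intel_iommu=off			disable DMA remapping entirely
 *	intel_iommu=igfx_off		leave the graphics device unmapped
 *	intel_iommu=forcedac,strict	force DAC (64-bit) DMA addressing and
 *					disable batched IOTLB flushing
 */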
307
308 static struct kmem_cache *iommu_domain_cache;
309 static struct kmem_cache *iommu_devinfo_cache;
310 static struct kmem_cache *iommu_iova_cache;
311
312 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
313 {
314         unsigned int flags;
315         void *vaddr;
316
317         /* trying to avoid low memory issues */
318         flags = current->flags & PF_MEMALLOC;
319         current->flags |= PF_MEMALLOC;
320         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
321         current->flags &= (~PF_MEMALLOC | flags);
322         return vaddr;
323 }
324
325
326 static inline void *alloc_pgtable_page(void)
327 {
328         unsigned int flags;
329         void *vaddr;
330
331         /* trying to avoid low memory issues */
332         flags = current->flags & PF_MEMALLOC;
333         current->flags |= PF_MEMALLOC;
334         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
335         current->flags &= (~PF_MEMALLOC | flags);
336         return vaddr;
337 }
338
339 static inline void free_pgtable_page(void *vaddr)
340 {
341         free_page((unsigned long)vaddr);
342 }
343
344 static inline void *alloc_domain_mem(void)
345 {
346         return iommu_kmem_cache_alloc(iommu_domain_cache);
347 }
348
349 static void free_domain_mem(void *vaddr)
350 {
351         kmem_cache_free(iommu_domain_cache, vaddr);
352 }
353
354 static inline void * alloc_devinfo_mem(void)
355 {
356         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
357 }
358
359 static inline void free_devinfo_mem(void *vaddr)
360 {
361         kmem_cache_free(iommu_devinfo_cache, vaddr);
362 }
363
364 struct iova *alloc_iova_mem(void)
365 {
366         return iommu_kmem_cache_alloc(iommu_iova_cache);
367 }
368
369 void free_iova_mem(struct iova *iova)
370 {
371         kmem_cache_free(iommu_iova_cache, iova);
372 }
373
374
375 static inline int width_to_agaw(int width);
376
377 /* calculate agaw for each iommu.
378  * "SAGAW" may be different across iommus, so use a default agaw and
379  * fall back to a smaller supported agaw for iommus that don't support it.
380  */
381 int iommu_calculate_agaw(struct intel_iommu *iommu)
382 {
383         unsigned long sagaw;
384         int agaw = -1;
385
386         sagaw = cap_sagaw(iommu->cap);
387         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
388              agaw >= 0; agaw--) {
389                 if (test_bit(agaw, &sagaw))
390                         break;
391         }
392
393         return agaw;
394 }
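/*
 * Worked example: DEFAULT_DOMAIN_ADDRESS_WIDTH is 48 and
 * width_to_agaw(48) = (48 - 30) / 9 = 2, so the loop starts at SAGAW
 * bit 2 (4-level tables).  If the hardware only advertises, say, bit 1
 * (3-level, 39-bit), the loop walks down and returns agaw = 1; if no
 * bit at or below the default is set, -1 is returned.
 */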
395
396 /* in the native case, each domain is associated with only one iommu */
397 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
398 {
399         int iommu_id;
400
401         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
402
403         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
404         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
405                 return NULL;
406
407         return g_iommus[iommu_id];
408 }
409
410 /* "Coherency" capability may be different across iommus */
411 static void domain_update_iommu_coherency(struct dmar_domain *domain)
412 {
413         int i;
414
415         domain->iommu_coherency = 1;
416
417         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
418         for (; i < g_num_of_iommus; ) {
419                 if (!ecap_coherent(g_iommus[i]->ecap)) {
420                         domain->iommu_coherency = 0;
421                         break;
422                 }
423                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
424         }
425 }
426
427 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
428 {
429         struct dmar_drhd_unit *drhd = NULL;
430         int i;
431
432         for_each_drhd_unit(drhd) {
433                 if (drhd->ignored)
434                         continue;
435
436                 for (i = 0; i < drhd->devices_cnt; i++)
437                         if (drhd->devices[i]->bus->number == bus &&
438                             drhd->devices[i]->devfn == devfn)
439                                 return drhd->iommu;
440
441                 if (drhd->include_all)
442                         return drhd->iommu;
443         }
444
445         return NULL;
446 }
447
448 static void domain_flush_cache(struct dmar_domain *domain,
449                                void *addr, int size)
450 {
451         if (!domain->iommu_coherency)
452                 clflush_cache_range(addr, size);
453 }
454
455 /* Gets context entry for a given bus and devfn */
456 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
457                 u8 bus, u8 devfn)
458 {
459         struct root_entry *root;
460         struct context_entry *context;
461         unsigned long phy_addr;
462         unsigned long flags;
463
464         spin_lock_irqsave(&iommu->lock, flags);
465         root = &iommu->root_entry[bus];
466         context = get_context_addr_from_root(root);
467         if (!context) {
468                 context = (struct context_entry *)alloc_pgtable_page();
469                 if (!context) {
470                         spin_unlock_irqrestore(&iommu->lock, flags);
471                         return NULL;
472                 }
473                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
474                 phy_addr = virt_to_phys((void *)context);
475                 set_root_value(root, phy_addr);
476                 set_root_present(root);
477                 __iommu_flush_cache(iommu, root, sizeof(*root));
478         }
479         spin_unlock_irqrestore(&iommu->lock, flags);
480         return &context[devfn];
481 }
482
483 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
484 {
485         struct root_entry *root;
486         struct context_entry *context;
487         int ret;
488         unsigned long flags;
489
490         spin_lock_irqsave(&iommu->lock, flags);
491         root = &iommu->root_entry[bus];
492         context = get_context_addr_from_root(root);
493         if (!context) {
494                 ret = 0;
495                 goto out;
496         }
497         ret = context_present(&context[devfn]);
498 out:
499         spin_unlock_irqrestore(&iommu->lock, flags);
500         return ret;
501 }
502
503 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
504 {
505         struct root_entry *root;
506         struct context_entry *context;
507         unsigned long flags;
508
509         spin_lock_irqsave(&iommu->lock, flags);
510         root = &iommu->root_entry[bus];
511         context = get_context_addr_from_root(root);
512         if (context) {
513                 context_clear_entry(&context[devfn]);
514                 __iommu_flush_cache(iommu, &context[devfn], \
515                         sizeof(*context));
516         }
517         spin_unlock_irqrestore(&iommu->lock, flags);
518 }
519
520 static void free_context_table(struct intel_iommu *iommu)
521 {
522         struct root_entry *root;
523         int i;
524         unsigned long flags;
525         struct context_entry *context;
526
527         spin_lock_irqsave(&iommu->lock, flags);
528         if (!iommu->root_entry) {
529                 goto out;
530         }
531         for (i = 0; i < ROOT_ENTRY_NR; i++) {
532                 root = &iommu->root_entry[i];
533                 context = get_context_addr_from_root(root);
534                 if (context)
535                         free_pgtable_page(context);
536         }
537         free_pgtable_page(iommu->root_entry);
538         iommu->root_entry = NULL;
539 out:
540         spin_unlock_irqrestore(&iommu->lock, flags);
541 }
542
543 /* page table handling */
544 #define LEVEL_STRIDE            (9)
545 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
546
547 static inline int agaw_to_level(int agaw)
548 {
549         return agaw + 2;
550 }
551
552 static inline int agaw_to_width(int agaw)
553 {
554         return 30 + agaw * LEVEL_STRIDE;
555
556 }
557
558 static inline int width_to_agaw(int width)
559 {
560         return (width - 30) / LEVEL_STRIDE;
561 }
562
563 static inline unsigned int level_to_offset_bits(int level)
564 {
565         return (12 + (level - 1) * LEVEL_STRIDE);
566 }
567
568 static inline int address_level_offset(u64 addr, int level)
569 {
570         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
571 }
572
573 static inline u64 level_mask(int level)
574 {
575         return ((u64)-1 << level_to_offset_bits(level));
576 }
577
578 static inline u64 level_size(int level)
579 {
580         return ((u64)1 << level_to_offset_bits(level));
581 }
582
583 static inline u64 align_to_level(u64 addr, int level)
584 {
585         return ((addr + level_size(level) - 1) & level_mask(level));
586 }
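/*
 * Worked example for the helpers above, assuming agaw = 2 (48-bit width):
 * agaw_to_level(2) = 4, so the walk in addr_to_dma_pte() below starts at
 * level 4.  level_to_offset_bits() gives 12, 21, 30 and 39 for levels 1-4,
 * each level indexing LEVEL_STRIDE = 9 address bits, and level_size(2) =
 * 1 << 21 = 2MB is the range covered by a single level-2 entry.
 */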
587
588 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
589 {
590         int addr_width = agaw_to_width(domain->agaw);
591         struct dma_pte *parent, *pte = NULL;
592         int level = agaw_to_level(domain->agaw);
593         int offset;
594         unsigned long flags;
595
596         BUG_ON(!domain->pgd);
597
598         addr &= (((u64)1) << addr_width) - 1;
599         parent = domain->pgd;
600
601         spin_lock_irqsave(&domain->mapping_lock, flags);
602         while (level > 0) {
603                 void *tmp_page;
604
605                 offset = address_level_offset(addr, level);
606                 pte = &parent[offset];
607                 if (level == 1)
608                         break;
609
610                 if (!dma_pte_present(pte)) {
611                         tmp_page = alloc_pgtable_page();
612
613                         if (!tmp_page) {
614                                 spin_unlock_irqrestore(&domain->mapping_lock,
615                                         flags);
616                                 return NULL;
617                         }
618                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
619                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
620                         /*
621                          * high level table always sets r/w, last level page
622                          * table control read/write
623                          */
624                         dma_set_pte_readable(pte);
625                         dma_set_pte_writable(pte);
626                         domain_flush_cache(domain, pte, sizeof(*pte));
627                 }
628                 parent = phys_to_virt(dma_pte_addr(pte));
629                 level--;
630         }
631
632         spin_unlock_irqrestore(&domain->mapping_lock, flags);
633         return pte;
634 }
635
636 /* return address's pte at specific level */
637 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
638                 int level)
639 {
640         struct dma_pte *parent, *pte = NULL;
641         int total = agaw_to_level(domain->agaw);
642         int offset;
643
644         parent = domain->pgd;
645         while (level <= total) {
646                 offset = address_level_offset(addr, total);
647                 pte = &parent[offset];
648                 if (level == total)
649                         return pte;
650
651                 if (!dma_pte_present(pte))
652                         break;
653                 parent = phys_to_virt(dma_pte_addr(pte));
654                 total--;
655         }
656         return NULL;
657 }
658
659 /* clear one page's page table */
660 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
661 {
662         struct dma_pte *pte = NULL;
663
664         /* get last level pte */
665         pte = dma_addr_level_pte(domain, addr, 1);
666
667         if (pte) {
668                 dma_clear_pte(pte);
669                 domain_flush_cache(domain, pte, sizeof(*pte));
670         }
671 }
672
673 /* clear last level pte, a tlb flush should be followed */
674 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
675 {
676         int addr_width = agaw_to_width(domain->agaw);
677
678         start &= (((u64)1) << addr_width) - 1;
679         end &= (((u64)1) << addr_width) - 1;
680         /* in case it's partial page */
681         start = PAGE_ALIGN(start);
682         end &= PAGE_MASK;
683
684         /* we don't need lock here, nobody else touches the iova range */
685         while (start < end) {
686                 dma_pte_clear_one(domain, start);
687                 start += VTD_PAGE_SIZE;
688         }
689 }
690
691 /* free page table pages. last level pte should already be cleared */
692 static void dma_pte_free_pagetable(struct dmar_domain *domain,
693         u64 start, u64 end)
694 {
695         int addr_width = agaw_to_width(domain->agaw);
696         struct dma_pte *pte;
697         int total = agaw_to_level(domain->agaw);
698         int level;
699         u64 tmp;
700
701         start &= (((u64)1) << addr_width) - 1;
702         end &= (((u64)1) << addr_width) - 1;
703
704         /* we don't need lock here, nobody else touches the iova range */
705         level = 2;
706         while (level <= total) {
707                 tmp = align_to_level(start, level);
708                 if (tmp >= end || (tmp + level_size(level) > end))
709                         return;
710
711                 while (tmp < end) {
712                         pte = dma_addr_level_pte(domain, tmp, level);
713                         if (pte) {
714                                 free_pgtable_page(
715                                         phys_to_virt(dma_pte_addr(pte)));
716                                 dma_clear_pte(pte);
717                                 domain_flush_cache(domain, pte, sizeof(*pte));
718                         }
719                         tmp += level_size(level);
720                 }
721                 level++;
722         }
723         /* free pgd */
724         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
725                 free_pgtable_page(domain->pgd);
726                 domain->pgd = NULL;
727         }
728 }
729
730 /* iommu handling */
731 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
732 {
733         struct root_entry *root;
734         unsigned long flags;
735
736         root = (struct root_entry *)alloc_pgtable_page();
737         if (!root)
738                 return -ENOMEM;
739
740         __iommu_flush_cache(iommu, root, ROOT_SIZE);
741
742         spin_lock_irqsave(&iommu->lock, flags);
743         iommu->root_entry = root;
744         spin_unlock_irqrestore(&iommu->lock, flags);
745
746         return 0;
747 }
748
749 static void iommu_set_root_entry(struct intel_iommu *iommu)
750 {
751         void *addr;
752         u32 cmd, sts;
753         unsigned long flag;
754
755         addr = iommu->root_entry;
756
757         spin_lock_irqsave(&iommu->register_lock, flag);
758         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
759
760         cmd = iommu->gcmd | DMA_GCMD_SRTP;
761         writel(cmd, iommu->reg + DMAR_GCMD_REG);
762
763         /* Make sure hardware completes it */
764         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
765                 readl, (sts & DMA_GSTS_RTPS), sts);
766
767         spin_unlock_irqrestore(&iommu->register_lock, flag);
768 }
769
770 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
771 {
772         u32 val;
773         unsigned long flag;
774
775         if (!cap_rwbf(iommu->cap))
776                 return;
777         val = iommu->gcmd | DMA_GCMD_WBF;
778
779         spin_lock_irqsave(&iommu->register_lock, flag);
780         writel(val, iommu->reg + DMAR_GCMD_REG);
781
782         /* Make sure hardware completes it */
783         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
784                         readl, (!(val & DMA_GSTS_WBFS)), val);
785
786         spin_unlock_irqrestore(&iommu->register_lock, flag);
787 }
788
789 /* return value determines whether we need a write buffer flush */
790 static int __iommu_flush_context(struct intel_iommu *iommu,
791         u16 did, u16 source_id, u8 function_mask, u64 type,
792         int non_present_entry_flush)
793 {
794         u64 val = 0;
795         unsigned long flag;
796
797         /*
798          * In the non-present entry flush case, if hardware doesn't cache
799          * non-present entries we do nothing; if it does cache them, we flush
800          * entries of domain 0 (domain id 0 is used to tag any cached
801          * non-present entries)
802          */
803         if (non_present_entry_flush) {
804                 if (!cap_caching_mode(iommu->cap))
805                         return 1;
806                 else
807                         did = 0;
808         }
809
810         switch (type) {
811         case DMA_CCMD_GLOBAL_INVL:
812                 val = DMA_CCMD_GLOBAL_INVL;
813                 break;
814         case DMA_CCMD_DOMAIN_INVL:
815                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
816                 break;
817         case DMA_CCMD_DEVICE_INVL:
818                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
819                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
820                 break;
821         default:
822                 BUG();
823         }
824         val |= DMA_CCMD_ICC;
825
826         spin_lock_irqsave(&iommu->register_lock, flag);
827         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
828
829         /* Make sure hardware completes it */
830         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
831                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
832
833         spin_unlock_irqrestore(&iommu->register_lock, flag);
834
835         /* flush context entry will implicitly flush write buffer */
836         return 0;
837 }
838
839 /* return value determines whether we need a write buffer flush */
840 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
841         u64 addr, unsigned int size_order, u64 type,
842         int non_present_entry_flush)
843 {
844         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
845         u64 val = 0, val_iva = 0;
846         unsigned long flag;
847
848         /*
849          * In the non-present entry flush case, if hardware doesn't cache
850          * non-present entries we do nothing; if it does cache them, we flush
851          * entries of domain 0 (domain id 0 is used to tag any cached
852          * non-present entries)
853          */
854         if (non_present_entry_flush) {
855                 if (!cap_caching_mode(iommu->cap))
856                         return 1;
857                 else
858                         did = 0;
859         }
860
861         switch (type) {
862         case DMA_TLB_GLOBAL_FLUSH:
863                 /* global flush doesn't need set IVA_REG */
864                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
865                 break;
866         case DMA_TLB_DSI_FLUSH:
867                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
868                 break;
869         case DMA_TLB_PSI_FLUSH:
870                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
871                 /* Note: always flush non-leaf currently */
872                 val_iva = size_order | addr;
873                 break;
874         default:
875                 BUG();
876         }
877         /* Note: set drain read/write */
878 #if 0
879         /*
880          * This is probably only needed to be extra safe; it looks like we
881          * can ignore it without any impact.
882          */
883         if (cap_read_drain(iommu->cap))
884                 val |= DMA_TLB_READ_DRAIN;
885 #endif
886         if (cap_write_drain(iommu->cap))
887                 val |= DMA_TLB_WRITE_DRAIN;
888
889         spin_lock_irqsave(&iommu->register_lock, flag);
890         /* Note: Only uses first TLB reg currently */
891         if (val_iva)
892                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
893         dmar_writeq(iommu->reg + tlb_offset + 8, val);
894
895         /* Make sure hardware completes it */
896         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
897                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
898
899         spin_unlock_irqrestore(&iommu->register_lock, flag);
900
901         /* check IOTLB invalidation granularity */
902         if (DMA_TLB_IAIG(val) == 0)
903                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
904         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
905                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
906                         (unsigned long long)DMA_TLB_IIRG(type),
907                         (unsigned long long)DMA_TLB_IAIG(val));
908         /* flush iotlb entry will implicitly flush write buffer */
909         return 0;
910 }
911
912 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
913         u64 addr, unsigned int pages, int non_present_entry_flush)
914 {
915         unsigned int mask;
916
917         BUG_ON(addr & (~VTD_PAGE_MASK));
918         BUG_ON(pages == 0);
919
920         /* Fallback to domain selective flush if no PSI support */
921         if (!cap_pgsel_inv(iommu->cap))
922                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
923                                                 DMA_TLB_DSI_FLUSH,
924                                                 non_present_entry_flush);
925
926         /*
927          * PSI requires the region size to be a power of two, and the base
928          * address to be naturally aligned to that size
929          */
930         mask = ilog2(__roundup_pow_of_two(pages));
931         /* Fallback to domain selective flush if size is too big */
932         if (mask > cap_max_amask_val(iommu->cap))
933                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
934                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
935
936         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
937                                         DMA_TLB_PSI_FLUSH,
938                                         non_present_entry_flush);
939 }
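/*
 * Worked example: flushing 5 pages gives
 * mask = ilog2(__roundup_pow_of_two(5)) = ilog2(8) = 3, i.e. the request
 * is rounded up to a naturally aligned 8-page (2^3) region as PSI
 * requires.  If that mask exceeded cap_max_amask_val(), the code above
 * would fall back to a domain-selective flush instead.
 */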
940
941 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
942 {
943         u32 pmen;
944         unsigned long flags;
945
946         spin_lock_irqsave(&iommu->register_lock, flags);
947         pmen = readl(iommu->reg + DMAR_PMEN_REG);
948         pmen &= ~DMA_PMEN_EPM;
949         writel(pmen, iommu->reg + DMAR_PMEN_REG);
950
951         /* wait for the protected region status bit to clear */
952         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
953                 readl, !(pmen & DMA_PMEN_PRS), pmen);
954
955         spin_unlock_irqrestore(&iommu->register_lock, flags);
956 }
957
958 static int iommu_enable_translation(struct intel_iommu *iommu)
959 {
960         u32 sts;
961         unsigned long flags;
962
963         spin_lock_irqsave(&iommu->register_lock, flags);
964         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
965
966         /* Make sure hardware completes it */
967         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
968                 readl, (sts & DMA_GSTS_TES), sts);
969
970         iommu->gcmd |= DMA_GCMD_TE;
971         spin_unlock_irqrestore(&iommu->register_lock, flags);
972         return 0;
973 }
974
975 static int iommu_disable_translation(struct intel_iommu *iommu)
976 {
977         u32 sts;
978         unsigned long flag;
979
980         spin_lock_irqsave(&iommu->register_lock, flag);
981         iommu->gcmd &= ~DMA_GCMD_TE;
982         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
983
984         /* Make sure hardware completes it */
985         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
986                 readl, (!(sts & DMA_GSTS_TES)), sts);
987
988         spin_unlock_irqrestore(&iommu->register_lock, flag);
989         return 0;
990 }
991
992 /* iommu interrupt handling. Most of it is MSI-like. */
993
994 static const char *fault_reason_strings[] =
995 {
996         "Software",
997         "Present bit in root entry is clear",
998         "Present bit in context entry is clear",
999         "Invalid context entry",
1000         "Access beyond MGAW",
1001         "PTE Write access is not set",
1002         "PTE Read access is not set",
1003         "Next page table ptr is invalid",
1004         "Root table address invalid",
1005         "Context table ptr is invalid",
1006         "non-zero reserved fields in RTP",
1007         "non-zero reserved fields in CTP",
1008         "non-zero reserved fields in PTE",
1009 };
1010 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1011
1012 const char *dmar_get_fault_reason(u8 fault_reason)
1013 {
1014         if (fault_reason > MAX_FAULT_REASON_IDX)
1015                 return "Unknown";
1016         else
1017                 return fault_reason_strings[fault_reason];
1018 }
1019
1020 void dmar_msi_unmask(unsigned int irq)
1021 {
1022         struct intel_iommu *iommu = get_irq_data(irq);
1023         unsigned long flag;
1024
1025         /* unmask it */
1026         spin_lock_irqsave(&iommu->register_lock, flag);
1027         writel(0, iommu->reg + DMAR_FECTL_REG);
1028         /* Read a reg to force flush the post write */
1029         readl(iommu->reg + DMAR_FECTL_REG);
1030         spin_unlock_irqrestore(&iommu->register_lock, flag);
1031 }
1032
1033 void dmar_msi_mask(unsigned int irq)
1034 {
1035         unsigned long flag;
1036         struct intel_iommu *iommu = get_irq_data(irq);
1037
1038         /* mask it */
1039         spin_lock_irqsave(&iommu->register_lock, flag);
1040         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1041         /* Read a reg to force flush the post write */
1042         readl(iommu->reg + DMAR_FECTL_REG);
1043         spin_unlock_irqrestore(&iommu->register_lock, flag);
1044 }
1045
1046 void dmar_msi_write(int irq, struct msi_msg *msg)
1047 {
1048         struct intel_iommu *iommu = get_irq_data(irq);
1049         unsigned long flag;
1050
1051         spin_lock_irqsave(&iommu->register_lock, flag);
1052         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1053         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1054         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1055         spin_unlock_irqrestore(&iommu->register_lock, flag);
1056 }
1057
1058 void dmar_msi_read(int irq, struct msi_msg *msg)
1059 {
1060         struct intel_iommu *iommu = get_irq_data(irq);
1061         unsigned long flag;
1062
1063         spin_lock_irqsave(&iommu->register_lock, flag);
1064         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1065         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1066         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1067         spin_unlock_irqrestore(&iommu->register_lock, flag);
1068 }
1069
1070 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1071                 u8 fault_reason, u16 source_id, unsigned long long addr)
1072 {
1073         const char *reason;
1074
1075         reason = dmar_get_fault_reason(fault_reason);
1076
1077         printk(KERN_ERR
1078                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1079                 "fault addr %llx \n"
1080                 "DMAR:[fault reason %02d] %s\n",
1081                 (type ? "DMA Read" : "DMA Write"),
1082                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1083                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1084         return 0;
1085 }
1086
1087 #define PRIMARY_FAULT_REG_LEN (16)
1088 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1089 {
1090         struct intel_iommu *iommu = dev_id;
1091         int reg, fault_index;
1092         u32 fault_status;
1093         unsigned long flag;
1094
1095         spin_lock_irqsave(&iommu->register_lock, flag);
1096         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1097
1098         /* TBD: ignore advanced fault log currently */
1099         if (!(fault_status & DMA_FSTS_PPF))
1100                 goto clear_overflow;
1101
1102         fault_index = dma_fsts_fault_record_index(fault_status);
1103         reg = cap_fault_reg_offset(iommu->cap);
1104         while (1) {
1105                 u8 fault_reason;
1106                 u16 source_id;
1107                 u64 guest_addr;
1108                 int type;
1109                 u32 data;
1110
1111                 /* highest 32 bits */
1112                 data = readl(iommu->reg + reg +
1113                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1114                 if (!(data & DMA_FRCD_F))
1115                         break;
1116
1117                 fault_reason = dma_frcd_fault_reason(data);
1118                 type = dma_frcd_type(data);
1119
1120                 data = readl(iommu->reg + reg +
1121                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1122                 source_id = dma_frcd_source_id(data);
1123
1124                 guest_addr = dmar_readq(iommu->reg + reg +
1125                                 fault_index * PRIMARY_FAULT_REG_LEN);
1126                 guest_addr = dma_frcd_page_addr(guest_addr);
1127                 /* clear the fault */
1128                 writel(DMA_FRCD_F, iommu->reg + reg +
1129                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1130
1131                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1132
1133                 iommu_page_fault_do_one(iommu, type, fault_reason,
1134                                 source_id, guest_addr);
1135
1136                 fault_index++;
1137                 if (fault_index >= cap_num_fault_regs(iommu->cap))
1138                         fault_index = 0;
1139                 spin_lock_irqsave(&iommu->register_lock, flag);
1140         }
1141 clear_overflow:
1142         /* clear primary fault overflow */
1143         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1144         if (fault_status & DMA_FSTS_PFO)
1145                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1146
1147         spin_unlock_irqrestore(&iommu->register_lock, flag);
1148         return IRQ_HANDLED;
1149 }
1150
1151 int dmar_set_interrupt(struct intel_iommu *iommu)
1152 {
1153         int irq, ret;
1154
1155         irq = create_irq();
1156         if (!irq) {
1157                 printk(KERN_ERR "IOMMU: no free vectors\n");
1158                 return -EINVAL;
1159         }
1160
1161         set_irq_data(irq, iommu);
1162         iommu->irq = irq;
1163
1164         ret = arch_setup_dmar_msi(irq);
1165         if (ret) {
1166                 set_irq_data(irq, NULL);
1167                 iommu->irq = 0;
1168                 destroy_irq(irq);
1169                 return ret;
1170         }
1171
1172         /* Clear any pending faults before requesting the irq */
1173         iommu_page_fault(irq, iommu);
1174
1175         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1176         if (ret)
1177                 printk(KERN_ERR "IOMMU: can't request irq\n");
1178         return ret;
1179 }
1180
1181 static int iommu_init_domains(struct intel_iommu *iommu)
1182 {
1183         unsigned long ndomains;
1184         unsigned long nlongs;
1185
1186         ndomains = cap_ndoms(iommu->cap);
1187         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1188         nlongs = BITS_TO_LONGS(ndomains);
1189
1190         /* TBD: there might be 64K domains,
1191          * consider a different allocation scheme for future chips
1192          */
1193         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1194         if (!iommu->domain_ids) {
1195                 printk(KERN_ERR "Allocating domain id array failed\n");
1196                 return -ENOMEM;
1197         }
1198         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1199                         GFP_KERNEL);
1200         if (!iommu->domains) {
1201                 printk(KERN_ERR "Allocating domain array failed\n");
1202                 kfree(iommu->domain_ids);
1203                 return -ENOMEM;
1204         }
1205
1206         spin_lock_init(&iommu->lock);
1207
1208         /*
1209          * if Caching mode is set, then invalid translations are tagged
1210          * with domainid 0. Hence we need to pre-allocate it.
1211          */
1212         if (cap_caching_mode(iommu->cap))
1213                 set_bit(0, iommu->domain_ids);
1214         return 0;
1215 }
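/*
 * Editor's note (illustrative sizing): if cap_ndoms() reports 256 domains,
 * BITS_TO_LONGS(256) is 4 on a 64-bit kernel, so domain_ids is a 32-byte
 * bitmap and domains[] holds 256 pointers.  Domain id 0 is pre-allocated
 * above whenever caching mode is set.
 */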
1216
1217
1218 static void domain_exit(struct dmar_domain *domain);
1219 static void vm_domain_exit(struct dmar_domain *domain);
1220
1221 void free_dmar_iommu(struct intel_iommu *iommu)
1222 {
1223         struct dmar_domain *domain;
1224         int i;
1225         unsigned long flags;
1226
1227         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1228         for (; i < cap_ndoms(iommu->cap); ) {
1229                 domain = iommu->domains[i];
1230                 clear_bit(i, iommu->domain_ids);
1231
1232                 spin_lock_irqsave(&domain->iommu_lock, flags);
1233                 if (--domain->iommu_count == 0) {
1234                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1235                                 vm_domain_exit(domain);
1236                         else
1237                                 domain_exit(domain);
1238                 }
1239                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1240
1241                 i = find_next_bit(iommu->domain_ids,
1242                         cap_ndoms(iommu->cap), i+1);
1243         }
1244
1245         if (iommu->gcmd & DMA_GCMD_TE)
1246                 iommu_disable_translation(iommu);
1247
1248         if (iommu->irq) {
1249                 set_irq_data(iommu->irq, NULL);
1250                 /* This will mask the irq */
1251                 free_irq(iommu->irq, iommu);
1252                 destroy_irq(iommu->irq);
1253         }
1254
1255         kfree(iommu->domains);
1256         kfree(iommu->domain_ids);
1257
1258         g_iommus[iommu->seq_id] = NULL;
1259
1260         /* if all iommus are freed, free g_iommus */
1261         for (i = 0; i < g_num_of_iommus; i++) {
1262                 if (g_iommus[i])
1263                         break;
1264         }
1265
1266         if (i == g_num_of_iommus)
1267                 kfree(g_iommus);
1268
1269         /* free context mapping */
1270         free_context_table(iommu);
1271 }
1272
1273 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1274 {
1275         unsigned long num;
1276         unsigned long ndomains;
1277         struct dmar_domain *domain;
1278         unsigned long flags;
1279
1280         domain = alloc_domain_mem();
1281         if (!domain)
1282                 return NULL;
1283
1284         ndomains = cap_ndoms(iommu->cap);
1285
1286         spin_lock_irqsave(&iommu->lock, flags);
1287         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1288         if (num >= ndomains) {
1289                 spin_unlock_irqrestore(&iommu->lock, flags);
1290                 free_domain_mem(domain);
1291                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1292                 return NULL;
1293         }
1294
1295         set_bit(num, iommu->domain_ids);
1296         domain->id = num;
1297         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1298         set_bit(iommu->seq_id, &domain->iommu_bmp);
1299         domain->flags = 0;
1300         iommu->domains[num] = domain;
1301         spin_unlock_irqrestore(&iommu->lock, flags);
1302
1303         return domain;
1304 }
1305
1306 static void iommu_free_domain(struct dmar_domain *domain)
1307 {
1308         unsigned long flags;
1309         struct intel_iommu *iommu;
1310
1311         iommu = domain_get_iommu(domain);
1312
1313         spin_lock_irqsave(&iommu->lock, flags);
1314         clear_bit(domain->id, iommu->domain_ids);
1315         spin_unlock_irqrestore(&iommu->lock, flags);
1316 }
1317
1318 static struct iova_domain reserved_iova_list;
1319 static struct lock_class_key reserved_alloc_key;
1320 static struct lock_class_key reserved_rbtree_key;
1321
1322 static void dmar_init_reserved_ranges(void)
1323 {
1324         struct pci_dev *pdev = NULL;
1325         struct iova *iova;
1326         int i;
1327         u64 addr, size;
1328
1329         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1330
1331         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1332                 &reserved_alloc_key);
1333         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1334                 &reserved_rbtree_key);
1335
1336         /* IOAPIC ranges shouldn't be accessed by DMA */
1337         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1338                 IOVA_PFN(IOAPIC_RANGE_END));
1339         if (!iova)
1340                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1341
1342         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1343         for_each_pci_dev(pdev) {
1344                 struct resource *r;
1345
1346                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1347                         r = &pdev->resource[i];
1348                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1349                                 continue;
1350                         addr = r->start;
1351                         addr &= PAGE_MASK;
1352                         size = r->end - addr;
1353                         size = PAGE_ALIGN(size);
1354                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1355                                 IOVA_PFN(size + addr) - 1);
1356                         if (!iova)
1357                                 printk(KERN_ERR "Reserve iova failed\n");
1358                 }
1359         }
1360
1361 }
1362
1363 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1364 {
1365         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1366 }
1367
1368 static inline int guestwidth_to_adjustwidth(int gaw)
1369 {
1370         int agaw;
1371         int r = (gaw - 12) % 9;
1372
1373         if (r == 0)
1374                 agaw = gaw;
1375         else
1376                 agaw = gaw + 9 - r;
1377         if (agaw > 64)
1378                 agaw = 64;
1379         return agaw;
1380 }
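/*
 * Worked examples: for guest width 48, r = (48 - 12) % 9 = 0 and the
 * adjusted width stays 48; for guest width 36, r = 6 and the width is
 * rounded up to 36 + 9 - 6 = 39.  The result is always 12 plus a multiple
 * of 9, matching the page-table geometry (9 index bits per level above a
 * 4KB page).
 */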
1381
1382 static int domain_init(struct dmar_domain *domain, int guest_width)
1383 {
1384         struct intel_iommu *iommu;
1385         int adjust_width, agaw;
1386         unsigned long sagaw;
1387
1388         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1389         spin_lock_init(&domain->mapping_lock);
1390         spin_lock_init(&domain->iommu_lock);
1391
1392         domain_reserve_special_ranges(domain);
1393
1394         /* calculate AGAW */
1395         iommu = domain_get_iommu(domain);
1396         if (guest_width > cap_mgaw(iommu->cap))
1397                 guest_width = cap_mgaw(iommu->cap);
1398         domain->gaw = guest_width;
1399         adjust_width = guestwidth_to_adjustwidth(guest_width);
1400         agaw = width_to_agaw(adjust_width);
1401         sagaw = cap_sagaw(iommu->cap);
1402         if (!test_bit(agaw, &sagaw)) {
1403                 /* hardware doesn't support it, choose a bigger one */
1404                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1405                 agaw = find_next_bit(&sagaw, 5, agaw);
1406                 if (agaw >= 5)
1407                         return -ENODEV;
1408         }
1409         domain->agaw = agaw;
1410         INIT_LIST_HEAD(&domain->devices);
1411
1412         if (ecap_coherent(iommu->ecap))
1413                 domain->iommu_coherency = 1;
1414         else
1415                 domain->iommu_coherency = 0;
1416
1417         domain->iommu_count = 1;
1418
1419         /* always allocate the top pgd */
1420         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1421         if (!domain->pgd)
1422                 return -ENOMEM;
1423         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1424         return 0;
1425 }
1426
1427 static void domain_exit(struct dmar_domain *domain)
1428 {
1429         u64 end;
1430
1431         /* Domain 0 is reserved, so don't process it */
1432         if (!domain)
1433                 return;
1434
1435         domain_remove_dev_info(domain);
1436         /* destroy iovas */
1437         put_iova_domain(&domain->iovad);
1438         end = DOMAIN_MAX_ADDR(domain->gaw);
1439         end = end & (~PAGE_MASK);
1440
1441         /* clear ptes */
1442         dma_pte_clear_range(domain, 0, end);
1443
1444         /* free page tables */
1445         dma_pte_free_pagetable(domain, 0, end);
1446
1447         iommu_free_domain(domain);
1448         free_domain_mem(domain);
1449 }
1450
1451 static int domain_context_mapping_one(struct dmar_domain *domain,
1452                 u8 bus, u8 devfn)
1453 {
1454         struct context_entry *context;
1455         unsigned long flags;
1456         struct intel_iommu *iommu;
1457         struct dma_pte *pgd;
1458         unsigned long num;
1459         unsigned long ndomains;
1460         int id;
1461         int agaw;
1462
1463         pr_debug("Set context mapping for %02x:%02x.%d\n",
1464                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1465         BUG_ON(!domain->pgd);
1466
1467         iommu = device_to_iommu(bus, devfn);
1468         if (!iommu)
1469                 return -ENODEV;
1470
1471         context = device_to_context_entry(iommu, bus, devfn);
1472         if (!context)
1473                 return -ENOMEM;
1474         spin_lock_irqsave(&iommu->lock, flags);
1475         if (context_present(context)) {
1476                 spin_unlock_irqrestore(&iommu->lock, flags);
1477                 return 0;
1478         }
1479
1480         id = domain->id;
1481         pgd = domain->pgd;
1482
1483         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1484                 int found = 0;
1485
1486                 /* find an available domain id for this device in iommu */
1487                 ndomains = cap_ndoms(iommu->cap);
1488                 num = find_first_bit(iommu->domain_ids, ndomains);
1489                 for (; num < ndomains; ) {
1490                         if (iommu->domains[num] == domain) {
1491                                 id = num;
1492                                 found = 1;
1493                                 break;
1494                         }
1495                         num = find_next_bit(iommu->domain_ids,
1496                                             cap_ndoms(iommu->cap), num+1);
1497                 }
1498
1499                 if (found == 0) {
1500                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1501                         if (num >= ndomains) {
1502                                 spin_unlock_irqrestore(&iommu->lock, flags);
1503                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1504                                 return -EFAULT;
1505                         }
1506
1507                         set_bit(num, iommu->domain_ids);
1508                         iommu->domains[num] = domain;
1509                         id = num;
1510                 }
1511
1512                 /* Skip top levels of page tables for
1513                  * iommu which has less agaw than default.
1514                  */
1515                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1516                         pgd = phys_to_virt(dma_pte_addr(pgd));
1517                         if (!dma_pte_present(pgd)) {
1518                                 spin_unlock_irqrestore(&iommu->lock, flags);
1519                                 return -ENOMEM;
1520                         }
1521                 }
1522         }
1523
1524         context_set_domain_id(context, id);
1525         context_set_address_width(context, iommu->agaw);
1526         context_set_address_root(context, virt_to_phys(pgd));
1527         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1528         context_set_fault_enable(context);
1529         context_set_present(context);
1530         domain_flush_cache(domain, context, sizeof(*context));
1531
1532         /* it's a non-present to present mapping */
1533         if (iommu->flush.flush_context(iommu, domain->id,
1534                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1535                 DMA_CCMD_DEVICE_INVL, 1))
1536                 iommu_flush_write_buffer(iommu);
1537         else
1538                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1539
1540         spin_unlock_irqrestore(&iommu->lock, flags);
1541
1542         spin_lock_irqsave(&domain->iommu_lock, flags);
1543         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1544                 domain->iommu_count++;
1545                 domain_update_iommu_coherency(domain);
1546         }
1547         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1548         return 0;
1549 }
1550
1551 static int
1552 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1553 {
1554         int ret;
1555         struct pci_dev *tmp, *parent;
1556
1557         ret = domain_context_mapping_one(domain, pdev->bus->number,
1558                 pdev->devfn);
1559         if (ret)
1560                 return ret;
1561
1562         /* dependent device mapping */
1563         tmp = pci_find_upstream_pcie_bridge(pdev);
1564         if (!tmp)
1565                 return 0;
1566         /* Secondary interface's bus number and devfn 0 */
1567         parent = pdev->bus->self;
1568         while (parent != tmp) {
1569                 ret = domain_context_mapping_one(domain, parent->bus->number,
1570                         parent->devfn);
1571                 if (ret)
1572                         return ret;
1573                 parent = parent->bus->self;
1574         }
1575         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1576                 return domain_context_mapping_one(domain,
1577                         tmp->subordinate->number, 0);
1578         else /* this is a legacy PCI bridge */
1579                 return domain_context_mapping_one(domain,
1580                         tmp->bus->number, tmp->devfn);
1581 }
1582
1583 static int domain_context_mapped(struct pci_dev *pdev)
1584 {
1585         int ret;
1586         struct pci_dev *tmp, *parent;
1587         struct intel_iommu *iommu;
1588
1589         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1590         if (!iommu)
1591                 return -ENODEV;
1592
1593         ret = device_context_mapped(iommu,
1594                 pdev->bus->number, pdev->devfn);
1595         if (!ret)
1596                 return ret;
1597         /* dependent device mapping */
1598         tmp = pci_find_upstream_pcie_bridge(pdev);
1599         if (!tmp)
1600                 return ret;
1601         /* Secondary interface's bus number and devfn 0 */
1602         parent = pdev->bus->self;
1603         while (parent != tmp) {
1604                 ret = device_context_mapped(iommu, parent->bus->number,
1605                         parent->devfn);
1606                 if (!ret)
1607                         return ret;
1608                 parent = parent->bus->self;
1609         }
1610         if (tmp->is_pcie)
1611                 return device_context_mapped(iommu,
1612                         tmp->subordinate->number, 0);
1613         else
1614                 return device_context_mapped(iommu,
1615                         tmp->bus->number, tmp->devfn);
1616 }
1617
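/*
 * Map [iova, iova + size) to host physical address hpa in this domain's
 * page tables, one VTD_PAGE_SIZE page at a time.  prot must contain at
 * least one of DMA_PTE_READ and DMA_PTE_WRITE; the caller is responsible
 * for the IOTLB flush afterwards.
 */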
1618 static int
1619 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1620                         u64 hpa, size_t size, int prot)
1621 {
1622         u64 start_pfn, end_pfn;
1623         struct dma_pte *pte;
1624         int index;
1625         int addr_width = agaw_to_width(domain->agaw);
1626
1627         hpa &= (((u64)1) << addr_width) - 1;
1628
1629         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1630                 return -EINVAL;
1631         iova &= PAGE_MASK;
1632         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1633         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1634         index = 0;
1635         while (start_pfn < end_pfn) {
1636                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1637                 if (!pte)
1638                         return -ENOMEM;
1639                 /* We don't need lock here, nobody else
1640                  * touches the iova range
1641                  */
1642                 BUG_ON(dma_pte_addr(pte));
1643                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1644                 dma_set_pte_prot(pte, prot);
1645                 domain_flush_cache(domain, pte, sizeof(*pte));
1646                 start_pfn++;
1647                 index++;
1648         }
1649         return 0;
1650 }
1651
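/*
 * Tear down the context entry for (bus, devfn) on this iommu, then flush
 * the context cache and IOTLB globally so no stale translations remain.
 */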
1652 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1653 {
1654         if (!iommu)
1655                 return;
1656
1657         clear_context_table(iommu, bus, devfn);
1658         iommu->flush.flush_context(iommu, 0, 0, 0,
1659                                            DMA_CCMD_GLOBAL_INVL, 0);
1660         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1661                                          DMA_TLB_GLOBAL_FLUSH, 0);
1662 }
1663
1664 static void domain_remove_dev_info(struct dmar_domain *domain)
1665 {
1666         struct device_domain_info *info;
1667         unsigned long flags;
1668         struct intel_iommu *iommu;
1669
1670         spin_lock_irqsave(&device_domain_lock, flags);
1671         while (!list_empty(&domain->devices)) {
1672                 info = list_entry(domain->devices.next,
1673                         struct device_domain_info, link);
1674                 list_del(&info->link);
1675                 list_del(&info->global);
1676                 if (info->dev)
1677                         info->dev->dev.archdata.iommu = NULL;
1678                 spin_unlock_irqrestore(&device_domain_lock, flags);
1679
1680                 iommu = device_to_iommu(info->bus, info->devfn);
1681                 iommu_detach_dev(iommu, info->bus, info->devfn);
1682                 free_devinfo_mem(info);
1683
1684                 spin_lock_irqsave(&device_domain_lock, flags);
1685         }
1686         spin_unlock_irqrestore(&device_domain_lock, flags);
1687 }
1688
1689 /*
1690  * find_domain
1691  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1692  */
1693 static struct dmar_domain *
1694 find_domain(struct pci_dev *pdev)
1695 {
1696         struct device_domain_info *info;
1697
1698         /* No lock here, assumes no domain exit in normal case */
1699         info = pdev->dev.archdata.iommu;
1700         if (info)
1701                 return info->domain;
1702         return NULL;
1703 }
1704
1705 /* find an existing domain for the device or allocate and initialize a new one */
1706 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1707 {
1708         struct dmar_domain *domain, *found = NULL;
1709         struct intel_iommu *iommu;
1710         struct dmar_drhd_unit *drhd;
1711         struct device_domain_info *info, *tmp;
1712         struct pci_dev *dev_tmp;
1713         unsigned long flags;
1714         int bus = 0, devfn = 0;
1715
1716         domain = find_domain(pdev);
1717         if (domain)
1718                 return domain;
1719
1720         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1721         if (dev_tmp) {
1722                 if (dev_tmp->is_pcie) {
1723                         bus = dev_tmp->subordinate->number;
1724                         devfn = 0;
1725                 } else {
1726                         bus = dev_tmp->bus->number;
1727                         devfn = dev_tmp->devfn;
1728                 }
1729                 spin_lock_irqsave(&device_domain_lock, flags);
1730                 list_for_each_entry(info, &device_domain_list, global) {
1731                         if (info->bus == bus && info->devfn == devfn) {
1732                                 found = info->domain;
1733                                 break;
1734                         }
1735                 }
1736                 spin_unlock_irqrestore(&device_domain_lock, flags);
1737                 /* pcie-pci bridge already has a domain, use it */
1738                 if (found) {
1739                         domain = found;
1740                         goto found_domain;
1741                 }
1742         }
1743
1744         /* Allocate new domain for the device */
1745         drhd = dmar_find_matched_drhd_unit(pdev);
1746         if (!drhd) {
1747                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1748                         pci_name(pdev));
1749                 return NULL;
1750         }
1751         iommu = drhd->iommu;
1752
1753         domain = iommu_alloc_domain(iommu);
1754         if (!domain)
1755                 goto error;
1756
1757         if (domain_init(domain, gaw)) {
1758                 domain_exit(domain);
1759                 goto error;
1760         }
1761
1762         /* register pcie-to-pci device */
1763         if (dev_tmp) {
1764                 info = alloc_devinfo_mem();
1765                 if (!info) {
1766                         domain_exit(domain);
1767                         goto error;
1768                 }
1769                 info->bus = bus;
1770                 info->devfn = devfn;
1771                 info->dev = NULL;
1772                 info->domain = domain;
1773                 /* This domain is shared by devices under p2p bridge */
1774                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1775
1776                 /* pcie-to-pci bridge already has a domain, use it */
1777                 found = NULL;
1778                 spin_lock_irqsave(&device_domain_lock, flags);
1779                 list_for_each_entry(tmp, &device_domain_list, global) {
1780                         if (tmp->bus == bus && tmp->devfn == devfn) {
1781                                 found = tmp->domain;
1782                                 break;
1783                         }
1784                 }
1785                 if (found) {
1786                         free_devinfo_mem(info);
1787                         domain_exit(domain);
1788                         domain = found;
1789                 } else {
1790                         list_add(&info->link, &domain->devices);
1791                         list_add(&info->global, &device_domain_list);
1792                 }
1793                 spin_unlock_irqrestore(&device_domain_lock, flags);
1794         }
1795
1796 found_domain:
1797         info = alloc_devinfo_mem();
1798         if (!info)
1799                 goto error;
1800         info->bus = pdev->bus->number;
1801         info->devfn = pdev->devfn;
1802         info->dev = pdev;
1803         info->domain = domain;
1804         spin_lock_irqsave(&device_domain_lock, flags);
1805         /* somebody else raced us and already set up the domain */
1806         found = find_domain(pdev);
1807         if (found != NULL) {
1808                 spin_unlock_irqrestore(&device_domain_lock, flags);
1809                 if (found != domain) {
1810                         domain_exit(domain);
1811                         domain = found;
1812                 }
1813                 free_devinfo_mem(info);
1814                 return domain;
1815         }
1816         list_add(&info->link, &domain->devices);
1817         list_add(&info->global, &device_domain_list);
1818         pdev->dev.archdata.iommu = info;
1819         spin_unlock_irqrestore(&device_domain_lock, flags);
1820         return domain;
1821 error:
1822         /* recheck it here, maybe others set it */
1823         return find_domain(pdev);
1824 }
1825
1826 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1827                                       unsigned long long start,
1828                                       unsigned long long end)
1829 {
1830         struct dmar_domain *domain;
1831         unsigned long size;
1832         unsigned long long base;
1833         int ret;
1834
1835         printk(KERN_INFO
1836                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1837                 pci_name(pdev), start, end);
1838         /* page table init */
1839         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1840         if (!domain)
1841                 return -ENOMEM;
1842
1843         /* The address might not be aligned */
1844         base = start & PAGE_MASK;
1845         size = end - base;
1846         size = PAGE_ALIGN(size);
1847         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1848                         IOVA_PFN(base + size) - 1)) {
1849                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1850                 ret = -ENOMEM;
1851                 goto error;
1852         }
1853
1854         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1855                 size, base, pci_name(pdev));
1856         /*
1857          * RMRR range might have overlap with physical memory range,
1858          * clear it first
1859          */
1860         dma_pte_clear_range(domain, base, base + size);
1861
1862         ret = domain_page_mapping(domain, base, base, size,
1863                 DMA_PTE_READ|DMA_PTE_WRITE);
1864         if (ret)
1865                 goto error;
1866
1867         /* context entry init */
1868         ret = domain_context_mapping(domain, pdev);
1869         if (!ret)
1870                 return 0;
1871 error:
1872         domain_exit(domain);
1873         return ret;
1874
1875 }
1876
1877 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1878         struct pci_dev *pdev)
1879 {
1880         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1881                 return 0;
1882         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1883                 rmrr->end_address + 1);
1884 }
1885
1886 #ifdef CONFIG_DMAR_GFX_WA
1887 struct iommu_prepare_data {
1888         struct pci_dev *pdev;
1889         int ret;
1890 };
1891
1892 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1893                                          unsigned long end_pfn, void *datax)
1894 {
1895         struct iommu_prepare_data *data;
1896
1897         data = (struct iommu_prepare_data *)datax;
1898
1899         data->ret = iommu_prepare_identity_map(data->pdev,
1900                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1901         return data->ret;
1902
1903 }
1904
1905 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1906 {
1907         int nid;
1908         struct iommu_prepare_data data;
1909
1910         data.pdev = pdev;
1911         data.ret = 0;
1912
1913         for_each_online_node(nid) {
1914                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1915                 if (data.ret)
1916                         return data.ret;
1917         }
1918         return data.ret;
1919 }
1920
1921 static void __init iommu_prepare_gfx_mapping(void)
1922 {
1923         struct pci_dev *pdev = NULL;
1924         int ret;
1925
1926         for_each_pci_dev(pdev) {
1927                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1928                                 !IS_GFX_DEVICE(pdev))
1929                         continue;
1930                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1931                         pci_name(pdev));
1932                 ret = iommu_prepare_with_active_regions(pdev);
1933                 if (ret)
1934                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1935         }
1936 }
1937 #else /* !CONFIG_DMAR_GFX_WA */
1938 static inline void iommu_prepare_gfx_mapping(void)
1939 {
1940         return;
1941 }
1942 #endif
1943
1944 #ifdef CONFIG_DMAR_FLOPPY_WA
1945 static inline void iommu_prepare_isa(void)
1946 {
1947         struct pci_dev *pdev;
1948         int ret;
1949
1950         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1951         if (!pdev)
1952                 return;
1953
1954         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1955         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1956
1957         if (ret)
1958                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1959                         "floppy might not work\n");
1960
1961 }
1962 #else
1963 static inline void iommu_prepare_isa(void)
1964 {
1965         return;
1966 }
1967 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1968
1969 static int __init init_dmars(void)
1970 {
1971         struct dmar_drhd_unit *drhd;
1972         struct dmar_rmrr_unit *rmrr;
1973         struct pci_dev *pdev;
1974         struct intel_iommu *iommu;
1975         int i, ret, unit = 0;
1976
1977         /*
1978          * for each drhd
1979          *    allocate root
1980          *    initialize and program root entry to not present
1981          * endfor
1982          */
1983         for_each_drhd_unit(drhd) {
1984                 g_num_of_iommus++;
1985                 /*
1986                  * lock not needed as this is only incremented in the
1987                  * single-threaded kernel __init code path; all other
1988                  * accesses are read-only
1989                  */
1990         }
1991
1992         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1993                         GFP_KERNEL);
1994         if (!g_iommus) {
1995                 printk(KERN_ERR "Allocating global iommu array failed\n");
1996                 ret = -ENOMEM;
1997                 goto error;
1998         }
1999
2000         deferred_flush = kzalloc(g_num_of_iommus *
2001                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2002         if (!deferred_flush) {
2003                 kfree(g_iommus);
2004                 ret = -ENOMEM;
2005                 goto error;
2006         }
2007
2008         for_each_drhd_unit(drhd) {
2009                 if (drhd->ignored)
2010                         continue;
2011
2012                 iommu = drhd->iommu;
2013                 g_iommus[iommu->seq_id] = iommu;
2014
2015                 ret = iommu_init_domains(iommu);
2016                 if (ret)
2017                         goto error;
2018
2019                 /*
2020                  * TBD:
2021                  * we could share the same root & context tables
2022                  * among all IOMMUs. Need to split it later.
2023                  */
2024                 ret = iommu_alloc_root_entry(iommu);
2025                 if (ret) {
2026                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2027                         goto error;
2028                 }
2029         }
2030
2031         for_each_drhd_unit(drhd) {
2032                 if (drhd->ignored)
2033                         continue;
2034
2035                 iommu = drhd->iommu;
2036                 if (dmar_enable_qi(iommu)) {
2037                         /*
2038                          * Queued Invalidate not enabled, use Register Based
2039                          * Invalidate
2040                          */
2041                         iommu->flush.flush_context = __iommu_flush_context;
2042                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2043                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2044                                "invalidation\n",
2045                                (unsigned long long)drhd->reg_base_addr);
2046                 } else {
2047                         iommu->flush.flush_context = qi_flush_context;
2048                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2049                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2050                                "invalidation\n",
2051                                (unsigned long long)drhd->reg_base_addr);
2052                 }
2053         }
2054
2055         /*
2056          * For each rmrr
2057          *   for each dev attached to rmrr
2058          *   do
2059          *     locate drhd for dev, alloc domain for dev
2060          *     allocate free domain
2061          *     allocate page table entries for rmrr
2062          *     if context not allocated for bus
2063          *           allocate and init context
2064          *           set present in root table for this bus
2065          *     init context with domain, translation etc
2066          *    endfor
2067          * endfor
2068          */
2069         for_each_rmrr_units(rmrr) {
2070                 for (i = 0; i < rmrr->devices_cnt; i++) {
2071                         pdev = rmrr->devices[i];
2072                         /* some BIOSes list nonexistent devices in the DMAR table */
2073                         if (!pdev)
2074                                 continue;
2075                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2076                         if (ret)
2077                                 printk(KERN_ERR
2078                                  "IOMMU: mapping reserved region failed\n");
2079                 }
2080         }
2081
2082         iommu_prepare_gfx_mapping();
2083
2084         iommu_prepare_isa();
2085
2086         /*
2087          * for each drhd
2088          *   enable fault log
2089          *   global invalidate context cache
2090          *   global invalidate iotlb
2091          *   enable translation
2092          */
2093         for_each_drhd_unit(drhd) {
2094                 if (drhd->ignored)
2095                         continue;
2096                 iommu = drhd->iommu;
2097                 sprintf(iommu->name, "dmar%d", unit++);
2098
2099                 iommu_flush_write_buffer(iommu);
2100
2101                 ret = dmar_set_interrupt(iommu);
2102                 if (ret)
2103                         goto error;
2104
2105                 iommu_set_root_entry(iommu);
2106
2107                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2108                                            0);
2109                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2110                                          0);
2111                 iommu_disable_protect_mem_regions(iommu);
2112
2113                 ret = iommu_enable_translation(iommu);
2114                 if (ret)
2115                         goto error;
2116         }
2117
2118         return 0;
2119 error:
2120         for_each_drhd_unit(drhd) {
2121                 if (drhd->ignored)
2122                         continue;
2123                 iommu = drhd->iommu;
2124                 free_iommu(iommu);
2125         }
2126         kfree(g_iommus);
2127         return ret;
2128 }
2129
2130 static inline u64 aligned_size(u64 host_addr, size_t size)
2131 {
2132         u64 addr;
2133         addr = (host_addr & (~PAGE_MASK)) + size;
2134         return PAGE_ALIGN(addr);
2135 }
2136
2137 struct iova *
2138 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2139 {
2140         struct iova *piova;
2141
2142         /* Make sure it's in range */
2143         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2144         if (!size || (IOVA_START_ADDR + size > end))
2145                 return NULL;
2146
2147         piova = alloc_iova(&domain->iovad,
2148                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2149         return piova;
2150 }
2151
2152 static struct iova *
2153 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2154                    size_t size, u64 dma_mask)
2155 {
2156         struct pci_dev *pdev = to_pci_dev(dev);
2157         struct iova *iova = NULL;
2158
2159         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2160                 iova = iommu_alloc_iova(domain, size, dma_mask);
2161         else {
2162                 /*
2163                  * First try to allocate an io virtual address below
2164                  * DMA_32BIT_MASK and if that fails then try allocating
2165                  * from the higher range
2166                  */
2167                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2168                 if (!iova)
2169                         iova = iommu_alloc_iova(domain, size, dma_mask);
2170         }
2171
2172         if (!iova) {
2173                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2174                 return NULL;
2175         }
2176
2177         return iova;
2178 }
2179
2180 static struct dmar_domain *
2181 get_valid_domain_for_dev(struct pci_dev *pdev)
2182 {
2183         struct dmar_domain *domain;
2184         int ret;
2185
2186         domain = get_domain_for_dev(pdev,
2187                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2188         if (!domain) {
2189                 printk(KERN_ERR
2190                         "Allocating domain for %s failed\n", pci_name(pdev));
2191                 return NULL;
2192         }
2193
2194         /* make sure context mapping is ok */
2195         if (unlikely(!domain_context_mapped(pdev))) {
2196                 ret = domain_context_mapping(domain, pdev);
2197                 if (ret) {
2198                         printk(KERN_ERR
2199                                 "Domain context map for %s failed\n",
2200                                 pci_name(pdev));
2201                         return NULL;
2202                 }
2203         }
2204
2205         return domain;
2206 }
2207
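/*
 * Core map path for the DMA API: find (or create) the device's domain,
 * allocate an IOVA range big enough for the page-aligned buffer, derive
 * the PTE protection bits from the DMA direction (read access is also
 * granted when the IOMMU cannot handle zero-length reads), install the
 * mapping and do a page-selective IOTLB flush for the non-present to
 * present transition.  Pass-through devices marked with
 * DUMMY_DEVICE_DOMAIN_INFO get paddr back unchanged.  Returns the bus
 * address to program into the device, or 0 on failure.
 */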
2208 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2209                                      size_t size, int dir, u64 dma_mask)
2210 {
2211         struct pci_dev *pdev = to_pci_dev(hwdev);
2212         struct dmar_domain *domain;
2213         phys_addr_t start_paddr;
2214         struct iova *iova;
2215         int prot = 0;
2216         int ret;
2217         struct intel_iommu *iommu;
2218
2219         BUG_ON(dir == DMA_NONE);
2220         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2221                 return paddr;
2222
2223         domain = get_valid_domain_for_dev(pdev);
2224         if (!domain)
2225                 return 0;
2226
2227         iommu = domain_get_iommu(domain);
2228         size = aligned_size((u64)paddr, size);
2229
2230         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2231         if (!iova)
2232                 goto error;
2233
2234         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2235
2236         /*
2237          * Check if DMAR supports zero-length reads on write-only
2238          * mappings.
2239          */
2240         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2241                         !cap_zlr(iommu->cap))
2242                 prot |= DMA_PTE_READ;
2243         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2244                 prot |= DMA_PTE_WRITE;
2245         /*
2246          * paddr to (paddr + size) might cover a partial page, so we should map
2247          * the whole page.  Note: if two parts of one page are mapped separately,
2248          * we might have two guest addresses mapping to the same host paddr, but
2249          * this is not a big problem
2250          */
2251         ret = domain_page_mapping(domain, start_paddr,
2252                 ((u64)paddr) & PAGE_MASK, size, prot);
2253         if (ret)
2254                 goto error;
2255
2256         /* it's a non-present to present mapping */
2257         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2258                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2259         if (ret)
2260                 iommu_flush_write_buffer(iommu);
2261
2262         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2263
2264 error:
2265         if (iova)
2266                 __free_iova(&domain->iovad, iova);
2267         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2268                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2269         return 0;
2270 }
2271
2272 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2273                             size_t size, int dir)
2274 {
2275         return __intel_map_single(hwdev, paddr, size, dir,
2276                                   to_pci_dev(hwdev)->dma_mask);
2277 }
2278
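/*
 * Deferred unmap handling: instead of flushing the IOTLB on every unmap,
 * add_unmap() queues freed IOVAs per iommu in deferred_flush[] and they
 * are released here with one global IOTLB flush per iommu.  flush_unmaps()
 * runs from the 10ms unmap_timer, or directly from add_unmap() once
 * HIGH_WATER_MARK entries have accumulated; callers hold
 * async_umap_flush_lock.
 */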
2279 static void flush_unmaps(void)
2280 {
2281         int i, j;
2282
2283         timer_on = 0;
2284
2285         /* just flush them all */
2286         for (i = 0; i < g_num_of_iommus; i++) {
2287                 struct intel_iommu *iommu = g_iommus[i];
2288                 if (!iommu)
2289                         continue;
2290
2291                 if (deferred_flush[i].next) {
2292                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2293                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2294                         for (j = 0; j < deferred_flush[i].next; j++) {
2295                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2296                                                 deferred_flush[i].iova[j]);
2297                         }
2298                         deferred_flush[i].next = 0;
2299                 }
2300         }
2301
2302         list_size = 0;
2303 }
2304
2305 static void flush_unmaps_timeout(unsigned long data)
2306 {
2307         unsigned long flags;
2308
2309         spin_lock_irqsave(&async_umap_flush_lock, flags);
2310         flush_unmaps();
2311         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2312 }
2313
2314 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2315 {
2316         unsigned long flags;
2317         int next, iommu_id;
2318         struct intel_iommu *iommu;
2319
2320         spin_lock_irqsave(&async_umap_flush_lock, flags);
2321         if (list_size == HIGH_WATER_MARK)
2322                 flush_unmaps();
2323
2324         iommu = domain_get_iommu(dom);
2325         iommu_id = iommu->seq_id;
2326
2327         next = deferred_flush[iommu_id].next;
2328         deferred_flush[iommu_id].domain[next] = dom;
2329         deferred_flush[iommu_id].iova[next] = iova;
2330         deferred_flush[iommu_id].next++;
2331
2332         if (!timer_on) {
2333                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2334                 timer_on = 1;
2335         }
2336         list_size++;
2337         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2338 }
2339
2340 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2341                         int dir)
2342 {
2343         struct pci_dev *pdev = to_pci_dev(dev);
2344         struct dmar_domain *domain;
2345         unsigned long start_addr;
2346         struct iova *iova;
2347         struct intel_iommu *iommu;
2348
2349         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2350                 return;
2351         domain = find_domain(pdev);
2352         BUG_ON(!domain);
2353
2354         iommu = domain_get_iommu(domain);
2355
2356         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2357         if (!iova)
2358                 return;
2359
2360         start_addr = iova->pfn_lo << PAGE_SHIFT;
2361         size = aligned_size((u64)dev_addr, size);
2362
2363         pr_debug("Device %s unmapping: %lx@%llx\n",
2364                 pci_name(pdev), size, (unsigned long long)start_addr);
2365
2366         /*  clear the whole page */
2367         dma_pte_clear_range(domain, start_addr, start_addr + size);
2368         /* free page tables */
2369         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2370         if (intel_iommu_strict) {
2371                 if (iommu_flush_iotlb_psi(iommu,
2372                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2373                         iommu_flush_write_buffer(iommu);
2374                 /* free iova */
2375                 __free_iova(&domain->iovad, iova);
2376         } else {
2377                 add_unmap(domain, iova);
2378                 /*
2379                  * queue up the release of the unmap to save roughly 1/6th of
2380                  * the cpu time used up by the iotlb flush operation...
2381                  */
2382         }
2383 }
2384
2385 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2386                            dma_addr_t *dma_handle, gfp_t flags)
2387 {
2388         void *vaddr;
2389         int order;
2390
2391         size = PAGE_ALIGN(size);
2392         order = get_order(size);
2393         flags &= ~(GFP_DMA | GFP_DMA32);
2394
2395         vaddr = (void *)__get_free_pages(flags, order);
2396         if (!vaddr)
2397                 return NULL;
2398         memset(vaddr, 0, size);
2399
2400         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2401                                          DMA_BIDIRECTIONAL,
2402                                          hwdev->coherent_dma_mask);
2403         if (*dma_handle)
2404                 return vaddr;
2405         free_pages((unsigned long)vaddr, order);
2406         return NULL;
2407 }
2408
2409 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2410                          dma_addr_t dma_handle)
2411 {
2412         int order;
2413
2414         size = PAGE_ALIGN(size);
2415         order = get_order(size);
2416
2417         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2418         free_pages((unsigned long)vaddr, order);
2419 }
2420
2421 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2422
2423 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2424                     int nelems, int dir)
2425 {
2426         int i;
2427         struct pci_dev *pdev = to_pci_dev(hwdev);
2428         struct dmar_domain *domain;
2429         unsigned long start_addr;
2430         struct iova *iova;
2431         size_t size = 0;
2432         void *addr;
2433         struct scatterlist *sg;
2434         struct intel_iommu *iommu;
2435
2436         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2437                 return;
2438
2439         domain = find_domain(pdev);
2440         BUG_ON(!domain);
2441
2442         iommu = domain_get_iommu(domain);
2443
2444         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2445         if (!iova)
2446                 return;
2447         for_each_sg(sglist, sg, nelems, i) {
2448                 addr = SG_ENT_VIRT_ADDRESS(sg);
2449                 size += aligned_size((u64)addr, sg->length);
2450         }
2451
2452         start_addr = iova->pfn_lo << PAGE_SHIFT;
2453
2454         /*  clear the whole page */
2455         dma_pte_clear_range(domain, start_addr, start_addr + size);
2456         /* free page tables */
2457         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2458
2459         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2460                         size >> VTD_PAGE_SHIFT, 0))
2461                 iommu_flush_write_buffer(iommu);
2462
2463         /* free iova */
2464         __free_iova(&domain->iovad, iova);
2465 }
2466
2467 static int intel_nontranslate_map_sg(struct device *hddev,
2468         struct scatterlist *sglist, int nelems, int dir)
2469 {
2470         int i;
2471         struct scatterlist *sg;
2472
2473         for_each_sg(sglist, sg, nelems, i) {
2474                 BUG_ON(!sg_page(sg));
2475                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2476                 sg->dma_length = sg->length;
2477         }
2478         return nelems;
2479 }
2480
2481 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2482                  int dir)
2483 {
2484         void *addr;
2485         int i;
2486         struct pci_dev *pdev = to_pci_dev(hwdev);
2487         struct dmar_domain *domain;
2488         size_t size = 0;
2489         int prot = 0;
2490         size_t offset = 0;
2491         struct iova *iova = NULL;
2492         int ret;
2493         struct scatterlist *sg;
2494         unsigned long start_addr;
2495         struct intel_iommu *iommu;
2496
2497         BUG_ON(dir == DMA_NONE);
2498         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2499                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2500
2501         domain = get_valid_domain_for_dev(pdev);
2502         if (!domain)
2503                 return 0;
2504
2505         iommu = domain_get_iommu(domain);
2506
2507         for_each_sg(sglist, sg, nelems, i) {
2508                 addr = SG_ENT_VIRT_ADDRESS(sg);
2509                 addr = (void *)virt_to_phys(addr);
2510                 size += aligned_size((u64)addr, sg->length);
2511         }
2512
2513         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2514         if (!iova) {
2515                 sglist->dma_length = 0;
2516                 return 0;
2517         }
2518
2519         /*
2520          * Check if DMAR supports zero-length reads on write-only
2521          * mappings.
2522          */
2523         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2524                         !cap_zlr(iommu->cap))
2525                 prot |= DMA_PTE_READ;
2526         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2527                 prot |= DMA_PTE_WRITE;
2528
2529         start_addr = iova->pfn_lo << PAGE_SHIFT;
2530         offset = 0;
2531         for_each_sg(sglist, sg, nelems, i) {
2532                 addr = SG_ENT_VIRT_ADDRESS(sg);
2533                 addr = (void *)virt_to_phys(addr);
2534                 size = aligned_size((u64)addr, sg->length);
2535                 ret = domain_page_mapping(domain, start_addr + offset,
2536                         ((u64)addr) & PAGE_MASK,
2537                         size, prot);
2538                 if (ret) {
2539                         /*  clear the page */
2540                         dma_pte_clear_range(domain, start_addr,
2541                                   start_addr + offset);
2542                         /* free page tables */
2543                         dma_pte_free_pagetable(domain, start_addr,
2544                                   start_addr + offset);
2545                         /* free iova */
2546                         __free_iova(&domain->iovad, iova);
2547                         return 0;
2548                 }
2549                 sg->dma_address = start_addr + offset +
2550                                 ((u64)addr & (~PAGE_MASK));
2551                 sg->dma_length = sg->length;
2552                 offset += size;
2553         }
2554
2555         /* it's a non-present to present mapping */
2556         if (iommu_flush_iotlb_psi(iommu, domain->id,
2557                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2558                 iommu_flush_write_buffer(iommu);
2559         return nelems;
2560 }
2561
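/*
 * DMA mapping callbacks installed as the global dma_ops by
 * intel_iommu_init() once the DMAR units are up.
 */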
2562 static struct dma_mapping_ops intel_dma_ops = {
2563         .alloc_coherent = intel_alloc_coherent,
2564         .free_coherent = intel_free_coherent,
2565         .map_single = intel_map_single,
2566         .unmap_single = intel_unmap_single,
2567         .map_sg = intel_map_sg,
2568         .unmap_sg = intel_unmap_sg,
2569 };
2570
2571 static inline int iommu_domain_cache_init(void)
2572 {
2573         int ret = 0;
2574
2575         iommu_domain_cache = kmem_cache_create("iommu_domain",
2576                                          sizeof(struct dmar_domain),
2577                                          0,
2578                                          SLAB_HWCACHE_ALIGN,
2580                                          NULL);
2581         if (!iommu_domain_cache) {
2582                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2583                 ret = -ENOMEM;
2584         }
2585
2586         return ret;
2587 }
2588
2589 static inline int iommu_devinfo_cache_init(void)
2590 {
2591         int ret = 0;
2592
2593         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2594                                          sizeof(struct device_domain_info),
2595                                          0,
2596                                          SLAB_HWCACHE_ALIGN,
2597                                          NULL);
2598         if (!iommu_devinfo_cache) {
2599                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2600                 ret = -ENOMEM;
2601         }
2602
2603         return ret;
2604 }
2605
2606 static inline int iommu_iova_cache_init(void)
2607 {
2608         int ret = 0;
2609
2610         iommu_iova_cache = kmem_cache_create("iommu_iova",
2611                                          sizeof(struct iova),
2612                                          0,
2613                                          SLAB_HWCACHE_ALIGN,
2614                                          NULL);
2615         if (!iommu_iova_cache) {
2616                 printk(KERN_ERR "Couldn't create iova cache\n");
2617                 ret = -ENOMEM;
2618         }
2619
2620         return ret;
2621 }
2622
2623 static int __init iommu_init_mempool(void)
2624 {
2625         int ret;
2626         ret = iommu_iova_cache_init();
2627         if (ret)
2628                 return ret;
2629
2630         ret = iommu_domain_cache_init();
2631         if (ret)
2632                 goto domain_error;
2633
2634         ret = iommu_devinfo_cache_init();
2635         if (!ret)
2636                 return ret;
2637
2638         kmem_cache_destroy(iommu_domain_cache);
2639 domain_error:
2640         kmem_cache_destroy(iommu_iova_cache);
2641
2642         return -ENOMEM;
2643 }
2644
2645 static void __init iommu_exit_mempool(void)
2646 {
2647         kmem_cache_destroy(iommu_devinfo_cache);
2648         kmem_cache_destroy(iommu_domain_cache);
2649         kmem_cache_destroy(iommu_iova_cache);
2650
2651 }
2652
2653 static void __init init_no_remapping_devices(void)
2654 {
2655         struct dmar_drhd_unit *drhd;
2656
2657         for_each_drhd_unit(drhd) {
2658                 if (!drhd->include_all) {
2659                         int i;
2660                         for (i = 0; i < drhd->devices_cnt; i++)
2661                                 if (drhd->devices[i] != NULL)
2662                                         break;
2663                         /* ignore DMAR unit if no pci devices exist */
2664                         if (i == drhd->devices_cnt)
2665                                 drhd->ignored = 1;
2666                 }
2667         }
2668
2669         if (dmar_map_gfx)
2670                 return;
2671
2672         for_each_drhd_unit(drhd) {
2673                 int i;
2674                 if (drhd->ignored || drhd->include_all)
2675                         continue;
2676
2677                 for (i = 0; i < drhd->devices_cnt; i++)
2678                         if (drhd->devices[i] &&
2679                                 !IS_GFX_DEVICE(drhd->devices[i]))
2680                                 break;
2681
2682                 if (i < drhd->devices_cnt)
2683                         continue;
2684
2685                 /* bypass IOMMU if it is just for gfx devices */
2686                 drhd->ignored = 1;
2687                 for (i = 0; i < drhd->devices_cnt; i++) {
2688                         if (!drhd->devices[i])
2689                                 continue;
2690                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2691                 }
2692         }
2693 }
2694
2695 int __init intel_iommu_init(void)
2696 {
2697         int ret = 0;
2698
2699         if (dmar_table_init())
2700                 return  -ENODEV;
2701
2702         if (dmar_dev_scope_init())
2703                 return  -ENODEV;
2704
2705         /*
2706          * Check the need for DMA-remapping initialization now.
2707          * The above initialization will also be used by interrupt remapping.
2708          */
2709         if (no_iommu || swiotlb || dmar_disabled)
2710                 return -ENODEV;
2711
2712         iommu_init_mempool();
2713         dmar_init_reserved_ranges();
2714
2715         init_no_remapping_devices();
2716
2717         ret = init_dmars();
2718         if (ret) {
2719                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2720                 put_iova_domain(&reserved_iova_list);
2721                 iommu_exit_mempool();
2722                 return ret;
2723         }
2724         printk(KERN_INFO
2725         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2726
2727         init_timer(&unmap_timer);
2728         force_iommu = 1;
2729         dma_ops = &intel_dma_ops;
2730         return 0;
2731 }
2732
2733 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2734                                   struct pci_dev *pdev)
2735 {
2736         struct device_domain_info *info;
2737         unsigned long flags;
2738
2739         info = alloc_devinfo_mem();
2740         if (!info)
2741                 return -ENOMEM;
2742
2743         info->bus = pdev->bus->number;
2744         info->devfn = pdev->devfn;
2745         info->dev = pdev;
2746         info->domain = domain;
2747
2748         spin_lock_irqsave(&device_domain_lock, flags);
2749         list_add(&info->link, &domain->devices);
2750         list_add(&info->global, &device_domain_list);
2751         pdev->dev.archdata.iommu = info;
2752         spin_unlock_irqrestore(&device_domain_lock, flags);
2753
2754         return 0;
2755 }
2756
2757 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2758                                           struct pci_dev *pdev)
2759 {
2760         struct device_domain_info *info;
2761         struct intel_iommu *iommu;
2762         unsigned long flags;
2763         int found = 0;
2764         struct list_head *entry, *tmp;
2765
2766         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2767         if (!iommu)
2768                 return;
2769
2770         spin_lock_irqsave(&device_domain_lock, flags);
2771         list_for_each_safe(entry, tmp, &domain->devices) {
2772                 info = list_entry(entry, struct device_domain_info, link);
2773                 if (info->bus == pdev->bus->number &&
2774                     info->devfn == pdev->devfn) {
2775                         list_del(&info->link);
2776                         list_del(&info->global);
2777                         if (info->dev)
2778                                 info->dev->dev.archdata.iommu = NULL;
2779                         spin_unlock_irqrestore(&device_domain_lock, flags);
2780
2781                         iommu_detach_dev(iommu, info->bus, info->devfn);
2782                         free_devinfo_mem(info);
2783
2784                         spin_lock_irqsave(&device_domain_lock, flags);
2785
2786                         if (found)
2787                                 break;
2788                         else
2789                                 continue;
2790                 }
2791
2792                 /* if there are no other devices under the same iommu
2793                  * owned by this domain, clear this iommu in iommu_bmp,
2794                  * update iommu count and coherency
2795                  */
2796                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2797                         found = 1;
2798         }
2799
2800         if (found == 0) {
2801                 unsigned long tmp_flags;
2802                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2803                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2804                 domain->iommu_count--;
2805                 domain_update_iommu_coherency(domain);
2806                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2807         }
2808
2809         spin_unlock_irqrestore(&device_domain_lock, flags);
2810 }
2811
2812 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2813 {
2814         struct device_domain_info *info;
2815         struct intel_iommu *iommu;
2816         unsigned long flags1, flags2;
2817
2818         spin_lock_irqsave(&device_domain_lock, flags1);
2819         while (!list_empty(&domain->devices)) {
2820                 info = list_entry(domain->devices.next,
2821                         struct device_domain_info, link);
2822                 list_del(&info->link);
2823                 list_del(&info->global);
2824                 if (info->dev)
2825                         info->dev->dev.archdata.iommu = NULL;
2826
2827                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2828
2829                 iommu = device_to_iommu(info->bus, info->devfn);
2830                 iommu_detach_dev(iommu, info->bus, info->devfn);
2831
2832                 /* clear this iommu in iommu_bmp, update iommu count
2833                  * and coherency
2834                  */
2835                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2836                 if (test_and_clear_bit(iommu->seq_id,
2837                                        &domain->iommu_bmp)) {
2838                         domain->iommu_count--;
2839                         domain_update_iommu_coherency(domain);
2840                 }
2841                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2842
2843                 free_devinfo_mem(info);
2844                 spin_lock_irqsave(&device_domain_lock, flags1);
2845         }
2846         spin_unlock_irqrestore(&device_domain_lock, flags1);
2847 }
2848
2849 /* domain id for virtual machine domains; it won't be set in the context entry */
2850 static unsigned long vm_domid;
2851
2852 static struct dmar_domain *iommu_alloc_vm_domain(void)
2853 {
2854         struct dmar_domain *domain;
2855
2856         domain = alloc_domain_mem();
2857         if (!domain)
2858                 return NULL;
2859
2860         domain->id = vm_domid++;
2861         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2862         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2863
2864         return domain;
2865 }
2866
2867 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2868 {
2869         int adjust_width;
2870
2871         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2872         spin_lock_init(&domain->mapping_lock);
2873         spin_lock_init(&domain->iommu_lock);
2874
2875         domain_reserve_special_ranges(domain);
2876
2877         /* calculate AGAW */
2878         domain->gaw = guest_width;
2879         adjust_width = guestwidth_to_adjustwidth(guest_width);
2880         domain->agaw = width_to_agaw(adjust_width);
2881
2882         INIT_LIST_HEAD(&domain->devices);
2883
2884         domain->iommu_count = 0;
2885         domain->iommu_coherency = 0;
2886
2887         /* always allocate the top pgd */
2888         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2889         if (!domain->pgd)
2890                 return -ENOMEM;
2891         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2892         return 0;
2893 }
2894
2895 static void iommu_free_vm_domain(struct dmar_domain *domain)
2896 {
2897         unsigned long flags;
2898         struct dmar_drhd_unit *drhd;
2899         struct intel_iommu *iommu;
2900         unsigned long i;
2901         unsigned long ndomains;
2902
2903         for_each_drhd_unit(drhd) {
2904                 if (drhd->ignored)
2905                         continue;
2906                 iommu = drhd->iommu;
2907
2908                 ndomains = cap_ndoms(iommu->cap);
2909                 i = find_first_bit(iommu->domain_ids, ndomains);
2910                 for (; i < ndomains; ) {
2911                         if (iommu->domains[i] == domain) {
2912                                 spin_lock_irqsave(&iommu->lock, flags);
2913                                 clear_bit(i, iommu->domain_ids);
2914                                 iommu->domains[i] = NULL;
2915                                 spin_unlock_irqrestore(&iommu->lock, flags);
2916                                 break;
2917                         }
2918                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2919                 }
2920         }
2921 }
2922
2923 static void vm_domain_exit(struct dmar_domain *domain)
2924 {
2925         u64 end;
2926
2927         /* Domain 0 is reserved, so don't process it */
2928         if (!domain)
2929                 return;
2930
2931         vm_domain_remove_all_dev_info(domain);
2932         /* destroy iovas */
2933         put_iova_domain(&domain->iovad);
2934         end = DOMAIN_MAX_ADDR(domain->gaw);
2935         end = end & (~VTD_PAGE_MASK);
2936
2937         /* clear ptes */
2938         dma_pte_clear_range(domain, 0, end);
2939
2940         /* free page tables */
2941         dma_pte_free_pagetable(domain, 0, end);
2942
2943         iommu_free_vm_domain(domain);
2944         free_domain_mem(domain);
2945 }
2946
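/*
 * Allocate a virtual machine domain (DOMAIN_FLAG_VIRTUAL_MACHINE) with the
 * default address width.  No iommu is bound to the domain until a device
 * is attached.  Returns the new domain, or NULL on failure.
 */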
2947 struct dmar_domain *intel_iommu_alloc_domain(void)
2948 {
2949         struct dmar_domain *domain;
2950
2951         domain = iommu_alloc_vm_domain();
2952         if (!domain) {
2953                 printk(KERN_ERR
2954                         "intel_iommu_alloc_domain: domain == NULL\n");
2955                 return NULL;
2956         }
2957         if (vm_domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2958                 printk(KERN_ERR
2959                         "intel_iommu_alloc_domain: vm_domain_init() failed\n");
2960                 vm_domain_exit(domain);
2961                 return NULL;
2962         }
2963
2964         return domain;
2965 }
2966 EXPORT_SYMBOL_GPL(intel_iommu_alloc_domain);
2967
2968 void intel_iommu_free_domain(struct dmar_domain *domain)
2969 {
2970         vm_domain_exit(domain);
2971 }
2972 EXPORT_SYMBOL_GPL(intel_iommu_free_domain);
2973
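/*
 * Attach a PCI device to a virtual machine domain.  If the device is
 * already context-mapped, it is first removed from its old domain; a
 * context mapping for the new domain is then installed and the device is
 * added to the domain's device list.  Returns 0 on success or a negative
 * errno.
 */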
2974 int intel_iommu_attach_device(struct dmar_domain *domain,
2975                               struct pci_dev *pdev)
2976 {
2977         int ret;
2978
2979         /* normally pdev is not mapped */
2980         if (unlikely(domain_context_mapped(pdev))) {
2981                 struct dmar_domain *old_domain;
2982
2983                 old_domain = find_domain(pdev);
2984                 if (old_domain) {
2985                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
2986                                 vm_domain_remove_one_dev_info(old_domain, pdev);
2987                         else
2988                                 domain_remove_dev_info(old_domain);
2989                 }
2990         }
2991
2992         ret = domain_context_mapping(domain, pdev);
2993         if (ret)
2994                 return ret;
2995
2996         ret = vm_domain_add_dev_info(domain, pdev);
2997         return ret;
2998 }
2999 EXPORT_SYMBOL_GPL(intel_iommu_attach_device);
3000
3001 void intel_iommu_detach_device(struct dmar_domain *domain,
3002                                struct pci_dev *pdev)
3003 {
3004         vm_domain_remove_one_dev_info(domain, pdev);
3005 }
3006 EXPORT_SYMBOL_GPL(intel_iommu_detach_device);
3007
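/*
 * Map an IO virtual address range to host physical memory in a virtual
 * machine domain.  prot takes DMA_PTE_READ and/or DMA_PTE_WRITE.  Thin
 * wrapper around domain_page_mapping(); returns 0 on success or a
 * negative errno.
 */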
3008 int intel_iommu_map_address(struct dmar_domain *domain, dma_addr_t iova,
3009                             u64 hpa, size_t size, int prot)
3010 {
3011         int ret;
3012         ret = domain_page_mapping(domain, iova, hpa, size, prot);
3013         return ret;
3014 }
3015 EXPORT_SYMBOL_GPL(intel_iommu_map_address);
3016
3017 void intel_iommu_unmap_address(struct dmar_domain *domain,
3018                                dma_addr_t iova, size_t size)
3019 {
3020         dma_addr_t base;
3021
3022         /* The address might not be aligned */
3023         base = iova & VTD_PAGE_MASK;
3024         size = VTD_PAGE_ALIGN(size);
3025         dma_pte_clear_range(domain, base, base + size);
3026 }
3027 EXPORT_SYMBOL_GPL(intel_iommu_unmap_address);
3028
3029 int intel_iommu_found(void)
3030 {
3031         return g_num_of_iommus;
3032 }
3033 EXPORT_SYMBOL_GPL(intel_iommu_found);
3034
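/*
 * Translate an IO virtual address through a domain's page table.  Returns
 * the physical address field of the PTE covering iova, or 0 if no PTE
 * exists for that address.
 */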
3035 u64 intel_iommu_iova_to_phys(struct dmar_domain *domain, u64 iova)
3036 {
3037         struct dma_pte *pte;
3038         u64 phys = 0;
3039
3040         pte = addr_to_dma_pte(domain, iova);
3041         if (pte)
3042                 phys = dma_pte_addr(pte);
3043
3044         return phys;
3045 }
3046 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_phys);
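
/*
 * Illustrative sketch (not part of the driver): one way a hypothetical
 * caller, e.g. a device-assignment path, might drive the exported virtual
 * machine domain API above.  The function example_assign_device() and the
 * constants EXAMPLE_IOVA and EXAMPLE_SIZE below are made up for
 * illustration only.
 *
 *	static int example_assign_device(struct pci_dev *pdev, u64 hpa)
 *	{
 *		struct dmar_domain *domain;
 *		int ret;
 *
 *		if (!intel_iommu_found())
 *			return -ENODEV;
 *
 *		domain = intel_iommu_alloc_domain();
 *		if (!domain)
 *			return -ENOMEM;
 *
 *		ret = intel_iommu_attach_device(domain, pdev);
 *		if (ret)
 *			goto free_domain;
 *
 *		ret = intel_iommu_map_address(domain, EXAMPLE_IOVA, hpa,
 *					      EXAMPLE_SIZE,
 *					      DMA_PTE_READ | DMA_PTE_WRITE);
 *		if (ret)
 *			goto detach;
 *
 *		(device performs DMA against EXAMPLE_IOVA here)
 *
 *		intel_iommu_unmap_address(domain, EXAMPLE_IOVA, EXAMPLE_SIZE);
 *	detach:
 *		intel_iommu_detach_device(domain, pdev);
 *	free_domain:
 *		intel_iommu_free_domain(domain);
 *		return ret;
 *	}
 */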