VT-d: adapt device attach and detach functions for IOMMU API
[safe/jmp/linux-2.6] drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
60
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 /*
65  * 0: Present
66  * 1-11: Reserved
67  * 12-63: Context Ptr (12 - (haw-1))
68  * 64-127: Reserved
69  */
70 struct root_entry {
71         u64     val;
72         u64     rsvd1;
73 };
74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
75 static inline bool root_present(struct root_entry *root)
76 {
77         return (root->val & 1);
78 }
79 static inline void set_root_present(struct root_entry *root)
80 {
81         root->val |= 1;
82 }
83 static inline void set_root_value(struct root_entry *root, unsigned long value)
84 {
85         root->val |= value & VTD_PAGE_MASK;
86 }
87
88 static inline struct context_entry *
89 get_context_addr_from_root(struct root_entry *root)
90 {
91         return (struct context_entry *)
92                 (root_present(root)?phys_to_virt(
93                 root->val & VTD_PAGE_MASK) :
94                 NULL);
95 }
96
97 /*
98  * low 64 bits:
99  * 0: present
100  * 1: fault processing disable
101  * 2-3: translation type
102  * 12-63: address space root
103  * high 64 bits:
104  * 0-2: address width
105  * 3-6: avail
106  * 8-23: domain id
107  */
108 struct context_entry {
109         u64 lo;
110         u64 hi;
111 };
112
113 static inline bool context_present(struct context_entry *context)
114 {
115         return (context->lo & 1);
116 }
117 static inline void context_set_present(struct context_entry *context)
118 {
119         context->lo |= 1;
120 }
121
122 static inline void context_set_fault_enable(struct context_entry *context)
123 {
124         context->lo &= (((u64)-1) << 2) | 1;
125 }
126
127 #define CONTEXT_TT_MULTI_LEVEL 0
128
129 static inline void context_set_translation_type(struct context_entry *context,
130                                                 unsigned long value)
131 {
132         context->lo &= (((u64)-1) << 4) | 3;
133         context->lo |= (value & 3) << 2;
134 }
135
136 static inline void context_set_address_root(struct context_entry *context,
137                                             unsigned long value)
138 {
139         context->lo |= value & VTD_PAGE_MASK;
140 }
141
142 static inline void context_set_address_width(struct context_entry *context,
143                                              unsigned long value)
144 {
145         context->hi |= value & 7;
146 }
147
148 static inline void context_set_domain_id(struct context_entry *context,
149                                          unsigned long value)
150 {
151         context->hi |= (value & ((1 << 16) - 1)) << 8;
152 }
153
154 static inline void context_clear_entry(struct context_entry *context)
155 {
156         context->lo = 0;
157         context->hi = 0;
158 }
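
/*
 * Illustrative note (not part of the driver): composing a context entry
 * with the helpers above.  Assuming domain id 1, a 48-bit (4-level)
 * address width value of 2 and a page-table root at physical address
 * 0x12345000, the sequence
 *
 *	context_set_domain_id(ce, 1);
 *	context_set_address_width(ce, 2);
 *	context_set_address_root(ce, 0x12345000);
 *	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(ce);
 *	context_set_present(ce);
 *
 * yields ce->hi == 0x102 (domain id in bits 8-23, width in bits 0-2) and
 * ce->lo == 0x12345001 (root pointer in bits 12-63, present in bit 0).
 * The helpers mostly OR bits in, so a stale entry must be wiped with
 * context_clear_entry() first.
 */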
159
160 /*
161  * 0: readable
162  * 1: writable
163  * 2-6: reserved
164  * 7: super page
165  * 8-11: available
166  * 12-63: Host physcial address
167  */
168 struct dma_pte {
169         u64 val;
170 };
171
172 static inline void dma_clear_pte(struct dma_pte *pte)
173 {
174         pte->val = 0;
175 }
176
177 static inline void dma_set_pte_readable(struct dma_pte *pte)
178 {
179         pte->val |= DMA_PTE_READ;
180 }
181
182 static inline void dma_set_pte_writable(struct dma_pte *pte)
183 {
184         pte->val |= DMA_PTE_WRITE;
185 }
186
187 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
188 {
189         pte->val = (pte->val & ~3) | (prot & 3);
190 }
191
192 static inline u64 dma_pte_addr(struct dma_pte *pte)
193 {
194         return (pte->val & VTD_PAGE_MASK);
195 }
196
197 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
198 {
199         pte->val |= (addr & VTD_PAGE_MASK);
200 }
201
202 static inline bool dma_pte_present(struct dma_pte *pte)
203 {
204         return (pte->val & 3) != 0;
205 }
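
/*
 * Illustrative note (not part of the driver): a last-level pte that maps
 * host physical page 0x23456000 read/write can be built as
 *
 *	dma_clear_pte(pte);
 *	dma_set_pte_addr(pte, 0x23456000ULL);
 *	dma_set_pte_prot(pte, DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * giving pte->val == 0x23456003 (assuming DMA_PTE_READ == 1 and
 * DMA_PTE_WRITE == 2, matching the bit layout above): bits 0 and 1 are
 * the permission bits and bits 12-63 hold the page-frame address, which
 * is why dma_pte_present() simply tests (val & 3) != 0.
 */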
206
207 /* devices under the same p2p bridge are owned in one domain */
208 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
209
210 /* domain represents a virtual machine; more than one device
211  * across iommus may be owned by one domain, e.g. a kvm guest.
212  */
213 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
214
215 struct dmar_domain {
216         int     id;                     /* domain id */
217         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
218
219         struct list_head devices;       /* all devices' list */
220         struct iova_domain iovad;       /* iova's that belong to this domain */
221
222         struct dma_pte  *pgd;           /* virtual address */
223         spinlock_t      mapping_lock;   /* page table lock */
224         int             gaw;            /* max guest address width */
225
226         /* adjusted guest address width, 0 is level 2 30-bit */
227         int             agaw;
228
229         int             flags;          /* flags to find out type of domain */
230
231         int             iommu_coherency;/* indicate coherency of iommu access */
232         int             iommu_count;    /* reference count of iommu */
233         spinlock_t      iommu_lock;     /* protect iommu set in domain */
234         u64             max_addr;       /* maximum mapped address */
235 };
236
237 /* PCI domain-device relationship */
238 struct device_domain_info {
239         struct list_head link;  /* link to domain siblings */
240         struct list_head global; /* link to global list */
241         u8 bus;                 /* PCI bus number */
242         u8 devfn;               /* PCI devfn number */
243         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
244         struct dmar_domain *domain; /* pointer to domain */
245 };
246
247 static void flush_unmaps_timeout(unsigned long data);
248
249 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
250
251 #define HIGH_WATER_MARK 250
252 struct deferred_flush_tables {
253         int next;
254         struct iova *iova[HIGH_WATER_MARK];
255         struct dmar_domain *domain[HIGH_WATER_MARK];
256 };
257
258 static struct deferred_flush_tables *deferred_flush;
259
260 /* bitmap for indexing intel_iommus */
261 static int g_num_of_iommus;
262
263 static DEFINE_SPINLOCK(async_umap_flush_lock);
264 static LIST_HEAD(unmaps_to_do);
265
266 static int timer_on;
267 static long list_size;
268
269 static void domain_remove_dev_info(struct dmar_domain *domain);
270
271 int dmar_disabled;
272 static int __initdata dmar_map_gfx = 1;
273 static int dmar_forcedac;
274 static int intel_iommu_strict;
275
276 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
277 static DEFINE_SPINLOCK(device_domain_lock);
278 static LIST_HEAD(device_domain_list);
279
280 static int __init intel_iommu_setup(char *str)
281 {
282         if (!str)
283                 return -EINVAL;
284         while (*str) {
285                 if (!strncmp(str, "off", 3)) {
286                         dmar_disabled = 1;
287                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
288                 } else if (!strncmp(str, "igfx_off", 8)) {
289                         dmar_map_gfx = 0;
290                         printk(KERN_INFO
291                                 "Intel-IOMMU: disable GFX device mapping\n");
292                 } else if (!strncmp(str, "forcedac", 8)) {
293                         printk(KERN_INFO
294                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
295                         dmar_forcedac = 1;
296                 } else if (!strncmp(str, "strict", 6)) {
297                         printk(KERN_INFO
298                                 "Intel-IOMMU: disable batched IOTLB flush\n");
299                         intel_iommu_strict = 1;
300                 }
301
302                 str += strcspn(str, ",");
303                 while (*str == ',')
304                         str++;
305         }
306         return 0;
307 }
308 __setup("intel_iommu=", intel_iommu_setup);
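
/*
 * Illustrative note (not part of the driver): the parser above accepts a
 * comma-separated option list on the kernel command line, e.g.
 *
 *	intel_iommu=off
 *	intel_iommu=igfx_off,strict
 *
 * The first example disables the IOMMU entirely; the second keeps it on
 * but leaves the graphics device unmapped and disables batched IOTLB
 * flushing.
 */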
309
310 static struct kmem_cache *iommu_domain_cache;
311 static struct kmem_cache *iommu_devinfo_cache;
312 static struct kmem_cache *iommu_iova_cache;
313
314 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
315 {
316         unsigned int flags;
317         void *vaddr;
318
319         /* trying to avoid low memory issues */
320         flags = current->flags & PF_MEMALLOC;
321         current->flags |= PF_MEMALLOC;
322         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
323         current->flags &= (~PF_MEMALLOC | flags);
324         return vaddr;
325 }
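
/*
 * Illustrative note (not part of the driver): 'flags' holds only the
 * caller's original PF_MEMALLOC bit, so the final mask
 * (~PF_MEMALLOC | flags) is all-ones when PF_MEMALLOC was already set
 * and ~PF_MEMALLOC otherwise, i.e. the AND restores PF_MEMALLOC to its
 * previous state while leaving every other flag untouched.  The same
 * pattern is used by alloc_pgtable_page() below.
 */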
326
327
328 static inline void *alloc_pgtable_page(void)
329 {
330         unsigned int flags;
331         void *vaddr;
332
333         /* trying to avoid low memory issues */
334         flags = current->flags & PF_MEMALLOC;
335         current->flags |= PF_MEMALLOC;
336         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
337         current->flags &= (~PF_MEMALLOC | flags);
338         return vaddr;
339 }
340
341 static inline void free_pgtable_page(void *vaddr)
342 {
343         free_page((unsigned long)vaddr);
344 }
345
346 static inline void *alloc_domain_mem(void)
347 {
348         return iommu_kmem_cache_alloc(iommu_domain_cache);
349 }
350
351 static void free_domain_mem(void *vaddr)
352 {
353         kmem_cache_free(iommu_domain_cache, vaddr);
354 }
355
356 static inline void * alloc_devinfo_mem(void)
357 {
358         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
359 }
360
361 static inline void free_devinfo_mem(void *vaddr)
362 {
363         kmem_cache_free(iommu_devinfo_cache, vaddr);
364 }
365
366 struct iova *alloc_iova_mem(void)
367 {
368         return iommu_kmem_cache_alloc(iommu_iova_cache);
369 }
370
371 void free_iova_mem(struct iova *iova)
372 {
373         kmem_cache_free(iommu_iova_cache, iova);
374 }
375
376
377 static inline int width_to_agaw(int width);
378
379 /* calculate agaw for each iommu.
380  * "SAGAW" may be different across iommus, use a default agaw, and
381  * fall back to a smaller supported agaw for iommus that don't support the default.
382  */
383 int iommu_calculate_agaw(struct intel_iommu *iommu)
384 {
385         unsigned long sagaw;
386         int agaw = -1;
387
388         sagaw = cap_sagaw(iommu->cap);
389         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
390              agaw >= 0; agaw--) {
391                 if (test_bit(agaw, &sagaw))
392                         break;
393         }
394
395         return agaw;
396 }
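
/*
 * Illustrative note (not part of the driver): with the 48-bit default,
 * width_to_agaw(48) == (48 - 30) / 9 == 2.  If bit 2 of the hardware's
 * SAGAW field is clear, the loop above falls back to agaw 1 (39-bit,
 * see agaw_to_width()) and then to agaw 0 (30-bit), returning -1 only
 * if none of those widths is supported.
 */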
397
398 /* in the native case, each domain is associated with exactly one iommu */
399 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
400 {
401         int iommu_id;
402
403         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
404
405         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
406         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
407                 return NULL;
408
409         return g_iommus[iommu_id];
410 }
411
412 /* "Coherency" capability may be different across iommus */
413 static void domain_update_iommu_coherency(struct dmar_domain *domain)
414 {
415         int i;
416
417         domain->iommu_coherency = 1;
418
419         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
420         for (; i < g_num_of_iommus; ) {
421                 if (!ecap_coherent(g_iommus[i]->ecap)) {
422                         domain->iommu_coherency = 0;
423                         break;
424                 }
425                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
426         }
427 }
428
429 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
430 {
431         struct dmar_drhd_unit *drhd = NULL;
432         int i;
433
434         for_each_drhd_unit(drhd) {
435                 if (drhd->ignored)
436                         continue;
437
438                 for (i = 0; i < drhd->devices_cnt; i++)
439                         if (drhd->devices[i]->bus->number == bus &&
440                             drhd->devices[i]->devfn == devfn)
441                                 return drhd->iommu;
442
443                 if (drhd->include_all)
444                         return drhd->iommu;
445         }
446
447         return NULL;
448 }
449
450 static void domain_flush_cache(struct dmar_domain *domain,
451                                void *addr, int size)
452 {
453         if (!domain->iommu_coherency)
454                 clflush_cache_range(addr, size);
455 }
456
457 /* Gets context entry for a given bus and devfn */
458 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
459                 u8 bus, u8 devfn)
460 {
461         struct root_entry *root;
462         struct context_entry *context;
463         unsigned long phy_addr;
464         unsigned long flags;
465
466         spin_lock_irqsave(&iommu->lock, flags);
467         root = &iommu->root_entry[bus];
468         context = get_context_addr_from_root(root);
469         if (!context) {
470                 context = (struct context_entry *)alloc_pgtable_page();
471                 if (!context) {
472                         spin_unlock_irqrestore(&iommu->lock, flags);
473                         return NULL;
474                 }
475                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
476                 phy_addr = virt_to_phys((void *)context);
477                 set_root_value(root, phy_addr);
478                 set_root_present(root);
479                 __iommu_flush_cache(iommu, root, sizeof(*root));
480         }
481         spin_unlock_irqrestore(&iommu->lock, flags);
482         return &context[devfn];
483 }
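
/*
 * Illustrative note (not part of the driver): the root table has one
 * entry per PCI bus (ROOT_ENTRY_NR == 256 with 4KB pages), and each
 * entry points to a page holding 256 context entries, one per devfn.
 * Looking up bus 0x02, devfn 0x10 therefore reads root_entry[0x02],
 * allocates that bus's context page on first use, and returns
 * &context[0x10].
 */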
484
485 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
486 {
487         struct root_entry *root;
488         struct context_entry *context;
489         int ret;
490         unsigned long flags;
491
492         spin_lock_irqsave(&iommu->lock, flags);
493         root = &iommu->root_entry[bus];
494         context = get_context_addr_from_root(root);
495         if (!context) {
496                 ret = 0;
497                 goto out;
498         }
499         ret = context_present(&context[devfn]);
500 out:
501         spin_unlock_irqrestore(&iommu->lock, flags);
502         return ret;
503 }
504
505 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
506 {
507         struct root_entry *root;
508         struct context_entry *context;
509         unsigned long flags;
510
511         spin_lock_irqsave(&iommu->lock, flags);
512         root = &iommu->root_entry[bus];
513         context = get_context_addr_from_root(root);
514         if (context) {
515                 context_clear_entry(&context[devfn]);
516                 __iommu_flush_cache(iommu, &context[devfn], \
517                         sizeof(*context));
518         }
519         spin_unlock_irqrestore(&iommu->lock, flags);
520 }
521
522 static void free_context_table(struct intel_iommu *iommu)
523 {
524         struct root_entry *root;
525         int i;
526         unsigned long flags;
527         struct context_entry *context;
528
529         spin_lock_irqsave(&iommu->lock, flags);
530         if (!iommu->root_entry) {
531                 goto out;
532         }
533         for (i = 0; i < ROOT_ENTRY_NR; i++) {
534                 root = &iommu->root_entry[i];
535                 context = get_context_addr_from_root(root);
536                 if (context)
537                         free_pgtable_page(context);
538         }
539         free_pgtable_page(iommu->root_entry);
540         iommu->root_entry = NULL;
541 out:
542         spin_unlock_irqrestore(&iommu->lock, flags);
543 }
544
545 /* page table handling */
546 #define LEVEL_STRIDE            (9)
547 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
548
549 static inline int agaw_to_level(int agaw)
550 {
551         return agaw + 2;
552 }
553
554 static inline int agaw_to_width(int agaw)
555 {
556         return 30 + agaw * LEVEL_STRIDE;
557
558 }
559
560 static inline int width_to_agaw(int width)
561 {
562         return (width - 30) / LEVEL_STRIDE;
563 }
564
565 static inline unsigned int level_to_offset_bits(int level)
566 {
567         return (12 + (level - 1) * LEVEL_STRIDE);
568 }
569
570 static inline int address_level_offset(u64 addr, int level)
571 {
572         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
573 }
574
575 static inline u64 level_mask(int level)
576 {
577         return ((u64)-1 << level_to_offset_bits(level));
578 }
579
580 static inline u64 level_size(int level)
581 {
582         return ((u64)1 << level_to_offset_bits(level));
583 }
584
585 static inline u64 align_to_level(u64 addr, int level)
586 {
587         return ((addr + level_size(level) - 1) & level_mask(level));
588 }
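
/*
 * Illustrative note (not part of the driver): for a domain with agaw 2
 * (48-bit width) agaw_to_level() gives a 4-level table, each level
 * indexed by LEVEL_STRIDE == 9 address bits.  For addr == 0x8080604000:
 *
 *	address_level_offset(addr, 4) == 1	(bits 39-47)
 *	address_level_offset(addr, 3) == 2	(bits 30-38)
 *	address_level_offset(addr, 2) == 3	(bits 21-29)
 *	address_level_offset(addr, 1) == 4	(bits 12-20)
 *
 * and level_size() is 4KB, 2MB and 1GB for levels 1, 2 and 3.
 */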
589
590 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
591 {
592         int addr_width = agaw_to_width(domain->agaw);
593         struct dma_pte *parent, *pte = NULL;
594         int level = agaw_to_level(domain->agaw);
595         int offset;
596         unsigned long flags;
597
598         BUG_ON(!domain->pgd);
599
600         addr &= (((u64)1) << addr_width) - 1;
601         parent = domain->pgd;
602
603         spin_lock_irqsave(&domain->mapping_lock, flags);
604         while (level > 0) {
605                 void *tmp_page;
606
607                 offset = address_level_offset(addr, level);
608                 pte = &parent[offset];
609                 if (level == 1)
610                         break;
611
612                 if (!dma_pte_present(pte)) {
613                         tmp_page = alloc_pgtable_page();
614
615                         if (!tmp_page) {
616                                 spin_unlock_irqrestore(&domain->mapping_lock,
617                                         flags);
618                                 return NULL;
619                         }
620                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
621                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
622                         /*
623                          * high level table always sets r/w, last level page
624                          * table control read/write
625                          */
626                         dma_set_pte_readable(pte);
627                         dma_set_pte_writable(pte);
628                         domain_flush_cache(domain, pte, sizeof(*pte));
629                 }
630                 parent = phys_to_virt(dma_pte_addr(pte));
631                 level--;
632         }
633
634         spin_unlock_irqrestore(&domain->mapping_lock, flags);
635         return pte;
636 }
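
/*
 * Illustrative note (not part of the driver): the walk above starts at
 * the top level (4 for a 48-bit domain) and descends to level 1,
 * allocating any missing intermediate tables on the way.  Intermediate
 * entries are always marked read/write; the access permissions chosen
 * by callers are applied only to the returned last-level pte.
 */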
637
638 /* return address's pte at specific level */
639 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
640                 int level)
641 {
642         struct dma_pte *parent, *pte = NULL;
643         int total = agaw_to_level(domain->agaw);
644         int offset;
645
646         parent = domain->pgd;
647         while (level <= total) {
648                 offset = address_level_offset(addr, total);
649                 pte = &parent[offset];
650                 if (level == total)
651                         return pte;
652
653                 if (!dma_pte_present(pte))
654                         break;
655                 parent = phys_to_virt(dma_pte_addr(pte));
656                 total--;
657         }
658         return NULL;
659 }
660
661 /* clear one page's page table */
662 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
663 {
664         struct dma_pte *pte = NULL;
665
666         /* get last level pte */
667         pte = dma_addr_level_pte(domain, addr, 1);
668
669         if (pte) {
670                 dma_clear_pte(pte);
671                 domain_flush_cache(domain, pte, sizeof(*pte));
672         }
673 }
674
675 /* clear last level pte; a tlb flush should follow */
676 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
677 {
678         int addr_width = agaw_to_width(domain->agaw);
679
680         start &= (((u64)1) << addr_width) - 1;
681         end &= (((u64)1) << addr_width) - 1;
682         /* in case it's a partial page */
683         start = PAGE_ALIGN(start);
684         end &= PAGE_MASK;
685
686         /* we don't need lock here, nobody else touches the iova range */
687         while (start < end) {
688                 dma_pte_clear_one(domain, start);
689                 start += VTD_PAGE_SIZE;
690         }
691 }
692
693 /* free page table pages. last level pte should already be cleared */
694 static void dma_pte_free_pagetable(struct dmar_domain *domain,
695         u64 start, u64 end)
696 {
697         int addr_width = agaw_to_width(domain->agaw);
698         struct dma_pte *pte;
699         int total = agaw_to_level(domain->agaw);
700         int level;
701         u64 tmp;
702
703         start &= (((u64)1) << addr_width) - 1;
704         end &= (((u64)1) << addr_width) - 1;
705
706         /* we don't need lock here, nobody else touches the iova range */
707         level = 2;
708         while (level <= total) {
709                 tmp = align_to_level(start, level);
710                 if (tmp >= end || (tmp + level_size(level) > end))
711                         return;
712
713                 while (tmp < end) {
714                         pte = dma_addr_level_pte(domain, tmp, level);
715                         if (pte) {
716                                 free_pgtable_page(
717                                         phys_to_virt(dma_pte_addr(pte)));
718                                 dma_clear_pte(pte);
719                                 domain_flush_cache(domain, pte, sizeof(*pte));
720                         }
721                         tmp += level_size(level);
722                 }
723                 level++;
724         }
725         /* free pgd */
726         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
727                 free_pgtable_page(domain->pgd);
728                 domain->pgd = NULL;
729         }
730 }
731
732 /* iommu handling */
733 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
734 {
735         struct root_entry *root;
736         unsigned long flags;
737
738         root = (struct root_entry *)alloc_pgtable_page();
739         if (!root)
740                 return -ENOMEM;
741
742         __iommu_flush_cache(iommu, root, ROOT_SIZE);
743
744         spin_lock_irqsave(&iommu->lock, flags);
745         iommu->root_entry = root;
746         spin_unlock_irqrestore(&iommu->lock, flags);
747
748         return 0;
749 }
750
751 static void iommu_set_root_entry(struct intel_iommu *iommu)
752 {
753         void *addr;
754         u32 cmd, sts;
755         unsigned long flag;
756
757         addr = iommu->root_entry;
758
759         spin_lock_irqsave(&iommu->register_lock, flag);
760         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
761
762         cmd = iommu->gcmd | DMA_GCMD_SRTP;
763         writel(cmd, iommu->reg + DMAR_GCMD_REG);
764
765         /* Make sure the hardware completes it */
766         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
767                 readl, (sts & DMA_GSTS_RTPS), sts);
768
769         spin_unlock_irqrestore(&iommu->register_lock, flag);
770 }
771
772 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
773 {
774         u32 val;
775         unsigned long flag;
776
777         if (!cap_rwbf(iommu->cap))
778                 return;
779         val = iommu->gcmd | DMA_GCMD_WBF;
780
781         spin_lock_irqsave(&iommu->register_lock, flag);
782         writel(val, iommu->reg + DMAR_GCMD_REG);
783
784         /* Make sure the hardware completes it */
785         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
786                         readl, (!(val & DMA_GSTS_WBFS)), val);
787
788         spin_unlock_irqrestore(&iommu->register_lock, flag);
789 }
790
791 /* return value determines whether we need a write buffer flush */
792 static int __iommu_flush_context(struct intel_iommu *iommu,
793         u16 did, u16 source_id, u8 function_mask, u64 type,
794         int non_present_entry_flush)
795 {
796         u64 val = 0;
797         unsigned long flag;
798
799         /*
800          * In the non-present entry flush case, if the hardware doesn't cache
801          * non-present entries we do nothing; if it does, we flush the entries
802          * of domain 0 (that domain id is used to cache any non-present
803          * entries)
804          */
805         if (non_present_entry_flush) {
806                 if (!cap_caching_mode(iommu->cap))
807                         return 1;
808                 else
809                         did = 0;
810         }
811
812         switch (type) {
813         case DMA_CCMD_GLOBAL_INVL:
814                 val = DMA_CCMD_GLOBAL_INVL;
815                 break;
816         case DMA_CCMD_DOMAIN_INVL:
817                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
818                 break;
819         case DMA_CCMD_DEVICE_INVL:
820                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
821                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
822                 break;
823         default:
824                 BUG();
825         }
826         val |= DMA_CCMD_ICC;
827
828         spin_lock_irqsave(&iommu->register_lock, flag);
829         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
830
831         /* Make sure the hardware completes it */
832         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
833                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
834
835         spin_unlock_irqrestore(&iommu->register_lock, flag);
836
837         /* flush context entry will implicitly flush write buffer */
838         return 0;
839 }
840
841 /* return value determines whether we need a write buffer flush */
842 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
843         u64 addr, unsigned int size_order, u64 type,
844         int non_present_entry_flush)
845 {
846         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
847         u64 val = 0, val_iva = 0;
848         unsigned long flag;
849
850         /*
851          * In the non-present entry flush case, if the hardware doesn't cache
852          * non-present entries we do nothing; if it does, we flush the entries
853          * of domain 0 (that domain id is used to cache any non-present
854          * entries)
855          */
856         if (non_present_entry_flush) {
857                 if (!cap_caching_mode(iommu->cap))
858                         return 1;
859                 else
860                         did = 0;
861         }
862
863         switch (type) {
864         case DMA_TLB_GLOBAL_FLUSH:
865                 /* global flush doesn't need to set IVA_REG */
866                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
867                 break;
868         case DMA_TLB_DSI_FLUSH:
869                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
870                 break;
871         case DMA_TLB_PSI_FLUSH:
872                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
873                 /* Note: always flush non-leaf currently */
874                 val_iva = size_order | addr;
875                 break;
876         default:
877                 BUG();
878         }
879         /* Note: set drain read/write */
880 #if 0
881         /*
882          * This is probably only here to be extra safe.  Looks like we can
883          * ignore it without any impact.
884          */
885         if (cap_read_drain(iommu->cap))
886                 val |= DMA_TLB_READ_DRAIN;
887 #endif
888         if (cap_write_drain(iommu->cap))
889                 val |= DMA_TLB_WRITE_DRAIN;
890
891         spin_lock_irqsave(&iommu->register_lock, flag);
892         /* Note: Only uses first TLB reg currently */
893         if (val_iva)
894                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
895         dmar_writeq(iommu->reg + tlb_offset + 8, val);
896
897         /* Make sure the hardware completes it */
898         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
899                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
900
901         spin_unlock_irqrestore(&iommu->register_lock, flag);
902
903         /* check IOTLB invalidation granularity */
904         if (DMA_TLB_IAIG(val) == 0)
905                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
906         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
907                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
908                         (unsigned long long)DMA_TLB_IIRG(type),
909                         (unsigned long long)DMA_TLB_IAIG(val));
910         /* flush iotlb entry will implicitly flush write buffer */
911         return 0;
912 }
913
914 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
915         u64 addr, unsigned int pages, int non_present_entry_flush)
916 {
917         unsigned int mask;
918
919         BUG_ON(addr & (~VTD_PAGE_MASK));
920         BUG_ON(pages == 0);
921
922         /* Fallback to domain selective flush if no PSI support */
923         if (!cap_pgsel_inv(iommu->cap))
924                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
925                                                 DMA_TLB_DSI_FLUSH,
926                                                 non_present_entry_flush);
927
928         /*
929          * PSI requires the region size to be 2 ^ x pages, with the base address
930          * naturally aligned to that size
931          */
932         mask = ilog2(__roundup_pow_of_two(pages));
933         /* Fallback to domain selective flush if size is too big */
934         if (mask > cap_max_amask_val(iommu->cap))
935                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
936                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
937
938         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
939                                         DMA_TLB_PSI_FLUSH,
940                                         non_present_entry_flush);
941 }
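
/*
 * Illustrative note (not part of the driver): for a 9-page request the
 * mask above is ilog2(__roundup_pow_of_two(9)) == ilog2(16) == 4, so the
 * hardware is asked to invalidate a naturally aligned 16-page (64KB)
 * region containing addr.  Requests whose mask would exceed
 * cap_max_amask_val() are degraded to a domain-selective flush instead,
 * as is everything when the hardware lacks page-selective invalidation.
 */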
942
943 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
944 {
945         u32 pmen;
946         unsigned long flags;
947
948         spin_lock_irqsave(&iommu->register_lock, flags);
949         pmen = readl(iommu->reg + DMAR_PMEN_REG);
950         pmen &= ~DMA_PMEN_EPM;
951         writel(pmen, iommu->reg + DMAR_PMEN_REG);
952
953         /* wait for the protected region status bit to clear */
954         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
955                 readl, !(pmen & DMA_PMEN_PRS), pmen);
956
957         spin_unlock_irqrestore(&iommu->register_lock, flags);
958 }
959
960 static int iommu_enable_translation(struct intel_iommu *iommu)
961 {
962         u32 sts;
963         unsigned long flags;
964
965         spin_lock_irqsave(&iommu->register_lock, flags);
966         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
967
968         /* Make sure the hardware completes it */
969         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
970                 readl, (sts & DMA_GSTS_TES), sts);
971
972         iommu->gcmd |= DMA_GCMD_TE;
973         spin_unlock_irqrestore(&iommu->register_lock, flags);
974         return 0;
975 }
976
977 static int iommu_disable_translation(struct intel_iommu *iommu)
978 {
979         u32 sts;
980         unsigned long flag;
981
982         spin_lock_irqsave(&iommu->register_lock, flag);
983         iommu->gcmd &= ~DMA_GCMD_TE;
984         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
985
986         /* Make sure the hardware completes it */
987         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
988                 readl, (!(sts & DMA_GSTS_TES)), sts);
989
990         spin_unlock_irqrestore(&iommu->register_lock, flag);
991         return 0;
992 }
993
994 /* iommu interrupt handling. Most of it is MSI-like. */
995
996 static const char *fault_reason_strings[] =
997 {
998         "Software",
999         "Present bit in root entry is clear",
1000         "Present bit in context entry is clear",
1001         "Invalid context entry",
1002         "Access beyond MGAW",
1003         "PTE Write access is not set",
1004         "PTE Read access is not set",
1005         "Next page table ptr is invalid",
1006         "Root table address invalid",
1007         "Context table ptr is invalid",
1008         "non-zero reserved fields in RTP",
1009         "non-zero reserved fields in CTP",
1010         "non-zero reserved fields in PTE",
1011 };
1012 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1013
1014 const char *dmar_get_fault_reason(u8 fault_reason)
1015 {
1016         if (fault_reason > MAX_FAULT_REASON_IDX)
1017                 return "Unknown";
1018         else
1019                 return fault_reason_strings[fault_reason];
1020 }
1021
1022 void dmar_msi_unmask(unsigned int irq)
1023 {
1024         struct intel_iommu *iommu = get_irq_data(irq);
1025         unsigned long flag;
1026
1027         /* unmask it */
1028         spin_lock_irqsave(&iommu->register_lock, flag);
1029         writel(0, iommu->reg + DMAR_FECTL_REG);
1030         /* Read a reg to force flush the post write */
1031         readl(iommu->reg + DMAR_FECTL_REG);
1032         spin_unlock_irqrestore(&iommu->register_lock, flag);
1033 }
1034
1035 void dmar_msi_mask(unsigned int irq)
1036 {
1037         unsigned long flag;
1038         struct intel_iommu *iommu = get_irq_data(irq);
1039
1040         /* mask it */
1041         spin_lock_irqsave(&iommu->register_lock, flag);
1042         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1043         /* Read a reg to force flush the post write */
1044         readl(iommu->reg + DMAR_FECTL_REG);
1045         spin_unlock_irqrestore(&iommu->register_lock, flag);
1046 }
1047
1048 void dmar_msi_write(int irq, struct msi_msg *msg)
1049 {
1050         struct intel_iommu *iommu = get_irq_data(irq);
1051         unsigned long flag;
1052
1053         spin_lock_irqsave(&iommu->register_lock, flag);
1054         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1055         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1056         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1057         spin_unlock_irqrestore(&iommu->register_lock, flag);
1058 }
1059
1060 void dmar_msi_read(int irq, struct msi_msg *msg)
1061 {
1062         struct intel_iommu *iommu = get_irq_data(irq);
1063         unsigned long flag;
1064
1065         spin_lock_irqsave(&iommu->register_lock, flag);
1066         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1067         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1068         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1069         spin_unlock_irqrestore(&iommu->register_lock, flag);
1070 }
1071
1072 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1073                 u8 fault_reason, u16 source_id, unsigned long long addr)
1074 {
1075         const char *reason;
1076
1077         reason = dmar_get_fault_reason(fault_reason);
1078
1079         printk(KERN_ERR
1080                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1081                 "fault addr %llx \n"
1082                 "DMAR:[fault reason %02d] %s\n",
1083                 (type ? "DMA Read" : "DMA Write"),
1084                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1085                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1086         return 0;
1087 }
1088
1089 #define PRIMARY_FAULT_REG_LEN (16)
1090 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1091 {
1092         struct intel_iommu *iommu = dev_id;
1093         int reg, fault_index;
1094         u32 fault_status;
1095         unsigned long flag;
1096
1097         spin_lock_irqsave(&iommu->register_lock, flag);
1098         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1099
1100         /* TBD: ignore advanced fault log currently */
1101         if (!(fault_status & DMA_FSTS_PPF))
1102                 goto clear_overflow;
1103
1104         fault_index = dma_fsts_fault_record_index(fault_status);
1105         reg = cap_fault_reg_offset(iommu->cap);
1106         while (1) {
1107                 u8 fault_reason;
1108                 u16 source_id;
1109                 u64 guest_addr;
1110                 int type;
1111                 u32 data;
1112
1113                 /* highest 32 bits */
1114                 data = readl(iommu->reg + reg +
1115                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1116                 if (!(data & DMA_FRCD_F))
1117                         break;
1118
1119                 fault_reason = dma_frcd_fault_reason(data);
1120                 type = dma_frcd_type(data);
1121
1122                 data = readl(iommu->reg + reg +
1123                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1124                 source_id = dma_frcd_source_id(data);
1125
1126                 guest_addr = dmar_readq(iommu->reg + reg +
1127                                 fault_index * PRIMARY_FAULT_REG_LEN);
1128                 guest_addr = dma_frcd_page_addr(guest_addr);
1129                 /* clear the fault */
1130                 writel(DMA_FRCD_F, iommu->reg + reg +
1131                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1132
1133                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1134
1135                 iommu_page_fault_do_one(iommu, type, fault_reason,
1136                                 source_id, guest_addr);
1137
1138                 fault_index++;
1139                 if (fault_index > cap_num_fault_regs(iommu->cap))
1140                         fault_index = 0;
1141                 spin_lock_irqsave(&iommu->register_lock, flag);
1142         }
1143 clear_overflow:
1144         /* clear primary fault overflow */
1145         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1146         if (fault_status & DMA_FSTS_PFO)
1147                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1148
1149         spin_unlock_irqrestore(&iommu->register_lock, flag);
1150         return IRQ_HANDLED;
1151 }
1152
1153 int dmar_set_interrupt(struct intel_iommu *iommu)
1154 {
1155         int irq, ret;
1156
1157         irq = create_irq();
1158         if (!irq) {
1159                 printk(KERN_ERR "IOMMU: no free vectors\n");
1160                 return -EINVAL;
1161         }
1162
1163         set_irq_data(irq, iommu);
1164         iommu->irq = irq;
1165
1166         ret = arch_setup_dmar_msi(irq);
1167         if (ret) {
1168                 set_irq_data(irq, NULL);
1169                 iommu->irq = 0;
1170                 destroy_irq(irq);
1171                 return 0;
1172         }
1173
1174         /* Force the fault register to be cleared */
1175         iommu_page_fault(irq, iommu);
1176
1177         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1178         if (ret)
1179                 printk(KERN_ERR "IOMMU: can't request irq\n");
1180         return ret;
1181 }
1182
1183 static int iommu_init_domains(struct intel_iommu *iommu)
1184 {
1185         unsigned long ndomains;
1186         unsigned long nlongs;
1187
1188         ndomains = cap_ndoms(iommu->cap);
1189         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1190         nlongs = BITS_TO_LONGS(ndomains);
1191
1192         /* TBD: there might be 64K domains,
1193          * consider a different allocation scheme for future chips
1194          */
1195         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1196         if (!iommu->domain_ids) {
1197                 printk(KERN_ERR "Allocating domain id array failed\n");
1198                 return -ENOMEM;
1199         }
1200         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1201                         GFP_KERNEL);
1202         if (!iommu->domains) {
1203                 printk(KERN_ERR "Allocating domain array failed\n");
1204                 kfree(iommu->domain_ids);
1205                 return -ENOMEM;
1206         }
1207
1208         spin_lock_init(&iommu->lock);
1209
1210         /*
1211          * if Caching mode is set, then invalid translations are tagged
1212          * with domainid 0. Hence we need to pre-allocate it.
1213          */
1214         if (cap_caching_mode(iommu->cap))
1215                 set_bit(0, iommu->domain_ids);
1216         return 0;
1217 }
1218
1219
1220 static void domain_exit(struct dmar_domain *domain);
1221 static void vm_domain_exit(struct dmar_domain *domain);
1222
1223 void free_dmar_iommu(struct intel_iommu *iommu)
1224 {
1225         struct dmar_domain *domain;
1226         int i;
1227         unsigned long flags;
1228
1229         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1230         for (; i < cap_ndoms(iommu->cap); ) {
1231                 domain = iommu->domains[i];
1232                 clear_bit(i, iommu->domain_ids);
1233
1234                 spin_lock_irqsave(&domain->iommu_lock, flags);
1235                 if (--domain->iommu_count == 0) {
1236                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1237                                 vm_domain_exit(domain);
1238                         else
1239                                 domain_exit(domain);
1240                 }
1241                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1242
1243                 i = find_next_bit(iommu->domain_ids,
1244                         cap_ndoms(iommu->cap), i+1);
1245         }
1246
1247         if (iommu->gcmd & DMA_GCMD_TE)
1248                 iommu_disable_translation(iommu);
1249
1250         if (iommu->irq) {
1251                 set_irq_data(iommu->irq, NULL);
1252                 /* This will mask the irq */
1253                 free_irq(iommu->irq, iommu);
1254                 destroy_irq(iommu->irq);
1255         }
1256
1257         kfree(iommu->domains);
1258         kfree(iommu->domain_ids);
1259
1260         g_iommus[iommu->seq_id] = NULL;
1261
1262         /* if all iommus are freed, free g_iommus */
1263         for (i = 0; i < g_num_of_iommus; i++) {
1264                 if (g_iommus[i])
1265                         break;
1266         }
1267
1268         if (i == g_num_of_iommus)
1269                 kfree(g_iommus);
1270
1271         /* free context mapping */
1272         free_context_table(iommu);
1273 }
1274
1275 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1276 {
1277         unsigned long num;
1278         unsigned long ndomains;
1279         struct dmar_domain *domain;
1280         unsigned long flags;
1281
1282         domain = alloc_domain_mem();
1283         if (!domain)
1284                 return NULL;
1285
1286         ndomains = cap_ndoms(iommu->cap);
1287
1288         spin_lock_irqsave(&iommu->lock, flags);
1289         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1290         if (num >= ndomains) {
1291                 spin_unlock_irqrestore(&iommu->lock, flags);
1292                 free_domain_mem(domain);
1293                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1294                 return NULL;
1295         }
1296
1297         set_bit(num, iommu->domain_ids);
1298         domain->id = num;
1299         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1300         set_bit(iommu->seq_id, &domain->iommu_bmp);
1301         domain->flags = 0;
1302         iommu->domains[num] = domain;
1303         spin_unlock_irqrestore(&iommu->lock, flags);
1304
1305         return domain;
1306 }
1307
1308 static void iommu_free_domain(struct dmar_domain *domain)
1309 {
1310         unsigned long flags;
1311         struct intel_iommu *iommu;
1312
1313         iommu = domain_get_iommu(domain);
1314
1315         spin_lock_irqsave(&iommu->lock, flags);
1316         clear_bit(domain->id, iommu->domain_ids);
1317         spin_unlock_irqrestore(&iommu->lock, flags);
1318 }
1319
1320 static struct iova_domain reserved_iova_list;
1321 static struct lock_class_key reserved_alloc_key;
1322 static struct lock_class_key reserved_rbtree_key;
1323
1324 static void dmar_init_reserved_ranges(void)
1325 {
1326         struct pci_dev *pdev = NULL;
1327         struct iova *iova;
1328         int i;
1329         u64 addr, size;
1330
1331         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1332
1333         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1334                 &reserved_alloc_key);
1335         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1336                 &reserved_rbtree_key);
1337
1338         /* IOAPIC ranges shouldn't be accessed by DMA */
1339         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1340                 IOVA_PFN(IOAPIC_RANGE_END));
1341         if (!iova)
1342                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1343
1344         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1345         for_each_pci_dev(pdev) {
1346                 struct resource *r;
1347
1348                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1349                         r = &pdev->resource[i];
1350                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1351                                 continue;
1352                         addr = r->start;
1353                         addr &= PAGE_MASK;
1354                         size = r->end - addr;
1355                         size = PAGE_ALIGN(size);
1356                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1357                                 IOVA_PFN(size + addr) - 1);
1358                         if (!iova)
1359                                 printk(KERN_ERR "Reserve iova failed\n");
1360                 }
1361         }
1362
1363 }
1364
1365 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1366 {
1367         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1368 }
1369
1370 static inline int guestwidth_to_adjustwidth(int gaw)
1371 {
1372         int agaw;
1373         int r = (gaw - 12) % 9;
1374
1375         if (r == 0)
1376                 agaw = gaw;
1377         else
1378                 agaw = gaw + 9 - r;
1379         if (agaw > 64)
1380                 agaw = 64;
1381         return agaw;
1382 }
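
/*
 * Illustrative note (not part of the driver): the adjusted width is the
 * guest width rounded up so that (width - 12) is a multiple of 9, i.e.
 * so it maps onto a whole number of page-table levels:
 *
 *	guestwidth_to_adjustwidth(48) == 48	(4 levels)
 *	guestwidth_to_adjustwidth(40) == 48
 *	guestwidth_to_adjustwidth(36) == 39	(3 levels)
 */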
1383
1384 static int domain_init(struct dmar_domain *domain, int guest_width)
1385 {
1386         struct intel_iommu *iommu;
1387         int adjust_width, agaw;
1388         unsigned long sagaw;
1389
1390         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1391         spin_lock_init(&domain->mapping_lock);
1392         spin_lock_init(&domain->iommu_lock);
1393
1394         domain_reserve_special_ranges(domain);
1395
1396         /* calculate AGAW */
1397         iommu = domain_get_iommu(domain);
1398         if (guest_width > cap_mgaw(iommu->cap))
1399                 guest_width = cap_mgaw(iommu->cap);
1400         domain->gaw = guest_width;
1401         adjust_width = guestwidth_to_adjustwidth(guest_width);
1402         agaw = width_to_agaw(adjust_width);
1403         sagaw = cap_sagaw(iommu->cap);
1404         if (!test_bit(agaw, &sagaw)) {
1405                 /* hardware doesn't support it, choose a bigger one */
1406                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1407                 agaw = find_next_bit(&sagaw, 5, agaw);
1408                 if (agaw >= 5)
1409                         return -ENODEV;
1410         }
1411         domain->agaw = agaw;
1412         INIT_LIST_HEAD(&domain->devices);
1413
1414         if (ecap_coherent(iommu->ecap))
1415                 domain->iommu_coherency = 1;
1416         else
1417                 domain->iommu_coherency = 0;
1418
1419         domain->iommu_count = 1;
1420
1421         /* always allocate the top pgd */
1422         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1423         if (!domain->pgd)
1424                 return -ENOMEM;
1425         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1426         return 0;
1427 }
1428
1429 static void domain_exit(struct dmar_domain *domain)
1430 {
1431         u64 end;
1432
1433         /* Domain 0 is reserved, so don't process it */
1434         if (!domain)
1435                 return;
1436
1437         domain_remove_dev_info(domain);
1438         /* destroy iovas */
1439         put_iova_domain(&domain->iovad);
1440         end = DOMAIN_MAX_ADDR(domain->gaw);
1441         end = end & (~PAGE_MASK);
1442
1443         /* clear ptes */
1444         dma_pte_clear_range(domain, 0, end);
1445
1446         /* free page tables */
1447         dma_pte_free_pagetable(domain, 0, end);
1448
1449         iommu_free_domain(domain);
1450         free_domain_mem(domain);
1451 }
1452
1453 static int domain_context_mapping_one(struct dmar_domain *domain,
1454                 u8 bus, u8 devfn)
1455 {
1456         struct context_entry *context;
1457         unsigned long flags;
1458         struct intel_iommu *iommu;
1459         struct dma_pte *pgd;
1460         unsigned long num;
1461         unsigned long ndomains;
1462         int id;
1463         int agaw;
1464
1465         pr_debug("Set context mapping for %02x:%02x.%d\n",
1466                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1467         BUG_ON(!domain->pgd);
1468
1469         iommu = device_to_iommu(bus, devfn);
1470         if (!iommu)
1471                 return -ENODEV;
1472
1473         context = device_to_context_entry(iommu, bus, devfn);
1474         if (!context)
1475                 return -ENOMEM;
1476         spin_lock_irqsave(&iommu->lock, flags);
1477         if (context_present(context)) {
1478                 spin_unlock_irqrestore(&iommu->lock, flags);
1479                 return 0;
1480         }
1481
1482         id = domain->id;
1483         pgd = domain->pgd;
1484
1485         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1486                 int found = 0;
1487
1488                 /* find an available domain id for this device in iommu */
1489                 ndomains = cap_ndoms(iommu->cap);
1490                 num = find_first_bit(iommu->domain_ids, ndomains);
1491                 for (; num < ndomains; ) {
1492                         if (iommu->domains[num] == domain) {
1493                                 id = num;
1494                                 found = 1;
1495                                 break;
1496                         }
1497                         num = find_next_bit(iommu->domain_ids,
1498                                             cap_ndoms(iommu->cap), num+1);
1499                 }
1500
1501                 if (found == 0) {
1502                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1503                         if (num >= ndomains) {
1504                                 spin_unlock_irqrestore(&iommu->lock, flags);
1505                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1506                                 return -EFAULT;
1507                         }
1508
1509                         set_bit(num, iommu->domain_ids);
1510                         iommu->domains[num] = domain;
1511                         id = num;
1512                 }
1513
1514                 /* Skip top levels of page tables for
1515                  * iommus which have a smaller agaw than the default.
1516                  */
1517                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1518                         pgd = phys_to_virt(dma_pte_addr(pgd));
1519                         if (!dma_pte_present(pgd)) {
1520                                 spin_unlock_irqrestore(&iommu->lock, flags);
1521                                 return -ENOMEM;
1522                         }
1523                 }
1524         }
1525
1526         context_set_domain_id(context, id);
1527         context_set_address_width(context, iommu->agaw);
1528         context_set_address_root(context, virt_to_phys(pgd));
1529         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1530         context_set_fault_enable(context);
1531         context_set_present(context);
1532         domain_flush_cache(domain, context, sizeof(*context));
1533
1534         /* it's a non-present to present mapping */
1535         if (iommu->flush.flush_context(iommu, domain->id,
1536                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1537                 DMA_CCMD_DEVICE_INVL, 1))
1538                 iommu_flush_write_buffer(iommu);
1539         else
1540                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1541
1542         spin_unlock_irqrestore(&iommu->lock, flags);
1543
1544         spin_lock_irqsave(&domain->iommu_lock, flags);
1545         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1546                 domain->iommu_count++;
1547                 domain_update_iommu_coherency(domain);
1548         }
1549         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1550         return 0;
1551 }
1552
1553 static int
1554 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1555 {
1556         int ret;
1557         struct pci_dev *tmp, *parent;
1558
1559         ret = domain_context_mapping_one(domain, pdev->bus->number,
1560                 pdev->devfn);
1561         if (ret)
1562                 return ret;
1563
1564         /* dependent device mapping */
1565         tmp = pci_find_upstream_pcie_bridge(pdev);
1566         if (!tmp)
1567                 return 0;
1568         /* Secondary interface's bus number and devfn 0 */
1569         parent = pdev->bus->self;
1570         while (parent != tmp) {
1571                 ret = domain_context_mapping_one(domain, parent->bus->number,
1572                         parent->devfn);
1573                 if (ret)
1574                         return ret;
1575                 parent = parent->bus->self;
1576         }
1577         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1578                 return domain_context_mapping_one(domain,
1579                         tmp->subordinate->number, 0);
1580         else /* this is a legacy PCI bridge */
1581                 return domain_context_mapping_one(domain,
1582                         tmp->bus->number, tmp->devfn);
1583 }
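
/*
 * Illustrative note (not part of the driver): for a device sitting
 * behind bridges, the walk above also programs a context entry for each
 * intermediate bridge, and for the upstream PCIe-to-PCI bridge it uses
 * the bridge's secondary bus number with devfn 0 (a legacy PCI bridge is
 * mapped by its own bus/devfn).  This matches the requester id such
 * devices present to the IOMMU, which may be the bridge's rather than
 * their own.
 */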
1584
1585 static int domain_context_mapped(struct pci_dev *pdev)
1586 {
1587         int ret;
1588         struct pci_dev *tmp, *parent;
1589         struct intel_iommu *iommu;
1590
1591         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1592         if (!iommu)
1593                 return -ENODEV;
1594
1595         ret = device_context_mapped(iommu,
1596                 pdev->bus->number, pdev->devfn);
1597         if (!ret)
1598                 return ret;
1599         /* dependent device mapping */
1600         tmp = pci_find_upstream_pcie_bridge(pdev);
1601         if (!tmp)
1602                 return ret;
1603         /* Secondary interface's bus number and devfn 0 */
1604         parent = pdev->bus->self;
1605         while (parent != tmp) {
1606                 ret = device_context_mapped(iommu, parent->bus->number,
1607                         parent->devfn);
1608                 if (!ret)
1609                         return ret;
1610                 parent = parent->bus->self;
1611         }
1612         if (tmp->is_pcie)
1613                 return device_context_mapped(iommu,
1614                         tmp->subordinate->number, 0);
1615         else
1616                 return device_context_mapped(iommu,
1617                         tmp->bus->number, tmp->devfn);
1618 }
1619
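/*
 * Map the physical range [hpa, hpa + size) at the IO virtual address
 * iova, one VT-d page at a time, with the given protection bits.  For
 * example, the identity-map code below establishes a 1:1 mapping of a
 * reserved region with:
 *
 *	domain_page_mapping(domain, base, base, size,
 *			    DMA_PTE_READ|DMA_PTE_WRITE);
 */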
1620 static int
1621 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1622                         u64 hpa, size_t size, int prot)
1623 {
1624         u64 start_pfn, end_pfn;
1625         struct dma_pte *pte;
1626         int index;
1627         int addr_width = agaw_to_width(domain->agaw);
1628
1629         hpa &= (((u64)1) << addr_width) - 1;
1630
1631         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1632                 return -EINVAL;
1633         iova &= PAGE_MASK;
1634         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1635         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1636         index = 0;
1637         while (start_pfn < end_pfn) {
1638                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1639                 if (!pte)
1640                         return -ENOMEM;
1641                 /* We don't need lock here, nobody else
1642                  * touches the iova range
1643                  */
1644                 BUG_ON(dma_pte_addr(pte));
1645                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1646                 dma_set_pte_prot(pte, prot);
1647                 domain_flush_cache(domain, pte, sizeof(*pte));
1648                 start_pfn++;
1649                 index++;
1650         }
1651         return 0;
1652 }
1653
1654 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1655 {
1656         if (!iommu)
1657                 return;
1658
1659         clear_context_table(iommu, bus, devfn);
1660         iommu->flush.flush_context(iommu, 0, 0, 0,
1661                                            DMA_CCMD_GLOBAL_INVL, 0);
1662         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1663                                          DMA_TLB_GLOBAL_FLUSH, 0);
1664 }
1665
1666 static void domain_remove_dev_info(struct dmar_domain *domain)
1667 {
1668         struct device_domain_info *info;
1669         unsigned long flags;
1670         struct intel_iommu *iommu;
1671
1672         spin_lock_irqsave(&device_domain_lock, flags);
1673         while (!list_empty(&domain->devices)) {
1674                 info = list_entry(domain->devices.next,
1675                         struct device_domain_info, link);
1676                 list_del(&info->link);
1677                 list_del(&info->global);
1678                 if (info->dev)
1679                         info->dev->dev.archdata.iommu = NULL;
1680                 spin_unlock_irqrestore(&device_domain_lock, flags);
1681
1682                 iommu = device_to_iommu(info->bus, info->devfn);
1683                 iommu_detach_dev(iommu, info->bus, info->devfn);
1684                 free_devinfo_mem(info);
1685
1686                 spin_lock_irqsave(&device_domain_lock, flags);
1687         }
1688         spin_unlock_irqrestore(&device_domain_lock, flags);
1689 }
1690
1691 /*
1692  * find_domain
1693  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1694  */
1695 static struct dmar_domain *
1696 find_domain(struct pci_dev *pdev)
1697 {
1698         struct device_domain_info *info;
1699
1700         /* No lock here, assumes no domain exit in normal case */
1701         info = pdev->dev.archdata.iommu;
1702         if (info)
1703                 return info->domain;
1704         return NULL;
1705 }
1706
1707 /* domain is initialized */
1708 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1709 {
1710         struct dmar_domain *domain, *found = NULL;
1711         struct intel_iommu *iommu;
1712         struct dmar_drhd_unit *drhd;
1713         struct device_domain_info *info, *tmp;
1714         struct pci_dev *dev_tmp;
1715         unsigned long flags;
1716         int bus = 0, devfn = 0;
1717
1718         domain = find_domain(pdev);
1719         if (domain)
1720                 return domain;
1721
1722         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1723         if (dev_tmp) {
1724                 if (dev_tmp->is_pcie) {
1725                         bus = dev_tmp->subordinate->number;
1726                         devfn = 0;
1727                 } else {
1728                         bus = dev_tmp->bus->number;
1729                         devfn = dev_tmp->devfn;
1730                 }
1731                 spin_lock_irqsave(&device_domain_lock, flags);
1732                 list_for_each_entry(info, &device_domain_list, global) {
1733                         if (info->bus == bus && info->devfn == devfn) {
1734                                 found = info->domain;
1735                                 break;
1736                         }
1737                 }
1738                 spin_unlock_irqrestore(&device_domain_lock, flags);
1739                 /* pcie-pci bridge already has a domain, use it */
1740                 if (found) {
1741                         domain = found;
1742                         goto found_domain;
1743                 }
1744         }
1745
1746         /* Allocate new domain for the device */
1747         drhd = dmar_find_matched_drhd_unit(pdev);
1748         if (!drhd) {
1749                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1750                         pci_name(pdev));
1751                 return NULL;
1752         }
1753         iommu = drhd->iommu;
1754
1755         domain = iommu_alloc_domain(iommu);
1756         if (!domain)
1757                 goto error;
1758
1759         if (domain_init(domain, gaw)) {
1760                 domain_exit(domain);
1761                 goto error;
1762         }
1763
1764         /* register pcie-to-pci device */
1765         if (dev_tmp) {
1766                 info = alloc_devinfo_mem();
1767                 if (!info) {
1768                         domain_exit(domain);
1769                         goto error;
1770                 }
1771                 info->bus = bus;
1772                 info->devfn = devfn;
1773                 info->dev = NULL;
1774                 info->domain = domain;
1775                 /* This domain is shared by devices under p2p bridge */
1776                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1777
1778                 /* pcie-to-pci bridge already has a domain, use it */
1779                 found = NULL;
1780                 spin_lock_irqsave(&device_domain_lock, flags);
1781                 list_for_each_entry(tmp, &device_domain_list, global) {
1782                         if (tmp->bus == bus && tmp->devfn == devfn) {
1783                                 found = tmp->domain;
1784                                 break;
1785                         }
1786                 }
1787                 if (found) {
1788                         free_devinfo_mem(info);
1789                         domain_exit(domain);
1790                         domain = found;
1791                 } else {
1792                         list_add(&info->link, &domain->devices);
1793                         list_add(&info->global, &device_domain_list);
1794                 }
1795                 spin_unlock_irqrestore(&device_domain_lock, flags);
1796         }
1797
1798 found_domain:
1799         info = alloc_devinfo_mem();
1800         if (!info)
1801                 goto error;
1802         info->bus = pdev->bus->number;
1803         info->devfn = pdev->devfn;
1804         info->dev = pdev;
1805         info->domain = domain;
1806         spin_lock_irqsave(&device_domain_lock, flags);
1807         /* somebody else was faster and set it up already */
1808         found = find_domain(pdev);
1809         if (found != NULL) {
1810                 spin_unlock_irqrestore(&device_domain_lock, flags);
1811                 if (found != domain) {
1812                         domain_exit(domain);
1813                         domain = found;
1814                 }
1815                 free_devinfo_mem(info);
1816                 return domain;
1817         }
1818         list_add(&info->link, &domain->devices);
1819         list_add(&info->global, &device_domain_list);
1820         pdev->dev.archdata.iommu = info;
1821         spin_unlock_irqrestore(&device_domain_lock, flags);
1822         return domain;
1823 error:
1824         /* recheck it here, maybe others set it */
1825         return find_domain(pdev);
1826 }
1827
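/*
 * Identity-map [start, end) for pdev: reserve the matching iova range,
 * clear any stale PTEs, install a 1:1 mapping and set up the context
 * entry.  Used for RMRR regions and for the graphics and ISA/floppy
 * work-arounds below.
 */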
1828 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1829                                       unsigned long long start,
1830                                       unsigned long long end)
1831 {
1832         struct dmar_domain *domain;
1833         unsigned long size;
1834         unsigned long long base;
1835         int ret;
1836
1837         printk(KERN_INFO
1838                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1839                 pci_name(pdev), start, end);
1840         /* page table init */
1841         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1842         if (!domain)
1843                 return -ENOMEM;
1844
1845         /* The address might not be aligned */
1846         base = start & PAGE_MASK;
1847         size = end - base;
1848         size = PAGE_ALIGN(size);
1849         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1850                         IOVA_PFN(base + size) - 1)) {
1851                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1852                 ret = -ENOMEM;
1853                 goto error;
1854         }
1855
1856         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1857                 size, base, pci_name(pdev));
1858         /*
1859          * RMRR range might overlap with the physical memory range,
1860          * clear it first
1861          */
1862         dma_pte_clear_range(domain, base, base + size);
1863
1864         ret = domain_page_mapping(domain, base, base, size,
1865                 DMA_PTE_READ|DMA_PTE_WRITE);
1866         if (ret)
1867                 goto error;
1868
1869         /* context entry init */
1870         ret = domain_context_mapping(domain, pdev);
1871         if (!ret)
1872                 return 0;
1873 error:
1874         domain_exit(domain);
1875         return ret;
1876
1877 }
1878
1879 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1880         struct pci_dev *pdev)
1881 {
1882         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1883                 return 0;
1884         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1885                 rmrr->end_address + 1);
1886 }
1887
1888 #ifdef CONFIG_DMAR_GFX_WA
1889 struct iommu_prepare_data {
1890         struct pci_dev *pdev;
1891         int ret;
1892 };
1893
1894 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1895                                          unsigned long end_pfn, void *datax)
1896 {
1897         struct iommu_prepare_data *data;
1898
1899         data = (struct iommu_prepare_data *)datax;
1900
1901         data->ret = iommu_prepare_identity_map(data->pdev,
1902                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1903         return data->ret;
1904
1905 }
1906
1907 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1908 {
1909         int nid;
1910         struct iommu_prepare_data data;
1911
1912         data.pdev = pdev;
1913         data.ret = 0;
1914
1915         for_each_online_node(nid) {
1916                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1917                 if (data.ret)
1918                         return data.ret;
1919         }
1920         return data.ret;
1921 }
1922
1923 static void __init iommu_prepare_gfx_mapping(void)
1924 {
1925         struct pci_dev *pdev = NULL;
1926         int ret;
1927
1928         for_each_pci_dev(pdev) {
1929                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1930                                 !IS_GFX_DEVICE(pdev))
1931                         continue;
1932                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1933                         pci_name(pdev));
1934                 ret = iommu_prepare_with_active_regions(pdev);
1935                 if (ret)
1936                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1937         }
1938 }
1939 #else /* !CONFIG_DMAR_GFX_WA */
1940 static inline void iommu_prepare_gfx_mapping(void)
1941 {
1942         return;
1943 }
1944 #endif
1945
1946 #ifdef CONFIG_DMAR_FLOPPY_WA
1947 static inline void iommu_prepare_isa(void)
1948 {
1949         struct pci_dev *pdev;
1950         int ret;
1951
1952         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1953         if (!pdev)
1954                 return;
1955
1956         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1957         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1958
1959         if (ret)
1960                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1961                         "floppy might not work\n");
1962
1963 }
1964 #else
1965 static inline void iommu_prepare_isa(void)
1966 {
1967         return;
1968 }
1969 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1970
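/*
 * init_dmars() brings up DMA remapping: allocate the global iommu and
 * deferred-flush arrays, set up root/context tables and pick the
 * invalidation method (queued vs. register based) for each DRHD unit,
 * install the RMRR, graphics and ISA mappings, then enable fault
 * reporting and translation on every unit.
 */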
1971 static int __init init_dmars(void)
1972 {
1973         struct dmar_drhd_unit *drhd;
1974         struct dmar_rmrr_unit *rmrr;
1975         struct pci_dev *pdev;
1976         struct intel_iommu *iommu;
1977         int i, ret, unit = 0;
1978
1979         /*
1980          * for each drhd
1981          *    allocate root
1982          *    initialize and program root entry to not present
1983          * endfor
1984          */
1985         for_each_drhd_unit(drhd) {
1986                 g_num_of_iommus++;
1987                 /*
1988                  * lock not needed as this is only incremented in the
1989                  * single-threaded kernel __init code path; all other
1990                  * accesses are read only
1991                  */
1992         }
1993
1994         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1995                         GFP_KERNEL);
1996         if (!g_iommus) {
1997                 printk(KERN_ERR "Allocating global iommu array failed\n");
1998                 ret = -ENOMEM;
1999                 goto error;
2000         }
2001
2002         deferred_flush = kzalloc(g_num_of_iommus *
2003                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2004         if (!deferred_flush) {
2005                 kfree(g_iommus);
2006                 ret = -ENOMEM;
2007                 goto error;
2008         }
2009
2010         for_each_drhd_unit(drhd) {
2011                 if (drhd->ignored)
2012                         continue;
2013
2014                 iommu = drhd->iommu;
2015                 g_iommus[iommu->seq_id] = iommu;
2016
2017                 ret = iommu_init_domains(iommu);
2018                 if (ret)
2019                         goto error;
2020
2021                 /*
2022                  * TBD:
2023                  * we could share the same root & context tables
2024                  * among all IOMMUs. Need to split it later.
2025                  */
2026                 ret = iommu_alloc_root_entry(iommu);
2027                 if (ret) {
2028                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2029                         goto error;
2030                 }
2031         }
2032
2033         for_each_drhd_unit(drhd) {
2034                 if (drhd->ignored)
2035                         continue;
2036
2037                 iommu = drhd->iommu;
2038                 if (dmar_enable_qi(iommu)) {
2039                         /*
2040                          * Queued Invalidate not enabled, use Register Based
2041                          * Invalidate
2042                          */
2043                         iommu->flush.flush_context = __iommu_flush_context;
2044                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2045                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2046                                "invalidation\n",
2047                                (unsigned long long)drhd->reg_base_addr);
2048                 } else {
2049                         iommu->flush.flush_context = qi_flush_context;
2050                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2051                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2052                                "invalidation\n",
2053                                (unsigned long long)drhd->reg_base_addr);
2054                 }
2055         }
2056
2057         /*
2058          * For each rmrr
2059          *   for each dev attached to rmrr
2060          *   do
2061          *     locate drhd for dev, alloc domain for dev
2062          *     allocate free domain
2063          *     allocate page table entries for rmrr
2064          *     if context not allocated for bus
2065          *           allocate and init context
2066          *           set present in root table for this bus
2067          *     init context with domain, translation etc
2068          *    endfor
2069          * endfor
2070          */
2071         for_each_rmrr_units(rmrr) {
2072                 for (i = 0; i < rmrr->devices_cnt; i++) {
2073                         pdev = rmrr->devices[i];
2074                         /* some BIOSes list nonexistent devices in the DMAR table */
2075                         if (!pdev)
2076                                 continue;
2077                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2078                         if (ret)
2079                                 printk(KERN_ERR
2080                                  "IOMMU: mapping reserved region failed\n");
2081                 }
2082         }
2083
2084         iommu_prepare_gfx_mapping();
2085
2086         iommu_prepare_isa();
2087
2088         /*
2089          * for each drhd
2090          *   enable fault log
2091          *   global invalidate context cache
2092          *   global invalidate iotlb
2093          *   enable translation
2094          */
2095         for_each_drhd_unit(drhd) {
2096                 if (drhd->ignored)
2097                         continue;
2098                 iommu = drhd->iommu;
2099                 sprintf(iommu->name, "dmar%d", unit++);
2100
2101                 iommu_flush_write_buffer(iommu);
2102
2103                 ret = dmar_set_interrupt(iommu);
2104                 if (ret)
2105                         goto error;
2106
2107                 iommu_set_root_entry(iommu);
2108
2109                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2110                                            0);
2111                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2112                                          0);
2113                 iommu_disable_protect_mem_regions(iommu);
2114
2115                 ret = iommu_enable_translation(iommu);
2116                 if (ret)
2117                         goto error;
2118         }
2119
2120         return 0;
2121 error:
2122         for_each_drhd_unit(drhd) {
2123                 if (drhd->ignored)
2124                         continue;
2125                 iommu = drhd->iommu;
2126                 free_iommu(iommu);
2127         }
2128         kfree(g_iommus);
2129         return ret;
2130 }
2131
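/*
 * aligned_size() returns how many bytes of mapping are needed to cover
 * [host_addr, host_addr + size) with whole pages.  For example, with
 * 4K pages, host_addr = 0x1004 and size = 0x2000 touch three pages, so
 * the result is 0x3000.
 */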
2132 static inline u64 aligned_size(u64 host_addr, size_t size)
2133 {
2134         u64 addr;
2135         addr = (host_addr & (~PAGE_MASK)) + size;
2136         return PAGE_ALIGN(addr);
2137 }
2138
2139 struct iova *
2140 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2141 {
2142         struct iova *piova;
2143
2144         /* Make sure it's in range */
2145         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2146         if (!size || (IOVA_START_ADDR + size > end))
2147                 return NULL;
2148
2149         piova = alloc_iova(&domain->iovad,
2150                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2151         return piova;
2152 }
2153
2154 static struct iova *
2155 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2156                    size_t size, u64 dma_mask)
2157 {
2158         struct pci_dev *pdev = to_pci_dev(dev);
2159         struct iova *iova = NULL;
2160
2161         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2162                 iova = iommu_alloc_iova(domain, size, dma_mask);
2163         else {
2164                 /*
2165                  * First try to allocate an io virtual address in
2166                  * DMA_32BIT_MASK and if that fails then try allocating
2167                  * from higher range
2168                  */
2169                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2170                 if (!iova)
2171                         iova = iommu_alloc_iova(domain, size, dma_mask);
2172         }
2173
2174         if (!iova) {
2175                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2176                 return NULL;
2177         }
2178
2179         return iova;
2180 }
2181
2182 static struct dmar_domain *
2183 get_valid_domain_for_dev(struct pci_dev *pdev)
2184 {
2185         struct dmar_domain *domain;
2186         int ret;
2187
2188         domain = get_domain_for_dev(pdev,
2189                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2190         if (!domain) {
2191                 printk(KERN_ERR
2192                         "Allocating domain for %s failed\n", pci_name(pdev));
2193                 return NULL;
2194         }
2195
2196         /* make sure context mapping is ok */
2197         if (unlikely(!domain_context_mapped(pdev))) {
2198                 ret = domain_context_mapping(domain, pdev);
2199                 if (ret) {
2200                         printk(KERN_ERR
2201                                 "Domain context map for %s failed\n",
2202                                 pci_name(pdev));
2203                         return NULL;
2204                 }
2205         }
2206
2207         return domain;
2208 }
2209
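/*
 * Core of the DMA-API map path: allocate an iova range big enough for
 * the page-aligned request, install page-table entries with read/write
 * permission derived from the DMA direction, flush the IOTLB for the
 * new mapping and return the bus address.  Devices marked with
 * DUMMY_DEVICE_DOMAIN_INFO bypass translation and get the physical
 * address back unchanged.
 */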
2210 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2211                                      size_t size, int dir, u64 dma_mask)
2212 {
2213         struct pci_dev *pdev = to_pci_dev(hwdev);
2214         struct dmar_domain *domain;
2215         phys_addr_t start_paddr;
2216         struct iova *iova;
2217         int prot = 0;
2218         int ret;
2219         struct intel_iommu *iommu;
2220
2221         BUG_ON(dir == DMA_NONE);
2222         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2223                 return paddr;
2224
2225         domain = get_valid_domain_for_dev(pdev);
2226         if (!domain)
2227                 return 0;
2228
2229         iommu = domain_get_iommu(domain);
2230         size = aligned_size((u64)paddr, size);
2231
2232         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2233         if (!iova)
2234                 goto error;
2235
2236         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2237
2238         /*
2239          * Check if DMAR supports zero-length reads on write-only
2240          * mappings.
2241          */
2242         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2243                         !cap_zlr(iommu->cap))
2244                 prot |= DMA_PTE_READ;
2245         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2246                 prot |= DMA_PTE_WRITE;
2247         /*
2248          * paddr to (paddr + size) might span partial pages, so map whole
2249          * pages.  Note: if two parts of one page are mapped separately, two
2250          * guest addresses may map to the same host paddr; this is not a big
2251          * problem.
2252          */
2253         ret = domain_page_mapping(domain, start_paddr,
2254                 ((u64)paddr) & PAGE_MASK, size, prot);
2255         if (ret)
2256                 goto error;
2257
2258         /* it's a non-present to present mapping */
2259         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2260                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2261         if (ret)
2262                 iommu_flush_write_buffer(iommu);
2263
2264         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2265
2266 error:
2267         if (iova)
2268                 __free_iova(&domain->iovad, iova);
2269         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2270                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2271         return 0;
2272 }
2273
2274 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2275                             size_t size, int dir)
2276 {
2277         return __intel_map_single(hwdev, paddr, size, dir,
2278                                   to_pci_dev(hwdev)->dma_mask);
2279 }
2280
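/*
 * Deferred unmap handling: rather than flushing the IOTLB for every
 * unmap, add_unmap() parks the freed iova in the per-iommu
 * deferred_flush table, and flush_unmaps() later releases all pending
 * iovas after a single global IOTLB flush per iommu, triggered either
 * by a 10ms timer or by hitting HIGH_WATER_MARK queued entries.
 */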
2281 static void flush_unmaps(void)
2282 {
2283         int i, j;
2284
2285         timer_on = 0;
2286
2287         /* just flush them all */
2288         for (i = 0; i < g_num_of_iommus; i++) {
2289                 struct intel_iommu *iommu = g_iommus[i];
2290                 if (!iommu)
2291                         continue;
2292
2293                 if (deferred_flush[i].next) {
2294                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2295                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2296                         for (j = 0; j < deferred_flush[i].next; j++) {
2297                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2298                                                 deferred_flush[i].iova[j]);
2299                         }
2300                         deferred_flush[i].next = 0;
2301                 }
2302         }
2303
2304         list_size = 0;
2305 }
2306
2307 static void flush_unmaps_timeout(unsigned long data)
2308 {
2309         unsigned long flags;
2310
2311         spin_lock_irqsave(&async_umap_flush_lock, flags);
2312         flush_unmaps();
2313         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2314 }
2315
2316 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2317 {
2318         unsigned long flags;
2319         int next, iommu_id;
2320         struct intel_iommu *iommu;
2321
2322         spin_lock_irqsave(&async_umap_flush_lock, flags);
2323         if (list_size == HIGH_WATER_MARK)
2324                 flush_unmaps();
2325
2326         iommu = domain_get_iommu(dom);
2327         iommu_id = iommu->seq_id;
2328
2329         next = deferred_flush[iommu_id].next;
2330         deferred_flush[iommu_id].domain[next] = dom;
2331         deferred_flush[iommu_id].iova[next] = iova;
2332         deferred_flush[iommu_id].next++;
2333
2334         if (!timer_on) {
2335                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2336                 timer_on = 1;
2337         }
2338         list_size++;
2339         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2340 }
2341
2342 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2343                         int dir)
2344 {
2345         struct pci_dev *pdev = to_pci_dev(dev);
2346         struct dmar_domain *domain;
2347         unsigned long start_addr;
2348         struct iova *iova;
2349         struct intel_iommu *iommu;
2350
2351         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2352                 return;
2353         domain = find_domain(pdev);
2354         BUG_ON(!domain);
2355
2356         iommu = domain_get_iommu(domain);
2357
2358         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2359         if (!iova)
2360                 return;
2361
2362         start_addr = iova->pfn_lo << PAGE_SHIFT;
2363         size = aligned_size((u64)dev_addr, size);
2364
2365         pr_debug("Device %s unmapping: %lx@%llx\n",
2366                 pci_name(pdev), size, (unsigned long long)start_addr);
2367
2368         /*  clear the whole page */
2369         dma_pte_clear_range(domain, start_addr, start_addr + size);
2370         /* free page tables */
2371         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2372         if (intel_iommu_strict) {
2373                 if (iommu_flush_iotlb_psi(iommu,
2374                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2375                         iommu_flush_write_buffer(iommu);
2376                 /* free iova */
2377                 __free_iova(&domain->iovad, iova);
2378         } else {
2379                 add_unmap(domain, iova);
2380                 /*
2381                  * queue up the release of the unmap to save roughly 1/6th of
2382                  * the cpu time otherwise spent on the iotlb flush operation...
2383                  */
2384         }
2385 }
2386
2387 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2388                            dma_addr_t *dma_handle, gfp_t flags)
2389 {
2390         void *vaddr;
2391         int order;
2392
2393         size = PAGE_ALIGN(size);
2394         order = get_order(size);
2395         flags &= ~(GFP_DMA | GFP_DMA32);
2396
2397         vaddr = (void *)__get_free_pages(flags, order);
2398         if (!vaddr)
2399                 return NULL;
2400         memset(vaddr, 0, size);
2401
2402         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2403                                          DMA_BIDIRECTIONAL,
2404                                          hwdev->coherent_dma_mask);
2405         if (*dma_handle)
2406                 return vaddr;
2407         free_pages((unsigned long)vaddr, order);
2408         return NULL;
2409 }
2410
2411 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2412                          dma_addr_t dma_handle)
2413 {
2414         int order;
2415
2416         size = PAGE_ALIGN(size);
2417         order = get_order(size);
2418
2419         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2420         free_pages((unsigned long)vaddr, order);
2421 }
2422
2423 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2424
2425 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2426                     int nelems, int dir)
2427 {
2428         int i;
2429         struct pci_dev *pdev = to_pci_dev(hwdev);
2430         struct dmar_domain *domain;
2431         unsigned long start_addr;
2432         struct iova *iova;
2433         size_t size = 0;
2434         void *addr;
2435         struct scatterlist *sg;
2436         struct intel_iommu *iommu;
2437
2438         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2439                 return;
2440
2441         domain = find_domain(pdev);
2442         BUG_ON(!domain);
2443
2444         iommu = domain_get_iommu(domain);
2445
2446         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2447         if (!iova)
2448                 return;
2449         for_each_sg(sglist, sg, nelems, i) {
2450                 addr = SG_ENT_VIRT_ADDRESS(sg);
2451                 size += aligned_size((u64)addr, sg->length);
2452         }
2453
2454         start_addr = iova->pfn_lo << PAGE_SHIFT;
2455
2456         /*  clear the whole page */
2457         dma_pte_clear_range(domain, start_addr, start_addr + size);
2458         /* free page tables */
2459         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2460
2461         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2462                         size >> VTD_PAGE_SHIFT, 0))
2463                 iommu_flush_write_buffer(iommu);
2464
2465         /* free iova */
2466         __free_iova(&domain->iovad, iova);
2467 }
2468
2469 static int intel_nontranslate_map_sg(struct device *hddev,
2470         struct scatterlist *sglist, int nelems, int dir)
2471 {
2472         int i;
2473         struct scatterlist *sg;
2474
2475         for_each_sg(sglist, sg, nelems, i) {
2476                 BUG_ON(!sg_page(sg));
2477                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2478                 sg->dma_length = sg->length;
2479         }
2480         return nelems;
2481 }
2482
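/*
 * Map a scatterlist through the IOMMU: compute the total page-aligned
 * length of all segments, allocate one contiguous iova range of that
 * size, then map each segment at its running offset within the range.
 * On failure the partial mapping is torn down and 0 is returned.
 */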
2483 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2484                  int dir)
2485 {
2486         void *addr;
2487         int i;
2488         struct pci_dev *pdev = to_pci_dev(hwdev);
2489         struct dmar_domain *domain;
2490         size_t size = 0;
2491         int prot = 0;
2492         size_t offset = 0;
2493         struct iova *iova = NULL;
2494         int ret;
2495         struct scatterlist *sg;
2496         unsigned long start_addr;
2497         struct intel_iommu *iommu;
2498
2499         BUG_ON(dir == DMA_NONE);
2500         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2501                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2502
2503         domain = get_valid_domain_for_dev(pdev);
2504         if (!domain)
2505                 return 0;
2506
2507         iommu = domain_get_iommu(domain);
2508
2509         for_each_sg(sglist, sg, nelems, i) {
2510                 addr = SG_ENT_VIRT_ADDRESS(sg);
2511                 addr = (void *)virt_to_phys(addr);
2512                 size += aligned_size((u64)addr, sg->length);
2513         }
2514
2515         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2516         if (!iova) {
2517                 sglist->dma_length = 0;
2518                 return 0;
2519         }
2520
2521         /*
2522          * Check if DMAR supports zero-length reads on write-only
2523          * mappings.
2524          */
2525         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2526                         !cap_zlr(iommu->cap))
2527                 prot |= DMA_PTE_READ;
2528         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2529                 prot |= DMA_PTE_WRITE;
2530
2531         start_addr = iova->pfn_lo << PAGE_SHIFT;
2532         offset = 0;
2533         for_each_sg(sglist, sg, nelems, i) {
2534                 addr = SG_ENT_VIRT_ADDRESS(sg);
2535                 addr = (void *)virt_to_phys(addr);
2536                 size = aligned_size((u64)addr, sg->length);
2537                 ret = domain_page_mapping(domain, start_addr + offset,
2538                         ((u64)addr) & PAGE_MASK,
2539                         size, prot);
2540                 if (ret) {
2541                         /*  clear the page */
2542                         dma_pte_clear_range(domain, start_addr,
2543                                   start_addr + offset);
2544                         /* free page tables */
2545                         dma_pte_free_pagetable(domain, start_addr,
2546                                   start_addr + offset);
2547                         /* free iova */
2548                         __free_iova(&domain->iovad, iova);
2549                         return 0;
2550                 }
2551                 sg->dma_address = start_addr + offset +
2552                                 ((u64)addr & (~PAGE_MASK));
2553                 sg->dma_length = sg->length;
2554                 offset += size;
2555         }
2556
2557         /* it's a non-present to present mapping */
2558         if (iommu_flush_iotlb_psi(iommu, domain->id,
2559                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2560                 iommu_flush_write_buffer(iommu);
2561         return nelems;
2562 }
2563
2564 static struct dma_mapping_ops intel_dma_ops = {
2565         .alloc_coherent = intel_alloc_coherent,
2566         .free_coherent = intel_free_coherent,
2567         .map_single = intel_map_single,
2568         .unmap_single = intel_unmap_single,
2569         .map_sg = intel_map_sg,
2570         .unmap_sg = intel_unmap_sg,
2571 };
2572
2573 static inline int iommu_domain_cache_init(void)
2574 {
2575         int ret = 0;
2576
2577         iommu_domain_cache = kmem_cache_create("iommu_domain",
2578                                          sizeof(struct dmar_domain),
2579                                          0,
2580                                          SLAB_HWCACHE_ALIGN,
2582                                          NULL);
2583         if (!iommu_domain_cache) {
2584                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2585                 ret = -ENOMEM;
2586         }
2587
2588         return ret;
2589 }
2590
2591 static inline int iommu_devinfo_cache_init(void)
2592 {
2593         int ret = 0;
2594
2595         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2596                                          sizeof(struct device_domain_info),
2597                                          0,
2598                                          SLAB_HWCACHE_ALIGN,
2599                                          NULL);
2600         if (!iommu_devinfo_cache) {
2601                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2602                 ret = -ENOMEM;
2603         }
2604
2605         return ret;
2606 }
2607
2608 static inline int iommu_iova_cache_init(void)
2609 {
2610         int ret = 0;
2611
2612         iommu_iova_cache = kmem_cache_create("iommu_iova",
2613                                          sizeof(struct iova),
2614                                          0,
2615                                          SLAB_HWCACHE_ALIGN,
2616                                          NULL);
2617         if (!iommu_iova_cache) {
2618                 printk(KERN_ERR "Couldn't create iova cache\n");
2619                 ret = -ENOMEM;
2620         }
2621
2622         return ret;
2623 }
2624
2625 static int __init iommu_init_mempool(void)
2626 {
2627         int ret;
2628         ret = iommu_iova_cache_init();
2629         if (ret)
2630                 return ret;
2631
2632         ret = iommu_domain_cache_init();
2633         if (ret)
2634                 goto domain_error;
2635
2636         ret = iommu_devinfo_cache_init();
2637         if (!ret)
2638                 return ret;
2639
2640         kmem_cache_destroy(iommu_domain_cache);
2641 domain_error:
2642         kmem_cache_destroy(iommu_iova_cache);
2643
2644         return -ENOMEM;
2645 }
2646
2647 static void __init iommu_exit_mempool(void)
2648 {
2649         kmem_cache_destroy(iommu_devinfo_cache);
2650         kmem_cache_destroy(iommu_domain_cache);
2651         kmem_cache_destroy(iommu_iova_cache);
2652
2653 }
2654
2655 static void __init init_no_remapping_devices(void)
2656 {
2657         struct dmar_drhd_unit *drhd;
2658
2659         for_each_drhd_unit(drhd) {
2660                 if (!drhd->include_all) {
2661                         int i;
2662                         for (i = 0; i < drhd->devices_cnt; i++)
2663                                 if (drhd->devices[i] != NULL)
2664                                         break;
2665                         /* ignore DMAR unit if no pci devices exist */
2666                         if (i == drhd->devices_cnt)
2667                                 drhd->ignored = 1;
2668                 }
2669         }
2670
2671         if (dmar_map_gfx)
2672                 return;
2673
2674         for_each_drhd_unit(drhd) {
2675                 int i;
2676                 if (drhd->ignored || drhd->include_all)
2677                         continue;
2678
2679                 for (i = 0; i < drhd->devices_cnt; i++)
2680                         if (drhd->devices[i] &&
2681                                 !IS_GFX_DEVICE(drhd->devices[i]))
2682                                 break;
2683
2684                 if (i < drhd->devices_cnt)
2685                         continue;
2686
2687                 /* bypass IOMMU if it is just for gfx devices */
2688                 drhd->ignored = 1;
2689                 for (i = 0; i < drhd->devices_cnt; i++) {
2690                         if (!drhd->devices[i])
2691                                 continue;
2692                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2693                 }
2694         }
2695 }
2696
2697 int __init intel_iommu_init(void)
2698 {
2699         int ret = 0;
2700
2701         if (dmar_table_init())
2702                 return  -ENODEV;
2703
2704         if (dmar_dev_scope_init())
2705                 return  -ENODEV;
2706
2707         /*
2708          * Check the need for DMA-remapping initialization now.
2709          * The initialization above will also be used by interrupt remapping.
2710          */
2711         if (no_iommu || swiotlb || dmar_disabled)
2712                 return -ENODEV;
2713
2714         iommu_init_mempool();
2715         dmar_init_reserved_ranges();
2716
2717         init_no_remapping_devices();
2718
2719         ret = init_dmars();
2720         if (ret) {
2721                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2722                 put_iova_domain(&reserved_iova_list);
2723                 iommu_exit_mempool();
2724                 return ret;
2725         }
2726         printk(KERN_INFO
2727         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2728
2729         init_timer(&unmap_timer);
2730         force_iommu = 1;
2731         dma_ops = &intel_dma_ops;
2732         return 0;
2733 }
2734
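/*
 * The vm_domain_* helpers below back the generic IOMMU API (used, for
 * instance, by KVM device assignment).  Unlike the DMA-API domains
 * above, a virtual machine domain gets a software-only domain id
 * (vm_domid) and may span several IOMMUs, so device info and per-iommu
 * state are tracked explicitly.
 */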
2735 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2736                                   struct pci_dev *pdev)
2737 {
2738         struct device_domain_info *info;
2739         unsigned long flags;
2740
2741         info = alloc_devinfo_mem();
2742         if (!info)
2743                 return -ENOMEM;
2744
2745         info->bus = pdev->bus->number;
2746         info->devfn = pdev->devfn;
2747         info->dev = pdev;
2748         info->domain = domain;
2749
2750         spin_lock_irqsave(&device_domain_lock, flags);
2751         list_add(&info->link, &domain->devices);
2752         list_add(&info->global, &device_domain_list);
2753         pdev->dev.archdata.iommu = info;
2754         spin_unlock_irqrestore(&device_domain_lock, flags);
2755
2756         return 0;
2757 }
2758
2759 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2760                                           struct pci_dev *pdev)
2761 {
2762         struct device_domain_info *info;
2763         struct intel_iommu *iommu;
2764         unsigned long flags;
2765         int found = 0;
2766         struct list_head *entry, *tmp;
2767
2768         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2769         if (!iommu)
2770                 return;
2771
2772         spin_lock_irqsave(&device_domain_lock, flags);
2773         list_for_each_safe(entry, tmp, &domain->devices) {
2774                 info = list_entry(entry, struct device_domain_info, link);
2775                 if (info->bus == pdev->bus->number &&
2776                     info->devfn == pdev->devfn) {
2777                         list_del(&info->link);
2778                         list_del(&info->global);
2779                         if (info->dev)
2780                                 info->dev->dev.archdata.iommu = NULL;
2781                         spin_unlock_irqrestore(&device_domain_lock, flags);
2782
2783                         iommu_detach_dev(iommu, info->bus, info->devfn);
2784                         free_devinfo_mem(info);
2785
2786                         spin_lock_irqsave(&device_domain_lock, flags);
2787
2788                         if (found)
2789                                 break;
2790                         else
2791                                 continue;
2792                 }
2793
2794                 /* if there are no other devices under the same iommu
2795                  * owned by this domain, clear this iommu in iommu_bmp and
2796                  * update the iommu count and coherency
2797                  */
2798                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2799                         found = 1;
2800         }
2801
2802         if (found == 0) {
2803                 unsigned long tmp_flags;
2804                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2805                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2806                 domain->iommu_count--;
2807                 domain_update_iommu_coherency(domain);
2808                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2809         }
2810
2811         spin_unlock_irqrestore(&device_domain_lock, flags);
2812 }
2813
2814 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2815 {
2816         struct device_domain_info *info;
2817         struct intel_iommu *iommu;
2818         unsigned long flags1, flags2;
2819
2820         spin_lock_irqsave(&device_domain_lock, flags1);
2821         while (!list_empty(&domain->devices)) {
2822                 info = list_entry(domain->devices.next,
2823                         struct device_domain_info, link);
2824                 list_del(&info->link);
2825                 list_del(&info->global);
2826                 if (info->dev)
2827                         info->dev->dev.archdata.iommu = NULL;
2828
2829                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2830
2831                 iommu = device_to_iommu(info->bus, info->devfn);
2832                 iommu_detach_dev(iommu, info->bus, info->devfn);
2833
2834                 /* clear this iommu in iommu_bmp, update iommu count
2835                  * and coherency
2836                  */
2837                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2838                 if (test_and_clear_bit(iommu->seq_id,
2839                                        &domain->iommu_bmp)) {
2840                         domain->iommu_count--;
2841                         domain_update_iommu_coherency(domain);
2842                 }
2843                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2844
2845                 free_devinfo_mem(info);
2846                 spin_lock_irqsave(&device_domain_lock, flags1);
2847         }
2848         spin_unlock_irqrestore(&device_domain_lock, flags1);
2849 }
2850
2851 /* domain id for virtual machine; it won't be set in the context entry */
2852 static unsigned long vm_domid;
2853
2854 static int vm_domain_min_agaw(struct dmar_domain *domain)
2855 {
2856         int i;
2857         int min_agaw = domain->agaw;
2858
2859         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2860         for (; i < g_num_of_iommus; ) {
2861                 if (min_agaw > g_iommus[i]->agaw)
2862                         min_agaw = g_iommus[i]->agaw;
2863
2864                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2865         }
2866
2867         return min_agaw;
2868 }
2869
2870 static struct dmar_domain *iommu_alloc_vm_domain(void)
2871 {
2872         struct dmar_domain *domain;
2873
2874         domain = alloc_domain_mem();
2875         if (!domain)
2876                 return NULL;
2877
2878         domain->id = vm_domid++;
2879         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2880         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2881
2882         return domain;
2883 }
2884
2885 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2886 {
2887         int adjust_width;
2888
2889         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2890         spin_lock_init(&domain->mapping_lock);
2891         spin_lock_init(&domain->iommu_lock);
2892
2893         domain_reserve_special_ranges(domain);
2894
2895         /* calculate AGAW */
2896         domain->gaw = guest_width;
2897         adjust_width = guestwidth_to_adjustwidth(guest_width);
2898         domain->agaw = width_to_agaw(adjust_width);
2899
2900         INIT_LIST_HEAD(&domain->devices);
2901
2902         domain->iommu_count = 0;
2903         domain->iommu_coherency = 0;
2904         domain->max_addr = 0;
2905
2906         /* always allocate the top pgd */
2907         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2908         if (!domain->pgd)
2909                 return -ENOMEM;
2910         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2911         return 0;
2912 }
2913
2914 static void iommu_free_vm_domain(struct dmar_domain *domain)
2915 {
2916         unsigned long flags;
2917         struct dmar_drhd_unit *drhd;
2918         struct intel_iommu *iommu;
2919         unsigned long i;
2920         unsigned long ndomains;
2921
2922         for_each_drhd_unit(drhd) {
2923                 if (drhd->ignored)
2924                         continue;
2925                 iommu = drhd->iommu;
2926
2927                 ndomains = cap_ndoms(iommu->cap);
2928                 i = find_first_bit(iommu->domain_ids, ndomains);
2929                 for (; i < ndomains; ) {
2930                         if (iommu->domains[i] == domain) {
2931                                 spin_lock_irqsave(&iommu->lock, flags);
2932                                 clear_bit(i, iommu->domain_ids);
2933                                 iommu->domains[i] = NULL;
2934                                 spin_unlock_irqrestore(&iommu->lock, flags);
2935                                 break;
2936                         }
2937                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2938                 }
2939         }
2940 }
2941
2942 static void vm_domain_exit(struct dmar_domain *domain)
2943 {
2944         u64 end;
2945
2946         /* Domain 0 is reserved, so don't process it */
2947         if (!domain)
2948                 return;
2949
2950         vm_domain_remove_all_dev_info(domain);
2951         /* destroy iovas */
2952         put_iova_domain(&domain->iovad);
2953         end = DOMAIN_MAX_ADDR(domain->gaw);
2954         end = end & (~VTD_PAGE_MASK);
2955
2956         /* clear ptes */
2957         dma_pte_clear_range(domain, 0, end);
2958
2959         /* free page tables */
2960         dma_pte_free_pagetable(domain, 0, end);
2961
2962         iommu_free_vm_domain(domain);
2963         free_domain_mem(domain);
2964 }
2965
2966 static int intel_iommu_domain_init(struct iommu_domain *domain)
2967 {
2968         struct dmar_domain *dmar_domain;
2969
2970         dmar_domain = iommu_alloc_vm_domain();
2971         if (!dmar_domain) {
2972                 printk(KERN_ERR
2973                         "intel_iommu_domain_init: dmar_domain == NULL\n");
2974                 return -ENOMEM;
2975         }
2976         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2977                 printk(KERN_ERR
2978                         "intel_iommu_domain_init() failed\n");
2979                 vm_domain_exit(dmar_domain);
2980                 return -ENOMEM;
2981         }
2982         domain->priv = dmar_domain;
2983
2984         return 0;
2985 }
2986
2987 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
2988 {
2989         struct dmar_domain *dmar_domain = domain->priv;
2990
2991         domain->priv = NULL;
2992         vm_domain_exit(dmar_domain);
2993 }
2994
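/*
 * IOMMU API attach path: if the device is still context-mapped for a
 * previous (DMA-API or VM) domain, that mapping is torn down first.
 * The target IOMMU's agaw is then checked against the domain's highest
 * mapped address before the context entry and device info are set up
 * for the new domain.
 */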
2995 static int intel_iommu_attach_device(struct iommu_domain *domain,
2996                                      struct device *dev)
2997 {
2998         struct dmar_domain *dmar_domain = domain->priv;
2999         struct pci_dev *pdev = to_pci_dev(dev);
3000         struct intel_iommu *iommu;
3001         int addr_width;
3002         u64 end;
3003         int ret;
3004
3005         /* normally pdev is not mapped */
3006         if (unlikely(domain_context_mapped(pdev))) {
3007                 struct dmar_domain *old_domain;
3008
3009                 old_domain = find_domain(pdev);
3010                 if (old_domain) {
3011                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3012                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3013                         else
3014                                 domain_remove_dev_info(old_domain);
3015                 }
3016         }
3017
3018         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3019         if (!iommu)
3020                 return -ENODEV;
3021
3022         /* check if this iommu agaw is sufficient for max mapped address */
3023         addr_width = agaw_to_width(iommu->agaw);
3024         end = DOMAIN_MAX_ADDR(addr_width);
3025         end = end & VTD_PAGE_MASK;
3026         if (end < dmar_domain->max_addr) {
3027                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3028                        "sufficient for the mapped address (%llx)\n",
3029                        __func__, iommu->agaw, dmar_domain->max_addr);
3030                 return -EFAULT;
3031         }
3032
3033         ret = domain_context_mapping(dmar_domain, pdev);
3034         if (ret)
3035                 return ret;
3036
3037         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3038         return ret;
3039 }
3040
3041 static void intel_iommu_detach_device(struct iommu_domain *domain,
3042                                       struct device *dev)
3043 {
3044         struct dmar_domain *dmar_domain = domain->priv;
3045         struct pci_dev *pdev = to_pci_dev(dev);
3046
3047         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3048 }
3049
3050 int intel_iommu_map_address(struct dmar_domain *domain, dma_addr_t iova,
3051                             u64 hpa, size_t size, int prot)
3052 {
3053         u64 max_addr;
3054         int addr_width;
3055         int ret;
3056
3057         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3058         if (domain->max_addr < max_addr) {
3059                 int min_agaw;
3060                 u64 end;
3061
3062                 /* check if minimum agaw is sufficient for mapped address */
3063                 min_agaw = vm_domain_min_agaw(domain);
3064                 addr_width = agaw_to_width(min_agaw);
3065                 end = DOMAIN_MAX_ADDR(addr_width);
3066                 end = end & VTD_PAGE_MASK;
3067                 if (end < max_addr) {
3068                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3069                                "sufficient for the mapped address (%llx)\n",
3070                                __func__, min_agaw, max_addr);
3071                         return -EFAULT;
3072                 }
3073                 domain->max_addr = max_addr;
3074         }
3075
3076         ret = domain_page_mapping(domain, iova, hpa, size, prot);
3077         return ret;
3078 }
3079 EXPORT_SYMBOL_GPL(intel_iommu_map_address);
3080
3081 void intel_iommu_unmap_address(struct dmar_domain *domain,
3082                                dma_addr_t iova, size_t size)
3083 {
3084         dma_addr_t base;
3085
3086         /* The address might not be aligned */
3087         base = iova & VTD_PAGE_MASK;
3088         size = VTD_PAGE_ALIGN(size);
3089         dma_pte_clear_range(domain, base, base + size);
3090
3091         if (domain->max_addr == base + size)
3092                 domain->max_addr = base;
3093 }
3094 EXPORT_SYMBOL_GPL(intel_iommu_unmap_address);
3095
3096 int intel_iommu_found(void)
3097 {
3098         return g_num_of_iommus;
3099 }
3100 EXPORT_SYMBOL_GPL(intel_iommu_found);
3101
3102 u64 intel_iommu_iova_to_phys(struct dmar_domain *domain, u64 iova)
3103 {
3104         struct dma_pte *pte;
3105         u64 phys = 0;
3106
3107         pte = addr_to_dma_pte(domain, iova);
3108         if (pte)
3109                 phys = dma_pte_addr(pte);
3110
3111         return phys;
3112 }
3113 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_phys);
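/*
 * Illustrative sketch only (not part of this driver): a caller of the
 * exported helpers above, already holding a struct dmar_domain *dom,
 * could establish and query a mapping roughly like this:
 *
 *	if (intel_iommu_map_address(dom, iova, hpa, VTD_PAGE_SIZE,
 *				    DMA_PTE_READ | DMA_PTE_WRITE) == 0) {
 *		u64 phys = intel_iommu_iova_to_phys(dom, iova);
 *		...
 *		intel_iommu_unmap_address(dom, iova, VTD_PAGE_SIZE);
 *	}
 */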