x86, ia64: convert to use generic dma_map_ops struct
drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
60
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 /*
65  * 0: Present
66  * 1-11: Reserved
67  * 12-63: Context Ptr (12 - (haw-1))
68  * 64-127: Reserved
69  */
70 struct root_entry {
71         u64     val;
72         u64     rsvd1;
73 };
74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
75 static inline bool root_present(struct root_entry *root)
76 {
77         return (root->val & 1);
78 }
79 static inline void set_root_present(struct root_entry *root)
80 {
81         root->val |= 1;
82 }
83 static inline void set_root_value(struct root_entry *root, unsigned long value)
84 {
85         root->val |= value & VTD_PAGE_MASK;
86 }
87
88 static inline struct context_entry *
89 get_context_addr_from_root(struct root_entry *root)
90 {
91         return (struct context_entry *)
92                 (root_present(root)?phys_to_virt(
93                 root->val & VTD_PAGE_MASK) :
94                 NULL);
95 }
96
97 /*
98  * low 64 bits:
99  * 0: present
100  * 1: fault processing disable
101  * 2-3: translation type
102  * 12-63: address space root
103  * high 64 bits:
104  * 0-2: address width
105  * 3-6: avail
106  * 8-23: domain id
107  */
108 struct context_entry {
109         u64 lo;
110         u64 hi;
111 };
112
113 static inline bool context_present(struct context_entry *context)
114 {
115         return (context->lo & 1);
116 }
117 static inline void context_set_present(struct context_entry *context)
118 {
119         context->lo |= 1;
120 }
121
122 static inline void context_set_fault_enable(struct context_entry *context)
123 {
124         context->lo &= (((u64)-1) << 2) | 1;
125 }
126
127 #define CONTEXT_TT_MULTI_LEVEL 0
128
129 static inline void context_set_translation_type(struct context_entry *context,
130                                                 unsigned long value)
131 {
132         context->lo &= (((u64)-1) << 4) | 3;
133         context->lo |= (value & 3) << 2;
134 }
135
136 static inline void context_set_address_root(struct context_entry *context,
137                                             unsigned long value)
138 {
139         context->lo |= value & VTD_PAGE_MASK;
140 }
141
142 static inline void context_set_address_width(struct context_entry *context,
143                                              unsigned long value)
144 {
145         context->hi |= value & 7;
146 }
147
148 static inline void context_set_domain_id(struct context_entry *context,
149                                          unsigned long value)
150 {
151         context->hi |= (value & ((1 << 16) - 1)) << 8;
152 }
153
154 static inline void context_clear_entry(struct context_entry *context)
155 {
156         context->lo = 0;
157         context->hi = 0;
158 }
159
160 /*
161  * 0: readable
162  * 1: writable
163  * 2-6: reserved
164  * 7: super page
165  * 8-11: available
166  * 12-63: Host physical address
167  */
168 struct dma_pte {
169         u64 val;
170 };
171
172 static inline void dma_clear_pte(struct dma_pte *pte)
173 {
174         pte->val = 0;
175 }
176
177 static inline void dma_set_pte_readable(struct dma_pte *pte)
178 {
179         pte->val |= DMA_PTE_READ;
180 }
181
182 static inline void dma_set_pte_writable(struct dma_pte *pte)
183 {
184         pte->val |= DMA_PTE_WRITE;
185 }
186
187 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
188 {
189         pte->val = (pte->val & ~3) | (prot & 3);
190 }
191
192 static inline u64 dma_pte_addr(struct dma_pte *pte)
193 {
194         return (pte->val & VTD_PAGE_MASK);
195 }
196
197 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
198 {
199         pte->val |= (addr & VTD_PAGE_MASK);
200 }
201
202 static inline bool dma_pte_present(struct dma_pte *pte)
203 {
204         return (pte->val & 3) != 0;
205 }
206
207 /* devices under the same p2p bridge are owned by one domain */
208 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
209
210 /* domain represents a virtual machine; devices under more than one
211  * iommu may be owned by a single domain, e.g. a kvm guest.
212  */
213 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
214
215 struct dmar_domain {
216         int     id;                     /* domain id */
217         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
218
219         struct list_head devices;       /* all devices' list */
220         struct iova_domain iovad;       /* iova's that belong to this domain */
221
222         struct dma_pte  *pgd;           /* virtual address */
223         spinlock_t      mapping_lock;   /* page table lock */
224         int             gaw;            /* max guest address width */
225
226         /* adjusted guest address width; 0 means a 2-level, 30-bit page table */
227         int             agaw;
228
229         int             flags;          /* flags to find out type of domain */
230
231         int             iommu_coherency;/* indicate coherency of iommu access */
232         int             iommu_count;    /* reference count of iommu */
233         spinlock_t      iommu_lock;     /* protect iommu set in domain */
234         u64             max_addr;       /* maximum mapped address */
235 };
236
237 /* PCI domain-device relationship */
238 struct device_domain_info {
239         struct list_head link;  /* link to domain siblings */
240         struct list_head global; /* link to global list */
241         u8 bus;                 /* PCI bus number */
242         u8 devfn;               /* PCI devfn number */
243         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
244         struct dmar_domain *domain; /* pointer to domain */
245 };
246
247 static void flush_unmaps_timeout(unsigned long data);
248
249 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
250
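/*
 * Bookkeeping for deferred IOTLB flushes: unmapped iovas are queued in these
 * tables and flushed in batches from unmap_timer, rather than flushing the
 * IOTLB on every unmap (intel_iommu=strict disables this batching).
 */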
251 #define HIGH_WATER_MARK 250
252 struct deferred_flush_tables {
253         int next;
254         struct iova *iova[HIGH_WATER_MARK];
255         struct dmar_domain *domain[HIGH_WATER_MARK];
256 };
257
258 static struct deferred_flush_tables *deferred_flush;
259
260 /* number of registered iommus; sizes g_iommus[] and the per-domain iommu bitmaps */
261 static int g_num_of_iommus;
262
263 static DEFINE_SPINLOCK(async_umap_flush_lock);
264 static LIST_HEAD(unmaps_to_do);
265
266 static int timer_on;
267 static long list_size;
268
269 static void domain_remove_dev_info(struct dmar_domain *domain);
270
271 int dmar_disabled;
272 static int __initdata dmar_map_gfx = 1;
273 static int dmar_forcedac;
274 static int intel_iommu_strict;
275
276 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
277 static DEFINE_SPINLOCK(device_domain_lock);
278 static LIST_HEAD(device_domain_list);
279
280 static struct iommu_ops intel_iommu_ops;
281
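/*
 * Parse the intel_iommu= boot option: a comma-separated list of the keywords
 * handled below, e.g. "intel_iommu=off" or "intel_iommu=igfx_off,strict".
 */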
282 static int __init intel_iommu_setup(char *str)
283 {
284         if (!str)
285                 return -EINVAL;
286         while (*str) {
287                 if (!strncmp(str, "off", 3)) {
288                         dmar_disabled = 1;
289                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
290                 } else if (!strncmp(str, "igfx_off", 8)) {
291                         dmar_map_gfx = 0;
292                         printk(KERN_INFO
293                                 "Intel-IOMMU: disable GFX device mapping\n");
294                 } else if (!strncmp(str, "forcedac", 8)) {
295                         printk(KERN_INFO
296                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
297                         dmar_forcedac = 1;
298                 } else if (!strncmp(str, "strict", 6)) {
299                         printk(KERN_INFO
300                                 "Intel-IOMMU: disable batched IOTLB flush\n");
301                         intel_iommu_strict = 1;
302                 }
303
304                 str += strcspn(str, ",");
305                 while (*str == ',')
306                         str++;
307         }
308         return 0;
309 }
310 __setup("intel_iommu=", intel_iommu_setup);
311
312 static struct kmem_cache *iommu_domain_cache;
313 static struct kmem_cache *iommu_devinfo_cache;
314 static struct kmem_cache *iommu_iova_cache;
315
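/*
 * Allocate from a kmem cache with PF_MEMALLOC temporarily set on the current
 * task, so the GFP_ATOMIC allocation may dip into emergency reserves; the
 * caller's original flag state is restored before returning.
 */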
316 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
317 {
318         unsigned int flags;
319         void *vaddr;
320
321         /* trying to avoid low memory issues */
322         flags = current->flags & PF_MEMALLOC;
323         current->flags |= PF_MEMALLOC;
324         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
325         current->flags &= (~PF_MEMALLOC | flags);
326         return vaddr;
327 }
328
329
330 static inline void *alloc_pgtable_page(void)
331 {
332         unsigned int flags;
333         void *vaddr;
334
335         /* trying to avoid low memory issues */
336         flags = current->flags & PF_MEMALLOC;
337         current->flags |= PF_MEMALLOC;
338         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
339         current->flags &= (~PF_MEMALLOC | flags);
340         return vaddr;
341 }
342
343 static inline void free_pgtable_page(void *vaddr)
344 {
345         free_page((unsigned long)vaddr);
346 }
347
348 static inline void *alloc_domain_mem(void)
349 {
350         return iommu_kmem_cache_alloc(iommu_domain_cache);
351 }
352
353 static void free_domain_mem(void *vaddr)
354 {
355         kmem_cache_free(iommu_domain_cache, vaddr);
356 }
357
358 static inline void * alloc_devinfo_mem(void)
359 {
360         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
361 }
362
363 static inline void free_devinfo_mem(void *vaddr)
364 {
365         kmem_cache_free(iommu_devinfo_cache, vaddr);
366 }
367
368 struct iova *alloc_iova_mem(void)
369 {
370         return iommu_kmem_cache_alloc(iommu_iova_cache);
371 }
372
373 void free_iova_mem(struct iova *iova)
374 {
375         kmem_cache_free(iommu_iova_cache, iova);
376 }
377
378
379 static inline int width_to_agaw(int width);
380
381 /* calculate agaw for each iommu.
382  * "SAGAW" may be different across iommus; use a default agaw, and
383  * fall back to a smaller supported agaw for iommus that don't support the default.
384  */
385 int iommu_calculate_agaw(struct intel_iommu *iommu)
386 {
387         unsigned long sagaw;
388         int agaw = -1;
389
390         sagaw = cap_sagaw(iommu->cap);
391         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
392              agaw >= 0; agaw--) {
393                 if (test_bit(agaw, &sagaw))
394                         break;
395         }
396
397         return agaw;
398 }
399
400 /* in the native (non-VM) case, each domain is associated with only one iommu */
401 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
402 {
403         int iommu_id;
404
405         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
406
407         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
408         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
409                 return NULL;
410
411         return g_iommus[iommu_id];
412 }
413
414 /* "Coherency" capability may be different across iommus */
415 static void domain_update_iommu_coherency(struct dmar_domain *domain)
416 {
417         int i;
418
419         domain->iommu_coherency = 1;
420
421         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
422         for (; i < g_num_of_iommus; ) {
423                 if (!ecap_coherent(g_iommus[i]->ecap)) {
424                         domain->iommu_coherency = 0;
425                         break;
426                 }
427                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
428         }
429 }
430
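/*
 * Walk the DRHD units and return the iommu whose device scope contains the
 * given bus/devfn; an include-all unit matches any device not listed elsewhere.
 */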
431 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
432 {
433         struct dmar_drhd_unit *drhd = NULL;
434         int i;
435
436         for_each_drhd_unit(drhd) {
437                 if (drhd->ignored)
438                         continue;
439
440                 for (i = 0; i < drhd->devices_cnt; i++)
441                         if (drhd->devices[i]->bus->number == bus &&
442                             drhd->devices[i]->devfn == devfn)
443                                 return drhd->iommu;
444
445                 if (drhd->include_all)
446                         return drhd->iommu;
447         }
448
449         return NULL;
450 }
451
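/* clflush iommu-visible structures only when the iommu is not cache coherent */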
452 static void domain_flush_cache(struct dmar_domain *domain,
453                                void *addr, int size)
454 {
455         if (!domain->iommu_coherency)
456                 clflush_cache_range(addr, size);
457 }
458
459 /* Gets context entry for a given bus and devfn */
460 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
461                 u8 bus, u8 devfn)
462 {
463         struct root_entry *root;
464         struct context_entry *context;
465         unsigned long phy_addr;
466         unsigned long flags;
467
468         spin_lock_irqsave(&iommu->lock, flags);
469         root = &iommu->root_entry[bus];
470         context = get_context_addr_from_root(root);
471         if (!context) {
472                 context = (struct context_entry *)alloc_pgtable_page();
473                 if (!context) {
474                         spin_unlock_irqrestore(&iommu->lock, flags);
475                         return NULL;
476                 }
477                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
478                 phy_addr = virt_to_phys((void *)context);
479                 set_root_value(root, phy_addr);
480                 set_root_present(root);
481                 __iommu_flush_cache(iommu, root, sizeof(*root));
482         }
483         spin_unlock_irqrestore(&iommu->lock, flags);
484         return &context[devfn];
485 }
486
487 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
488 {
489         struct root_entry *root;
490         struct context_entry *context;
491         int ret;
492         unsigned long flags;
493
494         spin_lock_irqsave(&iommu->lock, flags);
495         root = &iommu->root_entry[bus];
496         context = get_context_addr_from_root(root);
497         if (!context) {
498                 ret = 0;
499                 goto out;
500         }
501         ret = context_present(&context[devfn]);
502 out:
503         spin_unlock_irqrestore(&iommu->lock, flags);
504         return ret;
505 }
506
507 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
508 {
509         struct root_entry *root;
510         struct context_entry *context;
511         unsigned long flags;
512
513         spin_lock_irqsave(&iommu->lock, flags);
514         root = &iommu->root_entry[bus];
515         context = get_context_addr_from_root(root);
516         if (context) {
517                 context_clear_entry(&context[devfn]);
518                 __iommu_flush_cache(iommu, &context[devfn], \
519                         sizeof(*context));
520         }
521         spin_unlock_irqrestore(&iommu->lock, flags);
522 }
523
524 static void free_context_table(struct intel_iommu *iommu)
525 {
526         struct root_entry *root;
527         int i;
528         unsigned long flags;
529         struct context_entry *context;
530
531         spin_lock_irqsave(&iommu->lock, flags);
532         if (!iommu->root_entry) {
533                 goto out;
534         }
535         for (i = 0; i < ROOT_ENTRY_NR; i++) {
536                 root = &iommu->root_entry[i];
537                 context = get_context_addr_from_root(root);
538                 if (context)
539                         free_pgtable_page(context);
540         }
541         free_pgtable_page(iommu->root_entry);
542         iommu->root_entry = NULL;
543 out:
544         spin_unlock_irqrestore(&iommu->lock, flags);
545 }
546
547 /* page table handling */
548 #define LEVEL_STRIDE            (9)
549 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
550
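/*
 * Each page-table level resolves LEVEL_STRIDE (9) address bits above the
 * 12-bit page offset, so agaw 0 is a 2-level/30-bit table, agaw 1 is
 * 3-level/39-bit, agaw 2 is 4-level/48-bit, and so on.
 */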
551 static inline int agaw_to_level(int agaw)
552 {
553         return agaw + 2;
554 }
555
556 static inline int agaw_to_width(int agaw)
557 {
558         return 30 + agaw * LEVEL_STRIDE;
559
560 }
561
562 static inline int width_to_agaw(int width)
563 {
564         return (width - 30) / LEVEL_STRIDE;
565 }
566
567 static inline unsigned int level_to_offset_bits(int level)
568 {
569         return (12 + (level - 1) * LEVEL_STRIDE);
570 }
571
572 static inline int address_level_offset(u64 addr, int level)
573 {
574         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
575 }
576
577 static inline u64 level_mask(int level)
578 {
579         return ((u64)-1 << level_to_offset_bits(level));
580 }
581
582 static inline u64 level_size(int level)
583 {
584         return ((u64)1 << level_to_offset_bits(level));
585 }
586
587 static inline u64 align_to_level(u64 addr, int level)
588 {
589         return ((addr + level_size(level) - 1) & level_mask(level));
590 }
591
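/*
 * Walk the domain's page table down to the last-level pte for addr, allocating
 * missing intermediate page-table pages on the way; returns NULL if an
 * allocation fails.
 */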
592 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
593 {
594         int addr_width = agaw_to_width(domain->agaw);
595         struct dma_pte *parent, *pte = NULL;
596         int level = agaw_to_level(domain->agaw);
597         int offset;
598         unsigned long flags;
599
600         BUG_ON(!domain->pgd);
601
602         addr &= (((u64)1) << addr_width) - 1;
603         parent = domain->pgd;
604
605         spin_lock_irqsave(&domain->mapping_lock, flags);
606         while (level > 0) {
607                 void *tmp_page;
608
609                 offset = address_level_offset(addr, level);
610                 pte = &parent[offset];
611                 if (level == 1)
612                         break;
613
614                 if (!dma_pte_present(pte)) {
615                         tmp_page = alloc_pgtable_page();
616
617                         if (!tmp_page) {
618                                 spin_unlock_irqrestore(&domain->mapping_lock,
619                                         flags);
620                                 return NULL;
621                         }
622                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
623                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
624                         /*
625                          * high level table always sets r/w, last level page
626                          * table control read/write
627                          */
628                         dma_set_pte_readable(pte);
629                         dma_set_pte_writable(pte);
630                         domain_flush_cache(domain, pte, sizeof(*pte));
631                 }
632                 parent = phys_to_virt(dma_pte_addr(pte));
633                 level--;
634         }
635
636         spin_unlock_irqrestore(&domain->mapping_lock, flags);
637         return pte;
638 }
639
640 /* return the address's pte at the specified level */
641 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
642                 int level)
643 {
644         struct dma_pte *parent, *pte = NULL;
645         int total = agaw_to_level(domain->agaw);
646         int offset;
647
648         parent = domain->pgd;
649         while (level <= total) {
650                 offset = address_level_offset(addr, total);
651                 pte = &parent[offset];
652                 if (level == total)
653                         return pte;
654
655                 if (!dma_pte_present(pte))
656                         break;
657                 parent = phys_to_virt(dma_pte_addr(pte));
658                 total--;
659         }
660         return NULL;
661 }
662
663 /* clear the last-level pte for one page */
664 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
665 {
666         struct dma_pte *pte = NULL;
667
668         /* get last level pte */
669         pte = dma_addr_level_pte(domain, addr, 1);
670
671         if (pte) {
672                 dma_clear_pte(pte);
673                 domain_flush_cache(domain, pte, sizeof(*pte));
674         }
675 }
676
677 /* clear last level ptes; a tlb flush should follow */
678 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
679 {
680         int addr_width = agaw_to_width(domain->agaw);
681
682         start &= (((u64)1) << addr_width) - 1;
683         end &= (((u64)1) << addr_width) - 1;
684         /* in case it's a partial page */
685         start = PAGE_ALIGN(start);
686         end &= PAGE_MASK;
687
688         /* we don't need a lock here; nobody else touches this iova range */
689         while (start < end) {
690                 dma_pte_clear_one(domain, start);
691                 start += VTD_PAGE_SIZE;
692         }
693 }
694
695 /* free page table pages. last level pte should already be cleared */
696 static void dma_pte_free_pagetable(struct dmar_domain *domain,
697         u64 start, u64 end)
698 {
699         int addr_width = agaw_to_width(domain->agaw);
700         struct dma_pte *pte;
701         int total = agaw_to_level(domain->agaw);
702         int level;
703         u64 tmp;
704
705         start &= (((u64)1) << addr_width) - 1;
706         end &= (((u64)1) << addr_width) - 1;
707
708         /* we don't need a lock here; nobody else touches this iova range */
709         level = 2;
710         while (level <= total) {
711                 tmp = align_to_level(start, level);
712                 if (tmp >= end || (tmp + level_size(level) > end))
713                         return;
714
715                 while (tmp < end) {
716                         pte = dma_addr_level_pte(domain, tmp, level);
717                         if (pte) {
718                                 free_pgtable_page(
719                                         phys_to_virt(dma_pte_addr(pte)));
720                                 dma_clear_pte(pte);
721                                 domain_flush_cache(domain, pte, sizeof(*pte));
722                         }
723                         tmp += level_size(level);
724                 }
725                 level++;
726         }
727         /* free pgd */
728         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
729                 free_pgtable_page(domain->pgd);
730                 domain->pgd = NULL;
731         }
732 }
733
734 /* iommu handling */
735 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
736 {
737         struct root_entry *root;
738         unsigned long flags;
739
740         root = (struct root_entry *)alloc_pgtable_page();
741         if (!root)
742                 return -ENOMEM;
743
744         __iommu_flush_cache(iommu, root, ROOT_SIZE);
745
746         spin_lock_irqsave(&iommu->lock, flags);
747         iommu->root_entry = root;
748         spin_unlock_irqrestore(&iommu->lock, flags);
749
750         return 0;
751 }
752
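/* Program the root table address and issue SRTP, waiting until hardware reports RTPS. */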
753 static void iommu_set_root_entry(struct intel_iommu *iommu)
754 {
755         void *addr;
756         u32 cmd, sts;
757         unsigned long flag;
758
759         addr = iommu->root_entry;
760
761         spin_lock_irqsave(&iommu->register_lock, flag);
762         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
763
764         cmd = iommu->gcmd | DMA_GCMD_SRTP;
765         writel(cmd, iommu->reg + DMAR_GCMD_REG);
766
767         /* Make sure hardware completes it */
768         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
769                 readl, (sts & DMA_GSTS_RTPS), sts);
770
771         spin_unlock_irqrestore(&iommu->register_lock, flag);
772 }
773
774 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
775 {
776         u32 val;
777         unsigned long flag;
778
779         if (!cap_rwbf(iommu->cap))
780                 return;
781         val = iommu->gcmd | DMA_GCMD_WBF;
782
783         spin_lock_irqsave(&iommu->register_lock, flag);
784         writel(val, iommu->reg + DMAR_GCMD_REG);
785
786         /* Make sure hardware completes it */
787         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
788                         readl, (!(val & DMA_GSTS_WBFS)), val);
789
790         spin_unlock_irqrestore(&iommu->register_lock, flag);
791 }
792
793 /* return value determines whether we need a write buffer flush */
794 static int __iommu_flush_context(struct intel_iommu *iommu,
795         u16 did, u16 source_id, u8 function_mask, u64 type,
796         int non_present_entry_flush)
797 {
798         u64 val = 0;
799         unsigned long flag;
800
801         /*
802          * In the non-present entry flush case, if hardware doesn't cache
803          * non-present entries we do nothing; if hardware does cache non-present
804          * entries, we flush entries of domain 0 (domain id 0 is used to cache
805          * any non-present entries)
806          */
807         if (non_present_entry_flush) {
808                 if (!cap_caching_mode(iommu->cap))
809                         return 1;
810                 else
811                         did = 0;
812         }
813
814         switch (type) {
815         case DMA_CCMD_GLOBAL_INVL:
816                 val = DMA_CCMD_GLOBAL_INVL;
817                 break;
818         case DMA_CCMD_DOMAIN_INVL:
819                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
820                 break;
821         case DMA_CCMD_DEVICE_INVL:
822                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
823                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
824                 break;
825         default:
826                 BUG();
827         }
828         val |= DMA_CCMD_ICC;
829
830         spin_lock_irqsave(&iommu->register_lock, flag);
831         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
832
833         /* Make sure hardware completes it */
834         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
835                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
836
837         spin_unlock_irqrestore(&iommu->register_lock, flag);
838
839         /* flushing the context entry will implicitly flush the write buffer */
840         return 0;
841 }
842
843 /* return value determines whether we need a write buffer flush */
844 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
845         u64 addr, unsigned int size_order, u64 type,
846         int non_present_entry_flush)
847 {
848         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
849         u64 val = 0, val_iva = 0;
850         unsigned long flag;
851
852         /*
853          * In the non-present entry flush case, if hardware doesn't cache
854          * non-present entries we do nothing; if hardware does cache non-present
855          * entries, we flush entries of domain 0 (domain id 0 is used to cache
856          * any non-present entries)
857          */
858         if (non_present_entry_flush) {
859                 if (!cap_caching_mode(iommu->cap))
860                         return 1;
861                 else
862                         did = 0;
863         }
864
865         switch (type) {
866         case DMA_TLB_GLOBAL_FLUSH:
867                 /* global flush doesn't need to set IVA_REG */
868                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
869                 break;
870         case DMA_TLB_DSI_FLUSH:
871                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
872                 break;
873         case DMA_TLB_PSI_FLUSH:
874                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
875                 /* Note: always flush non-leaf currently */
876                 val_iva = size_order | addr;
877                 break;
878         default:
879                 BUG();
880         }
881         /* Note: set drain read/write */
882 #if 0
883         /*
884          * This is probably just to be extra safe. Looks like we can
885          * ignore it without any impact.
886          */
887         if (cap_read_drain(iommu->cap))
888                 val |= DMA_TLB_READ_DRAIN;
889 #endif
890         if (cap_write_drain(iommu->cap))
891                 val |= DMA_TLB_WRITE_DRAIN;
892
893         spin_lock_irqsave(&iommu->register_lock, flag);
894         /* Note: Only uses first TLB reg currently */
895         if (val_iva)
896                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
897         dmar_writeq(iommu->reg + tlb_offset + 8, val);
898
899         /* Make sure hardware completes it */
900         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
901                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
902
903         spin_unlock_irqrestore(&iommu->register_lock, flag);
904
905         /* check IOTLB invalidation granularity */
906         if (DMA_TLB_IAIG(val) == 0)
907                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
908         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
909                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
910                         (unsigned long long)DMA_TLB_IIRG(type),
911                         (unsigned long long)DMA_TLB_IAIG(val));
912         /* flushing the iotlb entry will implicitly flush the write buffer */
913         return 0;
914 }
915
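/*
 * Page-selective IOTLB invalidation for 'pages' pages at 'addr', falling back
 * to a domain-selective flush when the hardware lacks PSI support or the
 * power-of-two rounded range exceeds the maximum address mask.
 */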
916 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
917         u64 addr, unsigned int pages, int non_present_entry_flush)
918 {
919         unsigned int mask;
920
921         BUG_ON(addr & (~VTD_PAGE_MASK));
922         BUG_ON(pages == 0);
923
924         /* Fall back to domain selective flush if no PSI support */
925         if (!cap_pgsel_inv(iommu->cap))
926                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
927                                                 DMA_TLB_DSI_FLUSH,
928                                                 non_present_entry_flush);
929
930         /*
931          * PSI requires the flushed region to be 2^x pages, with the base address
932          * naturally aligned to that size
933          */
934         mask = ilog2(__roundup_pow_of_two(pages));
935         /* Fall back to domain selective flush if size is too big */
936         if (mask > cap_max_amask_val(iommu->cap))
937                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
938                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
939
940         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
941                                         DMA_TLB_PSI_FLUSH,
942                                         non_present_entry_flush);
943 }
944
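/* Clear the enable-protected-memory bit and wait for the protected region status to drop. */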
945 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
946 {
947         u32 pmen;
948         unsigned long flags;
949
950         spin_lock_irqsave(&iommu->register_lock, flags);
951         pmen = readl(iommu->reg + DMAR_PMEN_REG);
952         pmen &= ~DMA_PMEN_EPM;
953         writel(pmen, iommu->reg + DMAR_PMEN_REG);
954
955         /* wait for the protected region status bit to clear */
956         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
957                 readl, !(pmen & DMA_PMEN_PRS), pmen);
958
959         spin_unlock_irqrestore(&iommu->register_lock, flags);
960 }
961
962 static int iommu_enable_translation(struct intel_iommu *iommu)
963 {
964         u32 sts;
965         unsigned long flags;
966
967         spin_lock_irqsave(&iommu->register_lock, flags);
968         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
969
970         /* Make sure hardware completes it */
971         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
972                 readl, (sts & DMA_GSTS_TES), sts);
973
974         iommu->gcmd |= DMA_GCMD_TE;
975         spin_unlock_irqrestore(&iommu->register_lock, flags);
976         return 0;
977 }
978
979 static int iommu_disable_translation(struct intel_iommu *iommu)
980 {
981         u32 sts;
982         unsigned long flag;
983
984         spin_lock_irqsave(&iommu->register_lock, flag);
985         iommu->gcmd &= ~DMA_GCMD_TE;
986         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
987
988         /* Make sure hardware completes it */
989         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
990                 readl, (!(sts & DMA_GSTS_TES)), sts);
991
992         spin_unlock_irqrestore(&iommu->register_lock, flag);
993         return 0;
994 }
995
996 /* iommu interrupt handling. Most of it is MSI-like. */
997
998 static const char *fault_reason_strings[] =
999 {
1000         "Software",
1001         "Present bit in root entry is clear",
1002         "Present bit in context entry is clear",
1003         "Invalid context entry",
1004         "Access beyond MGAW",
1005         "PTE Write access is not set",
1006         "PTE Read access is not set",
1007         "Next page table ptr is invalid",
1008         "Root table address invalid",
1009         "Context table ptr is invalid",
1010         "non-zero reserved fields in RTP",
1011         "non-zero reserved fields in CTP",
1012         "non-zero reserved fields in PTE",
1013 };
1014 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1015
1016 const char *dmar_get_fault_reason(u8 fault_reason)
1017 {
1018         if (fault_reason > MAX_FAULT_REASON_IDX)
1019                 return "Unknown";
1020         else
1021                 return fault_reason_strings[fault_reason];
1022 }
1023
1024 void dmar_msi_unmask(unsigned int irq)
1025 {
1026         struct intel_iommu *iommu = get_irq_data(irq);
1027         unsigned long flag;
1028
1029         /* unmask it */
1030         spin_lock_irqsave(&iommu->register_lock, flag);
1031         writel(0, iommu->reg + DMAR_FECTL_REG);
1032         /* Read back a reg to force the posted write to be flushed */
1033         readl(iommu->reg + DMAR_FECTL_REG);
1034         spin_unlock_irqrestore(&iommu->register_lock, flag);
1035 }
1036
1037 void dmar_msi_mask(unsigned int irq)
1038 {
1039         unsigned long flag;
1040         struct intel_iommu *iommu = get_irq_data(irq);
1041
1042         /* mask it */
1043         spin_lock_irqsave(&iommu->register_lock, flag);
1044         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1045         /* Read back a reg to force the posted write to be flushed */
1046         readl(iommu->reg + DMAR_FECTL_REG);
1047         spin_unlock_irqrestore(&iommu->register_lock, flag);
1048 }
1049
1050 void dmar_msi_write(int irq, struct msi_msg *msg)
1051 {
1052         struct intel_iommu *iommu = get_irq_data(irq);
1053         unsigned long flag;
1054
1055         spin_lock_irqsave(&iommu->register_lock, flag);
1056         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1057         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1058         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1059         spin_unlock_irqrestore(&iommu->register_lock, flag);
1060 }
1061
1062 void dmar_msi_read(int irq, struct msi_msg *msg)
1063 {
1064         struct intel_iommu *iommu = get_irq_data(irq);
1065         unsigned long flag;
1066
1067         spin_lock_irqsave(&iommu->register_lock, flag);
1068         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1069         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1070         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1071         spin_unlock_irqrestore(&iommu->register_lock, flag);
1072 }
1073
1074 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1075                 u8 fault_reason, u16 source_id, unsigned long long addr)
1076 {
1077         const char *reason;
1078
1079         reason = dmar_get_fault_reason(fault_reason);
1080
1081         printk(KERN_ERR
1082                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1083                 "fault addr %llx \n"
1084                 "DMAR:[fault reason %02d] %s\n",
1085                 (type ? "DMA Read" : "DMA Write"),
1086                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1087                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1088         return 0;
1089 }
1090
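/*
 * Fault interrupt handler: walk the primary fault recording registers,
 * report and clear each pending fault, then clear any overflow condition.
 */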
1091 #define PRIMARY_FAULT_REG_LEN (16)
1092 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1093 {
1094         struct intel_iommu *iommu = dev_id;
1095         int reg, fault_index;
1096         u32 fault_status;
1097         unsigned long flag;
1098
1099         spin_lock_irqsave(&iommu->register_lock, flag);
1100         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1101
1102         /* TBD: ignore advanced fault log currently */
1103         if (!(fault_status & DMA_FSTS_PPF))
1104                 goto clear_overflow;
1105
1106         fault_index = dma_fsts_fault_record_index(fault_status);
1107         reg = cap_fault_reg_offset(iommu->cap);
1108         while (1) {
1109                 u8 fault_reason;
1110                 u16 source_id;
1111                 u64 guest_addr;
1112                 int type;
1113                 u32 data;
1114
1115                 /* highest 32 bits */
1116                 data = readl(iommu->reg + reg +
1117                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1118                 if (!(data & DMA_FRCD_F))
1119                         break;
1120
1121                 fault_reason = dma_frcd_fault_reason(data);
1122                 type = dma_frcd_type(data);
1123
1124                 data = readl(iommu->reg + reg +
1125                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1126                 source_id = dma_frcd_source_id(data);
1127
1128                 guest_addr = dmar_readq(iommu->reg + reg +
1129                                 fault_index * PRIMARY_FAULT_REG_LEN);
1130                 guest_addr = dma_frcd_page_addr(guest_addr);
1131                 /* clear the fault */
1132                 writel(DMA_FRCD_F, iommu->reg + reg +
1133                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1134
1135                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1136
1137                 iommu_page_fault_do_one(iommu, type, fault_reason,
1138                                 source_id, guest_addr);
1139
1140                 fault_index++;
1141                 if (fault_index > cap_num_fault_regs(iommu->cap))
1142                         fault_index = 0;
1143                 spin_lock_irqsave(&iommu->register_lock, flag);
1144         }
1145 clear_overflow:
1146         /* clear primary fault overflow */
1147         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1148         if (fault_status & DMA_FSTS_PFO)
1149                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1150
1151         spin_unlock_irqrestore(&iommu->register_lock, flag);
1152         return IRQ_HANDLED;
1153 }
1154
1155 int dmar_set_interrupt(struct intel_iommu *iommu)
1156 {
1157         int irq, ret;
1158
1159         irq = create_irq();
1160         if (!irq) {
1161                 printk(KERN_ERR "IOMMU: no free vectors\n");
1162                 return -EINVAL;
1163         }
1164
1165         set_irq_data(irq, iommu);
1166         iommu->irq = irq;
1167
1168         ret = arch_setup_dmar_msi(irq);
1169         if (ret) {
1170                 set_irq_data(irq, NULL);
1171                 iommu->irq = 0;
1172                 destroy_irq(irq);
1173                 return 0;
1174         }
1175
1176         /* Make sure the fault registers start out cleared */
1177         iommu_page_fault(irq, iommu);
1178
1179         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1180         if (ret)
1181                 printk(KERN_ERR "IOMMU: can't request irq\n");
1182         return ret;
1183 }
1184
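/*
 * Size and allocate the per-iommu domain-id bitmap and domain pointer array
 * from the hardware's number-of-domains capability.
 */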
1185 static int iommu_init_domains(struct intel_iommu *iommu)
1186 {
1187         unsigned long ndomains;
1188         unsigned long nlongs;
1189
1190         ndomains = cap_ndoms(iommu->cap);
1191         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1192         nlongs = BITS_TO_LONGS(ndomains);
1193
1194         /* TBD: there might be 64K domains,
1195          * consider a different allocation scheme for future chips
1196          */
1197         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1198         if (!iommu->domain_ids) {
1199                 printk(KERN_ERR "Allocating domain id array failed\n");
1200                 return -ENOMEM;
1201         }
1202         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1203                         GFP_KERNEL);
1204         if (!iommu->domains) {
1205                 printk(KERN_ERR "Allocating domain array failed\n");
1206                 kfree(iommu->domain_ids);
1207                 return -ENOMEM;
1208         }
1209
1210         spin_lock_init(&iommu->lock);
1211
1212         /*
1213          * if Caching mode is set, then invalid translations are tagged
1214          * with domain id 0. Hence we need to pre-allocate it.
1215          */
1216         if (cap_caching_mode(iommu->cap))
1217                 set_bit(0, iommu->domain_ids);
1218         return 0;
1219 }
1220
1221
1222 static void domain_exit(struct dmar_domain *domain);
1223 static void vm_domain_exit(struct dmar_domain *domain);
1224
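/*
 * Tear down one iommu: drop its reference on every domain it serves, disable
 * translation, and release its interrupt, domain arrays and context table.
 */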
1225 void free_dmar_iommu(struct intel_iommu *iommu)
1226 {
1227         struct dmar_domain *domain;
1228         int i;
1229         unsigned long flags;
1230
1231         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1232         for (; i < cap_ndoms(iommu->cap); ) {
1233                 domain = iommu->domains[i];
1234                 clear_bit(i, iommu->domain_ids);
1235
1236                 spin_lock_irqsave(&domain->iommu_lock, flags);
1237                 if (--domain->iommu_count == 0) {
1238                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1239                                 vm_domain_exit(domain);
1240                         else
1241                                 domain_exit(domain);
1242                 }
1243                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1244
1245                 i = find_next_bit(iommu->domain_ids,
1246                         cap_ndoms(iommu->cap), i+1);
1247         }
1248
1249         if (iommu->gcmd & DMA_GCMD_TE)
1250                 iommu_disable_translation(iommu);
1251
1252         if (iommu->irq) {
1253                 set_irq_data(iommu->irq, NULL);
1254                 /* This will mask the irq */
1255                 free_irq(iommu->irq, iommu);
1256                 destroy_irq(iommu->irq);
1257         }
1258
1259         kfree(iommu->domains);
1260         kfree(iommu->domain_ids);
1261
1262         g_iommus[iommu->seq_id] = NULL;
1263
1264         /* if all iommus are freed, free g_iommus */
1265         for (i = 0; i < g_num_of_iommus; i++) {
1266                 if (g_iommus[i])
1267                         break;
1268         }
1269
1270         if (i == g_num_of_iommus)
1271                 kfree(g_iommus);
1272
1273         /* free context mapping */
1274         free_context_table(iommu);
1275 }
1276
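/* Allocate a dmar_domain and claim a free domain id on the given iommu. */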
1277 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1278 {
1279         unsigned long num;
1280         unsigned long ndomains;
1281         struct dmar_domain *domain;
1282         unsigned long flags;
1283
1284         domain = alloc_domain_mem();
1285         if (!domain)
1286                 return NULL;
1287
1288         ndomains = cap_ndoms(iommu->cap);
1289
1290         spin_lock_irqsave(&iommu->lock, flags);
1291         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1292         if (num >= ndomains) {
1293                 spin_unlock_irqrestore(&iommu->lock, flags);
1294                 free_domain_mem(domain);
1295                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1296                 return NULL;
1297         }
1298
1299         set_bit(num, iommu->domain_ids);
1300         domain->id = num;
1301         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1302         set_bit(iommu->seq_id, &domain->iommu_bmp);
1303         domain->flags = 0;
1304         iommu->domains[num] = domain;
1305         spin_unlock_irqrestore(&iommu->lock, flags);
1306
1307         return domain;
1308 }
1309
1310 static void iommu_free_domain(struct dmar_domain *domain)
1311 {
1312         unsigned long flags;
1313         struct intel_iommu *iommu;
1314
1315         iommu = domain_get_iommu(domain);
1316
1317         spin_lock_irqsave(&iommu->lock, flags);
1318         clear_bit(domain->id, iommu->domain_ids);
1319         spin_unlock_irqrestore(&iommu->lock, flags);
1320 }
1321
1322 static struct iova_domain reserved_iova_list;
1323 static struct lock_class_key reserved_alloc_key;
1324 static struct lock_class_key reserved_rbtree_key;
1325
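/*
 * Build the global list of iova ranges that must never be handed out for DMA:
 * the IOAPIC window and every PCI device's MMIO resources (to avoid
 * peer-to-peer hits).
 */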
1326 static void dmar_init_reserved_ranges(void)
1327 {
1328         struct pci_dev *pdev = NULL;
1329         struct iova *iova;
1330         int i;
1331         u64 addr, size;
1332
1333         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1334
1335         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1336                 &reserved_alloc_key);
1337         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1338                 &reserved_rbtree_key);
1339
1340         /* IOAPIC ranges shouldn't be accessed by DMA */
1341         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1342                 IOVA_PFN(IOAPIC_RANGE_END));
1343         if (!iova)
1344                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1345
1346         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1347         for_each_pci_dev(pdev) {
1348                 struct resource *r;
1349
1350                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1351                         r = &pdev->resource[i];
1352                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1353                                 continue;
1354                         addr = r->start;
1355                         addr &= PAGE_MASK;
1356                         size = r->end - addr;
1357                         size = PAGE_ALIGN(size);
1358                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1359                                 IOVA_PFN(size + addr) - 1);
1360                         if (!iova)
1361                                 printk(KERN_ERR "Reserve iova failed\n");
1362                 }
1363         }
1364
1365 }
1366
1367 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1368 {
1369         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1370 }
1371
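/*
 * Round the guest address width up so that (gaw - 12) is a multiple of the
 * 9-bit level stride, capping the result at 64 bits.
 */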
1372 static inline int guestwidth_to_adjustwidth(int gaw)
1373 {
1374         int agaw;
1375         int r = (gaw - 12) % 9;
1376
1377         if (r == 0)
1378                 agaw = gaw;
1379         else
1380                 agaw = gaw + 9 - r;
1381         if (agaw > 64)
1382                 agaw = 64;
1383         return agaw;
1384 }
1385
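/*
 * Initialize a newly allocated domain: set up its iova allocator, pick a
 * hardware-supported agaw for the requested guest width, and allocate the
 * top-level page directory.
 */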
1386 static int domain_init(struct dmar_domain *domain, int guest_width)
1387 {
1388         struct intel_iommu *iommu;
1389         int adjust_width, agaw;
1390         unsigned long sagaw;
1391
1392         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1393         spin_lock_init(&domain->mapping_lock);
1394         spin_lock_init(&domain->iommu_lock);
1395
1396         domain_reserve_special_ranges(domain);
1397
1398         /* calculate AGAW */
1399         iommu = domain_get_iommu(domain);
1400         if (guest_width > cap_mgaw(iommu->cap))
1401                 guest_width = cap_mgaw(iommu->cap);
1402         domain->gaw = guest_width;
1403         adjust_width = guestwidth_to_adjustwidth(guest_width);
1404         agaw = width_to_agaw(adjust_width);
1405         sagaw = cap_sagaw(iommu->cap);
1406         if (!test_bit(agaw, &sagaw)) {
1407                 /* hardware doesn't support it, choose a bigger one */
1408                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1409                 agaw = find_next_bit(&sagaw, 5, agaw);
1410                 if (agaw >= 5)
1411                         return -ENODEV;
1412         }
1413         domain->agaw = agaw;
1414         INIT_LIST_HEAD(&domain->devices);
1415
1416         if (ecap_coherent(iommu->ecap))
1417                 domain->iommu_coherency = 1;
1418         else
1419                 domain->iommu_coherency = 0;
1420
1421         domain->iommu_count = 1;
1422
1423         /* always allocate the top pgd */
1424         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1425         if (!domain->pgd)
1426                 return -ENOMEM;
1427         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1428         return 0;
1429 }
1430
1431 static void domain_exit(struct dmar_domain *domain)
1432 {
1433         u64 end;
1434
1435         /* Domain 0 is reserved, so don't process it */
1436         if (!domain)
1437                 return;
1438
1439         domain_remove_dev_info(domain);
1440         /* destroy iovas */
1441         put_iova_domain(&domain->iovad);
1442         end = DOMAIN_MAX_ADDR(domain->gaw);
1443         end = end & (~PAGE_MASK);
1444
1445         /* clear ptes */
1446         dma_pte_clear_range(domain, 0, end);
1447
1448         /* free page tables */
1449         dma_pte_free_pagetable(domain, 0, end);
1450
1451         iommu_free_domain(domain);
1452         free_domain_mem(domain);
1453 }
1454
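/*
 * Install a context entry on the covering iommu so that (bus, devfn) is
 * translated through this domain's page tables. For virtual-machine domains a
 * per-iommu domain id is looked up or allocated, and top page-table levels are
 * skipped when the iommu's agaw is smaller than the domain's.
 */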
1455 static int domain_context_mapping_one(struct dmar_domain *domain,
1456                 u8 bus, u8 devfn)
1457 {
1458         struct context_entry *context;
1459         unsigned long flags;
1460         struct intel_iommu *iommu;
1461         struct dma_pte *pgd;
1462         unsigned long num;
1463         unsigned long ndomains;
1464         int id;
1465         int agaw;
1466
1467         pr_debug("Set context mapping for %02x:%02x.%d\n",
1468                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1469         BUG_ON(!domain->pgd);
1470
1471         iommu = device_to_iommu(bus, devfn);
1472         if (!iommu)
1473                 return -ENODEV;
1474
1475         context = device_to_context_entry(iommu, bus, devfn);
1476         if (!context)
1477                 return -ENOMEM;
1478         spin_lock_irqsave(&iommu->lock, flags);
1479         if (context_present(context)) {
1480                 spin_unlock_irqrestore(&iommu->lock, flags);
1481                 return 0;
1482         }
1483
1484         id = domain->id;
1485         pgd = domain->pgd;
1486
1487         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1488                 int found = 0;
1489
1490                 /* find an available domain id for this device in iommu */
1491                 ndomains = cap_ndoms(iommu->cap);
1492                 num = find_first_bit(iommu->domain_ids, ndomains);
1493                 for (; num < ndomains; ) {
1494                         if (iommu->domains[num] == domain) {
1495                                 id = num;
1496                                 found = 1;
1497                                 break;
1498                         }
1499                         num = find_next_bit(iommu->domain_ids,
1500                                             cap_ndoms(iommu->cap), num+1);
1501                 }
1502
1503                 if (found == 0) {
1504                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1505                         if (num >= ndomains) {
1506                                 spin_unlock_irqrestore(&iommu->lock, flags);
1507                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1508                                 return -EFAULT;
1509                         }
1510
1511                         set_bit(num, iommu->domain_ids);
1512                         iommu->domains[num] = domain;
1513                         id = num;
1514                 }
1515
1516                 /* Skip top levels of page tables for
1517                  * iommus which have a smaller agaw than the default.
1518                  */
1519                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1520                         pgd = phys_to_virt(dma_pte_addr(pgd));
1521                         if (!dma_pte_present(pgd)) {
1522                                 spin_unlock_irqrestore(&iommu->lock, flags);
1523                                 return -ENOMEM;
1524                         }
1525                 }
1526         }
1527
1528         context_set_domain_id(context, id);
1529         context_set_address_width(context, iommu->agaw);
1530         context_set_address_root(context, virt_to_phys(pgd));
1531         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1532         context_set_fault_enable(context);
1533         context_set_present(context);
1534         domain_flush_cache(domain, context, sizeof(*context));
1535
1536         /* it's a non-present to present mapping */
1537         if (iommu->flush.flush_context(iommu, domain->id,
1538                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1539                 DMA_CCMD_DEVICE_INVL, 1))
1540                 iommu_flush_write_buffer(iommu);
1541         else
1542                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1543
1544         spin_unlock_irqrestore(&iommu->lock, flags);
1545
1546         spin_lock_irqsave(&domain->iommu_lock, flags);
1547         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1548                 domain->iommu_count++;
1549                 domain_update_iommu_coherency(domain);
1550         }
1551         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1552         return 0;
1553 }
1554
1555 static int
1556 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1557 {
1558         int ret;
1559         struct pci_dev *tmp, *parent;
1560
1561         ret = domain_context_mapping_one(domain, pdev->bus->number,
1562                 pdev->devfn);
1563         if (ret)
1564                 return ret;
1565
1566         /* dependent device mapping */
1567         tmp = pci_find_upstream_pcie_bridge(pdev);
1568         if (!tmp)
1569                 return 0;
1570         /* Secondary interface's bus number and devfn 0 */
1571         parent = pdev->bus->self;
1572         while (parent != tmp) {
1573                 ret = domain_context_mapping_one(domain, parent->bus->number,
1574                         parent->devfn);
1575                 if (ret)
1576                         return ret;
1577                 parent = parent->bus->self;
1578         }
1579         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1580                 return domain_context_mapping_one(domain,
1581                         tmp->subordinate->number, 0);
1582         else /* this is a legacy PCI bridge */
1583                 return domain_context_mapping_one(domain,
1584                         tmp->bus->number, tmp->devfn);
1585 }
1586
1587 static int domain_context_mapped(struct pci_dev *pdev)
1588 {
1589         int ret;
1590         struct pci_dev *tmp, *parent;
1591         struct intel_iommu *iommu;
1592
1593         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1594         if (!iommu)
1595                 return -ENODEV;
1596
1597         ret = device_context_mapped(iommu,
1598                 pdev->bus->number, pdev->devfn);
1599         if (!ret)
1600                 return ret;
1601         /* dependent device mapping */
1602         tmp = pci_find_upstream_pcie_bridge(pdev);
1603         if (!tmp)
1604                 return ret;
1605         /* Secondary interface's bus number and devfn 0 */
1606         parent = pdev->bus->self;
1607         while (parent != tmp) {
1608                 ret = device_context_mapped(iommu, parent->bus->number,
1609                         parent->devfn);
1610                 if (!ret)
1611                         return ret;
1612                 parent = parent->bus->self;
1613         }
1614         if (tmp->is_pcie)
1615                 return device_context_mapped(iommu,
1616                         tmp->subordinate->number, 0);
1617         else
1618                 return device_context_mapped(iommu,
1619                         tmp->bus->number, tmp->devfn);
1620 }
1621
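/*
 * Map the physical range [hpa, hpa + size) at the given IO virtual address,
 * one VT-d page at a time, with the requested read/write protection bits.
 */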
1622 static int
1623 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1624                         u64 hpa, size_t size, int prot)
1625 {
1626         u64 start_pfn, end_pfn;
1627         struct dma_pte *pte;
1628         int index;
1629         int addr_width = agaw_to_width(domain->agaw);
1630
1631         hpa &= (((u64)1) << addr_width) - 1;
1632
1633         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1634                 return -EINVAL;
1635         iova &= PAGE_MASK;
1636         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1637         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1638         index = 0;
1639         while (start_pfn < end_pfn) {
1640                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1641                 if (!pte)
1642                         return -ENOMEM;
1643                 /* We don't need a lock here; nobody else
1644                  * touches this iova range.
1645                  */
1646                 BUG_ON(dma_pte_addr(pte));
1647                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1648                 dma_set_pte_prot(pte, prot);
1649                 domain_flush_cache(domain, pte, sizeof(*pte));
1650                 start_pfn++;
1651                 index++;
1652         }
1653         return 0;
1654 }
1655
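/*
 * Clear the context entry for bus/devfn and flush the context cache and
 * IOTLB so the unit stops translating for that device.
 */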
1656 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1657 {
1658         if (!iommu)
1659                 return;
1660
1661         clear_context_table(iommu, bus, devfn);
1662         iommu->flush.flush_context(iommu, 0, 0, 0,
1663                                            DMA_CCMD_GLOBAL_INVL, 0);
1664         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1665                                          DMA_TLB_GLOBAL_FLUSH, 0);
1666 }
1667
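/*
 * Detach every device currently attached to the domain and free the
 * corresponding device_domain_info entries.
 */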
1668 static void domain_remove_dev_info(struct dmar_domain *domain)
1669 {
1670         struct device_domain_info *info;
1671         unsigned long flags;
1672         struct intel_iommu *iommu;
1673
1674         spin_lock_irqsave(&device_domain_lock, flags);
1675         while (!list_empty(&domain->devices)) {
1676                 info = list_entry(domain->devices.next,
1677                         struct device_domain_info, link);
1678                 list_del(&info->link);
1679                 list_del(&info->global);
1680                 if (info->dev)
1681                         info->dev->dev.archdata.iommu = NULL;
1682                 spin_unlock_irqrestore(&device_domain_lock, flags);
1683
1684                 iommu = device_to_iommu(info->bus, info->devfn);
1685                 iommu_detach_dev(iommu, info->bus, info->devfn);
1686                 free_devinfo_mem(info);
1687
1688                 spin_lock_irqsave(&device_domain_lock, flags);
1689         }
1690         spin_unlock_irqrestore(&device_domain_lock, flags);
1691 }
1692
1693 /*
1694  * find_domain
1695  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1696  */
1697 static struct dmar_domain *
1698 find_domain(struct pci_dev *pdev)
1699 {
1700         struct device_domain_info *info;
1701
1702         /* No lock here; assumes the domain does not exit in the normal case */
1703         info = pdev->dev.archdata.iommu;
1704         if (info)
1705                 return info->domain;
1706         return NULL;
1707 }
1708
1709 /* Get the domain for a device; the returned domain is initialized */
1710 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1711 {
1712         struct dmar_domain *domain, *found = NULL;
1713         struct intel_iommu *iommu;
1714         struct dmar_drhd_unit *drhd;
1715         struct device_domain_info *info, *tmp;
1716         struct pci_dev *dev_tmp;
1717         unsigned long flags;
1718         int bus = 0, devfn = 0;
1719
1720         domain = find_domain(pdev);
1721         if (domain)
1722                 return domain;
1723
1724         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1725         if (dev_tmp) {
1726                 if (dev_tmp->is_pcie) {
1727                         bus = dev_tmp->subordinate->number;
1728                         devfn = 0;
1729                 } else {
1730                         bus = dev_tmp->bus->number;
1731                         devfn = dev_tmp->devfn;
1732                 }
1733                 spin_lock_irqsave(&device_domain_lock, flags);
1734                 list_for_each_entry(info, &device_domain_list, global) {
1735                         if (info->bus == bus && info->devfn == devfn) {
1736                                 found = info->domain;
1737                                 break;
1738                         }
1739                 }
1740                 spin_unlock_irqrestore(&device_domain_lock, flags);
1741                 /* pcie-pci bridge already has a domain, use it */
1742                 if (found) {
1743                         domain = found;
1744                         goto found_domain;
1745                 }
1746         }
1747
1748         /* Allocate new domain for the device */
1749         drhd = dmar_find_matched_drhd_unit(pdev);
1750         if (!drhd) {
1751                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1752                         pci_name(pdev));
1753                 return NULL;
1754         }
1755         iommu = drhd->iommu;
1756
1757         domain = iommu_alloc_domain(iommu);
1758         if (!domain)
1759                 goto error;
1760
1761         if (domain_init(domain, gaw)) {
1762                 domain_exit(domain);
1763                 goto error;
1764         }
1765
1766         /* register pcie-to-pci device */
1767         if (dev_tmp) {
1768                 info = alloc_devinfo_mem();
1769                 if (!info) {
1770                         domain_exit(domain);
1771                         goto error;
1772                 }
1773                 info->bus = bus;
1774                 info->devfn = devfn;
1775                 info->dev = NULL;
1776                 info->domain = domain;
1777                 /* This domain is shared by devices under p2p bridge */
1778                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1779
1780                 /* pcie-to-pci bridge already has a domain, use it */
1781                 found = NULL;
1782                 spin_lock_irqsave(&device_domain_lock, flags);
1783                 list_for_each_entry(tmp, &device_domain_list, global) {
1784                         if (tmp->bus == bus && tmp->devfn == devfn) {
1785                                 found = tmp->domain;
1786                                 break;
1787                         }
1788                 }
1789                 if (found) {
1790                         free_devinfo_mem(info);
1791                         domain_exit(domain);
1792                         domain = found;
1793                 } else {
1794                         list_add(&info->link, &domain->devices);
1795                         list_add(&info->global, &device_domain_list);
1796                 }
1797                 spin_unlock_irqrestore(&device_domain_lock, flags);
1798         }
1799
1800 found_domain:
1801         info = alloc_devinfo_mem();
1802         if (!info)
1803                 goto error;
1804         info->bus = pdev->bus->number;
1805         info->devfn = pdev->devfn;
1806         info->dev = pdev;
1807         info->domain = domain;
1808         spin_lock_irqsave(&device_domain_lock, flags);
1809         /* somebody else raced with us and already set it up */
1810         found = find_domain(pdev);
1811         if (found != NULL) {
1812                 spin_unlock_irqrestore(&device_domain_lock, flags);
1813                 if (found != domain) {
1814                         domain_exit(domain);
1815                         domain = found;
1816                 }
1817                 free_devinfo_mem(info);
1818                 return domain;
1819         }
1820         list_add(&info->link, &domain->devices);
1821         list_add(&info->global, &device_domain_list);
1822         pdev->dev.archdata.iommu = info;
1823         spin_unlock_irqrestore(&device_domain_lock, flags);
1824         return domain;
1825 error:
1826         /* recheck it here; somebody else may have set it in the meantime */
1827         return find_domain(pdev);
1828 }
1829
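/*
 * Establish a 1:1 (identity) mapping of [start, end) for the device:
 * reserve the iova range, clear any stale PTEs, map the pages read/write
 * and install the context entries.
 */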
1830 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1831                                       unsigned long long start,
1832                                       unsigned long long end)
1833 {
1834         struct dmar_domain *domain;
1835         unsigned long size;
1836         unsigned long long base;
1837         int ret;
1838
1839         printk(KERN_INFO
1840                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1841                 pci_name(pdev), start, end);
1842         /* page table init */
1843         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1844         if (!domain)
1845                 return -ENOMEM;
1846
1847         /* The address might not be aligned */
1848         base = start & PAGE_MASK;
1849         size = end - base;
1850         size = PAGE_ALIGN(size);
1851         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1852                         IOVA_PFN(base + size) - 1)) {
1853                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1854                 ret = -ENOMEM;
1855                 goto error;
1856         }
1857
1858         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1859                 size, base, pci_name(pdev));
1860         /*
1861          * RMRR range might have overlap with physical memory range,
1862          * clear it first
1863          */
1864         dma_pte_clear_range(domain, base, base + size);
1865
1866         ret = domain_page_mapping(domain, base, base, size,
1867                 DMA_PTE_READ|DMA_PTE_WRITE);
1868         if (ret)
1869                 goto error;
1870
1871         /* context entry init */
1872         ret = domain_context_mapping(domain, pdev);
1873         if (!ret)
1874                 return 0;
1875 error:
1876         domain_exit(domain);
1877         return ret;
1878
1879 }
1880
1881 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1882         struct pci_dev *pdev)
1883 {
1884         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1885                 return 0;
1886         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1887                 rmrr->end_address + 1);
1888 }
1889
1890 #ifdef CONFIG_DMAR_GFX_WA
1891 struct iommu_prepare_data {
1892         struct pci_dev *pdev;
1893         int ret;
1894 };
1895
1896 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1897                                          unsigned long end_pfn, void *datax)
1898 {
1899         struct iommu_prepare_data *data;
1900
1901         data = (struct iommu_prepare_data *)datax;
1902
1903         data->ret = iommu_prepare_identity_map(data->pdev,
1904                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1905         return data->ret;
1906
1907 }
1908
1909 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1910 {
1911         int nid;
1912         struct iommu_prepare_data data;
1913
1914         data.pdev = pdev;
1915         data.ret = 0;
1916
1917         for_each_online_node(nid) {
1918                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1919                 if (data.ret)
1920                         return data.ret;
1921         }
1922         return data.ret;
1923 }
1924
1925 static void __init iommu_prepare_gfx_mapping(void)
1926 {
1927         struct pci_dev *pdev = NULL;
1928         int ret;
1929
1930         for_each_pci_dev(pdev) {
1931                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1932                                 !IS_GFX_DEVICE(pdev))
1933                         continue;
1934                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1935                         pci_name(pdev));
1936                 ret = iommu_prepare_with_active_regions(pdev);
1937                 if (ret)
1938                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1939         }
1940 }
1941 #else /* !CONFIG_DMAR_GFX_WA */
1942 static inline void iommu_prepare_gfx_mapping(void)
1943 {
1944         return;
1945 }
1946 #endif
1947
1948 #ifdef CONFIG_DMAR_FLOPPY_WA
1949 static inline void iommu_prepare_isa(void)
1950 {
1951         struct pci_dev *pdev;
1952         int ret;
1953
1954         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1955         if (!pdev)
1956                 return;
1957
1958         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1959         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1960
1961         if (ret)
1962                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1963                         "floppy might not work\n");
1964
1965 }
1966 #else
1967 static inline void iommu_prepare_isa(void)
1968 {
1969         return;
1970 }
1971 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1972
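/*
 * One-time DMA-remapping initialization: allocate the global iommu and
 * deferred-flush arrays, set up domains and root entries per iommu, pick
 * queued vs. register-based invalidation, create the RMRR/gfx/ISA identity
 * maps, and finally enable translation on every unit.
 */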
1973 static int __init init_dmars(void)
1974 {
1975         struct dmar_drhd_unit *drhd;
1976         struct dmar_rmrr_unit *rmrr;
1977         struct pci_dev *pdev;
1978         struct intel_iommu *iommu;
1979         int i, ret, unit = 0;
1980
1981         /*
1982          * for each drhd
1983          *    allocate root
1984          *    initialize and program root entry to not present
1985          * endfor
1986          */
1987         for_each_drhd_unit(drhd) {
1988                 g_num_of_iommus++;
1989                 /*
1990                  * lock not needed as this is only incremented in the
1991                  * single-threaded kernel __init code path; all other
1992                  * accesses are read-only
1993                  */
1994         }
1995
1996         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1997                         GFP_KERNEL);
1998         if (!g_iommus) {
1999                 printk(KERN_ERR "Allocating global iommu array failed\n");
2000                 ret = -ENOMEM;
2001                 goto error;
2002         }
2003
2004         deferred_flush = kzalloc(g_num_of_iommus *
2005                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2006         if (!deferred_flush) {
2007                 kfree(g_iommus);
2008                 ret = -ENOMEM;
2009                 goto error;
2010         }
2011
2012         for_each_drhd_unit(drhd) {
2013                 if (drhd->ignored)
2014                         continue;
2015
2016                 iommu = drhd->iommu;
2017                 g_iommus[iommu->seq_id] = iommu;
2018
2019                 ret = iommu_init_domains(iommu);
2020                 if (ret)
2021                         goto error;
2022
2023                 /*
2024                  * TBD:
2025                  * we could share the same root & context tables
2026                  * among all IOMMUs. Need to split it later.
2027                  */
2028                 ret = iommu_alloc_root_entry(iommu);
2029                 if (ret) {
2030                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2031                         goto error;
2032                 }
2033         }
2034
2035         for_each_drhd_unit(drhd) {
2036                 if (drhd->ignored)
2037                         continue;
2038
2039                 iommu = drhd->iommu;
2040                 if (dmar_enable_qi(iommu)) {
2041                         /*
2042                          * Queued Invalidation is not enabled; use Register-Based
2043                          * Invalidation
2044                          */
2045                         iommu->flush.flush_context = __iommu_flush_context;
2046                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2047                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2048                                "invalidation\n",
2049                                (unsigned long long)drhd->reg_base_addr);
2050                 } else {
2051                         iommu->flush.flush_context = qi_flush_context;
2052                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2053                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2054                                "invalidation\n",
2055                                (unsigned long long)drhd->reg_base_addr);
2056                 }
2057         }
2058
2059         /*
2060          * For each rmrr
2061          *   for each dev attached to rmrr
2062          *   do
2063          *     locate drhd for dev, alloc domain for dev
2064          *     allocate free domain
2065          *     allocate page table entries for rmrr
2066          *     if context not allocated for bus
2067          *           allocate and init context
2068          *           set present in root table for this bus
2069          *     init context with domain, translation etc
2070          *    endfor
2071          * endfor
2072          */
2073         for_each_rmrr_units(rmrr) {
2074                 for (i = 0; i < rmrr->devices_cnt; i++) {
2075                         pdev = rmrr->devices[i];
2076                         /* some BIOSes list non-existent devices in the DMAR table */
2077                         if (!pdev)
2078                                 continue;
2079                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2080                         if (ret)
2081                                 printk(KERN_ERR
2082                                  "IOMMU: mapping reserved region failed\n");
2083                 }
2084         }
2085
2086         iommu_prepare_gfx_mapping();
2087
2088         iommu_prepare_isa();
2089
2090         /*
2091          * for each drhd
2092          *   enable fault log
2093          *   global invalidate context cache
2094          *   global invalidate iotlb
2095          *   enable translation
2096          */
2097         for_each_drhd_unit(drhd) {
2098                 if (drhd->ignored)
2099                         continue;
2100                 iommu = drhd->iommu;
2101                 sprintf(iommu->name, "dmar%d", unit++);
2102
2103                 iommu_flush_write_buffer(iommu);
2104
2105                 ret = dmar_set_interrupt(iommu);
2106                 if (ret)
2107                         goto error;
2108
2109                 iommu_set_root_entry(iommu);
2110
2111                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2112                                            0);
2113                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2114                                          0);
2115                 iommu_disable_protect_mem_regions(iommu);
2116
2117                 ret = iommu_enable_translation(iommu);
2118                 if (ret)
2119                         goto error;
2120         }
2121
2122         return 0;
2123 error:
2124         for_each_drhd_unit(drhd) {
2125                 if (drhd->ignored)
2126                         continue;
2127                 iommu = drhd->iommu;
2128                 free_iommu(iommu);
2129         }
2130         kfree(g_iommus);
2131         return ret;
2132 }
2133
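/*
 * Round a host buffer up to whole pages, accounting for the offset of
 * host_addr within its first page.
 */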
2134 static inline u64 aligned_size(u64 host_addr, size_t size)
2135 {
2136         u64 addr;
2137         addr = (host_addr & (~PAGE_MASK)) + size;
2138         return PAGE_ALIGN(addr);
2139 }
2140
2141 struct iova *
2142 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2143 {
2144         struct iova *piova;
2145
2146         /* Make sure it's in range */
2147         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2148         if (!size || (IOVA_START_ADDR + size > end))
2149                 return NULL;
2150
2151         piova = alloc_iova(&domain->iovad,
2152                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2153         return piova;
2154 }
2155
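/*
 * Allocate an iova range for the device.  Devices limited to 32-bit DMA
 * (or all devices when dmar_forcedac is set) allocate directly against
 * their mask; otherwise the space below 4GB is tried first, with the full
 * mask as a fallback.
 */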
2156 static struct iova *
2157 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2158                    size_t size, u64 dma_mask)
2159 {
2160         struct pci_dev *pdev = to_pci_dev(dev);
2161         struct iova *iova = NULL;
2162
2163         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2164                 iova = iommu_alloc_iova(domain, size, dma_mask);
2165         else {
2166                 /*
2167                  * First try to allocate an io virtual address below
2168                  * DMA_32BIT_MASK and, if that fails, try allocating
2169                  * from the higher range.
2170                  */
2171                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2172                 if (!iova)
2173                         iova = iommu_alloc_iova(domain, size, dma_mask);
2174         }
2175
2176         if (!iova) {
2177                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2178                 return NULL;
2179         }
2180
2181         return iova;
2182 }
2183
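/*
 * Return the device's domain, allocating one and installing the context
 * mapping if it does not exist yet.
 */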
2184 static struct dmar_domain *
2185 get_valid_domain_for_dev(struct pci_dev *pdev)
2186 {
2187         struct dmar_domain *domain;
2188         int ret;
2189
2190         domain = get_domain_for_dev(pdev,
2191                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2192         if (!domain) {
2193                 printk(KERN_ERR
2194                         "Allocating domain for %s failed\n", pci_name(pdev));
2195                 return NULL;
2196         }
2197
2198         /* make sure context mapping is ok */
2199         if (unlikely(!domain_context_mapped(pdev))) {
2200                 ret = domain_context_mapping(domain, pdev);
2201                 if (ret) {
2202                         printk(KERN_ERR
2203                                 "Domain context map for %s failed\n",
2204                                 pci_name(pdev));
2205                         return NULL;
2206                 }
2207         }
2208
2209         return domain;
2210 }
2211
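/*
 * Common mapping helper: allocate an iova, build the page table entries and
 * flush the IOTLB for the new (non-present to present) mapping.  Returns the
 * bus address, or 0 on failure; devices marked with DUMMY_DEVICE_DOMAIN_INFO
 * bypass translation and simply get the physical address back.
 */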
2212 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2213                                      size_t size, int dir, u64 dma_mask)
2214 {
2215         struct pci_dev *pdev = to_pci_dev(hwdev);
2216         struct dmar_domain *domain;
2217         phys_addr_t start_paddr;
2218         struct iova *iova;
2219         int prot = 0;
2220         int ret;
2221         struct intel_iommu *iommu;
2222
2223         BUG_ON(dir == DMA_NONE);
2224         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2225                 return paddr;
2226
2227         domain = get_valid_domain_for_dev(pdev);
2228         if (!domain)
2229                 return 0;
2230
2231         iommu = domain_get_iommu(domain);
2232         size = aligned_size((u64)paddr, size);
2233
2234         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2235         if (!iova)
2236                 goto error;
2237
2238         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2239
2240         /*
2241          * Check if DMAR supports zero-length reads on write only
2242          * mappings.
2243          */
2244         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2245                         !cap_zlr(iommu->cap))
2246                 prot |= DMA_PTE_READ;
2247         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2248                 prot |= DMA_PTE_WRITE;
2249         /*
2250          * paddr to (paddr + size) might span a partial page, so we should map
2251          * the whole page.  Note: if two parts of one page are mapped separately,
2252          * we might have two guest_addrs mapping to the same host paddr, but
2253          * this is not a big problem.
2254          */
2255         ret = domain_page_mapping(domain, start_paddr,
2256                 ((u64)paddr) & PAGE_MASK, size, prot);
2257         if (ret)
2258                 goto error;
2259
2260         /* it's a non-present to present mapping */
2261         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2262                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2263         if (ret)
2264                 iommu_flush_write_buffer(iommu);
2265
2266         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2267
2268 error:
2269         if (iova)
2270                 __free_iova(&domain->iovad, iova);
2271         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2272                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2273         return 0;
2274 }
2275
2276 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2277                                  unsigned long offset, size_t size,
2278                                  enum dma_data_direction dir,
2279                                  struct dma_attrs *attrs)
2280 {
2281         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2282                                   dir, to_pci_dev(dev)->dma_mask);
2283 }
2284
2285 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2286                             size_t size, int dir)
2287 {
2288         return __intel_map_single(hwdev, paddr, size, dir,
2289                                   to_pci_dev(hwdev)->dma_mask);
2290 }
2291
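/*
 * Flush the IOTLB of every iommu with deferred unmaps pending and release
 * the corresponding iovas.  Called with async_umap_flush_lock held.
 */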
2292 static void flush_unmaps(void)
2293 {
2294         int i, j;
2295
2296         timer_on = 0;
2297
2298         /* just flush them all */
2299         for (i = 0; i < g_num_of_iommus; i++) {
2300                 struct intel_iommu *iommu = g_iommus[i];
2301                 if (!iommu)
2302                         continue;
2303
2304                 if (deferred_flush[i].next) {
2305                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2306                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2307                         for (j = 0; j < deferred_flush[i].next; j++) {
2308                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2309                                                 deferred_flush[i].iova[j]);
2310                         }
2311                         deferred_flush[i].next = 0;
2312                 }
2313         }
2314
2315         list_size = 0;
2316 }
2317
2318 static void flush_unmaps_timeout(unsigned long data)
2319 {
2320         unsigned long flags;
2321
2322         spin_lock_irqsave(&async_umap_flush_lock, flags);
2323         flush_unmaps();
2324         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2325 }
2326
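/*
 * Queue an iova for deferred freeing; the batch is flushed from a timer, or
 * immediately once HIGH_WATER_MARK entries have accumulated, to amortize
 * the cost of the IOTLB flush.
 */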
2327 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2328 {
2329         unsigned long flags;
2330         int next, iommu_id;
2331         struct intel_iommu *iommu;
2332
2333         spin_lock_irqsave(&async_umap_flush_lock, flags);
2334         if (list_size == HIGH_WATER_MARK)
2335                 flush_unmaps();
2336
2337         iommu = domain_get_iommu(dom);
2338         iommu_id = iommu->seq_id;
2339
2340         next = deferred_flush[iommu_id].next;
2341         deferred_flush[iommu_id].domain[next] = dom;
2342         deferred_flush[iommu_id].iova[next] = iova;
2343         deferred_flush[iommu_id].next++;
2344
2345         if (!timer_on) {
2346                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2347                 timer_on = 1;
2348         }
2349         list_size++;
2350         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2351 }
2352
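/*
 * Tear down a mapping created by __intel_map_single(): clear the PTEs, free
 * the page tables, and either flush the IOTLB synchronously (in
 * intel_iommu_strict mode) or defer the iova release via add_unmap().
 */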
2353 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2354                              size_t size, enum dma_data_direction dir,
2355                              struct dma_attrs *attrs)
2356 {
2357         struct pci_dev *pdev = to_pci_dev(dev);
2358         struct dmar_domain *domain;
2359         unsigned long start_addr;
2360         struct iova *iova;
2361         struct intel_iommu *iommu;
2362
2363         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2364                 return;
2365         domain = find_domain(pdev);
2366         BUG_ON(!domain);
2367
2368         iommu = domain_get_iommu(domain);
2369
2370         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2371         if (!iova)
2372                 return;
2373
2374         start_addr = iova->pfn_lo << PAGE_SHIFT;
2375         size = aligned_size((u64)dev_addr, size);
2376
2377         pr_debug("Device %s unmapping: %lx@%llx\n",
2378                 pci_name(pdev), size, (unsigned long long)start_addr);
2379
2380         /*  clear the whole page */
2381         dma_pte_clear_range(domain, start_addr, start_addr + size);
2382         /* free page tables */
2383         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2384         if (intel_iommu_strict) {
2385                 if (iommu_flush_iotlb_psi(iommu,
2386                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2387                         iommu_flush_write_buffer(iommu);
2388                 /* free iova */
2389                 __free_iova(&domain->iovad, iova);
2390         } else {
2391                 add_unmap(domain, iova);
2392                 /*
2393                  * queue up the release of the unmap to save the roughly 1/6th
2394                  * of the cpu time otherwise spent on the iotlb flush operation...
2395                  */
2396         }
2397 }
2398
2399 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2400                         int dir)
2401 {
2402         intel_unmap_page(dev, dev_addr, size, dir, NULL);
2403 }
2404
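/*
 * Allocate zeroed pages for a coherent DMA buffer and map them
 * bidirectionally using the device's coherent DMA mask.
 */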
2405 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2406                            dma_addr_t *dma_handle, gfp_t flags)
2407 {
2408         void *vaddr;
2409         int order;
2410
2411         size = PAGE_ALIGN(size);
2412         order = get_order(size);
2413         flags &= ~(GFP_DMA | GFP_DMA32);
2414
2415         vaddr = (void *)__get_free_pages(flags, order);
2416         if (!vaddr)
2417                 return NULL;
2418         memset(vaddr, 0, size);
2419
2420         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2421                                          DMA_BIDIRECTIONAL,
2422                                          hwdev->coherent_dma_mask);
2423         if (*dma_handle)
2424                 return vaddr;
2425         free_pages((unsigned long)vaddr, order);
2426         return NULL;
2427 }
2428
2429 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2430                          dma_addr_t dma_handle)
2431 {
2432         int order;
2433
2434         size = PAGE_ALIGN(size);
2435         order = get_order(size);
2436
2437         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2438         free_pages((unsigned long)vaddr, order);
2439 }
2440
2441 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2442
2443 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2444                     int nelems, enum dma_data_direction dir,
2445                     struct dma_attrs *attrs)
2446 {
2447         int i;
2448         struct pci_dev *pdev = to_pci_dev(hwdev);
2449         struct dmar_domain *domain;
2450         unsigned long start_addr;
2451         struct iova *iova;
2452         size_t size = 0;
2453         void *addr;
2454         struct scatterlist *sg;
2455         struct intel_iommu *iommu;
2456
2457         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2458                 return;
2459
2460         domain = find_domain(pdev);
2461         BUG_ON(!domain);
2462
2463         iommu = domain_get_iommu(domain);
2464
2465         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2466         if (!iova)
2467                 return;
2468         for_each_sg(sglist, sg, nelems, i) {
2469                 addr = SG_ENT_VIRT_ADDRESS(sg);
2470                 size += aligned_size((u64)addr, sg->length);
2471         }
2472
2473         start_addr = iova->pfn_lo << PAGE_SHIFT;
2474
2475         /*  clear the whole page */
2476         dma_pte_clear_range(domain, start_addr, start_addr + size);
2477         /* free page tables */
2478         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2479
2480         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2481                         size >> VTD_PAGE_SHIFT, 0))
2482                 iommu_flush_write_buffer(iommu);
2483
2484         /* free iova */
2485         __free_iova(&domain->iovad, iova);
2486 }
2487
2488 static int intel_nontranslate_map_sg(struct device *hddev,
2489         struct scatterlist *sglist, int nelems, int dir)
2490 {
2491         int i;
2492         struct scatterlist *sg;
2493
2494         for_each_sg(sglist, sg, nelems, i) {
2495                 BUG_ON(!sg_page(sg));
2496                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2497                 sg->dma_length = sg->length;
2498         }
2499         return nelems;
2500 }
2501
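/*
 * Map a scatterlist into one contiguous iova range.  On failure the
 * partially built mapping is torn down and 0 is returned.
 */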
2502 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2503                  enum dma_data_direction dir, struct dma_attrs *attrs)
2504 {
2505         void *addr;
2506         int i;
2507         struct pci_dev *pdev = to_pci_dev(hwdev);
2508         struct dmar_domain *domain;
2509         size_t size = 0;
2510         int prot = 0;
2511         size_t offset = 0;
2512         struct iova *iova = NULL;
2513         int ret;
2514         struct scatterlist *sg;
2515         unsigned long start_addr;
2516         struct intel_iommu *iommu;
2517
2518         BUG_ON(dir == DMA_NONE);
2519         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2520                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2521
2522         domain = get_valid_domain_for_dev(pdev);
2523         if (!domain)
2524                 return 0;
2525
2526         iommu = domain_get_iommu(domain);
2527
2528         for_each_sg(sglist, sg, nelems, i) {
2529                 addr = SG_ENT_VIRT_ADDRESS(sg);
2530                 addr = (void *)virt_to_phys(addr);
2531                 size += aligned_size((u64)addr, sg->length);
2532         }
2533
2534         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2535         if (!iova) {
2536                 sglist->dma_length = 0;
2537                 return 0;
2538         }
2539
2540         /*
2541          * Check if DMAR supports zero-length reads on write only
2542          * mappings.
2543          */
2544         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2545                         !cap_zlr(iommu->cap))
2546                 prot |= DMA_PTE_READ;
2547         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2548                 prot |= DMA_PTE_WRITE;
2549
2550         start_addr = iova->pfn_lo << PAGE_SHIFT;
2551         offset = 0;
2552         for_each_sg(sglist, sg, nelems, i) {
2553                 addr = SG_ENT_VIRT_ADDRESS(sg);
2554                 addr = (void *)virt_to_phys(addr);
2555                 size = aligned_size((u64)addr, sg->length);
2556                 ret = domain_page_mapping(domain, start_addr + offset,
2557                         ((u64)addr) & PAGE_MASK,
2558                         size, prot);
2559                 if (ret) {
2560                         /*  clear the page */
2561                         dma_pte_clear_range(domain, start_addr,
2562                                   start_addr + offset);
2563                         /* free page tables */
2564                         dma_pte_free_pagetable(domain, start_addr,
2565                                   start_addr + offset);
2566                         /* free iova */
2567                         __free_iova(&domain->iovad, iova);
2568                         return 0;
2569                 }
2570                 sg->dma_address = start_addr + offset +
2571                                 ((u64)addr & (~PAGE_MASK));
2572                 sg->dma_length = sg->length;
2573                 offset += size;
2574         }
2575
2576         /* it's a non-present to present mapping */
2577         if (iommu_flush_iotlb_psi(iommu, domain->id,
2578                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2579                 iommu_flush_write_buffer(iommu);
2580         return nelems;
2581 }
2582
2583 struct dma_map_ops intel_dma_ops = {
2584         .alloc_coherent = intel_alloc_coherent,
2585         .free_coherent = intel_free_coherent,
2586         .map_sg = intel_map_sg,
2587         .unmap_sg = intel_unmap_sg,
2588         .map_page = intel_map_page,
2589         .unmap_page = intel_unmap_page,
2590 };
2591
2592 static inline int iommu_domain_cache_init(void)
2593 {
2594         int ret = 0;
2595
2596         iommu_domain_cache = kmem_cache_create("iommu_domain",
2597                                          sizeof(struct dmar_domain),
2598                                          0,
2599                                          SLAB_HWCACHE_ALIGN,
2601                                          NULL);
2602         if (!iommu_domain_cache) {
2603                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2604                 ret = -ENOMEM;
2605         }
2606
2607         return ret;
2608 }
2609
2610 static inline int iommu_devinfo_cache_init(void)
2611 {
2612         int ret = 0;
2613
2614         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2615                                          sizeof(struct device_domain_info),
2616                                          0,
2617                                          SLAB_HWCACHE_ALIGN,
2618                                          NULL);
2619         if (!iommu_devinfo_cache) {
2620                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2621                 ret = -ENOMEM;
2622         }
2623
2624         return ret;
2625 }
2626
2627 static inline int iommu_iova_cache_init(void)
2628 {
2629         int ret = 0;
2630
2631         iommu_iova_cache = kmem_cache_create("iommu_iova",
2632                                          sizeof(struct iova),
2633                                          0,
2634                                          SLAB_HWCACHE_ALIGN,
2635                                          NULL);
2636         if (!iommu_iova_cache) {
2637                 printk(KERN_ERR "Couldn't create iova cache\n");
2638                 ret = -ENOMEM;
2639         }
2640
2641         return ret;
2642 }
2643
2644 static int __init iommu_init_mempool(void)
2645 {
2646         int ret;
2647         ret = iommu_iova_cache_init();
2648         if (ret)
2649                 return ret;
2650
2651         ret = iommu_domain_cache_init();
2652         if (ret)
2653                 goto domain_error;
2654
2655         ret = iommu_devinfo_cache_init();
2656         if (!ret)
2657                 return ret;
2658
2659         kmem_cache_destroy(iommu_domain_cache);
2660 domain_error:
2661         kmem_cache_destroy(iommu_iova_cache);
2662
2663         return -ENOMEM;
2664 }
2665
2666 static void __init iommu_exit_mempool(void)
2667 {
2668         kmem_cache_destroy(iommu_devinfo_cache);
2669         kmem_cache_destroy(iommu_domain_cache);
2670         kmem_cache_destroy(iommu_iova_cache);
2671
2672 }
2673
2674 static void __init init_no_remapping_devices(void)
2675 {
2676         struct dmar_drhd_unit *drhd;
2677
2678         for_each_drhd_unit(drhd) {
2679                 if (!drhd->include_all) {
2680                         int i;
2681                         for (i = 0; i < drhd->devices_cnt; i++)
2682                                 if (drhd->devices[i] != NULL)
2683                                         break;
2684                         /* ignore DMAR unit if no pci devices exist */
2685                         if (i == drhd->devices_cnt)
2686                                 drhd->ignored = 1;
2687                 }
2688         }
2689
2690         if (dmar_map_gfx)
2691                 return;
2692
2693         for_each_drhd_unit(drhd) {
2694                 int i;
2695                 if (drhd->ignored || drhd->include_all)
2696                         continue;
2697
2698                 for (i = 0; i < drhd->devices_cnt; i++)
2699                         if (drhd->devices[i] &&
2700                                 !IS_GFX_DEVICE(drhd->devices[i]))
2701                                 break;
2702
2703                 if (i < drhd->devices_cnt)
2704                         continue;
2705
2706                 /* bypass IOMMU if it is just for gfx devices */
2707                 drhd->ignored = 1;
2708                 for (i = 0; i < drhd->devices_cnt; i++) {
2709                         if (!drhd->devices[i])
2710                                 continue;
2711                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2712                 }
2713         }
2714 }
2715
2716 int __init intel_iommu_init(void)
2717 {
2718         int ret = 0;
2719
2720         if (dmar_table_init())
2721                 return  -ENODEV;
2722
2723         if (dmar_dev_scope_init())
2724                 return  -ENODEV;
2725
2726         /*
2727          * Check the need for DMA-remapping initialization now.
2728          * Above initialization will also be used by Interrupt-remapping.
2729          */
2730         if (no_iommu || swiotlb || dmar_disabled)
2731                 return -ENODEV;
2732
2733         iommu_init_mempool();
2734         dmar_init_reserved_ranges();
2735
2736         init_no_remapping_devices();
2737
2738         ret = init_dmars();
2739         if (ret) {
2740                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2741                 put_iova_domain(&reserved_iova_list);
2742                 iommu_exit_mempool();
2743                 return ret;
2744         }
2745         printk(KERN_INFO
2746         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2747
2748         init_timer(&unmap_timer);
2749         force_iommu = 1;
2750         dma_ops = &intel_dma_ops;
2751
2752         register_iommu(&intel_iommu_ops);
2753
2754         return 0;
2755 }
2756
2757 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2758                                   struct pci_dev *pdev)
2759 {
2760         struct device_domain_info *info;
2761         unsigned long flags;
2762
2763         info = alloc_devinfo_mem();
2764         if (!info)
2765                 return -ENOMEM;
2766
2767         info->bus = pdev->bus->number;
2768         info->devfn = pdev->devfn;
2769         info->dev = pdev;
2770         info->domain = domain;
2771
2772         spin_lock_irqsave(&device_domain_lock, flags);
2773         list_add(&info->link, &domain->devices);
2774         list_add(&info->global, &device_domain_list);
2775         pdev->dev.archdata.iommu = info;
2776         spin_unlock_irqrestore(&device_domain_lock, flags);
2777
2778         return 0;
2779 }
2780
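/*
 * Detach one device from a virtual-machine domain.  If no other device in
 * the domain is behind the same iommu, that iommu is also cleared from the
 * domain's bitmap and the iommu count/coherency are updated.
 */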
2781 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2782                                           struct pci_dev *pdev)
2783 {
2784         struct device_domain_info *info;
2785         struct intel_iommu *iommu;
2786         unsigned long flags;
2787         int found = 0;
2788         struct list_head *entry, *tmp;
2789
2790         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2791         if (!iommu)
2792                 return;
2793
2794         spin_lock_irqsave(&device_domain_lock, flags);
2795         list_for_each_safe(entry, tmp, &domain->devices) {
2796                 info = list_entry(entry, struct device_domain_info, link);
2797                 if (info->bus == pdev->bus->number &&
2798                     info->devfn == pdev->devfn) {
2799                         list_del(&info->link);
2800                         list_del(&info->global);
2801                         if (info->dev)
2802                                 info->dev->dev.archdata.iommu = NULL;
2803                         spin_unlock_irqrestore(&device_domain_lock, flags);
2804
2805                         iommu_detach_dev(iommu, info->bus, info->devfn);
2806                         free_devinfo_mem(info);
2807
2808                         spin_lock_irqsave(&device_domain_lock, flags);
2809
2810                         if (found)
2811                                 break;
2812                         else
2813                                 continue;
2814                 }
2815
2816                 /* if there are no other devices under the same iommu
2817                  * owned by this domain, clear this iommu in iommu_bmp,
2818                  * and update the iommu count and coherency
2819                  */
2820                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2821                         found = 1;
2822         }
2823
2824         if (found == 0) {
2825                 unsigned long tmp_flags;
2826                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2827                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2828                 domain->iommu_count--;
2829                 domain_update_iommu_coherency(domain);
2830                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2831         }
2832
2833         spin_unlock_irqrestore(&device_domain_lock, flags);
2834 }
2835
2836 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2837 {
2838         struct device_domain_info *info;
2839         struct intel_iommu *iommu;
2840         unsigned long flags1, flags2;
2841
2842         spin_lock_irqsave(&device_domain_lock, flags1);
2843         while (!list_empty(&domain->devices)) {
2844                 info = list_entry(domain->devices.next,
2845                         struct device_domain_info, link);
2846                 list_del(&info->link);
2847                 list_del(&info->global);
2848                 if (info->dev)
2849                         info->dev->dev.archdata.iommu = NULL;
2850
2851                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2852
2853                 iommu = device_to_iommu(info->bus, info->devfn);
2854                 iommu_detach_dev(iommu, info->bus, info->devfn);
2855
2856                 /* clear this iommu in iommu_bmp, update iommu count
2857                  * and coherency
2858                  */
2859                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2860                 if (test_and_clear_bit(iommu->seq_id,
2861                                        &domain->iommu_bmp)) {
2862                         domain->iommu_count--;
2863                         domain_update_iommu_coherency(domain);
2864                 }
2865                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2866
2867                 free_devinfo_mem(info);
2868                 spin_lock_irqsave(&device_domain_lock, flags1);
2869         }
2870         spin_unlock_irqrestore(&device_domain_lock, flags1);
2871 }
2872
2873 /* domain id for a virtual machine; it won't be set in any context entry */
2874 static unsigned long vm_domid;
2875
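/*
 * Return the smallest agaw among the domain itself and every iommu it is
 * currently attached to; this bounds the address width the domain may use.
 */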
2876 static int vm_domain_min_agaw(struct dmar_domain *domain)
2877 {
2878         int i;
2879         int min_agaw = domain->agaw;
2880
2881         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2882         for (; i < g_num_of_iommus; ) {
2883                 if (min_agaw > g_iommus[i]->agaw)
2884                         min_agaw = g_iommus[i]->agaw;
2885
2886                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2887         }
2888
2889         return min_agaw;
2890 }
2891
2892 static struct dmar_domain *iommu_alloc_vm_domain(void)
2893 {
2894         struct dmar_domain *domain;
2895
2896         domain = alloc_domain_mem();
2897         if (!domain)
2898                 return NULL;
2899
2900         domain->id = vm_domid++;
2901         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2902         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2903
2904         return domain;
2905 }
2906
2907 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2908 {
2909         int adjust_width;
2910
2911         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2912         spin_lock_init(&domain->mapping_lock);
2913         spin_lock_init(&domain->iommu_lock);
2914
2915         domain_reserve_special_ranges(domain);
2916
2917         /* calculate AGAW */
2918         domain->gaw = guest_width;
2919         adjust_width = guestwidth_to_adjustwidth(guest_width);
2920         domain->agaw = width_to_agaw(adjust_width);
2921
2922         INIT_LIST_HEAD(&domain->devices);
2923
2924         domain->iommu_count = 0;
2925         domain->iommu_coherency = 0;
2926         domain->max_addr = 0;
2927
2928         /* always allocate the top pgd */
2929         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2930         if (!domain->pgd)
2931                 return -ENOMEM;
2932         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2933         return 0;
2934 }
2935
2936 static void iommu_free_vm_domain(struct dmar_domain *domain)
2937 {
2938         unsigned long flags;
2939         struct dmar_drhd_unit *drhd;
2940         struct intel_iommu *iommu;
2941         unsigned long i;
2942         unsigned long ndomains;
2943
2944         for_each_drhd_unit(drhd) {
2945                 if (drhd->ignored)
2946                         continue;
2947                 iommu = drhd->iommu;
2948
2949                 ndomains = cap_ndoms(iommu->cap);
2950                 i = find_first_bit(iommu->domain_ids, ndomains);
2951                 for (; i < ndomains; ) {
2952                         if (iommu->domains[i] == domain) {
2953                                 spin_lock_irqsave(&iommu->lock, flags);
2954                                 clear_bit(i, iommu->domain_ids);
2955                                 iommu->domains[i] = NULL;
2956                                 spin_unlock_irqrestore(&iommu->lock, flags);
2957                                 break;
2958                         }
2959                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2960                 }
2961         }
2962 }
2963
2964 static void vm_domain_exit(struct dmar_domain *domain)
2965 {
2966         u64 end;
2967
2968         /* Domain 0 is reserved, so don't process it */
2969         if (!domain)
2970                 return;
2971
2972         vm_domain_remove_all_dev_info(domain);
2973         /* destroy iovas */
2974         put_iova_domain(&domain->iovad);
2975         end = DOMAIN_MAX_ADDR(domain->gaw);
2976         end = end & (~VTD_PAGE_MASK);
2977
2978         /* clear ptes */
2979         dma_pte_clear_range(domain, 0, end);
2980
2981         /* free page tables */
2982         dma_pte_free_pagetable(domain, 0, end);
2983
2984         iommu_free_vm_domain(domain);
2985         free_domain_mem(domain);
2986 }
2987
2988 static int intel_iommu_domain_init(struct iommu_domain *domain)
2989 {
2990         struct dmar_domain *dmar_domain;
2991
2992         dmar_domain = iommu_alloc_vm_domain();
2993         if (!dmar_domain) {
2994                 printk(KERN_ERR
2995                         "intel_iommu_domain_init: dmar_domain == NULL\n");
2996                 return -ENOMEM;
2997         }
2998         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2999                 printk(KERN_ERR
3000                         "intel_iommu_domain_init() failed\n");
3001                 vm_domain_exit(dmar_domain);
3002                 return -ENOMEM;
3003         }
3004         domain->priv = dmar_domain;
3005
3006         return 0;
3007 }
3008
3009 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3010 {
3011         struct dmar_domain *dmar_domain = domain->priv;
3012
3013         domain->priv = NULL;
3014         vm_domain_exit(dmar_domain);
3015 }
3016
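/*
 * iommu_ops attach_dev callback: detach the device from any previous
 * domain, check that this iommu's agaw covers the domain's highest mapped
 * address, then install the context mapping and device info.
 */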
3017 static int intel_iommu_attach_device(struct iommu_domain *domain,
3018                                      struct device *dev)
3019 {
3020         struct dmar_domain *dmar_domain = domain->priv;
3021         struct pci_dev *pdev = to_pci_dev(dev);
3022         struct intel_iommu *iommu;
3023         int addr_width;
3024         u64 end;
3025         int ret;
3026
3027         /* normally pdev is not mapped */
3028         if (unlikely(domain_context_mapped(pdev))) {
3029                 struct dmar_domain *old_domain;
3030
3031                 old_domain = find_domain(pdev);
3032                 if (old_domain) {
3033                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3034                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3035                         else
3036                                 domain_remove_dev_info(old_domain);
3037                 }
3038         }
3039
3040         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3041         if (!iommu)
3042                 return -ENODEV;
3043
3044         /* check if this iommu agaw is sufficient for max mapped address */
3045         addr_width = agaw_to_width(iommu->agaw);
3046         end = DOMAIN_MAX_ADDR(addr_width);
3047         end = end & VTD_PAGE_MASK;
3048         if (end < dmar_domain->max_addr) {
3049                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3050                        "sufficient for the mapped address (%llx)\n",
3051                        __func__, iommu->agaw, dmar_domain->max_addr);
3052                 return -EFAULT;
3053         }
3054
3055         ret = domain_context_mapping(dmar_domain, pdev);
3056         if (ret)
3057                 return ret;
3058
3059         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3060         return ret;
3061 }
3062
3063 static void intel_iommu_detach_device(struct iommu_domain *domain,
3064                                       struct device *dev)
3065 {
3066         struct dmar_domain *dmar_domain = domain->priv;
3067         struct pci_dev *pdev = to_pci_dev(dev);
3068
3069         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3070 }
3071
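/*
 * iommu_ops map callback: grow the domain's max_addr if necessary (checking
 * that the smallest attached agaw still covers it) and install the page
 * mappings with the requested protection.
 */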
3072 static int intel_iommu_map_range(struct iommu_domain *domain,
3073                                  unsigned long iova, phys_addr_t hpa,
3074                                  size_t size, int iommu_prot)
3075 {
3076         struct dmar_domain *dmar_domain = domain->priv;
3077         u64 max_addr;
3078         int addr_width;
3079         int prot = 0;
3080         int ret;
3081
3082         if (iommu_prot & IOMMU_READ)
3083                 prot |= DMA_PTE_READ;
3084         if (iommu_prot & IOMMU_WRITE)
3085                 prot |= DMA_PTE_WRITE;
3086
3087         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3088         if (dmar_domain->max_addr < max_addr) {
3089                 int min_agaw;
3090                 u64 end;
3091
3092                 /* check if minimum agaw is sufficient for mapped address */
3093                 min_agaw = vm_domain_min_agaw(dmar_domain);
3094                 addr_width = agaw_to_width(min_agaw);
3095                 end = DOMAIN_MAX_ADDR(addr_width);
3096                 end = end & VTD_PAGE_MASK;
3097                 if (end < max_addr) {
3098                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3099                                "sufficient for the mapped address (%llx)\n",
3100                                __func__, min_agaw, max_addr);
3101                         return -EFAULT;
3102                 }
3103                 dmar_domain->max_addr = max_addr;
3104         }
3105
3106         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3107         return ret;
3108 }
3109
3110 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3111                                     unsigned long iova, size_t size)
3112 {
3113         struct dmar_domain *dmar_domain = domain->priv;
3114         dma_addr_t base;
3115
3116         /* The address might not be aligned */
3117         base = iova & VTD_PAGE_MASK;
3118         size = VTD_PAGE_ALIGN(size);
3119         dma_pte_clear_range(dmar_domain, base, base + size);
3120
3121         if (dmar_domain->max_addr == base + size)
3122                 dmar_domain->max_addr = base;
3123 }
3124
3125 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3126                                             unsigned long iova)
3127 {
3128         struct dmar_domain *dmar_domain = domain->priv;
3129         struct dma_pte *pte;
3130         u64 phys = 0;
3131
3132         pte = addr_to_dma_pte(dmar_domain, iova);
3133         if (pte)
3134                 phys = dma_pte_addr(pte);
3135
3136         return phys;
3137 }
3138
3139 static struct iommu_ops intel_iommu_ops = {
3140         .domain_init    = intel_iommu_domain_init,
3141         .domain_destroy = intel_iommu_domain_destroy,
3142         .attach_dev     = intel_iommu_attach_device,
3143         .detach_dev     = intel_iommu_detach_device,
3144         .map            = intel_iommu_map_range,
3145         .unmap          = intel_iommu_unmap_range,
3146         .iova_to_phys   = intel_iommu_iova_to_phys,
3147 };