2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
43 #define ROOT_SIZE VTD_PAGE_SIZE
44 #define CONTEXT_SIZE VTD_PAGE_SIZE
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
49 #define IOAPIC_RANGE_START (0xfee00000)
50 #define IOAPIC_RANGE_END (0xfeefffff)
51 #define IOVA_START_ADDR (0x1000)
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
57 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
67 * 12-63: Context Ptr (12 - (haw-1))
74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
75 static inline bool root_present(struct root_entry *root)
77 return (root->val & 1);
79 static inline void set_root_present(struct root_entry *root)
83 static inline void set_root_value(struct root_entry *root, unsigned long value)
85 root->val |= value & VTD_PAGE_MASK;
88 static inline struct context_entry *
89 get_context_addr_from_root(struct root_entry *root)
91 return (struct context_entry *)
92 (root_present(root)?phys_to_virt(
93 root->val & VTD_PAGE_MASK) :
100 * 1: fault processing disable
101 * 2-3: translation type
102 * 12-63: address space root
108 struct context_entry {
113 static inline bool context_present(struct context_entry *context)
115 return (context->lo & 1);
117 static inline void context_set_present(struct context_entry *context)
122 static inline void context_set_fault_enable(struct context_entry *context)
124 context->lo &= (((u64)-1) << 2) | 1;
127 #define CONTEXT_TT_MULTI_LEVEL 0
129 static inline void context_set_translation_type(struct context_entry *context,
132 context->lo &= (((u64)-1) << 4) | 3;
133 context->lo |= (value & 3) << 2;
136 static inline void context_set_address_root(struct context_entry *context,
139 context->lo |= value & VTD_PAGE_MASK;
142 static inline void context_set_address_width(struct context_entry *context,
145 context->hi |= value & 7;
148 static inline void context_set_domain_id(struct context_entry *context,
151 context->hi |= (value & ((1 << 16) - 1)) << 8;
154 static inline void context_clear_entry(struct context_entry *context)
166 * 12-63: Host physical address
172 static inline void dma_clear_pte(struct dma_pte *pte)
177 static inline void dma_set_pte_readable(struct dma_pte *pte)
179 pte->val |= DMA_PTE_READ;
182 static inline void dma_set_pte_writable(struct dma_pte *pte)
184 pte->val |= DMA_PTE_WRITE;
187 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
189 pte->val = (pte->val & ~3) | (prot & 3);
192 static inline u64 dma_pte_addr(struct dma_pte *pte)
194 return (pte->val & VTD_PAGE_MASK);
197 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
199 pte->val |= (addr & VTD_PAGE_MASK);
202 static inline bool dma_pte_present(struct dma_pte *pte)
204 return (pte->val & 3) != 0;
207 /* devices under the same p2p bridge are owned in one domain */
208 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
210 /* domain represents a virtual machine, more than one devices
211 * across iommus may be owned in one domain, e.g. kvm guest.
213 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
216 int id; /* domain id */
217 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
219 struct list_head devices; /* all devices' list */
220 struct iova_domain iovad; /* iova's that belong to this domain */
222 struct dma_pte *pgd; /* virtual address */
223 spinlock_t mapping_lock; /* page table lock */
224 int gaw; /* max guest address width */
226 /* adjusted guest address width, 0 is level 2 30-bit */
229 int flags; /* flags to find out type of domain */
231 int iommu_coherency;/* indicate coherency of iommu access */
232 int iommu_count; /* reference count of iommu */
233 spinlock_t iommu_lock; /* protect iommu set in domain */
234 u64 max_addr; /* maximum mapped address */
237 /* PCI domain-device relationship */
238 struct device_domain_info {
239 struct list_head link; /* link to domain siblings */
240 struct list_head global; /* link to global list */
241 u8 bus; /* PCI bus numer */
242 u8 devfn; /* PCI devfn number */
243 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
244 struct dmar_domain *domain; /* pointer to domain */
247 static void flush_unmaps_timeout(unsigned long data);
249 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
251 #define HIGH_WATER_MARK 250
252 struct deferred_flush_tables {
254 struct iova *iova[HIGH_WATER_MARK];
255 struct dmar_domain *domain[HIGH_WATER_MARK];
258 static struct deferred_flush_tables *deferred_flush;
260 /* bitmap for indexing intel_iommus */
261 static int g_num_of_iommus;
263 static DEFINE_SPINLOCK(async_umap_flush_lock);
264 static LIST_HEAD(unmaps_to_do);
267 static long list_size;
269 static void domain_remove_dev_info(struct dmar_domain *domain);
272 static int __initdata dmar_map_gfx = 1;
273 static int dmar_forcedac;
274 static int intel_iommu_strict;
276 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
277 static DEFINE_SPINLOCK(device_domain_lock);
278 static LIST_HEAD(device_domain_list);
280 static struct iommu_ops intel_iommu_ops;
282 static int __init intel_iommu_setup(char *str)
287 if (!strncmp(str, "off", 3)) {
289 printk(KERN_INFO"Intel-IOMMU: disabled\n");
290 } else if (!strncmp(str, "igfx_off", 8)) {
293 "Intel-IOMMU: disable GFX device mapping\n");
294 } else if (!strncmp(str, "forcedac", 8)) {
296 "Intel-IOMMU: Forcing DAC for PCI devices\n");
298 } else if (!strncmp(str, "strict", 6)) {
300 "Intel-IOMMU: disable batched IOTLB flush\n");
301 intel_iommu_strict = 1;
304 str += strcspn(str, ",");
310 __setup("intel_iommu=", intel_iommu_setup);
312 static struct kmem_cache *iommu_domain_cache;
313 static struct kmem_cache *iommu_devinfo_cache;
314 static struct kmem_cache *iommu_iova_cache;
316 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
/*
 * Slab allocation helper for IOMMU metadata: temporarily sets PF_MEMALLOC
 * on the current task so the GFP_ATOMIC allocation may dip into emergency
 * memory reserves, then restores the caller's original PF_MEMALLOC state.
 * NOTE(review): this excerpt is missing lines (declarations, braces,
 * returns) relative to the original file; comments annotate visible code.
 */
321 /* trying to avoid low memory issues */
/* Save the caller's PF_MEMALLOC bit so it can be restored below. */
322 flags = current->flags & PF_MEMALLOC;
323 current->flags |= PF_MEMALLOC;
324 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
/* Clears PF_MEMALLOC only when it was clear on entry (flags == 0). */
325 current->flags &= (~PF_MEMALLOC | flags);
330 static inline void *alloc_pgtable_page(void)
/*
 * Allocate one zeroed page for use as a VT-d page table / context table,
 * using the same PF_MEMALLOC save/set/restore pattern as
 * iommu_kmem_cache_alloc() above.
 */
335 /* trying to avoid low memory issues */
336 flags = current->flags & PF_MEMALLOC;
337 current->flags |= PF_MEMALLOC;
338 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
339 current->flags &= (~PF_MEMALLOC | flags);
343 static inline void free_pgtable_page(void *vaddr)
/* Release a page obtained from alloc_pgtable_page(). */
345 free_page((unsigned long)vaddr);
348 static inline void *alloc_domain_mem(void)
350 return iommu_kmem_cache_alloc(iommu_domain_cache);
353 static void free_domain_mem(void *vaddr)
355 kmem_cache_free(iommu_domain_cache, vaddr);
358 static inline void * alloc_devinfo_mem(void)
360 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
363 static inline void free_devinfo_mem(void *vaddr)
365 kmem_cache_free(iommu_devinfo_cache, vaddr);
368 struct iova *alloc_iova_mem(void)
370 return iommu_kmem_cache_alloc(iommu_iova_cache);
373 void free_iova_mem(struct iova *iova)
375 kmem_cache_free(iommu_iova_cache, iova);
379 static inline int width_to_agaw(int width);
381 /* calculate agaw for each iommu.
382 * "SAGAW" may be different across iommus, use a default agaw, and
383 * get a supported less agaw for iommus that don't support the default agaw.
385 int iommu_calculate_agaw(struct intel_iommu *iommu)
390 sagaw = cap_sagaw(iommu->cap);
391 for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
393 if (test_bit(agaw, &sagaw))
400 /* in native case, each domain is related to only one iommu */
401 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
405 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
407 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
408 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
411 return g_iommus[iommu_id];
414 /* "Coherency" capability may be different across iommus */
415 static void domain_update_iommu_coherency(struct dmar_domain *domain)
419 domain->iommu_coherency = 1;
421 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
422 for (; i < g_num_of_iommus; ) {
423 if (!ecap_coherent(g_iommus[i]->ecap)) {
424 domain->iommu_coherency = 0;
427 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
431 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
433 struct dmar_drhd_unit *drhd = NULL;
436 for_each_drhd_unit(drhd) {
440 for (i = 0; i < drhd->devices_cnt; i++)
441 if (drhd->devices[i]->bus->number == bus &&
442 drhd->devices[i]->devfn == devfn)
445 if (drhd->include_all)
452 static void domain_flush_cache(struct dmar_domain *domain,
453 void *addr, int size)
455 if (!domain->iommu_coherency)
456 clflush_cache_range(addr, size);
459 /* Gets context entry for a given bus and devfn */
460 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
463 struct root_entry *root;
464 struct context_entry *context;
465 unsigned long phy_addr;
468 spin_lock_irqsave(&iommu->lock, flags);
469 root = &iommu->root_entry[bus];
470 context = get_context_addr_from_root(root);
472 context = (struct context_entry *)alloc_pgtable_page();
474 spin_unlock_irqrestore(&iommu->lock, flags);
477 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
478 phy_addr = virt_to_phys((void *)context);
479 set_root_value(root, phy_addr);
480 set_root_present(root);
481 __iommu_flush_cache(iommu, root, sizeof(*root));
483 spin_unlock_irqrestore(&iommu->lock, flags);
484 return &context[devfn];
487 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
489 struct root_entry *root;
490 struct context_entry *context;
494 spin_lock_irqsave(&iommu->lock, flags);
495 root = &iommu->root_entry[bus];
496 context = get_context_addr_from_root(root);
501 ret = context_present(&context[devfn]);
503 spin_unlock_irqrestore(&iommu->lock, flags);
507 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
509 struct root_entry *root;
510 struct context_entry *context;
513 spin_lock_irqsave(&iommu->lock, flags);
514 root = &iommu->root_entry[bus];
515 context = get_context_addr_from_root(root);
517 context_clear_entry(&context[devfn]);
518 __iommu_flush_cache(iommu, &context[devfn], \
521 spin_unlock_irqrestore(&iommu->lock, flags);
524 static void free_context_table(struct intel_iommu *iommu)
526 struct root_entry *root;
529 struct context_entry *context;
531 spin_lock_irqsave(&iommu->lock, flags);
532 if (!iommu->root_entry) {
535 for (i = 0; i < ROOT_ENTRY_NR; i++) {
536 root = &iommu->root_entry[i];
537 context = get_context_addr_from_root(root);
539 free_pgtable_page(context);
541 free_pgtable_page(iommu->root_entry);
542 iommu->root_entry = NULL;
544 spin_unlock_irqrestore(&iommu->lock, flags);
547 /* page table handling */
548 #define LEVEL_STRIDE (9)
549 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
551 static inline int agaw_to_level(int agaw)
556 static inline int agaw_to_width(int agaw)
558 return 30 + agaw * LEVEL_STRIDE;
562 static inline int width_to_agaw(int width)
564 return (width - 30) / LEVEL_STRIDE;
567 static inline unsigned int level_to_offset_bits(int level)
569 return (12 + (level - 1) * LEVEL_STRIDE);
572 static inline int address_level_offset(u64 addr, int level)
574 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
577 static inline u64 level_mask(int level)
579 return ((u64)-1 << level_to_offset_bits(level));
582 static inline u64 level_size(int level)
584 return ((u64)1 << level_to_offset_bits(level));
587 static inline u64 align_to_level(u64 addr, int level)
589 return ((addr + level_size(level) - 1) & level_mask(level));
/*
 * Walk (and build, on demand) the domain's page table down to the last
 * level for DMA address 'addr', returning a pointer to the leaf PTE.
 * Intermediate table pages are allocated lazily under mapping_lock and
 * flushed to memory when the IOMMU is not cache-coherent.
 * NOTE(review): this excerpt is missing lines (loop header, error paths,
 * braces, return) relative to the original file.
 */
592 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
594 int addr_width = agaw_to_width(domain->agaw);
595 struct dma_pte *parent, *pte = NULL;
596 int level = agaw_to_level(domain->agaw);
600 BUG_ON(!domain->pgd);
/* Mask the address down to the domain's supported width. */
602 addr &= (((u64)1) << addr_width) - 1;
603 parent = domain->pgd;
605 spin_lock_irqsave(&domain->mapping_lock, flags);
609 offset = address_level_offset(addr, level);
610 pte = &parent[offset];
/* Missing intermediate table: allocate and install a new page. */
614 if (!dma_pte_present(pte)) {
615 tmp_page = alloc_pgtable_page();
618 spin_unlock_irqrestore(&domain->mapping_lock,
/* Make the new (zeroed) table visible before linking it in. */
622 domain_flush_cache(domain, tmp_page, PAGE_SIZE);
623 dma_set_pte_addr(pte, virt_to_phys(tmp_page));
625 * high level table always sets r/w, last level page
626 * table control read/write
628 dma_set_pte_readable(pte);
629 dma_set_pte_writable(pte);
630 domain_flush_cache(domain, pte, sizeof(*pte));
/* Descend to the next level. */
632 parent = phys_to_virt(dma_pte_addr(pte));
636 spin_unlock_irqrestore(&domain->mapping_lock, flags);
640 /* return address's pte at specific level */
641 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
644 struct dma_pte *parent, *pte = NULL;
645 int total = agaw_to_level(domain->agaw);
648 parent = domain->pgd;
649 while (level <= total) {
650 offset = address_level_offset(addr, total);
651 pte = &parent[offset];
655 if (!dma_pte_present(pte))
657 parent = phys_to_virt(dma_pte_addr(pte));
663 /* clear one page's page table */
664 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
666 struct dma_pte *pte = NULL;
668 /* get last level pte */
669 pte = dma_addr_level_pte(domain, addr, 1);
673 domain_flush_cache(domain, pte, sizeof(*pte));
677 /* clear last level pte, a tlb flush should be followed */
/*
 * Clear all last-level PTEs in [start, end). Callers must follow up with
 * an IOTLB flush; the table pages themselves are freed separately by
 * dma_pte_free_pagetable().
 * NOTE(review): this excerpt is missing lines (end alignment, braces)
 * relative to the original file.
 */
678 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
680 int addr_width = agaw_to_width(domain->agaw);
/* Clamp both bounds to the domain's supported address width. */
682 start &= (((u64)1) << addr_width) - 1;
683 end &= (((u64)1) << addr_width) - 1;
684 /* in case it's partial page */
685 start = PAGE_ALIGN(start);
688 /* we don't need lock here, nobody else touches the iova range */
689 while (start < end) {
690 dma_pte_clear_one(domain, start);
691 start += VTD_PAGE_SIZE;
695 /* free page table pages. last level pte should already be cleared */
696 static void dma_pte_free_pagetable(struct dmar_domain *domain,
699 int addr_width = agaw_to_width(domain->agaw);
701 int total = agaw_to_level(domain->agaw);
705 start &= (((u64)1) << addr_width) - 1;
706 end &= (((u64)1) << addr_width) - 1;
708 /* we don't need lock here, nobody else touches the iova range */
710 while (level <= total) {
711 tmp = align_to_level(start, level);
712 if (tmp >= end || (tmp + level_size(level) > end))
716 pte = dma_addr_level_pte(domain, tmp, level);
719 phys_to_virt(dma_pte_addr(pte)));
721 domain_flush_cache(domain, pte, sizeof(*pte));
723 tmp += level_size(level);
728 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
729 free_pgtable_page(domain->pgd);
735 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
737 struct root_entry *root;
740 root = (struct root_entry *)alloc_pgtable_page();
744 __iommu_flush_cache(iommu, root, ROOT_SIZE);
746 spin_lock_irqsave(&iommu->lock, flags);
747 iommu->root_entry = root;
748 spin_unlock_irqrestore(&iommu->lock, flags);
753 static void iommu_set_root_entry(struct intel_iommu *iommu)
759 addr = iommu->root_entry;
761 spin_lock_irqsave(&iommu->register_lock, flag);
762 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
764 cmd = iommu->gcmd | DMA_GCMD_SRTP;
765 writel(cmd, iommu->reg + DMAR_GCMD_REG);
767 /* Make sure hardware complete it */
768 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
769 readl, (sts & DMA_GSTS_RTPS), sts);
771 spin_unlock_irqrestore(&iommu->register_lock, flag);
774 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
779 if (!cap_rwbf(iommu->cap))
781 val = iommu->gcmd | DMA_GCMD_WBF;
783 spin_lock_irqsave(&iommu->register_lock, flag);
784 writel(val, iommu->reg + DMAR_GCMD_REG);
786 /* Make sure hardware complete it */
787 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
788 readl, (!(val & DMA_GSTS_WBFS)), val);
790 spin_unlock_irqrestore(&iommu->register_lock, flag);
793 /* return value determine if we need a write buffer flush */
/*
 * Issue a context-cache invalidation (global, domain- or device-selective
 * per 'type') through the CCMD register and busy-wait for completion.
 * Return value tells the caller whether a write-buffer flush is still
 * needed (flushing the context cache implicitly flushes it).
 * NOTE(review): this excerpt is missing lines (switch header, default
 * case, ICC bit set, returns, braces) relative to the original file.
 */
794 static int __iommu_flush_context(struct intel_iommu *iommu,
795 u16 did, u16 source_id, u8 function_mask, u64 type,
796 int non_present_entry_flush)
802 * In the non-present entry flush case, if hardware doesn't cache
803 * non-present entry we do nothing and if hardware cache non-present
804 * entry, we flush entries of domain 0 (the domain id is used to cache
805 * any non-present entries)
807 if (non_present_entry_flush) {
808 if (!cap_caching_mode(iommu->cap))
/* Build the CCMD value according to the invalidation granularity. */
815 case DMA_CCMD_GLOBAL_INVL:
816 val = DMA_CCMD_GLOBAL_INVL;
818 case DMA_CCMD_DOMAIN_INVL:
819 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
821 case DMA_CCMD_DEVICE_INVL:
822 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
823 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
830 spin_lock_irqsave(&iommu->register_lock, flag);
831 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
833 /* Make sure hardware complete it */
/* Spin until hardware clears the ICC (invalidate in progress) bit. */
834 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
835 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
837 spin_unlock_irqrestore(&iommu->register_lock, flag);
839 /* flush context entry will implicitly flush write buffer */
843 /* return value determine if we need a write buffer flush */
/*
 * Issue an IOTLB invalidation (global, domain-selective or page-selective
 * per 'type') through the ECAP-relative IOTLB register pair and busy-wait
 * for completion. For PSI, 'addr'/'size_order' go into the IVA register.
 * Afterwards the actual granularity performed (IAIG) is checked against
 * the one requested (IIRG). Return value tells the caller whether a
 * write-buffer flush is still needed (an IOTLB flush implies it).
 * NOTE(review): this excerpt is missing lines (switch header, default
 * case, PSI register write condition, returns, braces) relative to the
 * original file.
 */
844 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
845 u64 addr, unsigned int size_order, u64 type,
846 int non_present_entry_flush)
848 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
849 u64 val = 0, val_iva = 0;
853 * In the non-present entry flush case, if hardware doesn't cache
854 * non-present entry we do nothing and if hardware cache non-present
855 * entry, we flush entries of domain 0 (the domain id is used to cache
856 * any non-present entries)
858 if (non_present_entry_flush) {
859 if (!cap_caching_mode(iommu->cap))
866 case DMA_TLB_GLOBAL_FLUSH:
867 /* global flush doesn't need set IVA_REG */
868 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
870 case DMA_TLB_DSI_FLUSH:
871 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
873 case DMA_TLB_PSI_FLUSH:
874 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
875 /* Note: always flush non-leaf currently */
876 val_iva = size_order | addr;
881 /* Note: set drain read/write */
884 * This is probably to be super secure.. Looks like we can
885 * ignore it without any impact.
887 if (cap_read_drain(iommu->cap))
888 val |= DMA_TLB_READ_DRAIN;
890 if (cap_write_drain(iommu->cap))
891 val |= DMA_TLB_WRITE_DRAIN;
893 spin_lock_irqsave(&iommu->register_lock, flag);
894 /* Note: Only uses first TLB reg currently */
/* IVA register first, then the IOTLB command at tlb_offset + 8. */
896 dmar_writeq(iommu->reg + tlb_offset, val_iva);
897 dmar_writeq(iommu->reg + tlb_offset + 8, val);
899 /* Make sure hardware complete it */
/* Spin until hardware clears the IVT (invalidate in progress) bit. */
900 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
901 dmar_readq, (!(val & DMA_TLB_IVT)), val);
903 spin_unlock_irqrestore(&iommu->register_lock, flag);
905 /* check IOTLB invalidation granularity */
906 if (DMA_TLB_IAIG(val) == 0)
907 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
908 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
909 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
910 (unsigned long long)DMA_TLB_IIRG(type),
911 (unsigned long long)DMA_TLB_IAIG(val));
912 /* flush iotlb entry will implicitly flush write buffer */
/*
 * Page-selective IOTLB flush of 'pages' pages starting at 'addr' for
 * domain 'did'. Falls back to a domain-selective flush when hardware
 * lacks PSI support or the (power-of-two-rounded) range exceeds the
 * maximum address mask the hardware accepts.
 * NOTE(review): this excerpt is missing lines ('mask' declaration,
 * braces, the PSI flush type argument) relative to the original file.
 */
916 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
917 u64 addr, unsigned int pages, int non_present_entry_flush)
/* PSI requires a page-aligned base address. */
921 BUG_ON(addr & (~VTD_PAGE_MASK));
924 /* Fallback to domain selective flush if no PSI support */
925 if (!cap_pgsel_inv(iommu->cap))
926 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
928 non_present_entry_flush);
931 * PSI requires page size to be 2 ^ x, and the base address is naturally
932 * aligned to the size
934 mask = ilog2(__roundup_pow_of_two(pages));
935 /* Fallback to domain selective flush if size is too big */
936 if (mask > cap_max_amask_val(iommu->cap))
937 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
938 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
940 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
942 non_present_entry_flush);
945 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
950 spin_lock_irqsave(&iommu->register_lock, flags);
951 pmen = readl(iommu->reg + DMAR_PMEN_REG);
952 pmen &= ~DMA_PMEN_EPM;
953 writel(pmen, iommu->reg + DMAR_PMEN_REG);
955 /* wait for the protected region status bit to clear */
956 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
957 readl, !(pmen & DMA_PMEN_PRS), pmen);
959 spin_unlock_irqrestore(&iommu->register_lock, flags);
962 static int iommu_enable_translation(struct intel_iommu *iommu)
967 spin_lock_irqsave(&iommu->register_lock, flags);
968 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
970 /* Make sure hardware complete it */
971 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
972 readl, (sts & DMA_GSTS_TES), sts);
974 iommu->gcmd |= DMA_GCMD_TE;
975 spin_unlock_irqrestore(&iommu->register_lock, flags);
979 static int iommu_disable_translation(struct intel_iommu *iommu)
984 spin_lock_irqsave(&iommu->register_lock, flag);
985 iommu->gcmd &= ~DMA_GCMD_TE;
986 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
988 /* Make sure hardware complete it */
989 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
990 readl, (!(sts & DMA_GSTS_TES)), sts);
992 spin_unlock_irqrestore(&iommu->register_lock, flag);
996 /* iommu interrupt handling. Most stuff are MSI-like. */
998 static const char *fault_reason_strings[] =
1001 "Present bit in root entry is clear",
1002 "Present bit in context entry is clear",
1003 "Invalid context entry",
1004 "Access beyond MGAW",
1005 "PTE Write access is not set",
1006 "PTE Read access is not set",
1007 "Next page table ptr is invalid",
1008 "Root table address invalid",
1009 "Context table ptr is invalid",
1010 "non-zero reserved fields in RTP",
1011 "non-zero reserved fields in CTP",
1012 "non-zero reserved fields in PTE",
1014 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
1016 const char *dmar_get_fault_reason(u8 fault_reason)
1018 if (fault_reason > MAX_FAULT_REASON_IDX)
1021 return fault_reason_strings[fault_reason];
1024 void dmar_msi_unmask(unsigned int irq)
1026 struct intel_iommu *iommu = get_irq_data(irq);
1030 spin_lock_irqsave(&iommu->register_lock, flag);
1031 writel(0, iommu->reg + DMAR_FECTL_REG);
1032 /* Read a reg to force flush the post write */
1033 readl(iommu->reg + DMAR_FECTL_REG);
1034 spin_unlock_irqrestore(&iommu->register_lock, flag);
1037 void dmar_msi_mask(unsigned int irq)
1040 struct intel_iommu *iommu = get_irq_data(irq);
1043 spin_lock_irqsave(&iommu->register_lock, flag);
1044 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1045 /* Read a reg to force flush the post write */
1046 readl(iommu->reg + DMAR_FECTL_REG);
1047 spin_unlock_irqrestore(&iommu->register_lock, flag);
1050 void dmar_msi_write(int irq, struct msi_msg *msg)
1052 struct intel_iommu *iommu = get_irq_data(irq);
1055 spin_lock_irqsave(&iommu->register_lock, flag);
1056 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1057 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1058 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1059 spin_unlock_irqrestore(&iommu->register_lock, flag);
1062 void dmar_msi_read(int irq, struct msi_msg *msg)
1064 struct intel_iommu *iommu = get_irq_data(irq);
1067 spin_lock_irqsave(&iommu->register_lock, flag);
1068 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1069 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1070 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1071 spin_unlock_irqrestore(&iommu->register_lock, flag);
1074 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1075 u8 fault_reason, u16 source_id, unsigned long long addr)
1079 reason = dmar_get_fault_reason(fault_reason);
1082 "DMAR:[%s] Request device [%02x:%02x.%d] "
1083 "fault addr %llx \n"
1084 "DMAR:[fault reason %02d] %s\n",
1085 (type ? "DMA Read" : "DMA Write"),
1086 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1087 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1091 #define PRIMARY_FAULT_REG_LEN (16)
/*
 * DMAR fault interrupt handler: walks the primary fault recording
 * registers (16 bytes each, at cap_fault_reg_offset), reports each
 * pending fault via iommu_page_fault_do_one(), clears it by writing the
 * F bit back, and finally clears primary fault overflow (PFO) if set.
 * The register_lock is dropped around the report call since printing can
 * be slow. Advanced fault logging is deliberately not handled (TBD).
 * NOTE(review): this excerpt is missing lines (local declarations, loop
 * construct, wrap-around of fault_index, labels, return) relative to the
 * original file.
 */
1092 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1094 struct intel_iommu *iommu = dev_id;
1095 int reg, fault_index;
1099 spin_lock_irqsave(&iommu->register_lock, flag);
1100 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1102 /* TBD: ignore advanced fault log currently */
/* No primary pending fault: just clear any overflow and return. */
1103 if (!(fault_status & DMA_FSTS_PPF))
1104 goto clear_overflow;
1106 fault_index = dma_fsts_fault_record_index(fault_status);
1107 reg = cap_fault_reg_offset(iommu->cap);
1115 /* highest 32 bits */
/* Dword at offset 12 holds the F (fault) bit, reason and type. */
1116 data = readl(iommu->reg + reg +
1117 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1118 if (!(data & DMA_FRCD_F))
1121 fault_reason = dma_frcd_fault_reason(data);
1122 type = dma_frcd_type(data);
/* Dword at offset 8 holds the requester (source) id. */
1124 data = readl(iommu->reg + reg +
1125 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1126 source_id = dma_frcd_source_id(data);
/* Qword at offset 0 holds the faulting DMA page address. */
1128 guest_addr = dmar_readq(iommu->reg + reg +
1129 fault_index * PRIMARY_FAULT_REG_LEN);
1130 guest_addr = dma_frcd_page_addr(guest_addr);
1131 /* clear the fault */
/* Write-1-to-clear the F bit for this fault record. */
1132 writel(DMA_FRCD_F, iommu->reg + reg +
1133 fault_index * PRIMARY_FAULT_REG_LEN + 12);
/* Drop the lock while reporting; reacquire before the next record. */
1135 spin_unlock_irqrestore(&iommu->register_lock, flag);
1137 iommu_page_fault_do_one(iommu, type, fault_reason,
1138 source_id, guest_addr);
1141 if (fault_index > cap_num_fault_regs(iommu->cap))
1143 spin_lock_irqsave(&iommu->register_lock, flag);
1146 /* clear primary fault overflow */
1147 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1148 if (fault_status & DMA_FSTS_PFO)
1149 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1151 spin_unlock_irqrestore(&iommu->register_lock, flag);
1155 int dmar_set_interrupt(struct intel_iommu *iommu)
1161 printk(KERN_ERR "IOMMU: no free vectors\n");
1165 set_irq_data(irq, iommu);
1168 ret = arch_setup_dmar_msi(irq);
1170 set_irq_data(irq, NULL);
1176 /* Force fault register is cleared */
1177 iommu_page_fault(irq, iommu);
1179 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1181 printk(KERN_ERR "IOMMU: can't request irq\n");
/*
 * Allocate this IOMMU's per-domain bookkeeping: the domain-id bitmap
 * (domain_ids) and the id -> dmar_domain pointer array (domains), sized
 * from the hardware's cap_ndoms(). Domain id 0 is pre-reserved when
 * caching mode is set, because hardware tags non-present/invalid
 * translations with domain id 0.
 * NOTE(review): this excerpt is missing lines (error-path returns, the
 * GFP flag of the second kcalloc, braces) relative to the original file.
 * Also note the "supportd" typo in the pr_debug string below — a string
 * literal, so left untouched here; fix alongside a functional change.
 */
1185 static int iommu_init_domains(struct intel_iommu *iommu)
1187 unsigned long ndomains;
1188 unsigned long nlongs;
1190 ndomains = cap_ndoms(iommu->cap);
1191 pr_debug("Number of Domains supportd <%ld>\n", ndomains);
1192 nlongs = BITS_TO_LONGS(ndomains);
1194 /* TBD: there might be 64K domains,
1195 * consider other allocation for future chip
1197 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1198 if (!iommu->domain_ids) {
1199 printk(KERN_ERR "Allocating domain id array failed\n");
1202 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1204 if (!iommu->domains) {
1205 printk(KERN_ERR "Allocating domain array failed\n");
/* Unwind the first allocation on failure of the second. */
1206 kfree(iommu->domain_ids);
1210 spin_lock_init(&iommu->lock);
1213 * if Caching mode is set, then invalid translations are tagged
1214 * with domainid 0. Hence we need to pre-allocate it.
1216 if (cap_caching_mode(iommu->cap))
1217 set_bit(0, iommu->domain_ids);
1222 static void domain_exit(struct dmar_domain *domain);
1223 static void vm_domain_exit(struct dmar_domain *domain);
1225 void free_dmar_iommu(struct intel_iommu *iommu)
1227 struct dmar_domain *domain;
1229 unsigned long flags;
1231 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1232 for (; i < cap_ndoms(iommu->cap); ) {
1233 domain = iommu->domains[i];
1234 clear_bit(i, iommu->domain_ids);
1236 spin_lock_irqsave(&domain->iommu_lock, flags);
1237 if (--domain->iommu_count == 0) {
1238 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1239 vm_domain_exit(domain);
1241 domain_exit(domain);
1243 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1245 i = find_next_bit(iommu->domain_ids,
1246 cap_ndoms(iommu->cap), i+1);
1249 if (iommu->gcmd & DMA_GCMD_TE)
1250 iommu_disable_translation(iommu);
1253 set_irq_data(iommu->irq, NULL);
1254 /* This will mask the irq */
1255 free_irq(iommu->irq, iommu);
1256 destroy_irq(iommu->irq);
1259 kfree(iommu->domains);
1260 kfree(iommu->domain_ids);
1262 g_iommus[iommu->seq_id] = NULL;
1264 /* if all iommus are freed, free g_iommus */
1265 for (i = 0; i < g_num_of_iommus; i++) {
1270 if (i == g_num_of_iommus)
1273 /* free context mapping */
1274 free_context_table(iommu);
1277 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1280 unsigned long ndomains;
1281 struct dmar_domain *domain;
1282 unsigned long flags;
1284 domain = alloc_domain_mem();
1288 ndomains = cap_ndoms(iommu->cap);
1290 spin_lock_irqsave(&iommu->lock, flags);
1291 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1292 if (num >= ndomains) {
1293 spin_unlock_irqrestore(&iommu->lock, flags);
1294 free_domain_mem(domain);
1295 printk(KERN_ERR "IOMMU: no free domain ids\n");
1299 set_bit(num, iommu->domain_ids);
1301 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1302 set_bit(iommu->seq_id, &domain->iommu_bmp);
1304 iommu->domains[num] = domain;
1305 spin_unlock_irqrestore(&iommu->lock, flags);
1310 static void iommu_free_domain(struct dmar_domain *domain)
1312 unsigned long flags;
1313 struct intel_iommu *iommu;
1315 iommu = domain_get_iommu(domain);
1317 spin_lock_irqsave(&iommu->lock, flags);
1318 clear_bit(domain->id, iommu->domain_ids);
1319 spin_unlock_irqrestore(&iommu->lock, flags);
1322 static struct iova_domain reserved_iova_list;
1323 static struct lock_class_key reserved_alloc_key;
1324 static struct lock_class_key reserved_rbtree_key;
1326 static void dmar_init_reserved_ranges(void)
1328 struct pci_dev *pdev = NULL;
1333 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1335 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1336 &reserved_alloc_key);
1337 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1338 &reserved_rbtree_key);
1340 /* IOAPIC ranges shouldn't be accessed by DMA */
1341 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1342 IOVA_PFN(IOAPIC_RANGE_END));
1344 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1346 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1347 for_each_pci_dev(pdev) {
1350 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1351 r = &pdev->resource[i];
1352 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1356 size = r->end - addr;
1357 size = PAGE_ALIGN(size);
1358 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1359 IOVA_PFN(size + addr) - 1);
1361 printk(KERN_ERR "Reserve iova failed\n");
1367 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1369 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
/*
 * Round a guest address width 'gaw' up to the nearest width the page
 * table can actually express: widths grow from 12 bits in steps of 9
 * (one table level decodes 9 bits), capped at 64.
 *
 * E.g. 48 -> 48 (already on a step), 50 -> 57, anything above 64 -> 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
/*
 * Initialize a freshly-allocated native dmar_domain: set up its iova
 * allocator and locks, reserve the special (IOAPIC / PCI MMIO) ranges,
 * clamp guest_width to the hardware MGAW, pick a supported agaw from the
 * SAGAW capability (bumping to a larger one if the exact fit is not
 * supported), record coherency, and allocate the top-level page table.
 * NOTE(review): this excerpt is missing lines (find_next_bit failure
 * check, 'else' keyword, pgd NULL check, return) relative to the
 * original file.
 */
1386 static int domain_init(struct dmar_domain *domain, int guest_width)
1388 struct intel_iommu *iommu;
1389 int adjust_width, agaw;
1390 unsigned long sagaw;
1392 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1393 spin_lock_init(&domain->mapping_lock);
1394 spin_lock_init(&domain->iommu_lock);
/* Keep IOAPIC and PCI MMIO windows out of this domain's iova space. */
1396 domain_reserve_special_ranges(domain);
1398 /* calculate AGAW */
1399 iommu = domain_get_iommu(domain);
/* The domain cannot be wider than what the hardware can walk (MGAW). */
1400 if (guest_width > cap_mgaw(iommu->cap))
1401 guest_width = cap_mgaw(iommu->cap);
1402 domain->gaw = guest_width;
1403 adjust_width = guestwidth_to_adjustwidth(guest_width);
1404 agaw = width_to_agaw(adjust_width);
1405 sagaw = cap_sagaw(iommu->cap);
1406 if (!test_bit(agaw, &sagaw)) {
1407 /* hardware doesn't support it, choose a bigger one */
1408 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
/* SAGAW is a 5-bit field; search upward for a supported agaw. */
1409 agaw = find_next_bit(&sagaw, 5, agaw);
1413 domain->agaw = agaw;
1414 INIT_LIST_HEAD(&domain->devices);
/* Cache coherency of IOMMU page-walk accesses, from ECAP.C. */
1416 if (ecap_coherent(iommu->ecap))
1417 domain->iommu_coherency = 1;
1419 domain->iommu_coherency = 0;
/* Native domains are tied to exactly one IOMMU. */
1421 domain->iommu_count = 1;
1423 /* always allocate the top pgd */
1424 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1427 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
/*
 * domain_exit - tear down a domain: detach devices, free its iova space,
 * clear and free its page tables, release its id, then free the struct.
 * NOTE(review): extraction gaps — NULL checks/braces missing here.
 */
1431 static void domain_exit(struct dmar_domain *domain)
1435 /* Domain 0 is reserved, so dont process it */
1439 domain_remove_dev_info(domain);
1441 put_iova_domain(&domain->iovad);
1442 end = DOMAIN_MAX_ADDR(domain->gaw);
1443 end = end & (~PAGE_MASK);
/* Clear PTEs over the whole address span, then the tables themselves. */
1446 dma_pte_clear_range(domain, 0, end);
1448 /* free page tables */
1449 dma_pte_free_pagetable(domain, 0, end);
1451 iommu_free_domain(domain);
1452 free_domain_mem(domain);
/*
 * domain_context_mapping_one - program the context entry for (bus, devfn)
 * on that device's IOMMU so its DMA is translated through this domain's
 * page tables. For VM domains, finds or allocates a per-IOMMU domain id.
 * Flushes context/IOTLB afterwards and updates the domain's iommu bitmap,
 * count and coherency. NOTE(review): extraction gaps — several lines
 * (returns, loop breaks, pgd init) are missing from this view.
 */
1455 static int domain_context_mapping_one(struct dmar_domain *domain,
1458 struct context_entry *context;
1459 unsigned long flags;
1460 struct intel_iommu *iommu;
1461 struct dma_pte *pgd;
1463 unsigned long ndomains;
1467 pr_debug("Set context mapping for %02x:%02x.%d\n",
1468 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1469 BUG_ON(!domain->pgd);
1471 iommu = device_to_iommu(bus, devfn);
1475 context = device_to_context_entry(iommu, bus, devfn);
1478 spin_lock_irqsave(&iommu->lock, flags);
/* Already mapped by someone else: nothing to do. */
1479 if (context_present(context)) {
1480 spin_unlock_irqrestore(&iommu->lock, flags);
1487 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1490 /* find an available domain id for this device in iommu */
1491 ndomains = cap_ndoms(iommu->cap);
1492 num = find_first_bit(iommu->domain_ids, ndomains);
1493 for (; num < ndomains; ) {
1494 if (iommu->domains[num] == domain) {
1499 num = find_next_bit(iommu->domain_ids,
1500 cap_ndoms(iommu->cap), num+1);
/* No existing id for this domain on this IOMMU: claim a free one. */
1504 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1505 if (num >= ndomains) {
1506 spin_unlock_irqrestore(&iommu->lock, flags);
1507 printk(KERN_ERR "IOMMU: no free domain ids\n");
1511 set_bit(num, iommu->domain_ids);
1512 iommu->domains[num] = domain;
1516 /* Skip top levels of page tables for
1517 * iommu which has less agaw than default.
1519 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1520 pgd = phys_to_virt(dma_pte_addr(pgd));
1521 if (!dma_pte_present(pgd)) {
1522 spin_unlock_irqrestore(&iommu->lock, flags);
1528 context_set_domain_id(context, id);
1529 context_set_address_width(context, iommu->agaw);
1530 context_set_address_root(context, virt_to_phys(pgd));
1531 context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1532 context_set_fault_enable(context);
1533 context_set_present(context);
1534 domain_flush_cache(domain, context, sizeof(*context));
1536 /* it's a non-present to present mapping */
1537 if (iommu->flush.flush_context(iommu, domain->id,
1538 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1539 DMA_CCMD_DEVICE_INVL, 1))
1540 iommu_flush_write_buffer(iommu);
1542 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1544 spin_unlock_irqrestore(&iommu->lock, flags);
/* First device from this IOMMU in the domain: bump count, re-derive
 * domain-wide coherency. */
1546 spin_lock_irqsave(&domain->iommu_lock, flags);
1547 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1548 domain->iommu_count++;
1549 domain_update_iommu_coherency(domain);
1551 spin_unlock_irqrestore(&domain->iommu_lock, flags);
/*
 * domain_context_mapping - context-map the device itself plus every
 * bridge on the path up to an upstream PCIe-to-PCI bridge, since DMA
 * from behind such a bridge appears with the bridge's (bus, devfn).
 */
1556 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1559 struct pci_dev *tmp, *parent;
1561 ret = domain_context_mapping_one(domain, pdev->bus->number,
1566 /* dependent device mapping */
1567 tmp = pci_find_upstream_pcie_bridge(pdev);
1570 /* Secondary interface's bus number and devfn 0 */
1571 parent = pdev->bus->self;
1572 while (parent != tmp) {
1573 ret = domain_context_mapping_one(domain, parent->bus->number,
1577 parent = parent->bus->self;
1579 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1580 return domain_context_mapping_one(domain,
1581 tmp->subordinate->number, 0);
1582 else /* this is a legacy PCI bridge */
1583 return domain_context_mapping_one(domain,
1584 tmp->bus->number, tmp->devfn);
/*
 * domain_context_mapped - mirror of domain_context_mapping: report
 * whether the device and its whole upstream bridge path already have
 * context entries on its IOMMU.
 */
1587 static int domain_context_mapped(struct pci_dev *pdev)
1590 struct pci_dev *tmp, *parent;
1591 struct intel_iommu *iommu;
1593 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1597 ret = device_context_mapped(iommu,
1598 pdev->bus->number, pdev->devfn);
1601 /* dependent device mapping */
1602 tmp = pci_find_upstream_pcie_bridge(pdev);
1605 /* Secondary interface's bus number and devfn 0 */
1606 parent = pdev->bus->self;
1607 while (parent != tmp) {
1608 ret = device_context_mapped(iommu, parent->bus->number,
1612 parent = parent->bus->self;
1615 return device_context_mapped(iommu,
1616 tmp->subordinate->number, 0);
1618 return device_context_mapped(iommu,
1619 tmp->bus->number, tmp->devfn);
/*
 * domain_page_mapping - install PTEs mapping [iova, iova+size) to the
 * physical range starting at hpa, one VT-d page at a time, with the
 * given read/write protection bits.
 * NOTE(review): extraction gaps — index init/increment lines missing.
 */
1623 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1624 u64 hpa, size_t size, int prot)
1626 u64 start_pfn, end_pfn;
1627 struct dma_pte *pte;
1629 int addr_width = agaw_to_width(domain->agaw);
/* Mask hpa down to the domain's addressable width. */
1631 hpa &= (((u64)1) << addr_width) - 1;
1633 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1636 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1637 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1639 while (start_pfn < end_pfn) {
1640 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1643 /* We don't need lock here, nobody else
1644 * touches the iova range
1646 BUG_ON(dma_pte_addr(pte));
1647 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1648 dma_set_pte_prot(pte, prot);
1649 domain_flush_cache(domain, pte, sizeof(*pte));
/*
 * iommu_detach_dev - clear the device's context entry and do global
 * context-cache and IOTLB invalidations so stale translations are gone.
 */
1656 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1661 clear_context_table(iommu, bus, devfn);
1662 iommu->flush.flush_context(iommu, 0, 0, 0,
1663 DMA_CCMD_GLOBAL_INVL, 0);
1664 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1665 DMA_TLB_GLOBAL_FLUSH, 0);
/*
 * domain_remove_dev_info - detach every device from the domain: unlink
 * its device_domain_info, clear archdata.iommu, clear the IOMMU context
 * entry and free the info. Lock is dropped around the detach because
 * iommu_detach_dev issues (blocking) hardware flushes.
 */
1668 static void domain_remove_dev_info(struct dmar_domain *domain)
1670 struct device_domain_info *info;
1671 unsigned long flags;
1672 struct intel_iommu *iommu;
1674 spin_lock_irqsave(&device_domain_lock, flags);
1675 while (!list_empty(&domain->devices)) {
1676 info = list_entry(domain->devices.next,
1677 struct device_domain_info, link);
1678 list_del(&info->link);
1679 list_del(&info->global);
1681 info->dev->dev.archdata.iommu = NULL;
1682 spin_unlock_irqrestore(&device_domain_lock, flags);
1684 iommu = device_to_iommu(info->bus, info->devfn);
1685 iommu_detach_dev(iommu, info->bus, info->devfn);
1686 free_devinfo_mem(info);
/* Re-take the lock before re-checking the list head. */
1688 spin_lock_irqsave(&device_domain_lock, flags);
1690 spin_unlock_irqrestore(&device_domain_lock, flags);
1695 * Note: we use struct pci_dev->dev.archdata.iommu stores the info
/* find_domain - lockless lookup of the domain cached on the pci_dev;
 * returns the domain via its device_domain_info, or falls through when
 * the device has none (the NULL path is missing from this extraction). */
1697 static struct dmar_domain *
1698 find_domain(struct pci_dev *pdev)
1700 struct device_domain_info *info;
1702 /* No lock here, assumes no domain exit in normal case */
1703 info = pdev->dev.archdata.iommu;
1705 return info->domain;
1709 /* domain is initialized */
/*
 * get_domain_for_dev - find or create the domain for a PCI device.
 * Devices behind a PCIe-to-PCI bridge share one domain keyed on the
 * bridge's (bus, devfn); otherwise a fresh domain is allocated and
 * initialized with the requested guest address width. Handles the race
 * where another CPU installs a domain first by freeing ours and using
 * theirs. NOTE(review): extraction gaps — several return/error lines
 * are missing from this view.
 */
1710 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1712 struct dmar_domain *domain, *found = NULL;
1713 struct intel_iommu *iommu;
1714 struct dmar_drhd_unit *drhd;
1715 struct device_domain_info *info, *tmp;
1716 struct pci_dev *dev_tmp;
1717 unsigned long flags;
1718 int bus = 0, devfn = 0;
/* Fast path: domain already cached on the device. */
1720 domain = find_domain(pdev);
1724 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1726 if (dev_tmp->is_pcie) {
1727 bus = dev_tmp->subordinate->number;
1730 bus = dev_tmp->bus->number;
1731 devfn = dev_tmp->devfn;
1733 spin_lock_irqsave(&device_domain_lock, flags);
1734 list_for_each_entry(info, &device_domain_list, global) {
1735 if (info->bus == bus && info->devfn == devfn) {
1736 found = info->domain;
1740 spin_unlock_irqrestore(&device_domain_lock, flags);
1741 /* pcie-pci bridge already has a domain, uses it */
1748 /* Allocate new domain for the device */
1749 drhd = dmar_find_matched_drhd_unit(pdev);
1751 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1755 iommu = drhd->iommu;
1757 domain = iommu_alloc_domain(iommu);
1761 if (domain_init(domain, gaw)) {
1762 domain_exit(domain);
1766 /* register pcie-to-pci device */
1768 info = alloc_devinfo_mem();
1770 domain_exit(domain);
1774 info->devfn = devfn;
1776 info->domain = domain;
1777 /* This domain is shared by devices under p2p bridge */
1778 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1780 /* pcie-to-pci bridge already has a domain, uses it */
/* Re-check under the lock: someone may have registered the bridge
 * while we were allocating. */
1782 spin_lock_irqsave(&device_domain_lock, flags);
1783 list_for_each_entry(tmp, &device_domain_list, global) {
1784 if (tmp->bus == bus && tmp->devfn == devfn) {
1785 found = tmp->domain;
1790 free_devinfo_mem(info);
1791 domain_exit(domain);
1794 list_add(&info->link, &domain->devices);
1795 list_add(&info->global, &device_domain_list);
1797 spin_unlock_irqrestore(&device_domain_lock, flags);
/* Register the device itself against the (found or new) domain. */
1801 info = alloc_devinfo_mem();
1804 info->bus = pdev->bus->number;
1805 info->devfn = pdev->devfn;
1807 info->domain = domain;
1808 spin_lock_irqsave(&device_domain_lock, flags);
1809 /* somebody is fast */
1810 found = find_domain(pdev);
1811 if (found != NULL) {
1812 spin_unlock_irqrestore(&device_domain_lock, flags);
1813 if (found != domain) {
1814 domain_exit(domain);
1817 free_devinfo_mem(info);
1820 list_add(&info->link, &domain->devices);
1821 list_add(&info->global, &device_domain_list);
1822 pdev->dev.archdata.iommu = info;
1823 spin_unlock_irqrestore(&device_domain_lock, flags);
1826 /* recheck it here, maybe others set it */
1827 return find_domain(pdev);
/*
 * iommu_prepare_identity_map - 1:1 map [start, end) for a device: get
 * its domain, reserve the iova range, clear any overlapping PTEs (RMRRs
 * can overlap RAM), install identity PTEs, then context-map the device.
 * NOTE(review): extraction gaps — size computation / error unwinding
 * lines are missing from this view.
 */
1830 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1831 unsigned long long start,
1832 unsigned long long end)
1834 struct dmar_domain *domain;
1836 unsigned long long base;
1840 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1841 pci_name(pdev), start, end);
1842 /* page table init */
1843 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1847 /* The address might not be aligned */
1848 base = start & PAGE_MASK;
1850 size = PAGE_ALIGN(size);
1851 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1852 IOVA_PFN(base + size) - 1)) {
1853 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1858 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1859 size, base, pci_name(pdev));
1861 * RMRR range might have overlap with physical memory range,
1864 dma_pte_clear_range(domain, base, base + size);
1866 ret = domain_page_mapping(domain, base, base, size,
1867 DMA_PTE_READ|DMA_PTE_WRITE);
1871 /* context entry init */
1872 ret = domain_context_mapping(domain, pdev);
1876 domain_exit(domain);
/*
 * iommu_prepare_rmrr_dev - identity-map a BIOS-reported RMRR range for
 * one device; skipped for devices marked as untranslated (dummy info).
 * end_address is inclusive, hence the +1 to form an exclusive end.
 */
1881 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1882 struct pci_dev *pdev)
1884 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1886 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1887 rmrr->end_address + 1);
1890 #ifdef CONFIG_DMAR_GFX_WA
/* Carries the target device (and, per its use below, a result code —
 * the remaining fields are missing from this extraction) into the
 * work_with_active_regions callback. */
1891 struct iommu_prepare_data {
1892 struct pci_dev *pdev;
/*
 * iommu_prepare_work_fn - per-memory-region callback: identity-map the
 * [start_pfn, end_pfn) physical range for data->pdev, recording the
 * result in data->ret.
 */
1896 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1897 unsigned long end_pfn, void *datax)
1899 struct iommu_prepare_data *data;
1901 data = (struct iommu_prepare_data *)datax;
1903 data->ret = iommu_prepare_identity_map(data->pdev,
1904 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
/*
 * iommu_prepare_with_active_regions - identity-map every active RAM
 * region of every online node for the device (gfx work-around helper).
 */
1909 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1912 struct iommu_prepare_data data;
1917 for_each_online_node(nid) {
1918 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
/*
 * iommu_prepare_gfx_mapping - work-around: give every graphics device a
 * full 1:1 mapping of system RAM, skipping devices already marked as
 * untranslated.
 */
1925 static void __init iommu_prepare_gfx_mapping(void)
1927 struct pci_dev *pdev = NULL;
1930 for_each_pci_dev(pdev) {
1931 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1932 !IS_GFX_DEVICE(pdev))
1934 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1936 ret = iommu_prepare_with_active_regions(pdev);
1938 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1941 #else /* !CONFIG_DMAR_GFX_WA */
/* No-op stub when the graphics work-around is configured out. */
1942 static inline void iommu_prepare_gfx_mapping(void)
1948 #ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * iommu_prepare_isa - floppy work-around: identity-map the low 16MB for
 * the LPC/ISA bridge so legacy (ISA) DMA keeps working once translation
 * is enabled. NOTE(review): extraction gaps — NULL check, return and
 * closing lines are missing from this view.
 */
1949 static inline void iommu_prepare_isa(void)
1951 struct pci_dev *pdev;
1954 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1958 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1959 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
/* Fix: the mapping created above covers 0-16M (16*1024*1024), but the
 * failure message claimed 0-64M; message corrected to match. */
1962 printk("IOMMU: Failed to create 0-16M identity map, "
1963 "floppy might not work\n");
/* No-op stub when the floppy work-around is configured out. */
1967 static inline void iommu_prepare_isa(void)
1971 #endif /* !CONFIG_DMAR_FLPY_WA */
/*
 * init_dmars - one-shot DMA-remapping bring-up: allocate the global
 * iommu array and deferred-flush tables, init per-IOMMU domains and
 * root entries, pick queued vs register-based invalidation, set up RMRR
 * / gfx / ISA identity maps, then program root entries, flush caches
 * and enable translation on every unit.
 * NOTE(review): extraction gaps — error unwinding and several counters
 * are missing from this view.
 */
1973 static int __init init_dmars(void)
1975 struct dmar_drhd_unit *drhd;
1976 struct dmar_rmrr_unit *rmrr;
1977 struct pci_dev *pdev;
1978 struct intel_iommu *iommu;
1979 int i, ret, unit = 0;
1984 * initialize and program root entry to not present
1987 for_each_drhd_unit(drhd) {
1990 * lock not needed as this is only incremented in the single
1991 * threaded kernel __init code path all other access are read
/* Global array indexed by iommu->seq_id; used by deferred flushing. */
1996 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1999 printk(KERN_ERR "Allocating global iommu array failed\n");
2004 deferred_flush = kzalloc(g_num_of_iommus *
2005 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2006 if (!deferred_flush) {
2012 for_each_drhd_unit(drhd) {
2016 iommu = drhd->iommu;
2017 g_iommus[iommu->seq_id] = iommu;
2019 ret = iommu_init_domains(iommu);
2025 * we could share the same root & context tables
2026 * amoung all IOMMU's. Need to Split it later.
2028 ret = iommu_alloc_root_entry(iommu);
2030 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
/* Choose the invalidation interface per unit: queued if available,
 * else the register-based fallback. */
2035 for_each_drhd_unit(drhd) {
2039 iommu = drhd->iommu;
2040 if (dmar_enable_qi(iommu)) {
2042 * Queued Invalidate not enabled, use Register Based
2045 iommu->flush.flush_context = __iommu_flush_context;
2046 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2047 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2049 (unsigned long long)drhd->reg_base_addr);
2051 iommu->flush.flush_context = qi_flush_context;
2052 iommu->flush.flush_iotlb = qi_flush_iotlb;
2053 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2055 (unsigned long long)drhd->reg_base_addr);
2061 * for each dev attached to rmrr
2063 * locate drhd for dev, alloc domain for dev
2064 * allocate free domain
2065 * allocate page table entries for rmrr
2066 * if context not allocated for bus
2067 * allocate and init context
2068 * set present in root table for this bus
2069 * init context with domain, translation etc
2073 for_each_rmrr_units(rmrr) {
2074 for (i = 0; i < rmrr->devices_cnt; i++) {
2075 pdev = rmrr->devices[i];
2076 /* some BIOS lists non-exist devices in DMAR table */
2079 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2082 "IOMMU: mapping reserved region failed\n");
2086 iommu_prepare_gfx_mapping();
2088 iommu_prepare_isa();
2093 * global invalidate context cache
2094 * global invalidate iotlb
2095 * enable translation
2097 for_each_drhd_unit(drhd) {
2100 iommu = drhd->iommu;
2101 sprintf (iommu->name, "dmar%d", unit++);
2103 iommu_flush_write_buffer(iommu);
2105 ret = dmar_set_interrupt(iommu);
2109 iommu_set_root_entry(iommu);
2111 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2113 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2115 iommu_disable_protect_mem_regions(iommu);
2117 ret = iommu_enable_translation(iommu);
/* Error path (label missing from extraction): unwind per-unit state. */
2124 for_each_drhd_unit(drhd) {
2127 iommu = drhd->iommu;
/* aligned_size - bytes needed to cover [host_addr, host_addr+size) with
 * whole pages: add the sub-page offset, then page-align up. */
2134 static inline u64 aligned_size(u64 host_addr, size_t size)
2137 addr = (host_addr & (~PAGE_MASK)) + size;
2138 return PAGE_ALIGN(addr);
/*
 * iommu_alloc_iova - allocate `size` bytes of IOVA space below `end`,
 * clamped to the domain's maximum address; returns NULL-ish on a range
 * that can't fit (return lines missing from this extraction).
 */
2142 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2146 /* Make sure it's in range */
2147 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2148 if (!size || (IOVA_START_ADDR + size > end))
/* Final argument 1: size-aligned allocation from the top down. */
2151 piova = alloc_iova(&domain->iovad,
2152 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2156 static struct iova *
/*
 * __intel_alloc_iova - IOVA allocation honoring the device's DMA mask:
 * 32-bit masks (or forcedac) allocate directly; otherwise try below 4GB
 * first and fall back to the full mask. Logs on failure.
 */
2157 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2158 size_t size, u64 dma_mask)
2160 struct pci_dev *pdev = to_pci_dev(dev);
2161 struct iova *iova = NULL;
2163 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2164 iova = iommu_alloc_iova(domain, size, dma_mask);
2167 * First try to allocate an io virtual address in
2168 * DMA_32BIT_MASK and if that fails then try allocating
2171 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2173 iova = iommu_alloc_iova(domain, size, dma_mask);
2177 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
2184 static struct dmar_domain *
/*
 * get_valid_domain_for_dev - get (or create) the device's domain and
 * ensure its context entry is programmed; used on every map operation.
 */
2185 get_valid_domain_for_dev(struct pci_dev *pdev)
2187 struct dmar_domain *domain;
2190 domain = get_domain_for_dev(pdev,
2191 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2194 "Allocating domain for %s failed", pci_name(pdev));
2198 /* make sure context mapping is ok */
2199 if (unlikely(!domain_context_mapped(pdev))) {
2200 ret = domain_context_mapping(domain, pdev);
2203 "Domain context map for %s failed",
/*
 * __intel_map_single - core single-buffer map: get a valid domain,
 * allocate an IOVA under dma_mask, install PTEs for the page-aligned
 * span of paddr, flush the IOTLB (or write buffer), and return the bus
 * address with the original sub-page offset restored. Returns 0-ish on
 * failure (the failure return lines are missing from this extraction).
 */
2212 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2213 size_t size, int dir, u64 dma_mask)
2215 struct pci_dev *pdev = to_pci_dev(hwdev);
2216 struct dmar_domain *domain;
2217 phys_addr_t start_paddr;
2221 struct intel_iommu *iommu;
2223 BUG_ON(dir == DMA_NONE);
/* Devices flagged untranslated pass physical addresses straight through. */
2224 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2227 domain = get_valid_domain_for_dev(pdev);
2231 iommu = domain_get_iommu(domain);
2232 size = aligned_size((u64)paddr, size);
2234 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2238 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2241 * Check if DMAR supports zero-length reads on write only
2244 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2245 !cap_zlr(iommu->cap))
2246 prot |= DMA_PTE_READ;
2247 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2248 prot |= DMA_PTE_WRITE;
2250 * paddr - (paddr + size) might be partial page, we should map the whole
2251 * page. Note: if two part of one page are separately mapped, we
2252 * might have two guest_addr mapping to the same host paddr, but this
2253 * is not a big problem
2255 ret = domain_page_mapping(domain, start_paddr,
2256 ((u64)paddr) & PAGE_MASK, size, prot);
2260 /* it's a non-present to present mapping */
2261 ret = iommu_flush_iotlb_psi(iommu, domain->id,
2262 start_paddr, size >> VTD_PAGE_SHIFT, 1);
2264 iommu_flush_write_buffer(iommu);
2266 return start_paddr + ((u64)paddr & (~PAGE_MASK));
/* Error path: release the IOVA we reserved before failing. */
2270 __free_iova(&domain->iovad, iova);
2271 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
2272 pci_name(pdev), size, (unsigned long long)paddr, dir);
/* dma_map_ops .map_page hook: map one page-based buffer via the single-
 * buffer core, using the device's streaming DMA mask. */
2276 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2277 unsigned long offset, size_t size,
2278 enum dma_data_direction dir,
2279 struct dma_attrs *attrs)
2281 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2282 dir, to_pci_dev(dev)->dma_mask);
/* Legacy single-buffer map entry point; same core as intel_map_page. */
2285 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2286 size_t size, int dir)
2288 return __intel_map_single(hwdev, paddr, size, dir,
2289 to_pci_dev(hwdev)->dma_mask);
/*
 * flush_unmaps - drain the deferred-unmap tables: one global IOTLB
 * flush per IOMMU that has pending entries, then free all the IOVAs
 * that were queued by add_unmap(). Caller holds async_umap_flush_lock
 * (locking lines not visible in this extraction — confirm at callers).
 */
2292 static void flush_unmaps(void)
2298 /* just flush them all */
2299 for (i = 0; i < g_num_of_iommus; i++) {
2300 struct intel_iommu *iommu = g_iommus[i];
2304 if (deferred_flush[i].next) {
2305 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2306 DMA_TLB_GLOBAL_FLUSH, 0);
2307 for (j = 0; j < deferred_flush[i].next; j++) {
2308 __free_iova(&deferred_flush[i].domain[j]->iovad,
2309 deferred_flush[i].iova[j]);
2311 deferred_flush[i].next = 0;
/* Timer callback: drain the deferred-unmap queue under the async lock. */
2318 static void flush_unmaps_timeout(unsigned long data)
2320 unsigned long flags;
2322 spin_lock_irqsave(&async_umap_flush_lock, flags);
2324 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * add_unmap - queue an IOVA for deferred freeing on its domain's IOMMU,
 * batching IOTLB flushes; drains immediately at HIGH_WATER_MARK and
 * otherwise arms a 10ms timer to flush the batch.
 */
2327 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2329 unsigned long flags;
2331 struct intel_iommu *iommu;
2333 spin_lock_irqsave(&async_umap_flush_lock, flags);
/* Queue full: flush synchronously now (flush call not visible here). */
2334 if (list_size == HIGH_WATER_MARK)
2337 iommu = domain_get_iommu(dom);
2338 iommu_id = iommu->seq_id;
2340 next = deferred_flush[iommu_id].next;
2341 deferred_flush[iommu_id].domain[next] = dom;
2342 deferred_flush[iommu_id].iova[next] = iova;
2343 deferred_flush[iommu_id].next++;
2346 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2350 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * intel_unmap_page - dma_map_ops .unmap_page hook: look up the IOVA for
 * dev_addr, clear its PTEs and page tables, then either flush the IOTLB
 * and free the IOVA immediately (strict mode) or queue it for batched
 * deferred flushing.
 */
2353 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2354 size_t size, enum dma_data_direction dir,
2355 struct dma_attrs *attrs)
2357 struct pci_dev *pdev = to_pci_dev(dev);
2358 struct dmar_domain *domain;
2359 unsigned long start_addr;
2361 struct intel_iommu *iommu;
/* Untranslated devices were never mapped: nothing to undo. */
2363 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2365 domain = find_domain(pdev);
2368 iommu = domain_get_iommu(domain);
2370 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2374 start_addr = iova->pfn_lo << PAGE_SHIFT;
2375 size = aligned_size((u64)dev_addr, size);
2377 pr_debug("Device %s unmapping: %lx@%llx\n",
2378 pci_name(pdev), size, (unsigned long long)start_addr);
2380 /* clear the whole page */
2381 dma_pte_clear_range(domain, start_addr, start_addr + size);
2382 /* free page tables */
2383 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2384 if (intel_iommu_strict) {
2385 if (iommu_flush_iotlb_psi(iommu,
2386 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2387 iommu_flush_write_buffer(iommu);
2389 __free_iova(&domain->iovad, iova);
2391 add_unmap(domain, iova);
2393 * queue up the release of the unmap to save the 1/6th of the
2394 * cpu used up by the iotlb flush operation...
/* Legacy single-buffer unmap entry point; delegates to intel_unmap_page. */
2399 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2402 intel_unmap_page(dev, dev_addr, size, dir, NULL);
/*
 * intel_alloc_coherent - allocate zeroed pages and map them through the
 * IOMMU under the device's coherent DMA mask; GFP_DMA/GFP_DMA32 are
 * stripped because the IOMMU makes zone restriction unnecessary. Frees
 * the pages again if the mapping fails.
 */
2405 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2406 dma_addr_t *dma_handle, gfp_t flags)
2411 size = PAGE_ALIGN(size);
2412 order = get_order(size);
2413 flags &= ~(GFP_DMA | GFP_DMA32);
2415 vaddr = (void *)__get_free_pages(flags, order);
2418 memset(vaddr, 0, size);
2420 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2422 hwdev->coherent_dma_mask);
2425 free_pages((unsigned long)vaddr, order);
/* Undo intel_alloc_coherent: unmap the range and free the pages. */
2429 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2430 dma_addr_t dma_handle)
2434 size = PAGE_ALIGN(size);
2435 order = get_order(size);
2437 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2438 free_pages((unsigned long)vaddr, order);
/* Kernel virtual address of a scatterlist entry's buffer. */
2441 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
/*
 * intel_unmap_sg - unmap a scatter-gather list: the SG was mapped as one
 * contiguous IOVA run, so recompute its total aligned size from the
 * entries, clear PTEs and page tables over the run, flush, and free the
 * single IOVA.
 */
2443 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2444 int nelems, enum dma_data_direction dir,
2445 struct dma_attrs *attrs)
2448 struct pci_dev *pdev = to_pci_dev(hwdev);
2449 struct dmar_domain *domain;
2450 unsigned long start_addr;
2454 struct scatterlist *sg;
2455 struct intel_iommu *iommu;
2457 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2460 domain = find_domain(pdev);
2463 iommu = domain_get_iommu(domain);
/* The first entry's dma_address locates the shared IOVA allocation. */
2465 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2468 for_each_sg(sglist, sg, nelems, i) {
2469 addr = SG_ENT_VIRT_ADDRESS(sg);
2470 size += aligned_size((u64)addr, sg->length);
2473 start_addr = iova->pfn_lo << PAGE_SHIFT;
2475 /* clear the whole page */
2476 dma_pte_clear_range(domain, start_addr, start_addr + size);
2477 /* free page tables */
2478 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2480 if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2481 size >> VTD_PAGE_SHIFT, 0))
2482 iommu_flush_write_buffer(iommu);
2485 __free_iova(&domain->iovad, iova);
/*
 * intel_nontranslate_map_sg - SG "mapping" for untranslated devices:
 * each entry's dma_address is simply its bus/physical address.
 */
2488 static int intel_nontranslate_map_sg(struct device *hddev,
2489 struct scatterlist *sglist, int nelems, int dir)
2492 struct scatterlist *sg;
2494 for_each_sg(sglist, sg, nelems, i) {
2495 BUG_ON(!sg_page(sg));
2496 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2497 sg->dma_length = sg->length;
/*
 * intel_map_sg - map a scatter-gather list into one contiguous IOVA
 * run: sum the aligned sizes, allocate a single IOVA, install PTEs
 * entry by entry at increasing offsets, and record each entry's bus
 * address. Unwinds (clear PTEs, free tables and IOVA) if any entry
 * fails. NOTE(review): extraction gaps — offset bookkeeping and return
 * lines are missing from this view.
 */
2502 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2503 enum dma_data_direction dir, struct dma_attrs *attrs)
2507 struct pci_dev *pdev = to_pci_dev(hwdev);
2508 struct dmar_domain *domain;
2512 struct iova *iova = NULL;
2514 struct scatterlist *sg;
2515 unsigned long start_addr;
2516 struct intel_iommu *iommu;
2518 BUG_ON(dir == DMA_NONE);
2519 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2520 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2522 domain = get_valid_domain_for_dev(pdev);
2526 iommu = domain_get_iommu(domain);
/* First pass: total aligned size of all entries. */
2528 for_each_sg(sglist, sg, nelems, i) {
2529 addr = SG_ENT_VIRT_ADDRESS(sg);
2530 addr = (void *)virt_to_phys(addr);
2531 size += aligned_size((u64)addr, sg->length);
2534 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2536 sglist->dma_length = 0;
2541 * Check if DMAR supports zero-length reads on write only
2544 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2545 !cap_zlr(iommu->cap))
2546 prot |= DMA_PTE_READ;
2547 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2548 prot |= DMA_PTE_WRITE;
2550 start_addr = iova->pfn_lo << PAGE_SHIFT;
/* Second pass: map each entry at its offset within the run. */
2552 for_each_sg(sglist, sg, nelems, i) {
2553 addr = SG_ENT_VIRT_ADDRESS(sg);
2554 addr = (void *)virt_to_phys(addr);
2555 size = aligned_size((u64)addr, sg->length);
2556 ret = domain_page_mapping(domain, start_addr + offset,
2557 ((u64)addr) & PAGE_MASK,
2560 /* clear the page */
2561 dma_pte_clear_range(domain, start_addr,
2562 start_addr + offset);
2563 /* free page tables */
2564 dma_pte_free_pagetable(domain, start_addr,
2565 start_addr + offset);
2567 __free_iova(&domain->iovad, iova);
2570 sg->dma_address = start_addr + offset +
2571 ((u64)addr & (~PAGE_MASK));
2572 sg->dma_length = sg->length;
2576 /* it's a non-present to present mapping */
2577 if (iommu_flush_iotlb_psi(iommu, domain->id,
2578 start_addr, offset >> VTD_PAGE_SHIFT, 1))
2579 iommu_flush_write_buffer(iommu);
/* DMA-API operations table installed by intel_iommu_init(). */
2583 struct dma_map_ops intel_dma_ops = {
2584 .alloc_coherent = intel_alloc_coherent,
2585 .free_coherent = intel_free_coherent,
2586 .map_sg = intel_map_sg,
2587 .unmap_sg = intel_unmap_sg,
2588 .map_page = intel_map_page,
2589 .unmap_page = intel_unmap_page,
/* Create the slab cache for struct dmar_domain allocations. */
2592 static inline int iommu_domain_cache_init(void)
2596 iommu_domain_cache = kmem_cache_create("iommu_domain",
2597 sizeof(struct dmar_domain),
2602 if (!iommu_domain_cache) {
2603 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
/* Create the slab cache for struct device_domain_info allocations. */
2610 static inline int iommu_devinfo_cache_init(void)
2614 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2615 sizeof(struct device_domain_info),
2619 if (!iommu_devinfo_cache) {
2620 printk(KERN_ERR "Couldn't create devinfo cache\n");
/* Create the slab cache for struct iova allocations. */
2627 static inline int iommu_iova_cache_init(void)
2631 iommu_iova_cache = kmem_cache_create("iommu_iova",
2632 sizeof(struct iova),
2636 if (!iommu_iova_cache) {
2637 printk(KERN_ERR "Couldn't create iova cache\n");
/*
 * iommu_init_mempool - create the three slab caches (iova, domain,
 * devinfo), unwinding earlier ones on failure (goto-style cleanup;
 * labels not visible in this extraction).
 */
2644 static int __init iommu_init_mempool(void)
2647 ret = iommu_iova_cache_init();
2651 ret = iommu_domain_cache_init();
2655 ret = iommu_devinfo_cache_init();
2659 kmem_cache_destroy(iommu_domain_cache);
2661 kmem_cache_destroy(iommu_iova_cache);
/* Destroy all three slab caches created by iommu_init_mempool(). */
2666 static void __init iommu_exit_mempool(void)
2668 kmem_cache_destroy(iommu_devinfo_cache);
2669 kmem_cache_destroy(iommu_domain_cache);
2670 kmem_cache_destroy(iommu_iova_cache);
/*
 * init_no_remapping_devices - mark DMAR units ignorable: units whose
 * device list is empty, and units covering only graphics devices (their
 * devices get DUMMY_DEVICE_DOMAIN_INFO so mapping paths bypass them).
 */
2674 static void __init init_no_remapping_devices(void)
2676 struct dmar_drhd_unit *drhd;
2678 for_each_drhd_unit(drhd) {
2679 if (!drhd->include_all) {
2681 for (i = 0; i < drhd->devices_cnt; i++)
2682 if (drhd->devices[i] != NULL)
2684 /* ignore DMAR unit if no pci devices exist */
2685 if (i == drhd->devices_cnt)
/* Second pass: detect graphics-only units. */
2693 for_each_drhd_unit(drhd) {
2695 if (drhd->ignored || drhd->include_all)
2698 for (i = 0; i < drhd->devices_cnt; i++)
2699 if (drhd->devices[i] &&
2700 !IS_GFX_DEVICE(drhd->devices[i]))
2703 if (i < drhd->devices_cnt)
2706 /* bypass IOMMU if it is just for gfx devices */
2708 for (i = 0; i < drhd->devices_cnt; i++) {
2709 if (!drhd->devices[i])
2711 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
/*
 * intel_iommu_init - driver entry point: parse the DMAR table, bail if
 * remapping is unwanted (no_iommu/swiotlb/dmar_disabled), set up
 * mempools and reserved ranges, run init_dmars(), then install
 * intel_dma_ops as the system DMA ops and register the IOMMU API ops.
 */
2716 int __init intel_iommu_init(void)
2720 if (dmar_table_init())
2723 if (dmar_dev_scope_init())
2727 * Check the need for DMA-remapping initialization now.
2728 * Above initialization will also be used by Interrupt-remapping.
2730 if (no_iommu || swiotlb || dmar_disabled)
2733 iommu_init_mempool();
2734 dmar_init_reserved_ranges();
2736 init_no_remapping_devices();
/* init_dmars() failure: undo reserved ranges and mempools. */
2740 printk(KERN_ERR "IOMMU: dmar init failed\n");
2741 put_iova_domain(&reserved_iova_list);
2742 iommu_exit_mempool();
2746 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2748 init_timer(&unmap_timer);
2750 dma_ops = &intel_dma_ops;
2752 register_iommu(&intel_iommu_ops);
/*
 * vm_domain_add_dev_info - attach a device to a VM domain: allocate its
 * device_domain_info, link it into the domain's and the global device
 * lists, and cache it on the pci_dev.
 */
2757 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2758 struct pci_dev *pdev)
2760 struct device_domain_info *info;
2761 unsigned long flags;
2763 info = alloc_devinfo_mem();
2767 info->bus = pdev->bus->number;
2768 info->devfn = pdev->devfn;
2770 info->domain = domain;
2772 spin_lock_irqsave(&device_domain_lock, flags);
2773 list_add(&info->link, &domain->devices);
2774 list_add(&info->global, &device_domain_list);
2775 pdev->dev.archdata.iommu = info;
2776 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * vm_domain_remove_one_dev_info - detach one device from a VM domain:
 * unlink and free its info, clear its context entry, and — if no other
 * device on the same IOMMU remains in the domain — clear that IOMMU
 * from the domain's bitmap, drop iommu_count and re-derive coherency.
 * NOTE(review): extraction gaps — the found-flag and loop breaks are
 * missing from this view.
 */
2781 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2782 struct pci_dev *pdev)
2784 struct device_domain_info *info;
2785 struct intel_iommu *iommu;
2786 unsigned long flags;
2788 struct list_head *entry, *tmp;
2790 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2794 spin_lock_irqsave(&device_domain_lock, flags);
2795 list_for_each_safe(entry, tmp, &domain->devices) {
2796 info = list_entry(entry, struct device_domain_info, link);
2797 if (info->bus == pdev->bus->number &&
2798 info->devfn == pdev->devfn) {
2799 list_del(&info->link);
2800 list_del(&info->global);
2802 info->dev->dev.archdata.iommu = NULL;
/* Drop the lock around the hardware detach (it flushes). */
2803 spin_unlock_irqrestore(&device_domain_lock, flags);
2805 iommu_detach_dev(iommu, info->bus, info->devfn);
2806 free_devinfo_mem(info);
2808 spin_lock_irqsave(&device_domain_lock, flags);
2816 /* if there is no other devices under the same iommu
2817 * owned by this domain, clear this iommu in iommu_bmp
2818 * update iommu count and coherency
2820 if (device_to_iommu(info->bus, info->devfn) == iommu)
2825 unsigned long tmp_flags;
2826 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2827 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2828 domain->iommu_count--;
2829 domain_update_iommu_coherency(domain);
2830 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2833 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * vm_domain_remove_all_dev_info - detach every device from a VM domain,
 * clearing each device's context entry and updating the domain's iommu
 * bitmap / count / coherency as each IOMMU loses its last device.
 */
2836 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2838 struct device_domain_info *info;
2839 struct intel_iommu *iommu;
2840 unsigned long flags1, flags2;
2842 spin_lock_irqsave(&device_domain_lock, flags1);
2843 while (!list_empty(&domain->devices)) {
2844 info = list_entry(domain->devices.next,
2845 struct device_domain_info, link);
2846 list_del(&info->link);
2847 list_del(&info->global);
2849 info->dev->dev.archdata.iommu = NULL;
/* Drop the list lock around hardware flushes. */
2851 spin_unlock_irqrestore(&device_domain_lock, flags1);
2853 iommu = device_to_iommu(info->bus, info->devfn);
2854 iommu_detach_dev(iommu, info->bus, info->devfn);
2856 /* clear this iommu in iommu_bmp, update iommu count
2859 spin_lock_irqsave(&domain->iommu_lock, flags2);
2860 if (test_and_clear_bit(iommu->seq_id,
2861 &domain->iommu_bmp)) {
2862 domain->iommu_count--;
2863 domain_update_iommu_coherency(domain);
2865 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2867 free_devinfo_mem(info);
2868 spin_lock_irqsave(&device_domain_lock, flags1);
2870 spin_unlock_irqrestore(&device_domain_lock, flags1);
2873 /* domain id for virtual machine, it won't be set in context */
/* Monotonic allocator for VM-domain ids; per-IOMMU hardware ids are
 * assigned separately in domain_context_mapping_one(). */
2874 static unsigned long vm_domid;
2876 static int vm_domain_min_agaw(struct dmar_domain *domain)
2879 int min_agaw = domain->agaw;
2881 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2882 for (; i < g_num_of_iommus; ) {
2883 if (min_agaw > g_iommus[i]->agaw)
2884 min_agaw = g_iommus[i]->agaw;
2886 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2892 static struct dmar_domain *iommu_alloc_vm_domain(void)
2894 struct dmar_domain *domain;
2896 domain = alloc_domain_mem();
2900 domain->id = vm_domid++;
2901 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2902 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2907 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2911 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2912 spin_lock_init(&domain->mapping_lock);
2913 spin_lock_init(&domain->iommu_lock);
2915 domain_reserve_special_ranges(domain);
2917 /* calculate AGAW */
2918 domain->gaw = guest_width;
2919 adjust_width = guestwidth_to_adjustwidth(guest_width);
2920 domain->agaw = width_to_agaw(adjust_width);
2922 INIT_LIST_HEAD(&domain->devices);
2924 domain->iommu_count = 0;
2925 domain->iommu_coherency = 0;
2926 domain->max_addr = 0;
2928 /* always allocate the top pgd */
2929 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2932 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2936 static void iommu_free_vm_domain(struct dmar_domain *domain)
2938 unsigned long flags;
2939 struct dmar_drhd_unit *drhd;
2940 struct intel_iommu *iommu;
2942 unsigned long ndomains;
2944 for_each_drhd_unit(drhd) {
2947 iommu = drhd->iommu;
2949 ndomains = cap_ndoms(iommu->cap);
2950 i = find_first_bit(iommu->domain_ids, ndomains);
2951 for (; i < ndomains; ) {
2952 if (iommu->domains[i] == domain) {
2953 spin_lock_irqsave(&iommu->lock, flags);
2954 clear_bit(i, iommu->domain_ids);
2955 iommu->domains[i] = NULL;
2956 spin_unlock_irqrestore(&iommu->lock, flags);
2959 i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2964 static void vm_domain_exit(struct dmar_domain *domain)
2968 /* Domain 0 is reserved, so dont process it */
2972 vm_domain_remove_all_dev_info(domain);
2974 put_iova_domain(&domain->iovad);
2975 end = DOMAIN_MAX_ADDR(domain->gaw);
2976 end = end & (~VTD_PAGE_MASK);
2979 dma_pte_clear_range(domain, 0, end);
2981 /* free page tables */
2982 dma_pte_free_pagetable(domain, 0, end);
2984 iommu_free_vm_domain(domain);
2985 free_domain_mem(domain);
2988 static int intel_iommu_domain_init(struct iommu_domain *domain)
2990 struct dmar_domain *dmar_domain;
2992 dmar_domain = iommu_alloc_vm_domain();
2995 "intel_iommu_domain_init: dmar_domain == NULL\n");
2998 if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3000 "intel_iommu_domain_init() failed\n");
3001 vm_domain_exit(dmar_domain);
3004 domain->priv = dmar_domain;
3009 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3011 struct dmar_domain *dmar_domain = domain->priv;
3013 domain->priv = NULL;
3014 vm_domain_exit(dmar_domain);
3017 static int intel_iommu_attach_device(struct iommu_domain *domain,
3020 struct dmar_domain *dmar_domain = domain->priv;
3021 struct pci_dev *pdev = to_pci_dev(dev);
3022 struct intel_iommu *iommu;
3027 /* normally pdev is not mapped */
3028 if (unlikely(domain_context_mapped(pdev))) {
3029 struct dmar_domain *old_domain;
3031 old_domain = find_domain(pdev);
3033 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3034 vm_domain_remove_one_dev_info(old_domain, pdev);
3036 domain_remove_dev_info(old_domain);
3040 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3044 /* check if this iommu agaw is sufficient for max mapped address */
3045 addr_width = agaw_to_width(iommu->agaw);
3046 end = DOMAIN_MAX_ADDR(addr_width);
3047 end = end & VTD_PAGE_MASK;
3048 if (end < dmar_domain->max_addr) {
3049 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3050 "sufficient for the mapped address (%llx)\n",
3051 __func__, iommu->agaw, dmar_domain->max_addr);
3055 ret = domain_context_mapping(dmar_domain, pdev);
3059 ret = vm_domain_add_dev_info(dmar_domain, pdev);
3063 static void intel_iommu_detach_device(struct iommu_domain *domain,
3066 struct dmar_domain *dmar_domain = domain->priv;
3067 struct pci_dev *pdev = to_pci_dev(dev);
3069 vm_domain_remove_one_dev_info(dmar_domain, pdev);
3072 static int intel_iommu_map_range(struct iommu_domain *domain,
3073 unsigned long iova, phys_addr_t hpa,
3074 size_t size, int iommu_prot)
3076 struct dmar_domain *dmar_domain = domain->priv;
3082 if (iommu_prot & IOMMU_READ)
3083 prot |= DMA_PTE_READ;
3084 if (iommu_prot & IOMMU_WRITE)
3085 prot |= DMA_PTE_WRITE;
3087 max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3088 if (dmar_domain->max_addr < max_addr) {
3092 /* check if minimum agaw is sufficient for mapped address */
3093 min_agaw = vm_domain_min_agaw(dmar_domain);
3094 addr_width = agaw_to_width(min_agaw);
3095 end = DOMAIN_MAX_ADDR(addr_width);
3096 end = end & VTD_PAGE_MASK;
3097 if (end < max_addr) {
3098 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3099 "sufficient for the mapped address (%llx)\n",
3100 __func__, min_agaw, max_addr);
3103 dmar_domain->max_addr = max_addr;
3106 ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3110 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3111 unsigned long iova, size_t size)
3113 struct dmar_domain *dmar_domain = domain->priv;
3116 /* The address might not be aligned */
3117 base = iova & VTD_PAGE_MASK;
3118 size = VTD_PAGE_ALIGN(size);
3119 dma_pte_clear_range(dmar_domain, base, base + size);
3121 if (dmar_domain->max_addr == base + size)
3122 dmar_domain->max_addr = base;
3125 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3128 struct dmar_domain *dmar_domain = domain->priv;
3129 struct dma_pte *pte;
3132 pte = addr_to_dma_pte(dmar_domain, iova);
3134 phys = dma_pte_addr(pte);
3139 static struct iommu_ops intel_iommu_ops = {
3140 .domain_init = intel_iommu_domain_init,
3141 .domain_destroy = intel_iommu_domain_destroy,
3142 .attach_dev = intel_iommu_attach_device,
3143 .detach_dev = intel_iommu_detach_device,
3144 .map = intel_iommu_map_range,
3145 .unmap = intel_iommu_unmap_range,
3146 .iova_to_phys = intel_iommu_iova_to_phys,