X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=drivers%2Fpci%2Fintel-iommu.c;h=2314ad7ee5fef4544ab2159cd7e253ed6e1d3889;hb=f80330357284c908e1f67cc8b4d8e56a3e2f6fc6;hp=001b328adf804e1e75d9c561df9e704b424b5468;hpb=7b11428d37fe65643172feff66cd0a4d72d1932a;p=safe%2Fjmp%2Flinux-2.6 diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 001b328..2314ad7 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -53,12 +53,36 @@ #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 +#define MAX_AGAW_WIDTH 64 + #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1) +#define DOMAIN_MAX_PFN(gaw) ((((u64)1) << (gaw-VTD_PAGE_SHIFT)) - 1) #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32)) #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64)) + +/* VT-d pages must always be _smaller_ than MM pages. Otherwise things + are never going to work. */ +static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn) +{ + return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT); +} + +static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn) +{ + return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); +} +static inline unsigned long page_to_dma_pfn(struct page *pg) +{ + return mm_to_dma_pfn(page_to_pfn(pg)); +} +static inline unsigned long virt_to_dma_pfn(void *p) +{ + return page_to_dma_pfn(virt_to_page(p)); +} + /* global iommu list, set NULL for ignored DMAR units */ static struct intel_iommu **g_iommus; @@ -127,8 +151,6 @@ static inline void context_set_fault_enable(struct context_entry *context) context->lo &= (((u64)-1) << 2) | 1; } -#define CONTEXT_TT_MULTI_LEVEL 0 - static inline void context_set_translation_type(struct context_entry *context, unsigned long value) { @@ -200,12 +222,17 @@ static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot) static inline u64 dma_pte_addr(struct dma_pte *pte) { - return (pte->val & VTD_PAGE_MASK); +#ifdef CONFIG_64BIT + return pte->val & VTD_PAGE_MASK; +#else + /* Must have a full atomic 64-bit read */ + return __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK; +#endif } -static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr) +static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn) { - pte->val |= (addr & VTD_PAGE_MASK); + pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT; } static inline bool dma_pte_present(struct dma_pte *pte) @@ -213,6 +240,19 @@ static inline bool dma_pte_present(struct dma_pte *pte) return (pte->val & 3) != 0; } +static inline int first_pte_in_page(struct dma_pte *pte) +{ + return !((unsigned long)pte & ~VTD_PAGE_MASK); +} + +/* + * This domain is a statically identity mapping domain. + * 1. This domain creats a static 1:1 mapping to all usable memory. + * 2. It maps to each iommu if successful. + * 3. Each iommu mapps to this domain if successful. + */ +struct dmar_domain *si_domain; + /* devices under the same p2p bridge are owned in one domain */ #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0) @@ -221,6 +261,9 @@ static inline bool dma_pte_present(struct dma_pte *pte) */ #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1) +/* si_domain contains mulitple devices */ +#define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2) + struct dmar_domain { int id; /* domain id */ unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/ @@ -229,7 +272,6 @@ struct dmar_domain { struct iova_domain iovad; /* iova's that belong to this domain */ struct dma_pte *pgd; /* virtual address */ - spinlock_t mapping_lock; /* page table lock */ int gaw; /* max guest address width */ /* adjusted guest address width, 0 is level 2 30-bit */ @@ -252,6 +294,7 @@ struct device_domain_info { u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */ + struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ }; @@ -397,17 +440,13 @@ void free_iova_mem(struct iova *iova) static inline int width_to_agaw(int width); -/* calculate agaw for each iommu. - * "SAGAW" may be different across iommus, use a default agaw, and - * get a supported less agaw for iommus that don't support the default agaw. - */ -int iommu_calculate_agaw(struct intel_iommu *iommu) +static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) { unsigned long sagaw; int agaw = -1; sagaw = cap_sagaw(iommu->cap); - for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); + for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { if (test_bit(agaw, &sagaw)) break; @@ -416,12 +455,32 @@ int iommu_calculate_agaw(struct intel_iommu *iommu) return agaw; } -/* in native case, each domain is related to only one iommu */ +/* + * Calculate max SAGAW for each iommu. + */ +int iommu_calculate_max_sagaw(struct intel_iommu *iommu) +{ + return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); +} + +/* + * calculate agaw for each iommu. + * "SAGAW" may be different across iommus, use a default agaw, and + * get a supported less agaw for iommus that don't support the default agaw. + */ +int iommu_calculate_agaw(struct intel_iommu *iommu) +{ + return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); +} + +/* This functionin only returns single iommu in a domain */ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) { int iommu_id; + /* si_domain and vm domain should not get here. */ BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE); + BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY); iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus); if (iommu_id < 0 || iommu_id >= g_num_of_iommus) @@ -616,80 +675,78 @@ static inline int width_to_agaw(int width) static inline unsigned int level_to_offset_bits(int level) { - return (12 + (level - 1) * LEVEL_STRIDE); + return (level - 1) * LEVEL_STRIDE; } -static inline int address_level_offset(u64 addr, int level) +static inline int pfn_level_offset(unsigned long pfn, int level) { - return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK); + return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; } -static inline u64 level_mask(int level) +static inline unsigned long level_mask(int level) { - return ((u64)-1 << level_to_offset_bits(level)); + return -1UL << level_to_offset_bits(level); } -static inline u64 level_size(int level) +static inline unsigned long level_size(int level) { - return ((u64)1 << level_to_offset_bits(level)); + return 1UL << level_to_offset_bits(level); } -static inline u64 align_to_level(u64 addr, int level) +static inline unsigned long align_to_level(unsigned long pfn, int level) { - return ((addr + level_size(level) - 1) & level_mask(level)); + return (pfn + level_size(level) - 1) & level_mask(level); } -static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr) +static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, + unsigned long pfn) { - int addr_width = agaw_to_width(domain->agaw); + int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; struct dma_pte *parent, *pte = NULL; int level = agaw_to_level(domain->agaw); int offset; - unsigned long flags; BUG_ON(!domain->pgd); - - addr &= (((u64)1) << addr_width) - 1; + BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width); parent = domain->pgd; - spin_lock_irqsave(&domain->mapping_lock, flags); while (level > 0) { void *tmp_page; - offset = address_level_offset(addr, level); + offset = pfn_level_offset(pfn, level); pte = &parent[offset]; if (level == 1) break; if (!dma_pte_present(pte)) { + uint64_t pteval; + tmp_page = alloc_pgtable_page(); - if (!tmp_page) { - spin_unlock_irqrestore(&domain->mapping_lock, - flags); + if (!tmp_page) return NULL; + + domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); + pteval = (virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; + if (cmpxchg64(&pte->val, 0ULL, pteval)) { + /* Someone else set it while we were thinking; use theirs. */ + free_pgtable_page(tmp_page); + } else { + dma_pte_addr(pte); + domain_flush_cache(domain, pte, sizeof(*pte)); } - domain_flush_cache(domain, tmp_page, PAGE_SIZE); - dma_set_pte_addr(pte, virt_to_phys(tmp_page)); - /* - * high level table always sets r/w, last level page - * table control read/write - */ - dma_set_pte_readable(pte); - dma_set_pte_writable(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); } parent = phys_to_virt(dma_pte_addr(pte)); level--; } - spin_unlock_irqrestore(&domain->mapping_lock, flags); return pte; } /* return address's pte at specific level */ -static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr, - int level) +static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, + unsigned long pfn, + int level) { struct dma_pte *parent, *pte = NULL; int total = agaw_to_level(domain->agaw); @@ -697,7 +754,7 @@ static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr, parent = domain->pgd; while (level <= total) { - offset = address_level_offset(addr, total); + offset = pfn_level_offset(pfn, total); pte = &parent[offset]; if (level == total) return pte; @@ -710,74 +767,82 @@ static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr, return NULL; } -/* clear one page's page table */ -static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr) -{ - struct dma_pte *pte = NULL; - - /* get last level pte */ - pte = dma_addr_level_pte(domain, addr, 1); - - if (pte) { - dma_clear_pte(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); - } -} - /* clear last level pte, a tlb flush should be followed */ -static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end) +static void dma_pte_clear_range(struct dmar_domain *domain, + unsigned long start_pfn, + unsigned long last_pfn) { - int addr_width = agaw_to_width(domain->agaw); - int npages; + int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; + struct dma_pte *first_pte, *pte; - start &= (((u64)1) << addr_width) - 1; - end &= (((u64)1) << addr_width) - 1; - /* in case it's partial page */ - start &= PAGE_MASK; - end = PAGE_ALIGN(end); - npages = (end - start) / VTD_PAGE_SIZE; + BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width); + BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width); - /* we don't need lock here, nobody else touches the iova range */ - while (npages--) { - dma_pte_clear_one(domain, start); - start += VTD_PAGE_SIZE; + /* we don't need lock here; nobody else touches the iova range */ + while (start_pfn <= last_pfn) { + first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1); + if (!pte) { + start_pfn = align_to_level(start_pfn + 1, 2); + continue; + } + do { + dma_clear_pte(pte); + start_pfn++; + pte++; + } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); + + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); } } /* free page table pages. last level pte should already be cleared */ static void dma_pte_free_pagetable(struct dmar_domain *domain, - u64 start, u64 end) + unsigned long start_pfn, + unsigned long last_pfn) { - int addr_width = agaw_to_width(domain->agaw); - struct dma_pte *pte; + int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; + struct dma_pte *first_pte, *pte; int total = agaw_to_level(domain->agaw); int level; - u64 tmp; + unsigned long tmp; - start &= (((u64)1) << addr_width) - 1; - end &= (((u64)1) << addr_width) - 1; + BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width); + BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width); - /* we don't need lock here, nobody else touches the iova range */ + /* We don't need lock here; nobody else touches the iova range */ level = 2; while (level <= total) { - tmp = align_to_level(start, level); - if (tmp >= end || (tmp + level_size(level) > end)) + tmp = align_to_level(start_pfn, level); + + /* If we can't even clear one PTE at this level, we're done */ + if (tmp + level_size(level) - 1 > last_pfn) return; - while (tmp < end) { - pte = dma_addr_level_pte(domain, tmp, level); - if (pte) { - free_pgtable_page( - phys_to_virt(dma_pte_addr(pte))); - dma_clear_pte(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); + while (tmp + level_size(level) - 1 <= last_pfn) { + first_pte = pte = dma_pfn_level_pte(domain, tmp, level); + if (!pte) { + tmp = align_to_level(tmp + 1, level + 1); + continue; } - tmp += level_size(level); + do { + if (dma_pte_present(pte)) { + free_pgtable_page(phys_to_virt(dma_pte_addr(pte))); + dma_clear_pte(pte); + } + pte++; + tmp += level_size(level); + } while (!first_pte_in_page(pte) && + tmp + level_size(level) - 1 <= last_pfn); + + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); + } level++; } /* free pgd */ - if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) { + if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { free_pgtable_page(domain->pgd); domain->pgd = NULL; } @@ -805,7 +870,7 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu) static void iommu_set_root_entry(struct intel_iommu *iommu) { void *addr; - u32 cmd, sts; + u32 sts; unsigned long flag; addr = iommu->root_entry; @@ -813,12 +878,11 @@ static void iommu_set_root_entry(struct intel_iommu *iommu) spin_lock_irqsave(&iommu->register_lock, flag); dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr)); - cmd = iommu->gcmd | DMA_GCMD_SRTP; - writel(cmd, iommu->reg + DMAR_GCMD_REG); + writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (sts & DMA_GSTS_RTPS), sts); + readl, (sts & DMA_GSTS_RTPS), sts); spin_unlock_irqrestore(&iommu->register_lock, flag); } @@ -830,39 +894,25 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu) if (!rwbf_quirk && !cap_rwbf(iommu->cap)) return; - val = iommu->gcmd | DMA_GCMD_WBF; spin_lock_irqsave(&iommu->register_lock, flag); - writel(val, iommu->reg + DMAR_GCMD_REG); + writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (!(val & DMA_GSTS_WBFS)), val); + readl, (!(val & DMA_GSTS_WBFS)), val); spin_unlock_irqrestore(&iommu->register_lock, flag); } /* return value determine if we need a write buffer flush */ -static int __iommu_flush_context(struct intel_iommu *iommu, - u16 did, u16 source_id, u8 function_mask, u64 type, - int non_present_entry_flush) +static void __iommu_flush_context(struct intel_iommu *iommu, + u16 did, u16 source_id, u8 function_mask, + u64 type) { u64 val = 0; unsigned long flag; - /* - * In the non-present entry flush case, if hardware doesn't cache - * non-present entry we do nothing and if hardware cache non-present - * entry, we flush entries of domain 0 (the domain id is used to cache - * any non-present entries) - */ - if (non_present_entry_flush) { - if (!cap_caching_mode(iommu->cap)) - return 1; - else - did = 0; - } - switch (type) { case DMA_CCMD_GLOBAL_INVL: val = DMA_CCMD_GLOBAL_INVL; @@ -887,33 +937,16 @@ static int __iommu_flush_context(struct intel_iommu *iommu, dmar_readq, (!(val & DMA_CCMD_ICC)), val); spin_unlock_irqrestore(&iommu->register_lock, flag); - - /* flush context entry will implicitly flush write buffer */ - return 0; } /* return value determine if we need a write buffer flush */ -static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int size_order, u64 type, - int non_present_entry_flush) +static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, + u64 addr, unsigned int size_order, u64 type) { int tlb_offset = ecap_iotlb_offset(iommu->ecap); u64 val = 0, val_iva = 0; unsigned long flag; - /* - * In the non-present entry flush case, if hardware doesn't cache - * non-present entry we do nothing and if hardware cache non-present - * entry, we flush entries of domain 0 (the domain id is used to cache - * any non-present entries) - */ - if (non_present_entry_flush) { - if (!cap_caching_mode(iommu->cap)) - return 1; - else - did = 0; - } - switch (type) { case DMA_TLB_GLOBAL_FLUSH: /* global flush doesn't need set IVA_REG */ @@ -961,37 +994,106 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n", (unsigned long long)DMA_TLB_IIRG(type), (unsigned long long)DMA_TLB_IAIG(val)); - /* flush iotlb entry will implicitly flush write buffer */ - return 0; } -static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int pages, int non_present_entry_flush) +static struct device_domain_info *iommu_support_dev_iotlb( + struct dmar_domain *domain, int segment, u8 bus, u8 devfn) { - unsigned int mask; + int found = 0; + unsigned long flags; + struct device_domain_info *info; + struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn); - BUG_ON(addr & (~VTD_PAGE_MASK)); - BUG_ON(pages == 0); + if (!ecap_dev_iotlb_support(iommu->ecap)) + return NULL; + + if (!iommu->qi) + return NULL; + + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &domain->devices, link) + if (info->bus == bus && info->devfn == devfn) { + found = 1; + break; + } + spin_unlock_irqrestore(&device_domain_lock, flags); + + if (!found || !info->dev) + return NULL; + + if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS)) + return NULL; + + if (!dmar_find_matched_atsr_unit(info->dev)) + return NULL; + + info->iommu = iommu; + + return info; +} + +static void iommu_enable_dev_iotlb(struct device_domain_info *info) +{ + if (!info) + return; + + pci_enable_ats(info->dev, VTD_PAGE_SHIFT); +} + +static void iommu_disable_dev_iotlb(struct device_domain_info *info) +{ + if (!info->dev || !pci_ats_enabled(info->dev)) + return; + + pci_disable_ats(info->dev); +} - /* Fallback to domain selective flush if no PSI support */ - if (!cap_pgsel_inv(iommu->cap)) - return iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH, - non_present_entry_flush); +static void iommu_flush_dev_iotlb(struct dmar_domain *domain, + u64 addr, unsigned mask) +{ + u16 sid, qdep; + unsigned long flags; + struct device_domain_info *info; + + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &domain->devices, link) { + if (!info->dev || !pci_ats_enabled(info->dev)) + continue; + + sid = info->bus << 8 | info->devfn; + qdep = pci_ats_queue_depth(info->dev); + qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask); + } + spin_unlock_irqrestore(&device_domain_lock, flags); +} + +static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, + unsigned long pfn, unsigned int pages) +{ + unsigned int mask = ilog2(__roundup_pow_of_two(pages)); + uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; + + BUG_ON(pages == 0); /* + * Fallback to domain selective flush if no PSI support or the size is + * too big. * PSI requires page size to be 2 ^ x, and the base address is naturally * aligned to the size */ - mask = ilog2(__roundup_pow_of_two(pages)); - /* Fallback to domain selective flush if size is too big */ - if (mask > cap_max_amask_val(iommu->cap)) - return iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH, non_present_entry_flush); + if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) + iommu->flush.flush_iotlb(iommu, did, 0, 0, + DMA_TLB_DSI_FLUSH); + else + iommu->flush.flush_iotlb(iommu, did, addr, mask, + DMA_TLB_PSI_FLUSH); - return iommu->flush.flush_iotlb(iommu, did, addr, mask, - DMA_TLB_PSI_FLUSH, - non_present_entry_flush); + /* + * In caching mode, domain ID 0 is reserved for non-present to present + * mapping flush. Device IOTLB doesn't need to be flushed in this case. + */ + if (!cap_caching_mode(iommu->cap) || did) + iommu_flush_dev_iotlb(iommu->domains[did], addr, mask); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) @@ -1017,13 +1119,13 @@ static int iommu_enable_translation(struct intel_iommu *iommu) unsigned long flags; spin_lock_irqsave(&iommu->register_lock, flags); - writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG); + iommu->gcmd |= DMA_GCMD_TE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (sts & DMA_GSTS_TES), sts); + readl, (sts & DMA_GSTS_TES), sts); - iommu->gcmd |= DMA_GCMD_TE; spin_unlock_irqrestore(&iommu->register_lock, flags); return 0; } @@ -1039,7 +1141,7 @@ static int iommu_disable_translation(struct intel_iommu *iommu) /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (!(sts & DMA_GSTS_TES)), sts); + readl, (!(sts & DMA_GSTS_TES)), sts); spin_unlock_irqrestore(&iommu->register_lock, flag); return 0; @@ -1138,48 +1240,71 @@ void free_dmar_iommu(struct intel_iommu *iommu) free_context_table(iommu); } -static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu) +static struct dmar_domain *alloc_domain(void) { - unsigned long num; - unsigned long ndomains; struct dmar_domain *domain; - unsigned long flags; domain = alloc_domain_mem(); if (!domain) return NULL; + memset(&domain->iommu_bmp, 0, sizeof(unsigned long)); + domain->flags = 0; + + return domain; +} + +static int iommu_attach_domain(struct dmar_domain *domain, + struct intel_iommu *iommu) +{ + int num; + unsigned long ndomains; + unsigned long flags; + ndomains = cap_ndoms(iommu->cap); spin_lock_irqsave(&iommu->lock, flags); + num = find_first_zero_bit(iommu->domain_ids, ndomains); if (num >= ndomains) { spin_unlock_irqrestore(&iommu->lock, flags); - free_domain_mem(domain); printk(KERN_ERR "IOMMU: no free domain ids\n"); - return NULL; + return -ENOMEM; } - set_bit(num, iommu->domain_ids); domain->id = num; - memset(&domain->iommu_bmp, 0, sizeof(unsigned long)); + set_bit(num, iommu->domain_ids); set_bit(iommu->seq_id, &domain->iommu_bmp); - domain->flags = 0; iommu->domains[num] = domain; spin_unlock_irqrestore(&iommu->lock, flags); - return domain; + return 0; } -static void iommu_free_domain(struct dmar_domain *domain) +static void iommu_detach_domain(struct dmar_domain *domain, + struct intel_iommu *iommu) { unsigned long flags; - struct intel_iommu *iommu; - - iommu = domain_get_iommu(domain); + int num, ndomains; + int found = 0; spin_lock_irqsave(&iommu->lock, flags); - clear_bit(domain->id, iommu->domain_ids); + ndomains = cap_ndoms(iommu->cap); + num = find_first_bit(iommu->domain_ids, ndomains); + for (; num < ndomains; ) { + if (iommu->domains[num] == domain) { + found = 1; + break; + } + num = find_next_bit(iommu->domain_ids, + cap_ndoms(iommu->cap), num+1); + } + + if (found) { + clear_bit(num, iommu->domain_ids); + clear_bit(iommu->seq_id, &domain->iommu_bmp); + iommu->domains[num] = NULL; + } spin_unlock_irqrestore(&iommu->lock, flags); } @@ -1192,7 +1317,6 @@ static void dmar_init_reserved_ranges(void) struct pci_dev *pdev = NULL; struct iova *iova; int i; - u64 addr, size; init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN); @@ -1215,12 +1339,9 @@ static void dmar_init_reserved_ranges(void) r = &pdev->resource[i]; if (!r->flags || !(r->flags & IORESOURCE_MEM)) continue; - addr = r->start; - addr &= PAGE_MASK; - size = r->end - addr; - size = PAGE_ALIGN(size); - iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr), - IOVA_PFN(size + addr) - 1); + iova = reserve_iova(&reserved_iova_list, + IOVA_PFN(r->start), + IOVA_PFN(r->end)); if (!iova) printk(KERN_ERR "Reserve iova failed\n"); } @@ -1254,7 +1375,6 @@ static int domain_init(struct dmar_domain *domain, int guest_width) unsigned long sagaw; init_iova_domain(&domain->iovad, DMA_32BIT_PFN); - spin_lock_init(&domain->mapping_lock); spin_lock_init(&domain->iommu_lock); domain_reserve_special_ranges(domain); @@ -1299,7 +1419,8 @@ static int domain_init(struct dmar_domain *domain, int guest_width) static void domain_exit(struct dmar_domain *domain) { - u64 end; + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; /* Domain 0 is reserved, so dont process it */ if (!domain) @@ -1308,21 +1429,22 @@ static void domain_exit(struct dmar_domain *domain) domain_remove_dev_info(domain); /* destroy iovas */ put_iova_domain(&domain->iovad); - end = DOMAIN_MAX_ADDR(domain->gaw); - end = end & (~PAGE_MASK); /* clear ptes */ - dma_pte_clear_range(domain, 0, end); + dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); /* free page tables */ - dma_pte_free_pagetable(domain, 0, end); + dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); + + for_each_active_iommu(iommu, drhd) + if (test_bit(iommu->seq_id, &domain->iommu_bmp)) + iommu_detach_domain(domain, iommu); - iommu_free_domain(domain); free_domain_mem(domain); } -static int domain_context_mapping_one(struct dmar_domain *domain, - int segment, u8 bus, u8 devfn) +static int domain_context_mapping_one(struct dmar_domain *domain, int segment, + u8 bus, u8 devfn, int translation) { struct context_entry *context; unsigned long flags; @@ -1332,10 +1454,14 @@ static int domain_context_mapping_one(struct dmar_domain *domain, unsigned long ndomains; int id; int agaw; + struct device_domain_info *info = NULL; pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + BUG_ON(!domain->pgd); + BUG_ON(translation != CONTEXT_TT_PASS_THROUGH && + translation != CONTEXT_TT_MULTI_LEVEL); iommu = device_to_iommu(segment, bus, devfn); if (!iommu) @@ -1353,7 +1479,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain, id = domain->id; pgd = domain->pgd; - if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) { + if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE || + domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) { int found = 0; /* find an available domain id for this device in iommu */ @@ -1395,21 +1522,44 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } context_set_domain_id(context, id); - context_set_address_width(context, iommu->agaw); - context_set_address_root(context, virt_to_phys(pgd)); - context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL); + + if (translation != CONTEXT_TT_PASS_THROUGH) { + info = iommu_support_dev_iotlb(domain, segment, bus, devfn); + translation = info ? CONTEXT_TT_DEV_IOTLB : + CONTEXT_TT_MULTI_LEVEL; + } + /* + * In pass through mode, AW must be programmed to indicate the largest + * AGAW value supported by hardware. And ASR is ignored by hardware. + */ + if (unlikely(translation == CONTEXT_TT_PASS_THROUGH)) + context_set_address_width(context, iommu->msagaw); + else { + context_set_address_root(context, virt_to_phys(pgd)); + context_set_address_width(context, iommu->agaw); + } + + context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); domain_flush_cache(domain, context, sizeof(*context)); - /* it's a non-present to present mapping */ - if (iommu->flush.flush_context(iommu, domain->id, - (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL, 1)) + /* + * It's a non-present to present mapping. If hardware doesn't cache + * non-present entry we only need to flush the write-buffer. If the + * _does_ cache non-present entries, then it does so in the special + * domain #0, which we have to flush: + */ + if (cap_caching_mode(iommu->cap)) { + iommu->flush.flush_context(iommu, 0, + (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH); + } else { iommu_flush_write_buffer(iommu); - else - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0); - + } + iommu_enable_dev_iotlb(info); spin_unlock_irqrestore(&iommu->lock, flags); spin_lock_irqsave(&domain->iommu_lock, flags); @@ -1422,13 +1572,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } static int -domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) +domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev, + int translation) { int ret; struct pci_dev *tmp, *parent; ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus), - pdev->bus->number, pdev->devfn); + pdev->bus->number, pdev->devfn, + translation); if (ret) return ret; @@ -1442,7 +1594,7 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) ret = domain_context_mapping_one(domain, pci_domain_nr(parent->bus), parent->bus->number, - parent->devfn); + parent->devfn, translation); if (ret) return ret; parent = parent->bus->self; @@ -1450,12 +1602,14 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */ return domain_context_mapping_one(domain, pci_domain_nr(tmp->subordinate), - tmp->subordinate->number, 0); + tmp->subordinate->number, 0, + translation); else /* this is a legacy PCI bridge */ return domain_context_mapping_one(domain, pci_domain_nr(tmp->bus), tmp->bus->number, - tmp->devfn); + tmp->devfn, + translation); } static int domain_context_mapped(struct pci_dev *pdev) @@ -1493,42 +1647,94 @@ static int domain_context_mapped(struct pci_dev *pdev) tmp->devfn); } -static int -domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, - u64 hpa, size_t size, int prot) +/* Returns a number of VTD pages, but aligned to MM page size */ +static inline unsigned long aligned_nrpages(unsigned long host_addr, + size_t size) { - u64 start_pfn, end_pfn; - struct dma_pte *pte; - int index; - int addr_width = agaw_to_width(domain->agaw); + host_addr &= ~PAGE_MASK; + return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; +} + +static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + struct scatterlist *sg, unsigned long phys_pfn, + unsigned long nr_pages, int prot) +{ + struct dma_pte *first_pte = NULL, *pte = NULL; + phys_addr_t uninitialized_var(pteval); + int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; + unsigned long sg_res; - hpa &= (((u64)1) << addr_width) - 1; + BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width); if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; - iova &= PAGE_MASK; - start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT; - end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT; - index = 0; - while (start_pfn < end_pfn) { - pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index); - if (!pte) - return -ENOMEM; + + prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP; + + if (sg) + sg_res = 0; + else { + sg_res = nr_pages + 1; + pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot; + } + + while (nr_pages--) { + uint64_t tmp; + + if (!sg_res) { + sg_res = aligned_nrpages(sg->offset, sg->length); + sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset; + sg->dma_length = sg->length; + pteval = page_to_phys(sg_page(sg)) | prot; + } + if (!pte) { + first_pte = pte = pfn_to_dma_pte(domain, iov_pfn); + if (!pte) + return -ENOMEM; + } /* We don't need lock here, nobody else * touches the iova range */ - BUG_ON(dma_pte_addr(pte)); - dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT); - dma_set_pte_prot(pte, prot); - if (prot & DMA_PTE_SNP) - dma_set_pte_snp(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); - start_pfn++; - index++; + tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); + if (tmp) { + static int dumps = 5; + printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", + iov_pfn, tmp, (unsigned long long)pteval); + if (dumps) { + dumps--; + debug_dma_dump_mappings(NULL); + } + WARN_ON(1); + } + pte++; + if (!nr_pages || first_pte_in_page(pte)) { + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); + pte = NULL; + } + iov_pfn++; + pteval += VTD_PAGE_SIZE; + sg_res--; + if (!sg_res) + sg = sg_next(sg); } return 0; } +static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + struct scatterlist *sg, unsigned long nr_pages, + int prot) +{ + return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot); +} + +static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + unsigned long phys_pfn, unsigned long nr_pages, + int prot) +{ + return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot); +} + static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn) { if (!iommu) @@ -1536,9 +1742,8 @@ static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn) clear_context_table(iommu, bus, devfn); iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL, 0); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_CCMD_GLOBAL_INVL); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } static void domain_remove_dev_info(struct dmar_domain *domain) @@ -1557,6 +1762,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain) info->dev->dev.archdata.iommu = NULL; spin_unlock_irqrestore(&device_domain_lock, flags); + iommu_disable_dev_iotlb(info); iommu = device_to_iommu(info->segment, info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); free_devinfo_mem(info); @@ -1593,6 +1799,7 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) unsigned long flags; int bus = 0, devfn = 0; int segment; + int ret; domain = find_domain(pdev); if (domain) @@ -1625,6 +1832,10 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) } } + domain = alloc_domain(); + if (!domain) + goto error; + /* Allocate new domain for the device */ drhd = dmar_find_matched_drhd_unit(pdev); if (!drhd) { @@ -1634,9 +1845,11 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) } iommu = drhd->iommu; - domain = iommu_alloc_domain(iommu); - if (!domain) + ret = iommu_attach_domain(domain, iommu); + if (ret) { + domain_exit(domain); goto error; + } if (domain_init(domain, gaw)) { domain_exit(domain); @@ -1710,55 +1923,63 @@ error: return find_domain(pdev); } +static int iommu_identity_mapping; + +static int iommu_domain_identity_map(struct dmar_domain *domain, + unsigned long long start, + unsigned long long end) +{ + unsigned long first_vpfn = start >> VTD_PAGE_SHIFT; + unsigned long last_vpfn = end >> VTD_PAGE_SHIFT; + + if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn), + dma_to_mm_pfn(last_vpfn))) { + printk(KERN_ERR "IOMMU: reserve iova failed\n"); + return -ENOMEM; + } + + pr_debug("Mapping reserved region %llx-%llx for domain %d\n", + start, end, domain->id); + /* + * RMRR range might have overlap with physical memory range, + * clear it first + */ + dma_pte_clear_range(domain, first_vpfn, last_vpfn); + + return domain_pfn_mapping(domain, first_vpfn, first_vpfn, + last_vpfn - first_vpfn + 1, + DMA_PTE_READ|DMA_PTE_WRITE); +} + static int iommu_prepare_identity_map(struct pci_dev *pdev, unsigned long long start, unsigned long long end) { struct dmar_domain *domain; - unsigned long size; - unsigned long long base; int ret; printk(KERN_INFO - "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", - pci_name(pdev), start, end); - /* page table init */ + "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", + pci_name(pdev), start, end); + domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); if (!domain) return -ENOMEM; - /* The address might not be aligned */ - base = start & PAGE_MASK; - size = end - base; - size = PAGE_ALIGN(size); - if (!reserve_iova(&domain->iovad, IOVA_PFN(base), - IOVA_PFN(base + size) - 1)) { - printk(KERN_ERR "IOMMU: reserve iova failed\n"); - ret = -ENOMEM; + ret = iommu_domain_identity_map(domain, start, end); + if (ret) goto error; - } - - pr_debug("Mapping reserved region %lx@%llx for %s\n", - size, base, pci_name(pdev)); - /* - * RMRR range might have overlap with physical memory range, - * clear it first - */ - dma_pte_clear_range(domain, base, base + size); - ret = domain_page_mapping(domain, base, base, size, - DMA_PTE_READ|DMA_PTE_WRITE); + /* context entry init */ + ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL); if (ret) goto error; - /* context entry init */ - ret = domain_context_mapping(domain, pdev); - if (!ret) - return 0; -error: + return 0; + + error: domain_exit(domain); return ret; - } static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, @@ -1770,96 +1991,222 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, rmrr->end_address + 1); } -#ifdef CONFIG_DMAR_GFX_WA -struct iommu_prepare_data { +#ifdef CONFIG_DMAR_FLOPPY_WA +static inline void iommu_prepare_isa(void) +{ struct pci_dev *pdev; int ret; -}; -static int __init iommu_prepare_work_fn(unsigned long start_pfn, - unsigned long end_pfn, void *datax) -{ - struct iommu_prepare_data *data; + pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL); + if (!pdev) + return; - data = (struct iommu_prepare_data *)datax; + printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n"); + ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024); - data->ret = iommu_prepare_identity_map(data->pdev, - start_pfn<ret; + if (ret) + printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; " + "floppy might not work\n"); } - -static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev) +#else +static inline void iommu_prepare_isa(void) { - int nid; - struct iommu_prepare_data data; - - data.pdev = pdev; - data.ret = 0; - - for_each_online_node(nid) { - work_with_active_regions(nid, iommu_prepare_work_fn, &data); - if (data.ret) - return data.ret; - } - return data.ret; + return; } +#endif /* !CONFIG_DMAR_FLPY_WA */ -static void __init iommu_prepare_gfx_mapping(void) +/* Initialize each context entry as pass through.*/ +static int __init init_context_pass_through(void) { struct pci_dev *pdev = NULL; + struct dmar_domain *domain; int ret; for_each_pci_dev(pdev) { - if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO || - !IS_GFX_DEVICE(pdev)) - continue; - printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n", - pci_name(pdev)); - ret = iommu_prepare_with_active_regions(pdev); + domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); + ret = domain_context_mapping(domain, pdev, + CONTEXT_TT_PASS_THROUGH); if (ret) - printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); + return ret; } + return 0; } -#else /* !CONFIG_DMAR_GFX_WA */ -static inline void iommu_prepare_gfx_mapping(void) + +static int md_domain_init(struct dmar_domain *domain, int guest_width); + +static int __init si_domain_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) { - return; + int *ret = datax; + + *ret = iommu_domain_identity_map(si_domain, + (uint64_t)start_pfn << PAGE_SHIFT, + (uint64_t)end_pfn << PAGE_SHIFT); + return *ret; + } -#endif -#ifdef CONFIG_DMAR_FLOPPY_WA -static inline void iommu_prepare_isa(void) +static int si_domain_init(void) { - struct pci_dev *pdev; - int ret; + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + int nid, ret = 0; - pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL); - if (!pdev) - return; + si_domain = alloc_domain(); + if (!si_domain) + return -EFAULT; - printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n"); - ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024); + pr_debug("Identity mapping domain is domain %d\n", si_domain->id); - if (ret) - printk(KERN_ERR "IOMMU: Failed to create 0-64M identity map, " - "floppy might not work\n"); + for_each_active_iommu(iommu, drhd) { + ret = iommu_attach_domain(si_domain, iommu); + if (ret) { + domain_exit(si_domain); + return -EFAULT; + } + } + if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { + domain_exit(si_domain); + return -EFAULT; + } + + si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY; + + for_each_online_node(nid) { + work_with_active_regions(nid, si_domain_work_fn, &ret); + if (ret) + return ret; + } + + return 0; } -#else -static inline void iommu_prepare_isa(void) + +static void domain_remove_one_dev_info(struct dmar_domain *domain, + struct pci_dev *pdev); +static int identity_mapping(struct pci_dev *pdev) { - return; + struct device_domain_info *info; + + if (likely(!iommu_identity_mapping)) + return 0; + + + list_for_each_entry(info, &si_domain->devices, link) + if (info->dev == pdev) + return 1; + return 0; } -#endif /* !CONFIG_DMAR_FLPY_WA */ -static int __init init_dmars(void) +static int domain_add_dev_info(struct dmar_domain *domain, + struct pci_dev *pdev) +{ + struct device_domain_info *info; + unsigned long flags; + + info = alloc_devinfo_mem(); + if (!info) + return -ENOMEM; + + info->segment = pci_domain_nr(pdev->bus); + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; + info->dev = pdev; + info->domain = domain; + + spin_lock_irqsave(&device_domain_lock, flags); + list_add(&info->link, &domain->devices); + list_add(&info->global, &device_domain_list); + pdev->dev.archdata.iommu = info; + spin_unlock_irqrestore(&device_domain_lock, flags); + + return 0; +} + +static int iommu_should_identity_map(struct pci_dev *pdev, int startup) +{ + if (iommu_identity_mapping == 2) + return IS_GFX_DEVICE(pdev); + + /* + * We want to start off with all devices in the 1:1 domain, and + * take them out later if we find they can't access all of memory. + * + * However, we can't do this for PCI devices behind bridges, + * because all PCI devices behind the same bridge will end up + * with the same source-id on their transactions. + * + * Practically speaking, we can't change things around for these + * devices at run-time, because we can't be sure there'll be no + * DMA transactions in flight for any of their siblings. + * + * So PCI devices (unless they're on the root bus) as well as + * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of + * the 1:1 domain, just in _case_ one of their siblings turns out + * not to be able to map all of memory. + */ + if (!pdev->is_pcie) { + if (!pci_is_root_bus(pdev->bus)) + return 0; + if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI) + return 0; + } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE) + return 0; + + /* + * At boot time, we don't yet know if devices will be 64-bit capable. + * Assume that they will -- if they turn out not to be, then we can + * take them out of the 1:1 domain later. + */ + if (!startup) + return pdev->dma_mask > DMA_BIT_MASK(32); + + return 1; +} + +static int iommu_prepare_static_identity_mapping(void) +{ + struct pci_dev *pdev = NULL; + int ret; + + ret = si_domain_init(); + if (ret) + return -EFAULT; + + for_each_pci_dev(pdev) { + if (iommu_should_identity_map(pdev, 1)) { + printk(KERN_INFO "IOMMU: identity mapping for device %s\n", + pci_name(pdev)); + + ret = domain_context_mapping(si_domain, pdev, + CONTEXT_TT_MULTI_LEVEL); + if (ret) + return ret; + ret = domain_add_dev_info(si_domain, pdev); + if (ret) + return ret; + } + } + + return 0; +} + +int __init init_dmars(void) { struct dmar_drhd_unit *drhd; struct dmar_rmrr_unit *rmrr; struct pci_dev *pdev; struct intel_iommu *iommu; int i, ret; + int pass_through = 1; + + /* + * In case pass through can not be enabled, iommu tries to use identity + * mapping. + */ + if (iommu_pass_through) + iommu_identity_mapping = 1; /* * for each drhd @@ -1913,7 +2260,15 @@ static int __init init_dmars(void) printk(KERN_ERR "IOMMU: allocate root entry failed\n"); goto error; } + if (!ecap_pass_through(iommu->ecap)) + pass_through = 0; } + if (iommu_pass_through) + if (!pass_through) { + printk(KERN_INFO + "Pass Through is not supported by hardware.\n"); + iommu_pass_through = 0; + } /* * Start from the sane iommu hardware state. @@ -1968,45 +2323,63 @@ static int __init init_dmars(void) } } -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) { - ret = enable_intr_remapping(0); - if (ret) - printk(KERN_ERR - "IOMMU: enable interrupt remapping failed\n"); + /* + * If pass through is set and enabled, context entries of all pci + * devices are intialized by pass through translation type. + */ + if (iommu_pass_through) { + ret = init_context_pass_through(); + if (ret) { + printk(KERN_ERR "IOMMU: Pass through init failed.\n"); + iommu_pass_through = 0; + } } -#endif /* - * For each rmrr - * for each dev attached to rmrr - * do - * locate drhd for dev, alloc domain for dev - * allocate free domain - * allocate page table entries for rmrr - * if context not allocated for bus - * allocate and init context - * set present in root table for this bus - * init context with domain, translation etc - * endfor - * endfor + * If pass through is not set or not enabled, setup context entries for + * identity mappings for rmrr, gfx, and isa and may fall back to static + * identity mapping if iommu_identity_mapping is set. */ - for_each_rmrr_units(rmrr) { - for (i = 0; i < rmrr->devices_cnt; i++) { - pdev = rmrr->devices[i]; - /* some BIOS lists non-exist devices in DMAR table */ - if (!pdev) - continue; - ret = iommu_prepare_rmrr_dev(rmrr, pdev); - if (ret) - printk(KERN_ERR + if (!iommu_pass_through) { +#ifdef CONFIG_DMAR_BROKEN_GFX_WA + if (!iommu_identity_mapping) + iommu_identity_mapping = 2; +#endif + if (iommu_identity_mapping) + iommu_prepare_static_identity_mapping(); + /* + * For each rmrr + * for each dev attached to rmrr + * do + * locate drhd for dev, alloc domain for dev + * allocate free domain + * allocate page table entries for rmrr + * if context not allocated for bus + * allocate and init context + * set present in root table for this bus + * init context with domain, translation etc + * endfor + * endfor + */ + printk(KERN_INFO "IOMMU: Setting RMRR:\n"); + for_each_rmrr_units(rmrr) { + for (i = 0; i < rmrr->devices_cnt; i++) { + pdev = rmrr->devices[i]; + /* + * some BIOS lists non-exist devices in DMAR + * table. + */ + if (!pdev) + continue; + ret = iommu_prepare_rmrr_dev(rmrr, pdev); + if (ret) + printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); + } } - } - iommu_prepare_gfx_mapping(); - - iommu_prepare_isa(); + iommu_prepare_isa(); + } /* * for each drhd @@ -2028,10 +2401,8 @@ static int __init init_dmars(void) iommu_set_root_entry(iommu); - iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL, - 0); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, - 0); + iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); iommu_disable_protect_mem_regions(iommu); ret = iommu_enable_translation(iommu); @@ -2051,50 +2422,32 @@ error: return ret; } -static inline u64 aligned_size(u64 host_addr, size_t size) -{ - u64 addr; - addr = (host_addr & (~PAGE_MASK)) + size; - return PAGE_ALIGN(addr); -} - -struct iova * -iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end) -{ - struct iova *piova; - - /* Make sure it's in range */ - end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end); - if (!size || (IOVA_START_ADDR + size > end)) - return NULL; - - piova = alloc_iova(&domain->iovad, - size >> PAGE_SHIFT, IOVA_PFN(end), 1); - return piova; -} - -static struct iova * -__intel_alloc_iova(struct device *dev, struct dmar_domain *domain, - size_t size, u64 dma_mask) +/* This takes a number of _MM_ pages, not VTD pages */ +static struct iova *intel_alloc_iova(struct device *dev, + struct dmar_domain *domain, + unsigned long nrpages, uint64_t dma_mask) { struct pci_dev *pdev = to_pci_dev(dev); struct iova *iova = NULL; - if (dma_mask <= DMA_BIT_MASK(32) || dmar_forcedac) - iova = iommu_alloc_iova(domain, size, dma_mask); - else { + /* Restrict dma_mask to the width that the iommu can handle */ + dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask); + + if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) { /* * First try to allocate an io virtual address in * DMA_BIT_MASK(32) and if that fails then try allocating * from higher range */ - iova = iommu_alloc_iova(domain, size, DMA_BIT_MASK(32)); - if (!iova) - iova = iommu_alloc_iova(domain, size, dma_mask); - } - - if (!iova) { - printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev)); + iova = alloc_iova(&domain->iovad, nrpages, + IOVA_PFN(DMA_BIT_MASK(32)), 1); + if (iova) + return iova; + } + iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1); + if (unlikely(!iova)) { + printk(KERN_ERR "Allocating %ld-page iova for %s failed", + nrpages, pci_name(pdev)); return NULL; } @@ -2117,7 +2470,8 @@ get_valid_domain_for_dev(struct pci_dev *pdev) /* make sure context mapping is ok */ if (unlikely(!domain_context_mapped(pdev))) { - ret = domain_context_mapping(domain, pdev); + ret = domain_context_mapping(domain, pdev, + CONTEXT_TT_MULTI_LEVEL); if (ret) { printk(KERN_ERR "Domain context map for %s failed", @@ -2129,6 +2483,63 @@ get_valid_domain_for_dev(struct pci_dev *pdev) return domain; } +static int iommu_dummy(struct pci_dev *pdev) +{ + return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO; +} + +/* Check if the pdev needs to go through non-identity map and unmap process.*/ +static int iommu_no_mapping(struct device *dev) +{ + struct pci_dev *pdev; + int found; + + if (unlikely(dev->bus != &pci_bus_type)) + return 1; + + pdev = to_pci_dev(dev); + if (iommu_dummy(pdev)) + return 1; + + if (!iommu_identity_mapping) + return 0; + + found = identity_mapping(pdev); + if (found) { + if (iommu_should_identity_map(pdev, 0)) + return 1; + else { + /* + * 32 bit DMA is removed from si_domain and fall back + * to non-identity mapping. + */ + domain_remove_one_dev_info(si_domain, pdev); + printk(KERN_INFO "32bit %s uses non-identity mapping\n", + pci_name(pdev)); + return 0; + } + } else { + /* + * In case of a detached 64 bit DMA device from vm, the device + * is put into si_domain for identity mapping. + */ + if (iommu_should_identity_map(pdev, 0)) { + int ret; + ret = domain_add_dev_info(si_domain, pdev); + if (ret) + return 0; + ret = domain_context_mapping(si_domain, pdev, CONTEXT_TT_MULTI_LEVEL); + if (!ret) { + printk(KERN_INFO "64bit %s uses identity mapping\n", + pci_name(pdev)); + return 1; + } + } + } + + return 0; +} + static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir, u64 dma_mask) { @@ -2139,9 +2550,11 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, int prot = 0; int ret; struct intel_iommu *iommu; + unsigned long paddr_pfn = paddr >> PAGE_SHIFT; BUG_ON(dir == DMA_NONE); - if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + + if (iommu_no_mapping(hwdev)) return paddr; domain = get_valid_domain_for_dev(pdev); @@ -2149,14 +2562,13 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, return 0; iommu = domain_get_iommu(domain); - size = aligned_size((u64)paddr, size); + size = aligned_nrpages(paddr, size); - iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask); + iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), + pdev->dma_mask); if (!iova) goto error; - start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT; - /* * Check if DMAR supports zero-length reads on write only * mappings.. @@ -2172,18 +2584,20 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, * might have two guest_addr mapping to the same host paddr, but this * is not a big problem */ - ret = domain_page_mapping(domain, start_paddr, - ((u64)paddr) & PAGE_MASK, size, prot); + ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo), + mm_to_dma_pfn(paddr_pfn), size, prot); if (ret) goto error; - /* it's a non-present to present mapping */ - ret = iommu_flush_iotlb_psi(iommu, domain->id, - start_paddr, size >> VTD_PAGE_SHIFT, 1); - if (ret) + /* it's a non-present to present mapping. Only flush if caching mode */ + if (cap_caching_mode(iommu->cap)) + iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size); + else iommu_flush_write_buffer(iommu); - return start_paddr + ((u64)paddr & (~PAGE_MASK)); + start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT; + start_paddr += paddr & ~PAGE_MASK; + return start_paddr; error: if (iova) @@ -2214,15 +2628,22 @@ static void flush_unmaps(void) if (!iommu) continue; - if (deferred_flush[i].next) { - iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); - for (j = 0; j < deferred_flush[i].next; j++) { - __free_iova(&deferred_flush[i].domain[j]->iovad, - deferred_flush[i].iova[j]); - } - deferred_flush[i].next = 0; + if (!deferred_flush[i].next) + continue; + + iommu->flush.flush_iotlb(iommu, 0, 0, 0, + DMA_TLB_GLOBAL_FLUSH); + for (j = 0; j < deferred_flush[i].next; j++) { + unsigned long mask; + struct iova *iova = deferred_flush[i].iova[j]; + + mask = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT; + mask = ilog2(mask >> VTD_PAGE_SHIFT); + iommu_flush_dev_iotlb(deferred_flush[i].domain[j], + iova->pfn_lo << PAGE_SHIFT, mask); + __free_iova(&deferred_flush[i].domain[j]->iovad, iova); } + deferred_flush[i].next = 0; } list_size = 0; @@ -2269,35 +2690,38 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, { struct pci_dev *pdev = to_pci_dev(dev); struct dmar_domain *domain; - unsigned long start_addr; + unsigned long start_pfn, last_pfn; struct iova *iova; struct intel_iommu *iommu; - if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + if (iommu_no_mapping(dev)) return; + domain = find_domain(pdev); BUG_ON(!domain); iommu = domain_get_iommu(domain); iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr)); - if (!iova) + if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n", + (unsigned long long)dev_addr)) return; - start_addr = iova->pfn_lo << PAGE_SHIFT; - size = aligned_size((u64)dev_addr, size); + start_pfn = mm_to_dma_pfn(iova->pfn_lo); + last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1; - pr_debug("Device %s unmapping: %zx@%llx\n", - pci_name(pdev), size, (unsigned long long)start_addr); + pr_debug("Device %s unmapping: pfn %lx-%lx\n", + pci_name(pdev), start_pfn, last_pfn); /* clear the whole page */ - dma_pte_clear_range(domain, start_addr, start_addr + size); + dma_pte_clear_range(domain, start_pfn, last_pfn); + /* free page tables */ - dma_pte_free_pagetable(domain, start_addr, start_addr + size); + dma_pte_free_pagetable(domain, start_pfn, last_pfn); + if (intel_iommu_strict) { - if (iommu_flush_iotlb_psi(iommu, - domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0)) - iommu_flush_write_buffer(iommu); + iommu_flush_iotlb_psi(iommu, domain->id, start_pfn, + last_pfn - start_pfn + 1); /* free iova */ __free_iova(&domain->iovad, iova); } else { @@ -2355,17 +2779,13 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) { - int i; struct pci_dev *pdev = to_pci_dev(hwdev); struct dmar_domain *domain; - unsigned long start_addr; + unsigned long start_pfn, last_pfn; struct iova *iova; - size_t size = 0; - phys_addr_t addr; - struct scatterlist *sg; struct intel_iommu *iommu; - if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + if (iommu_no_mapping(hwdev)) return; domain = find_domain(pdev); @@ -2374,23 +2794,21 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, iommu = domain_get_iommu(domain); iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address)); - if (!iova) + if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n", + (unsigned long long)sglist[0].dma_address)) return; - for_each_sg(sglist, sg, nelems, i) { - addr = page_to_phys(sg_page(sg)) + sg->offset; - size += aligned_size((u64)addr, sg->length); - } - start_addr = iova->pfn_lo << PAGE_SHIFT; + start_pfn = mm_to_dma_pfn(iova->pfn_lo); + last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1; /* clear the whole page */ - dma_pte_clear_range(domain, start_addr, start_addr + size); + dma_pte_clear_range(domain, start_pfn, last_pfn); + /* free page tables */ - dma_pte_free_pagetable(domain, start_addr, start_addr + size); + dma_pte_free_pagetable(domain, start_pfn, last_pfn); - if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr, - size >> VTD_PAGE_SHIFT, 0)) - iommu_flush_write_buffer(iommu); + iommu_flush_iotlb_psi(iommu, domain->id, start_pfn, + (last_pfn - start_pfn + 1)); /* free iova */ __free_iova(&domain->iovad, iova); @@ -2413,21 +2831,20 @@ static int intel_nontranslate_map_sg(struct device *hddev, static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) { - phys_addr_t addr; int i; struct pci_dev *pdev = to_pci_dev(hwdev); struct dmar_domain *domain; size_t size = 0; int prot = 0; - size_t offset = 0; + size_t offset_pfn = 0; struct iova *iova = NULL; int ret; struct scatterlist *sg; - unsigned long start_addr; + unsigned long start_vpfn; struct intel_iommu *iommu; BUG_ON(dir == DMA_NONE); - if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + if (iommu_no_mapping(hwdev)) return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir); domain = get_valid_domain_for_dev(pdev); @@ -2436,12 +2853,11 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne iommu = domain_get_iommu(domain); - for_each_sg(sglist, sg, nelems, i) { - addr = page_to_phys(sg_page(sg)) + sg->offset; - size += aligned_size((u64)addr, sg->length); - } + for_each_sg(sglist, sg, nelems, i) + size += aligned_nrpages(sg->offset, sg->length); - iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask); + iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), + pdev->dma_mask); if (!iova) { sglist->dma_length = 0; return 0; @@ -2457,35 +2873,27 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) prot |= DMA_PTE_WRITE; - start_addr = iova->pfn_lo << PAGE_SHIFT; - offset = 0; - for_each_sg(sglist, sg, nelems, i) { - addr = page_to_phys(sg_page(sg)) + sg->offset; - size = aligned_size((u64)addr, sg->length); - ret = domain_page_mapping(domain, start_addr + offset, - ((u64)addr) & PAGE_MASK, - size, prot); - if (ret) { - /* clear the page */ - dma_pte_clear_range(domain, start_addr, - start_addr + offset); - /* free page tables */ - dma_pte_free_pagetable(domain, start_addr, - start_addr + offset); - /* free iova */ - __free_iova(&domain->iovad, iova); - return 0; - } - sg->dma_address = start_addr + offset + - ((u64)addr & (~PAGE_MASK)); - sg->dma_length = sg->length; - offset += size; + start_vpfn = mm_to_dma_pfn(iova->pfn_lo); + + ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot); + if (unlikely(ret)) { + /* clear the page */ + dma_pte_clear_range(domain, start_vpfn, + start_vpfn + size - 1); + /* free page tables */ + dma_pte_free_pagetable(domain, start_vpfn, + start_vpfn + size - 1); + /* free iova */ + __free_iova(&domain->iovad, iova); + return 0; } - /* it's a non-present to present mapping */ - if (iommu_flush_iotlb_psi(iommu, domain->id, - start_addr, offset >> VTD_PAGE_SHIFT, 1)) + /* it's a non-present to present mapping. Only flush if caching mode */ + if (cap_caching_mode(iommu->cap)) + iommu_flush_iotlb_psi(iommu, 0, start_vpfn, offset_pfn); + else iommu_flush_write_buffer(iommu); + return nelems; } @@ -2644,9 +3052,9 @@ static int init_iommu_hw(void) iommu_set_root_entry(iommu); iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL, 0); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_TLB_GLOBAL_FLUSH); iommu_disable_protect_mem_regions(iommu); iommu_enable_translation(iommu); } @@ -2661,9 +3069,9 @@ static void iommu_flush_all(void) for_each_active_iommu(iommu, drhd) { iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL, 0); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_TLB_GLOBAL_FLUSH); } } @@ -2786,7 +3194,7 @@ int __init intel_iommu_init(void) * Check the need for DMA-remapping initialization now. * Above initialization will also be used by Interrupt-remapping. */ - if (no_iommu || swiotlb || dmar_disabled) + if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled) return -ENODEV; iommu_init_mempool(); @@ -2806,35 +3214,18 @@ int __init intel_iommu_init(void) init_timer(&unmap_timer); force_iommu = 1; - dma_ops = &intel_dma_ops; - init_iommu_sysfs(); - - register_iommu(&intel_iommu_ops); - - return 0; -} - -static int vm_domain_add_dev_info(struct dmar_domain *domain, - struct pci_dev *pdev) -{ - struct device_domain_info *info; - unsigned long flags; - info = alloc_devinfo_mem(); - if (!info) - return -ENOMEM; + if (!iommu_pass_through) { + printk(KERN_INFO + "Multi-level page-table translation for DMAR.\n"); + dma_ops = &intel_dma_ops; + } else + printk(KERN_INFO + "DMAR: Pass through translation for DMAR.\n"); - info->segment = pci_domain_nr(pdev->bus); - info->bus = pdev->bus->number; - info->devfn = pdev->devfn; - info->dev = pdev; - info->domain = domain; + init_iommu_sysfs(); - spin_lock_irqsave(&device_domain_lock, flags); - list_add(&info->link, &domain->devices); - list_add(&info->global, &device_domain_list); - pdev->dev.archdata.iommu = info; - spin_unlock_irqrestore(&device_domain_lock, flags); + register_iommu(&intel_iommu_ops); return 0; } @@ -2866,7 +3257,7 @@ static void iommu_detach_dependent_devices(struct intel_iommu *iommu, } } -static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, +static void domain_remove_one_dev_info(struct dmar_domain *domain, struct pci_dev *pdev) { struct device_domain_info *info; @@ -2892,6 +3283,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, info->dev->dev.archdata.iommu = NULL; spin_unlock_irqrestore(&device_domain_lock, flags); + iommu_disable_dev_iotlb(info); iommu_detach_dev(iommu, info->bus, info->devfn); iommu_detach_dependent_devices(iommu, pdev); free_devinfo_mem(info); @@ -2942,6 +3334,7 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) spin_unlock_irqrestore(&device_domain_lock, flags1); + iommu_disable_dev_iotlb(info); iommu = device_to_iommu(info->segment, info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); iommu_detach_dependent_devices(iommu, info->dev); @@ -2997,12 +3390,11 @@ static struct dmar_domain *iommu_alloc_vm_domain(void) return domain; } -static int vm_domain_init(struct dmar_domain *domain, int guest_width) +static int md_domain_init(struct dmar_domain *domain, int guest_width) { int adjust_width; init_iova_domain(&domain->iovad, DMA_32BIT_PFN); - spin_lock_init(&domain->mapping_lock); spin_lock_init(&domain->iommu_lock); domain_reserve_special_ranges(domain); @@ -3016,6 +3408,7 @@ static int vm_domain_init(struct dmar_domain *domain, int guest_width) domain->iommu_count = 0; domain->iommu_coherency = 0; + domain->iommu_snooping = 0; domain->max_addr = 0; /* always allocate the top pgd */ @@ -3056,8 +3449,6 @@ static void iommu_free_vm_domain(struct dmar_domain *domain) static void vm_domain_exit(struct dmar_domain *domain) { - u64 end; - /* Domain 0 is reserved, so dont process it */ if (!domain) return; @@ -3065,14 +3456,12 @@ static void vm_domain_exit(struct dmar_domain *domain) vm_domain_remove_all_dev_info(domain); /* destroy iovas */ put_iova_domain(&domain->iovad); - end = DOMAIN_MAX_ADDR(domain->gaw); - end = end & (~VTD_PAGE_MASK); /* clear ptes */ - dma_pte_clear_range(domain, 0, end); + dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); /* free page tables */ - dma_pte_free_pagetable(domain, 0, end); + dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); iommu_free_vm_domain(domain); free_domain_mem(domain); @@ -3088,7 +3477,7 @@ static int intel_iommu_domain_init(struct iommu_domain *domain) "intel_iommu_domain_init: dmar_domain == NULL\n"); return -ENOMEM; } - if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { + if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { printk(KERN_ERR "intel_iommu_domain_init() failed\n"); vm_domain_exit(dmar_domain); @@ -3123,8 +3512,9 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, old_domain = find_domain(pdev); if (old_domain) { - if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) - vm_domain_remove_one_dev_info(old_domain, pdev); + if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE || + dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) + domain_remove_one_dev_info(old_domain, pdev); else domain_remove_dev_info(old_domain); } @@ -3146,11 +3536,11 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, return -EFAULT; } - ret = domain_context_mapping(dmar_domain, pdev); + ret = domain_add_dev_info(dmar_domain, pdev); if (ret) return ret; - ret = vm_domain_add_dev_info(dmar_domain, pdev); + ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL); return ret; } @@ -3160,7 +3550,7 @@ static void intel_iommu_detach_device(struct iommu_domain *domain, struct dmar_domain *dmar_domain = domain->priv; struct pci_dev *pdev = to_pci_dev(dev); - vm_domain_remove_one_dev_info(dmar_domain, pdev); + domain_remove_one_dev_info(dmar_domain, pdev); } static int intel_iommu_map_range(struct iommu_domain *domain, @@ -3180,7 +3570,7 @@ static int intel_iommu_map_range(struct iommu_domain *domain, if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) prot |= DMA_PTE_SNP; - max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size); + max_addr = iova + size; if (dmar_domain->max_addr < max_addr) { int min_agaw; u64 end; @@ -3198,8 +3588,11 @@ static int intel_iommu_map_range(struct iommu_domain *domain, } dmar_domain->max_addr = max_addr; } - - ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot); + /* Round up size to next multiple of PAGE_SIZE, if it and + the low bits of hpa would take us onto the next page */ + size = aligned_nrpages(hpa, size); + ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, + hpa >> VTD_PAGE_SHIFT, size, prot); return ret; } @@ -3207,15 +3600,15 @@ static void intel_iommu_unmap_range(struct iommu_domain *domain, unsigned long iova, size_t size) { struct dmar_domain *dmar_domain = domain->priv; - dma_addr_t base; - /* The address might not be aligned */ - base = iova & VTD_PAGE_MASK; - size = VTD_PAGE_ALIGN(size); - dma_pte_clear_range(dmar_domain, base, base + size); + if (!size) + return; + + dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT, + (iova + size - 1) >> VTD_PAGE_SHIFT); - if (dmar_domain->max_addr == base + size) - dmar_domain->max_addr = base; + if (dmar_domain->max_addr == iova + size) + dmar_domain->max_addr = iova; } static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, @@ -3225,7 +3618,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, struct dma_pte *pte; u64 phys = 0; - pte = addr_to_dma_pte(dmar_domain, iova); + pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT); if (pte) phys = dma_pte_addr(pte);