/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
 * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 */
#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/sysdev.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>

#include "intel-iommu.h"
#include <asm/proto.h>	/* force_iommu in this header in x86-64 */
#include <asm/cacheflush.h>
#include <asm/iommu.h>
#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
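/*
 * Illustrative note: pdev->class holds the 24-bit PCI class code as
 * base-class:sub-class:prog-if.  A VGA controller reports class
 * 0x030000, so (0x030000 >> 16) == PCI_BASE_CLASS_DISPLAY (0x03) and
 * IS_GFX_DEVICE() is true; an ISA bridge reports class 0x060100, so
 * (0x060100 >> 8) == PCI_CLASS_BRIDGE_ISA (0x0601).
 */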
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1 minute */

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
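/*
 * Worked example (illustrative): with the default 48-bit address width,
 * DOMAIN_MAX_ADDR(48) = ((u64)1 << 48) - 1 = 0x0000ffffffffffff, the
 * highest DMA address a domain can translate.
 */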
static void domain_remove_dev_info(struct dmar_domain *domain);

static int dmar_disabled;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO "Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}

__setup("intel_iommu=", intel_iommu_setup);
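/*
 * Example usage (illustrative): the options above are parsed as a
 * comma-separated list on the kernel command line, e.g.
 *
 *	intel_iommu=off
 *	intel_iommu=igfx_off,forcedac
 */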
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;

static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
{
	unsigned int flags;
	void *vaddr;

	/* trying to avoid low memory issues */
	flags = current->flags & PF_MEMALLOC;
	current->flags |= PF_MEMALLOC;
	vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
	current->flags &= (~PF_MEMALLOC | flags);
	return vaddr;
}
static inline void *alloc_pgtable_page(void)
{
	unsigned int flags;
	void *vaddr;

	/* trying to avoid low memory issues */
	flags = current->flags & PF_MEMALLOC;
	current->flags |= PF_MEMALLOC;
	vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
	current->flags &= (~PF_MEMALLOC | flags);
	return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
static inline void *alloc_domain_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_domain_cache);
}

static inline void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_devinfo_cache);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

struct iova *alloc_iova_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_iova_cache);
}

void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}
static inline void __iommu_flush_cache(
	struct intel_iommu *iommu, void *addr, int size)
{
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(addr, size);
}
/* Gets the context entry for a given bus and devfn */
static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
		u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		context = (struct context_entry *)alloc_pgtable_page();
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		__iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
		phy_addr = virt_to_phys((void *)context);
		set_root_value(root, phy_addr);
		set_root_present(root);
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}
static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (context)
		ret = context_present(context[devfn]);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}
static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (context) {
		context_clear_entry(context[devfn]);
		__iommu_flush_cache(iommu, &context[devfn],
			sizeof(*context));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}
static void free_context_table(struct intel_iommu *iommu)
{
	struct root_entry *root;
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry)
		goto out;
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		root = &iommu->root_entry[i];
		context = get_context_addr_from_root(root);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return 30 + agaw * LEVEL_STRIDE;
}

static inline int width_to_agaw(int width)
{
	return (width - 30) / LEVEL_STRIDE;
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (12 + (level - 1) * LEVEL_STRIDE);
}

static inline int address_level_offset(u64 addr, int level)
{
	return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
}

static inline u64 level_mask(int level)
{
	return ((u64)-1 << level_to_offset_bits(level));
}

static inline u64 level_size(int level)
{
	return ((u64)1 << level_to_offset_bits(level));
}

static inline u64 align_to_level(u64 addr, int level)
{
	return ((addr + level_size(level) - 1) & level_mask(level));
}
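/*
 * Worked example (illustrative): each level decodes LEVEL_STRIDE = 9
 * address bits, starting at bit 12.  For addr = 0x0000123456789000:
 *
 *	level 1: bits 12-20, (addr >> 12) & 0x1ff = 0x189
 *	level 2: bits 21-29, (addr >> 21) & 0x1ff = 0x0b3
 *	level 3: bits 30-38, (addr >> 30) & 0x1ff = 0x0d1
 *	level 4: bits 39-47, (addr >> 39) & 0x1ff = 0x024
 */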
static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
{
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;
	unsigned long flags;

	BUG_ON(!domain->pgd);

	addr &= (((u64)1) << addr_width) - 1;
	parent = domain->pgd;

	spin_lock_irqsave(&domain->mapping_lock, flags);
	while (level > 0) {
		void *tmp_page;

		offset = address_level_offset(addr, level);
		pte = &parent[offset];
		if (level == 1)
			break;

		if (!dma_pte_present(*pte)) {
			tmp_page = alloc_pgtable_page();
			if (!tmp_page) {
				spin_unlock_irqrestore(&domain->mapping_lock,
					flags);
				return NULL;
			}
			__iommu_flush_cache(domain->iommu, tmp_page,
				PAGE_SIZE_4K);
			dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
			/*
			 * higher-level tables always set r/w; the last-level
			 * page table controls read/write
			 */
			dma_set_pte_readable(*pte);
			dma_set_pte_writable(*pte);
			__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
		}
		parent = phys_to_virt(dma_pte_addr(*pte));
		level--;
	}

	spin_unlock_irqrestore(&domain->mapping_lock, flags);
	return pte;
}
/* return the address's pte at a specific level */
static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
		int level)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = address_level_offset(addr, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(*pte))
			break;
		parent = phys_to_virt(dma_pte_addr(*pte));
		total--;
	}
	return NULL;
}
/* clear one page's page table */
static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
{
	struct dma_pte *pte = NULL;

	/* get last level pte */
	pte = dma_addr_level_pte(domain, addr, 1);

	if (pte) {
		dma_clear_pte(*pte);
		__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
	}
}
/* clear last-level ptes; a TLB flush should follow */
static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
{
	int addr_width = agaw_to_width(domain->agaw);

	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;
	/* in case it's a partial page */
	start = PAGE_ALIGN_4K(start);
	end &= PAGE_MASK_4K;

	/* we don't need lock here, nobody else touches the iova range */
	while (start < end) {
		dma_pte_clear_one(domain, start);
		start += PAGE_SIZE_4K;
	}
}
/* free page table pages. last-level ptes should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
	u64 start, u64 end)
{
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *pte;
	int total = agaw_to_level(domain->agaw);
	int level = 2;
	u64 tmp;

	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;

	/* we don't need lock here, nobody else touches the iova range */
	while (level <= total) {
		tmp = align_to_level(start, level);
		if (tmp >= end || (tmp + level_size(level) > end))
			return;
		while (tmp < end) {
			pte = dma_addr_level_pte(domain, tmp, level);
			if (pte) {
				free_pgtable_page(
					phys_to_virt(dma_pte_addr(*pte)));
				dma_clear_pte(*pte);
				__iommu_flush_cache(domain->iommu,
					pte, sizeof(*pte));
			}
			tmp += level_size(level);
		}
		level++;
	}
	/* free pgd */
	if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page();
	if (!root)
		return -ENOMEM;

	__iommu_flush_cache(iommu, root, PAGE_SIZE_4K);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}
#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
{\
	unsigned long start_time = jiffies;\
	while (1) {\
		sts = op(iommu->reg + offset);\
		if (cond)\
			break;\
		if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
			panic("DMAR hardware is malfunctioning\n");\
		cpu_relax();\
	}\
}
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	void *addr;
	u32 cmd, sts;
	unsigned long flag;

	addr = iommu->root_entry;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

	cmd = iommu->gcmd | DMA_GCMD_SRTP;
	writel(cmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_RTPS), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!cap_rwbf(iommu->cap))
		return;
	val = iommu->gcmd | DMA_GCMD_WBF;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(val, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (!(val & DMA_GSTS_WBFS)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static int __iommu_flush_context(struct intel_iommu *iommu,
	u16 did, u16 source_id, u8 function_mask, u64 type,
	int non_present_entry_flush)
{
	u64 val = 0;
	unsigned long flag;

	/*
	 * In the non-present entry flush case, if hardware doesn't cache
	 * non-present entries we do nothing; if it does, we flush entries
	 * of domain 0 (the domain id used to cache any non-present entries).
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
			return 1;
		else
			did = 0;
	}

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* flushing a context entry implicitly flushes the write buffer */
	return 0;
}
static inline int iommu_flush_context_global(struct intel_iommu *iommu,
	int non_present_entry_flush)
{
	return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
		non_present_entry_flush);
}

static inline int iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
	int non_present_entry_flush)
{
	return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
		non_present_entry_flush);
}

static inline int iommu_flush_context_device(struct intel_iommu *iommu,
	u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
{
	return __iommu_flush_context(iommu, did, source_id, function_mask,
		DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
}
/* return value determines if we need a write buffer flush */
static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int size_order, u64 type,
	int non_present_entry_flush)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	/*
	 * In the non-present entry flush case, if hardware doesn't cache
	 * non-present entries we do nothing; if it does, we flush entries
	 * of domain 0 (the domain id used to cache any non-present entries).
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
			return 1;
		else
			did = 0;
	}

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* Note: always flush non-leaf currently */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
	/*
	 * This is probably only meant to be extra safe; it looks like we
	 * could ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
	/* flushing the IOTLB implicitly flushes the write buffer */
	return 0;
}
static inline int iommu_flush_iotlb_global(struct intel_iommu *iommu,
	int non_present_entry_flush)
{
	return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
		non_present_entry_flush);
}

static inline int iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
	int non_present_entry_flush)
{
	return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
		non_present_entry_flush);
}
static int iommu_get_alignment(u64 base, unsigned int size)
{
	int t = 0;
	u64 end;

	end = base + size - 1;
	while (base != end) {
		t++;
		base >>= 1;
		end >>= 1;
	}
	return t;
}
static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int pages, int non_present_entry_flush)
{
	unsigned int align;

	BUG_ON(addr & (~PAGE_MASK_4K));
	BUG_ON(pages == 0);

	/* Fallback to domain selective flush if no PSI support */
	if (!cap_pgsel_inv(iommu->cap))
		return iommu_flush_iotlb_dsi(iommu, did,
			non_present_entry_flush);

	/*
	 * PSI requires the page count to be a power of two (2 ^ x), with
	 * the base address naturally aligned to that size.
	 */
	align = iommu_get_alignment(addr >> PAGE_SHIFT_4K, pages);
	/* Fallback to domain selective flush if size is too big */
	if (align > cap_max_amask_val(iommu->cap))
		return iommu_flush_iotlb_dsi(iommu, did,
			non_present_entry_flush);

	addr >>= PAGE_SHIFT_4K + align;
	addr <<= PAGE_SHIFT_4K + align;

	return __iommu_flush_iotlb(iommu, did, addr, align,
		DMA_TLB_PSI_FLUSH, non_present_entry_flush);
}
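/*
 * Worked example (illustrative, assuming the alignment helper above):
 * a 2-page flush at addr = 0x1000 gives iommu_get_alignment(0x1, 2) = 2,
 * since pfns 0x1 and 0x2 only become equal after two right shifts.  The
 * request is therefore widened to the naturally aligned 2^2-page region
 * at 0x0, i.e. addresses 0x0 - 0x3fff.
 */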
static int iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	spin_lock_irqsave(&iommu->register_lock, flags);
	writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_TES), sts);

	iommu->gcmd |= DMA_GCMD_TE;
	spin_unlock_irqrestore(&iommu->register_lock, flags);
	return 0;
}

static int iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (!(sts & DMA_GSTS_TES)), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
	return 0;
}
/* iommu interrupt handling. Most of it is MSI-like. */

static char *fault_reason_strings[] =
{
	"Present bit in root entry is clear",
	"Present bit in context entry is clear",
	"Invalid context entry",
	"Access beyond MGAW",
	"PTE Write access is not set",
	"PTE Read access is not set",
	"Next page table ptr is invalid",
	"Root table address invalid",
	"Context table ptr is invalid",
	"non-zero reserved fields in RTP",
	"non-zero reserved fields in CTP",
	"non-zero reserved fields in PTE",
};
#define MAX_FAULT_REASON_IDX	(ARRAY_SIZE(fault_reason_strings) - 1)

char *dmar_get_fault_reason(u8 fault_reason)
{
	if (fault_reason > MAX_FAULT_REASON_IDX)
		return fault_reason_strings[MAX_FAULT_REASON_IDX];
	else
		return fault_reason_strings[fault_reason];
}
void dmar_msi_unmask(unsigned int irq)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(0, iommu->reg + DMAR_FECTL_REG);
	/* Read a reg to force flush the posted write */
	readl(iommu->reg + DMAR_FECTL_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void dmar_msi_mask(unsigned int irq)
{
	unsigned long flag;
	struct intel_iommu *iommu = get_irq_data(irq);

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
	/* Read a reg to force flush the posted write */
	readl(iommu->reg + DMAR_FECTL_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void dmar_msi_write(int irq, struct msi_msg *msg)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
	writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
	writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void dmar_msi_read(int irq, struct msi_msg *msg)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
	msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
	msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
	u8 fault_reason, u16 source_id, u64 addr)
{
	char *reason;

	reason = dmar_get_fault_reason(fault_reason);

	printk(KERN_ERR
		"DMAR:[%s] Request device [%02x:%02x.%d] "
		"fault addr %llx \n"
		"DMAR:[fault reason %02d] %s\n",
		(type ? "DMA Read" : "DMA Write"),
		(source_id >> 8), PCI_SLOT(source_id & 0xFF),
		PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
	return 0;
}
#define PRIMARY_FAULT_REG_LEN (16)
static irqreturn_t iommu_page_fault(int irq, void *dev_id)
{
	struct intel_iommu *iommu = dev_id;
	int reg, fault_index;
	u32 fault_status;
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	fault_status = readl(iommu->reg + DMAR_FSTS_REG);

	/* TBD: ignore advanced fault log currently */
	if (!(fault_status & DMA_FSTS_PPF))
		goto clear_overflow;

	fault_index = dma_fsts_fault_record_index(fault_status);
	reg = cap_fault_reg_offset(iommu->cap);
	while (1) {
		u8 fault_reason;
		u16 source_id;
		u64 guest_addr;
		int type;
		u32 data;

		/* highest 32 bits */
		data = readl(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN + 12);
		if (!(data & DMA_FRCD_F))
			break;

		fault_reason = dma_frcd_fault_reason(data);
		type = dma_frcd_type(data);

		data = readl(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN + 8);
		source_id = dma_frcd_source_id(data);

		guest_addr = dmar_readq(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN);
		guest_addr = dma_frcd_page_addr(guest_addr);
		/* clear the fault */
		writel(DMA_FRCD_F, iommu->reg + reg +
			fault_index * PRIMARY_FAULT_REG_LEN + 12);

		spin_unlock_irqrestore(&iommu->register_lock, flag);

		iommu_page_fault_do_one(iommu, type, fault_reason,
				source_id, guest_addr);

		fault_index++;
		if (fault_index > cap_num_fault_regs(iommu->cap))
			fault_index = 0;
		spin_lock_irqsave(&iommu->register_lock, flag);
	}
clear_overflow:
	/* clear primary fault overflow */
	fault_status = readl(iommu->reg + DMAR_FSTS_REG);
	if (fault_status & DMA_FSTS_PFO)
		writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
	return IRQ_HANDLED;
}
int dmar_set_interrupt(struct intel_iommu *iommu)
{
	int irq, ret;

	irq = create_irq();
	if (!irq) {
		printk(KERN_ERR "IOMMU: no free vectors\n");
		return -EINVAL;
	}

	set_irq_data(irq, iommu);
	iommu->irq = irq;

	ret = arch_setup_dmar_msi(irq);
	if (ret) {
		set_irq_data(irq, NULL);
		iommu->irq = 0;
		destroy_irq(irq);
		return ret;
	}

	/* Force the fault register to be cleared */
	iommu_page_fault(irq, iommu);

	ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
	if (ret)
		printk(KERN_ERR "IOMMU: can't request irq\n");
	return ret;
}
static int iommu_init_domains(struct intel_iommu *iommu)
{
	unsigned long ndomains;
	unsigned long nlongs;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("Number of Domains supported <%ld>\n", ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	/* TBD: there might be 64K domains,
	 * consider other allocation for future chips
	 */
	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		printk(KERN_ERR "Allocating domain id array failed\n");
		return -ENOMEM;
	}
	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
			GFP_KERNEL);
	if (!iommu->domains) {
		printk(KERN_ERR "Allocating domain array failed\n");
		kfree(iommu->domain_ids);
		return -ENOMEM;
	}

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain id 0. Hence we need to pre-allocate it.
	 */
	if (cap_caching_mode(iommu->cap))
		set_bit(0, iommu->domain_ids);
	return 0;
}
static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
{
	struct intel_iommu *iommu;
	int map_size;
	u32 ver;
	int ret;

	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
	if (!iommu)
		return NULL;
	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
	if (!iommu->reg) {
		printk(KERN_ERR "IOMMU: can't map the region\n");
		goto error;
	}
	iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
	iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);

	/* the registers might be more than one page */
	map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
		cap_max_fault_reg_offset(iommu->cap));
	map_size = PAGE_ALIGN_4K(map_size);
	if (map_size > PAGE_SIZE_4K) {
		iounmap(iommu->reg);
		iommu->reg = ioremap(drhd->reg_base_addr, map_size);
		if (!iommu->reg) {
			printk(KERN_ERR "IOMMU: can't map the region\n");
			goto error;
		}
	}

	ver = readl(iommu->reg + DMAR_VER_REG);
	pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
		drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
		iommu->cap, iommu->ecap);
	ret = iommu_init_domains(iommu);
	if (ret)
		goto error_unmap;
	spin_lock_init(&iommu->lock);
	spin_lock_init(&iommu->register_lock);

	drhd->iommu = iommu;
	return iommu;
error_unmap:
	iounmap(iommu->reg);
error:
	kfree(iommu);
	return NULL;
}
static void domain_exit(struct dmar_domain *domain);
static void free_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i;

	if (!iommu)
		return;

	i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
	for (; i < cap_ndoms(iommu->cap); ) {
		domain = iommu->domains[i];
		clear_bit(i, iommu->domain_ids);
		domain_exit(domain);
		i = find_next_bit(iommu->domain_ids,
			cap_ndoms(iommu->cap), i+1);
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	if (iommu->irq) {
		set_irq_data(iommu->irq, NULL);
		/* This will mask the irq */
		free_irq(iommu->irq, iommu);
		destroy_irq(iommu->irq);
	}

	kfree(iommu->domains);
	kfree(iommu->domain_ids);

	/* free context mapping */
	free_context_table(iommu);

	if (iommu->reg)
		iounmap(iommu->reg);
	kfree(iommu);
}
static struct dmar_domain *iommu_alloc_domain(struct intel_iommu *iommu)
{
	int num;
	unsigned long ndomains;
	struct dmar_domain *domain;
	unsigned long flags;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	ndomains = cap_ndoms(iommu->cap);

	spin_lock_irqsave(&iommu->lock, flags);
	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		free_domain_mem(domain);
		printk(KERN_ERR "IOMMU: no free domain ids\n");
		return NULL;
	}

	set_bit(num, iommu->domain_ids);
	domain->id = num;
	domain->iommu = iommu;
	iommu->domains[num] = domain;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return domain;
}
static void iommu_free_domain(struct dmar_domain *domain)
{
	unsigned long flags;

	spin_lock_irqsave(&domain->iommu->lock, flags);
	clear_bit(domain->id, domain->iommu->domain_ids);
	spin_unlock_irqrestore(&domain->iommu->lock, flags);
}
static struct iova_domain reserved_iova_list;

static void dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;
	u64 addr, size;

	init_iova_domain(&reserved_iova_list);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova)
		printk(KERN_ERR "Reserve IOAPIC range failed\n");

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			addr = r->start;
			addr &= PAGE_MASK_4K;
			size = r->end - addr;
			size = PAGE_ALIGN_4K(size);
			iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
				IOVA_PFN(size + addr) - 1);
			if (!iova)
				printk(KERN_ERR "Reserve iova failed\n");
		}
	}
}
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad);
	spin_lock_init(&domain->mapping_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain->iommu;
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;
	INIT_LIST_HEAD(&domain->devices);

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page();
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
	return 0;
}
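/*
 * Worked example (illustrative, assuming the AGAW helpers above): for
 * guest_width = DEFAULT_DOMAIN_ADDRESS_WIDTH = 48, (48 - 12) % 9 == 0,
 * so adjust_width = 48 and agaw = width_to_agaw(48) = 2, i.e. a 4-level
 * page table if the hardware supports that AGAW.
 */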
static void domain_exit(struct dmar_domain *domain)
{
	u64 end;

	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	domain_remove_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);
	end = DOMAIN_MAX_ADDR(domain->gaw);
	end = end & PAGE_MASK_4K;

	/* clear ptes */
	dma_pte_clear_range(domain, 0, end);

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, end);

	iommu_free_domain(domain);
	free_domain_mem(domain);
}
static int domain_context_mapping_one(struct dmar_domain *domain,
	u8 bus, u8 devfn)
{
	struct context_entry *context;
	struct intel_iommu *iommu = domain->iommu;
	unsigned long flags;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
	BUG_ON(!domain->pgd);
	context = device_to_context_entry(iommu, bus, devfn);
	if (!context)
		return -ENOMEM;
	spin_lock_irqsave(&iommu->lock, flags);
	if (context_present(*context)) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	context_set_domain_id(*context, domain->id);
	context_set_address_width(*context, domain->agaw);
	context_set_address_root(*context, virt_to_phys(domain->pgd));
	context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(*context);
	context_set_present(*context);
	__iommu_flush_cache(iommu, context, sizeof(*context));

	/* it's a non-present to present mapping */
	if (iommu_flush_context_device(iommu, domain->id,
			(((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
		iommu_flush_write_buffer(iommu);
	else
		iommu_flush_iotlb_dsi(iommu, 0, 0);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return 0;
}
static int
domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;

	ret = domain_context_mapping_one(domain, pdev->bus->number,
		pdev->devfn);
	if (ret)
		return ret;

	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return 0;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = domain_context_mapping_one(domain, parent->bus->number,
			parent->devfn);
		if (ret)
			return ret;
		parent = parent->bus->self;
	}
	if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
		return domain_context_mapping_one(domain,
			tmp->subordinate->number, 0);
	else /* this is a legacy PCI bridge */
		return domain_context_mapping_one(domain,
			tmp->bus->number, tmp->devfn);
}
static int domain_context_mapped(struct dmar_domain *domain,
	struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;

	ret = device_context_mapped(domain->iommu,
		pdev->bus->number, pdev->devfn);
	if (!ret)
		return ret;
	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return ret;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = device_context_mapped(domain->iommu, parent->bus->number,
			parent->devfn);
		if (!ret)
			return ret;
		parent = parent->bus->self;
	}
	if (tmp->is_pcie)
		return device_context_mapped(domain->iommu,
			tmp->subordinate->number, 0);
	else
		return device_context_mapped(domain->iommu,
			tmp->bus->number, tmp->devfn);
}
static int
domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
	u64 hpa, size_t size, int prot)
{
	u64 start_pfn, end_pfn;
	struct dma_pte *pte;
	int index;

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;
	iova &= PAGE_MASK_4K;
	start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
	end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
	index = 0;
	while (start_pfn < end_pfn) {
		pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
		if (!pte)
			return -ENOMEM;
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		BUG_ON(dma_pte_addr(*pte));
		dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
		dma_set_pte_prot(*pte, prot);
		__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
		start_pfn++;
		index++;
	}
	return 0;
}
static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
{
	clear_context_table(domain->iommu, bus, devfn);
	iommu_flush_context_global(domain->iommu, 0);
	iommu_flush_iotlb_global(domain->iommu, 0);
}

static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->sysdata = NULL;
		spin_unlock_irqrestore(&device_domain_lock, flags);

		detach_domain_for_dev(info->domain, info->bus, info->devfn);
		free_devinfo_mem(info);

		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
/*
 * find_domain
 * Note: we use struct pci_dev->sysdata to store the domain info
 */
struct dmar_domain *
find_domain(struct pci_dev *pdev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = pdev->sysdata;
	if (info)
		return info->domain;
	return NULL;
}
static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
	struct pci_dev *dev)
{
	int index;

	while (dev) {
		for (index = 0; index < cnt; index++)
			if (dev == devices[index])
				return 1;

		/* Check our parent */
		dev = dev->bus->self;
	}

	return 0;
}

static struct dmar_drhd_unit *
dmar_find_matched_drhd_unit(struct pci_dev *dev)
{
	struct dmar_drhd_unit *drhd = NULL;

	list_for_each_entry(drhd, &dmar_drhd_units, list) {
		if (drhd->include_all || dmar_pci_device_match(drhd->devices,
						drhd->devices_cnt, dev))
			return drhd;
	}

	return NULL;
}
/* domain is initialized */
static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
{
	struct dmar_domain *domain, *found = NULL;
	struct intel_iommu *iommu;
	struct dmar_drhd_unit *drhd;
	struct device_domain_info *info, *tmp;
	struct pci_dev *dev_tmp;
	unsigned long flags;
	int bus = 0, devfn = 0;

	domain = find_domain(pdev);
	if (domain)
		return domain;

	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
	if (dev_tmp) {
		if (dev_tmp->is_pcie) {
			bus = dev_tmp->subordinate->number;
			devfn = 0;
		} else {
			bus = dev_tmp->bus->number;
			devfn = dev_tmp->devfn;
		}
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(info, &device_domain_list, global) {
			if (info->bus == bus && info->devfn == devfn) {
				found = info->domain;
				break;
			}
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		/* pcie-pci bridge already has a domain, use it */
		if (found) {
			domain = found;
			goto found_domain;
		}
	}

	/* Allocate a new domain for the device */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd) {
		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
			pci_name(pdev));
		return NULL;
	}
	iommu = drhd->iommu;

	domain = iommu_alloc_domain(iommu);
	if (!domain)
		goto error;

	if (domain_init(domain, gaw)) {
		domain_exit(domain);
		goto error;
	}

	/* register pcie-to-pci device */
	if (dev_tmp) {
		info = alloc_devinfo_mem();
		if (!info) {
			domain_exit(domain);
			goto error;
		}
		info->bus = bus;
		info->devfn = devfn;
		info->dev = NULL;
		info->domain = domain;
		/* This domain is shared by devices under p2p bridge */
		domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;

		/* pcie-to-pci bridge already has a domain, use it */
		found = NULL;
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(tmp, &device_domain_list, global) {
			if (tmp->bus == bus && tmp->devfn == devfn) {
				found = tmp->domain;
				break;
			}
		}
		if (found) {
			free_devinfo_mem(info);
			domain_exit(domain);
			domain = found;
		} else {
			list_add(&info->link, &domain->devices);
			list_add(&info->global, &device_domain_list);
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
	}

found_domain:
	info = alloc_devinfo_mem();
	if (!info)
		goto error;
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;
	spin_lock_irqsave(&device_domain_lock, flags);
	/* somebody else may have beaten us to it */
	found = find_domain(pdev);
	if (found != NULL) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		if (found != domain) {
			domain_exit(domain);
			domain = found;
		}
		free_devinfo_mem(info);
		return domain;
	}
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->sysdata = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);
	return domain;
error:
	/* recheck it here, maybe someone else set it */
	return find_domain(pdev);
}
static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
{
	struct dmar_domain *domain;
	unsigned long size;
	u64 base;
	int ret;

	printk(KERN_INFO
		"IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
		pci_name(pdev), start, end);
	/* page table init */
	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;

	/* The address might not be aligned */
	base = start & PAGE_MASK_4K;
	size = end - base;
	size = PAGE_ALIGN_4K(size);
	if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
			IOVA_PFN(base + size) - 1)) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		ret = -ENOMEM;
		goto error;
	}

	pr_debug("Mapping reserved region %lx@%llx for %s\n",
		size, base, pci_name(pdev));
	/*
	 * The RMRR range might overlap a physical memory range;
	 * clear it first
	 */
	dma_pte_clear_range(domain, base, base + size);

	ret = domain_page_mapping(domain, base, base, size,
		DMA_PTE_READ|DMA_PTE_WRITE);
	if (ret)
		goto error;

	/* context entry init */
	ret = domain_context_mapping(domain, pdev);
	if (!ret)
		return 0;
error:
	domain_exit(domain);
	return ret;
}
static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
	struct pci_dev *pdev)
{
	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
		return 0;
	return iommu_prepare_identity_map(pdev, rmrr->base_address,
		rmrr->end_address + 1);
}
#ifdef CONFIG_DMAR_GFX_WA
extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
static void __init iommu_prepare_gfx_mapping(void)
{
	struct pci_dev *pdev = NULL;
	u64 base, size;
	int slot;
	int ret;

	for_each_pci_dev(pdev) {
		if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO ||
				!IS_GFX_DEVICE(pdev))
			continue;
		printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
			pci_name(pdev));
		slot = arch_get_ram_range(0, &base, &size);
		while (slot >= 0) {
			ret = iommu_prepare_identity_map(pdev,
					base, base + size);
			if (ret)
				goto error;
			slot = arch_get_ram_range(slot, &base, &size);
		}
		continue;
error:
		printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
	}
}
#else /* !CONFIG_DMAR_GFX_WA */
static inline void iommu_prepare_gfx_mapping(void)
{
	return;
}
#endif
#ifdef CONFIG_DMAR_FLOPPY_WA
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
			"floppy might not work\n");
}
#else
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */
int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	struct pci_dev *pdev;
	struct intel_iommu *iommu;
	int i, ret, unit = 0;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = alloc_iommu(drhd);
		if (!iommu) {
			ret = -ENOMEM;
			goto error;
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMUs; need to split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret) {
			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
			goto error;
		}
	}

	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *           allocate and init context
	 *           set present in root table for this bus
	 *     init context with domain, translation etc
	 *   endfor
	 * endfor
	 */
	for_each_rmrr_units(rmrr) {
		for (i = 0; i < rmrr->devices_cnt; i++) {
			pdev = rmrr->devices[i];
			/* some BIOSes list non-existent devices in the DMAR table */
			if (!pdev)
				continue;
			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
			if (ret)
				printk(KERN_ERR
				 "IOMMU: mapping reserved region failed\n");
		}
	}

	iommu_prepare_gfx_mapping();

	iommu_prepare_isa();

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;
		sprintf(iommu->name, "dmar%d", unit++);

		iommu_flush_write_buffer(iommu);

		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto error;

		iommu_set_root_entry(iommu);

		iommu_flush_context_global(iommu, 0);
		iommu_flush_iotlb_global(iommu, 0);

		ret = iommu_enable_translation(iommu);
		if (ret)
			goto error;
	}

	return 0;
error:
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;
		free_iommu(iommu);
	}
	return ret;
}
static inline u64 aligned_size(u64 host_addr, size_t size)
{
	u64 addr;
	addr = (host_addr & (~PAGE_MASK_4K)) + size;
	return PAGE_ALIGN_4K(addr);
}
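/*
 * Worked example (illustrative): host_addr = 0x1234, size = 0x2000.
 * The in-page offset is 0x234, so aligned_size() returns
 * PAGE_ALIGN_4K(0x234 + 0x2000) = 0x3000: three 4K pages are needed
 * to cover the buffer.
 */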
static struct iova *
iommu_alloc_iova(struct dmar_domain *domain, void *host_addr, size_t size,
	u64 start, u64 end)
{
	u64 start_addr;
	struct iova *piova;

	/* Make sure it's in range */
	if ((start > DOMAIN_MAX_ADDR(domain->gaw)) || end < start)
		return NULL;

	end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
	start_addr = PAGE_ALIGN_4K(start);
	size = aligned_size((u64)host_addr, size);
	if (!size || (start_addr + size > end))
		return NULL;

	piova = alloc_iova(&domain->iovad,
		size >> PAGE_SHIFT_4K, IOVA_PFN(end));
	return piova;
}
static dma_addr_t __intel_map_single(struct device *dev, void *addr,
	size_t size, int dir, u64 *flush_addr, unsigned int *flush_size)
{
	struct dmar_domain *domain;
	struct pci_dev *pdev = to_pci_dev(dev);
	int ret;
	int prot = 0;
	struct iova *iova = NULL;
	u64 start_addr;

	addr = (void *)virt_to_phys(addr);

	domain = get_domain_for_dev(pdev,
		DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain) {
		printk(KERN_ERR
			"Allocating domain for %s failed", pci_name(pdev));
		return 0;
	}

	start_addr = IOVA_START_ADDR;

	if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
		iova = iommu_alloc_iova(domain, addr, size, start_addr,
			pdev->dma_mask);
	} else {
		/*
		 * First try to allocate an io virtual address in
		 * DMA_32BIT_MASK and if that fails then try allocating
		 * from the higher range
		 */
		iova = iommu_alloc_iova(domain, addr, size, start_addr,
			DMA_32BIT_MASK);
		if (!iova)
			iova = iommu_alloc_iova(domain, addr, size, start_addr,
				pdev->dma_mask);
	}

	if (!iova) {
		printk(KERN_ERR "Allocating iova for %s failed", pci_name(pdev));
		return 0;
	}

	/* make sure context mapping is ok */
	if (unlikely(!domain_context_mapped(domain, pdev))) {
		ret = domain_context_mapping(domain, pdev);
		if (ret)
			goto error;
	}

	/*
	 * Check if DMAR supports zero-length reads on write-only
	 * mappings
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
			!cap_zlr(domain->iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * addr .. (addr + size) might span partial pages; we should map the
	 * whole page.  Note: if two parts of one page are separately mapped,
	 * we might have two guest addresses mapping to the same host address,
	 * but this is not a big problem
	 */
	ret = domain_page_mapping(domain, iova->pfn_lo << PAGE_SHIFT_4K,
		((u64)addr) & PAGE_MASK_4K,
		(iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K, prot);
	if (ret)
		goto error;

	pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
		pci_name(pdev), size, (u64)addr,
		(iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K,
		(u64)(iova->pfn_lo << PAGE_SHIFT_4K), dir);

	*flush_addr = iova->pfn_lo << PAGE_SHIFT_4K;
	*flush_size = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K;
	return (iova->pfn_lo << PAGE_SHIFT_4K) + ((u64)addr & (~PAGE_MASK_4K));
error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
		pci_name(pdev), size, (u64)addr, dir);
	return 0;
}
static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
	size_t size, int dir)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	dma_addr_t ret;
	struct dmar_domain *domain;
	u64 flush_addr;
	unsigned int flush_size;

	BUG_ON(dir == DMA_NONE);
	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
		return virt_to_bus(addr);

	ret = __intel_map_single(hwdev, addr, size,
		dir, &flush_addr, &flush_size);
	if (ret) {
		domain = find_domain(pdev);
		/* it's a non-present to present mapping */
		if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
				flush_addr, flush_size >> PAGE_SHIFT_4K, 1))
			iommu_flush_write_buffer(domain->iommu);
	}
	return ret;
}
static void __intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
	size_t size, int dir, u64 *flush_addr, unsigned int *flush_size)
{
	struct dmar_domain *domain;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iova *iova;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	if (!iova) {
		*flush_size = 0;
		return;
	}
	pr_debug("Device %s unmapping: %lx@%llx\n",
		pci_name(pdev),
		(iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K,
		(u64)(iova->pfn_lo << PAGE_SHIFT_4K));

	*flush_addr = iova->pfn_lo << PAGE_SHIFT_4K;
	*flush_size = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K;
	/* clear the whole page, not just dev_addr .. (dev_addr + size) */
	dma_pte_clear_range(domain, *flush_addr, *flush_addr + *flush_size);
	/* free page tables */
	dma_pte_free_pagetable(domain, *flush_addr, *flush_addr + *flush_size);
	/* free iova */
	__free_iova(&domain->iovad, iova);
}
static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
	size_t size, int dir)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	u64 flush_addr;
	unsigned int flush_size;

	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
		return;

	domain = find_domain(pdev);
	__intel_unmap_single(dev, dev_addr, size,
		dir, &flush_addr, &flush_size);
	if (flush_size == 0)
		return;
	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, flush_addr,
			flush_size >> PAGE_SHIFT_4K, 0))
		iommu_flush_write_buffer(domain->iommu);
}
static void *intel_alloc_coherent(struct device *hwdev, size_t size,
	dma_addr_t *dma_handle, gfp_t flags)
{
	void *vaddr;
	int order;

	size = PAGE_ALIGN_4K(size);
	order = get_order(size);
	flags &= ~(GFP_DMA | GFP_DMA32);

	vaddr = (void *)__get_free_pages(flags, order);
	if (!vaddr)
		return NULL;
	memset(vaddr, 0, size);

	*dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
	if (*dma_handle)
		return vaddr;
	free_pages((unsigned long)vaddr, order);
	return NULL;
}

static void intel_free_coherent(struct device *hwdev, size_t size,
	void *vaddr, dma_addr_t dma_handle)
{
	int order;

	size = PAGE_ALIGN_4K(size);
	order = get_order(size);

	intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
	free_pages((unsigned long)vaddr, order);
}
static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sg,
	int nelems, int dir)
{
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	u64 flush_addr;
	unsigned int flush_size;

	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
		return;

	domain = find_domain(pdev);
	for (i = 0; i < nelems; i++, sg++)
		__intel_unmap_single(hwdev, sg->dma_address,
			sg->dma_length, dir, &flush_addr, &flush_size);

	if (iommu_flush_iotlb_dsi(domain->iommu, domain->id, 0))
		iommu_flush_write_buffer(domain->iommu);
}
#define SG_ENT_VIRT_ADDRESS(sg) (page_address((sg)->page) + (sg)->offset)
static int intel_nontranslate_map_sg(struct device *hddev,
	struct scatterlist *sg, int nelems, int dir)
{
	int i;

	for (i = 0; i < nelems; i++) {
		struct scatterlist *s = &sg[i];
		s->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(s));
		s->dma_length = s->length;
	}
	return nelems;
}
static int intel_map_sg(struct device *hwdev, struct scatterlist *sg,
	int nelems, int dir)
{
	void *addr;
	int i;
	dma_addr_t dma_handle;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	u64 flush_addr;
	unsigned int flush_size;

	BUG_ON(dir == DMA_NONE);
	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
		return intel_nontranslate_map_sg(hwdev, sg, nelems, dir);

	for (i = 0; i < nelems; i++, sg++) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		dma_handle = __intel_map_single(hwdev, addr,
			sg->length, dir, &flush_addr, &flush_size);
		if (!dma_handle) {
			intel_unmap_sg(hwdev, sg - i, i, dir);
			sg[0].dma_length = 0;
			return 0;
		}
		sg->dma_address = dma_handle;
		sg->dma_length = sg->length;
	}

	domain = find_domain(pdev);

	/* it's a non-present to present mapping */
	if (iommu_flush_iotlb_dsi(domain->iommu, domain->id, 1))
		iommu_flush_write_buffer(domain->iommu);
	return nelems;
}
static struct dma_mapping_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_single = intel_map_single,
	.unmap_single = intel_unmap_single,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
};
static inline int iommu_domain_cache_init(void)
{
	int ret = 0;

	iommu_domain_cache = kmem_cache_create("iommu_domain",
		sizeof(struct dmar_domain), 0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_domain_cache) {
		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_devinfo_cache_init(void)
{
	int ret = 0;

	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
		sizeof(struct device_domain_info), 0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_devinfo_cache) {
		printk(KERN_ERR "Couldn't create devinfo cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_iova_cache_init(void)
{
	int ret = 0;

	iommu_iova_cache = kmem_cache_create("iommu_iova",
		sizeof(struct iova), 0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_iova_cache) {
		printk(KERN_ERR "Couldn't create iova cache\n");
		ret = -ENOMEM;
	}

	return ret;
}
static int __init iommu_init_mempool(void)
{
	int ret;

	ret = iommu_iova_cache_init();
	if (ret)
		return ret;

	ret = iommu_domain_cache_init();
	if (ret)
		goto domain_error;

	ret = iommu_devinfo_cache_init();
	if (!ret)
		return ret;

	kmem_cache_destroy(iommu_domain_cache);
domain_error:
	kmem_cache_destroy(iommu_iova_cache);

	return -ENOMEM;
}

static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);
}
void __init detect_intel_iommu(void)
{
	if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
		return;
	if (early_dmar_detect())
		iommu_detected = 1;
}
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			int i;
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
					break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	if (dmar_map_gfx)
		return;

	for_each_drhd_unit(drhd) {
		int i;
		if (drhd->ignored || drhd->include_all)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
				!IS_GFX_DEVICE(drhd->devices[i]))
				break;

		if (i < drhd->devices_cnt)
			continue;

		/* bypass IOMMU if it is just for gfx devices */
		drhd->ignored = 1;
		for (i = 0; i < drhd->devices_cnt; i++) {
			if (!drhd->devices[i])
				continue;
			drhd->devices[i]->sysdata = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
int __init intel_iommu_init(void)
{
	int ret = 0;

	if (no_iommu || swiotlb || dmar_disabled)
		return -ENODEV;

	if (dmar_table_init())
		return -ENODEV;

	iommu_init_mempool();
	dmar_init_reserved_ranges();

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		printk(KERN_ERR "IOMMU: dmar init failed\n");
		put_iova_domain(&reserved_iova_list);
		iommu_exit_mempool();
		return ret;
	}
	printk(KERN_INFO
	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

	force_iommu = 1;
	dma_ops = &intel_dma_ops;
	return 0;
}