From: Linas Vepstas Date: Mon, 30 Oct 2006 05:15:59 +0000 (+1100) Subject: [POWERPC] Use 4kB iommu pages even on 64kB-page systems X-Git-Tag: v2.6.19-rc5~75^2~6 X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5d2efba64b231a1733c4048d1708d77e07f26426;p=linux-2.6 [POWERPC] Use 4kB iommu pages even on 64kB-page systems The 10Gigabit ethernet device drivers appear to be able to chew up all 256MB of TCE mappings on pSeries systems, as evidenced by numerous error messages: iommu_alloc failed, tbl c0000000010d5c48 vaddr c0000000d875eff0 npages 1 Some experimentation indicates that this is essentially because one 1500 byte ethernet MTU gets mapped as a 64K DMA region when the large 64K pages are enabled. Thus, it doesn't take much to exhaust all of the available DMA mappings for a high-speed card. This patch changes the iommu allocator to work with its own unique, distinct page size. Although the patch is long, its actually quite simple: it just #defines a distinct IOMMU_PAGE_SIZE and then uses this in all the places that matter. As a side effect, it also dramatically improves network performance on platforms with H-calls on iommu translation inserts/removes (since we no longer call it 16 times for a 1500 bytes packet when the iommu HW is still 4k). In the future, we might want to make the IOMMU_PAGE_SIZE a variable in the iommu_table instance, thus allowing support for different HW page sizes in the iommu itself. Signed-off-by: Linas Vepstas Signed-off-by: Benjamin Herrenschmidt Acked-by: Olof Johansson Acked-by: Stephen Rothwell Signed-off-by: Paul Mackerras --- diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index f88a2a675d..ba6b725608 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -47,6 +47,17 @@ static int novmerge = 0; static int novmerge = 1; #endif +static inline unsigned long iommu_num_pages(unsigned long vaddr, + unsigned long slen) +{ + unsigned long npages; + + npages = IOMMU_PAGE_ALIGN(vaddr + slen) - (vaddr & IOMMU_PAGE_MASK); + npages >>= IOMMU_PAGE_SHIFT; + + return npages; +} + static int __init setup_iommu(char *str) { if (!strcmp(str, "novmerge")) @@ -178,10 +189,10 @@ static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page, } entry += tbl->it_offset; /* Offset into real TCE table */ - ret = entry << PAGE_SHIFT; /* Set the return dma address */ + ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */ /* Put the TCEs in the HW table */ - ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & PAGE_MASK, + ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK, direction); @@ -203,7 +214,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned long entry, free_entry; unsigned long i; - entry = dma_addr >> PAGE_SHIFT; + entry = dma_addr >> IOMMU_PAGE_SHIFT; free_entry = entry - tbl->it_offset; if (((free_entry + npages) > tbl->it_size) || @@ -270,7 +281,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl, /* Init first segment length for backout at failure */ outs->dma_length = 0; - DBG("mapping %d elements:\n", nelems); + DBG("sg mapping %d elements:\n", nelems); spin_lock_irqsave(&(tbl->it_lock), flags); @@ -285,9 +296,8 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl, } /* Allocate iommu entries for that segment */ vaddr = (unsigned long)page_address(s->page) + s->offset; - npages = PAGE_ALIGN(vaddr + slen) - (vaddr & PAGE_MASK); - npages >>= PAGE_SHIFT; - entry = iommu_range_alloc(tbl, npages, &handle, mask >> PAGE_SHIFT, 0); + npages = iommu_num_pages(vaddr, slen); + entry = iommu_range_alloc(tbl, npages, &handle, mask >> IOMMU_PAGE_SHIFT, 0); DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); @@ -301,14 +311,14 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl, /* Convert entry to a dma_addr_t */ entry += tbl->it_offset; - dma_addr = entry << PAGE_SHIFT; - dma_addr |= s->offset; + dma_addr = entry << IOMMU_PAGE_SHIFT; + dma_addr |= (s->offset & ~IOMMU_PAGE_MASK); - DBG(" - %lx pages, entry: %lx, dma_addr: %lx\n", + DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n", npages, entry, dma_addr); /* Insert into HW table */ - ppc_md.tce_build(tbl, entry, npages, vaddr & PAGE_MASK, direction); + ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK, direction); /* If we are in an open segment, try merging */ if (segstart != s) { @@ -323,7 +333,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl, DBG(" can't merge, new segment.\n"); } else { outs->dma_length += s->length; - DBG(" merged, new len: %lx\n", outs->dma_length); + DBG(" merged, new len: %ux\n", outs->dma_length); } } @@ -367,9 +377,8 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl, if (s->dma_length != 0) { unsigned long vaddr, npages; - vaddr = s->dma_address & PAGE_MASK; - npages = (PAGE_ALIGN(s->dma_address + s->dma_length) - vaddr) - >> PAGE_SHIFT; + vaddr = s->dma_address & IOMMU_PAGE_MASK; + npages = iommu_num_pages(s->dma_address, s->dma_length); __iommu_free(tbl, vaddr, npages); s->dma_address = DMA_ERROR_CODE; s->dma_length = 0; @@ -398,8 +407,7 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, if (sglist->dma_length == 0) break; - npages = (PAGE_ALIGN(dma_handle + sglist->dma_length) - - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT; + npages = iommu_num_pages(dma_handle,sglist->dma_length); __iommu_free(tbl, dma_handle, npages); sglist++; } @@ -532,12 +540,11 @@ dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr, BUG_ON(direction == DMA_NONE); uaddr = (unsigned long)vaddr; - npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK); - npages >>= PAGE_SHIFT; + npages = iommu_num_pages(uaddr, size); if (tbl) { dma_handle = iommu_alloc(tbl, vaddr, npages, direction, - mask >> PAGE_SHIFT, 0); + mask >> IOMMU_PAGE_SHIFT, 0); if (dma_handle == DMA_ERROR_CODE) { if (printk_ratelimit()) { printk(KERN_INFO "iommu_alloc failed, " @@ -545,7 +552,7 @@ dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr, tbl, vaddr, npages); } } else - dma_handle |= (uaddr & ~PAGE_MASK); + dma_handle |= (uaddr & ~IOMMU_PAGE_MASK); } return dma_handle; @@ -554,11 +561,14 @@ dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr, void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) { + unsigned int npages; + BUG_ON(direction == DMA_NONE); - if (tbl) - iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) - - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT); + if (tbl) { + npages = iommu_num_pages(dma_handle, size); + iommu_free(tbl, dma_handle, npages); + } } /* Allocates a contiguous real buffer and creates mappings over it. @@ -570,11 +580,11 @@ void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size, { void *ret = NULL; dma_addr_t mapping; - unsigned int npages, order; + unsigned int order; + unsigned int nio_pages, io_order; struct page *page; size = PAGE_ALIGN(size); - npages = size >> PAGE_SHIFT; order = get_order(size); /* @@ -598,8 +608,10 @@ void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size, memset(ret, 0, size); /* Set up tces to cover the allocated range */ - mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL, - mask >> PAGE_SHIFT, order); + nio_pages = size >> IOMMU_PAGE_SHIFT; + io_order = get_iommu_order(size); + mapping = iommu_alloc(tbl, ret, nio_pages, DMA_BIDIRECTIONAL, + mask >> IOMMU_PAGE_SHIFT, io_order); if (mapping == DMA_ERROR_CODE) { free_pages((unsigned long)ret, order); return NULL; @@ -611,12 +623,13 @@ void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size, void iommu_free_coherent(struct iommu_table *tbl, size_t size, void *vaddr, dma_addr_t dma_handle) { - unsigned int npages; - if (tbl) { + unsigned int nio_pages; + + size = PAGE_ALIGN(size); + nio_pages = size >> IOMMU_PAGE_SHIFT; + iommu_free(tbl, dma_handle, nio_pages); size = PAGE_ALIGN(size); - npages = size >> PAGE_SHIFT; - iommu_free(tbl, dma_handle, npages); free_pages((unsigned long)vaddr, get_order(size)); } } diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index cb87e71eec..ed007878d1 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -92,9 +92,9 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) &tbl->it_index, &offset, &size); /* TCE table size - measured in tce entries */ - tbl->it_size = size >> PAGE_SHIFT; + tbl->it_size = size >> IOMMU_PAGE_SHIFT; /* offset for VIO should always be 0 */ - tbl->it_offset = offset >> PAGE_SHIFT; + tbl->it_offset = offset >> IOMMU_PAGE_SHIFT; tbl->it_busno = 0; tbl->it_type = TCE_VB; diff --git a/arch/powerpc/platforms/iseries/iommu.c b/arch/powerpc/platforms/iseries/iommu.c index f4cbbcf877..218817d13c 100644 --- a/arch/powerpc/platforms/iseries/iommu.c +++ b/arch/powerpc/platforms/iseries/iommu.c @@ -43,9 +43,6 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages, u64 rc; u64 tce, rpn; - index <<= TCE_PAGE_FACTOR; - npages <<= TCE_PAGE_FACTOR; - while (npages--) { rpn = virt_to_abs(uaddr) >> TCE_SHIFT; tce = (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; @@ -75,9 +72,6 @@ static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages) { u64 rc; - npages <<= TCE_PAGE_FACTOR; - index <<= TCE_PAGE_FACTOR; - while (npages--) { rc = HvCallXm_setTce((u64)tbl->it_index, (u64)index, 0); if (rc) @@ -136,10 +130,9 @@ void iommu_table_getparms_iSeries(unsigned long busno, panic("PCI_DMA: parms->size is zero, parms is 0x%p", parms); /* itc_size is in pages worth of table, it_size is in # of entries */ - tbl->it_size = ((parms->itc_size * TCE_PAGE_SIZE) / - TCE_ENTRY_SIZE) >> TCE_PAGE_FACTOR; + tbl->it_size = (parms->itc_size * TCE_PAGE_SIZE) / TCE_ENTRY_SIZE; tbl->it_busno = parms->itc_busno; - tbl->it_offset = parms->itc_offset >> TCE_PAGE_FACTOR; + tbl->it_offset = parms->itc_offset; tbl->it_index = parms->itc_index; tbl->it_blocksize = 1; tbl->it_type = virtbus ? TCE_VB : TCE_PCI; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index d24ba547e5..556c279a78 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -57,9 +57,6 @@ static void tce_build_pSeries(struct iommu_table *tbl, long index, u64 *tcep; u64 rpn; - index <<= TCE_PAGE_FACTOR; - npages <<= TCE_PAGE_FACTOR; - proto_tce = TCE_PCI_READ; // Read allowed if (direction != DMA_TO_DEVICE) @@ -82,9 +79,6 @@ static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages) { u64 *tcep; - npages <<= TCE_PAGE_FACTOR; - index <<= TCE_PAGE_FACTOR; - tcep = ((u64 *)tbl->it_base) + index; while (npages--) @@ -95,7 +89,6 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) { u64 *tcep; - index <<= TCE_PAGE_FACTOR; tcep = ((u64 *)tbl->it_base) + index; return *tcep; @@ -109,9 +102,6 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, u64 proto_tce, tce; u64 rpn; - tcenum <<= TCE_PAGE_FACTOR; - npages <<= TCE_PAGE_FACTOR; - rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) @@ -146,7 +136,7 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, u64 rpn; long l, limit; - if (TCE_PAGE_FACTOR == 0 && npages == 1) + if (npages == 1) return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction); @@ -164,9 +154,6 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, __get_cpu_var(tce_page) = tcep; } - tcenum <<= TCE_PAGE_FACTOR; - npages <<= TCE_PAGE_FACTOR; - rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) @@ -207,9 +194,6 @@ static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages { u64 rc; - tcenum <<= TCE_PAGE_FACTOR; - npages <<= TCE_PAGE_FACTOR; - while (npages--) { rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0); @@ -229,9 +213,6 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n { u64 rc; - tcenum <<= TCE_PAGE_FACTOR; - npages <<= TCE_PAGE_FACTOR; - rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages); if (rc && printk_ratelimit()) { @@ -248,7 +229,6 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) u64 rc; unsigned long tce_ret; - tcenum <<= TCE_PAGE_FACTOR; rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret); if (rc && printk_ratelimit()) { @@ -289,7 +269,7 @@ static void iommu_table_setparms(struct pci_controller *phb, tbl->it_busno = phb->bus->number; /* Units of tce entries */ - tbl->it_offset = phb->dma_window_base_cur >> PAGE_SHIFT; + tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT; /* Test if we are going over 2GB of DMA space */ if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) { @@ -300,7 +280,7 @@ static void iommu_table_setparms(struct pci_controller *phb, phb->dma_window_base_cur += phb->dma_window_size; /* Set the tce table size - measured in entries */ - tbl->it_size = phb->dma_window_size >> PAGE_SHIFT; + tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT; tbl->it_index = 0; tbl->it_blocksize = 16; @@ -325,8 +305,8 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb, tbl->it_base = 0; tbl->it_blocksize = 16; tbl->it_type = TCE_PCI; - tbl->it_offset = offset >> PAGE_SHIFT; - tbl->it_size = size >> PAGE_SHIFT; + tbl->it_offset = offset >> IOMMU_PAGE_SHIFT; + tbl->it_size = size >> IOMMU_PAGE_SHIFT; } static void iommu_bus_setup_pSeries(struct pci_bus *bus) @@ -522,8 +502,6 @@ static void iommu_dev_setup_pSeriesLP(struct pci_dev *dev) const void *dma_window = NULL; struct pci_dn *pci; - DBG("iommu_dev_setup_pSeriesLP, dev %p (%s)\n", dev, pci_name(dev)); - /* dev setup for LPAR is a little tricky, since the device tree might * contain the dma-window properties per-device and not neccesarily * for the bus. So we need to search upwards in the tree until we @@ -532,6 +510,9 @@ static void iommu_dev_setup_pSeriesLP(struct pci_dev *dev) */ dn = pci_device_to_OF_node(dev); + DBG("iommu_dev_setup_pSeriesLP, dev %p (%s) %s\n", + dev, pci_name(dev), dn->full_name); + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; pdn = pdn->parent) { dma_window = get_property(pdn, "ibm,dma-window", NULL); diff --git a/arch/powerpc/sysdev/dart.h b/arch/powerpc/sysdev/dart.h index 1c8817c483..ff202edb05 100644 --- a/arch/powerpc/sysdev/dart.h +++ b/arch/powerpc/sysdev/dart.h @@ -72,7 +72,6 @@ #define DART_PAGE_SHIFT 12 #define DART_PAGE_SIZE (1 << DART_PAGE_SHIFT) -#define DART_PAGE_FACTOR (PAGE_SHIFT - DART_PAGE_SHIFT) #endif /* _POWERPC_SYSDEV_DART_H */ diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index 03b4477dd7..572b7846cc 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -156,9 +156,6 @@ static void dart_build(struct iommu_table *tbl, long index, DBG("dart: build at: %lx, %lx, addr: %x\n", index, npages, uaddr); - index <<= DART_PAGE_FACTOR; - npages <<= DART_PAGE_FACTOR; - dp = ((unsigned int*)tbl->it_base) + index; /* On U3, all memory is contigous, so we can move this @@ -199,9 +196,6 @@ static void dart_free(struct iommu_table *tbl, long index, long npages) DBG("dart: free at: %lx, %lx\n", index, npages); - index <<= DART_PAGE_FACTOR; - npages <<= DART_PAGE_FACTOR; - dp = ((unsigned int *)tbl->it_base) + index; while (npages--) @@ -281,7 +275,7 @@ static void iommu_table_dart_setup(void) iommu_table_dart.it_busno = 0; iommu_table_dart.it_offset = 0; /* it_size is in number of entries */ - iommu_table_dart.it_size = (dart_tablesize / sizeof(u32)) >> DART_PAGE_FACTOR; + iommu_table_dart.it_size = dart_tablesize / sizeof(u32); /* Initialize the common IOMMU code */ iommu_table_dart.it_base = (unsigned long)dart_vbase; diff --git a/include/asm-powerpc/iommu.h b/include/asm-powerpc/iommu.h index a5e98641a2..39fad685ff 100644 --- a/include/asm-powerpc/iommu.h +++ b/include/asm-powerpc/iommu.h @@ -22,17 +22,35 @@ #define _ASM_IOMMU_H #ifdef __KERNEL__ -#include +#include #include #include #include +#include +#include + +#define IOMMU_PAGE_SHIFT 12 +#define IOMMU_PAGE_SIZE (ASM_CONST(1) << IOMMU_PAGE_SHIFT) +#define IOMMU_PAGE_MASK (~((1 << IOMMU_PAGE_SHIFT) - 1)) +#define IOMMU_PAGE_ALIGN(addr) _ALIGN_UP(addr, IOMMU_PAGE_SIZE) + +#ifndef __ASSEMBLY__ + +/* Pure 2^n version of get_order */ +static __inline__ __attribute_const__ int get_iommu_order(unsigned long size) +{ + return __ilog2((size - 1) >> IOMMU_PAGE_SHIFT) + 1; +} + +#endif /* __ASSEMBLY__ */ + /* * IOMAP_MAX_ORDER defines the largest contiguous block * of dma space we can get. IOMAP_MAX_ORDER = 13 * allows up to 2**12 pages (4096 * 4096) = 16 MB */ -#define IOMAP_MAX_ORDER 13 +#define IOMAP_MAX_ORDER 13 struct iommu_table { unsigned long it_busno; /* Bus number this table belongs to */ diff --git a/include/asm-powerpc/tce.h b/include/asm-powerpc/tce.h index c9483adbf5..f663634ccc 100644 --- a/include/asm-powerpc/tce.h +++ b/include/asm-powerpc/tce.h @@ -22,6 +22,8 @@ #define _ASM_POWERPC_TCE_H #ifdef __KERNEL__ +#include + /* * Tces come in two formats, one for the virtual bus and a different * format for PCI @@ -33,7 +35,6 @@ #define TCE_SHIFT 12 #define TCE_PAGE_SIZE (1 << TCE_SHIFT) -#define TCE_PAGE_FACTOR (PAGE_SHIFT - TCE_SHIFT) #define TCE_ENTRY_SIZE 8 /* each TCE is 64 bits */