From: David S. Miller Date: Wed, 19 Mar 2008 11:52:48 +0000 (-0700) Subject: [SPARC64]: NUMA device infrastructure. X-Git-Tag: v2.6.26-rc1~1089^2~10 X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c1b1a5f1f1b2612b69b67381b223bce9f8ec4da5;p=linux-2.6 [SPARC64]: NUMA device infrastructure. Record and propagate NUMA information for devices. Signed-off-by: David S. Miller --- diff --git a/arch/sparc64/kernel/ebus.c b/arch/sparc64/kernel/ebus.c index 04ab81cb4f..bc26322748 100644 --- a/arch/sparc64/kernel/ebus.c +++ b/arch/sparc64/kernel/ebus.c @@ -396,6 +396,7 @@ static void __init fill_ebus_device(struct device_node *dp, struct linux_ebus_de sd->op = &dev->ofdev; sd->iommu = dev->bus->ofdev.dev.parent->archdata.iommu; sd->stc = dev->bus->ofdev.dev.parent->archdata.stc; + sd->numa_node = dev->bus->ofdev.dev.parent->archdata.numa_node; dev->ofdev.node = dp; dev->ofdev.dev.parent = &dev->bus->ofdev.dev; diff --git a/arch/sparc64/kernel/iommu.c b/arch/sparc64/kernel/iommu.c index 756fa24eee..2a37a6ca2a 100644 --- a/arch/sparc64/kernel/iommu.c +++ b/arch/sparc64/kernel/iommu.c @@ -173,9 +173,11 @@ void iommu_range_free(struct iommu *iommu, dma_addr_t dma_addr, unsigned long np } int iommu_table_init(struct iommu *iommu, int tsbsize, - u32 dma_offset, u32 dma_addr_mask) + u32 dma_offset, u32 dma_addr_mask, + int numa_node) { - unsigned long i, tsbbase, order, sz, num_tsb_entries; + unsigned long i, order, sz, num_tsb_entries; + struct page *page; num_tsb_entries = tsbsize / sizeof(iopte_t); @@ -188,11 +190,12 @@ int iommu_table_init(struct iommu *iommu, int tsbsize, /* Allocate and initialize the free area map. */ sz = num_tsb_entries / 8; sz = (sz + 7UL) & ~7UL; - iommu->arena.map = kzalloc(sz, GFP_KERNEL); + iommu->arena.map = kmalloc_node(sz, GFP_KERNEL, numa_node); if (!iommu->arena.map) { printk(KERN_ERR "IOMMU: Error, kmalloc(arena.map) failed.\n"); return -ENOMEM; } + memset(iommu->arena.map, 0, sz); iommu->arena.limit = num_tsb_entries; if (tlb_type != hypervisor) @@ -201,21 +204,23 @@ int iommu_table_init(struct iommu *iommu, int tsbsize, /* Allocate and initialize the dummy page which we * set inactive IO PTEs to point to. */ - iommu->dummy_page = get_zeroed_page(GFP_KERNEL); - if (!iommu->dummy_page) { + page = alloc_pages_node(numa_node, GFP_KERNEL, 0); + if (!page) { printk(KERN_ERR "IOMMU: Error, gfp(dummy_page) failed.\n"); goto out_free_map; } + iommu->dummy_page = (unsigned long) page_address(page); + memset((void *)iommu->dummy_page, 0, PAGE_SIZE); iommu->dummy_page_pa = (unsigned long) __pa(iommu->dummy_page); /* Now allocate and setup the IOMMU page table itself. */ order = get_order(tsbsize); - tsbbase = __get_free_pages(GFP_KERNEL, order); - if (!tsbbase) { + page = alloc_pages_node(numa_node, GFP_KERNEL, order); + if (!page) { printk(KERN_ERR "IOMMU: Error, gfp(tsb) failed.\n"); goto out_free_dummy_page; } - iommu->page_table = (iopte_t *)tsbbase; + iommu->page_table = (iopte_t *)page_address(page); for (i = 0; i < num_tsb_entries; i++) iopte_make_dummy(iommu, &iommu->page_table[i]); @@ -276,20 +281,24 @@ static inline void iommu_free_ctx(struct iommu *iommu, int ctx) static void *dma_4u_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addrp, gfp_t gfp) { + unsigned long flags, order, first_page; struct iommu *iommu; + struct page *page; + int npages, nid; iopte_t *iopte; - unsigned long flags, order, first_page; void *ret; - int npages; size = IO_PAGE_ALIGN(size); order = get_order(size); if (order >= 10) return NULL; - first_page = __get_free_pages(gfp, order); - if (first_page == 0UL) + nid = dev->archdata.numa_node; + page = alloc_pages_node(nid, gfp, order); + if (unlikely(!page)) return NULL; + + first_page = (unsigned long) page_address(page); memset((char *)first_page, 0, PAGE_SIZE << order); iommu = dev->archdata.iommu; diff --git a/arch/sparc64/kernel/isa.c b/arch/sparc64/kernel/isa.c index b5f7b35408..a2af5ed784 100644 --- a/arch/sparc64/kernel/isa.c +++ b/arch/sparc64/kernel/isa.c @@ -92,6 +92,7 @@ static void __init isa_fill_devices(struct sparc_isa_bridge *isa_br) sd->op = &isa_dev->ofdev; sd->iommu = isa_br->ofdev.dev.parent->archdata.iommu; sd->stc = isa_br->ofdev.dev.parent->archdata.stc; + sd->numa_node = isa_br->ofdev.dev.parent->archdata.numa_node; isa_dev->ofdev.node = dp; isa_dev->ofdev.dev.parent = &isa_br->ofdev.dev; diff --git a/arch/sparc64/kernel/of_device.c b/arch/sparc64/kernel/of_device.c index 0fd9db95b8..9e58e8cba1 100644 --- a/arch/sparc64/kernel/of_device.c +++ b/arch/sparc64/kernel/of_device.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -660,6 +661,7 @@ static unsigned int __init build_one_device_irq(struct of_device *op, struct device_node *dp = op->node; struct device_node *pp, *ip; unsigned int orig_irq = irq; + int nid; if (irq == 0xffffffff) return irq; @@ -672,7 +674,7 @@ static unsigned int __init build_one_device_irq(struct of_device *op, printk("%s: direct translate %x --> %x\n", dp->full_name, orig_irq, irq); - return irq; + goto out; } /* Something more complicated. Walk up to the root, applying @@ -744,6 +746,14 @@ static unsigned int __init build_one_device_irq(struct of_device *op, printk("%s: Apply IRQ trans [%s] %x --> %x\n", op->node->full_name, ip->full_name, orig_irq, irq); +out: + nid = of_node_to_nid(dp); + if (nid != -1) { + cpumask_t numa_mask = node_to_cpumask(nid); + + irq_set_affinity(irq, numa_mask); + } + return irq; } diff --git a/arch/sparc64/kernel/pci.c b/arch/sparc64/kernel/pci.c index 545356b00e..49f9127665 100644 --- a/arch/sparc64/kernel/pci.c +++ b/arch/sparc64/kernel/pci.c @@ -369,10 +369,12 @@ struct pci_dev *of_create_pci_dev(struct pci_pbm_info *pbm, sd->host_controller = pbm; sd->prom_node = node; sd->op = of_find_device_by_node(node); + sd->numa_node = pbm->numa_node; sd = &sd->op->dev.archdata; sd->iommu = pbm->iommu; sd->stc = &pbm->stc; + sd->numa_node = pbm->numa_node; type = of_get_property(node, "device_type", NULL); if (type == NULL) @@ -1159,6 +1161,16 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, return 0; } +#ifdef CONFIG_NUMA +int pcibus_to_node(struct pci_bus *pbus) +{ + struct pci_pbm_info *pbm = pbus->sysdata; + + return pbm->numa_node; +} +EXPORT_SYMBOL(pcibus_to_node); +#endif + /* Return the domain nuber for this pci bus */ int pci_domain_nr(struct pci_bus *pbus) diff --git a/arch/sparc64/kernel/pci_fire.c b/arch/sparc64/kernel/pci_fire.c index 7571ed5631..d23bb6f53c 100644 --- a/arch/sparc64/kernel/pci_fire.c +++ b/arch/sparc64/kernel/pci_fire.c @@ -71,7 +71,8 @@ static int pci_fire_pbm_iommu_init(struct pci_pbm_info *pbm) */ fire_write(iommu->iommu_flushinv, ~(u64)0); - err = iommu_table_init(iommu, tsbsize * 8 * 1024, vdma[0], dma_mask); + err = iommu_table_init(iommu, tsbsize * 8 * 1024, vdma[0], dma_mask, + pbm->numa_node); if (err) return err; @@ -449,6 +450,8 @@ static int __init pci_fire_pbm_init(struct pci_controller_info *p, pbm->next = pci_pbm_root; pci_pbm_root = pbm; + pbm->numa_node = -1; + pbm->scan_bus = pci_fire_scan_bus; pbm->pci_ops = &sun4u_pci_ops; pbm->config_space_reg_bits = 12; diff --git a/arch/sparc64/kernel/pci_impl.h b/arch/sparc64/kernel/pci_impl.h index f79c923fdd..218bac4ff7 100644 --- a/arch/sparc64/kernel/pci_impl.h +++ b/arch/sparc64/kernel/pci_impl.h @@ -148,6 +148,8 @@ struct pci_pbm_info { struct pci_bus *pci_bus; void (*scan_bus)(struct pci_pbm_info *); struct pci_ops *pci_ops; + + int numa_node; }; struct pci_controller_info { diff --git a/arch/sparc64/kernel/pci_msi.c b/arch/sparc64/kernel/pci_msi.c index d6d64b44af..db5e8fd8f6 100644 --- a/arch/sparc64/kernel/pci_msi.c +++ b/arch/sparc64/kernel/pci_msi.c @@ -279,11 +279,17 @@ static int bringup_one_msi_queue(struct pci_pbm_info *pbm, unsigned long devino) { int irq = ops->msiq_build_irq(pbm, msiqid, devino); - int err; + int err, nid; if (irq < 0) return irq; + nid = pbm->numa_node; + if (nid != -1) { + cpumask_t numa_mask = node_to_cpumask(nid); + + irq_set_affinity(irq, numa_mask); + } err = request_irq(irq, sparc64_msiq_interrupt, 0, "MSIQ", &pbm->msiq_irq_cookies[msiqid - pbm->msiq_first]); diff --git a/arch/sparc64/kernel/pci_psycho.c b/arch/sparc64/kernel/pci_psycho.c index 0bad96e5d1..994dbe0603 100644 --- a/arch/sparc64/kernel/pci_psycho.c +++ b/arch/sparc64/kernel/pci_psycho.c @@ -848,7 +848,8 @@ static int psycho_iommu_init(struct pci_pbm_info *pbm) /* Leave diag mode enabled for full-flushing done * in pci_iommu.c */ - err = iommu_table_init(iommu, IO_TSB_SIZE, 0xc0000000, 0xffffffff); + err = iommu_table_init(iommu, IO_TSB_SIZE, 0xc0000000, 0xffffffff, + pbm->numa_node); if (err) return err; @@ -979,6 +980,8 @@ static void __init psycho_pbm_init(struct pci_controller_info *p, pbm->next = pci_pbm_root; pci_pbm_root = pbm; + pbm->numa_node = -1; + pbm->scan_bus = psycho_scan_bus; pbm->pci_ops = &sun4u_pci_ops; pbm->config_space_reg_bits = 8; diff --git a/arch/sparc64/kernel/pci_sabre.c b/arch/sparc64/kernel/pci_sabre.c index 1c5f5fa233..4c34195baf 100644 --- a/arch/sparc64/kernel/pci_sabre.c +++ b/arch/sparc64/kernel/pci_sabre.c @@ -704,7 +704,7 @@ static int sabre_iommu_init(struct pci_pbm_info *pbm, * in pci_iommu.c */ err = iommu_table_init(iommu, tsbsize * 1024 * 8, - dvma_offset, dma_mask); + dvma_offset, dma_mask, pbm->numa_node); if (err) return err; @@ -737,6 +737,8 @@ static void __init sabre_pbm_init(struct pci_controller_info *p, pbm->name = dp->full_name; printk("%s: SABRE PCI Bus Module\n", pbm->name); + pbm->numa_node = -1; + pbm->scan_bus = sabre_scan_bus; pbm->pci_ops = &sun4u_pci_ops; pbm->config_space_reg_bits = 8; diff --git a/arch/sparc64/kernel/pci_schizo.c b/arch/sparc64/kernel/pci_schizo.c index e306093623..615edd9c8e 100644 --- a/arch/sparc64/kernel/pci_schizo.c +++ b/arch/sparc64/kernel/pci_schizo.c @@ -1220,7 +1220,8 @@ static int schizo_pbm_iommu_init(struct pci_pbm_info *pbm) /* Leave diag mode enabled for full-flushing done * in pci_iommu.c */ - err = iommu_table_init(iommu, tsbsize * 8 * 1024, vdma[0], dma_mask); + err = iommu_table_init(iommu, tsbsize * 8 * 1024, vdma[0], dma_mask, + pbm->numa_node); if (err) return err; @@ -1379,6 +1380,8 @@ static int __init schizo_pbm_init(struct pci_controller_info *p, pbm->next = pci_pbm_root; pci_pbm_root = pbm; + pbm->numa_node = -1; + pbm->scan_bus = schizo_scan_bus; pbm->pci_ops = &sun4u_pci_ops; pbm->config_space_reg_bits = 8; diff --git a/arch/sparc64/kernel/pci_sun4v.c b/arch/sparc64/kernel/pci_sun4v.c index 01839706bd..e2bb979003 100644 --- a/arch/sparc64/kernel/pci_sun4v.c +++ b/arch/sparc64/kernel/pci_sun4v.c @@ -127,10 +127,12 @@ static inline long iommu_batch_end(void) static void *dma_4v_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addrp, gfp_t gfp) { - struct iommu *iommu; unsigned long flags, order, first_page, npages, n; + struct iommu *iommu; + struct page *page; void *ret; long entry; + int nid; size = IO_PAGE_ALIGN(size); order = get_order(size); @@ -139,10 +141,12 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, npages = size >> IO_PAGE_SHIFT; - first_page = __get_free_pages(gfp, order); - if (unlikely(first_page == 0UL)) + nid = dev->archdata.numa_node; + page = alloc_pages_node(nid, gfp, order); + if (unlikely(!page)) return NULL; + first_page = (unsigned long) page_address(page); memset((char *)first_page, 0, PAGE_SIZE << order); iommu = dev->archdata.iommu; @@ -899,6 +903,8 @@ static void __init pci_sun4v_pbm_init(struct pci_controller_info *p, pbm->next = pci_pbm_root; pci_pbm_root = pbm; + pbm->numa_node = of_node_to_nid(dp); + pbm->scan_bus = pci_sun4v_scan_bus; pbm->pci_ops = &sun4v_pci_ops; pbm->config_space_reg_bits = 12; @@ -913,6 +919,7 @@ static void __init pci_sun4v_pbm_init(struct pci_controller_info *p, pbm->name = dp->full_name; printk("%s: SUN4V PCI Bus Module\n", pbm->name); + printk("%s: On NUMA node %d\n", pbm->name, pbm->numa_node); pci_determine_mem_io_space(pbm); diff --git a/arch/sparc64/kernel/sbus.c b/arch/sparc64/kernel/sbus.c index d1fb13ba02..fa2827c4a3 100644 --- a/arch/sparc64/kernel/sbus.c +++ b/arch/sparc64/kernel/sbus.c @@ -544,6 +544,7 @@ static void __init sbus_iommu_init(int __node, struct sbus_bus *sbus) sbus->ofdev.dev.archdata.iommu = iommu; sbus->ofdev.dev.archdata.stc = strbuf; + sbus->ofdev.dev.archdata.numa_node = -1; reg_base = regs + SYSIO_IOMMUREG_BASE; iommu->iommu_control = reg_base + IOMMU_CONTROL; @@ -575,7 +576,7 @@ static void __init sbus_iommu_init(int __node, struct sbus_bus *sbus) sbus->portid, regs); /* Setup for TSB_SIZE=7, TBW_SIZE=0, MMU_DE=1, MMU_EN=1 */ - if (iommu_table_init(iommu, IO_TSB_SIZE, MAP_BASE, 0xffffffff)) + if (iommu_table_init(iommu, IO_TSB_SIZE, MAP_BASE, 0xffffffff, -1)) goto fatal_memory_error; control = upa_readq(iommu->iommu_control); diff --git a/include/asm-sparc/device.h b/include/asm-sparc/device.h index 680e51d873..19790eb99c 100644 --- a/include/asm-sparc/device.h +++ b/include/asm-sparc/device.h @@ -16,6 +16,8 @@ struct dev_archdata { struct device_node *prom_node; struct of_device *op; + + int numa_node; }; #endif /* _ASM_SPARC_DEVICE_H */ diff --git a/include/asm-sparc/prom.h b/include/asm-sparc/prom.h index df5dc44224..fd55522481 100644 --- a/include/asm-sparc/prom.h +++ b/include/asm-sparc/prom.h @@ -77,6 +77,11 @@ extern int of_getintprop_default(struct device_node *np, const char *name, int def); extern int of_find_in_proplist(const char *list, const char *match, int len); +#ifdef CONFIG_NUMA +extern int of_node_to_nid(struct device_node *dp); +#else +#define of_node_to_nid(dp) (-1) +#endif extern void prom_build_devicetree(void); diff --git a/include/asm-sparc64/iommu.h b/include/asm-sparc64/iommu.h index 46325ddee2..d7b9afcba0 100644 --- a/include/asm-sparc64/iommu.h +++ b/include/asm-sparc64/iommu.h @@ -56,6 +56,7 @@ struct strbuf { }; extern int iommu_table_init(struct iommu *iommu, int tsbsize, - u32 dma_offset, u32 dma_addr_mask); + u32 dma_offset, u32 dma_addr_mask, + int numa_node); #endif /* !(_SPARC64_IOMMU_H) */