From 5fb7dc37dc16fbc8b80d81318a582201ef7e280d Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 19 Jul 2007 01:48:12 -0700 Subject: [PATCH] define new percpu interface for shared data per cpu data section contains two types of data. One set which is exclusively accessed by the local cpu and the other set which is per cpu, but also shared by remote cpus. In the current kernel, these two sets are not clearely separated out. This can potentially cause the same data cacheline shared between the two sets of data, which will result in unnecessary bouncing of the cacheline between cpus. One way to fix the problem is to cacheline align the remotely accessed per cpu data, both at the beginning and at the end. Because of the padding at both ends, this will likely cause some memory wastage and also the interface to achieve this is not clean. This patch: Moves the remotely accessed per cpu data (which is currently marked as ____cacheline_aligned_in_smp) into a different section, where all the data elements are cacheline aligned. And as such, this differentiates the local only data and remotely accessed data cleanly. Signed-off-by: Fenghua Yu Acked-by: Suresh Siddha Cc: Rusty Russell Cc: Christoph Lameter Cc: Cc: "Luck, Tony" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/vmlinux.lds.S | 5 +---- arch/arm/kernel/vmlinux.lds.S | 1 + arch/cris/arch-v32/vmlinux.lds.S | 5 +---- arch/frv/kernel/vmlinux.lds.S | 5 +---- arch/i386/kernel/vmlinux.lds.S | 1 + arch/ia64/kernel/vmlinux.lds.S | 1 + arch/m32r/kernel/vmlinux.lds.S | 5 +---- arch/mips/kernel/vmlinux.lds.S | 5 +---- arch/parisc/kernel/vmlinux.lds.S | 7 +++---- arch/powerpc/kernel/vmlinux.lds.S | 1 + arch/ppc/kernel/vmlinux.lds.S | 5 +---- arch/s390/kernel/vmlinux.lds.S | 5 +---- arch/sh/kernel/vmlinux.lds.S | 5 +---- arch/sh64/kernel/vmlinux.lds.S | 5 ++++- arch/sparc/kernel/vmlinux.lds.S | 5 +---- arch/sparc64/kernel/vmlinux.lds.S | 6 ++---- arch/x86_64/kernel/vmlinux.lds.S | 6 ++---- arch/xtensa/kernel/vmlinux.lds.S | 5 +---- include/asm-generic/percpu.h | 8 ++++++++ include/asm-generic/vmlinux.lds.h | 8 ++++++++ include/asm-i386/percpu.h | 5 +++++ include/asm-ia64/percpu.h | 10 ++++++++++ include/asm-powerpc/percpu.h | 7 +++++++ include/asm-s390/percpu.h | 7 +++++++ include/asm-sparc64/percpu.h | 7 +++++++ include/asm-x86_64/percpu.h | 7 +++++++ 26 files changed, 84 insertions(+), 53 deletions(-) diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 449e76f118..6f4f0378e7 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -69,10 +69,7 @@ SECTIONS . = ALIGN(8); SECURITY_INIT - . = ALIGN(8192); - __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(8192) . = ALIGN(2*8192); __init_end = .; diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 2b7a8f5d8c..5ff5406666 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -66,6 +66,7 @@ SECTIONS . = ALIGN(4096); __per_cpu_start = .; *(.data.percpu) + *(.data.percpu.shared_aligned) __per_cpu_end = .; #ifndef CONFIG_XIP_KERNEL __init_begin = _stext; diff --git a/arch/cris/arch-v32/vmlinux.lds.S b/arch/cris/arch-v32/vmlinux.lds.S index dfa25e1542..651a77f2cc 100644 --- a/arch/cris/arch-v32/vmlinux.lds.S +++ b/arch/cris/arch-v32/vmlinux.lds.S @@ -91,10 +91,7 @@ SECTIONS } SECURITY_INIT - . = ALIGN (8192); - __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(8192) #ifdef CONFIG_BLK_DEV_INITRD .init.ramfs : { diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 481dc13746..3b71e0c863 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -57,10 +57,7 @@ SECTIONS __alt_instructions_end = .; .altinstr_replacement : { *(.altinstr_replacement) } - . = ALIGN(4096); - __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(4096) #ifdef CONFIG_BLK_DEV_INITRD . = ALIGN(4096); diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 00f1bc47d3..4dc44b8007 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -181,6 +181,7 @@ SECTIONS .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; *(.data.percpu) + *(.data.percpu.shared_aligned) __per_cpu_end = .; } . = ALIGN(4096); diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 5a65965c8b..860f251d2f 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -206,6 +206,7 @@ SECTIONS { __per_cpu_start = .; *(.data.percpu) + *(.data.percpu.shared_aligned) __per_cpu_end = .; } . = __phys_per_cpu_start + PERCPU_PAGE_SIZE; /* ensure percpu data fits diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 4e2d5b9f0a..942a8c7a44 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -110,10 +110,7 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(4096); - __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(4096) . = ALIGN(4096); __init_end = .; /* freed after init ends here */ diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 9b9992cd56..bc9bae2a73 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -119,10 +119,7 @@ SECTIONS .init.ramfs : { *(.init.ramfs) } __initramfs_end = .; #endif - . = ALIGN(_PAGE_SIZE); - __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(_PAGE_SIZE) . = ALIGN(_PAGE_SIZE); __init_end = .; /* freed after init ends here */ diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 4d96ba4b98..d4e6a93c8d 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -181,10 +181,9 @@ SECTIONS .init.ramfs : { *(.init.ramfs) } __initramfs_end = .; #endif - . = ALIGN(ASM_PAGE_SIZE); - __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } - __per_cpu_end = .; + + PERCPU(ASM_PAGE_SIZE) + . = ALIGN(ASM_PAGE_SIZE); __init_end = .; /* freed after init ends here */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index ae4acd8414..39fda6e6aa 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -144,6 +144,7 @@ SECTIONS .data.percpu : { __per_cpu_start = .; *(.data.percpu) + *(.data.percpu.shared_aligned) __per_cpu_end = .; } diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S index 19db8746ff..c0aac3ff9e 100644 --- a/arch/ppc/kernel/vmlinux.lds.S +++ b/arch/ppc/kernel/vmlinux.lds.S @@ -130,10 +130,7 @@ SECTIONS __ftr_fixup : { *(__ftr_fixup) } __stop___ftr_fixup = .; - . = ALIGN(4096); - __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(4096) #ifdef CONFIG_BLK_DEV_INITRD . = ALIGN(4096); diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 7158a804a5..61ffd50fac 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -107,10 +107,7 @@ SECTIONS . = ALIGN(2); __initramfs_end = .; #endif - . = ALIGN(4096); - __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(4096) . = ALIGN(4096); __init_end = .; /* freed after init ends here */ diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 0696402f44..5ba216180b 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -60,10 +60,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); __nosave_end = .; - . = ALIGN(PAGE_SIZE); - __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(PAGE_SIZE) .data.cacheline_aligned : { *(.data.cacheline_aligned) } _edata = .; /* End of data section */ diff --git a/arch/sh64/kernel/vmlinux.lds.S b/arch/sh64/kernel/vmlinux.lds.S index 02aea86c59..8ac9c7c5f8 100644 --- a/arch/sh64/kernel/vmlinux.lds.S +++ b/arch/sh64/kernel/vmlinux.lds.S @@ -87,7 +87,10 @@ SECTIONS . = ALIGN(PAGE_SIZE); __per_cpu_start = .; - .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) } + .data.percpu : C_PHYS(.data.percpu) { + *(.data.percpu) + *(.data.percpu.shared_aligned) + } __per_cpu_end = . ; .data.cacheline_aligned : C_PHYS(.data.cacheline_aligned) { *(.data.cacheline_aligned) } diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index f75a1b8227..47583887ab 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -65,10 +65,7 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(4096); - __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(4096) . = ALIGN(4096); __init_end = .; . = ALIGN(32); diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S index 3ad10f3027..481861764d 100644 --- a/arch/sparc64/kernel/vmlinux.lds.S +++ b/arch/sparc64/kernel/vmlinux.lds.S @@ -90,10 +90,8 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(PAGE_SIZE); - __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(PAGE_SIZE) + . = ALIGN(PAGE_SIZE); __init_end = .; __bss_start = .; diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index dbccfda836..22590690a8 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -194,10 +194,8 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(4096); - __per_cpu_start = .; - .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(4096) + . = ALIGN(4096); __init_end = .; diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index b0582c3c5f..3e31512109 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -190,10 +190,7 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(4096); - __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(4096) /* We need this dummy segment here */ diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index d984a90414..d85172e9ed 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -14,6 +14,11 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #define DEFINE_PER_CPU(type, name) \ __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + __attribute__((__section__(".data.percpu.shared_aligned"))) \ + __typeof__(type) per_cpu__##name \ + ____cacheline_aligned_in_smp + /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*({ \ extern int simple_identifier_##var(void); \ @@ -34,6 +39,9 @@ do { \ #define DEFINE_PER_CPU(type, name) \ __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + DEFINE_PER_CPU(type, name) + #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var #define __raw_get_cpu_var(var) per_cpu__##var diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 84155eb67f..a2b09ed852 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -245,3 +245,11 @@ *(.initcall7.init) \ *(.initcall7s.init) +#define PERCPU(align) \ + . = ALIGN(align); \ + __per_cpu_start = .; \ + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \ + *(.data.percpu) \ + *(.data.percpu.shared_aligned) \ + } \ + __per_cpu_end = .; diff --git a/include/asm-i386/percpu.h b/include/asm-i386/percpu.h index f54830b5d5..a7ebd436f3 100644 --- a/include/asm-i386/percpu.h +++ b/include/asm-i386/percpu.h @@ -54,6 +54,11 @@ extern unsigned long __per_cpu_offset[]; #define DEFINE_PER_CPU(type, name) \ __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + __attribute__((__section__(".data.percpu.shared_aligned"))) \ + __typeof__(type) per_cpu__##name \ + ____cacheline_aligned_in_smp + /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); diff --git a/include/asm-ia64/percpu.h b/include/asm-ia64/percpu.h index fbe5cf3ab8..43a7aac414 100644 --- a/include/asm-ia64/percpu.h +++ b/include/asm-ia64/percpu.h @@ -29,6 +29,16 @@ __attribute__((__section__(".data.percpu"))) \ __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name +#ifdef CONFIG_SMP +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + __attribute__((__section__(".data.percpu.shared_aligned"))) \ + __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name \ + ____cacheline_aligned_in_smp +#else +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + DEFINE_PER_CPU(type, name) +#endif + /* * Pretty much a literal copy of asm-generic/percpu.h, except that percpu_modcopy() is an * external routine, to avoid include-hell. diff --git a/include/asm-powerpc/percpu.h b/include/asm-powerpc/percpu.h index 2f2e3024fa..73dc8ba401 100644 --- a/include/asm-powerpc/percpu.h +++ b/include/asm-powerpc/percpu.h @@ -20,6 +20,11 @@ #define DEFINE_PER_CPU(type, name) \ __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + __attribute__((__section__(".data.percpu.shared_aligned"))) \ + __typeof__(type) per_cpu__##name \ + ____cacheline_aligned_in_smp + /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) @@ -40,6 +45,8 @@ extern void setup_per_cpu_areas(void); #define DEFINE_PER_CPU(type, name) \ __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + DEFINE_PER_CPU(type, name) #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var diff --git a/include/asm-s390/percpu.h b/include/asm-s390/percpu.h index 9ea7f1023e..545857e644 100644 --- a/include/asm-s390/percpu.h +++ b/include/asm-s390/percpu.h @@ -41,6 +41,11 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; __attribute__((__section__(".data.percpu"))) \ __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + __attribute__((__section__(".data.percpu.shared_aligned"))) \ + __typeof__(type) per_cpu__##name \ + ____cacheline_aligned_in_smp + #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) #define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu]) @@ -59,6 +64,8 @@ do { \ #define DEFINE_PER_CPU(type, name) \ __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + DEFINE_PER_CPU(type, name) #define __get_cpu_var(var) __reloc_hide(var,0) #define __raw_get_cpu_var(var) __reloc_hide(var,0) diff --git a/include/asm-sparc64/percpu.h b/include/asm-sparc64/percpu.h index 88db872ce2..caf8750792 100644 --- a/include/asm-sparc64/percpu.h +++ b/include/asm-sparc64/percpu.h @@ -18,6 +18,11 @@ extern unsigned long __per_cpu_shift; #define DEFINE_PER_CPU(type, name) \ __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + __attribute__((__section__(".data.percpu.shared_aligned"))) \ + __typeof__(type) per_cpu__##name \ + ____cacheline_aligned_in_smp + register unsigned long __local_per_cpu_offset asm("g5"); /* var is in discarded region: offset to particular copy we want */ @@ -38,6 +43,8 @@ do { \ #define real_setup_per_cpu_areas() do { } while (0) #define DEFINE_PER_CPU(type, name) \ __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + DEFINE_PER_CPU(type, name) #define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h index c6fbb67eac..5abd482701 100644 --- a/include/asm-x86_64/percpu.h +++ b/include/asm-x86_64/percpu.h @@ -20,6 +20,11 @@ #define DEFINE_PER_CPU(type, name) \ __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + __attribute__((__section__(".data.percpu.shared_aligned"))) \ + __typeof__(type) per_cpu__##name \ + ____cacheline_internodealigned_in_smp + /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*({ \ extern int simple_identifier_##var(void); \ @@ -46,6 +51,8 @@ extern void setup_per_cpu_areas(void); #define DEFINE_PER_CPU(type, name) \ __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + DEFINE_PER_CPU(type, name) #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var -- 2.39.5