From: Grant Grundler Date: Sat, 22 Oct 2005 02:45:22 +0000 (-0400) Subject: [PARISC] Update bitops from parisc tree X-Git-Tag: v2.6.15-rc1~732^2^2~9^2~37 X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a366064c3ff46c985a3c7243468be197d29874dc;p=linux-2.6 [PARISC] Update bitops from parisc tree Optimize ext2_find_next_zero_bit. Gives about 25% perf improvement with an rsync test with ext3. Signed-off-by: Randolph Chung fix ext3 performance - ext2_find_next_zero() was the culprit. Kudos to jejb for pointing out the possibility that ext2_test_bit and ext2_find_next_zero() may in fact not be enumerating bits in the bitmap because of endianness. Took sparc64 implementation and adapted it to our tree. I suspect the real problem is ffz() wants an unsigned long and was getting garbage in the top half of the unsigned int. Not confirmed but that's what I suspect. Signed-off-by: Grant Grundler Fix find_next_bit for 32-bit Make masking consistent for bitops From: Joel Soete Signed-off-by: Randolph Chung Add back incorrectly removed ext2_find_first_zero_bit definition Signed-off-by: James Bottomley Fixup bitops.h to use volatile for *_bit() ops Based on this email thread: http://marc.theaimsgroup.com/?t=108826637900003 In a nutshell: *_bit() want use of volatile. __*_bit() are "relaxed" and don't use spinlock or volatile. other minor changes: o replaces hweight64() macro with alias to generic_hweight64() (Joel Soete) o cleanup ext2* macros so (a) it's obvious what the XOR magic is about and (b) one version that works for both 32/64-bit. o replace 2 uses of CONFIG_64BIT with __LP64__. bitops.h used both. I think header files that might go to user space should use something userspace will know about (__LP64__). 
Signed-off-by: Grant Grundler Move SHIFT_PER_LONG to standard location for BITS_PER_LONG (asm/types.h) and ditch the second definition of BITS_PER_LONG in bitops.h Signed-off-by: Grant Grundler Signed-off-by: Kyle McMartin --- diff --git a/include/asm-parisc/bitops.h b/include/asm-parisc/bitops.h index af7db694b2..55b98c67fd 100644 --- a/include/asm-parisc/bitops.h +++ b/include/asm-parisc/bitops.h @@ -2,7 +2,7 @@ #define _PARISC_BITOPS_H #include -#include +#include /* for BITS_PER_LONG/SHIFT_PER_LONG */ #include #include @@ -12,193 +12,157 @@ * to include/asm-i386/bitops.h or kerneldoc */ -#ifdef __LP64__ -# define SHIFT_PER_LONG 6 -#ifndef BITS_PER_LONG -# define BITS_PER_LONG 64 -#endif -#else -# define SHIFT_PER_LONG 5 -#ifndef BITS_PER_LONG -# define BITS_PER_LONG 32 -#endif -#endif - -#define CHOP_SHIFTCOUNT(x) ((x) & (BITS_PER_LONG - 1)) +#define CHOP_SHIFTCOUNT(x) (((unsigned long) (x)) & (BITS_PER_LONG - 1)) #define smp_mb__before_clear_bit() smp_mb() #define smp_mb__after_clear_bit() smp_mb() -static __inline__ void set_bit(int nr, volatile unsigned long * address) +/* See http://marc.theaimsgroup.com/?t=108826637900003 for discussion + * on use of volatile and __*_bit() (set/clear/change): + * *_bit() want use of volatile. + * __*_bit() are "relaxed" and don't use spinlock or volatile. 
+ */ + +static __inline__ void set_bit(int nr, volatile unsigned long * addr) { - unsigned long mask; - unsigned long *addr = (unsigned long *) address; + unsigned long mask = 1UL << CHOP_SHIFTCOUNT(nr); unsigned long flags; addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); _atomic_spin_lock_irqsave(addr, flags); *addr |= mask; _atomic_spin_unlock_irqrestore(addr, flags); } -static __inline__ void __set_bit(int nr, volatile unsigned long * address) +static __inline__ void __set_bit(unsigned long nr, volatile unsigned long * addr) { - unsigned long mask; - unsigned long *addr = (unsigned long *) address; + unsigned long *m = (unsigned long *) addr + (nr >> SHIFT_PER_LONG); - addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); - *addr |= mask; + *m |= 1UL << CHOP_SHIFTCOUNT(nr); } -static __inline__ void clear_bit(int nr, volatile unsigned long * address) +static __inline__ void clear_bit(int nr, volatile unsigned long * addr) { - unsigned long mask; - unsigned long *addr = (unsigned long *) address; + unsigned long mask = ~(1UL << CHOP_SHIFTCOUNT(nr)); unsigned long flags; addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); _atomic_spin_lock_irqsave(addr, flags); - *addr &= ~mask; + *addr &= mask; _atomic_spin_unlock_irqrestore(addr, flags); } -static __inline__ void __clear_bit(unsigned long nr, volatile unsigned long * address) +static __inline__ void __clear_bit(unsigned long nr, volatile unsigned long * addr) { - unsigned long mask; - unsigned long *addr = (unsigned long *) address; + unsigned long *m = (unsigned long *) addr + (nr >> SHIFT_PER_LONG); - addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); - *addr &= ~mask; + *m &= ~(1UL << CHOP_SHIFTCOUNT(nr)); } -static __inline__ void change_bit(int nr, volatile unsigned long * address) +static __inline__ void change_bit(int nr, volatile unsigned long * addr) { - unsigned long mask; - unsigned long *addr = (unsigned long *) address; + unsigned 
long mask = 1UL << CHOP_SHIFTCOUNT(nr); unsigned long flags; addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); _atomic_spin_lock_irqsave(addr, flags); *addr ^= mask; _atomic_spin_unlock_irqrestore(addr, flags); } -static __inline__ void __change_bit(int nr, volatile unsigned long * address) +static __inline__ void __change_bit(unsigned long nr, volatile unsigned long * addr) { - unsigned long mask; - unsigned long *addr = (unsigned long *) address; + unsigned long *m = (unsigned long *) addr + (nr >> SHIFT_PER_LONG); - addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); - *addr ^= mask; + *m ^= 1UL << CHOP_SHIFTCOUNT(nr); } -static __inline__ int test_and_set_bit(int nr, volatile unsigned long * address) +static __inline__ int test_and_set_bit(int nr, volatile unsigned long * addr) { - unsigned long mask; - unsigned long *addr = (unsigned long *) address; - int oldbit; + unsigned long mask = 1UL << CHOP_SHIFTCOUNT(nr); + unsigned long oldbit; unsigned long flags; addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); _atomic_spin_lock_irqsave(addr, flags); - oldbit = (*addr & mask) ? 1 : 0; - *addr |= mask; + oldbit = *addr; + *addr = oldbit | mask; _atomic_spin_unlock_irqrestore(addr, flags); - return oldbit; + return (oldbit & mask) ? 1 : 0; } static __inline__ int __test_and_set_bit(int nr, volatile unsigned long * address) { - unsigned long mask; - unsigned long *addr = (unsigned long *) address; - int oldbit; + unsigned long mask = 1UL << CHOP_SHIFTCOUNT(nr); + unsigned long oldbit; + unsigned long *addr = (unsigned long *)address + (nr >> SHIFT_PER_LONG); - addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); - oldbit = (*addr & mask) ? 1 : 0; - *addr |= mask; + oldbit = *addr; + *addr = oldbit | mask; - return oldbit; + return (oldbit & mask) ? 
1 : 0; } -static __inline__ int test_and_clear_bit(int nr, volatile unsigned long * address) +static __inline__ int test_and_clear_bit(int nr, volatile unsigned long * addr) { - unsigned long mask; - unsigned long *addr = (unsigned long *) address; - int oldbit; + unsigned long mask = 1UL << CHOP_SHIFTCOUNT(nr); + unsigned long oldbit; unsigned long flags; addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); _atomic_spin_lock_irqsave(addr, flags); - oldbit = (*addr & mask) ? 1 : 0; - *addr &= ~mask; + oldbit = *addr; + *addr = oldbit & ~mask; _atomic_spin_unlock_irqrestore(addr, flags); - return oldbit; + return (oldbit & mask) ? 1 : 0; } static __inline__ int __test_and_clear_bit(int nr, volatile unsigned long * address) { - unsigned long mask; - unsigned long *addr = (unsigned long *) address; - int oldbit; + unsigned long mask = 1UL << CHOP_SHIFTCOUNT(nr); + unsigned long *addr = (unsigned long *)address + (nr >> SHIFT_PER_LONG); + unsigned long oldbit; - addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); - oldbit = (*addr & mask) ? 1 : 0; - *addr &= ~mask; + oldbit = *addr; + *addr = oldbit & ~mask; - return oldbit; + return (oldbit & mask) ? 1 : 0; } -static __inline__ int test_and_change_bit(int nr, volatile unsigned long * address) +static __inline__ int test_and_change_bit(int nr, volatile unsigned long * addr) { - unsigned long mask; - unsigned long *addr = (unsigned long *) address; - int oldbit; + unsigned long mask = 1UL << CHOP_SHIFTCOUNT(nr); + unsigned long oldbit; unsigned long flags; addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); _atomic_spin_lock_irqsave(addr, flags); - oldbit = (*addr & mask) ? 1 : 0; - *addr ^= mask; + oldbit = *addr; + *addr = oldbit ^ mask; _atomic_spin_unlock_irqrestore(addr, flags); - return oldbit; + return (oldbit & mask) ? 
1 : 0; } static __inline__ int __test_and_change_bit(int nr, volatile unsigned long * address) { - unsigned long mask; - unsigned long *addr = (unsigned long *) address; - int oldbit; + unsigned long mask = 1UL << CHOP_SHIFTCOUNT(nr); + unsigned long *addr = (unsigned long *)address + (nr >> SHIFT_PER_LONG); + unsigned long oldbit; - addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); - oldbit = (*addr & mask) ? 1 : 0; - *addr ^= mask; + oldbit = *addr; + *addr = oldbit ^ mask; - return oldbit; + return (oldbit & mask) ? 1 : 0; } static __inline__ int test_bit(int nr, const volatile unsigned long *address) { - unsigned long mask; - const unsigned long *addr = (const unsigned long *)address; - - addr += (nr >> SHIFT_PER_LONG); - mask = 1L << CHOP_SHIFTCOUNT(nr); + unsigned long mask = 1UL << CHOP_SHIFTCOUNT(nr); + const unsigned long *addr = (const unsigned long *)address + (nr >> SHIFT_PER_LONG); return !!(*addr & mask); } @@ -229,7 +193,7 @@ static __inline__ unsigned long __ffs(unsigned long x) unsigned long ret; __asm__( -#if BITS_PER_LONG > 32 +#ifdef __LP64__ " ldi 63,%1\n" " extrd,u,*<> %0,63,32,%%r0\n" " extrd,u,*TR %0,31,32,%0\n" /* move top 32-bits down */ @@ -304,14 +268,7 @@ static __inline__ int fls(int x) * hweightN: returns the hamming weight (i.e. 
the number * of bits set) of a N-bit word */ -#define hweight64(x) \ -({ \ - unsigned long __x = (x); \ - unsigned int __w; \ - __w = generic_hweight32((unsigned int) __x); \ - __w += generic_hweight32((unsigned int) (__x>>32)); \ - __w; \ -}) +#define hweight64(x) generic_hweight64(x) #define hweight32(x) generic_hweight32(x) #define hweight16(x) generic_hweight16(x) #define hweight8(x) generic_hweight8(x) @@ -324,7 +281,13 @@ static __inline__ int fls(int x) */ static inline int sched_find_first_bit(const unsigned long *b) { -#ifndef __LP64__ +#ifdef __LP64__ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return __ffs(b[1]) + 64; + return __ffs(b[2]) + 128; +#else if (unlikely(b[0])) return __ffs(b[0]); if (unlikely(b[1])) @@ -334,14 +297,6 @@ static inline int sched_find_first_bit(const unsigned long *b) if (b[3]) return __ffs(b[3]) + 96; return __ffs(b[4]) + 128; -#else - if (unlikely(b[0])) - return __ffs(b[0]); - if (unlikely(((unsigned int)b[1]))) - return __ffs(b[1]) + 64; - if (b[1] >> 32) - return __ffs(b[1] >> 32) + 96; - return __ffs(b[2]) + 128; #endif } @@ -391,7 +346,7 @@ found_middle: static __inline__ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - const unsigned long *p = addr + (offset >> 6); + const unsigned long *p = addr + (offset >> SHIFT_PER_LONG); unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; @@ -445,71 +400,90 @@ found_middle: * test_and_{set,clear}_bit guarantee atomicity without * disabling interrupts. 
*/ -#ifdef __LP64__ -#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x38, (unsigned long *)addr) -#define ext2_set_bit_atomic(l,nr,addr) test_and_set_bit((nr) ^ 0x38, (unsigned long *)addr) -#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x38, (unsigned long *)addr) -#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x38, (unsigned long *)addr) -#else -#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, (unsigned long *)addr) -#define ext2_set_bit_atomic(l,nr,addr) test_and_set_bit((nr) ^ 0x18, (unsigned long *)addr) -#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)addr) -#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x18, (unsigned long *)addr) -#endif -#endif /* __KERNEL__ */ +/* '3' is bits per byte */ +#define LE_BYTE_ADDR ((sizeof(unsigned long) - 1) << 3) -static __inline__ int ext2_test_bit(int nr, __const__ void * addr) -{ - __const__ unsigned char *ADDR = (__const__ unsigned char *) addr; +#define ext2_test_bit(nr, addr) \ + test_bit((nr) ^ LE_BYTE_ADDR, (unsigned long *)addr) +#define ext2_set_bit(nr, addr) \ + __test_and_set_bit((nr) ^ LE_BYTE_ADDR, (unsigned long *)addr) +#define ext2_clear_bit(nr, addr) \ + __test_and_clear_bit((nr) ^ LE_BYTE_ADDR, (unsigned long *)addr) - return (ADDR[nr >> 3] >> (nr & 7)) & 1; -} +#define ext2_set_bit_atomic(l,nr,addr) \ + test_and_set_bit((nr) ^ LE_BYTE_ADDR, (unsigned long *)addr) +#define ext2_clear_bit_atomic(l,nr,addr) \ + test_and_clear_bit( (nr) ^ LE_BYTE_ADDR, (unsigned long *)addr) + +#endif /* __KERNEL__ */ -/* - * This implementation of ext2_find_{first,next}_zero_bit was stolen from - * Linus' asm-alpha/bitops.h and modified for a big-endian machine. 
- */ #define ext2_find_first_zero_bit(addr, size) \ - ext2_find_next_zero_bit((addr), (size), 0) + ext2_find_next_zero_bit((addr), (size), 0) -extern __inline__ unsigned long ext2_find_next_zero_bit(void *addr, - unsigned long size, unsigned long offset) +/* include/linux/byteorder does not support "unsigned long" type */ +static inline unsigned long ext2_swabp(unsigned long * x) { - unsigned int *p = ((unsigned int *) addr) + (offset >> 5); - unsigned int result = offset & ~31UL; - unsigned int tmp; +#ifdef __LP64__ + return (unsigned long) __swab64p((u64 *) x); +#else + return (unsigned long) __swab32p((u32 *) x); +#endif +} + +/* include/linux/byteorder doesn't support "unsigned long" type */ +static inline unsigned long ext2_swab(unsigned long y) +{ +#ifdef __LP64__ + return (unsigned long) __swab64((u64) y); +#else + return (unsigned long) __swab32((u32) y); +#endif +} + +static __inline__ unsigned long ext2_find_next_zero_bit(void *addr, unsigned long size, unsigned long offset) +{ + unsigned long *p = (unsigned long *) addr + (offset >> SHIFT_PER_LONG); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; if (offset >= size) return size; size -= result; - offset &= 31UL; + offset &= (BITS_PER_LONG - 1UL); if (offset) { - tmp = cpu_to_le32p(p++); - tmp |= ~0UL >> (32-offset); - if (size < 32) + tmp = ext2_swabp(p++); + tmp |= (~0UL >> (BITS_PER_LONG - offset)); + if (size < BITS_PER_LONG) goto found_first; - if (tmp != ~0U) + if (~tmp) goto found_middle; - size -= 32; - result += 32; + size -= BITS_PER_LONG; + result += BITS_PER_LONG; } - while (size >= 32) { - if ((tmp = cpu_to_le32p(p++)) != ~0U) - goto found_middle; - result += 32; - size -= 32; + + while (size & ~(BITS_PER_LONG - 1)) { + if (~(tmp = *(p++))) + goto found_middle_swap; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; } if (!size) return result; - tmp = cpu_to_le32p(p); + tmp = ext2_swabp(p); found_first: - tmp |= ~0U << size; + tmp |= ~0UL << size; + if (tmp == 
~0UL) /* Are any bits zero? */ + return result + size; /* Nope. Skip ffz */ found_middle: return result + ffz(tmp); + +found_middle_swap: + return result + ffz(ext2_swab(tmp)); } + /* Bitmap functions for the minix filesystem. */ #define minix_test_and_set_bit(nr,addr) ext2_set_bit(nr,addr) #define minix_set_bit(nr,addr) ((void)ext2_set_bit(nr,addr)) diff --git a/include/asm-parisc/types.h b/include/asm-parisc/types.h index d21b9d0d63..34fdce361a 100644 --- a/include/asm-parisc/types.h +++ b/include/asm-parisc/types.h @@ -33,8 +33,10 @@ typedef unsigned long long __u64; #ifdef __LP64__ #define BITS_PER_LONG 64 +#define SHIFT_PER_LONG 6 #else #define BITS_PER_LONG 32 +#define SHIFT_PER_LONG 5 #endif #ifndef __ASSEMBLY__