powerpc: Add support for popcnt instructions
authorAnton Blanchard <anton@samba.org>
Thu, 12 Aug 2010 16:28:09 +0000 (16:28 +0000)
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>
Mon, 29 Nov 2010 04:48:17 +0000 (15:48 +1100)
POWER5 added popcntb, and POWER7 added popcntw and popcntd. As a first step
this patch does all the work out of line, but it would be nice to implement
them as inlines with an out of line fallback.

The performance issue with hweight was noticed when disabling SMT on a large
(192 thread) POWER7 box. The patch improves that testcase by about 8%.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
arch/powerpc/include/asm/bitops.h
arch/powerpc/include/asm/cputable.h
arch/powerpc/kernel/ppc_ksyms.c
arch/powerpc/lib/Makefile
arch/powerpc/lib/hweight_64.S [new file with mode: 0644]

index 30964ae2d096a9c754407d7fd33a6e96ab9c9692..8a7e9314c68a3b404713d3c282743d098fffb60f 100644 (file)
@@ -267,7 +267,16 @@ static __inline__ int fls64(__u64 x)
 #include <asm-generic/bitops/fls64.h>
 #endif /* __powerpc64__ */
 
+#ifdef CONFIG_PPC64
+unsigned int __arch_hweight8(unsigned int w);
+unsigned int __arch_hweight16(unsigned int w);
+unsigned int __arch_hweight32(unsigned int w);
+unsigned long __arch_hweight64(__u64 w);
+#include <asm-generic/bitops/const_hweight.h>
+#else
 #include <asm-generic/bitops/hweight.h>
+#endif
+
 #include <asm-generic/bitops/find.h>
 
 /* Little-endian versions */
index f3a1fdd9cf08b67ede4102dab01b8a353c3c826f..f0a211d969233d2c063136b1c7d2aa10b1f98d70 100644 (file)
@@ -199,6 +199,8 @@ extern const char *powerpc_base_platform;
 #define CPU_FTR_UNALIGNED_LD_STD       LONG_ASM_CONST(0x0080000000000000)
 #define CPU_FTR_ASYM_SMT               LONG_ASM_CONST(0x0100000000000000)
 #define CPU_FTR_STCX_CHECKS_ADDRESS    LONG_ASM_CONST(0x0200000000000000)
+#define CPU_FTR_POPCNTB                        LONG_ASM_CONST(0x0400000000000000)
+#define CPU_FTR_POPCNTD                        LONG_ASM_CONST(0x0800000000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -403,21 +405,22 @@ extern const char *powerpc_base_platform;
            CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
            CPU_FTR_MMCRA | CPU_FTR_SMT | \
            CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
-           CPU_FTR_PURR | CPU_FTR_STCX_CHECKS_ADDRESS)
+           CPU_FTR_PURR | CPU_FTR_STCX_CHECKS_ADDRESS | \
+           CPU_FTR_POPCNTB)
 #define CPU_FTRS_POWER6 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
            CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
            CPU_FTR_MMCRA | CPU_FTR_SMT | \
            CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
            CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
            CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \
-           CPU_FTR_STCX_CHECKS_ADDRESS)
+           CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB)
 #define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
            CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
            CPU_FTR_MMCRA | CPU_FTR_SMT | \
            CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
            CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
            CPU_FTR_DSCR | CPU_FTR_SAO  | CPU_FTR_ASYM_SMT | \
-           CPU_FTR_STCX_CHECKS_ADDRESS)
+           CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD)
 #define CPU_FTRS_CELL  (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
            CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
            CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
index ab3e392ac63c41fda474ce4857660bbd38e459bd..ef3ef566235e07fd9b5bf0107321a3ca52ae6dc5 100644 (file)
@@ -186,3 +186,10 @@ EXPORT_SYMBOL(__mtdcr);
 EXPORT_SYMBOL(__mfdcr);
 #endif
 EXPORT_SYMBOL(empty_zero_page);
+
+#ifdef CONFIG_PPC64
+EXPORT_SYMBOL(__arch_hweight8);
+EXPORT_SYMBOL(__arch_hweight16);
+EXPORT_SYMBOL(__arch_hweight32);
+EXPORT_SYMBOL(__arch_hweight64);
+#endif
index 889f2bc106dd86af018b894dab475d79638f2694..166a6a0ad54406858f66cbb1cdd093650ceb9678 100644 (file)
@@ -16,7 +16,7 @@ obj-$(CONFIG_HAS_IOMEM)       += devres.o
 
 obj-$(CONFIG_PPC64)    += copypage_64.o copyuser_64.o \
                           memcpy_64.o usercopy_64.o mem_64.o string.o \
-                          checksum_wrappers_64.o
+                          checksum_wrappers_64.o hweight_64.o
 obj-$(CONFIG_XMON)     += sstep.o ldstfp.o
 obj-$(CONFIG_KPROBES)  += sstep.o ldstfp.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)       += sstep.o ldstfp.o
diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S
new file mode 100644 (file)
index 0000000..ee2320b
--- /dev/null
@@ -0,0 +1,110 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2010
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+
+/* Note: This code relies on -mminimal-toc */
+
+_GLOBAL(__arch_hweight8)
+BEGIN_FTR_SECTION
+       b .__sw_hweight8
+       nop
+       nop
+FTR_SECTION_ELSE
+       popcntb r3,r3
+       clrldi  r3,r3,64-8
+       blr
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
+
+_GLOBAL(__arch_hweight16)
+BEGIN_FTR_SECTION
+       b .__sw_hweight16
+       nop
+       nop
+       nop
+       nop
+FTR_SECTION_ELSE
+  BEGIN_FTR_SECTION_NESTED(50)
+       popcntb r3,r3
+       srdi    r4,r3,8
+       add     r3,r4,r3
+       clrldi  r3,r3,64-8
+       blr
+  FTR_SECTION_ELSE_NESTED(50)
+       clrlwi  r3,r3,16
+       popcntw r3,r3
+       clrldi  r3,r3,64-8
+       blr
+  ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
+
+_GLOBAL(__arch_hweight32)
+BEGIN_FTR_SECTION
+       b .__sw_hweight32
+       nop
+       nop
+       nop
+       nop
+       nop
+       nop
+FTR_SECTION_ELSE
+  BEGIN_FTR_SECTION_NESTED(51)
+       popcntb r3,r3
+       srdi    r4,r3,16
+       add     r3,r4,r3
+       srdi    r4,r3,8
+       add     r3,r4,r3
+       clrldi  r3,r3,64-8
+       blr
+  FTR_SECTION_ELSE_NESTED(51)
+       popcntw r3,r3
+       clrldi  r3,r3,64-8
+       blr
+  ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
+
+_GLOBAL(__arch_hweight64)
+BEGIN_FTR_SECTION
+       b .__sw_hweight64
+       nop
+       nop
+       nop
+       nop
+       nop
+       nop
+       nop
+       nop
+FTR_SECTION_ELSE
+  BEGIN_FTR_SECTION_NESTED(52)
+       popcntb r3,r3
+       srdi    r4,r3,32
+       add     r3,r4,r3
+       srdi    r4,r3,16
+       add     r3,r4,r3
+       srdi    r4,r3,8
+       add     r3,r4,r3
+       clrldi  r3,r3,64-8
+       blr
+  FTR_SECTION_ELSE_NESTED(52)
+       popcntd r3,r3
+       clrldi  r3,r3,64-8
+       blr
+  ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)