x86/hweight: Get rid of the special calling convention
author Borislav Petkov <bp@suse.de>
Mon, 30 May 2016 10:56:27 +0000 (12:56 +0200)
committer Ingo Molnar <mingo@kernel.org>
Wed, 8 Jun 2016 13:01:02 +0000 (15:01 +0200)
People complained about ARCH_HWEIGHT_CFLAGS and how it throws a wrench
into kcov, LTO, etc. experiments.

Add asm versions for __sw_hweight{32,64}() and do explicit saving and
restoring of clobbered registers. This gets rid of the special calling
convention. We get to call those functions on !X86_FEATURE_POPCNT CPUs.

We still need to hardcode POPCNT and register operands as some old gas
versions which we support do not know about POPCNT.

Btw, remove redundant REX prefix from 32-bit POPCNT because alternatives
can do padding now.

Suggested-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1464605787-20603-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/Kconfig
arch/x86/include/asm/arch_hweight.h
arch/x86/kernel/i386_ksyms_32.c
arch/x86/kernel/x8664_ksyms_64.c
arch/x86/lib/Makefile
arch/x86/lib/hweight.S [new file with mode: 0644]
lib/Makefile
lib/hweight.c

index 0a7b885964baccd540f2c9e600dce014398bd987..729d41d9ced389cb218feec8d02eb6feee6aae78 100644 (file)
@@ -294,11 +294,6 @@ config X86_32_LAZY_GS
        def_bool y
        depends on X86_32 && !CC_STACKPROTECTOR
 
-config ARCH_HWEIGHT_CFLAGS
-       string
-       default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
-       default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
-
 config ARCH_SUPPORTS_UPROBES
        def_bool y
 
index 02e799fa43d1b19c878290f6424e1b2f7293074d..e7cd63175de443f68aa536237a82a92120ca08f3 100644 (file)
@@ -4,8 +4,8 @@
 #include <asm/cpufeatures.h>
 
 #ifdef CONFIG_64BIT
-/* popcnt %edi, %eax -- redundant REX prefix for alignment */
-#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
+/* popcnt %edi, %eax */
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7"
 /* popcnt %rdi, %rax */
 #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
 #define REG_IN "D"
 #define REG_OUT "a"
 #endif
 
-/*
- * __sw_hweightXX are called from within the alternatives below
- * and callee-clobbered registers need to be taken care of. See
- * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
- * compiler switches.
- */
+#define __HAVE_ARCH_SW_HWEIGHT
+
 static __always_inline unsigned int __arch_hweight32(unsigned int w)
 {
-       unsigned int res = 0;
+       unsigned int res;
 
        asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
-                    : "="REG_OUT (res)
-                    : REG_IN (w));
+                        : "="REG_OUT (res)
+                        : REG_IN (w));
 
        return res;
 }
@@ -53,11 +49,11 @@ static inline unsigned long __arch_hweight64(__u64 w)
 #else
 static __always_inline unsigned long __arch_hweight64(__u64 w)
 {
-       unsigned long res = 0;
+       unsigned long res;
 
        asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
-                    : "="REG_OUT (res)
-                    : REG_IN (w));
+                        : "="REG_OUT (res)
+                        : REG_IN (w));
 
        return res;
 }
index 64341aa485ae1ad6ab62c07984c9a70dadd44c64..d40ee8a38fed3568d39410d75101111251ace73a 100644 (file)
@@ -42,3 +42,5 @@ EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(___preempt_schedule);
 EXPORT_SYMBOL(___preempt_schedule_notrace);
 #endif
+
+EXPORT_SYMBOL(__sw_hweight32);
index cd05942bc9189452d8ec7c0cebd96431cf8dd394..f1aebfb49c36688b9287a9f444a223750c3170b8 100644 (file)
@@ -44,6 +44,9 @@ EXPORT_SYMBOL(clear_page);
 
 EXPORT_SYMBOL(csum_partial);
 
+EXPORT_SYMBOL(__sw_hweight32);
+EXPORT_SYMBOL(__sw_hweight64);
+
 /*
  * Export string functions. We normally rely on gcc builtin for most of these,
  * but gcc sometimes decides not to inline them.
index 72a576752a7ec062f92e433362306e7a6b4648de..ec969cc3eb20e9c03c24b90130cbe78c5969e9a4 100644 (file)
@@ -25,7 +25,7 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o
+obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
new file mode 100644 (file)
index 0000000..02de3d7
--- /dev/null
@@ -0,0 +1,94 @@
+#include <linux/linkage.h>
+
+#include <asm/asm.h>
+
+/*
+ * unsigned int __sw_hweight32(unsigned int w)
+ *
+ * Software population count of a 32-bit value, returned in %eax.
+ * w arrives in %edi on 64-bit (copied to %eax below) and in %eax on 32-bit.
+ * No general-purpose register other than %eax is modified: the scratch
+ * register %edx/%rdx is saved and restored around the computation.
+ */
+ENTRY(__sw_hweight32)
+
+#ifdef CONFIG_X86_64
+       movl %edi, %eax                         # w
+#endif
+       __ASM_SIZE(push,) %__ASM_REG(dx)
+       movl %eax, %edx                         # w -> t
+       shrl %edx                               # t >>= 1
+       andl $0x55555555, %edx                  # t &= 0x55555555
+       subl %edx, %eax                         # w -= t
+
+       movl %eax, %edx                         # w -> t
+       shrl $2, %eax                           # w_tmp >>= 2
+       andl $0x33333333, %edx                  # t     &= 0x33333333
+       andl $0x33333333, %eax                  # w_tmp &= 0x33333333
+       addl %edx, %eax                         # w = w_tmp + t
+
+       movl %eax, %edx                         # w -> t
+       shrl $4, %edx                           # t >>= 4
+       addl %edx, %eax                         # w_tmp += t
+       andl  $0x0f0f0f0f, %eax                 # w_tmp &= 0x0f0f0f0f
+       imull $0x01010101, %eax, %eax           # w_tmp *= 0x01010101
+       shrl $24, %eax                          # w = w_tmp >> 24
+       __ASM_SIZE(pop,) %__ASM_REG(dx)
+       ret
+ENDPROC(__sw_hweight32)
+
+/*
+ * unsigned long __sw_hweight64(__u64 w)
+ *
+ * Software population count of a 64-bit value.
+ * 64-bit: w arrives in %rdi, result is returned in %rax.
+ * 32-bit: w arrives in (%eax,%edx), result is returned in %eax.
+ *
+ * The inline asm callers declare only the result register as an output, so
+ * every other register written here - including the input %rdi - has to be
+ * saved and restored.
+ */
+ENTRY(__sw_hweight64)
+#ifdef CONFIG_X86_64
+       pushq   %rdi                            # input reg is modified below
+       pushq   %rdx
+
+       movq    %rdi, %rdx                      # w -> t
+       movabsq $0x5555555555555555, %rax
+       shrq    %rdx                            # t >>= 1
+       andq    %rdx, %rax                      # t &= 0x5555555555555555
+       movabsq $0x3333333333333333, %rdx
+       subq    %rax, %rdi                      # w -= t
+
+       movq    %rdi, %rax                      # w -> t
+       shrq    $2, %rdi                        # w_tmp >>= 2
+       andq    %rdx, %rax                      # t     &= 0x3333333333333333
+       andq    %rdi, %rdx                      # w_tmp &= 0x3333333333333333
+       addq    %rdx, %rax                      # w = w_tmp + t
+
+       movq    %rax, %rdx                      # w -> t
+       shrq    $4, %rdx                        # t >>= 4
+       addq    %rdx, %rax                      # w_tmp += t
+       movabsq $0x0f0f0f0f0f0f0f0f, %rdx
+       andq    %rdx, %rax                      # w_tmp &= 0x0f0f0f0f0f0f0f0f
+       movabsq $0x0101010101010101, %rdx
+       imulq   %rdx, %rax                      # w_tmp *= 0x0101010101010101
+       shrq    $56, %rax                       # w = w_tmp >> 56
+
+       popq    %rdx
+       popq    %rdi
+       ret
+#else /* CONFIG_X86_32 */
+       /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
+       pushl   %ecx
+
+       call    __sw_hweight32
+       movl    %eax, %ecx                      # stash away result
+       movl    %edx, %eax                      # second part of input
+       call    __sw_hweight32
+       addl    %ecx, %eax                      # result
+
+       popl    %ecx
+       ret
+#endif
+ENDPROC(__sw_hweight64)
index ff6a7a6c63951f080a655df67f7ad0524a03201a..07d06a8b9788c5390f32b39ed165a08e502d673d 100644 (file)
@@ -15,9 +15,6 @@ KCOV_INSTRUMENT_rbtree.o := n
 KCOV_INSTRUMENT_list_debug.o := n
 KCOV_INSTRUMENT_debugobjects.o := n
 KCOV_INSTRUMENT_dynamic_debug.o := n
-# Kernel does not boot if we instrument this file as it uses custom calling
-# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
-KCOV_INSTRUMENT_hweight.o := n
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
         rbtree.o radix-tree.o dump_stack.o timerqueue.o\
@@ -74,8 +71,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
-GCOV_PROFILE_hweight.o := n
-CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 
 obj-$(CONFIG_BTREE) += btree.o
index 9a5c1f2215585f35a8eea43eedd2566a6f459231..43273a7d83cf41621221354bc0d1b5680027780e 100644 (file)
@@ -9,6 +9,7 @@
  * The Hamming Weight of a number is the total number of bits set in it.
  */
 
+#ifndef __HAVE_ARCH_SW_HWEIGHT
 unsigned int __sw_hweight32(unsigned int w)
 {
 #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
@@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w)
 #endif
 }
 EXPORT_SYMBOL(__sw_hweight32);
+#endif
 
 unsigned int __sw_hweight16(unsigned int w)
 {
@@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w)
 }
 EXPORT_SYMBOL(__sw_hweight8);
 
+#ifndef __HAVE_ARCH_SW_HWEIGHT
 unsigned long __sw_hweight64(__u64 w)
 {
 #if BITS_PER_LONG == 32
@@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w)
 #endif
 }
 EXPORT_SYMBOL(__sw_hweight64);
+#endif