powerpc/64s: Improve RFI L1-D cache flush fallback
author Nicholas Piggin <npiggin@gmail.com>
Sat, 26 May 2018 04:27:27 +0000 (14:27 +1000)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 30 May 2018 05:51:50 +0000 (07:51 +0200)
commit bdcb1aefc5b3f7d0f1dc8b02673602bca2ff7a4b upstream.

The fallback RFI flush is used when firmware does not provide a way
to flush the cache. It's a "displacement flush" that evicts useful
data by displacing it with an uninteresting buffer.

The flush has to take care to work with implementation specific cache
replacement policies, so the recipe has been in flux. The initial
slow but conservative approach is to touch all lines of a congruence
class, with dependencies between each load. It has since been
determined that a linear pattern of loads without dependencies is
sufficient, and is significantly faster.

Measuring the speed of a null syscall with RFI fallback flush enabled
gives the relative improvement:

P8 - 1.83x
P9 - 1.75x

The flush also becomes simpler and more adaptable to different cache
geometries.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
arch/powerpc/include/asm/paca.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/setup_64.c
arch/powerpc/xmon/xmon.c

index b8366df50d1956903e6d80d0d2c3a93043f7360e..e6bd59353e40bbf41b12bd3462725d4a94177fda 100644 (file)
@@ -238,8 +238,7 @@ struct paca_struct {
         */
        u64 exrfi[EX_SIZE] __aligned(0x80);
        void *rfi_flush_fallback_area;
-       u64 l1d_flush_congruence;
-       u64 l1d_flush_sets;
+       u64 l1d_flush_size;
 #endif
 };
 
index 748cdc4bb89ab41488e6d8f49ffc6baeb2998a0f..2e5ea300258a0673453e9739c5382c487719e09b 100644 (file)
@@ -239,8 +239,7 @@ int main(void)
        OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
        OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area);
        OFFSET(PACA_EXRFI, paca_struct, exrfi);
-       OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence);
-       OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets);
+       OFFSET(PACA_L1D_FLUSH_SIZE, paca_struct, l1d_flush_size);
 
 #endif
        OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
index f9ca4bb3d48ea14ff1652e7b9cec381a8cd57581..feba0a8d040ed616008d5d6f53bd6c52d077f1e8 100644 (file)
@@ -1440,39 +1440,37 @@ TRAMP_REAL_BEGIN(rfi_flush_fallback)
        std     r9,PACA_EXRFI+EX_R9(r13)
        std     r10,PACA_EXRFI+EX_R10(r13)
        std     r11,PACA_EXRFI+EX_R11(r13)
-       std     r12,PACA_EXRFI+EX_R12(r13)
-       std     r8,PACA_EXRFI+EX_R13(r13)
        mfctr   r9
        ld      r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
-       ld      r11,PACA_L1D_FLUSH_SETS(r13)
-       ld      r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
-       /*
-        * The load adresses are at staggered offsets within cachelines,
-        * which suits some pipelines better (on others it should not
-        * hurt).
-        */
-       addi    r12,r12,8
+       ld      r11,PACA_L1D_FLUSH_SIZE(r13)
+       srdi    r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
        mtctr   r11
        DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
 
        /* order ld/st prior to dcbt stop all streams with flushing */
        sync
-1:     li      r8,0
-       .rept   8 /* 8-way set associative */
-       ldx     r11,r10,r8
-       add     r8,r8,r12
-       xor     r11,r11,r11     // Ensure r11 is 0 even if fallback area is not
-       add     r8,r8,r11       // Add 0, this creates a dependency on the ldx
-       .endr
-       addi    r10,r10,128 /* 128 byte cache line */
+
+       /*
+        * The load addresses are at staggered offsets within cachelines,
+        * which suits some pipelines better (on others it should not
+        * hurt).
+        */
+1:
+       ld      r11,(0x80 + 8)*0(r10)
+       ld      r11,(0x80 + 8)*1(r10)
+       ld      r11,(0x80 + 8)*2(r10)
+       ld      r11,(0x80 + 8)*3(r10)
+       ld      r11,(0x80 + 8)*4(r10)
+       ld      r11,(0x80 + 8)*5(r10)
+       ld      r11,(0x80 + 8)*6(r10)
+       ld      r11,(0x80 + 8)*7(r10)
+       addi    r10,r10,0x80*8
        bdnz    1b
 
        mtctr   r9
        ld      r9,PACA_EXRFI+EX_R9(r13)
        ld      r10,PACA_EXRFI+EX_R10(r13)
        ld      r11,PACA_EXRFI+EX_R11(r13)
-       ld      r12,PACA_EXRFI+EX_R12(r13)
-       ld      r8,PACA_EXRFI+EX_R13(r13)
        GET_SCRATCH0(r13);
        rfid
 
@@ -1482,39 +1480,37 @@ TRAMP_REAL_BEGIN(hrfi_flush_fallback)
        std     r9,PACA_EXRFI+EX_R9(r13)
        std     r10,PACA_EXRFI+EX_R10(r13)
        std     r11,PACA_EXRFI+EX_R11(r13)
-       std     r12,PACA_EXRFI+EX_R12(r13)
-       std     r8,PACA_EXRFI+EX_R13(r13)
        mfctr   r9
        ld      r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
-       ld      r11,PACA_L1D_FLUSH_SETS(r13)
-       ld      r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
-       /*
-        * The load adresses are at staggered offsets within cachelines,
-        * which suits some pipelines better (on others it should not
-        * hurt).
-        */
-       addi    r12,r12,8
+       ld      r11,PACA_L1D_FLUSH_SIZE(r13)
+       srdi    r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
        mtctr   r11
        DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
 
        /* order ld/st prior to dcbt stop all streams with flushing */
        sync
-1:     li      r8,0
-       .rept   8 /* 8-way set associative */
-       ldx     r11,r10,r8
-       add     r8,r8,r12
-       xor     r11,r11,r11     // Ensure r11 is 0 even if fallback area is not
-       add     r8,r8,r11       // Add 0, this creates a dependency on the ldx
-       .endr
-       addi    r10,r10,128 /* 128 byte cache line */
+
+       /*
+        * The load addresses are at staggered offsets within cachelines,
+        * which suits some pipelines better (on others it should not
+        * hurt).
+        */
+1:
+       ld      r11,(0x80 + 8)*0(r10)
+       ld      r11,(0x80 + 8)*1(r10)
+       ld      r11,(0x80 + 8)*2(r10)
+       ld      r11,(0x80 + 8)*3(r10)
+       ld      r11,(0x80 + 8)*4(r10)
+       ld      r11,(0x80 + 8)*5(r10)
+       ld      r11,(0x80 + 8)*6(r10)
+       ld      r11,(0x80 + 8)*7(r10)
+       addi    r10,r10,0x80*8
        bdnz    1b
 
        mtctr   r9
        ld      r9,PACA_EXRFI+EX_R9(r13)
        ld      r10,PACA_EXRFI+EX_R10(r13)
        ld      r11,PACA_EXRFI+EX_R11(r13)
-       ld      r12,PACA_EXRFI+EX_R12(r13)
-       ld      r8,PACA_EXRFI+EX_R13(r13)
        GET_SCRATCH0(r13);
        hrfid
 
index 9527a4c6cbc27caa2f03c95c686512f9abe71fa1..333c64a794eb5e04d1cf0c0e0bf8bee63596abbe 100644 (file)
@@ -851,19 +851,8 @@ static void init_fallback_flush(void)
        memset(l1d_flush_fallback_area, 0, l1d_size * 2);
 
        for_each_possible_cpu(cpu) {
-               /*
-                * The fallback flush is currently coded for 8-way
-                * associativity. Different associativity is possible, but it
-                * will be treated as 8-way and may not evict the lines as
-                * effectively.
-                *
-                * 128 byte lines are mandatory.
-                */
-               u64 c = l1d_size / 8;
-
                paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area;
-               paca[cpu].l1d_flush_congruence = c;
-               paca[cpu].l1d_flush_sets = c / 128;
+               paca[cpu].l1d_flush_size = l1d_size;
        }
 }
 
index 2c8b325591cc256fedf59701e32cc90bbb498432..a5938fadd031ebe40635f4876ea3729c769d0d2a 100644 (file)
@@ -2348,6 +2348,8 @@ static void dump_one_paca(int cpu)
        DUMP(p, slb_cache_ptr, "x");
        for (i = 0; i < SLB_CACHE_ENTRIES; i++)
                printf(" slb_cache[%d]:        = 0x%016lx\n", i, p->slb_cache[i]);
+
+       DUMP(p, rfi_flush_fallback_area, "px");
 #endif
        DUMP(p, dscr_default, "llx");
 #ifdef CONFIG_PPC_BOOK3E