This patch change the kernel VSID range so that we limit VSID_BITS to 37.
This enables us to support 64TB with 65 bit VA (37+28). Without this patch
we have boot hangs on platforms that only support 65 bit VA.
With this patch we now have proto vsid generated as below:
We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated
from mmu context id and effective segment id of the address.
For user processes max context id is limited to ((1ul << 19) - 5)
for kernel space, we use the top 4 context ids to map address as below
0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ]
0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ]
0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ]
0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ]
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Tested-by: Geoff Levand <geoff@infradead.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: <stable@vger.kernel.org> [v3.8]
/*
* VSID allocation (256MB segment)
*
- * We first generate a 38-bit "proto-VSID". For kernel addresses this
- * is equal to the ESID | 1 << 37, for user addresses it is:
- * (context << USER_ESID_BITS) | (esid & ((1U << USER_ESID_BITS) - 1)
+ * We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated
+ * from mmu context id and effective segment id of the address.
*
- * This splits the proto-VSID into the below range
- * 0 - (2^(CONTEXT_BITS + USER_ESID_BITS) - 1) : User proto-VSID range
- * 2^(CONTEXT_BITS + USER_ESID_BITS) - 2^(VSID_BITS) : Kernel proto-VSID range
- *
- * We also have CONTEXT_BITS + USER_ESID_BITS = VSID_BITS - 1
- * That is, we assign half of the space to user processes and half
- * to the kernel.
+ * For user processes max context id is limited to ((1ul << 19) - 5)
+ * for kernel space, we use the top 4 context ids to map address as below
+ * NOTE: each context only support 64TB now.
+ * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ]
+ * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ]
+ * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ]
+ * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ]
*
* The proto-VSIDs are then scrambled into real VSIDs with the
* multiplicative hash:
* VSID_MULTIPLIER is prime, so in particular it is
* co-prime to VSID_MODULUS, making this a 1:1 scrambling function.
* Because the modulus is 2^n-1 we can compute it efficiently without
- * a divide or extra multiply (see below).
- *
- * This scheme has several advantages over older methods:
+ * a divide or extra multiply (see below). The scramble function gives
+ * robust scattering in the hash table (at least based on some initial
+ * results).
*
- * - We have VSIDs allocated for every kernel address
- * (i.e. everything above 0xC000000000000000), except the very top
- * segment, which simplifies several things.
+ * We also consider VSID 0 special. We use VSID 0 for slb entries mapping
+ * bad address. This enables us to consolidate bad address handling in
+ * hash_page.
*
- * - We allow for USER_ESID_BITS significant bits of ESID and
- * CONTEXT_BITS bits of context for user addresses.
- * i.e. 64T (46 bits) of address space for up to half a million contexts.
- *
- * - The scramble function gives robust scattering in the hash
- * table (at least based on some initial results). The previous
- * method was more susceptible to pathological cases giving excessive
- * hash collisions.
+ * We also need to avoid the last segment of the last context, because that
+ * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
+ * because of the modulo operation in vsid scramble. But the vmemmap
+ * (which is what uses region 0xf) will never be close to 64TB in size
+ * (it's 56 bytes per page of system memory).
*/
#define CONTEXT_BITS 19
#define USER_ESID_BITS 18
#define USER_ESID_BITS_1T 6
+/*
+ * 256MB segment
+ * The proto-VSID space has 2^(CONTEX_BITS + USER_ESID_BITS) - 1 segments
+ * available for user + kernel mapping. The top 4 contexts are used for
+ * kernel mapping. Each segment contains 2^28 bytes. Each
+ * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts
+ * (19 == 37 + 28 - 46).
+ */
+#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 5)
+
/*
* This should be computed such that protovosid * vsid_mulitplier
* doesn't overflow 64 bits. It should also be co-prime to vsid_modulus
*/
#define VSID_MULTIPLIER_256M ASM_CONST(12538073) /* 24-bit prime */
-#define VSID_BITS_256M (CONTEXT_BITS + USER_ESID_BITS + 1)
+#define VSID_BITS_256M (CONTEXT_BITS + USER_ESID_BITS)
#define VSID_MODULUS_256M ((1UL<<VSID_BITS_256M)-1)
#define VSID_MULTIPLIER_1T ASM_CONST(12538073) /* 24-bit prime */
-#define VSID_BITS_1T (CONTEXT_BITS + USER_ESID_BITS_1T + 1)
+#define VSID_BITS_1T (CONTEXT_BITS + USER_ESID_BITS_1T)
#define VSID_MODULUS_1T ((1UL<<VSID_BITS_1T)-1)
srdi rx,rt,VSID_BITS_##size; \
clrldi rt,rt,(64-VSID_BITS_##size); \
add rt,rt,rx; /* add high and low bits */ \
- /* Now, r3 == VSID (mod 2^36-1), and lies between 0 and \
+ /* NOTE: explanation based on VSID_BITS_##size = 36 \
+ * Now, r3 == VSID (mod 2^36-1), and lies between 0 and \
* 2^36-1+2^28-1. That in particular means that if r3 >= \
* 2^36-1, then r3+1 has the 2^36 bit set. So, if r3+1 has \
* the bit clear, r3 already has the answer we want, if it \
})
#endif /* 1 */
-/*
- * This is only valid for addresses >= PAGE_OFFSET
- * The proto-VSID space is divided into two class
- * User: 0 to 2^(CONTEXT_BITS + USER_ESID_BITS) -1
- * kernel: 2^(CONTEXT_BITS + USER_ESID_BITS) to 2^(VSID_BITS) - 1
- *
- * With KERNEL_START at 0xc000000000000000, the proto vsid for
- * the kernel ends up with 0xc00000000 (36 bits). With 64TB
- * support we need to have kernel proto-VSID in the
- * [2^37 to 2^38 - 1] range due to the increased USER_ESID_BITS.
- */
-static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
-{
- unsigned long proto_vsid;
- /*
- * We need to make sure proto_vsid for the kernel is
- * >= 2^(CONTEXT_BITS + USER_ESID_BITS[_1T])
- */
- if (ssize == MMU_SEGSIZE_256M) {
- proto_vsid = ea >> SID_SHIFT;
- proto_vsid |= (1UL << (CONTEXT_BITS + USER_ESID_BITS));
- return vsid_scramble(proto_vsid, 256M);
- }
- proto_vsid = ea >> SID_SHIFT_1T;
- proto_vsid |= (1UL << (CONTEXT_BITS + USER_ESID_BITS_1T));
- return vsid_scramble(proto_vsid, 1T);
-}
-
/* Returns the segment size indicator for a user address */
static inline int user_segment_size(unsigned long addr)
{
return MMU_SEGSIZE_256M;
}
-/* This is only valid for user addresses (which are below 2^44) */
static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
int ssize)
{
+ /*
+ * Bad address. We return VSID 0 for that
+ */
+ if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
+ return 0;
+
if (ssize == MMU_SEGSIZE_256M)
return vsid_scramble((context << USER_ESID_BITS)
| (ea >> SID_SHIFT), 256M);
| (ea >> SID_SHIFT_1T), 1T);
}
+/*
+ * This is only valid for addresses >= PAGE_OFFSET
+ *
+ * For kernel space, we use the top 4 context ids to map address as below
+ * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ]
+ * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ]
+ * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ]
+ * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ]
+ */
+static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
+{
+ unsigned long context;
+
+ /*
+ * kernel take the top 4 context from the available range
+ */
+ context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
+ return get_vsid(context, ea, ssize);
+}
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_MMU_HASH64_H_ */
_GLOBAL(do_stab_bolted)
stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */
std r11,PACA_EXSLB+EX_SRR0(r13) /* save SRR0 in exc. frame */
+ mfspr r11,SPRN_DAR /* ea */
+ /*
+ * check for bad kernel/user address
+ * (ea & ~REGION_MASK) >= PGTABLE_RANGE
+ */
+ rldicr. r9,r11,4,(63 - 46 - 4)
+ li r9,0 /* VSID = 0 for bad address */
+ bne- 0f
+
+ /*
+ * Calculate VSID:
+ * This is the kernel vsid, we take the top for context from
+ * the range. context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+ * Here we know that (ea >> 60) == 0xc
+ */
+ lis r9,(MAX_USER_CONTEXT + 1)@ha
+ addi r9,r9,(MAX_USER_CONTEXT + 1)@l
+
+ srdi r10,r11,SID_SHIFT
+ rldimi r10,r9,USER_ESID_BITS,0 /* proto vsid */
+ ASM_VSID_SCRAMBLE(r10, r9, 256M)
+ rldic r9,r10,12,16 /* r9 = vsid << 12 */
+
+0:
/* Hash to the primary group */
ld r10,PACASTABVIRT(r13)
- mfspr r11,SPRN_DAR
- srdi r11,r11,28
+ srdi r11,r11,SID_SHIFT
rldimi r10,r11,7,52 /* r10 = first ste of the group */
- /* Calculate VSID */
- /* This is a kernel address, so protovsid = ESID | 1 << 37 */
- li r9,0x1
- rldimi r11,r9,(CONTEXT_BITS + USER_ESID_BITS),0
- ASM_VSID_SCRAMBLE(r11, r9, 256M)
- rldic r9,r11,12,16 /* r9 = vsid << 12 */
-
/* Search the primary group for a free entry */
1: ld r11,0(r10) /* Test valid bit of the current ste */
andi. r11,r11,0x80
unsigned long vpn = hpt_vpn(vaddr, vsid, ssize);
unsigned long tprot = prot;
+ /*
+ * If we hit a bad address return error.
+ */
+ if (!vsid)
+ return -1;
/* Make kernel text executable */
if (overlaps_kernel_text(vaddr, vaddr + step))
tprot &= ~HPTE_R_N;
DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
ea, access, trap);
- if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) {
- DBG_LOW(" out of pgtable range !\n");
- return 1;
- }
-
/* Get region & vsid */
switch (REGION_ID(ea)) {
case USER_REGION_ID:
}
DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
+ /* Bad address. */
+ if (!vsid) {
+ DBG_LOW("Bad address!\n");
+ return 1;
+ }
/* Get pgdir */
pgdir = mm->pgd;
if (pgdir == NULL)
/* Get VSID */
ssize = user_segment_size(ea);
vsid = get_vsid(mm->context.id, ea, ssize);
+ if (!vsid)
+ return;
/* Hash doesn't like irqs */
local_irq_save(flags);
hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+ /* Don't create HPTE entries for bad address */
+ if (!vsid)
+ return;
ret = ppc_md.hpte_insert(hpteg, vpn, __pa(vaddr),
mode, HPTE_V_BOLTED,
mmu_linear_psize, mmu_kernel_ssize);
static DEFINE_SPINLOCK(mmu_context_lock);
static DEFINE_IDA(mmu_context_ida);
-/*
- * 256MB segment
- * The proto-VSID space has 2^(CONTEX_BITS + USER_ESID_BITS) - 1 segments
- * available for user mappings. Each segment contains 2^28 bytes. Each
- * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts
- * (19 == 37 + 28 - 46).
- */
-#define MAX_CONTEXT ((1UL << CONTEXT_BITS) - 1)
-
int __init_new_context(void)
{
int index;
else if (err)
return err;
- if (index > MAX_CONTEXT) {
+ if (index > MAX_USER_CONTEXT) {
spin_lock(&mmu_context_lock);
ida_remove(&mmu_context_ida, index);
spin_unlock(&mmu_context_lock);
* No other registers are examined or changed.
*/
_GLOBAL(slb_allocate_realmode)
- /* r3 = faulting address */
+ /*
+ * check for bad kernel/user address
+ * (ea & ~REGION_MASK) >= PGTABLE_RANGE
+ */
+ rldicr. r9,r3,4,(63 - 46 - 4)
+ bne- 8f
srdi r9,r3,60 /* get region */
- srdi r10,r3,28 /* get esid */
+ srdi r10,r3,SID_SHIFT /* get esid */
cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */
/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
*/
_GLOBAL(slb_miss_kernel_load_linear)
li r11,0
- li r9,0x1
/*
- * for 1T we shift 12 bits more. slb_finish_load_1T will do
- * the necessary adjustment
+ * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+ * r9 = region id.
*/
- rldimi r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
+ addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
+ addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+
+
BEGIN_FTR_SECTION
b slb_finish_load
END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
_GLOBAL(slb_miss_kernel_load_io)
li r11,0
6:
- li r9,0x1
/*
- * for 1T we shift 12 bits more. slb_finish_load_1T will do
- * the necessary adjustment
+ * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+ * r9 = region id.
*/
- rldimi r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
+ addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
+ addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+
BEGIN_FTR_SECTION
b slb_finish_load
END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
b slb_finish_load_1T
-0: /* user address: proto-VSID = context << 15 | ESID. First check
- * if the address is within the boundaries of the user region
- */
- srdi. r9,r10,USER_ESID_BITS
- bne- 8f /* invalid ea bits set */
-
-
+0:
/* when using slices, we extract the psize off the slice bitmaps
* and then we need to get the sllp encoding off the mmu_psize_defs
* array.
ld r9,PACACONTEXTID(r13)
BEGIN_FTR_SECTION
cmpldi r10,0x1000
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
- rldimi r10,r9,USER_ESID_BITS,0
-BEGIN_FTR_SECTION
bge slb_finish_load_1T
END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
b slb_finish_load
8: /* invalid EA */
li r10,0 /* BAD_VSID */
+ li r9,0 /* BAD_VSID */
li r11,SLB_VSID_USER /* flags don't much matter */
b slb_finish_load
/* get context to calculate proto-VSID */
ld r9,PACACONTEXTID(r13)
- rldimi r10,r9,USER_ESID_BITS,0
-
/* fall through slb_finish_load */
#endif /* __DISABLED__ */
/*
* Finish loading of an SLB entry and return
*
- * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
+ * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
*/
slb_finish_load:
+ rldimi r10,r9,USER_ESID_BITS,0
ASM_VSID_SCRAMBLE(r10,r9,256M)
/*
* bits above VSID_BITS_256M need to be ignored from r10
/*
* Finish loading of a 1T SLB entry (for the kernel linear mapping) and return.
*
- * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9
+ * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9
*/
slb_finish_load_1T:
- srdi r10,r10,40-28 /* get 1T ESID */
+ srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */
+ rldimi r10,r9,USER_ESID_BITS_1T,0
ASM_VSID_SCRAMBLE(r10,r9,1T)
/*
* bits above VSID_BITS_1T need to be ignored from r10
if (!is_kernel_addr(addr)) {
ssize = user_segment_size(addr);
vsid = get_vsid(mm->context.id, addr, ssize);
- WARN_ON(vsid == 0);
} else {
vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
ssize = mmu_kernel_ssize;
}
+ WARN_ON(vsid == 0);
vpn = hpt_vpn(addr, vsid, ssize);
rpte = __real_pte(__pte(pte), ptep);