return 0;
}
+/**
+ * radeon_vm_frag_ptes - add fragment information to PTEs
+ *
+ * @rdev: radeon_device pointer
+ * @ib: IB for the update
+ * @pe_start: first PTE to handle
+ * @pe_end: last PTE to handle
+ * @addr: addr those PTEs should point to
+ * @flags: hw mapping flags
+ *
+ * Global and local mutex must be locked!
+ */
+static void radeon_vm_frag_ptes(struct radeon_device *rdev,
+ struct radeon_ib *ib,
+ uint64_t pe_start, uint64_t pe_end,
+ uint64_t addr, uint32_t flags)
+{
+ /**
+ * The MC L1 TLB supports variable sized pages, based on a fragment
+ * field in the PTE. When this field is set to a non-zero value, page
+ * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
+ * flags are considered valid for all PTEs within the fragment range
+ * and corresponding mappings are assumed to be physically contiguous.
+ *
+ * The L1 TLB can store a single PTE for the whole fragment,
+ * significantly increasing the space available for translation
+ * caching. This leads to large improvements in throughput when the
+ * TLB is under pressure.
+ *
+ * The L2 TLB distributes small and large fragments into two
+ * asymmetric partitions. The large fragment cache is significantly
+ * larger. Thus, we try to use large fragments wherever possible.
+ * Userspace can support this by aligning virtual base address and
+ * allocation size to the fragment size.
+ */
+
+ /* NI is optimized for 256KB fragments, SI and newer for 64KB */
+ uint64_t frag_flags = rdev->family == CHIP_CAYMAN ?
+ R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB;
+ uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80;
+
+ uint64_t frag_start = ALIGN(pe_start, frag_align);
+ uint64_t frag_end = pe_end & ~(frag_align - 1);
+
+ unsigned count;
+
+ /* system pages are non continuously */
+ if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) ||
+ (frag_start >= frag_end)) {
+
+ count = (pe_end - pe_start) / 8;
+ radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
+ RADEON_GPU_PAGE_SIZE, flags);
+ return;
+ }
+
+ /* handle the 4K area at the beginning */
+ if (pe_start != frag_start) {
+ count = (frag_start - pe_start) / 8;
+ radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
+ RADEON_GPU_PAGE_SIZE, flags);
+ addr += RADEON_GPU_PAGE_SIZE * count;
+ }
+
+ /* handle the area in the middle */
+ count = (frag_end - frag_start) / 8;
+ radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count,
+ RADEON_GPU_PAGE_SIZE, flags | frag_flags);
+
+ /* handle the 4K area at the end */
+ if (frag_end != pe_end) {
+ addr += RADEON_GPU_PAGE_SIZE * count;
+ count = (pe_end - frag_end) / 8;
+ radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count,
+ RADEON_GPU_PAGE_SIZE, flags);
+ }
+}
+
/**
* radeon_vm_update_ptes - make sure that page tables are valid
*
if ((last_pte + 8 * count) != pte) {
if (count) {
- radeon_asic_vm_set_page(rdev, ib, last_pte,
- last_dst, count,
- RADEON_GPU_PAGE_SIZE,
- flags);
+ radeon_vm_frag_ptes(rdev, ib, last_pte,
+ last_pte + 8 * count,
+ last_dst, flags);
}
count = nptes;
}
if (count) {
- radeon_asic_vm_set_page(rdev, ib, last_pte,
- last_dst, count,
- RADEON_GPU_PAGE_SIZE, flags);
+ radeon_vm_frag_ptes(rdev, ib, last_pte,
+ last_pte + 8 * count,
+ last_dst, flags);
}
}
WREG32(MC_VM_MX_L1_TLB_CNTL,
(0xA << 7) |
ENABLE_L1_TLB |
+ ENABLE_L1_FRAGMENT_PROCESSING |
SYSTEM_ACCESS_MODE_NOT_IN_SYS |
ENABLE_ADVANCED_DRIVER_MODEL |
SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
/* Setup L2 cache */
WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |
+ ENABLE_L2_FRAGMENT_PROCESSING |
ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE |
ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE |
EFFECTIVE_L2_QUEUE_SIZE(7) |
CONTEXT1_IDENTITY_ACCESS_MODE(1));
WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
- L2_CACHE_BIGK_FRAGMENT_SIZE(0));
+ BANK_SELECT(4) |
+ L2_CACHE_BIGK_FRAGMENT_SIZE(4));
/* setup context0 */
WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12);
WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);