drm/vc4: Add T-format scanout support.
author Eric Anholt <eric@anholt.net>
Thu, 8 Jun 2017 00:13:35 +0000 (17:13 -0700)
committer Eric Anholt <eric@anholt.net>
Thu, 15 Jun 2017 23:02:45 +0000 (16:02 -0700)
The T tiling format is what V3D uses for textures, with no raster
support at all until later revisions of the hardware (and always at a
large 3D performance penalty).  If we can't scan out V3D's format,
then we often need to do a relayout at some stage of the pipeline:
either right before texturing from the scanout buffer (common in X11
without a compositor), or from a tiled screen buffer right before
scanout (an option I've considered in trying to resolve this
inconsistency, but which means needing to use the dirty fb ioctl and
having some update policy).

T-format scanout lets us avoid either of those shadow copies, for a
massive, obvious performance improvement to X11 window dragging
without a compositor.  Unfortunately, enabling a compositor to work
around the discrepancy has turned out to be too costly in memory
consumption for the Raspbian distribution.

Because the HVS operates a scanline at a time, compositing from T does
increase the memory bandwidth cost of scanout.  On my 1920x1080@32bpp
display on a RPi3, we go from about 15% of system memory bandwidth
with linear to about 20% with tiled.  However, for X11 this still ends
up being a huge performance win in active usage.

This patch doesn't yet handle src_x/src_y offsetting within the tiled
buffer.  However, we fail to do so for untiled buffers already.

Signed-off-by: Eric Anholt <eric@anholt.net>
Link: http://patchwork.freedesktop.org/patch/msgid/20170608001336.12842-1-eric@anholt.net
Reviewed-by: Boris Brezillon <boris.brezillon@free-electrons.com>
drivers/gpu/drm/vc4/vc4_plane.c
drivers/gpu/drm/vc4/vc4_regs.h
include/uapi/drm/drm_fourcc.h

index da18dec2169627c7849017db83d00db28fc419da..fa6809d8b0fe69ed6f7372e48e704449d04b1878 100644 (file)
@@ -500,8 +500,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
        u32 ctl0_offset = vc4_state->dlist_count;
        const struct hvs_format *format = vc4_get_hvs_format(fb->format->format);
        int num_planes = drm_format_num_planes(format->drm);
-       u32 scl0, scl1;
-       u32 lbm_size;
+       u32 scl0, scl1, pitch0;
+       u32 lbm_size, tiling;
        unsigned long irqflags;
        int ret, i;
 
@@ -542,11 +542,31 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
                scl1 = vc4_get_scl_field(state, 0);
        }
 
+       switch (fb->modifier) {
+       case DRM_FORMAT_MOD_LINEAR:
+               tiling = SCALER_CTL0_TILING_LINEAR;
+               pitch0 = VC4_SET_FIELD(fb->pitches[0], SCALER_SRC_PITCH);
+               break;
+       case DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED:
+               tiling = SCALER_CTL0_TILING_256B_OR_T;
+               /* WIDTH_R: source width in 32-pixel units, rounded up. */
+               pitch0 = (VC4_SET_FIELD(0, SCALER_PITCH0_TILE_Y_OFFSET) |
+                         VC4_SET_FIELD(0, SCALER_PITCH0_TILE_WIDTH_L) |
+                         VC4_SET_FIELD((vc4_state->src_w[0] + 31) >> 5,
+                                       SCALER_PITCH0_TILE_WIDTH_R));
+               break;
+       default:
+               DRM_DEBUG_KMS("Unsupported FB tiling flag 0x%016llx\n",
+                             (unsigned long long)fb->modifier);
+               return -EINVAL;
+       }
+
        /* Control word */
        vc4_dlist_write(vc4_state,
                        SCALER_CTL0_VALID |
                        (format->pixel_order << SCALER_CTL0_ORDER_SHIFT) |
                        (format->hvs << SCALER_CTL0_PIXEL_FORMAT_SHIFT) |
+                       VC4_SET_FIELD(tiling, SCALER_CTL0_TILING) |
                        (vc4_state->is_unity ? SCALER_CTL0_UNITY : 0) |
                        VC4_SET_FIELD(scl0, SCALER_CTL0_SCL0) |
                        VC4_SET_FIELD(scl1, SCALER_CTL0_SCL1));
@@ -600,8 +620,11 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
        for (i = 0; i < num_planes; i++)
                vc4_dlist_write(vc4_state, 0xc0c0c0c0);
 
-       /* Pitch word 0/1/2 */
-       for (i = 0; i < num_planes; i++) {
+       /* Pitch word 0 */
+       vc4_dlist_write(vc4_state, pitch0);
+
+       /* Pitch word 1/2 */
+       for (i = 1; i < num_planes; i++) {
                vc4_dlist_write(vc4_state,
                                VC4_SET_FIELD(fb->pitches[i], SCALER_SRC_PITCH));
        }
index 932093936178674173a84002b33e07e9a37fdfe9..d382c34c1b9e0c6d5b949ca09bf1e8401d44f91a 100644 (file)
@@ -709,6 +709,13 @@ enum hvs_pixel_format {
 #define SCALER_CTL0_SIZE_MASK                  VC4_MASK(29, 24)
 #define SCALER_CTL0_SIZE_SHIFT                 24
 
+#define SCALER_CTL0_TILING_MASK                        VC4_MASK(21, 20)
+#define SCALER_CTL0_TILING_SHIFT               20
+#define SCALER_CTL0_TILING_LINEAR              0
+#define SCALER_CTL0_TILING_64B                 1
+#define SCALER_CTL0_TILING_128B                        2
+#define SCALER_CTL0_TILING_256B_OR_T           3
+
 #define SCALER_CTL0_HFLIP                       BIT(16)
 #define SCALER_CTL0_VFLIP                       BIT(15)
 
@@ -838,7 +845,19 @@ enum hvs_pixel_format {
 #define SCALER_PPF_KERNEL_OFFSET_SHIFT         0
 #define SCALER_PPF_KERNEL_UNCACHED             BIT(31)
 
+/* PITCH0/1/2 fields for raster. */
 #define SCALER_SRC_PITCH_MASK                  VC4_MASK(15, 0)
 #define SCALER_SRC_PITCH_SHIFT                 0
 
+/* PITCH0 fields for T-tiled. */
+#define SCALER_PITCH0_TILE_WIDTH_L_MASK                VC4_MASK(22, 16)
+#define SCALER_PITCH0_TILE_WIDTH_L_SHIFT       16
+#define SCALER_PITCH0_TILE_LINE_DIR            BIT(15)
+#define SCALER_PITCH0_TILE_INITIAL_LINE_DIR    BIT(14)
+/* Y offset within a tile. */
+#define SCALER_PITCH0_TILE_Y_OFFSET_MASK       VC4_MASK(13, 7)
+#define SCALER_PITCH0_TILE_Y_OFFSET_SHIFT      7
+#define SCALER_PITCH0_TILE_WIDTH_R_MASK                VC4_MASK(6, 0)
+#define SCALER_PITCH0_TILE_WIDTH_R_SHIFT       0
+
 #endif /* VC4_REGS_H */
index 55e301047b3e94a4d0eee18c6bb7732cc162b695..7586c46f68bf22f48a1536c0f27eed0d8d06eb92 100644 (file)
@@ -182,6 +182,7 @@ extern "C" {
 #define DRM_FORMAT_MOD_VENDOR_SAMSUNG 0x04
 #define DRM_FORMAT_MOD_VENDOR_QCOM    0x05
 #define DRM_FORMAT_MOD_VENDOR_VIVANTE 0x06
+#define DRM_FORMAT_MOD_VENDOR_BROADCOM 0x07
 /* add more to the end as needed */
 
 #define fourcc_mod_code(vendor, val) \
@@ -306,7 +307,6 @@ extern "C" {
  */
 #define DRM_FORMAT_MOD_VIVANTE_SPLIT_SUPER_TILED fourcc_mod_code(VIVANTE, 4)
 
-
 /* NVIDIA Tegra frame buffer modifiers */
 
 /*
@@ -351,6 +351,27 @@ extern "C" {
  */
 #define NV_FORMAT_MOD_TEGRA_16BX2_BLOCK(v) fourcc_mod_tegra_code(2, v)
 
+/*
+ * Broadcom VC4 "T" format
+ *
+ * This is the primary layout that the V3D GPU can texture from (it
+ * can't do linear).  The T format has:
+ *
+ * - 64-byte utiles of pixels in a raster-order grid, sized according to cpp
+ *   (4x4 pixels at 32 bits per pixel).
+ *
+ * - 1KB subtiles made of a 4x4 raster-order grid of 64-byte utiles (so
+ *   usually 16x16 pixels).
+ *
+ * - 4KB tiles made of a 2x2 grid of 1KB subtiles (so usually 32x32 pixels).
+ *   On even 4KB tile rows, the subtiles are ordered (BL, TL, TR, BR), and on
+ *   odd rows (TR, BR, BL, TL), where bottom left is the start of memory.
+ *
+ * - an image made of 4KB tiles in rows running either left-to-right (even
+ *   rows of 4KB tiles) or right-to-left (odd rows of 4KB tiles).
+ */
+#define DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED fourcc_mod_code(BROADCOM, 1)
+
 #if defined(__cplusplus)
 }
 #endif