From 40b4a7599d5555b408e594f4c8dae8015ccaae8f Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 12 Feb 2011 19:21:35 +0100 Subject: [PATCH] drm/radeon/kms: optimize CS state checking for r100->r500 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The colorbuffer, zbuffer, and texture states are checked only once when they get changed. This improves performance in the apps which emit lots of draw packets and few state changes. This drops performance in glxgears by a 1% or so, but glxgears is not a benchmark we care about. The time spent in the kernel when running Torcs dropped from 33% to 23% and the frame rate is higher, which is a good thing. r600 might need something like this as well. Signed-off-by: Marek Olšák Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/r100.c | 40 ++++++++++++++++++++++++++--- drivers/gpu/drm/radeon/r100_track.h | 11 +++----- drivers/gpu/drm/radeon/r200.c | 18 +++++++++++++ drivers/gpu/drm/radeon/r300.c | 20 ++++++++++++++- 4 files changed, 77 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c index 5f15820efe1..fdf4bc67ae5 100644 --- a/drivers/gpu/drm/radeon/r100.c +++ b/drivers/gpu/drm/radeon/r100.c @@ -1427,6 +1427,7 @@ static int r100_packet0_check(struct radeon_cs_parser *p, } track->zb.robj = reloc->robj; track->zb.offset = idx_value; + track->zb_dirty = true; ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset); break; case RADEON_RB3D_COLOROFFSET: @@ -1439,6 +1440,7 @@ static int r100_packet0_check(struct radeon_cs_parser *p, } track->cb[0].robj = reloc->robj; track->cb[0].offset = idx_value; + track->cb_dirty = true; ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset); break; case RADEON_PP_TXOFFSET_0: @@ -1454,6 +1456,7 @@ static int r100_packet0_check(struct radeon_cs_parser *p, } ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset); track->textures[i].robj = reloc->robj; + track->tex_dirty = true; break; case RADEON_PP_CUBIC_OFFSET_T0_0: case RADEON_PP_CUBIC_OFFSET_T0_1: @@ -1471,6 +1474,7 @@ static int r100_packet0_check(struct radeon_cs_parser *p, track->textures[0].cube_info[i].offset = idx_value; ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset); track->textures[0].cube_info[i].robj = reloc->robj; + track->tex_dirty = true; break; case RADEON_PP_CUBIC_OFFSET_T1_0: case RADEON_PP_CUBIC_OFFSET_T1_1: @@ -1488,6 +1492,7 @@ static int r100_packet0_check(struct radeon_cs_parser *p, track->textures[1].cube_info[i].offset = idx_value; ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset); track->textures[1].cube_info[i].robj = reloc->robj; + track->tex_dirty = true; break; case RADEON_PP_CUBIC_OFFSET_T2_0: case RADEON_PP_CUBIC_OFFSET_T2_1: @@ -1505,9 +1510,12 @@ static int r100_packet0_check(struct radeon_cs_parser *p, track->textures[2].cube_info[i].offset = idx_value; ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset); track->textures[2].cube_info[i].robj = reloc->robj; + track->tex_dirty = true; break; case RADEON_RE_WIDTH_HEIGHT: track->maxy = ((idx_value >> 16) & 0x7FF); + track->cb_dirty = true; + track->zb_dirty = true; break; case RADEON_RB3D_COLORPITCH: r = r100_cs_packet_next_reloc(p, &reloc); @@ -1528,9 +1536,11 @@ static int r100_packet0_check(struct radeon_cs_parser *p, ib[idx] = tmp; track->cb[0].pitch = idx_value & RADEON_COLORPITCH_MASK; + track->cb_dirty = true; break; case RADEON_RB3D_DEPTHPITCH: track->zb.pitch = idx_value & RADEON_DEPTHPITCH_MASK; + track->zb_dirty = true; break; case RADEON_RB3D_CNTL: switch ((idx_value >> RADEON_RB3D_COLOR_FORMAT_SHIFT) & 0x1f) { @@ -1555,6 +1565,8 @@ static int r100_packet0_check(struct radeon_cs_parser *p, return -EINVAL; } track->z_enabled = !!(idx_value & RADEON_Z_ENABLE); + track->cb_dirty = true; + track->zb_dirty = true; break; case RADEON_RB3D_ZSTENCILCNTL: switch (idx_value & 0xf) { @@ -1572,6 +1584,7 @@ static int r100_packet0_check(struct radeon_cs_parser *p, default: break; } + track->zb_dirty = true; break; case RADEON_RB3D_ZPASS_ADDR: r = r100_cs_packet_next_reloc(p, &reloc); @@ -1588,6 +1601,7 @@ static int r100_packet0_check(struct radeon_cs_parser *p, uint32_t temp = idx_value >> 4; for (i = 0; i < track->num_texture; i++) track->textures[i].enabled = !!(temp & (1 << i)); + track->tex_dirty = true; } break; case RADEON_SE_VF_CNTL: @@ -1602,12 +1616,14 @@ static int r100_packet0_check(struct radeon_cs_parser *p, i = (reg - RADEON_PP_TEX_SIZE_0) / 8; track->textures[i].width = (idx_value & RADEON_TEX_USIZE_MASK) + 1; track->textures[i].height = ((idx_value & RADEON_TEX_VSIZE_MASK) >> RADEON_TEX_VSIZE_SHIFT) + 1; + track->tex_dirty = true; break; case RADEON_PP_TEX_PITCH_0: case RADEON_PP_TEX_PITCH_1: case RADEON_PP_TEX_PITCH_2: i = (reg - RADEON_PP_TEX_PITCH_0) / 8; track->textures[i].pitch = idx_value + 32; + track->tex_dirty = true; break; case RADEON_PP_TXFILTER_0: case RADEON_PP_TXFILTER_1: @@ -1621,6 +1637,7 @@ static int r100_packet0_check(struct radeon_cs_parser *p, tmp = (idx_value >> 27) & 0x7; if (tmp == 2 || tmp == 6) track->textures[i].roundup_h = false; + track->tex_dirty = true; break; case RADEON_PP_TXFORMAT_0: case RADEON_PP_TXFORMAT_1: @@ -1673,6 +1690,7 @@ static int r100_packet0_check(struct radeon_cs_parser *p, } track->textures[i].cube_info[4].width = 1 << ((idx_value >> 16) & 0xf); track->textures[i].cube_info[4].height = 1 << ((idx_value >> 20) & 0xf); + track->tex_dirty = true; break; case RADEON_PP_CUBIC_FACES_0: case RADEON_PP_CUBIC_FACES_1: @@ -1683,6 +1701,7 @@ static int r100_packet0_check(struct radeon_cs_parser *p, track->textures[i].cube_info[face].width = 1 << ((tmp >> (face * 8)) & 0xf); track->textures[i].cube_info[face].height = 1 << ((tmp >> ((face * 8) + 4)) & 0xf); } + track->tex_dirty = true; break; default: printk(KERN_ERR "Forbidden register 0x%04X in cs at %d\n", @@ -3318,9 +3337,9 @@ int r100_cs_track_check(struct radeon_device *rdev, struct r100_cs_track *track) unsigned long size; unsigned prim_walk; unsigned nverts; - unsigned num_cb = track->num_cb; + unsigned num_cb = track->cb_dirty ? track->num_cb : 0; - if (!track->zb_cb_clear && !track->color_channel_mask && + if (num_cb && !track->zb_cb_clear && !track->color_channel_mask && !track->blend_read_enable) num_cb = 0; @@ -3341,7 +3360,9 @@ int r100_cs_track_check(struct radeon_device *rdev, struct r100_cs_track *track) return -EINVAL; } } - if (track->z_enabled) { + track->cb_dirty = false; + + if (track->zb_dirty && track->z_enabled) { if (track->zb.robj == NULL) { DRM_ERROR("[drm] No buffer for z buffer !\n"); return -EINVAL; @@ -3358,6 +3379,8 @@ int r100_cs_track_check(struct radeon_device *rdev, struct r100_cs_track *track) return -EINVAL; } } + track->zb_dirty = false; + prim_walk = (track->vap_vf_cntl >> 4) & 0x3; if (track->vap_vf_cntl & (1 << 14)) { nverts = track->vap_alt_nverts; @@ -3417,13 +3440,22 @@ int r100_cs_track_check(struct radeon_device *rdev, struct r100_cs_track *track) prim_walk); return -EINVAL; } - return r100_cs_track_texture_check(rdev, track); + + if (track->tex_dirty) { + track->tex_dirty = false; + return r100_cs_track_texture_check(rdev, track); + } + return 0; } void r100_cs_track_clear(struct radeon_device *rdev, struct r100_cs_track *track) { unsigned i, face; + track->cb_dirty = true; + track->zb_dirty = true; + track->tex_dirty = true; + if (rdev->family < CHIP_R300) { track->num_cb = 1; if (rdev->family <= CHIP_RS200) diff --git a/drivers/gpu/drm/radeon/r100_track.h b/drivers/gpu/drm/radeon/r100_track.h index af65600e656..ee85c4a1fc0 100644 --- a/drivers/gpu/drm/radeon/r100_track.h +++ b/drivers/gpu/drm/radeon/r100_track.h @@ -52,14 +52,7 @@ struct r100_cs_track_texture { unsigned compress_format; }; -struct r100_cs_track_limits { - unsigned num_cb; - unsigned num_texture; - unsigned max_levels; -}; - struct r100_cs_track { - struct radeon_device *rdev; unsigned num_cb; unsigned num_texture; unsigned maxy; @@ -78,6 +71,10 @@ struct r100_cs_track { bool separate_cube; bool zb_cb_clear; bool blend_read_enable; + + bool cb_dirty; + bool zb_dirty; + bool tex_dirty; }; int r100_cs_track_check(struct radeon_device *rdev, struct r100_cs_track *track); diff --git a/drivers/gpu/drm/radeon/r200.c b/drivers/gpu/drm/radeon/r200.c index d2408c39561..f2405830041 100644 --- a/drivers/gpu/drm/radeon/r200.c +++ b/drivers/gpu/drm/radeon/r200.c @@ -184,6 +184,7 @@ int r200_packet0_check(struct radeon_cs_parser *p, } track->zb.robj = reloc->robj; track->zb.offset = idx_value; + track->zb_dirty = true; ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset); break; case RADEON_RB3D_COLOROFFSET: @@ -196,6 +197,7 @@ int r200_packet0_check(struct radeon_cs_parser *p, } track->cb[0].robj = reloc->robj; track->cb[0].offset = idx_value; + track->cb_dirty = true; ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset); break; case R200_PP_TXOFFSET_0: @@ -214,6 +216,7 @@ int r200_packet0_check(struct radeon_cs_parser *p, } ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset); track->textures[i].robj = reloc->robj; + track->tex_dirty = true; break; case R200_PP_CUBIC_OFFSET_F1_0: case R200_PP_CUBIC_OFFSET_F2_0: @@ -257,9 +260,12 @@ int r200_packet0_check(struct radeon_cs_parser *p, track->textures[i].cube_info[face - 1].offset = idx_value; ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset); track->textures[i].cube_info[face - 1].robj = reloc->robj; + track->tex_dirty = true; break; case RADEON_RE_WIDTH_HEIGHT: track->maxy = ((idx_value >> 16) & 0x7FF); + track->cb_dirty = true; + track->zb_dirty = true; break; case RADEON_RB3D_COLORPITCH: r = r100_cs_packet_next_reloc(p, &reloc); @@ -280,9 +286,11 @@ int r200_packet0_check(struct radeon_cs_parser *p, ib[idx] = tmp; track->cb[0].pitch = idx_value & RADEON_COLORPITCH_MASK; + track->cb_dirty = true; break; case RADEON_RB3D_DEPTHPITCH: track->zb.pitch = idx_value & RADEON_DEPTHPITCH_MASK; + track->zb_dirty = true; break; case RADEON_RB3D_CNTL: switch ((idx_value >> RADEON_RB3D_COLOR_FORMAT_SHIFT) & 0x1f) { @@ -312,6 +320,8 @@ int r200_packet0_check(struct radeon_cs_parser *p, } track->z_enabled = !!(idx_value & RADEON_Z_ENABLE); + track->cb_dirty = true; + track->zb_dirty = true; break; case RADEON_RB3D_ZSTENCILCNTL: switch (idx_value & 0xf) { @@ -329,6 +339,7 @@ int r200_packet0_check(struct radeon_cs_parser *p, default: break; } + track->zb_dirty = true; break; case RADEON_RB3D_ZPASS_ADDR: r = r100_cs_packet_next_reloc(p, &reloc); @@ -345,6 +356,7 @@ int r200_packet0_check(struct radeon_cs_parser *p, uint32_t temp = idx_value >> 4; for (i = 0; i < track->num_texture; i++) track->textures[i].enabled = !!(temp & (1 << i)); + track->tex_dirty = true; } break; case RADEON_SE_VF_CNTL: @@ -369,6 +381,7 @@ int r200_packet0_check(struct radeon_cs_parser *p, i = (reg - R200_PP_TXSIZE_0) / 32; track->textures[i].width = (idx_value & RADEON_TEX_USIZE_MASK) + 1; track->textures[i].height = ((idx_value & RADEON_TEX_VSIZE_MASK) >> RADEON_TEX_VSIZE_SHIFT) + 1; + track->tex_dirty = true; break; case R200_PP_TXPITCH_0: case R200_PP_TXPITCH_1: @@ -378,6 +391,7 @@ int r200_packet0_check(struct radeon_cs_parser *p, case R200_PP_TXPITCH_5: i = (reg - R200_PP_TXPITCH_0) / 32; track->textures[i].pitch = idx_value + 32; + track->tex_dirty = true; break; case R200_PP_TXFILTER_0: case R200_PP_TXFILTER_1: @@ -394,6 +408,7 @@ int r200_packet0_check(struct radeon_cs_parser *p, tmp = (idx_value >> 27) & 0x7; if (tmp == 2 || tmp == 6) track->textures[i].roundup_h = false; + track->tex_dirty = true; break; case R200_PP_TXMULTI_CTL_0: case R200_PP_TXMULTI_CTL_1: @@ -432,6 +447,7 @@ int r200_packet0_check(struct radeon_cs_parser *p, track->textures[i].tex_coord_type = 1; break; } + track->tex_dirty = true; break; case R200_PP_TXFORMAT_0: case R200_PP_TXFORMAT_1: @@ -488,6 +504,7 @@ int r200_packet0_check(struct radeon_cs_parser *p, } track->textures[i].cube_info[4].width = 1 << ((idx_value >> 16) & 0xf); track->textures[i].cube_info[4].height = 1 << ((idx_value >> 20) & 0xf); + track->tex_dirty = true; break; case R200_PP_CUBIC_FACES_0: case R200_PP_CUBIC_FACES_1: @@ -501,6 +518,7 @@ int r200_packet0_check(struct radeon_cs_parser *p, track->textures[i].cube_info[face].width = 1 << ((tmp >> (face * 8)) & 0xf); track->textures[i].cube_info[face].height = 1 << ((tmp >> ((face * 8) + 4)) & 0xf); } + track->tex_dirty = true; break; default: printk(KERN_ERR "Forbidden register 0x%04X in cs at %d\n", diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c index 55fe5ba7def..15f94648f27 100644 --- a/drivers/gpu/drm/radeon/r300.c +++ b/drivers/gpu/drm/radeon/r300.c @@ -667,6 +667,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, } track->cb[i].robj = reloc->robj; track->cb[i].offset = idx_value; + track->cb_dirty = true; ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset); break; case R300_ZB_DEPTHOFFSET: @@ -679,6 +680,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, } track->zb.robj = reloc->robj; track->zb.offset = idx_value; + track->zb_dirty = true; ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset); break; case R300_TX_OFFSET_0: @@ -717,6 +719,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, tmp |= tile_flags; ib[idx] = tmp; track->textures[i].robj = reloc->robj; + track->tex_dirty = true; break; /* Tracked registers */ case 0x2084: @@ -743,6 +746,8 @@ static int r300_packet0_check(struct radeon_cs_parser *p, if (p->rdev->family < CHIP_RV515) { track->maxy -= 1440; } + track->cb_dirty = true; + track->zb_dirty = true; break; case 0x4E00: /* RB3D_CCTL */ @@ -752,6 +757,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, return -EINVAL; } track->num_cb = ((idx_value >> 5) & 0x3) + 1; + track->cb_dirty = true; break; case 0x4E38: case 0x4E3C: @@ -814,6 +820,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, ((idx_value >> 21) & 0xF)); return -EINVAL; } + track->cb_dirty = true; break; case 0x4F00: /* ZB_CNTL */ @@ -822,6 +829,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, } else { track->z_enabled = false; } + track->zb_dirty = true; break; case 0x4F10: /* ZB_FORMAT */ @@ -838,6 +846,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, (idx_value & 0xF)); return -EINVAL; } + track->zb_dirty = true; break; case 0x4F24: /* ZB_DEPTHPITCH */ @@ -861,6 +870,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, ib[idx] = tmp; track->zb.pitch = idx_value & 0x3FFC; + track->zb_dirty = true; break; case 0x4104: for (i = 0; i < 16; i++) { @@ -869,6 +879,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, enabled = !!(idx_value & (1 << i)); track->textures[i].enabled = enabled; } + track->tex_dirty = true; break; case 0x44C0: case 0x44C4: @@ -951,8 +962,8 @@ static int r300_packet0_check(struct radeon_cs_parser *p, DRM_ERROR("Invalid texture format %u\n", (idx_value & 0x1F)); return -EINVAL; - break; } + track->tex_dirty = true; break; case 0x4400: case 0x4404: @@ -980,6 +991,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, if (tmp == 2 || tmp == 4 || tmp == 6) { track->textures[i].roundup_h = false; } + track->tex_dirty = true; break; case 0x4500: case 0x4504: @@ -1017,6 +1029,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, DRM_ERROR("Forbidden bit TXFORMAT_MSB\n"); return -EINVAL; } + track->tex_dirty = true; break; case 0x4480: case 0x4484: @@ -1046,6 +1059,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, track->textures[i].use_pitch = !!tmp; tmp = (idx_value >> 22) & 0xF; track->textures[i].txdepth = tmp; + track->tex_dirty = true; break; case R300_ZB_ZPASS_ADDR: r = r100_cs_packet_next_reloc(p, &reloc); @@ -1060,6 +1074,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, case 0x4e0c: /* RB3D_COLOR_CHANNEL_MASK */ track->color_channel_mask = idx_value; + track->cb_dirty = true; break; case 0x43a4: /* SC_HYPERZ_EN */ @@ -1073,6 +1088,8 @@ static int r300_packet0_check(struct radeon_cs_parser *p, case 0x4f1c: /* ZB_BW_CNTL */ track->zb_cb_clear = !!(idx_value & (1 << 5)); + track->cb_dirty = true; + track->zb_dirty = true; if (p->rdev->hyperz_filp != p->filp) { if (idx_value & (R300_HIZ_ENABLE | R300_RD_COMP_ENABLE | @@ -1084,6 +1101,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, case 0x4e04: /* RB3D_BLENDCNTL */ track->blend_read_enable = !!(idx_value & (1 << 2)); + track->cb_dirty = true; break; case 0x4f28: /* ZB_DEPTHCLEARVALUE */ break; -- 2.20.1