diff -u mesa-23.2.1/debian/changelog mesa-23.2.1/debian/changelog --- mesa-23.2.1/debian/changelog +++ mesa-23.2.1/debian/changelog @@ -1,3 +1,10 @@ +mesa (23.2.1-1ubuntu2) mantic; urgency=medium + + * d/p/v3d-v3dv-support-for-HW-7.1.x.patch + - [FFe] Raspberry Pi 5 (LP: #2037642) + + -- Juerg Haefliger Fri, 29 Sep 2023 08:27:32 +0200 + mesa (23.2.1-1ubuntu1) mantic; urgency=medium * Merge from Debian. diff -u mesa-23.2.1/debian/patches/series mesa-23.2.1/debian/patches/series --- mesa-23.2.1/debian/patches/series +++ mesa-23.2.1/debian/patches/series @@ -2,3 +2,4 @@ path_max.diff src_glx_dri_common.h.diff fix-clover-build-without-spirv.diff +v3d-v3dv-support-for-HW-7.1.x.patch only in patch2: unchanged: --- mesa-23.2.1.orig/debian/patches/v3d-v3dv-support-for-HW-7.1.x.patch +++ mesa-23.2.1/debian/patches/v3d-v3dv-support-for-HW-7.1.x.patch @@ -0,0 +1,11461 @@ +From 2b2fb2d7889c5cc6a624a867ef0123c7ba8cb2a1 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Mon, 26 Apr 2021 00:02:21 +0200 +Subject: [PATCH] v3d, v3dv: support for HW 7.1.x + +Squash of MR: +https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25450 + +Signed-off-by: Juerg Haefliger +--- + include/drm-uapi/v3d_drm.h | 5 + + src/broadcom/cle/meson.build | 3 +- + src/broadcom/cle/v3d_packet_v33.xml | 386 ++++- + src/broadcom/cle/v3dx_pack.h | 2 + + src/broadcom/clif/clif_private.h | 2 + + src/broadcom/common/v3d_device_info.c | 17 +- + src/broadcom/common/v3d_device_info.h | 6 + + src/broadcom/common/v3d_limits.h | 3 +- + src/broadcom/common/v3d_macros.h | 3 + + .../common/v3d_performance_counters.h | 108 ++ + src/broadcom/common/v3d_tfu.h | 23 + + src/broadcom/common/v3d_util.c | 128 +- + src/broadcom/common/v3d_util.h | 38 +- + src/broadcom/compiler/nir_to_vir.c | 63 +- + src/broadcom/compiler/qpu_schedule.c | 813 +++++++-- + src/broadcom/compiler/qpu_validate.c | 98 +- + src/broadcom/compiler/v3d_compiler.h | 9 +- + src/broadcom/compiler/v3d_nir_lower_io.c | 10 +- + src/broadcom/compiler/vir.c | 32 +- + src/broadcom/compiler/vir_dump.c | 8 +- + src/broadcom/compiler/vir_live_variables.c | 21 +- + .../compiler/vir_opt_copy_propagate.c | 95 +- + .../compiler/vir_opt_redundant_flags.c | 8 +- + .../compiler/vir_opt_small_immediates.c | 26 +- + src/broadcom/compiler/vir_register_allocate.c | 480 ++++-- + src/broadcom/compiler/vir_to_qpu.c | 155 +- + src/broadcom/meson.build | 2 +- + src/broadcom/qpu/qpu_disasm.c | 81 +- + src/broadcom/qpu/qpu_instr.c | 121 +- + src/broadcom/qpu/qpu_instr.h | 87 +- + src/broadcom/qpu/qpu_pack.c | 1453 ++++++++++++++--- + src/broadcom/qpu/tests/qpu_disasm.c | 8 +- + src/broadcom/simulator/v3d_simulator.c | 52 +- + src/broadcom/simulator/v3d_simulator.h | 26 + + src/broadcom/simulator/v3dx_simulator.c | 94 +- + src/broadcom/simulator/v3dx_simulator.h | 1 + + src/broadcom/vulkan/meson.build | 7 +- + src/broadcom/vulkan/v3dv_cmd_buffer.c | 107 +- + src/broadcom/vulkan/v3dv_device.c | 37 +- + src/broadcom/vulkan/v3dv_image.c | 7 +- + src/broadcom/vulkan/v3dv_limits.h | 2 - + src/broadcom/vulkan/v3dv_meta_clear.c | 9 +- + src/broadcom/vulkan/v3dv_meta_copy.c | 19 +- + src/broadcom/vulkan/v3dv_pass.c | 19 +- + src/broadcom/vulkan/v3dv_pipeline.c | 89 +- + src/broadcom/vulkan/v3dv_private.h | 71 +- + src/broadcom/vulkan/v3dv_query.c | 43 +- + src/broadcom/vulkan/v3dv_queue.c | 2 +- + src/broadcom/vulkan/v3dv_uniforms.c | 13 +- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 471 +++++- + src/broadcom/vulkan/v3dvx_device.c | 53 +- + src/broadcom/vulkan/v3dvx_image.c | 66 +- + 
src/broadcom/vulkan/v3dvx_meta_common.c | 108 +- + src/broadcom/vulkan/v3dvx_pipeline.c | 137 +- + src/broadcom/vulkan/v3dvx_private.h | 49 +- + src/broadcom/vulkan/v3dvx_query.c | 67 + + src/broadcom/vulkan/v3dvx_queue.c | 18 +- + src/gallium/drivers/v3d/meson.build | 5 +- + src/gallium/drivers/v3d/v3d_blit.c | 166 +- + src/gallium/drivers/v3d/v3d_context.c | 9 +- + src/gallium/drivers/v3d/v3d_context.h | 54 +- + src/gallium/drivers/v3d/v3d_job.c | 6 +- + src/gallium/drivers/v3d/v3d_query.c | 20 +- + src/gallium/drivers/v3d/v3d_query.h | 6 - + src/gallium/drivers/v3d/v3d_screen.c | 3 +- + src/gallium/drivers/v3d/v3d_uniforms.c | 14 +- + src/gallium/drivers/v3d/v3dx_context.h | 20 + + src/gallium/drivers/v3d/v3dx_draw.c | 72 +- + src/gallium/drivers/v3d/v3dx_emit.c | 48 +- + ...d_query_perfcnt.c => v3dx_query_perfcnt.c} | 12 +- + src/gallium/drivers/v3d/v3dx_rcl.c | 190 ++- + src/gallium/drivers/v3d/v3dx_state.c | 129 +- + src/gallium/drivers/v3d/v3dx_tfu.c | 202 +++ + 73 files changed, 5451 insertions(+), 1366 deletions(-) + create mode 100644 src/broadcom/vulkan/v3dvx_query.c + rename src/gallium/drivers/v3d/{v3d_query_perfcnt.c => v3dx_query_perfcnt.c} (94%) + create mode 100644 src/gallium/drivers/v3d/v3dx_tfu.c + +diff --git a/include/drm-uapi/v3d_drm.h b/include/drm-uapi/v3d_drm.h +index 3dfc0af8756a..1a7d7a689de3 100644 +--- a/include/drm-uapi/v3d_drm.h ++++ b/include/drm-uapi/v3d_drm.h +@@ -319,6 +319,11 @@ struct drm_v3d_submit_tfu { + + /* Pointer to an array of ioctl extensions*/ + __u64 extensions; ++ ++ struct { ++ __u32 ioc; ++ __u32 pad; ++ } v71; + }; + + /* Submits a compute shader for dispatch. This job will block on any +diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build +index 31a0d5bfa94a..8ac32b313e4d 100644 +--- a/src/broadcom/cle/meson.build ++++ b/src/broadcom/cle/meson.build +@@ -23,7 +23,8 @@ v3d_versions = [ + [21, 21], + [33, 33], + [41, 33], +- [42, 33] ++ [42, 33], ++ [71, 33] + ] + + v3d_xml_files = [] +diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml +index a0242b5f1c2f..624353ca2bf2 100644 +--- a/src/broadcom/cle/v3d_packet_v33.xml ++++ b/src/broadcom/cle/v3d_packet_v33.xml +@@ -1,4 +1,4 @@ +- ++ + + + +@@ -167,13 +167,36 @@ + + + +- ++ + + + + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + ++ + + + +@@ -1099,7 +1263,7 @@ + + + +- ++ + + + +@@ -1108,6 +1272,15 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1117,7 +1290,7 @@ + + + +- ++ + + + +@@ -1126,6 +1299,19 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1135,7 +1321,7 @@ + + + +- ++ + + + +@@ -1144,6 +1330,13 @@ + + + ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1155,7 +1348,7 @@ + + + +- ++ + + + +@@ -1166,6 +1359,13 @@ + + + ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1240,7 +1440,7 @@ + + + +- ++ + + + +@@ -1299,6 +1499,63 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1543,7 +1800,7 @@ + + + +- ++ + + + +@@ -1558,6 +1815,23 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1611,7 +1885,7 @@ + + + +- ++ + + + +@@ -1652,6 +1926,82 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h +index 5762e5aaa708..e5a1eb266983 100644 +--- a/src/broadcom/cle/v3dx_pack.h ++++ 
b/src/broadcom/cle/v3dx_pack.h +@@ -37,6 +37,8 @@ + # include "cle/v3d_packet_v41_pack.h" + #elif (V3D_VERSION == 42) + # include "cle/v3d_packet_v42_pack.h" ++#elif (V3D_VERSION == 71) ++# include "cle/v3d_packet_v71_pack.h" + #else + # error "Need to add a pack header include for this v3d version" + #endif +diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h +index 6ace62b03101..cda407a00bf4 100644 +--- a/src/broadcom/clif/clif_private.h ++++ b/src/broadcom/clif/clif_private.h +@@ -101,6 +101,8 @@ bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset, + const uint8_t *cl, uint32_t *size, bool reloc_mode); + bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset, + const uint8_t *cl, uint32_t *size, bool reloc_mode); ++bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset, ++ const uint8_t *cl, uint32_t *size, bool reloc_mode); + + static inline void + out(struct clif_dump *clif, const char *fmt, ...) +diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c +index 272190eb2e54..7bc2b662cfc7 100644 +--- a/src/broadcom/common/v3d_device_info.c ++++ b/src/broadcom/common/v3d_device_info.c +@@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i + struct drm_v3d_get_param ident1 = { + .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1, + }; ++ struct drm_v3d_get_param hub_ident3 = { ++ .param = DRM_V3D_PARAM_V3D_HUB_IDENT3, ++ }; + int ret; + + ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0); +@@ -62,10 +65,13 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i + int qups = (ident1.value >> 8) & 0xf; + devinfo->qpu_count = nslc * qups; + ++ devinfo->has_accumulators = devinfo->ver < 71; ++ + switch (devinfo->ver) { + case 33: + case 41: + case 42: ++ case 71: + break; + default: + fprintf(stderr, +@@ -75,5 +81,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i + return false; + } + +- return true; ++ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3); ++ if (ret != 0) { ++ fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n", ++ strerror(errno)); ++ return false; ++ } ++ ++ devinfo->rev = (hub_ident3.value >> 8) & 0xff; ++ ++ return true; + } +diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h +index 97abd9b8d9fc..8dfc7858727e 100644 +--- a/src/broadcom/common/v3d_device_info.h ++++ b/src/broadcom/common/v3d_device_info.h +@@ -34,11 +34,17 @@ struct v3d_device_info { + /** Simple V3D version: major * 10 + minor */ + uint8_t ver; + ++ /** V3D revision number */ ++ uint8_t rev; ++ + /** Size of the VPM, in bytes. */ + int vpm_size; + + /* NSLC * QUPS from the core's IDENT registers. */ + int qpu_count; ++ ++ /* If the hw has accumulator registers */ ++ bool has_accumulators; + }; + + typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg); +diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h +index 46f38bd74846..354c8784914c 100644 +--- a/src/broadcom/common/v3d_limits.h ++++ b/src/broadcom/common/v3d_limits.h +@@ -42,7 +42,8 @@ + + #define V3D_MAX_SAMPLES 4 + +-#define V3D_MAX_DRAW_BUFFERS 4 ++#define V3D_MAX_DRAW_BUFFERS 8 ++#define V3D_MAX_RENDER_TARGETS(ver) (ver < 71 ? 
4 : 8) + + #define V3D_MAX_POINT_SIZE 512.0f + #define V3D_MAX_LINE_WIDTH 32 +diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h +index fe89398208ab..b4291fb53500 100644 +--- a/src/broadcom/common/v3d_macros.h ++++ b/src/broadcom/common/v3d_macros.h +@@ -41,6 +41,9 @@ + #elif (V3D_VERSION == 42) + # define V3DX(x) V3D42_##x + # define v3dX(x) v3d42_##x ++#elif (V3D_VERSION == 71) ++# define V3DX(x) V3D71_##x ++# define v3dX(x) v3d71_##x + #else + # error "Need to add prefixing macros for this v3d version" + #endif +diff --git a/src/broadcom/common/v3d_performance_counters.h b/src/broadcom/common/v3d_performance_counters.h +index 08d750c2cbe7..a8f0cff8784a 100644 +--- a/src/broadcom/common/v3d_performance_counters.h ++++ b/src/broadcom/common/v3d_performance_counters.h +@@ -28,6 +28,110 @@ + #define V3D_PERFCNT_NAME 1 + #define V3D_PERFCNT_DESCRIPTION 2 + ++#ifndef V3D_VERSION ++# error "The V3D_VERSION macro must be defined" ++#endif ++ ++#if (V3D_VERSION >= 71) ++ ++static const char *v3d_performance_counters[][3] = { ++ {"CORE", "cycle-count", "[CORE] Cycle counter"}, ++ {"CORE", "core-active", "[CORE] Bin/Render/Compute active cycles"}, ++ {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"}, ++ {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"}, ++ {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, ++ {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, ++ {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"}, ++ {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"}, ++ {"FEP", "FEP-valid-quads", "[FEP] Valid quads"}, ++ {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"}, ++ {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"}, ++ {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"}, ++ {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"}, ++ {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"}, ++ {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"}, ++ {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"}, ++ {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"}, ++ {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"}, ++ {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"}, ++ {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"}, ++ {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"}, ++ {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"}, ++ {"TMU", "TMU-active-cycles", "[TMU] Active cycles"}, ++ {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"}, ++ {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"}, ++ {"TMU", "TMU-cache-x4-active-cycles", "[TMU] Cache active cycles for x4 access"}, ++ {"TMU", "TMU-cache-x4-stalled-cycles", "[TMU] Cache stalled cycles for x4 access"}, ++ {"TMU", 
"TMU-total-text-quads-x4-access", "[TMU] Total texture cache x4 access"}, ++ {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"}, ++ {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"}, ++ {"L2T", "L2T-local", "[L2T] Local mode access"}, ++ {"L2T", "L2T-writeback", "[L2T] Writeback"}, ++ {"L2T", "L2T-zero", "[L2T] Zero"}, ++ {"L2T", "L2T-merge", "[L2T] Merge"}, ++ {"L2T", "L2T-fill", "[L2T] Fill"}, ++ {"L2T", "L2T-stalls-no-wid", "[L2T] Stalls because no WID available"}, ++ {"L2T", "L2T-stalls-no-rid", "[L2T] Stalls because no RID available"}, ++ {"L2T", "L2T-stalls-queue-full", "[L2T] Stalls because internal queue full"}, ++ {"L2T", "L2T-stalls-wrightback", "[L2T] Stalls because writeback in flight"}, ++ {"L2T", "L2T-stalls-mem", "[L2T] Stalls because AXI blocks read"}, ++ {"L2T", "L2T-stalls-fill", "[L2T] Stalls because fill pending for victim cache-line"}, ++ {"L2T", "L2T-hitq", "[L2T] Sent request via hit queue"}, ++ {"L2T", "L2T-hitq-full", "[L2T] Sent request via main queue because hit queue is full"}, ++ {"L2T", "L2T-stalls-read-data", "[L2T] Stalls because waiting for data from SDRAM"}, ++ {"L2T", "L2T-TMU-read-hits", "[L2T] TMU read hits"}, ++ {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"}, ++ {"L2T", "L2T-VCD-read-hits", "[L2T] VCD read hits"}, ++ {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"}, ++ {"L2T", "L2T-SLC-read-hits", "[L2T] SLC read hits (all slices)"}, ++ {"L2T", "L2T-SLC-read-miss", "[L2T] SLC read misses (all slices)"}, ++ {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"}, ++ {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"}, ++ {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"}, ++ {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"}, ++ {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"}, ++ {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"}, ++ {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"}, ++ {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"}, ++ {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"}, ++ {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"}, ++ {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"}, ++ {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"}, ++ {"CORE", "core-memory-writes", "[CORE] Total memory writes"}, ++ {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"}, ++ {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"}, ++ {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"}, ++ {"CORE", "core-memory-reads", "[CORE] Total memory reads"}, ++ {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"}, ++ {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"}, ++ {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"}, ++ {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"}, ++ {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"}, ++ {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"}, ++ {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"}, ++ {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"}, ++ {"AXI", "AXI-read-trans", "[AXI] Read transaction count"}, ++ {"AXI", "AXI-write-trans", "[AXI] Write transaction count"}, ++ {"AXI", "AXI-read-wait-cycles", "[AXI] Read total wait cycles"}, ++ {"AXI", "AXI-write-wait-cycles", 
"[AXI] Write total wait cycles"}, ++ {"AXI", "AXI-max-outstanding-reads", "[AXI] Maximium outstanding read transactions"}, ++ {"AXI", "AXI-max-outstanding-writes", "[AXI] Maximum outstanding write transactions"}, ++ {"QPU", "QPU-wait-bubble", "[QPU] Pipeline bubble in qcycles due all threads waiting"}, ++ {"QPU", "QPU-ic-miss-bubble", "[QPU] Pipeline bubble in qcycles due instruction-cache miss"}, ++ {"QPU", "QPU-active", "[QPU] Executed shader instruction"}, ++ {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"}, ++ {"QPU", "QPU-stalls", "[QPU] Stalled qcycles executing shader instruction"}, ++ {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"}, ++ {"QPU", "QPU-stalls-TMU", "[QPU] Stalled qcycles waiting for TMU"}, ++ {"QPU", "QPU-stalls-TLB", "[QPU] Stalled qcycles waiting for TLB"}, ++ {"QPU", "QPU-stalls-VPM", "[QPU] Stalled qcycles waiting for VPM"}, ++ {"QPU", "QPU-stalls-uniforms", "[QPU] Stalled qcycles waiting for uniforms"}, ++ {"QPU", "QPU-stalls-SFU", "[QPU] Stalled qcycles waiting for SFU"}, ++ {"QPU", "QPU-stalls-other", "[QPU] Stalled qcycles waiting for any other reason (vary/W/Z)"}, ++}; ++ ++#elif (V3D_VERSION >= 41) ++ + static const char *v3d_performance_counters[][3] = { + {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, + {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"}, +@@ -118,4 +222,8 @@ static const char *v3d_performance_counters[][3] = { + {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, + }; + ++#else ++static const char *v3d_performance_counters[][3] = { }; ++#endif ++ + #endif +diff --git a/src/broadcom/common/v3d_tfu.h b/src/broadcom/common/v3d_tfu.h +index 80da224ca2d9..572d00747940 100644 +--- a/src/broadcom/common/v3d_tfu.h ++++ b/src/broadcom/common/v3d_tfu.h +@@ -48,4 +48,27 @@ + #define V3D33_TFU_ICFG_FORMAT_UIF_NO_XOR 14 + #define V3D33_TFU_ICFG_FORMAT_UIF_XOR 15 + ++/* Disable level 0 write, just write following mipmaps */ ++#define V3D71_TFU_IOC_DIMTW (1 << 0) ++#define V3D71_TFU_IOC_FORMAT_SHIFT 12 ++#define V3D71_TFU_IOC_FORMAT_LINEARTILE 3 ++#define V3D71_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4 ++#define V3D71_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5 ++#define V3D71_TFU_IOA_FORMAT_UIF_NO_XOR 6 ++#define V3D71_TFU_IOA_FORMAT_UIF_XOR 7 ++ ++#define V3D71_TFU_IOC_STRIDE_SHIFT 16 ++#define V3D71_TFU_IOC_NUMMM_SHIFT 4 ++ ++#define V3D71_TFU_ICFG_OTYPE_SHIFT 16 ++#define V3D71_TFU_ICFG_IFORMAT_SHIFT 23 ++#define V3D71_TFU_ICFG_FORMAT_RASTER 0 ++#define V3D71_TFU_ICFG_FORMAT_SAND_128 1 ++#define V3D71_TFU_ICFG_FORMAT_SAND_256 2 ++#define V3D71_TFU_ICFG_FORMAT_LINEARTILE 11 ++#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12 ++#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13 ++#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR 14 ++#define V3D71_TFU_ICFG_FORMAT_UIF_XOR 15 ++ + #endif +diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c +index 57872a923d34..8a50d2799853 100644 +--- a/src/broadcom/common/v3d_util.c ++++ b/src/broadcom/common/v3d_util.c +@@ -87,10 +87,37 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, + return best_wgs_per_sg; + } + ++#define V3D71_TLB_COLOR_SIZE (16 * 1024) ++#define V3D71_TLB_DETPH_SIZE (16 * 1024) 
++#define V3D71_TLB_AUX_DETPH_SIZE (8 * 1024) ++ ++static bool ++tile_size_valid(uint32_t pixel_count, uint32_t color_bpp, uint32_t depth_bpp) ++{ ++ /* First, we check if we can fit this tile size allocating the depth ++ * TLB memory to color. ++ */ ++ if (pixel_count * depth_bpp <= V3D71_TLB_AUX_DETPH_SIZE && ++ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE + V3D71_TLB_DETPH_SIZE) { ++ return true; ++ } ++ ++ /* Otherwise the tile must fit in the main TLB buffers */ ++ return pixel_count * depth_bpp <= V3D71_TLB_DETPH_SIZE && ++ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE; ++} ++ + void +-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, +- bool msaa, bool double_buffer, +- uint32_t *width, uint32_t *height) ++v3d_choose_tile_size(const struct v3d_device_info *devinfo, ++ uint32_t color_attachment_count, ++ /* V3D 4.x max internal bpp of all RTs */ ++ uint32_t max_internal_bpp, ++ /* V3D 7.x accumulated bpp for all RTs (in bytes) */ ++ uint32_t total_color_bpp, ++ bool msaa, ++ bool double_buffer, ++ uint32_t *width, ++ uint32_t *height) + { + static const uint8_t tile_sizes[] = { + 64, 64, +@@ -103,19 +130,65 @@ v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, + }; + + uint32_t idx = 0; +- if (color_attachment_count > 2) +- idx += 2; +- else if (color_attachment_count > 1) +- idx += 1; ++ if (devinfo->ver >= 71) { ++ /* In V3D 7.x, we use the actual bpp used by color attachments to compute ++ * the tile size instead of the maximum bpp. This may allow us to choose a ++ * larger tile size than we would in 4.x in scenarios with multiple RTs ++ * with different bpps. ++ * ++ * Also, the TLB has an auxiliary buffer of 8KB that will be automatically ++ * used for depth instead of the main 16KB depth TLB buffer when the depth ++ * tile fits in the auxiliary buffer, allowing the hardware to allocate ++ * the 16KB from the main depth TLB to the color TLB. If we can do that, ++ * then we are effectively doubling the memory we have for color and we ++ * can also select a larger tile size. This is necessary to support ++ * the most expensive configuration: 8x128bpp RTs + MSAA. ++ * ++ * FIXME: the docs state that depth TLB memory can be used for color ++ * if depth testing is not used by setting the 'depth disable' bit in the ++ * rendering configuration. However, this comes with a requirement that ++ * occlussion queries must not be active. We need to clarify if this means ++ * active at the point at which we emit a tile rendering configuration ++ * item, meaning that the we have a query spanning a full render pass ++ * (this is something we can tell before we emit the rendering ++ * configuration item) or active in the subpass for which we are enabling ++ * the bit (which we can't tell until later, when we record commands for ++ * the subpass). If it is the latter, then we cannot use this feature. ++ * ++ * FIXME: pending handling double_buffer. ++ */ ++ const uint32_t color_bpp = total_color_bpp * (msaa ? 4 : 1); ++ const uint32_t depth_bpp = 4 * (msaa ? 4 : 1); ++ do { ++ const uint32_t tile_w = tile_sizes[idx * 2]; ++ const uint32_t tile_h = tile_sizes[idx * 2 + 1]; ++ if (tile_size_valid(tile_w * tile_h, color_bpp, depth_bpp)) ++ break; ++ idx++; ++ } while (idx < ARRAY_SIZE(tile_sizes) / 2); ++ ++ /* FIXME: pending handling double_buffer */ ++ assert(!double_buffer); ++ } else { ++ /* On V3D 4.x tile size is selected based on the number of RTs, the ++ * maximum bpp across all of them and whether 4x MSAA is used. 
++ */ ++ if (color_attachment_count > 4) ++ idx += 3; ++ else if (color_attachment_count > 2) ++ idx += 2; ++ else if (color_attachment_count > 1) ++ idx += 1; + +- /* MSAA and double-buffer are mutually exclusive */ +- assert(!msaa || !double_buffer); +- if (msaa) +- idx += 2; +- else if (double_buffer) +- idx += 1; ++ /* MSAA and double-buffer are mutually exclusive */ ++ assert(!msaa || !double_buffer); ++ if (msaa) ++ idx += 2; ++ else if (double_buffer) ++ idx += 1; + +- idx += max_color_bpp; ++ idx += max_internal_bpp; ++ } + + assert(idx < ARRAY_SIZE(tile_sizes) / 2); + +@@ -170,3 +243,30 @@ v3d_hw_prim_type(enum mesa_prim prim_type) + unreachable("Unsupported primitive type"); + } + } ++ ++uint32_t ++v3d_internal_bpp_words(uint32_t internal_bpp) ++{ ++ switch (internal_bpp) { ++ case 0 /* V3D_INTERNAL_BPP_32 */: ++ return 1; ++ case 1 /* V3D_INTERNAL_BPP_64 */: ++ return 2; ++ case 2 /* V3D_INTERNAL_BPP_128 */: ++ return 4; ++ default: ++ unreachable("Unsupported internal BPP"); ++ } ++} ++ ++uint32_t ++v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width, ++ uint32_t bpp) ++{ ++ /* stride in multiples of 128 bits, and covers 2 rows. This is the ++ * reason we divide by 2 instead of 4, as we divide number of 32-bit ++ * words per row by 2. ++ */ ++ ++ return (tile_width * bpp) / 2; ++} +diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h +index eb802b77f674..d02d41dd0897 100644 +--- a/src/broadcom/common/v3d_util.h ++++ b/src/broadcom/common/v3d_util.h +@@ -24,6 +24,7 @@ + #ifndef V3D_UTIL_H + #define V3D_UTIL_H + ++#include "util/macros.h" + #include "common/v3d_device_info.h" + #include "pipe/p_defines.h" + +@@ -36,9 +37,14 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, + uint32_t wg_size); + + void +-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, +- bool msaa, bool double_buffer, +- uint32_t *width, uint32_t *height); ++v3d_choose_tile_size(const struct v3d_device_info *devinfo, ++ uint32_t color_attachment_count, ++ uint32_t max_internal_bpp, ++ uint32_t total_color_bpp, ++ bool msaa, ++ bool double_buffer, ++ uint32_t *width, ++ uint32_t *height); + + uint32_t + v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle); +@@ -46,4 +52,30 @@ v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle); + uint32_t + v3d_hw_prim_type(enum mesa_prim prim_type); + ++uint32_t ++v3d_internal_bpp_words(uint32_t internal_bpp); ++ ++/* Some configuration packets want the size on log2, but starting at 0 for ++ * size 8. 
++ */ ++static inline uint8_t ++log2_tile_size(uint32_t size) ++{ ++ switch(size) { ++ case 8: ++ return 0; ++ case 16: ++ return 1; ++ case 32: ++ return 2; ++ case 64: ++ return 3; ++ default: ++ unreachable("Unsupported tile width/height"); ++ } ++} ++ ++uint32_t ++v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width, ++ uint32_t bpp); + #endif +diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c +index ca072971f01d..bef4126c2dc2 100644 +--- a/src/broadcom/compiler/nir_to_vir.c ++++ b/src/broadcom/compiler/nir_to_vir.c +@@ -1005,32 +1005,36 @@ emit_fragcoord_input(struct v3d_compile *c, int attr) + + static struct qreg + emit_smooth_varying(struct v3d_compile *c, +- struct qreg vary, struct qreg w, struct qreg r5) ++ struct qreg vary, struct qreg w, struct qreg c_reg) + { +- return vir_FADD(c, vir_FMUL(c, vary, w), r5); ++ return vir_FADD(c, vir_FMUL(c, vary, w), c_reg); + } + + static struct qreg + emit_noperspective_varying(struct v3d_compile *c, +- struct qreg vary, struct qreg r5) ++ struct qreg vary, struct qreg c_reg) + { +- return vir_FADD(c, vir_MOV(c, vary), r5); ++ return vir_FADD(c, vir_MOV(c, vary), c_reg); + } + + static struct qreg + emit_flat_varying(struct v3d_compile *c, +- struct qreg vary, struct qreg r5) ++ struct qreg vary, struct qreg c_reg) + { + vir_MOV_dest(c, c->undef, vary); +- return vir_MOV(c, r5); ++ return vir_MOV(c, c_reg); + } + + static struct qreg + emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + int8_t input_idx, uint8_t swizzle, int array_index) + { +- struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); +- struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); ++ struct qreg c_reg; /* C coefficient */ ++ ++ if (c->devinfo->has_accumulators) ++ c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); ++ else ++ c_reg = vir_reg(QFILE_REG, 0); + + struct qinst *ldvary = NULL; + struct qreg vary; +@@ -1041,7 +1045,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + vary = vir_emit_def(c, ldvary); + } else { + vir_NOP(c)->qpu.sig.ldvary = true; +- vary = r3; ++ vary = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); + } + + /* Store the input value before interpolation so we can implement +@@ -1050,7 +1054,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + if (input_idx >= 0) { + assert(var); + c->interp[input_idx].vp = vary; +- c->interp[input_idx].C = vir_MOV(c, r5); ++ c->interp[input_idx].C = vir_MOV(c, c_reg); + c->interp[input_idx].mode = var->data.interpolation; + } + +@@ -1060,7 +1064,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + */ + if (!var) { + assert(input_idx < 0); +- return emit_smooth_varying(c, vary, c->payload_w, r5); ++ return emit_smooth_varying(c, vary, c->payload_w, c_reg); + } + + int i = c->num_inputs++; +@@ -1075,20 +1079,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + if (var->data.centroid) { + BITSET_SET(c->centroid_flags, i); + result = emit_smooth_varying(c, vary, +- c->payload_w_centroid, r5); ++ c->payload_w_centroid, c_reg); + } else { +- result = emit_smooth_varying(c, vary, c->payload_w, r5); ++ result = emit_smooth_varying(c, vary, c->payload_w, c_reg); + } + break; + + case INTERP_MODE_NOPERSPECTIVE: + BITSET_SET(c->noperspective_flags, i); +- result = emit_noperspective_varying(c, vary, r5); ++ result = emit_noperspective_varying(c, vary, c_reg); + break; + + case INTERP_MODE_FLAT: + BITSET_SET(c->flat_shade_flags, i); +- result = emit_flat_varying(c, vary, r5); ++ result = 
emit_flat_varying(c, vary, c_reg); + break; + + default: +@@ -2440,15 +2444,17 @@ ntq_setup_outputs(struct v3d_compile *c) + + switch (var->data.location) { + case FRAG_RESULT_COLOR: +- c->output_color_var[0] = var; +- c->output_color_var[1] = var; +- c->output_color_var[2] = var; +- c->output_color_var[3] = var; ++ for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) ++ c->output_color_var[i] = var; + break; + case FRAG_RESULT_DATA0: + case FRAG_RESULT_DATA1: + case FRAG_RESULT_DATA2: + case FRAG_RESULT_DATA3: ++ case FRAG_RESULT_DATA4: ++ case FRAG_RESULT_DATA5: ++ case FRAG_RESULT_DATA6: ++ case FRAG_RESULT_DATA7: + c->output_color_var[var->data.location - + FRAG_RESULT_DATA0] = var; + break; +@@ -4321,7 +4327,11 @@ nir_to_vir(struct v3d_compile *c) + { + switch (c->s->info.stage) { + case MESA_SHADER_FRAGMENT: +- c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); ++ if (c->devinfo->ver < 71) ++ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); ++ else ++ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3)); ++ + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); + c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); + +@@ -4354,8 +4364,13 @@ nir_to_vir(struct v3d_compile *c) + V3D_QPU_WADDR_SYNC)); + } + +- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); +- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); ++ if (c->devinfo->ver <= 42) { ++ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); ++ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); ++ } else if (c->devinfo->ver >= 71) { ++ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3)); ++ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); ++ } + + /* Set up the division between gl_LocalInvocationIndex and + * wg_in_mem in the payload reg. +@@ -4534,8 +4549,8 @@ vir_check_payload_w(struct v3d_compile *c) + + vir_for_each_inst_inorder(inst, c) { + for (int i = 0; i < vir_get_nsrc(inst); i++) { +- if (inst->src[i].file == QFILE_REG && +- inst->src[i].index == 0) { ++ if (inst->src[i].file == c->payload_w.file && ++ inst->src[i].index == c->payload_w.index) { + c->uses_center_w = true; + return; + } +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 3b32b48f86f0..864947063861 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -155,12 +155,13 @@ static void + process_mux_deps(struct schedule_state *state, struct schedule_node *n, + enum v3d_qpu_mux mux) + { ++ assert(state->devinfo->ver < 71); + switch (mux) { + case V3D_QPU_MUX_A: + add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); + break; + case V3D_QPU_MUX_B: +- if (!n->inst->qpu.sig.small_imm) { ++ if (!n->inst->qpu.sig.small_imm_b) { + add_read_dep(state, + state->last_rf[n->inst->qpu.raddr_b], n); + } +@@ -171,6 +172,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n, + } + } + ++ ++static void ++process_raddr_deps(struct schedule_state *state, struct schedule_node *n, ++ uint8_t raddr, bool is_small_imm) ++{ ++ assert(state->devinfo->ver >= 71); ++ ++ if (!is_small_imm) ++ add_read_dep(state, state->last_rf[raddr], n); ++} ++ + static bool + tmu_write_is_sequence_terminator(uint32_t waddr) + { +@@ -285,6 +297,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) + /* If the input and output segments are shared, then all VPM reads to + * a location need to happen before all writes. We handle this by + * serializing all VPM operations for now. ++ * ++ * FIXME: we are assuming that the segments are shared. 
That is ++ * correct right now as we are only using shared, but technically you ++ * can choose. + */ + bool separate_vpm_segment = false; + +@@ -305,15 +321,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) + + /* XXX: LOAD_IMM */ + +- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) +- process_mux_deps(state, n, inst->alu.add.a); +- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) +- process_mux_deps(state, n, inst->alu.add.b); ++ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { ++ if (devinfo->ver < 71) { ++ process_mux_deps(state, n, inst->alu.add.a.mux); ++ } else { ++ process_raddr_deps(state, n, inst->alu.add.a.raddr, ++ inst->sig.small_imm_a); ++ } ++ } ++ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { ++ if (devinfo->ver < 71) { ++ process_mux_deps(state, n, inst->alu.add.b.mux); ++ } else { ++ process_raddr_deps(state, n, inst->alu.add.b.raddr, ++ inst->sig.small_imm_b); ++ } ++ } + +- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) +- process_mux_deps(state, n, inst->alu.mul.a); +- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) +- process_mux_deps(state, n, inst->alu.mul.b); ++ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { ++ if (devinfo->ver < 71) { ++ process_mux_deps(state, n, inst->alu.mul.a.mux); ++ } else { ++ process_raddr_deps(state, n, inst->alu.mul.a.raddr, ++ inst->sig.small_imm_c); ++ } ++ } ++ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { ++ if (devinfo->ver < 71) { ++ process_mux_deps(state, n, inst->alu.mul.b.mux); ++ } else { ++ process_raddr_deps(state, n, inst->alu.mul.b.raddr, ++ inst->sig.small_imm_d); ++ } ++ } + + switch (inst->alu.add.op) { + case V3D_QPU_A_VPMSETUP: +@@ -386,6 +426,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) + add_write_dep(state, &state->last_r[4], n); + if (v3d_qpu_writes_r5(devinfo, inst)) + add_write_dep(state, &state->last_r[5], n); ++ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) ++ add_write_dep(state, &state->last_rf[0], n); + + /* If we add any more dependencies here we should consider whether we + * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 
+@@ -500,6 +542,10 @@ struct choose_scoreboard { + int ldvary_count; + int pending_ldtmu_count; + bool first_ldtmu_after_thrsw; ++ ++ /* V3D 7.x */ ++ int last_implicit_rf0_write_tick; ++ bool has_rf0_flops_conflict; + }; + + static bool +@@ -524,7 +570,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard, + } + + static bool +-reads_too_soon_after_write(struct choose_scoreboard *scoreboard, ++reads_too_soon(struct choose_scoreboard *scoreboard, ++ const struct v3d_qpu_instr *inst, uint8_t raddr) ++{ ++ switch (raddr) { ++ case 0: /* ldvary delayed write of C coefficient to rf0 */ ++ if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) ++ return true; ++ break; ++ default: ++ break; ++ } ++ ++ return false; ++} ++ ++static bool ++reads_too_soon_after_write(const struct v3d_device_info *devinfo, ++ struct choose_scoreboard *scoreboard, + struct qinst *qinst) + { + const struct v3d_qpu_instr *inst = &qinst->qpu; +@@ -536,24 +599,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, + assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); + + if (inst->alu.add.op != V3D_QPU_A_NOP) { +- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { +- return true; ++ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { ++ if (devinfo->ver < 71) { ++ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) ++ return true; ++ } else { ++ if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr)) ++ return true; ++ } + } +- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { +- return true; ++ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { ++ if (devinfo->ver < 71) { ++ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) ++ return true; ++ } else { ++ if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr)) ++ return true; ++ } + } + } + + if (inst->alu.mul.op != V3D_QPU_M_NOP) { +- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { +- return true; ++ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { ++ if (devinfo->ver < 71) { ++ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) ++ return true; ++ } else { ++ if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr)) ++ return true; ++ } + } +- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { +- return true; ++ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { ++ if (devinfo->ver < 71) { ++ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) ++ return true; ++ } else { ++ if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr)) ++ return true; ++ } + } + } + +@@ -577,6 +660,21 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo, + v3d_qpu_writes_r4(devinfo, inst)) + return true; + ++ if (devinfo->ver <= 42) ++ return false; ++ ++ /* Don't schedule anything that writes rf0 right after ldvary, since ++ * that would clash with the ldvary's delayed rf0 write (the exception ++ * is another ldvary, since its implicit rf0 write would also have ++ * one cycle of delay and would not clash). 
++ */ ++ if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick && ++ (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || ++ (v3d_qpu_writes_rf0_implicitly(devinfo, inst) && ++ !inst->sig.ldvary))) { ++ return true; ++ } ++ + return false; + } + +@@ -604,29 +702,36 @@ pixel_scoreboard_too_soon(struct v3d_compile *c, + } + + static bool +-qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, ++qpu_instruction_uses_rf(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *inst, + uint32_t waddr) { + + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + +- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && +- inst->raddr_a == waddr) +- return true; ++ if (devinfo->ver < 71) { ++ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && ++ inst->raddr_a == waddr) ++ return true; + +- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && +- !inst->sig.small_imm && (inst->raddr_b == waddr)) +- return true; ++ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && ++ !inst->sig.small_imm_b && (inst->raddr_b == waddr)) ++ return true; ++ } else { ++ if (v3d71_qpu_reads_raddr(inst, waddr)) ++ return true; ++ } + + return false; + } + + static bool +-mux_read_stalls(struct choose_scoreboard *scoreboard, +- const struct v3d_qpu_instr *inst) ++read_stalls(const struct v3d_device_info *devinfo, ++ struct choose_scoreboard *scoreboard, ++ const struct v3d_qpu_instr *inst) + { + return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && +- qpu_instruction_uses_rf(inst, ++ qpu_instruction_uses_rf(devinfo, inst, + scoreboard->last_stallable_sfu_reg); + } + +@@ -692,7 +797,8 @@ enum { + V3D_PERIPHERAL_TMU_WAIT = (1 << 6), + V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7), + V3D_PERIPHERAL_TSY = (1 << 8), +- V3D_PERIPHERAL_TLB = (1 << 9), ++ V3D_PERIPHERAL_TLB_READ = (1 << 9), ++ V3D_PERIPHERAL_TLB_WRITE = (1 << 10), + }; + + static uint32_t +@@ -717,8 +823,10 @@ qpu_peripherals(const struct v3d_device_info *devinfo, + if (v3d_qpu_uses_sfu(inst)) + result |= V3D_PERIPHERAL_SFU; + +- if (v3d_qpu_uses_tlb(inst)) +- result |= V3D_PERIPHERAL_TLB; ++ if (v3d_qpu_reads_tlb(inst)) ++ result |= V3D_PERIPHERAL_TLB_READ; ++ if (v3d_qpu_writes_tlb(inst)) ++ result |= V3D_PERIPHERAL_TLB_WRITE; + + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if (inst->alu.add.op != V3D_QPU_A_NOP && +@@ -749,32 +857,75 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, + if (devinfo->ver < 41) + return false; + +- /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than +- * tmuc). ++ /* V3D 4.x can't do more than one peripheral access except in a ++ * few cases: + */ +- if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && +- b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { +- return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); ++ if (devinfo->ver <= 42) { ++ /* WRTMUC signal with TMU register write (other than tmuc). */ ++ if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && ++ b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { ++ return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); ++ } ++ if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && ++ a_peripherals == V3D_PERIPHERAL_TMU_WRITE) { ++ return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); ++ } ++ ++ /* TMU read with VPM read/write. 
*/ ++ if (a_peripherals == V3D_PERIPHERAL_TMU_READ && ++ (b_peripherals == V3D_PERIPHERAL_VPM_READ || ++ b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { ++ return true; ++ } ++ if (b_peripherals == V3D_PERIPHERAL_TMU_READ && ++ (a_peripherals == V3D_PERIPHERAL_VPM_READ || ++ a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { ++ return true; ++ } ++ ++ return false; + } + +- if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE && +- b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) { +- return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); ++ /* V3D 7.x can't have more than one of these restricted peripherals */ ++ const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE | ++ V3D_PERIPHERAL_TMU_WRTMUC_SIG | ++ V3D_PERIPHERAL_TSY | ++ V3D_PERIPHERAL_TLB_READ | ++ V3D_PERIPHERAL_SFU | ++ V3D_PERIPHERAL_VPM_READ | ++ V3D_PERIPHERAL_VPM_WRITE; ++ ++ const uint32_t a_restricted = a_peripherals & restricted; ++ const uint32_t b_restricted = b_peripherals & restricted; ++ if (a_restricted && b_restricted) { ++ /* WRTMUC signal with TMU register write (other than tmuc) is ++ * allowed though. ++ */ ++ if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && ++ b_restricted == V3D_PERIPHERAL_TMU_WRITE && ++ v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || ++ (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && ++ a_restricted == V3D_PERIPHERAL_TMU_WRITE && ++ v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) { ++ return false; ++ } + } + +- /* V3D 4.1+ allows TMU read with VPM read/write. */ +- if (a_peripherals == V3D_PERIPHERAL_TMU_READ && +- (b_peripherals == V3D_PERIPHERAL_VPM_READ || +- b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { +- return true; ++ /* Only one TMU read per instruction */ ++ if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) && ++ (b_peripherals & V3D_PERIPHERAL_TMU_READ)) { ++ return false; + } +- if (b_peripherals == V3D_PERIPHERAL_TMU_READ && +- (a_peripherals == V3D_PERIPHERAL_VPM_READ || +- a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { +- return true; ++ ++ /* Only one TLB access per instruction */ ++ if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE | ++ V3D_PERIPHERAL_TLB_READ)) && ++ (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE | ++ V3D_PERIPHERAL_TLB_READ))) { ++ return false; + } + +- return false; ++ return true; + } + + /* Compute a bitmask of which rf registers are used between +@@ -790,42 +941,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a, + uint64_t raddrs_used = 0; + if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) + raddrs_used |= (1ll << a->raddr_a); +- if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) ++ if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) + raddrs_used |= (1ll << a->raddr_b); + if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) + raddrs_used |= (1ll << b->raddr_a); +- if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) ++ if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) + raddrs_used |= (1ll << b->raddr_b); + + return raddrs_used; + } + +-/* Take two instructions and attempt to merge their raddr fields +- * into one merged instruction. Returns false if the two instructions +- * access more than two different rf registers between them, or more +- * than one rf register and one small immediate. ++/* Takes two instructions and attempts to merge their raddr fields (including ++ * small immediates) into one merged instruction. For V3D 4.x, returns false ++ * if the two instructions access more than two different rf registers between ++ * them, or more than one rf register and one small immediate. For 7.x returns ++ * false if both instructions use small immediates. 
+ */ + static bool + qpu_merge_raddrs(struct v3d_qpu_instr *result, + const struct v3d_qpu_instr *add_instr, +- const struct v3d_qpu_instr *mul_instr) ++ const struct v3d_qpu_instr *mul_instr, ++ const struct v3d_device_info *devinfo) + { ++ if (devinfo->ver >= 71) { ++ assert(add_instr->sig.small_imm_a + ++ add_instr->sig.small_imm_b <= 1); ++ assert(add_instr->sig.small_imm_c + ++ add_instr->sig.small_imm_d == 0); ++ assert(mul_instr->sig.small_imm_a + ++ mul_instr->sig.small_imm_b == 0); ++ assert(mul_instr->sig.small_imm_c + ++ mul_instr->sig.small_imm_d <= 1); ++ ++ result->sig.small_imm_a = add_instr->sig.small_imm_a; ++ result->sig.small_imm_b = add_instr->sig.small_imm_b; ++ result->sig.small_imm_c = mul_instr->sig.small_imm_c; ++ result->sig.small_imm_d = mul_instr->sig.small_imm_d; ++ ++ return (result->sig.small_imm_a + ++ result->sig.small_imm_b + ++ result->sig.small_imm_c + ++ result->sig.small_imm_d) <= 1; ++ } ++ ++ assert(devinfo->ver <= 42); ++ + uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); + int naddrs = util_bitcount64(raddrs_used); + + if (naddrs > 2) + return false; + +- if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { ++ if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) { + if (naddrs > 1) + return false; + +- if (add_instr->sig.small_imm && mul_instr->sig.small_imm) ++ if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b) + if (add_instr->raddr_b != mul_instr->raddr_b) + return false; + +- result->sig.small_imm = true; +- result->raddr_b = add_instr->sig.small_imm ? ++ result->sig.small_imm_b = true; ++ result->raddr_b = add_instr->sig.small_imm_b ? + add_instr->raddr_b : mul_instr->raddr_b; + } + +@@ -836,23 +1012,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, + raddrs_used &= ~(1ll << raddr_a); + result->raddr_a = raddr_a; + +- if (!result->sig.small_imm) { ++ if (!result->sig.small_imm_b) { + if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && + raddr_a == add_instr->raddr_b) { +- if (add_instr->alu.add.a == V3D_QPU_MUX_B) +- result->alu.add.a = V3D_QPU_MUX_A; +- if (add_instr->alu.add.b == V3D_QPU_MUX_B && ++ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B) ++ result->alu.add.a.mux = V3D_QPU_MUX_A; ++ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B && + v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { +- result->alu.add.b = V3D_QPU_MUX_A; ++ result->alu.add.b.mux = V3D_QPU_MUX_A; + } + } + if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && + raddr_a == mul_instr->raddr_b) { +- if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) +- result->alu.mul.a = V3D_QPU_MUX_A; +- if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && ++ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B) ++ result->alu.mul.a.mux = V3D_QPU_MUX_A; ++ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B && + v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { +- result->alu.mul.b = V3D_QPU_MUX_A; ++ result->alu.mul.b.mux = V3D_QPU_MUX_A; + } + } + } +@@ -863,20 +1039,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, + result->raddr_b = raddr_b; + if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && + raddr_b == add_instr->raddr_a) { +- if (add_instr->alu.add.a == V3D_QPU_MUX_A) +- result->alu.add.a = V3D_QPU_MUX_B; +- if (add_instr->alu.add.b == V3D_QPU_MUX_A && ++ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A) ++ result->alu.add.a.mux = V3D_QPU_MUX_B; ++ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A && + v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { +- result->alu.add.b = V3D_QPU_MUX_B; ++ result->alu.add.b.mux = V3D_QPU_MUX_B; + } + } + if 
(v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && + raddr_b == mul_instr->raddr_a) { +- if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) +- result->alu.mul.a = V3D_QPU_MUX_B; +- if (mul_instr->alu.mul.b == V3D_QPU_MUX_A && ++ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A) ++ result->alu.mul.a.mux = V3D_QPU_MUX_B; ++ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A && + v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { +- result->alu.mul.b = V3D_QPU_MUX_B; ++ result->alu.mul.b.mux = V3D_QPU_MUX_B; + } + } + +@@ -909,7 +1085,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op) + } + + static void +-qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) ++qpu_convert_add_to_mul(const struct v3d_device_info *devinfo, ++ struct v3d_qpu_instr *inst) + { + STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); + assert(inst->alu.add.op != V3D_QPU_A_NOP); +@@ -927,11 +1104,85 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) + inst->flags.auf = V3D_QPU_UF_NONE; + + inst->alu.mul.output_pack = inst->alu.add.output_pack; +- inst->alu.mul.a_unpack = inst->alu.add.a_unpack; +- inst->alu.mul.b_unpack = inst->alu.add.b_unpack; ++ ++ inst->alu.mul.a.unpack = inst->alu.add.a.unpack; ++ inst->alu.mul.b.unpack = inst->alu.add.b.unpack; + inst->alu.add.output_pack = V3D_QPU_PACK_NONE; +- inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; +- inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; ++ inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; ++ inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; ++ ++ if (devinfo->ver >= 71) { ++ assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d); ++ assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1); ++ if (inst->sig.small_imm_a) { ++ inst->sig.small_imm_c = true; ++ inst->sig.small_imm_a = false; ++ } else if (inst->sig.small_imm_b) { ++ inst->sig.small_imm_d = true; ++ inst->sig.small_imm_b = false; ++ } ++ } ++} ++ ++static bool ++can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op) ++{ ++ switch (op) { ++ case V3D_QPU_M_MOV: ++ case V3D_QPU_M_FMOV: ++ return devinfo->ver >= 71; ++ default: ++ return false; ++ } ++} ++ ++static enum v3d_qpu_mul_op ++mul_op_as_add_op(enum v3d_qpu_mul_op op) ++{ ++ switch (op) { ++ case V3D_QPU_M_MOV: ++ return V3D_QPU_A_MOV; ++ case V3D_QPU_M_FMOV: ++ return V3D_QPU_A_FMOV; ++ default: ++ unreachable("unexpected mov opcode"); ++ } ++} ++ ++static void ++qpu_convert_mul_to_add(struct v3d_qpu_instr *inst) ++{ ++ STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul)); ++ assert(inst->alu.mul.op != V3D_QPU_M_NOP); ++ assert(inst->alu.add.op == V3D_QPU_A_NOP); ++ ++ memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add)); ++ inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op); ++ inst->alu.mul.op = V3D_QPU_M_NOP; ++ ++ inst->flags.ac = inst->flags.mc; ++ inst->flags.apf = inst->flags.mpf; ++ inst->flags.auf = inst->flags.muf; ++ inst->flags.mc = V3D_QPU_COND_NONE; ++ inst->flags.mpf = V3D_QPU_PF_NONE; ++ inst->flags.muf = V3D_QPU_UF_NONE; ++ ++ inst->alu.add.output_pack = inst->alu.mul.output_pack; ++ inst->alu.add.a.unpack = inst->alu.mul.a.unpack; ++ inst->alu.add.b.unpack = inst->alu.mul.b.unpack; ++ inst->alu.mul.output_pack = V3D_QPU_PACK_NONE; ++ inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; ++ inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; ++ ++ assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b); ++ assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1); ++ if (inst->sig.small_imm_c) { ++ inst->sig.small_imm_a = true; ++ inst->sig.small_imm_c = false; ++ } else if (inst->sig.small_imm_d) { ++ 
inst->sig.small_imm_b = true; ++ inst->sig.small_imm_d = false; ++ } + } + + static bool +@@ -970,20 +1221,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + else if (a->alu.mul.op == V3D_QPU_M_NOP && + can_do_add_as_mul(b->alu.add.op)) { + mul_inst = *b; +- qpu_convert_add_to_mul(&mul_inst); ++ qpu_convert_add_to_mul(devinfo, &mul_inst); + + merge.alu.mul = mul_inst.alu.mul; + +- merge.flags.mc = b->flags.ac; +- merge.flags.mpf = b->flags.apf; +- merge.flags.muf = b->flags.auf; ++ merge.flags.mc = mul_inst.flags.mc; ++ merge.flags.mpf = mul_inst.flags.mpf; ++ merge.flags.muf = mul_inst.flags.muf; + + add_instr = a; + mul_instr = &mul_inst; + } else if (a->alu.mul.op == V3D_QPU_M_NOP && + can_do_add_as_mul(a->alu.add.op)) { + mul_inst = *a; +- qpu_convert_add_to_mul(&mul_inst); ++ qpu_convert_add_to_mul(devinfo, &mul_inst); + + merge = mul_inst; + merge.alu.add = b->alu.add; +@@ -999,22 +1250,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + } + } + ++ struct v3d_qpu_instr add_inst; + if (b->alu.mul.op != V3D_QPU_M_NOP) { +- if (a->alu.mul.op != V3D_QPU_M_NOP) +- return false; +- merge.alu.mul = b->alu.mul; ++ if (a->alu.mul.op == V3D_QPU_M_NOP) { ++ merge.alu.mul = b->alu.mul; ++ ++ merge.flags.mc = b->flags.mc; ++ merge.flags.mpf = b->flags.mpf; ++ merge.flags.muf = b->flags.muf; ++ ++ mul_instr = b; ++ add_instr = a; ++ } ++ /* If a's mul op is used but its add op is not, then see if we ++ * can convert either a's mul op or b's mul op to an add op ++ * so we can merge. ++ */ ++ else if (a->alu.add.op == V3D_QPU_A_NOP && ++ can_do_mul_as_add(devinfo, b->alu.mul.op)) { ++ add_inst = *b; ++ qpu_convert_mul_to_add(&add_inst); + +- merge.flags.mc = b->flags.mc; +- merge.flags.mpf = b->flags.mpf; +- merge.flags.muf = b->flags.muf; ++ merge.alu.add = add_inst.alu.add; + +- mul_instr = b; +- add_instr = a; ++ merge.flags.ac = add_inst.flags.ac; ++ merge.flags.apf = add_inst.flags.apf; ++ merge.flags.auf = add_inst.flags.auf; ++ ++ mul_instr = a; ++ add_instr = &add_inst; ++ } else if (a->alu.add.op == V3D_QPU_A_NOP && ++ can_do_mul_as_add(devinfo, a->alu.mul.op)) { ++ add_inst = *a; ++ qpu_convert_mul_to_add(&add_inst); ++ ++ merge = add_inst; ++ merge.alu.mul = b->alu.mul; ++ ++ merge.flags.mc = b->flags.mc; ++ merge.flags.mpf = b->flags.mpf; ++ merge.flags.muf = b->flags.muf; ++ ++ mul_instr = b; ++ add_instr = &add_inst; ++ } else { ++ return false; ++ } + } + ++ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and ++ * they have restrictions on the number of raddrs that can be adressed ++ * in a single instruction. In V3D 7.x, we don't have that restriction, ++ * but we are still limited to a single small immediate per instruction. ++ */ + if (add_instr && mul_instr && +- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) { +- return false; ++ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) { ++ return false; + } + + merge.sig.thrsw |= b->sig.thrsw; +@@ -1025,7 +1316,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + merge.sig.ldtmu |= b->sig.ldtmu; + merge.sig.ldvary |= b->sig.ldvary; + merge.sig.ldvpm |= b->sig.ldvpm; +- merge.sig.small_imm |= b->sig.small_imm; + merge.sig.ldtlb |= b->sig.ldtlb; + merge.sig.ldtlbu |= b->sig.ldtlbu; + merge.sig.ucb |= b->sig.ucb; +@@ -1108,7 +1398,7 @@ retry: + * regfile A or B that was written to by the previous + * instruction." 
+ */ +- if (reads_too_soon_after_write(scoreboard, n->inst)) ++ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst)) + continue; + + if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst)) +@@ -1122,10 +1412,11 @@ retry: + if (pixel_scoreboard_too_soon(c, scoreboard, inst)) + continue; + +- /* ldunif and ldvary both write r5, but ldunif does so a tick +- * sooner. If the ldvary's r5 wasn't used, then ldunif might ++ /* ldunif and ldvary both write the same register (r5 for v42 ++ * and below, rf0 for v71), but ldunif does so a tick sooner. ++ * If the ldvary's register wasn't used, then ldunif might + * otherwise get scheduled so ldunif and ldvary try to update +- * r5 in the same tick. ++ * the register in the same tick. + */ + if ((inst->sig.ldunif || inst->sig.ldunifa) && + scoreboard->tick == scoreboard->last_ldvary_tick + 1) { +@@ -1204,11 +1495,20 @@ retry: + * ldvary now if the follow-up fixup would place + * it in the delay slots of a thrsw, which is not + * allowed and would prevent the fixup from being +- * successful. ++ * successful. In V3D 7.x we can allow this to happen ++ * as long as it is not the last delay slot. + */ +- if (inst->sig.ldvary && +- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) { +- continue; ++ if (inst->sig.ldvary) { ++ if (c->devinfo->ver <= 42 && ++ scoreboard->last_thrsw_tick + 2 >= ++ scoreboard->tick - 1) { ++ continue; ++ } ++ if (c->devinfo->ver >= 71 && ++ scoreboard->last_thrsw_tick + 2 == ++ scoreboard->tick - 1) { ++ continue; ++ } + } + + /* We can emit a new tmu lookup with a previous ldtmu +@@ -1243,7 +1543,7 @@ retry: + + int prio = get_instruction_priority(c->devinfo, inst); + +- if (mux_read_stalls(scoreboard, inst)) { ++ if (read_stalls(c->devinfo, scoreboard, inst)) { + /* Don't merge an instruction that stalls */ + if (prev_inst) + continue; +@@ -1340,6 +1640,62 @@ update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard, + } + } + ++static void ++set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard, ++ const struct v3d_qpu_instr *inst, ++ const struct v3d_device_info *devinfo) ++{ ++ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick && ++ v3d_qpu_sig_writes_address(devinfo, &inst->sig) && ++ !inst->sig_magic) { ++ scoreboard->has_rf0_flops_conflict = true; ++ } ++} ++ ++static void ++update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard, ++ const struct v3d_qpu_instr *inst, ++ const struct v3d_device_info *devinfo) ++{ ++ if (devinfo->ver < 71) ++ return; ++ ++ /* Thread switch restrictions: ++ * ++ * At the point of a thread switch or thread end (when the actual ++ * thread switch or thread end happens, not when the signalling ++ * instruction is processed): ++ * ++ * - If the most recent write to rf0 was from a ldunif, ldunifa, or ++ * ldvary instruction in which another signal also wrote to the ++ * register file, and the final instruction of the thread section ++ * contained a signal which wrote to the register file, then the ++ * value of rf0 is undefined at the start of the new section ++ * ++ * Here we use the scoreboard to track if our last rf0 implicit write ++ * happens at the same time that another signal writes the register ++ * file (has_rf0_flops_conflict). We will use that information when ++ * scheduling thrsw instructions to avoid putting anything in their ++ * last delay slot which has a signal that writes to the register file. ++ */ ++ ++ /* Reset tracking if we have an explicit rf0 write or we are starting ++ * a new thread section. 
++ */ ++ if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || ++ scoreboard->tick - scoreboard->last_thrsw_tick == 3) { ++ scoreboard->last_implicit_rf0_write_tick = -10; ++ scoreboard->has_rf0_flops_conflict = false; ++ } ++ ++ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) { ++ scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ? ++ scoreboard->tick + 1 : scoreboard->tick; ++ } ++ ++ set_has_rf0_flops_conflict(scoreboard, inst, devinfo); ++} ++ + static void + update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, + const struct qinst *qinst, +@@ -1383,6 +1739,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, + if (inst->sig.ldvary) + scoreboard->last_ldvary_tick = scoreboard->tick; + ++ update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo); ++ + update_scoreboard_tmu_tracking(scoreboard, qinst); + } + +@@ -1580,7 +1938,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + if (slot > 0 && qinst->uniform != ~0) + return false; + +- if (v3d_qpu_waits_vpm(inst)) ++ if (c->devinfo->ver <= 42 && v3d_qpu_waits_vpm(inst)) + return false; + + if (inst->sig.ldvary) +@@ -1588,35 +1946,67 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + /* GFXH-1625: TMUWT not allowed in the final instruction. */ +- if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) ++ if (c->devinfo->ver <= 42 && slot == 2 && ++ inst->alu.add.op == V3D_QPU_A_TMUWT) { + return false; ++ } + +- /* No writing physical registers at the end. */ +- bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; +- bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; +- if ((!add_is_nop && !inst->alu.add.magic_write) || +- (!mul_is_nop && !inst->alu.mul.magic_write)) { +- return false; ++ if (c->devinfo->ver <= 42) { ++ /* No writing physical registers at the end. */ ++ bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; ++ bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; ++ if ((!add_is_nop && !inst->alu.add.magic_write) || ++ (!mul_is_nop && !inst->alu.mul.magic_write)) { ++ return false; ++ } ++ ++ if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && ++ !inst->sig_magic) { ++ return false; ++ } + } + +- if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && +- !inst->sig_magic) { +- return false; ++ if (c->devinfo->ver >= 71) { ++ /* The thread end instruction must not write to the ++ * register file via the add/mul ALUs. ++ */ ++ if (slot == 0 && ++ (!inst->alu.add.magic_write || ++ !inst->alu.mul.magic_write)) { ++ return false; ++ } + } + + if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) + return false; + +- /* RF0-2 might be overwritten during the delay slots by +- * fragment shader setup. +- */ +- if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) +- return false; ++ if (c->devinfo->ver <= 42) { ++ /* RF0-2 might be overwritten during the delay slots by ++ * fragment shader setup. ++ */ ++ if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) ++ return false; + +- if (inst->raddr_b < 3 && +- !inst->sig.small_imm && +- v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { +- return false; ++ if (inst->raddr_b < 3 && ++ !inst->sig.small_imm_b && ++ v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { ++ return false; ++ } ++ } ++ ++ if (c->devinfo->ver >= 71) { ++ /* RF2-3 might be overwritten during the delay slots by ++ * fragment shader setup. 
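/*
 * Editorial aside, not part of the patch: the version split that
 * qpu_inst_valid_in_thrend_slot() applies above is easier to see in one
 * place. Up to V3D 4.2 the thread-end delay slots may not write the
 * register file at all and may not read rf0-rf2 (which carry W,
 * centroid W and Z); on V3D 7.1 the payload moved, so it is rf2-rf3
 * that must be left alone. Everything named toy_* below is an
 * illustrative sketch, not Mesa code.
 */
#include <stdbool.h>
#include <stdint.h>

/* Register-file addresses that fragment shader setup may clobber while
 * the thread-end delay slots execute, per hardware generation. */
static bool
toy_rf_clobbered_in_thrend_slots(int ver, uint8_t rf)
{
        if (ver <= 42)
                return rf <= 2;         /* rf0-rf2: W, centroid W, Z */
        return rf == 2 || rf == 3;      /* rf2-rf3 on V3D 7.1 */
}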
++ */ ++ if (v3d71_qpu_reads_raddr(inst, 2) || ++ v3d71_qpu_reads_raddr(inst, 3)) { ++ return false; ++ } ++ ++ if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) || ++ v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) { ++ return false; ++ } + } + } + +@@ -1632,6 +2022,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + */ + static bool + qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, ++ struct choose_scoreboard *scoreboard, + const struct qinst *qinst, + uint32_t slot) + { +@@ -1642,8 +2033,12 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, + if (slot > 0 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu)) + return false; + +- if (slot > 0 && qinst->qpu.sig.ldvary) +- return false; ++ if (qinst->qpu.sig.ldvary) { ++ if (c->devinfo->ver <= 42 && slot > 0) ++ return false; ++ if (c->devinfo->ver >= 71 && slot == 2) ++ return false; ++ } + + /* unifa and the following 3 instructions can't overlap a + * thread switch/end. The docs further clarify that this means +@@ -1662,6 +2057,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, + if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu)) + return false; + ++ /* See comment when we set has_rf0_flops_conflict for details */ ++ if (c->devinfo->ver >= 71 && ++ slot == 2 && ++ v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) && ++ !qinst->qpu.sig_magic) { ++ if (scoreboard->has_rf0_flops_conflict) ++ return false; ++ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick) ++ return false; ++ } ++ + return true; + } + +@@ -1694,7 +2100,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, + * also apply to instructions scheduled after the thrsw that we want + * to place in its delay slots. + */ +- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) ++ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot)) + return false; + + /* TLB access is disallowed until scoreboard wait is executed, which +@@ -1767,8 +2173,10 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard + bool is_thrend) + { + for (int slot = 0; slot < instructions_in_sequence; slot++) { +- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) ++ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, ++ qinst, slot)) { + return false; ++ } + + if (is_thrend && + !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) { +@@ -1969,10 +2377,11 @@ emit_branch(struct v3d_compile *c, + assert(scoreboard->last_branch_tick + 3 < branch_tick); + assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); + +- /* Can't place a branch with msfign != 0 and cond != 0,2,3 after ++ /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after + * setmsf. 
+ */ + bool is_safe_msf_branch = ++ c->devinfo->ver >= 71 || + inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || + inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || + inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || +@@ -2056,46 +2465,72 @@ emit_branch(struct v3d_compile *c, + } + + static bool +-alu_reads_register(struct v3d_qpu_instr *inst, ++alu_reads_register(const struct v3d_device_info *devinfo, ++ struct v3d_qpu_instr *inst, + bool add, bool magic, uint32_t index) + { + uint32_t num_src; +- enum v3d_qpu_mux mux_a, mux_b; +- +- if (add) { ++ if (add) + num_src = v3d_qpu_add_op_num_src(inst->alu.add.op); +- mux_a = inst->alu.add.a; +- mux_b = inst->alu.add.b; +- } else { ++ else + num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op); +- mux_a = inst->alu.mul.a; +- mux_b = inst->alu.mul.b; +- } + +- for (int i = 0; i < num_src; i++) { +- if (magic) { +- if (i == 0 && mux_a == index) +- return true; +- if (i == 1 && mux_b == index) +- return true; ++ if (devinfo->ver <= 42) { ++ enum v3d_qpu_mux mux_a, mux_b; ++ if (add) { ++ mux_a = inst->alu.add.a.mux; ++ mux_b = inst->alu.add.b.mux; + } else { +- if (i == 0 && mux_a == V3D_QPU_MUX_A && +- inst->raddr_a == index) { +- return true; +- } +- if (i == 0 && mux_a == V3D_QPU_MUX_B && +- inst->raddr_b == index) { +- return true; +- } +- if (i == 1 && mux_b == V3D_QPU_MUX_A && +- inst->raddr_a == index) { +- return true; +- } +- if (i == 1 && mux_b == V3D_QPU_MUX_B && +- inst->raddr_b == index) { +- return true; ++ mux_a = inst->alu.mul.a.mux; ++ mux_b = inst->alu.mul.b.mux; ++ } ++ ++ for (int i = 0; i < num_src; i++) { ++ if (magic) { ++ if (i == 0 && mux_a == index) ++ return true; ++ if (i == 1 && mux_b == index) ++ return true; ++ } else { ++ if (i == 0 && mux_a == V3D_QPU_MUX_A && ++ inst->raddr_a == index) { ++ return true; ++ } ++ if (i == 0 && mux_a == V3D_QPU_MUX_B && ++ inst->raddr_b == index) { ++ return true; ++ } ++ if (i == 1 && mux_b == V3D_QPU_MUX_A && ++ inst->raddr_a == index) { ++ return true; ++ } ++ if (i == 1 && mux_b == V3D_QPU_MUX_B && ++ inst->raddr_b == index) { ++ return true; ++ } + } + } ++ ++ return false; ++ } ++ ++ assert(devinfo->ver >= 71); ++ assert(!magic); ++ ++ uint32_t raddr_a, raddr_b; ++ if (add) { ++ raddr_a = inst->alu.add.a.raddr; ++ raddr_b = inst->alu.add.b.raddr; ++ } else { ++ raddr_a = inst->alu.mul.a.raddr; ++ raddr_b = inst->alu.mul.b.raddr; ++ } ++ ++ for (int i = 0; i < num_src; i++) { ++ if (i == 0 && raddr_a == index) ++ return true; ++ if (i == 1 && raddr_b == index) ++ return true; + } + + return false; +@@ -2130,6 +2565,8 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + struct qblock *block, + struct v3d_qpu_instr *inst) + { ++ const struct v3d_device_info *devinfo = c->devinfo; ++ + /* We only call this if we have successfully merged an ldvary into a + * previous instruction. + */ +@@ -2142,9 +2579,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + * the ldvary destination, if it does, then moving the ldvary before + * it would overwrite it. 
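/*
 * Editorial aside, not part of the patch: the two paths in the
 * alu_reads_register() rewrite above reflect a change in operand
 * encoding. Up to V3D 4.2 each ALU operand is a mux that can point at
 * one of two shared read addresses (raddr_a/raddr_b); on V3D 7.1 every
 * operand carries its own register-file address. The toy_* types and
 * values below are a simplified illustration, not the real encoding.
 */
#include <stdbool.h>
#include <stdint.h>

#define TOY_MUX_A 6   /* toy mux value meaning "read raddr_a" */
#define TOY_MUX_B 7   /* toy mux value meaning "read raddr_b" */

struct toy_operand {
        uint8_t mux;    /* used on <= 4.2 */
        uint8_t raddr;  /* used on >= 7.1 */
};

struct toy_alu {
        struct toy_operand a, b;
        uint8_t raddr_a, raddr_b;   /* shared read addresses, <= 4.2 only */
};

/* Does either operand of this (non-magic) ALU op read register file
 * address 'index'? */
static bool
toy_alu_reads_rf(int ver, const struct toy_alu *alu, uint8_t index)
{
        if (ver <= 42) {
                return (alu->a.mux == TOY_MUX_A && alu->raddr_a == index) ||
                       (alu->a.mux == TOY_MUX_B && alu->raddr_b == index) ||
                       (alu->b.mux == TOY_MUX_A && alu->raddr_a == index) ||
                       (alu->b.mux == TOY_MUX_B && alu->raddr_b == index);
        }
        /* 7.1: no muxes, each operand names its own register. */
        return alu->a.raddr == index || alu->b.raddr == index;
}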
+ */ +- if (alu_reads_register(inst, true, ldvary_magic, ldvary_index)) ++ if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index)) + return false; +- if (alu_reads_register(inst, false, ldvary_magic, ldvary_index)) ++ if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index)) + return false; + + /* The implicit ldvary destination may not be written to by a signal +@@ -2180,13 +2617,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + } + + /* The previous instruction cannot have a conflicting signal */ +- if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig)) ++ if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig)) + return false; + + uint32_t sig; + struct v3d_qpu_sig new_sig = prev->qpu.sig; + new_sig.ldvary = true; +- if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig)) ++ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) + return false; + + /* The previous instruction cannot use flags since ldvary uses the +@@ -2199,9 +2636,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + + /* We can't put an ldvary in the delay slots of a thrsw. We should've + * prevented this when pairing up the ldvary with another instruction +- * and flagging it for a fixup. ++ * and flagging it for a fixup. In V3D 7.x this is limited only to the ++ * second delay slot. + */ +- assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1); ++ assert((devinfo->ver <= 42 && ++ scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) || ++ (devinfo->ver >= 71 && ++ scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1)); + + /* Move the ldvary to the previous instruction and remove it from the + * current one. +@@ -2215,14 +2656,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + inst->sig_magic = false; + inst->sig_addr = 0; + +- /* By moving ldvary to the previous instruction we make it update +- * r5 in the current one, so nothing else in it should write r5. +- * This should've been prevented by our dependency tracking, which ++ /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */ ++ if (devinfo->ver >= 71) { ++ scoreboard->last_implicit_rf0_write_tick = scoreboard->tick; ++ set_has_rf0_flops_conflict(scoreboard, inst, devinfo); ++ } ++ ++ /* By moving ldvary to the previous instruction we make it update r5 ++ * (rf0 for ver >= 71) in the current one, so nothing else in it ++ * should write this register. ++ * ++ * This should've been prevented by our depedency tracking, which + * would not allow ldvary to be paired up with an instruction that +- * writes r5 (since our dependency tracking doesn't know that the +- * ldvary write r5 happens in the next instruction). ++ * writes r5/rf0 (since our dependency tracking doesn't know that the ++ * ldvary write to r5/rf0 happens in the next instruction). 
+ */ +- assert(!v3d_qpu_writes_r5(c->devinfo, inst)); ++ assert(!v3d_qpu_writes_r5(devinfo, inst)); ++ assert(devinfo->ver <= 42 || ++ (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) && ++ !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0))); + + return true; + } +@@ -2313,7 +2765,7 @@ schedule_instructions(struct v3d_compile *c, + } + } + } +- if (mux_read_stalls(scoreboard, inst)) ++ if (read_stalls(c->devinfo, scoreboard, inst)) + c->qpu_inst_stalled_count++; + } + +@@ -2538,6 +2990,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) + scoreboard.last_setmsf_tick = -10; + scoreboard.last_stallable_sfu_tick = -10; + scoreboard.first_ldtmu_after_thrsw = true; ++ scoreboard.last_implicit_rf0_write_tick = - 10; + + if (debug) { + fprintf(stderr, "Pre-schedule instructions\n"); +diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c +index 2cc7a0eb0ae6..0466ee5d0b69 100644 +--- a/src/broadcom/compiler/qpu_validate.c ++++ b/src/broadcom/compiler/qpu_validate.c +@@ -41,6 +41,7 @@ struct v3d_qpu_validate_state { + int last_sfu_write; + int last_branch_ip; + int last_thrsw_ip; ++ int first_tlb_z_write; + + /* Set when we've found the last-THRSW signal, or if we were started + * in single-segment mode. +@@ -110,11 +111,58 @@ static void + qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + { + const struct v3d_device_info *devinfo = state->c->devinfo; ++ ++ if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write) ++ state->first_tlb_z_write = state->ip; ++ + const struct v3d_qpu_instr *inst = &qinst->qpu; + ++ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && ++ state->first_tlb_z_write >= 0 && ++ state->ip > state->first_tlb_z_write && ++ inst->branch.msfign != V3D_QPU_MSFIGN_NONE && ++ inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && ++ inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && ++ inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { ++ fail_instr(state, "Implicit branch MSF read after TLB Z write"); ++ } ++ + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return; + ++ if (inst->alu.add.op == V3D_QPU_A_SETMSF && ++ state->first_tlb_z_write >= 0 && ++ state->ip > state->first_tlb_z_write) { ++ fail_instr(state, "SETMSF after TLB Z write"); ++ } ++ ++ if (state->first_tlb_z_write >= 0 && ++ state->ip > state->first_tlb_z_write && ++ inst->alu.add.op == V3D_QPU_A_MSF) { ++ fail_instr(state, "MSF read after TLB Z write"); ++ } ++ ++ if (devinfo->ver < 71) { ++ if (inst->sig.small_imm_a || inst->sig.small_imm_c || ++ inst->sig.small_imm_d) { ++ fail_instr(state, "small imm a/c/d added after V3D 7.1"); ++ } ++ } else { ++ if ((inst->sig.small_imm_a || inst->sig.small_imm_b) && ++ !vir_is_add(qinst)) { ++ fail_instr(state, "small imm a/b used but no ADD inst"); ++ } ++ if ((inst->sig.small_imm_c || inst->sig.small_imm_d) && ++ !vir_is_mul(qinst)) { ++ fail_instr(state, "small imm c/d used but no MUL inst"); ++ } ++ if (inst->sig.small_imm_a + inst->sig.small_imm_b + ++ inst->sig.small_imm_c + inst->sig.small_imm_d > 1) { ++ fail_instr(state, "only one small immediate can be " ++ "enabled per instruction"); ++ } ++ } ++ + /* LDVARY writes r5 two instructions later and LDUNIF writes + * r5 one instruction later, which is illegal to have + * together. 
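/*
 * Editorial aside, not part of the patch: a standalone restatement of
 * the small-immediate rules that the new qpu_validate_inst() checks
 * above enforce. Before V3D 7.1 only small_imm_b exists; on 7.1,
 * small_imm_a/b feed the ADD ALU and small_imm_c/d feed the MUL ALU,
 * and at most one of the four signals may be set per instruction. The
 * toy_* names are illustrative only.
 */
#include <stdbool.h>

struct toy_sig {
        bool small_imm_a, small_imm_b, small_imm_c, small_imm_d;
};

static bool
toy_small_imm_sigs_valid(int ver, struct toy_sig s, bool has_add, bool has_mul)
{
        if (ver < 71)
                return !s.small_imm_a && !s.small_imm_c && !s.small_imm_d;

        if (s.small_imm_a + s.small_imm_b + s.small_imm_c + s.small_imm_d > 1)
                return false;                   /* one immediate per instruction */
        if ((s.small_imm_a || s.small_imm_b) && !has_add)
                return false;                   /* a/b require an ADD op */
        if ((s.small_imm_c || s.small_imm_d) && !has_mul)
                return false;                   /* c/d require a MUL op */
        return true;
}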
+@@ -194,8 +242,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + "SFU write started during THRSW delay slots "); + } + +- if (inst->sig.ldvary) +- fail_instr(state, "LDVARY during THRSW delay slots"); ++ if (inst->sig.ldvary) { ++ if (devinfo->ver <= 42) ++ fail_instr(state, "LDVARY during THRSW delay slots"); ++ if (devinfo->ver >= 71 && ++ state->ip - state->last_thrsw_ip == 2) { ++ fail_instr(state, "LDVARY in 2nd THRSW delay slot"); ++ } ++ } + } + + (void)qpu_magic_waddr_matches; /* XXX */ +@@ -222,7 +276,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + vpm_writes + + tlb_writes + + tsy_writes + +- inst->sig.ldtmu + ++ (devinfo->ver <= 42 ? inst->sig.ldtmu : 0) + + inst->sig.ldtlb + + inst->sig.ldvpm + + inst->sig.ldtlbu > 1) { +@@ -262,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if ((inst->alu.add.op != V3D_QPU_A_NOP && + !inst->alu.add.magic_write)) { +- fail_instr(state, "RF write after THREND"); ++ if (devinfo->ver <= 42) { ++ fail_instr(state, "RF write after THREND"); ++ } else if (devinfo->ver >= 71) { ++ if (state->last_thrsw_ip - state->ip == 0) { ++ fail_instr(state, ++ "ADD RF write at THREND"); ++ } ++ if (inst->alu.add.waddr == 2 || ++ inst->alu.add.waddr == 3) { ++ fail_instr(state, ++ "RF2-3 write after THREND"); ++ } ++ } + } + + if ((inst->alu.mul.op != V3D_QPU_M_NOP && + !inst->alu.mul.magic_write)) { +- fail_instr(state, "RF write after THREND"); ++ if (devinfo->ver <= 42) { ++ fail_instr(state, "RF write after THREND"); ++ } else if (devinfo->ver >= 71) { ++ if (state->last_thrsw_ip - state->ip == 0) { ++ fail_instr(state, ++ "MUL RF write at THREND"); ++ } ++ ++ if (inst->alu.mul.waddr == 2 || ++ inst->alu.mul.waddr == 3) { ++ fail_instr(state, ++ "RF2-3 write after THREND"); ++ } ++ } + } + + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic) { +- fail_instr(state, "RF write after THREND"); ++ if (devinfo->ver <= 42) { ++ fail_instr(state, "RF write after THREND"); ++ } else if (devinfo->ver >= 71 && ++ (inst->sig_addr == 2 || ++ inst->sig_addr == 3)) { ++ fail_instr(state, "RF2-3 write after THREND"); ++ } + } + + /* GFXH-1625: No TMUWT in the last instruction */ +@@ -321,6 +406,7 @@ qpu_validate(struct v3d_compile *c) + .last_sfu_write = -10, + .last_thrsw_ip = -10, + .last_branch_ip = -10, ++ .first_tlb_z_write = INT_MAX, + .ip = 0, + + .last_thrsw_found = !c->last_thrsw, +diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h +index eb4e692464b2..889979cdda07 100644 +--- a/src/broadcom/compiler/v3d_compiler.h ++++ b/src/broadcom/compiler/v3d_compiler.h +@@ -613,6 +613,11 @@ struct v3d_ra_node_info { + struct { + uint32_t priority; + uint8_t class_bits; ++ bool is_program_end; ++ bool unused; ++ ++ /* V3D 7.x */ ++ bool is_ldunif_dst; + } *info; + uint32_t alloc_count; + }; +@@ -1149,8 +1154,8 @@ bool vir_is_raw_mov(struct qinst *inst); + bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst); + bool vir_is_add(struct qinst *inst); + bool vir_is_mul(struct qinst *inst); +-bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst); +-bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst); ++bool vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst); ++bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst); + struct qreg 
vir_follow_movs(struct v3d_compile *c, struct qreg reg); + uint8_t vir_channels_written(struct qinst *inst); + struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i); +diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c +index 3ef0e398228a..4cdba3748a1c 100644 +--- a/src/broadcom/compiler/v3d_nir_lower_io.c ++++ b/src/broadcom/compiler/v3d_nir_lower_io.c +@@ -600,9 +600,13 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, + * The correct fix for this as recommended by Broadcom + * is to convert to .8 fixed-point with ffloor(). + */ +- pos = nir_f2i32(b, nir_ffloor(b, pos)); +- v3d_nir_store_output(b, state->vp_vpm_offset + i, +- offset_reg, pos); ++ if (c->devinfo->ver <= 42) ++ pos = nir_f2i32(b, nir_ffloor(b, pos)); ++ else ++ pos = nir_f2i32(b, nir_fround_even(b, pos)); ++ ++ v3d_nir_store_output(b, state->vp_vpm_offset + i, ++ offset_reg, pos); + } + } + +diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c +index 660b11b05776..f6965012d93c 100644 +--- a/src/broadcom/compiler/vir.c ++++ b/src/broadcom/compiler/vir.c +@@ -113,10 +113,10 @@ vir_is_raw_mov(struct qinst *inst) + return false; + } + +- if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE || +- inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE || +- inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || +- inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) { ++ if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE || ++ inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE || ++ inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || ++ inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) { + return false; + } + +@@ -156,8 +156,12 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst) + } + + bool +-vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) ++vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, ++ struct qinst *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + for (int i = 0; i < vir_get_nsrc(inst); i++) { + switch (inst->src[i].file) { + case QFILE_VPM: +@@ -178,8 +182,12 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) + } + + bool +-vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) ++vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, ++ struct qinst *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + switch (inst->dst.file) { + case QFILE_MAGIC: + switch (inst->dst.index) { +@@ -209,15 +217,15 @@ vir_set_unpack(struct qinst *inst, int src, + + if (vir_is_add(inst)) { + if (src == 0) +- inst->qpu.alu.add.a_unpack = unpack; ++ inst->qpu.alu.add.a.unpack = unpack; + else +- inst->qpu.alu.add.b_unpack = unpack; ++ inst->qpu.alu.add.b.unpack = unpack; + } else { + assert(vir_is_mul(inst)); + if (src == 0) +- inst->qpu.alu.mul.a_unpack = unpack; ++ inst->qpu.alu.mul.a.unpack = unpack; + else +- inst->qpu.alu.mul.b_unpack = unpack; ++ inst->qpu.alu.mul.b.unpack = unpack; + } + } + +@@ -737,6 +745,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c, + + /* Set us up for shared input/output segments. This is apparently + * necessary for our VCM setup to avoid varying corruption. ++ * ++ * FIXME: initial testing on V3D 7.1 seems to work fine when using ++ * separate segments. So we could try to reevaluate in the future, if ++ * there is any advantage of using separate segments. 
+ */ + prog_data->separate_segments = false; + prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size, +diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c +index 5c47bbdc1b01..ab5d40430393 100644 +--- a/src/broadcom/compiler/vir_dump.c ++++ b/src/broadcom/compiler/vir_dump.c +@@ -270,8 +270,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) + vir_print_reg(c, inst, inst->dst); + fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack)); + +- unpack[0] = instr->alu.add.a_unpack; +- unpack[1] = instr->alu.add.b_unpack; ++ unpack[0] = instr->alu.add.a.unpack; ++ unpack[1] = instr->alu.add.b.unpack; + } else { + fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op)); + fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc)); +@@ -282,8 +282,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) + vir_print_reg(c, inst, inst->dst); + fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack)); + +- unpack[0] = instr->alu.mul.a_unpack; +- unpack[1] = instr->alu.mul.b_unpack; ++ unpack[0] = instr->alu.mul.a.unpack; ++ unpack[1] = instr->alu.mul.b.unpack; + } + + for (int i = 0; i < nsrc; i++) { +diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c +index 575b0481dc81..d1f44aa9cf76 100644 +--- a/src/broadcom/compiler/vir_live_variables.c ++++ b/src/broadcom/compiler/vir_live_variables.c +@@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c) + flags_inst = NULL; + } + +- /* Payload registers: r0/1/2 contain W, centroid W, +- * and Z at program start. Register allocation will +- * force their nodes to R0/1/2. ++ /* Payload registers: for fragment shaders, W, ++ * centroid W, and Z will be initialized in r0/1/2 ++ * until v42, or r1/r2/r3 since v71. ++ * ++ * For compute shaders, payload is in r0/r2 up to v42, ++ * r2/r3 since v71. ++ * ++ * Register allocation will force their nodes to those ++ * registers. + */ + if (inst->src[0].file == QFILE_REG) { +- switch (inst->src[0].index) { +- case 0: +- case 1: +- case 2: ++ uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0; ++ uint32_t max_payload_r = c->devinfo->ver >= 71 ? 3 : 2; ++ if (inst->src[0].index >= min_payload_r || ++ inst->src[0].index <= max_payload_r) { + c->temp_start[inst->dst.index] = 0; +- break; + } + } + +diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c +index da121c2a5bd4..1260838ca056 100644 +--- a/src/broadcom/compiler/vir_opt_copy_propagate.c ++++ b/src/broadcom/compiler/vir_opt_copy_propagate.c +@@ -35,7 +35,7 @@ + #include "v3d_compiler.h" + + static bool +-is_copy_mov(struct qinst *inst) ++is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst) + { + if (!inst) + return false; +@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst) + return false; + } + +- switch (inst->src[0].file) { +- case QFILE_MAGIC: +- /* No copy propagating from R3/R4/R5 -- the MOVs from those +- * are there to register allocate values produced into R3/4/5 +- * to other regs (though hopefully r3/4/5). +- */ +- switch (inst->src[0].index) { +- case V3D_QPU_WADDR_R3: +- case V3D_QPU_WADDR_R4: +- case V3D_QPU_WADDR_R5: +- return false; ++ if (devinfo->ver <= 42) { ++ switch (inst->src[0].file) { ++ case QFILE_MAGIC: ++ /* No copy propagating from R3/R4/R5 -- the MOVs from ++ * those are there to register allocate values produced ++ * into R3/4/5 to other regs (though hopefully r3/4/5). 
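/*
 * Editorial aside, not part of the patch: the payload comment in
 * vir_setup_def_use() above and the copy-propagation exclusions below
 * both follow from where the fragment-shader payload lives. W,
 * centroid W and Z are delivered in rf0-rf2 up to V3D 4.2 and in
 * rf1-rf3 on V3D 7.1, so MOVs whose source is one of those registers
 * only exist to give the payload a temp and must not be propagated.
 * The toy_* helper is illustrative only.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
toy_is_fs_payload_rf(int ver, uint32_t rf)
{
        uint32_t lo = ver >= 71 ? 1 : 0;
        uint32_t hi = ver >= 71 ? 3 : 2;
        return rf >= lo && rf <= hi;
}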
++ */ ++ switch (inst->src[0].index) { ++ case V3D_QPU_WADDR_R3: ++ case V3D_QPU_WADDR_R4: ++ case V3D_QPU_WADDR_R5: ++ return false; ++ default: ++ break; ++ } ++ break; ++ ++ case QFILE_REG: ++ switch (inst->src[0].index) { ++ case 0: ++ case 1: ++ case 2: ++ /* MOVs from rf0/1/2 are only to track the live ++ * intervals for W/centroid W/Z. ++ */ ++ return false; ++ } ++ break; ++ + default: + break; + } +- break; +- +- case QFILE_REG: +- switch (inst->src[0].index) { +- case 0: +- case 1: +- case 2: +- /* MOVs from rf0/1/2 are only to track the live ++ } else { ++ assert(devinfo->ver >= 71); ++ switch (inst->src[0].file) { ++ case QFILE_REG: ++ switch (inst->src[0].index) { ++ /* MOVs from rf1/2/3 are only to track the live + * intervals for W/centroid W/Z. ++ * ++ * Note: rf0 can be implicitly written by ldvary ++ * (no temp involved), so it is not an SSA value and ++ * could clash with writes to other temps that are ++ * also allocated to rf0. In theory, that would mean ++ * that we can't copy propagate from it, but we handle ++ * this at register allocation time, preventing temps ++ * from being allocated to rf0 while the rf0 value from ++ * ldvary is still live. + */ +- return false; +- } +- break; ++ case 1: ++ case 2: ++ case 3: ++ return false; ++ } ++ break; + +- default: +- break; ++ default: ++ break; ++ } + } + + return true; +@@ -104,14 +133,14 @@ vir_has_unpack(struct qinst *inst, int chan) + + if (vir_is_add(inst)) { + if (chan == 0) +- return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE; ++ return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE; + else +- return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE; ++ return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE; + } else { + if (chan == 0) +- return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE; ++ return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE; + else +- return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE; ++ return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE; + } + } + +@@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) + */ + struct qinst *mov = movs[inst->src[i].index]; + if (!mov) { +- if (!is_copy_mov(c->defs[inst->src[i].index])) ++ if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index])) + continue; + mov = c->defs[inst->src[i].index]; + +@@ -161,7 +190,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) + continue; + + /* these ops can't represent abs. 
*/ +- if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) { ++ if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_VFPACK: + case V3D_QPU_A_FROUND: +@@ -189,7 +218,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) + + inst->src[i] = mov->src[0]; + if (vir_has_unpack(mov, 0)) { +- enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack; ++ enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack; + + vir_set_unpack(inst, i, unpack); + } +@@ -245,7 +274,7 @@ vir_opt_copy_propagate(struct v3d_compile *c) + + apply_kills(c, movs, inst); + +- if (is_copy_mov(inst)) ++ if (is_copy_mov(c->devinfo, inst)) + movs[inst->dst.index] = inst; + } + } +diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c +index c7896d57f2bd..6b61ed6a39ac 100644 +--- a/src/broadcom/compiler/vir_opt_redundant_flags.c ++++ b/src/broadcom/compiler/vir_opt_redundant_flags.c +@@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b) + a->qpu.flags.mpf != b->qpu.flags.mpf || + a->qpu.alu.add.op != b->qpu.alu.add.op || + a->qpu.alu.mul.op != b->qpu.alu.mul.op || +- a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack || +- a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack || ++ a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack || ++ a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack || + a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack || +- a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack || +- a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack || ++ a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack || ++ a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack || + a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) { + return false; + } +diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c +index 47d7722968d8..ed5bc0119642 100644 +--- a/src/broadcom/compiler/vir_opt_small_immediates.c ++++ b/src/broadcom/compiler/vir_opt_small_immediates.c +@@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c) + /* The small immediate value sits in the raddr B field, so we + * can't have 2 small immediates in one instruction (unless + * they're the same value, but that should be optimized away +- * elsewhere). ++ * elsewhere). Since 7.x we can encode small immediates in ++ * any raddr field, but each instruction can still only use ++ * one. 
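/*
 * Editorial aside, not part of the patch: a sketch of the signal
 * selection the optimization below performs. Before V3D 7.1 a small
 * immediate always replaces raddr_b, so only small_imm_b exists; on
 * 7.1 the signal depends on which ALU and which operand consumes the
 * immediate. The toy_* names are illustrative only.
 */
#include <stdbool.h>

enum toy_smimm_sig { TOY_SMIMM_A, TOY_SMIMM_B, TOY_SMIMM_C, TOY_SMIMM_D };

static enum toy_smimm_sig
toy_pick_small_imm_sig(int ver, bool is_add_op, int src_index)
{
        if (ver <= 42)
                return TOY_SMIMM_B;             /* only encoding available */
        if (is_add_op)
                return src_index == 0 ? TOY_SMIMM_A : TOY_SMIMM_B;
        return src_index == 0 ? TOY_SMIMM_C : TOY_SMIMM_D;
}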
+ */ + bool uses_small_imm = false; + for (int i = 0; i < vir_get_nsrc(inst); i++) { +@@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c) + */ + struct v3d_qpu_sig new_sig = inst->qpu.sig; + uint32_t sig_packed; +- new_sig.small_imm = true; ++ if (c->devinfo->ver <= 42) { ++ new_sig.small_imm_b = true; ++ } else { ++ if (vir_is_add(inst)) { ++ if (i == 0) ++ new_sig.small_imm_a = true; ++ else ++ new_sig.small_imm_b = true; ++ } else { ++ if (i == 0) ++ new_sig.small_imm_c = true; ++ else ++ new_sig.small_imm_d = true; ++ } ++ } ++ + if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed)) + continue; + +@@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c) + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } +- inst->qpu.sig.small_imm = true; ++ inst->qpu.sig.small_imm_a = new_sig.small_imm_a; ++ inst->qpu.sig.small_imm_b = new_sig.small_imm_b; ++ inst->qpu.sig.small_imm_c = new_sig.small_imm_c; ++ inst->qpu.sig.small_imm_d = new_sig.small_imm_d; + inst->qpu.raddr_b = packed; + + inst->src[i].file = QFILE_SMALL_IMM; +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index b22f915d1dfc..8eac2b75bd79 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -28,41 +28,73 @@ + + #define ACC_INDEX 0 + #define ACC_COUNT 6 +-#define PHYS_INDEX (ACC_INDEX + ACC_COUNT) +-#define PHYS_COUNT 64 + ++/* RA nodes used to track RF registers with implicit writes */ ++#define IMPLICIT_RF_COUNT 1 ++ ++#define PHYS_COUNT 64 ++ ++static uint8_t ++get_phys_index(const struct v3d_device_info *devinfo) ++{ ++ if (devinfo->has_accumulators) ++ return ACC_INDEX + ACC_COUNT; ++ else ++ return 0; ++} ++ ++/* ACC as accumulator */ + #define CLASS_BITS_PHYS (1 << 0) + #define CLASS_BITS_ACC (1 << 1) + #define CLASS_BITS_R5 (1 << 4) +-#define CLASS_BITS_ANY (CLASS_BITS_PHYS | \ +- CLASS_BITS_ACC | \ +- CLASS_BITS_R5) ++ ++static uint8_t ++get_class_bit_any(const struct v3d_device_info *devinfo) ++{ ++ if (devinfo->has_accumulators) ++ return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5); ++ else ++ return CLASS_BITS_PHYS; ++} ++ ++static uint8_t ++filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits) ++{ ++ if (!devinfo->has_accumulators) { ++ assert(class_bits & CLASS_BITS_PHYS); ++ class_bits = CLASS_BITS_PHYS; ++ } ++ return class_bits; ++} + + static inline uint32_t +-temp_to_node(uint32_t temp) ++temp_to_node(struct v3d_compile *c, uint32_t temp) + { +- return temp + ACC_COUNT; ++ return temp + (c->devinfo->has_accumulators ? ACC_COUNT : ++ IMPLICIT_RF_COUNT); + } + + static inline uint32_t +-node_to_temp(uint32_t node) ++node_to_temp(struct v3d_compile *c, uint32_t node) + { +- assert(node >= ACC_COUNT); +- return node - ACC_COUNT; ++ assert((c->devinfo->has_accumulators && node >= ACC_COUNT) || ++ (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT)); ++ return node - (c->devinfo->has_accumulators ? 
ACC_COUNT : ++ IMPLICIT_RF_COUNT); + } + + static inline uint8_t +-get_temp_class_bits(struct v3d_ra_node_info *nodes, ++get_temp_class_bits(struct v3d_compile *c, + uint32_t temp) + { +- return nodes->info[temp_to_node(temp)].class_bits; ++ return c->nodes.info[temp_to_node(c, temp)].class_bits; + } + + static inline void +-set_temp_class_bits(struct v3d_ra_node_info *nodes, ++set_temp_class_bits(struct v3d_compile *c, + uint32_t temp, uint8_t class_bits) + { +- nodes->info[temp_to_node(temp)].class_bits = class_bits; ++ c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits; + } + + static struct ra_class * +@@ -71,11 +103,13 @@ choose_reg_class(struct v3d_compile *c, uint8_t class_bits) + if (class_bits == CLASS_BITS_PHYS) { + return c->compiler->reg_class_phys[c->thread_index]; + } else if (class_bits == (CLASS_BITS_R5)) { ++ assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_r5[c->thread_index]; + } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) { ++ assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_phys_or_acc[c->thread_index]; + } else { +- assert(class_bits == CLASS_BITS_ANY); ++ assert(class_bits == get_class_bit_any(c->devinfo)); + return c->compiler->reg_class_any[c->thread_index]; + } + } +@@ -84,7 +118,7 @@ static inline struct ra_class * + choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp) + { + assert(temp < c->num_temps && temp < c->nodes.alloc_count); +- return choose_reg_class(c, get_temp_class_bits(&c->nodes, temp)); ++ return choose_reg_class(c, get_temp_class_bits(c, temp)); + } + + static inline bool +@@ -313,7 +347,7 @@ v3d_choose_spill_node(struct v3d_compile *c) + + for (unsigned i = 0; i < c->num_temps; i++) { + if (BITSET_TEST(c->spillable, i)) { +- ra_set_node_spill_cost(c->g, temp_to_node(i), ++ ra_set_node_spill_cost(c->g, temp_to_node(c, i), + spill_costs[i]); + } + } +@@ -331,7 +365,8 @@ ensure_nodes(struct v3d_compile *c) + c->nodes.info = reralloc_array_size(c, + c->nodes.info, + sizeof(c->nodes.info[0]), +- c->nodes.alloc_count + ACC_COUNT); ++ c->nodes.alloc_count + ++ MAX2(ACC_COUNT, IMPLICIT_RF_COUNT)); + } + + /* Creates the interference node for a new temp. We use this to keep the node +@@ -343,11 +378,15 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits) + ensure_nodes(c); + + int node = ra_add_node(c->g, choose_reg_class(c, class_bits)); +- assert(node == temp + ACC_COUNT); ++ assert(c->devinfo->has_accumulators ? 
node == temp + ACC_COUNT : ++ node == temp + IMPLICIT_RF_COUNT); + + /* We fill the node priority after we are done inserting spills */ + c->nodes.info[node].class_bits = class_bits; + c->nodes.info[node].priority = 0; ++ c->nodes.info[node].is_ldunif_dst = false; ++ c->nodes.info[node].is_program_end = false; ++ c->nodes.info[node].unused = false; + } + + /* The spill offset for this thread takes a bit of setup, so do it once at +@@ -395,8 +434,10 @@ v3d_setup_spill_base(struct v3d_compile *c) + */ + if (c->spilling) { + int temp_class = CLASS_BITS_PHYS; +- if (i != c->spill_base.index) ++ if (c->devinfo->has_accumulators && ++ i != c->spill_base.index) { + temp_class |= CLASS_BITS_ACC; ++ } + add_node(c, i, temp_class); + } + } +@@ -436,7 +477,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c, + */ + assert(c->disable_ldunif_opt); + struct qreg offset = vir_uniform_ui(c, spill_offset); +- add_node(c, offset.index, CLASS_BITS_ANY); ++ add_node(c, offset.index, get_class_bit_any(c->devinfo)); + + /* We always enable per-quad on spills/fills to ensure we spill + * any channels involved with helper invocations. +@@ -455,14 +496,16 @@ v3d_emit_spill_tmua(struct v3d_compile *c, + * temp will be used immediately so just like the uniform above we + * can allow accumulators. + */ ++ int temp_class = ++ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC); + if (!fill_dst) { + struct qreg dst = vir_TMUWT(c); + assert(dst.file == QFILE_TEMP); +- add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC); ++ add_node(c, dst.index, temp_class); + } else { + *fill_dst = vir_LDTMU(c); + assert(fill_dst->file == QFILE_TEMP); +- add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC); ++ add_node(c, fill_dst->index, temp_class); + } + + /* Temps across the thread switch we injected can't be assigned to +@@ -482,7 +525,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c, + c->temp_start[i] < ip && c->temp_end[i] >= ip : + c->temp_start[i] <= ip && c->temp_end[i] > ip; + if (thrsw_cross) { +- ra_set_node_class(c->g, temp_to_node(i), ++ ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class(c, CLASS_BITS_PHYS)); + } + } +@@ -509,8 +552,7 @@ v3d_emit_tmu_spill(struct v3d_compile *c, + * same register class bits as the original. + */ + if (inst == position) { +- uint8_t class_bits = get_temp_class_bits(&c->nodes, +- inst->dst.index); ++ uint8_t class_bits = get_temp_class_bits(c, inst->dst.index); + inst->dst = vir_get_temp(c); + add_node(c, inst->dst.index, class_bits); + } else { +@@ -542,7 +584,8 @@ interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end) + } + + static void +-v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) ++v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes, ++ int spill_temp) + { + c->spill_start_num_temps = c->num_temps; + c->spilling = true; +@@ -554,8 +597,20 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + spill_offset = c->spill_size; + c->spill_size += V3D_CHANNELS * sizeof(uint32_t); + +- if (spill_offset == 0) ++ if (spill_offset == 0) { + v3d_setup_spill_base(c); ++ ++ /* Don't allocate our spill base to rf0 to avoid ++ * conflicts with instructions doing implicit writes ++ * to that register. 
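/*
 * Editorial aside, not part of the patch: a recap of the register
 * allocator node layout behind temp_to_node()/node_to_temp() above. A
 * few fixed nodes come first (six accumulator nodes on hardware with
 * accumulators, a single implicit-rf0 node on V3D 7.1), followed by one
 * node per temp, so the conversion is a constant offset. The toy_*
 * names are illustrative only.
 */
#include <stdbool.h>
#include <stdint.h>

#define TOY_ACC_COUNT          6   /* fixed nodes for r0-r5, V3D <= 4.2 */
#define TOY_IMPLICIT_RF_COUNT  1   /* fixed node for rf0, V3D 7.1 */

static uint32_t
toy_first_temp_node(bool has_accumulators)
{
        return has_accumulators ? TOY_ACC_COUNT : TOY_IMPLICIT_RF_COUNT;
}

static uint32_t
toy_temp_to_node(bool has_accumulators, uint32_t temp)
{
        return temp + toy_first_temp_node(has_accumulators);
}

static uint32_t
toy_node_to_temp(bool has_accumulators, uint32_t node)
{
        return node - toy_first_temp_node(has_accumulators);
}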
++ */ ++ if (!c->devinfo->has_accumulators) { ++ ra_add_node_interference( ++ c->g, ++ temp_to_node(c, c->spill_base.index), ++ implicit_rf_nodes[0]); ++ } ++ } + } + + struct qinst *last_thrsw = c->last_thrsw; +@@ -574,7 +629,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + reconstruct_op = orig_def->qpu.alu.add.op; + } + +- uint32_t spill_node = temp_to_node(spill_temp); ++ uint32_t spill_node = temp_to_node(c, spill_temp); + + /* We must disable the ldunif optimization if we are spilling uniforms */ + bool had_disable_ldunif_opt = c->disable_ldunif_opt; +@@ -635,7 +690,8 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + * instruction immediately after, so + * we can use any register class for it. + */ +- add_node(c, unif.index, CLASS_BITS_ANY); ++ add_node(c, unif.index, ++ get_class_bit_any(c->devinfo)); + } else if (spill_type == SPILL_TYPE_RECONSTRUCT) { + struct qreg temp = + reconstruct_temp(c, reconstruct_op); +@@ -644,8 +700,10 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + * instruction immediately after so we + * can use ACC. + */ +- add_node(c, temp.index, CLASS_BITS_PHYS | +- CLASS_BITS_ACC); ++ int temp_class = ++ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | ++ CLASS_BITS_ACC); ++ add_node(c, temp.index, temp_class); + } else { + /* If we have a postponed spill, we + * don't need a fill as the temp would +@@ -739,12 +797,12 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + * update node priorities based one new liveness data. + */ + uint32_t sb_temp =c->spill_base.index; +- uint32_t sb_node = temp_to_node(sb_temp); ++ uint32_t sb_node = temp_to_node(c, sb_temp); + for (uint32_t i = 0; i < c->num_temps; i++) { + if (c->temp_end[i] == -1) + continue; + +- uint32_t node_i = temp_to_node(i); ++ uint32_t node_i = temp_to_node(c, i); + c->nodes.info[node_i].priority = + c->temp_end[i] - c->temp_start[i]; + +@@ -752,7 +810,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + j < c->num_temps; j++) { + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { +- uint32_t node_j = temp_to_node(j); ++ uint32_t node_j = temp_to_node(c, j); + ra_add_node_interference(c->g, node_i, node_j); + } + } +@@ -771,9 +829,11 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + } + + struct v3d_ra_select_callback_data { ++ uint32_t phys_index; + uint32_t next_acc; + uint32_t next_phys; + struct v3d_ra_node_info *nodes; ++ const struct v3d_device_info *devinfo; + }; + + /* Choosing accumulators improves chances of merging QPU instructions +@@ -785,6 +845,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, + BITSET_WORD *regs, + int priority) + { ++ if (!v3d_ra->devinfo->has_accumulators) ++ return false; ++ + /* Favor accumulators if we have less that this number of physical + * registers. 
Accumulators have more restrictions (like being + * invalidated through thrsw), so running out of physical registers +@@ -794,7 +857,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, + static const int available_rf_threshold = 5; + int available_rf = 0 ; + for (int i = 0; i < PHYS_COUNT; i++) { +- if (BITSET_TEST(regs, PHYS_INDEX + i)) ++ if (BITSET_TEST(regs, v3d_ra->phys_index + i)) + available_rf++; + if (available_rf >= available_rf_threshold) + break; +@@ -820,6 +883,9 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, + BITSET_WORD *regs, + unsigned int *out) + { ++ if (!v3d_ra->devinfo->has_accumulators) ++ return false; ++ + /* Choose r5 for our ldunifs if possible (nobody else can load to that + * reg, and it keeps the QPU cond field free from being occupied by + * ldunifrf). +@@ -849,12 +915,47 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, + + static bool + v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, ++ unsigned int node, + BITSET_WORD *regs, + unsigned int *out) + { ++ /* If this node is for an unused temp, ignore. */ ++ if (v3d_ra->nodes->info[node].unused) { ++ *out = 0; ++ return true; ++ } ++ ++ /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst ++ * so we can avoid turning them into ldunifrf (which uses the ++ * cond field to encode the dst and would prevent merge with ++ * instructions that use cond flags). ++ */ ++ if (v3d_ra->nodes->info[node].is_ldunif_dst && ++ BITSET_TEST(regs, v3d_ra->phys_index)) { ++ assert(v3d_ra->devinfo->ver >= 71); ++ *out = v3d_ra->phys_index; ++ return true; ++ } ++ ++ /* The last 3 instructions in a shader can't use some specific registers ++ * (usually early rf registers, depends on v3d version) so try to ++ * avoid allocating these to registers used by the last instructions ++ * in the shader. ++ */ ++ const uint32_t safe_rf_start = v3d_ra->devinfo->ver <= 42 ? 3 : 4; ++ if (v3d_ra->nodes->info[node].is_program_end && ++ v3d_ra->next_phys < safe_rf_start) { ++ v3d_ra->next_phys = safe_rf_start; ++ } ++ + for (int i = 0; i < PHYS_COUNT; i++) { + int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; +- int phys = PHYS_INDEX + phys_off; ++ ++ /* Try to keep rf0 available for ldunif in 7.x (see above). */ ++ if (v3d_ra->devinfo->ver >= 71 && phys_off == 0) ++ continue; ++ ++ int phys = v3d_ra->phys_index + phys_off; + + if (BITSET_TEST(regs, phys)) { + v3d_ra->next_phys = phys_off + 1; +@@ -863,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, + } + } + ++ /* If we couldn't allocate, do try to assign rf0 if it is available. */ ++ if (v3d_ra->devinfo->ver >= 71 && ++ BITSET_TEST(regs, v3d_ra->phys_index)) { ++ v3d_ra->next_phys = 1; ++ *out = v3d_ra->phys_index; ++ return true; ++ } ++ + return false; + } + +@@ -877,7 +986,7 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) + return reg; + } + +- if (v3d_ra_select_rf(v3d_ra, regs, ®)) ++ if (v3d_ra_select_rf(v3d_ra, n, regs, ®)) + return reg; + + /* If we ran out of physical registers try to assign an accumulator +@@ -896,8 +1005,9 @@ vir_init_reg_sets(struct v3d_compiler *compiler) + * register file can be divided up for fragment shader threading. + */ + int max_thread_index = (compiler->devinfo->ver >= 40 ? 
2 : 3); ++ uint8_t phys_index = get_phys_index(compiler->devinfo); + +- compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, ++ compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT, + false); + if (!compiler->regs) + return false; +@@ -905,31 +1015,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler) + for (int threads = 0; threads < max_thread_index; threads++) { + compiler->reg_class_any[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); +- compiler->reg_class_r5[threads] = +- ra_alloc_contig_reg_class(compiler->regs, 1); +- compiler->reg_class_phys_or_acc[threads] = +- ra_alloc_contig_reg_class(compiler->regs, 1); ++ if (compiler->devinfo->has_accumulators) { ++ compiler->reg_class_r5[threads] = ++ ra_alloc_contig_reg_class(compiler->regs, 1); ++ compiler->reg_class_phys_or_acc[threads] = ++ ra_alloc_contig_reg_class(compiler->regs, 1); ++ } + compiler->reg_class_phys[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + +- for (int i = PHYS_INDEX; +- i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) { +- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); ++ /* Init physical regs */ ++ for (int i = phys_index; ++ i < phys_index + (PHYS_COUNT >> threads); i++) { ++ if (compiler->devinfo->has_accumulators) ++ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->reg_class_phys[threads], i); + ra_class_add_reg(compiler->reg_class_any[threads], i); + } + +- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { +- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); +- ra_class_add_reg(compiler->reg_class_any[threads], i); ++ /* Init accumulator regs */ ++ if (compiler->devinfo->has_accumulators) { ++ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { ++ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); ++ ra_class_add_reg(compiler->reg_class_any[threads], i); ++ } ++ /* r5 can only store a single 32-bit value, so not much can ++ * use it. ++ */ ++ ra_class_add_reg(compiler->reg_class_r5[threads], ++ ACC_INDEX + 5); ++ ra_class_add_reg(compiler->reg_class_any[threads], ++ ACC_INDEX + 5); + } +- /* r5 can only store a single 32-bit value, so not much can +- * use it. +- */ +- ra_class_add_reg(compiler->reg_class_r5[threads], +- ACC_INDEX + 5); +- ra_class_add_reg(compiler->reg_class_any[threads], +- ACC_INDEX + 5); + } + + ra_set_finalize(compiler->regs, NULL); +@@ -944,7 +1061,10 @@ tmu_spilling_allowed(struct v3d_compile *c) + } + + static void +-update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, ++update_graph_and_reg_classes_for_inst(struct v3d_compile *c, ++ int *acc_nodes, ++ int *implicit_rf_nodes, ++ int last_ldvary_ip, + struct qinst *inst) + { + int32_t ip = inst->ip; +@@ -954,26 +1074,39 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * result to a temp), nothing else can be stored in r3/r4 across + * it. 
+ */ +- if (vir_writes_r3(c->devinfo, inst)) { ++ if (vir_writes_r3_implicitly(c->devinfo, inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, +- temp_to_node(i), ++ temp_to_node(c, i), + acc_nodes[3]); + } + } + } + +- if (vir_writes_r4(c->devinfo, inst)) { ++ if (vir_writes_r4_implicitly(c->devinfo, inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, +- temp_to_node(i), ++ temp_to_node(c, i), + acc_nodes[4]); + } + } + } + ++ /* If any instruction writes to a physical register implicitly ++ * nothing else can write the same register across it. ++ */ ++ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) { ++ for (int i = 0; i < c->num_temps; i++) { ++ if (c->temp_start[i] < ip && c->temp_end[i] > ip) { ++ ra_add_node_interference(c->g, ++ temp_to_node(c, i), ++ implicit_rf_nodes[0]); ++ } ++ } ++ } ++ + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_LDVPMV_IN: +@@ -987,7 +1120,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * decides whether the LDVPM is in or out) + */ + assert(inst->dst.file == QFILE_TEMP); +- set_temp_class_bits(&c->nodes, inst->dst.index, ++ set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } +@@ -1002,7 +1135,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * phys regfile. + */ + assert(inst->dst.file == QFILE_TEMP); +- set_temp_class_bits(&c->nodes, inst->dst.index, ++ set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } +@@ -1015,6 +1148,11 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + if (inst->src[0].file == QFILE_REG) { + switch (inst->src[0].index) { + case 0: ++ /* V3D 7.x doesn't use rf0 for thread payload */ ++ if (c->devinfo->ver >= 71) ++ break; ++ else ++ FALLTHROUGH; + case 1: + case 2: + case 3: { +@@ -1024,14 +1162,34 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + */ + assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); + assert(inst->dst.file == QFILE_TEMP); +- uint32_t node = temp_to_node(inst->dst.index); ++ uint32_t node = temp_to_node(c, inst->dst.index); + ra_set_node_reg(c->g, node, +- PHYS_INDEX + inst->src[0].index); ++ get_phys_index(c->devinfo) + ++ inst->src[0].index); + break; + } + } + } + ++ /* Don't allocate rf0 to temps that cross ranges where we have ++ * live implicit rf0 writes from ldvary. We can identify these ++ * by tracking the last ldvary instruction and explicit reads ++ * of rf0. ++ */ ++ if (c->devinfo->ver >= 71 && ++ ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) || ++ (vir_get_nsrc(inst) > 1 && ++ inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) { ++ for (int i = 0; i < c->num_temps; i++) { ++ if (c->temp_start[i] < ip && ++ c->temp_end[i] > last_ldvary_ip) { ++ ra_add_node_interference(c->g, ++ temp_to_node(c, i), ++ implicit_rf_nodes[0]); ++ } ++ } ++ } ++ + if (inst->dst.file == QFILE_TEMP) { + /* Only a ldunif gets to write to R5, which only has a + * single 32-bit channel of storage. +@@ -1041,36 +1199,95 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * because ldunif has usually a shorter lifespan, allowing for + * more accumulator reuse and QPU merges. 
+ */ +- if (!inst->qpu.sig.ldunif) { +- uint8_t class_bits = +- get_temp_class_bits(&c->nodes, inst->dst.index) & +- ~CLASS_BITS_R5; +- set_temp_class_bits(&c->nodes, inst->dst.index, +- class_bits); +- ++ if (c->devinfo->has_accumulators) { ++ if (!inst->qpu.sig.ldunif) { ++ uint8_t class_bits = ++ get_temp_class_bits(c, inst->dst.index) & ++ ~CLASS_BITS_R5; ++ set_temp_class_bits(c, inst->dst.index, ++ class_bits); ++ ++ } else { ++ /* Until V3D 4.x, we could only load a uniform ++ * to r5, so we'll need to spill if uniform ++ * loads interfere with each other. ++ */ ++ if (c->devinfo->ver < 40) { ++ set_temp_class_bits(c, inst->dst.index, ++ CLASS_BITS_R5); ++ } ++ } + } else { +- /* Until V3D 4.x, we could only load a uniform +- * to r5, so we'll need to spill if uniform +- * loads interfere with each other. ++ /* Make sure we don't allocate the ldvary's ++ * destination to rf0, since it would clash ++ * with its implicit write to that register. ++ */ ++ if (inst->qpu.sig.ldvary) { ++ ra_add_node_interference(c->g, ++ temp_to_node(c, inst->dst.index), ++ implicit_rf_nodes[0]); ++ } ++ /* Flag dst temps from ldunif(a) instructions ++ * so we can try to assign rf0 to them and avoid ++ * converting these to ldunif(a)rf. + */ +- if (c->devinfo->ver < 40) { +- set_temp_class_bits(&c->nodes, inst->dst.index, +- CLASS_BITS_R5); ++ if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) { ++ const uint32_t dst_n = ++ temp_to_node(c, inst->dst.index); ++ c->nodes.info[dst_n].is_ldunif_dst = true; + } + } + } + + /* All accumulators are invalidated across a thread switch. */ +- if (inst->qpu.sig.thrsw) { ++ if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { +- set_temp_class_bits(&c->nodes, i, ++ set_temp_class_bits(c, i, + CLASS_BITS_PHYS); + } + } + } + } + ++static void ++flag_program_end_nodes(struct v3d_compile *c) ++{ ++ /* Only look for registers used in this many instructions */ ++ uint32_t last_set_count = 6; ++ ++ struct qblock *last_block = vir_exit_block(c); ++ list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) { ++ if (!inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) ++ continue; ++ ++ int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op); ++ for (int i = 0; i < num_src; i++) { ++ if (inst->src[i].file == QFILE_TEMP) { ++ int node = temp_to_node(c, inst->src[i].index); ++ c->nodes.info[node].is_program_end = true; ++ } ++ } ++ ++ num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op); ++ for (int i = 0; i < num_src; i++) { ++ if (inst->src[i].file == QFILE_TEMP) { ++ int node = temp_to_node(c, inst->src[i].index); ++ c->nodes.info[node].is_program_end = true; ++ ++ } ++ } ++ ++ if (inst->dst.file == QFILE_TEMP) { ++ int node = temp_to_node(c, inst->dst.index); ++ c->nodes.info[node].is_program_end = true; ++ } ++ ++ if (--last_set_count == 0) ++ break; ++ } ++} ++ + /** + * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. 
+ * +@@ -1080,19 +1297,32 @@ struct qpu_reg * + v3d_register_allocate(struct v3d_compile *c) + { + int acc_nodes[ACC_COUNT]; ++ int implicit_rf_nodes[IMPLICIT_RF_COUNT]; ++ ++ unsigned num_ra_nodes = c->num_temps; ++ if (c->devinfo->has_accumulators) ++ num_ra_nodes += ARRAY_SIZE(acc_nodes); ++ else ++ num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes); ++ + c->nodes = (struct v3d_ra_node_info) { + .alloc_count = c->num_temps, + .info = ralloc_array_size(c, sizeof(c->nodes.info[0]), +- c->num_temps + ACC_COUNT), ++ num_ra_nodes), + }; + ++ uint32_t phys_index = get_phys_index(c->devinfo); ++ + struct v3d_ra_select_callback_data callback_data = { ++ .phys_index = phys_index, + .next_acc = 0, + /* Start at RF3, to try to keep the TLB writes from using +- * RF0-2. ++ * RF0-2. Start at RF4 in 7.x to prevent TLB writes from ++ * using RF2-3. + */ +- .next_phys = 3, ++ .next_phys = c->devinfo->ver <= 42 ? 3 : 4, + .nodes = &c->nodes, ++ .devinfo = c->devinfo, + }; + + vir_calculate_live_intervals(c); +@@ -1108,27 +1338,35 @@ v3d_register_allocate(struct v3d_compile *c) + c->thread_index--; + } + +- c->g = ra_alloc_interference_graph(c->compiler->regs, +- c->num_temps + ARRAY_SIZE(acc_nodes)); ++ c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes); + ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data); + + /* Make some fixed nodes for the accumulators, which we will need to + * interfere with when ops have implied r3/r4 writes or for the thread + * switches. We could represent these as classes for the nodes to + * live in, but the classes take up a lot of memory to set up, so we +- * don't want to make too many. ++ * don't want to make too many. We use the same mechanism on platforms ++ * without accumulators that can have implicit writes to phys regs. + */ +- for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) { +- if (i < ACC_COUNT) { ++ for (uint32_t i = 0; i < num_ra_nodes; i++) { ++ c->nodes.info[i].is_ldunif_dst = false; ++ c->nodes.info[i].is_program_end = false; ++ c->nodes.info[i].unused = false; ++ c->nodes.info[i].priority = 0; ++ c->nodes.info[i].class_bits = 0; ++ if (c->devinfo->has_accumulators && i < ACC_COUNT) { + acc_nodes[i] = i; + ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i); +- c->nodes.info[i].priority = 0; +- c->nodes.info[i].class_bits = 0; ++ } else if (!c->devinfo->has_accumulators && ++ i < ARRAY_SIZE(implicit_rf_nodes)) { ++ implicit_rf_nodes[i] = i; ++ ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i); + } else { +- uint32_t t = node_to_temp(i); ++ uint32_t t = node_to_temp(c, i); + c->nodes.info[i].priority = + c->temp_end[t] - c->temp_start[t]; +- c->nodes.info[i].class_bits = CLASS_BITS_ANY; ++ c->nodes.info[i].class_bits = ++ get_class_bit_any(c->devinfo); + } + } + +@@ -1136,25 +1374,61 @@ v3d_register_allocate(struct v3d_compile *c) + * interferences. + */ + int ip = 0; ++ int last_ldvary_ip = -1; + vir_for_each_inst_inorder(inst, c) { + inst->ip = ip++; +- update_graph_and_reg_classes_for_inst(c, acc_nodes, inst); ++ ++ /* ldunif(a) always write to a temporary, so we have ++ * liveness info available to decide if rf0 is ++ * available for them, however, ldvary is different: ++ * it always writes to rf0 directly so we don't have ++ * liveness information for its implicit rf0 write. ++ * ++ * That means the allocator may assign rf0 to a temp ++ * that is defined while an implicit rf0 write from ++ * ldvary is still live. We fix that by manually ++ * tracking rf0 live ranges from ldvary instructions. 
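/*
 * Editorial aside, not part of the patch: the rf0 tracking described in
 * the comment above, in condensed form. ldvary writes rf0 without going
 * through a temp, so there is no liveness information for that write;
 * the allocator therefore keeps any temp that is live between the last
 * ldvary and the instruction that finally reads rf0 away from rf0. The
 * toy_* helper is illustrative; the real code adds an interference edge
 * against the fixed rf0 node instead of returning a flag.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
toy_temp_conflicts_with_ldvary_rf0(int32_t temp_start, int32_t temp_end,
                                   int32_t last_ldvary_ip, int32_t rf0_read_ip)
{
        /* Overlap between the temp's live range and the implicit rf0
         * range (last_ldvary_ip, rf0_read_ip). */
        return temp_start < rf0_read_ip && temp_end > last_ldvary_ip;
}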
++ */ ++ if (inst->qpu.sig.ldvary) ++ last_ldvary_ip = ip; ++ ++ update_graph_and_reg_classes_for_inst(c, acc_nodes, ++ implicit_rf_nodes, ++ last_ldvary_ip, inst); + } + ++ /* Flag the nodes that are used in the last instructions of the program ++ * (there are some registers that cannot be used in the last 3 ++ * instructions). We only do this for fragment shaders, because the idea ++ * is that by avoiding this conflict we may be able to emit the last ++ * thread switch earlier in some cases, however, in non-fragment shaders ++ * this won't happen because the last instructions are always VPM stores ++ * with a small immediate, which conflicts with other signals, ++ * preventing us from ever moving the thrsw earlier. ++ */ ++ if (c->s->info.stage == MESA_SHADER_FRAGMENT) ++ flag_program_end_nodes(c); ++ + /* Set the register classes for all our temporaries in the graph */ + for (uint32_t i = 0; i < c->num_temps; i++) { +- ra_set_node_class(c->g, temp_to_node(i), ++ ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class_for_temp(c, i)); + } + + /* Add register interferences based on liveness data */ + for (uint32_t i = 0; i < c->num_temps; i++) { ++ /* And while we are here, let's also flag nodes for ++ * unused temps. ++ */ ++ if (c->temp_start[i] > c->temp_end[i]) ++ c->nodes.info[temp_to_node(c, i)].unused = true; ++ + for (uint32_t j = i + 1; j < c->num_temps; j++) { + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { + ra_add_node_interference(c->g, +- temp_to_node(i), +- temp_to_node(j)); ++ temp_to_node(c, i), ++ temp_to_node(c, j)); + } + } + } +@@ -1171,9 +1445,9 @@ v3d_register_allocate(struct v3d_compile *c) + if (c->spill_size < + V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { + int node = v3d_choose_spill_node(c); +- uint32_t temp = node_to_temp(node); ++ uint32_t temp = node_to_temp(c, node); + if (node != -1) { +- v3d_spill_reg(c, acc_nodes, temp); ++ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + continue; + } + } +@@ -1186,11 +1460,11 @@ v3d_register_allocate(struct v3d_compile *c) + if (node == -1) + goto spill_fail; + +- uint32_t temp = node_to_temp(node); ++ uint32_t temp = node_to_temp(c, node); + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); + if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) { +- v3d_spill_reg(c, acc_nodes, temp); ++ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + if (c->spills + c->fills > c->max_tmu_spills) + goto spill_fail; + } else { +@@ -1201,14 +1475,14 @@ v3d_register_allocate(struct v3d_compile *c) + /* Allocation was successful, build the 'temp -> reg' map */ + temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); + for (uint32_t i = 0; i < c->num_temps; i++) { +- int ra_reg = ra_get_node_reg(c->g, temp_to_node(i)); +- if (ra_reg < PHYS_INDEX) { ++ int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i)); ++ if (ra_reg < phys_index) { + temp_registers[i].magic = true; + temp_registers[i].index = (V3D_QPU_WADDR_R0 + + ra_reg - ACC_INDEX); + } else { + temp_registers[i].magic = false; +- temp_registers[i].index = ra_reg - PHYS_INDEX; ++ temp_registers[i].index = ra_reg - phys_index; + } + } + +diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c +index 45e6bfa1470c..4ed184cbbcb7 100644 +--- a/src/broadcom/compiler/vir_to_qpu.c ++++ b/src/broadcom/compiler/vir_to_qpu.c +@@ -86,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst) + return q; + } + ++static void ++v3d71_set_src(struct v3d_qpu_instr 
*instr, uint8_t *raddr, struct qpu_reg src) ++{ ++ /* If we have a small immediate move it from inst->raddr_b to the ++ * corresponding raddr. ++ */ ++ if (src.smimm) { ++ assert(instr->sig.small_imm_a || instr->sig.small_imm_b || ++ instr->sig.small_imm_c || instr->sig.small_imm_d); ++ *raddr = instr->raddr_b; ++ return; ++ } ++ ++ assert(!src.magic); ++ *raddr = src.index; ++} ++ + /** + * Allocates the src register (accumulator or register file) into the RADDR + * fields of the instruction. + */ + static void +-set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) ++v3d33_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) + { + if (src.smimm) { +- assert(instr->sig.small_imm); ++ assert(instr->sig.small_imm_b); + *mux = V3D_QPU_MUX_B; + return; + } +@@ -106,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) + return; + } + +- if (instr->alu.add.a != V3D_QPU_MUX_A && +- instr->alu.add.b != V3D_QPU_MUX_A && +- instr->alu.mul.a != V3D_QPU_MUX_A && +- instr->alu.mul.b != V3D_QPU_MUX_A) { ++ if (instr->alu.add.a.mux != V3D_QPU_MUX_A && ++ instr->alu.add.b.mux != V3D_QPU_MUX_A && ++ instr->alu.mul.a.mux != V3D_QPU_MUX_A && ++ instr->alu.mul.b.mux != V3D_QPU_MUX_A) { + instr->raddr_a = src.index; + *mux = V3D_QPU_MUX_A; + } else { + if (instr->raddr_a == src.index) { + *mux = V3D_QPU_MUX_A; + } else { +- assert(!(instr->alu.add.a == V3D_QPU_MUX_B && +- instr->alu.add.b == V3D_QPU_MUX_B && +- instr->alu.mul.a == V3D_QPU_MUX_B && +- instr->alu.mul.b == V3D_QPU_MUX_B) || ++ assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B && ++ instr->alu.add.b.mux == V3D_QPU_MUX_B && ++ instr->alu.mul.a.mux == V3D_QPU_MUX_B && ++ instr->alu.mul.b.mux == V3D_QPU_MUX_B) || + src.index == instr->raddr_b); + + instr->raddr_b = src.index; +@@ -128,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) + } + } + +-static bool +-is_no_op_mov(struct qinst *qinst) ++/* ++ * The main purpose of the following wrapper is to make calling set_src ++ * cleaner. This is the reason it receives both mux and raddr pointers. Those ++ * will be filled or not based on the device version. ++ */ ++static void ++set_src(struct v3d_qpu_instr *instr, ++ enum v3d_qpu_mux *mux, ++ uint8_t *raddr, ++ struct qpu_reg src, ++ const struct v3d_device_info *devinfo) + { +- static const struct v3d_qpu_sig no_sig = {0}; +- +- /* Make sure it's just a lone MOV. */ +- if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || +- qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || +- qinst->qpu.alu.add.op != V3D_QPU_A_NOP || +- memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { +- return false; +- } ++ if (devinfo->ver < 71) ++ return v3d33_set_src(instr, mux, src); ++ else ++ return v3d71_set_src(instr, raddr, src); ++} + +- /* Check if it's a MOV from a register to itself. 
*/
++static bool
++v3d33_mov_src_and_dst_equal(struct qinst *qinst)
++{
+ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
+ if (qinst->qpu.alu.mul.magic_write) {
+ if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
+ return false;
+
+- if (qinst->qpu.alu.mul.a !=
++ if (qinst->qpu.alu.mul.a.mux !=
+ V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
+ return false;
+ }
+ } else {
+ int raddr;
+
+- switch (qinst->qpu.alu.mul.a) {
++ switch (qinst->qpu.alu.mul.a.mux) {
+ case V3D_QPU_MUX_A:
+ raddr = qinst->qpu.raddr_a;
+ break;
+@@ -168,10 +192,61 @@ is_no_op_mov(struct qinst *qinst)
+ return false;
+ }
+
++ return true;
++}
++
++static bool
++v3d71_mov_src_and_dst_equal(struct qinst *qinst)
++{
++ if (qinst->qpu.alu.mul.magic_write)
++ return false;
++
++ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
++ int raddr;
++
++ raddr = qinst->qpu.alu.mul.a.raddr;
++ if (raddr != waddr)
++ return false;
++
++ return true;
++}
++
++static bool
++mov_src_and_dst_equal(struct qinst *qinst,
++ const struct v3d_device_info *devinfo)
++{
++ if (devinfo->ver < 71)
++ return v3d33_mov_src_and_dst_equal(qinst);
++ else
++ return v3d71_mov_src_and_dst_equal(qinst);
++}
++
++
++static bool
++is_no_op_mov(struct qinst *qinst,
++ const struct v3d_device_info *devinfo)
++{
++ static const struct v3d_qpu_sig no_sig = {0};
++
++ /* Make sure it's just a lone MOV. We only check for M_MOV. Although
++ * for V3D 7.x there is also A_MOV, we don't need to check for it as
++ * we always emit using M_MOV. We could use A_MOV later in the
++ * schedule to improve performance.
++ */
++ if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
++ qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
++ qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
++ memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
++ return false;
++ }
++
++ if (!mov_src_and_dst_equal(qinst, devinfo))
++ return false;
++
+ /* No packing or flags updates, or we need to execute the
+ * instruction.
+ */ +- if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || ++ if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE || + qinst->qpu.flags.mc != V3D_QPU_COND_NONE || + qinst->qpu.flags.mpf != V3D_QPU_PF_NONE || +@@ -277,8 +352,15 @@ v3d_generate_code_block(struct v3d_compile *c, + assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP); + assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); + +- if (!dst.magic || +- dst.index != V3D_QPU_WADDR_R5) { ++ bool use_rf; ++ if (c->devinfo->has_accumulators) { ++ use_rf = !dst.magic || ++ dst.index != V3D_QPU_WADDR_R5; ++ } else { ++ use_rf = dst.magic || dst.index != 0; ++ } ++ ++ if (use_rf) { + assert(c->devinfo->ver >= 40); + + if (qinst->qpu.sig.ldunif) { +@@ -300,13 +382,18 @@ v3d_generate_code_block(struct v3d_compile *c, + qinst->qpu.sig_magic = dst.magic; + } else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) { + assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); ++ + if (nsrc >= 1) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.add.a, src[0]); ++ &qinst->qpu.alu.add.a.mux, ++ &qinst->qpu.alu.add.a.raddr, ++ src[0], c->devinfo); + } + if (nsrc >= 2) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.add.b, src[1]); ++ &qinst->qpu.alu.add.b.mux, ++ &qinst->qpu.alu.add.b.raddr, ++ src[1], c->devinfo); + } + + qinst->qpu.alu.add.waddr = dst.index; +@@ -314,17 +401,21 @@ v3d_generate_code_block(struct v3d_compile *c, + } else { + if (nsrc >= 1) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.mul.a, src[0]); ++ &qinst->qpu.alu.mul.a.mux, ++ &qinst->qpu.alu.mul.a.raddr, ++ src[0], c->devinfo); + } + if (nsrc >= 2) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.mul.b, src[1]); ++ &qinst->qpu.alu.mul.b.mux, ++ &qinst->qpu.alu.mul.b.raddr, ++ src[1], c->devinfo); + } + + qinst->qpu.alu.mul.waddr = dst.index; + qinst->qpu.alu.mul.magic_write = dst.magic; + +- if (is_no_op_mov(qinst)) { ++ if (is_no_op_mov(qinst, c->devinfo)) { + vir_remove_instruction(c, qinst); + continue; + } +diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build +index 2c10e46b1882..73cb7aa05756 100644 +--- a/src/broadcom/meson.build ++++ b/src/broadcom/meson.build +@@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle') + + subdir('cle') + +-v3d_versions = ['33', '41', '42'] ++v3d_versions = ['33', '41', '42', '71'] + v3d_libs = [] + + if with_gallium_v3d or with_broadcom_vk +diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c +index 28fb2357b971..c1590a760de5 100644 +--- a/src/broadcom/qpu/qpu_disasm.c ++++ b/src/broadcom/qpu/qpu_disasm.c +@@ -56,13 +56,14 @@ pad_to(struct disasm_state *disasm, int n) + + + static void +-v3d_qpu_disasm_raddr(struct disasm_state *disasm, +- const struct v3d_qpu_instr *instr, uint8_t mux) ++v3d33_qpu_disasm_raddr(struct disasm_state *disasm, ++ const struct v3d_qpu_instr *instr, ++ enum v3d_qpu_mux mux) + { + if (mux == V3D_QPU_MUX_A) { + append(disasm, "rf%d", instr->raddr_a); + } else if (mux == V3D_QPU_MUX_B) { +- if (instr->sig.small_imm) { ++ if (instr->sig.small_imm_b) { + uint32_t val; + ASSERTED bool ok = + v3d_qpu_small_imm_unpack(disasm->devinfo, +@@ -82,6 +83,64 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm, + } + } + ++enum v3d_qpu_input_class { ++ V3D_QPU_ADD_A, ++ V3D_QPU_ADD_B, ++ V3D_QPU_MUL_A, ++ V3D_QPU_MUL_B ++}; ++ ++static void ++v3d71_qpu_disasm_raddr(struct disasm_state *disasm, ++ const struct v3d_qpu_instr *instr, ++ uint8_t raddr, ++ enum v3d_qpu_input_class input_class) ++{ ++ bool is_small_imm = false; ++ switch(input_class) { ++ case 
V3D_QPU_ADD_A: ++ is_small_imm = instr->sig.small_imm_a; ++ break; ++ case V3D_QPU_ADD_B: ++ is_small_imm = instr->sig.small_imm_b; ++ break; ++ case V3D_QPU_MUL_A: ++ is_small_imm = instr->sig.small_imm_c; ++ break; ++ case V3D_QPU_MUL_B: ++ is_small_imm = instr->sig.small_imm_d; ++ break; ++ } ++ ++ if (is_small_imm) { ++ uint32_t val; ++ ASSERTED bool ok = ++ v3d_qpu_small_imm_unpack(disasm->devinfo, ++ raddr, ++ &val); ++ ++ if ((int)val >= -16 && (int)val <= 15) ++ append(disasm, "%d", val); ++ else ++ append(disasm, "0x%08x", val); ++ assert(ok); ++ } else { ++ append(disasm, "rf%d", raddr); ++ } ++} ++ ++static void ++v3d_qpu_disasm_raddr(struct disasm_state *disasm, ++ const struct v3d_qpu_instr *instr, ++ const struct v3d_qpu_input *input, ++ enum v3d_qpu_input_class input_class) ++{ ++ if (disasm->devinfo->ver < 71) ++ v3d33_qpu_disasm_raddr(disasm, instr, input->mux); ++ else ++ v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class); ++} ++ + static void + v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic) + { +@@ -121,16 +180,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm, + if (num_src >= 1) { + if (has_dst) + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a); ++ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A); + append(disasm, "%s", +- v3d_qpu_unpack_name(instr->alu.add.a_unpack)); ++ v3d_qpu_unpack_name(instr->alu.add.a.unpack)); + } + + if (num_src >= 2) { + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b); ++ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.b, V3D_QPU_ADD_B); + append(disasm, "%s", +- v3d_qpu_unpack_name(instr->alu.add.b_unpack)); ++ v3d_qpu_unpack_name(instr->alu.add.b.unpack)); + } + } + +@@ -164,16 +223,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm, + if (num_src >= 1) { + if (has_dst) + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a); ++ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A); + append(disasm, "%s", +- v3d_qpu_unpack_name(instr->alu.mul.a_unpack)); ++ v3d_qpu_unpack_name(instr->alu.mul.a.unpack)); + } + + if (num_src >= 2) { + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b); ++ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B); + append(disasm, "%s", +- v3d_qpu_unpack_name(instr->alu.mul.b_unpack)); ++ v3d_qpu_unpack_name(instr->alu.mul.b.unpack)); + } + } + +diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c +index 60dabf74e8e0..44f20618a5a3 100644 +--- a/src/broadcom/qpu/qpu_instr.c ++++ b/src/broadcom/qpu/qpu_instr.c +@@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo, + if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU) + return "tmu"; + ++ /* V3D 7.x QUAD and REP aliases R5 and R5REPT in the table below ++ */ ++ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD) ++ return "quad"; ++ ++ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP) ++ return "rep"; ++ + static const char *waddr_magic[] = { + [V3D_QPU_WADDR_R0] = "r0", + [V3D_QPU_WADDR_R1] = "r1", +@@ -169,6 +177,12 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op) + [V3D_QPU_A_ITOF] = "itof", + [V3D_QPU_A_CLZ] = "clz", + [V3D_QPU_A_UTOF] = "utof", ++ [V3D_QPU_A_MOV] = "mov", ++ [V3D_QPU_A_FMOV] = "fmov", ++ [V3D_QPU_A_VPACK] = "vpack", ++ [V3D_QPU_A_V8PACK] = "v8pack", ++ [V3D_QPU_A_V10PACK] = "v10pack", ++ [V3D_QPU_A_V11FPACK] = "v11fpack", + }; + + if (op >= ARRAY_SIZE(op_names)) +@@ -191,6 
+205,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op) + [V3D_QPU_M_MOV] = "mov", + [V3D_QPU_M_NOP] = "nop", + [V3D_QPU_M_FMUL] = "fmul", ++ [V3D_QPU_M_FTOUNORM16] = "ftounorm16", ++ [V3D_QPU_M_FTOSNORM16] = "ftosnorm16", ++ [V3D_QPU_M_VFTOUNORM8] = "vftounorm8", ++ [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8", ++ [V3D_QPU_M_VFTOUNORM10LO] = "vftounorm10lo", ++ [V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi", + }; + + if (op >= ARRAY_SIZE(op_names)) +@@ -450,6 +470,13 @@ static const uint8_t add_op_args[] = { + [V3D_QPU_A_ITOF] = D | A, + [V3D_QPU_A_CLZ] = D | A, + [V3D_QPU_A_UTOF] = D | A, ++ ++ [V3D_QPU_A_MOV] = D | A, ++ [V3D_QPU_A_FMOV] = D | A, ++ [V3D_QPU_A_VPACK] = D | A | B, ++ [V3D_QPU_A_V8PACK] = D | A | B, ++ [V3D_QPU_A_V10PACK] = D | A | B, ++ [V3D_QPU_A_V11FPACK] = D | A | B, + }; + + static const uint8_t mul_op_args[] = { +@@ -463,6 +490,12 @@ static const uint8_t mul_op_args[] = { + [V3D_QPU_M_NOP] = 0, + [V3D_QPU_M_MOV] = D | A, + [V3D_QPU_M_FMUL] = D | A | B, ++ [V3D_QPU_M_FTOUNORM16] = D | A, ++ [V3D_QPU_M_FTOSNORM16] = D | A, ++ [V3D_QPU_M_VFTOUNORM8] = D | A, ++ [V3D_QPU_M_VFTOSNORM8] = D | A, ++ [V3D_QPU_M_VFTOUNORM10LO] = D | A, ++ [V3D_QPU_M_VFTOUNORM10HI] = D | A, + }; + + bool +@@ -636,12 +669,14 @@ v3d_qpu_add_op_writes_vpm(enum v3d_qpu_add_op op) + } + + bool +-v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ++v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) + { +- if (inst->sig.ldtlb || +- inst->sig.ldtlbu) +- return true; ++ return inst->sig.ldtlb || inst->sig.ldtlbu; ++} + ++bool ++v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ++{ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && +@@ -659,6 +694,12 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) + return false; + } + ++bool ++v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ++{ ++ return v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst); ++} ++ + bool + v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) + { +@@ -846,6 +887,9 @@ bool + v3d_qpu_writes_r3(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) + { ++ if(!devinfo->has_accumulators) ++ return false; ++ + if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3)) + return true; + +@@ -856,6 +900,9 @@ bool + v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && +@@ -886,6 +933,9 @@ bool + v3d_qpu_writes_r5(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5)) + return true; + +@@ -896,6 +946,9 @@ bool + v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + if (v3d_qpu_writes_r5(devinfo, inst)) + return true; + if (v3d_qpu_writes_r4(devinfo, inst)) +@@ -912,16 +965,68 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, + return false; + } + ++bool ++v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *inst) ++{ ++ if (devinfo->ver >= 71 && ++ (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) { ++ return true; ++ } ++ ++ return false; ++} ++ + bool + v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum 
v3d_qpu_mux mux) + { + int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); + int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); + +- return ((add_nsrc > 0 && inst->alu.add.a == mux) || +- (add_nsrc > 1 && inst->alu.add.b == mux) || +- (mul_nsrc > 0 && inst->alu.mul.a == mux) || +- (mul_nsrc > 1 && inst->alu.mul.b == mux)); ++ return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) || ++ (add_nsrc > 1 && inst->alu.add.b.mux == mux) || ++ (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) || ++ (mul_nsrc > 1 && inst->alu.mul.b.mux == mux)); ++} ++ ++bool ++v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr) ++{ ++ int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); ++ int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); ++ ++ return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr == raddr) || ++ (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) || ++ (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) || ++ (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr); ++} ++ ++bool ++v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *inst, ++ uint8_t waddr) ++{ ++ if (inst->type != V3D_QPU_INSTR_TYPE_ALU) ++ return false; ++ ++ if (v3d_qpu_add_op_has_dst(inst->alu.add.op) && ++ !inst->alu.add.magic_write && ++ inst->alu.add.waddr == waddr) { ++ return true; ++ } ++ ++ if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) && ++ !inst->alu.mul.magic_write && ++ inst->alu.mul.waddr == waddr) { ++ return true; ++ } ++ ++ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && ++ !inst->sig_magic && inst->sig_addr == waddr) { ++ return true; ++ } ++ ++ return false; + } + + bool +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index 2e1334726987..56eee9f9cac8 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -50,10 +50,13 @@ struct v3d_qpu_sig { + bool ldvpm:1; + bool ldtlb:1; + bool ldtlbu:1; +- bool small_imm:1; + bool ucb:1; + bool rotate:1; + bool wrtmuc:1; ++ bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */ ++ bool small_imm_b:1; /* raddr_b (add b) */ ++ bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */ ++ bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */ + }; + + enum v3d_qpu_cond { +@@ -88,12 +91,13 @@ enum v3d_qpu_uf { + }; + + enum v3d_qpu_waddr { +- V3D_QPU_WADDR_R0 = 0, +- V3D_QPU_WADDR_R1 = 1, +- V3D_QPU_WADDR_R2 = 2, +- V3D_QPU_WADDR_R3 = 3, +- V3D_QPU_WADDR_R4 = 4, +- V3D_QPU_WADDR_R5 = 5, ++ V3D_QPU_WADDR_R0 = 0, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R1 = 1, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R5 = 5, /* V3D 4.x */ ++ V3D_QPU_WADDR_QUAD = 5, /* V3D 7.x */ + V3D_QPU_WADDR_NOP = 6, + V3D_QPU_WADDR_TLB = 7, + V3D_QPU_WADDR_TLBU = 8, +@@ -108,12 +112,12 @@ enum v3d_qpu_waddr { + V3D_QPU_WADDR_SYNC = 16, + V3D_QPU_WADDR_SYNCU = 17, + V3D_QPU_WADDR_SYNCB = 18, +- V3D_QPU_WADDR_RECIP = 19, +- V3D_QPU_WADDR_RSQRT = 20, +- V3D_QPU_WADDR_EXP = 21, +- V3D_QPU_WADDR_LOG = 22, +- V3D_QPU_WADDR_SIN = 23, +- V3D_QPU_WADDR_RSQRT2 = 24, ++ V3D_QPU_WADDR_RECIP = 19, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_RSQRT = 20, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_EXP = 21, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_LOG = 22, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_SIN = 23, /* Reserved on V3D 7.x */ ++ 
V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_TMUC = 32, + V3D_QPU_WADDR_TMUS = 33, + V3D_QPU_WADDR_TMUT = 34, +@@ -129,7 +133,8 @@ enum v3d_qpu_waddr { + V3D_QPU_WADDR_TMUHSCM = 44, + V3D_QPU_WADDR_TMUHSF = 45, + V3D_QPU_WADDR_TMUHSLOD = 46, +- V3D_QPU_WADDR_R5REP = 55, ++ V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */ ++ V3D_QPU_WADDR_REP = 55, /* V3D 7.x */ + }; + + struct v3d_qpu_flags { +@@ -222,6 +227,14 @@ enum v3d_qpu_add_op { + V3D_QPU_A_ITOF, + V3D_QPU_A_CLZ, + V3D_QPU_A_UTOF, ++ ++ /* V3D 7.x */ ++ V3D_QPU_A_FMOV, ++ V3D_QPU_A_MOV, ++ V3D_QPU_A_VPACK, ++ V3D_QPU_A_V8PACK, ++ V3D_QPU_A_V10PACK, ++ V3D_QPU_A_V11FPACK, + }; + + enum v3d_qpu_mul_op { +@@ -235,6 +248,14 @@ enum v3d_qpu_mul_op { + V3D_QPU_M_MOV, + V3D_QPU_M_NOP, + V3D_QPU_M_FMUL, ++ ++ /* V3D 7.x */ ++ V3D_QPU_M_FTOUNORM16, ++ V3D_QPU_M_FTOSNORM16, ++ V3D_QPU_M_VFTOUNORM8, ++ V3D_QPU_M_VFTOSNORM8, ++ V3D_QPU_M_VFTOUNORM10LO, ++ V3D_QPU_M_VFTOUNORM10HI, + }; + + enum v3d_qpu_output_pack { +@@ -276,6 +297,15 @@ enum v3d_qpu_input_unpack { + + /** Swap high and low 16 bits */ + V3D_QPU_UNPACK_SWAP_16, ++ ++ /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */ ++ V3D_QPU_UNPACK_UL, ++ /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */ ++ V3D_QPU_UNPACK_UH, ++ /** Convert low 16 bits from 16-bit integer to signed 32-bit int */ ++ V3D_QPU_UNPACK_IL, ++ /** Convert high 16 bits from 16-bit integer to signed 32-bit int */ ++ V3D_QPU_UNPACK_IH, + }; + + enum v3d_qpu_mux { +@@ -289,25 +319,29 @@ enum v3d_qpu_mux { + V3D_QPU_MUX_B, + }; + ++struct v3d_qpu_input { ++ union { ++ enum v3d_qpu_mux mux; /* V3D 4.x */ ++ uint8_t raddr; /* V3D 7.x */ ++ }; ++ enum v3d_qpu_input_unpack unpack; ++}; ++ + struct v3d_qpu_alu_instr { + struct { + enum v3d_qpu_add_op op; +- enum v3d_qpu_mux a, b; ++ struct v3d_qpu_input a, b; + uint8_t waddr; + bool magic_write; + enum v3d_qpu_output_pack output_pack; +- enum v3d_qpu_input_unpack a_unpack; +- enum v3d_qpu_input_unpack b_unpack; + } add; + + struct { + enum v3d_qpu_mul_op op; +- enum v3d_qpu_mux a, b; ++ struct v3d_qpu_input a, b; + uint8_t waddr; + bool magic_write; + enum v3d_qpu_output_pack output_pack; +- enum v3d_qpu_input_unpack a_unpack; +- enum v3d_qpu_input_unpack b_unpack; + } mul; + }; + +@@ -379,8 +413,8 @@ struct v3d_qpu_instr { + struct v3d_qpu_sig sig; + uint8_t sig_addr; + bool sig_magic; /* If the signal writes to a magic address */ +- uint8_t raddr_a; +- uint8_t raddr_b; ++ uint8_t raddr_a; /* V3D 4.x */ ++ uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */ + struct v3d_qpu_flags flags; + + union { +@@ -450,6 +484,8 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; + bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; + bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; + bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; ++bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; ++bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +@@ -464,6 +500,8 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; + bool v3d_qpu_writes_r5(const struct v3d_device_info 
*devinfo, + const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; ++bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; + bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +@@ -483,4 +521,9 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + + bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; ++ ++bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr); ++bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *inst, ++ uint8_t waddr); + #endif +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index a875683c6f80..f09bc041e7de 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -84,6 +84,9 @@ + #define V3D_QPU_MUL_A_SHIFT 18 + #define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18) + ++#define V3D_QPU_RADDR_C_SHIFT 18 ++#define V3D_QPU_RADDR_C_MASK QPU_MASK(23, 18) ++ + #define V3D_QPU_ADD_B_SHIFT 15 + #define V3D_QPU_ADD_B_MASK QPU_MASK(17, 15) + +@@ -98,6 +101,9 @@ + #define V3D_QPU_BRANCH_BDI_SHIFT 12 + #define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12) + ++#define V3D_QPU_RADDR_D_SHIFT 12 ++#define V3D_QPU_RADDR_D_MASK QPU_MASK(17, 12) ++ + #define V3D_QPU_RADDR_A_SHIFT 6 + #define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6) + +@@ -112,12 +118,15 @@ + #define LDTMU .ldtmu = true + #define LDVARY .ldvary = true + #define LDVPM .ldvpm = true +-#define SMIMM .small_imm = true + #define LDTLB .ldtlb = true + #define LDTLBU .ldtlbu = true + #define UCB .ucb = true + #define ROT .rotate = true + #define WRTMUC .wrtmuc = true ++#define SMIMM_A .small_imm_a = true ++#define SMIMM_B .small_imm_b = true ++#define SMIMM_C .small_imm_c = true ++#define SMIMM_D .small_imm_d = true + + static const struct v3d_qpu_sig v33_sig_map[] = { + /* MISC R3 R4 R5 */ +@@ -135,8 +144,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = { + [11] = { THRSW, LDVARY, LDUNIF }, + [12] = { LDVARY, LDTMU, }, + [13] = { THRSW, LDVARY, LDTMU, }, +- [14] = { SMIMM, LDVARY, }, +- [15] = { SMIMM, }, ++ [14] = { SMIMM_B, LDVARY, }, ++ [15] = { SMIMM_B, }, + [16] = { LDTLB, }, + [17] = { LDTLBU, }, + /* 18-21 reserved */ +@@ -148,8 +157,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = { + [27] = { THRSW, LDVPM, LDUNIF }, + [28] = { LDVPM, LDTMU, }, + [29] = { THRSW, LDVPM, LDTMU, }, +- [30] = { SMIMM, LDVPM, }, +- [31] = { SMIMM, }, ++ [30] = { SMIMM_B, LDVPM, }, ++ [31] = { SMIMM_B, }, + }; + + static const struct v3d_qpu_sig v40_sig_map[] = { +@@ -167,8 +176,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = { + [10] = { LDVARY, LDUNIF }, + [11] = { THRSW, LDVARY, LDUNIF }, + /* 12-13 reserved */ +- [14] = { SMIMM, LDVARY, }, +- [15] = { SMIMM, }, ++ [14] = { SMIMM_B, LDVARY, }, ++ [15] = { SMIMM_B, }, + [16] = { LDTLB, }, + [17] = { LDTLBU, }, + [18] = { WRTMUC }, +@@ -178,7 +187,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = { + [22] = { UCB, }, + [23] = { ROT, }, + /* 24-30 reserved */ +- [31] = { SMIMM, LDTMU, }, ++ [31] = { SMIMM_B, LDTMU, }, + }; + + static const struct v3d_qpu_sig v41_sig_map[] = { +@@ -197,8 +206,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = { + [11] = { THRSW, LDVARY, LDUNIF }, + [12] = { LDUNIFRF }, + [13] = { THRSW, LDUNIFRF }, +- [14] = { 
SMIMM, LDVARY, }, +- [15] = { SMIMM, }, ++ [14] = { SMIMM_B, LDVARY }, ++ [15] = { SMIMM_B, }, + [16] = { LDTLB, }, + [17] = { LDTLBU, }, + [18] = { WRTMUC }, +@@ -210,7 +219,41 @@ static const struct v3d_qpu_sig v41_sig_map[] = { + [24] = { LDUNIFA}, + [25] = { LDUNIFARF }, + /* 26-30 reserved */ +- [31] = { SMIMM, LDTMU, }, ++ [31] = { SMIMM_B, LDTMU, }, ++}; ++ ++ ++static const struct v3d_qpu_sig v71_sig_map[] = { ++ /* MISC phys RF0 */ ++ [0] = { }, ++ [1] = { THRSW, }, ++ [2] = { LDUNIF }, ++ [3] = { THRSW, LDUNIF }, ++ [4] = { LDTMU, }, ++ [5] = { THRSW, LDTMU, }, ++ [6] = { LDTMU, LDUNIF }, ++ [7] = { THRSW, LDTMU, LDUNIF }, ++ [8] = { LDVARY, }, ++ [9] = { THRSW, LDVARY, }, ++ [10] = { LDVARY, LDUNIF }, ++ [11] = { THRSW, LDVARY, LDUNIF }, ++ [12] = { LDUNIFRF }, ++ [13] = { THRSW, LDUNIFRF }, ++ [14] = { SMIMM_A, }, ++ [15] = { SMIMM_B, }, ++ [16] = { LDTLB, }, ++ [17] = { LDTLBU, }, ++ [18] = { WRTMUC }, ++ [19] = { THRSW, WRTMUC }, ++ [20] = { LDVARY, WRTMUC }, ++ [21] = { THRSW, LDVARY, WRTMUC }, ++ [22] = { UCB, }, ++ /* 23 reserved */ ++ [24] = { LDUNIFA}, ++ [25] = { LDUNIFARF }, ++ /* 26-29 reserved */ ++ [30] = { SMIMM_C, }, ++ [31] = { SMIMM_D, }, + }; + + bool +@@ -221,7 +264,9 @@ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo, + if (packed_sig >= ARRAY_SIZE(v33_sig_map)) + return false; + +- if (devinfo->ver >= 41) ++ if (devinfo->ver >= 71) ++ *sig = v71_sig_map[packed_sig]; ++ else if (devinfo->ver >= 41) + *sig = v41_sig_map[packed_sig]; + else if (devinfo->ver == 40) + *sig = v40_sig_map[packed_sig]; +@@ -240,7 +285,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo, + { + static const struct v3d_qpu_sig *map; + +- if (devinfo->ver >= 41) ++ if (devinfo->ver >= 71) ++ map = v71_sig_map; ++ else if (devinfo->ver >= 41) + map = v41_sig_map; + else if (devinfo->ver == 40) + map = v40_sig_map; +@@ -443,16 +490,26 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo, + + /* Make a mapping of the table of opcodes in the spec. The opcode is + * determined by a combination of the opcode field, and in the case of 0 or +- * 1-arg opcodes, the mux_b field as well. ++ * 1-arg opcodes, the mux (version <= 42) or raddr (version >= 71) field as ++ * well. + */ +-#define MUX_MASK(bot, top) (((1 << (top + 1)) - 1) - ((1 << (bot)) - 1)) +-#define ANYMUX MUX_MASK(0, 7) ++#define OP_MASK(val) BITFIELD64_BIT(val) ++#define OP_RANGE(bot, top) BITFIELD64_RANGE(bot, top - bot + 1) ++#define ANYMUX OP_RANGE(0, 7) ++#define ANYOPMASK OP_RANGE(0, 63) + + struct opcode_desc { + uint8_t opcode_first; + uint8_t opcode_last; +- uint8_t mux_b_mask; +- uint8_t mux_a_mask; ++ ++ union { ++ struct { ++ uint8_t b_mask; ++ uint8_t a_mask; ++ } mux; ++ uint64_t raddr_mask; ++ }; ++ + uint8_t op; + + /* first_ver == 0 if it's the same across all V3D versions. +@@ -465,122 +522,321 @@ struct opcode_desc { + uint8_t last_ver; + }; + +-static const struct opcode_desc add_ops[] = { ++static const struct opcode_desc add_ops_v33[] = { + /* FADD is FADDNF depending on the order of the mux_a/mux_b. 
*/ +- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADD }, +- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADDNF }, +- { 53, 55, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, +- { 56, 56, ANYMUX, ANYMUX, V3D_QPU_A_ADD }, +- { 57, 59, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, +- { 60, 60, ANYMUX, ANYMUX, V3D_QPU_A_SUB }, +- { 61, 63, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, +- { 64, 111, ANYMUX, ANYMUX, V3D_QPU_A_FSUB }, +- { 120, 120, ANYMUX, ANYMUX, V3D_QPU_A_MIN }, +- { 121, 121, ANYMUX, ANYMUX, V3D_QPU_A_MAX }, +- { 122, 122, ANYMUX, ANYMUX, V3D_QPU_A_UMIN }, +- { 123, 123, ANYMUX, ANYMUX, V3D_QPU_A_UMAX }, +- { 124, 124, ANYMUX, ANYMUX, V3D_QPU_A_SHL }, +- { 125, 125, ANYMUX, ANYMUX, V3D_QPU_A_SHR }, +- { 126, 126, ANYMUX, ANYMUX, V3D_QPU_A_ASR }, +- { 127, 127, ANYMUX, ANYMUX, V3D_QPU_A_ROR }, ++ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADD }, ++ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADDNF }, ++ { 53, 55, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, ++ { 56, 56, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ADD }, ++ { 57, 59, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, ++ { 60, 60, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SUB }, ++ { 61, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, ++ { 64, 111, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FSUB }, ++ { 120, 120, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MIN }, ++ { 121, 121, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MAX }, ++ { 122, 122, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMIN }, ++ { 123, 123, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMAX }, ++ { 124, 124, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHL }, ++ { 125, 125, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHR }, ++ { 126, 126, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ASR }, ++ { 127, 127, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ROR }, + /* FMIN is instead FMAX depending on the order of the mux_a/mux_b. 
*/ +- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMIN }, +- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMAX }, +- { 176, 180, ANYMUX, ANYMUX, V3D_QPU_A_VFMIN }, +- +- { 181, 181, ANYMUX, ANYMUX, V3D_QPU_A_AND }, +- { 182, 182, ANYMUX, ANYMUX, V3D_QPU_A_OR }, +- { 183, 183, ANYMUX, ANYMUX, V3D_QPU_A_XOR }, +- +- { 184, 184, ANYMUX, ANYMUX, V3D_QPU_A_VADD }, +- { 185, 185, ANYMUX, ANYMUX, V3D_QPU_A_VSUB }, +- { 186, 186, 1 << 0, ANYMUX, V3D_QPU_A_NOT }, +- { 186, 186, 1 << 1, ANYMUX, V3D_QPU_A_NEG }, +- { 186, 186, 1 << 2, ANYMUX, V3D_QPU_A_FLAPUSH }, +- { 186, 186, 1 << 3, ANYMUX, V3D_QPU_A_FLBPUSH }, +- { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLPOP }, +- { 186, 186, 1 << 5, ANYMUX, V3D_QPU_A_RECIP }, +- { 186, 186, 1 << 6, ANYMUX, V3D_QPU_A_SETMSF }, +- { 186, 186, 1 << 7, ANYMUX, V3D_QPU_A_SETREVF }, +- { 187, 187, 1 << 0, 1 << 0, V3D_QPU_A_NOP, 0 }, +- { 187, 187, 1 << 0, 1 << 1, V3D_QPU_A_TIDX }, +- { 187, 187, 1 << 0, 1 << 2, V3D_QPU_A_EIDX }, +- { 187, 187, 1 << 0, 1 << 3, V3D_QPU_A_LR }, +- { 187, 187, 1 << 0, 1 << 4, V3D_QPU_A_VFLA }, +- { 187, 187, 1 << 0, 1 << 5, V3D_QPU_A_VFLNA }, +- { 187, 187, 1 << 0, 1 << 6, V3D_QPU_A_VFLB }, +- { 187, 187, 1 << 0, 1 << 7, V3D_QPU_A_VFLNB }, +- +- { 187, 187, 1 << 1, MUX_MASK(0, 2), V3D_QPU_A_FXCD }, +- { 187, 187, 1 << 1, 1 << 3, V3D_QPU_A_XCD }, +- { 187, 187, 1 << 1, MUX_MASK(4, 6), V3D_QPU_A_FYCD }, +- { 187, 187, 1 << 1, 1 << 7, V3D_QPU_A_YCD }, +- +- { 187, 187, 1 << 2, 1 << 0, V3D_QPU_A_MSF }, +- { 187, 187, 1 << 2, 1 << 1, V3D_QPU_A_REVF }, +- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_VDWWT, 33 }, +- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_IID, 40 }, +- { 187, 187, 1 << 2, 1 << 3, V3D_QPU_A_SAMPID, 40 }, +- { 187, 187, 1 << 2, 1 << 4, V3D_QPU_A_BARRIERID, 40 }, +- { 187, 187, 1 << 2, 1 << 5, V3D_QPU_A_TMUWT }, +- { 187, 187, 1 << 2, 1 << 6, V3D_QPU_A_VPMWT }, +- { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 }, +- { 187, 187, 1 << 3, 1 << 0, V3D_QPU_A_FLNAFIRST, 41 }, +- { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, +- +- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, +- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, +- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, +- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, +- { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 }, +- { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 }, +- { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 }, +- { 188, 188, 1 << 5, ANYMUX, V3D_QPU_A_LOG, 41 }, +- { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 }, +- { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 }, +- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, +- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, ++ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMIN }, ++ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMAX }, ++ { 176, 180, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMIN }, ++ ++ { 181, 181, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_AND }, ++ { 182, 182, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_OR }, ++ { 183, 183, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_XOR }, ++ ++ { 184, 184, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VADD }, ++ { 185, 185, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VSUB }, ++ { 186, 186, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_NOT }, ++ { 186, 186, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_NEG }, ++ { 186, 186, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_FLAPUSH }, ++ { 186, 186, 
.mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FLBPUSH }, ++ { 186, 186, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_FLPOP }, ++ { 186, 186, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_RECIP }, ++ { 186, 186, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SETMSF }, ++ { 186, 186, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_SETREVF }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(1), V3D_QPU_A_TIDX }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(2), V3D_QPU_A_EIDX }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(3), V3D_QPU_A_LR }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(4), V3D_QPU_A_VFLA }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(5), V3D_QPU_A_VFLNA }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VFLB }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(7), V3D_QPU_A_VFLNB }, ++ ++ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(0, 2), V3D_QPU_A_FXCD }, ++ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(3), V3D_QPU_A_XCD }, ++ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(4, 6), V3D_QPU_A_FYCD }, ++ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(7), V3D_QPU_A_YCD }, ++ ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(0), V3D_QPU_A_MSF }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(1), V3D_QPU_A_REVF }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_VDWWT, 33 }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_IID, 40 }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(3), V3D_QPU_A_SAMPID, 40 }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(4), V3D_QPU_A_BARRIERID, 40 }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(5), V3D_QPU_A_TMUWT }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VPMWT }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(7), V3D_QPU_A_FLAFIRST, 41 }, ++ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = OP_MASK(0), V3D_QPU_A_FLNAFIRST, 41 }, ++ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, ++ ++ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMP, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT, 41 }, ++ { 188, 188, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_EXP, 41 }, ++ { 188, 188, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_LOG, 41 }, ++ { 188, 188, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SIN, 41 }, ++ { 188, 188, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT2, 41 }, ++ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, ++ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, + + /* FIXME: MORE COMPLICATED */ +- /* { 190, 191, ANYMUX, ANYMUX, 
V3D_QPU_A_VFMOVABSNEGNAB }, */ ++ /* { 190, 191, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */ + +- { 192, 239, ANYMUX, ANYMUX, V3D_QPU_A_FCMP }, +- { 240, 244, ANYMUX, ANYMUX, V3D_QPU_A_VFMAX }, ++ { 192, 239, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FCMP }, ++ { 240, 244, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMAX }, + +- { 245, 245, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FROUND }, +- { 245, 245, 1 << 3, ANYMUX, V3D_QPU_A_FTOIN }, +- { 245, 245, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FTRUNC }, +- { 245, 245, 1 << 7, ANYMUX, V3D_QPU_A_FTOIZ }, +- { 246, 246, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FFLOOR }, +- { 246, 246, 1 << 3, ANYMUX, V3D_QPU_A_FTOUZ }, +- { 246, 246, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FCEIL }, +- { 246, 246, 1 << 7, ANYMUX, V3D_QPU_A_FTOC }, ++ { 245, 245, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FROUND }, ++ { 245, 245, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIN }, ++ { 245, 245, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FTRUNC }, ++ { 245, 245, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIZ }, ++ { 246, 246, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FFLOOR }, ++ { 246, 246, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOUZ }, ++ { 246, 246, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FCEIL }, ++ { 246, 246, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOC }, + +- { 247, 247, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FDX }, +- { 247, 247, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FDY }, ++ { 247, 247, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FDX }, ++ { 247, 247, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FDY }, + + /* The stvpms are distinguished by the waddr field. */ +- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMV }, +- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMD }, +- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMP }, ++ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMV }, ++ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMD }, ++ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMP }, ++ ++ { 252, 252, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_ITOF }, ++ { 252, 252, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_CLZ }, ++ { 252, 252, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_UTOF }, ++}; ++ ++static const struct opcode_desc mul_ops_v33[] = { ++ { 1, 1, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_ADD }, ++ { 2, 2, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SUB }, ++ { 3, 3, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_UMUL24 }, ++ { 4, 8, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_VFMUL }, ++ { 9, 9, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SMUL24 }, ++ { 10, 10, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_MULTOP }, ++ { 14, 14, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 }, ++ { 15, 15, .mux.b_mask = OP_RANGE(0, 3), ANYMUX, V3D_QPU_M_FMOV, 33, 42}, ++ { 15, 15, .mux.b_mask = OP_MASK(4), .mux.a_mask = OP_MASK(0), V3D_QPU_M_NOP, 33, 42 }, ++ { 15, 15, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_M_MOV, 33, 42 }, ++ ++ { 16, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMUL }, ++}; ++ ++/* Note that it would have been possible to define all the add/mul opcodes in ++ * just one table, using the first_ver/last_ver. 
But taking into account that ++ * for v71 there were a lot of changes, it was more tidy this way. Also right ++ * now we are doing a linear search on those tables, so this maintains the ++ * tables smaller. ++ * ++ * Just in case we merge the tables, we define the first_ver as 71 for those ++ * opcodes that changed on v71 ++ */ ++static const struct opcode_desc add_ops_v71[] = { ++ /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */ ++ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD }, ++ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF }, ++ { 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, ++ { 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD }, ++ { 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, ++ { 60, 60, .raddr_mask = ANYOPMASK, V3D_QPU_A_SUB }, ++ { 61, 63, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, ++ { 64, 111, .raddr_mask = ANYOPMASK, V3D_QPU_A_FSUB }, ++ { 120, 120, .raddr_mask = ANYOPMASK, V3D_QPU_A_MIN }, ++ { 121, 121, .raddr_mask = ANYOPMASK, V3D_QPU_A_MAX }, ++ { 122, 122, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMIN }, ++ { 123, 123, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMAX }, ++ { 124, 124, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHL }, ++ { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR }, ++ { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR }, ++ { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR }, ++ /* FMIN is instead FMAX depending on the raddr_a/b order. */ ++ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN }, ++ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX }, ++ { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN }, ++ ++ { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND }, ++ { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR }, ++ { 183, 183, .raddr_mask = ANYOPMASK, V3D_QPU_A_XOR }, ++ { 184, 184, .raddr_mask = ANYOPMASK, V3D_QPU_A_VADD }, ++ { 185, 185, .raddr_mask = ANYOPMASK, V3D_QPU_A_VSUB }, ++ ++ { 186, 186, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOT }, ++ { 186, 186, .raddr_mask = OP_MASK(1), V3D_QPU_A_NEG }, ++ { 186, 186, .raddr_mask = OP_MASK(2), V3D_QPU_A_FLAPUSH }, ++ { 186, 186, .raddr_mask = OP_MASK(3), V3D_QPU_A_FLBPUSH }, ++ { 186, 186, .raddr_mask = OP_MASK(4), V3D_QPU_A_FLPOP }, ++ { 186, 186, .raddr_mask = OP_MASK(5), V3D_QPU_A_CLZ }, ++ { 186, 186, .raddr_mask = OP_MASK(6), V3D_QPU_A_SETMSF }, ++ { 186, 186, .raddr_mask = OP_MASK(7), V3D_QPU_A_SETREVF }, ++ ++ { 187, 187, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 }, ++ { 187, 187, .raddr_mask = OP_MASK(1), V3D_QPU_A_TIDX }, ++ { 187, 187, .raddr_mask = OP_MASK(2), V3D_QPU_A_EIDX }, ++ { 187, 187, .raddr_mask = OP_MASK(3), V3D_QPU_A_LR }, ++ { 187, 187, .raddr_mask = OP_MASK(4), V3D_QPU_A_VFLA }, ++ { 187, 187, .raddr_mask = OP_MASK(5), V3D_QPU_A_VFLNA }, ++ { 187, 187, .raddr_mask = OP_MASK(6), V3D_QPU_A_VFLB }, ++ { 187, 187, .raddr_mask = OP_MASK(7), V3D_QPU_A_VFLNB }, ++ { 187, 187, .raddr_mask = OP_MASK(8), V3D_QPU_A_XCD }, ++ { 187, 187, .raddr_mask = OP_MASK(9), V3D_QPU_A_YCD }, ++ { 187, 187, .raddr_mask = OP_MASK(10), V3D_QPU_A_MSF }, ++ { 187, 187, .raddr_mask = OP_MASK(11), V3D_QPU_A_REVF }, ++ { 187, 187, .raddr_mask = OP_MASK(12), V3D_QPU_A_IID }, ++ { 187, 187, .raddr_mask = OP_MASK(13), V3D_QPU_A_SAMPID }, ++ { 187, 187, .raddr_mask = OP_MASK(14), V3D_QPU_A_BARRIERID }, ++ { 187, 187, .raddr_mask = OP_MASK(15), V3D_QPU_A_TMUWT }, ++ { 187, 187, .raddr_mask = OP_MASK(16), V3D_QPU_A_VPMWT }, ++ { 187, 187, .raddr_mask = OP_MASK(17), V3D_QPU_A_FLAFIRST }, ++ { 187, 187, .raddr_mask = OP_MASK(18), V3D_QPU_A_FLNAFIRST }, ++ ++ { 187, 187, .raddr_mask 
= OP_RANGE(32, 34), V3D_QPU_A_FXCD }, ++ { 187, 187, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FYCD }, ++ ++ { 188, 188, .raddr_mask = OP_MASK(0), V3D_QPU_A_LDVPMV_IN, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(1), V3D_QPU_A_LDVPMD_IN, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(2), V3D_QPU_A_LDVPMP, 71 }, ++ ++ { 188, 188, .raddr_mask = OP_MASK(32), V3D_QPU_A_RECIP, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(33), V3D_QPU_A_RSQRT, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(34), V3D_QPU_A_EXP, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(35), V3D_QPU_A_LOG, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(36), V3D_QPU_A_SIN, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(37), V3D_QPU_A_RSQRT2, 71 }, ++ ++ { 189, 189, .raddr_mask = ANYOPMASK, V3D_QPU_A_LDVPMG_IN, 71 }, + +- { 252, 252, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_ITOF }, +- { 252, 252, 1 << 3, ANYMUX, V3D_QPU_A_CLZ }, +- { 252, 252, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_UTOF }, ++ /* The stvpms are distinguished by the waddr field. */ ++ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMV, 71}, ++ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMD, 71}, ++ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMP, 71}, ++ ++ { 192, 207, .raddr_mask = ANYOPMASK, V3D_QPU_A_FCMP, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FROUND, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FROUND, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FROUND, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FROUND, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_MASK(3), V3D_QPU_A_FTOIN, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(7), V3D_QPU_A_FTOIN, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(11), V3D_QPU_A_FTOIN, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(15), V3D_QPU_A_FTOIN, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FTRUNC, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FTRUNC, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FTRUNC, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FTRUNC, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_MASK(19), V3D_QPU_A_FTOIZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(23), V3D_QPU_A_FTOIZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(27), V3D_QPU_A_FTOIZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(31), V3D_QPU_A_FTOIZ, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FFLOOR, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FFLOOR, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(40, 42), V3D_QPU_A_FFLOOR, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(44, 46), V3D_QPU_A_FFLOOR, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_MASK(35), V3D_QPU_A_FTOUZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(39), V3D_QPU_A_FTOUZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(43), V3D_QPU_A_FTOUZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(47), V3D_QPU_A_FTOUZ, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_RANGE(48, 50), V3D_QPU_A_FCEIL, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(52, 54), V3D_QPU_A_FCEIL, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(56, 58), V3D_QPU_A_FCEIL, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(60, 62), V3D_QPU_A_FCEIL, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_MASK(51), V3D_QPU_A_FTOC }, ++ { 245, 245, .raddr_mask = OP_MASK(55), V3D_QPU_A_FTOC }, ++ { 245, 245, .raddr_mask = OP_MASK(59), V3D_QPU_A_FTOC }, ++ { 245, 245, .raddr_mask = OP_MASK(63), V3D_QPU_A_FTOC }, ++ ++ { 246, 246, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FDX, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FDX, 71 }, ++ { 246, 
246, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FDX, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FDX, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FDY, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FDY, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FDY, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FDY, 71 }, ++ ++ { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 }, ++ ++ { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 }, ++ { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 }, ++ ++ { 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 }, ++ ++ { 249, 249, .raddr_mask = OP_MASK(3), V3D_QPU_A_MOV, 71 }, ++ { 249, 249, .raddr_mask = OP_MASK(7), V3D_QPU_A_MOV, 71 }, ++ { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 }, ++ { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 }, ++ { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 }, ++ ++ { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 }, ++ { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 }, + }; + +-static const struct opcode_desc mul_ops[] = { +- { 1, 1, ANYMUX, ANYMUX, V3D_QPU_M_ADD }, +- { 2, 2, ANYMUX, ANYMUX, V3D_QPU_M_SUB }, +- { 3, 3, ANYMUX, ANYMUX, V3D_QPU_M_UMUL24 }, +- { 4, 8, ANYMUX, ANYMUX, V3D_QPU_M_VFMUL }, +- { 9, 9, ANYMUX, ANYMUX, V3D_QPU_M_SMUL24 }, +- { 10, 10, ANYMUX, ANYMUX, V3D_QPU_M_MULTOP }, +- { 14, 14, ANYMUX, ANYMUX, V3D_QPU_M_FMOV }, +- { 15, 15, MUX_MASK(0, 3), ANYMUX, V3D_QPU_M_FMOV }, +- { 15, 15, 1 << 4, 1 << 0, V3D_QPU_M_NOP, 0 }, +- { 15, 15, 1 << 7, ANYMUX, V3D_QPU_M_MOV }, +- { 16, 63, ANYMUX, ANYMUX, V3D_QPU_M_FMUL }, ++static const struct opcode_desc mul_ops_v71[] = { ++ /* For V3D 7.1, second mask field would be ignored */ ++ { 1, 1, .raddr_mask = ANYOPMASK, V3D_QPU_M_ADD, 71 }, ++ { 2, 2, .raddr_mask = ANYOPMASK, V3D_QPU_M_SUB, 71 }, ++ { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 }, ++ { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 }, ++ { 4, 8, .raddr_mask = ANYOPMASK, V3D_QPU_M_VFMUL, 71 }, ++ { 9, 9, .raddr_mask = ANYOPMASK, V3D_QPU_M_SMUL24, 71 }, ++ { 10, 10, .raddr_mask = ANYOPMASK, V3D_QPU_M_MULTOP, 71 }, ++ ++ { 14, 14, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_M_FMOV, 71 }, ++ ++ { 14, 14, .raddr_mask = OP_MASK(3), V3D_QPU_M_MOV, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(7), V3D_QPU_M_MOV, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(11), V3D_QPU_M_MOV, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 }, ++ ++ { 14, 14, .raddr_mask = OP_MASK(32), V3D_QPU_M_FTOUNORM16, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 }, ++ { 14, 
14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 }, ++ ++ { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 }, ++ ++ { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL }, + }; + + /* Returns true if op_desc should be filtered out based on devinfo->ver +@@ -589,17 +845,23 @@ static const struct opcode_desc mul_ops[] = { + */ + static bool + opcode_invalid_in_version(const struct v3d_device_info *devinfo, +- const struct opcode_desc *op_desc) ++ const uint8_t first_ver, ++ const uint8_t last_ver) + { +- return (op_desc->first_ver != 0 && devinfo->ver < op_desc->first_ver) || +- (op_desc->last_ver != 0 && devinfo->ver > op_desc->last_ver); ++ return (first_ver != 0 && devinfo->ver < first_ver) || ++ (last_ver != 0 && devinfo->ver > last_ver); + } + ++/* Note that we pass as parameters mux_a, mux_b and raddr, even if depending ++ * on the devinfo->ver some would be ignored. We do this way just to avoid ++ * having two really similar lookup_opcode methods ++ */ + static const struct opcode_desc * + lookup_opcode_from_packed(const struct v3d_device_info *devinfo, + const struct opcode_desc *opcodes, + size_t num_opcodes, uint32_t opcode, +- uint32_t mux_a, uint32_t mux_b) ++ uint32_t mux_a, uint32_t mux_b, ++ uint32_t raddr) + { + for (int i = 0; i < num_opcodes; i++) { + const struct opcode_desc *op_desc = &opcodes[i]; +@@ -608,14 +870,19 @@ lookup_opcode_from_packed(const struct v3d_device_info *devinfo, + opcode > op_desc->opcode_last) + continue; + +- if (opcode_invalid_in_version(devinfo, op_desc)) ++ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver)) + continue; + +- if (!(op_desc->mux_b_mask & (1 << mux_b))) +- continue; ++ if (devinfo->ver < 71) { ++ if (!(op_desc->mux.b_mask & (1 << mux_b))) ++ continue; + +- if (!(op_desc->mux_a_mask & (1 << mux_a))) +- continue; ++ if (!(op_desc->mux.a_mask & (1 << mux_a))) ++ continue; ++ } else { ++ if (!(op_desc->raddr_mask & ((uint64_t) 1 << raddr))) ++ continue; ++ } + + return op_desc; + } +@@ -667,6 +934,56 @@ v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked, + } + } + ++static bool ++v3d_qpu_int32_unpack_unpack(uint32_t packed, ++ enum v3d_qpu_input_unpack *unpacked) ++{ ++ switch (packed) { ++ case 0: ++ *unpacked = V3D_QPU_UNPACK_NONE; ++ return true; ++ case 1: ++ *unpacked = V3D_QPU_UNPACK_UL; ++ return true; ++ case 2: ++ *unpacked = V3D_QPU_UNPACK_UH; ++ return true; ++ case 3: ++ *unpacked = V3D_QPU_UNPACK_IL; ++ return true; ++ case 4: ++ *unpacked = V3D_QPU_UNPACK_IH; ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static bool ++v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked, ++ uint32_t *packed) ++{ ++ switch (unpacked) { ++ case V3D_QPU_UNPACK_NONE: ++ *packed = 0; ++ return true; ++ case V3D_QPU_UNPACK_UL: ++ *packed = 1; ++ return true; ++ case V3D_QPU_UNPACK_UH: ++ *packed = 2; ++ return true; ++ case V3D_QPU_UNPACK_IL: ++ *packed = 3; ++ return true; ++ case V3D_QPU_UNPACK_IH: ++ *packed = 4; ++ return true; ++ default: ++ return false; ++ } ++} ++ + static bool + v3d_qpu_float16_unpack_unpack(uint32_t packed, + enum v3d_qpu_input_unpack *unpacked) +@@ -737,8 +1054,8 @@ v3d_qpu_float32_pack_pack(enum v3d_qpu_output_pack pack, + } + + static bool +-v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, +- struct 
v3d_qpu_instr *instr) ++v3d33_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) + { + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); + uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_A); +@@ -755,8 +1072,9 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + map_op = (map_op - 253 + 245); + + const struct opcode_desc *desc = +- lookup_opcode_from_packed(devinfo, add_ops, ARRAY_SIZE(add_ops), +- map_op, mux_a, mux_b); ++ lookup_opcode_from_packed(devinfo, add_ops_v33, ++ ARRAY_SIZE(add_ops_v33), ++ map_op, mux_a, mux_b, 0); + + if (!desc) + return false; +@@ -812,12 +1130,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, +- &instr->alu.add.a_unpack)) { ++ &instr->alu.add.a.unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, +- &instr->alu.add.b_unpack)) { ++ &instr->alu.add.b.unpack)) { + return false; + } + break; +@@ -831,7 +1149,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.add.output_pack = mux_b & 0x3; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, +- &instr->alu.add.a_unpack)) { ++ &instr->alu.add.a.unpack)) { + return false; + } + break; +@@ -843,7 +1161,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, +- &instr->alu.add.a_unpack)) { ++ &instr->alu.add.a.unpack)) { + return false; + } + break; +@@ -851,23 +1169,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + if (!v3d_qpu_float16_unpack_unpack(op & 0x7, +- &instr->alu.add.a_unpack)) { ++ &instr->alu.add.a.unpack)) { + return false; + } + + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; +- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + + default: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; +- instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; +- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + } + +- instr->alu.add.a = mux_a; +- instr->alu.add.b = mux_b; ++ instr->alu.add.a.mux = mux_a; ++ instr->alu.add.b.mux = mux_b; + instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); + + instr->alu.add.magic_write = false; +@@ -892,8 +1210,194 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + } + + static bool +-v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) ++{ ++ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); ++ uint32_t raddr_a = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_A); ++ uint32_t raddr_b = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_B); ++ uint32_t waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); ++ uint32_t map_op = op; ++ ++ const struct opcode_desc *desc = ++ lookup_opcode_from_packed(devinfo, ++ add_ops_v71, ++ ARRAY_SIZE(add_ops_v71), ++ map_op, 0, 0, ++ raddr_b); ++ if (!desc) ++ return false; ++ ++ instr->alu.add.op = desc->op; ++ ++ /* FADD/FADDNF and FMIN/FMAX are determined by the 
order of the ++ * operands. ++ */ ++ if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a > ++ instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) { ++ if (instr->alu.add.op == V3D_QPU_A_FMIN) ++ instr->alu.add.op = V3D_QPU_A_FMAX; ++ if (instr->alu.add.op == V3D_QPU_A_FADD) ++ instr->alu.add.op = V3D_QPU_A_FADDNF; ++ } ++ ++ /* Some QPU ops require a bit more than just basic opcode and mux a/b ++ * comparisons to distinguish them. ++ */ ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_STVPMV: ++ case V3D_QPU_A_STVPMD: ++ case V3D_QPU_A_STVPMP: ++ switch (waddr) { ++ case 0: ++ instr->alu.add.op = V3D_QPU_A_STVPMV; ++ break; ++ case 1: ++ instr->alu.add.op = V3D_QPU_A_STVPMD; ++ break; ++ case 2: ++ instr->alu.add.op = V3D_QPU_A_STVPMP; ++ break; ++ default: ++ return false; ++ } ++ break; ++ default: ++ break; ++ } ++ ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_FADD: ++ case V3D_QPU_A_FADDNF: ++ case V3D_QPU_A_FSUB: ++ case V3D_QPU_A_FMIN: ++ case V3D_QPU_A_FMAX: ++ case V3D_QPU_A_FCMP: ++ case V3D_QPU_A_VFPACK: ++ if (instr->alu.add.op != V3D_QPU_A_VFPACK && ++ instr->alu.add.op != V3D_QPU_A_FCMP) { ++ instr->alu.add.output_pack = (op >> 4) & 0x3; ++ } else { ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ } ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, ++ &instr->alu.add.b.unpack)) { ++ return false; ++ } ++ break; ++ ++ case V3D_QPU_A_FFLOOR: ++ case V3D_QPU_A_FROUND: ++ case V3D_QPU_A_FTRUNC: ++ case V3D_QPU_A_FCEIL: ++ case V3D_QPU_A_FDX: ++ case V3D_QPU_A_FDY: ++ instr->alu.add.output_pack = raddr_b & 0x3; ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ break; ++ ++ case V3D_QPU_A_FTOIN: ++ case V3D_QPU_A_FTOIZ: ++ case V3D_QPU_A_FTOUZ: ++ case V3D_QPU_A_FTOC: ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ ++ if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ break; ++ ++ case V3D_QPU_A_VFMIN: ++ case V3D_QPU_A_VFMAX: ++ unreachable("pending v71 update"); ++ if (!v3d_qpu_float16_unpack_unpack(op & 0x7, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; ++ break; ++ ++ case V3D_QPU_A_MOV: ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ ++ if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ break; ++ ++ case V3D_QPU_A_FMOV: ++ instr->alu.add.output_pack = raddr_b & 0x3; ++ ++ /* Mul alu FMOV has one additional variant */ ++ int32_t unpack = (raddr_b >> 2) & 0x7; ++ if (unpack == 7) ++ return false; ++ ++ if (!v3d_qpu_float32_unpack_unpack(unpack, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ break; ++ ++ default: ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; ++ break; ++ } ++ ++ instr->alu.add.a.raddr = raddr_a; ++ instr->alu.add.b.raddr = raddr_b; ++ instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); ++ ++ instr->alu.add.magic_write = false; ++ if (packed_inst & V3D_QPU_MA) { ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_LDVPMV_IN: ++ instr->alu.add.op = V3D_QPU_A_LDVPMV_OUT; ++ break; ++ case V3D_QPU_A_LDVPMD_IN: ++ instr->alu.add.op = V3D_QPU_A_LDVPMD_OUT; ++ break; ++ case 
V3D_QPU_A_LDVPMG_IN: ++ instr->alu.add.op = V3D_QPU_A_LDVPMG_OUT; ++ break; ++ default: ++ instr->alu.add.magic_write = true; ++ break; ++ } ++ } ++ ++ return true; ++} ++ ++static bool ++v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_qpu_add_unpack(devinfo, packed_inst, instr); ++ else ++ return v3d71_qpu_add_unpack(devinfo, packed_inst, instr); ++} ++ ++static bool ++v3d33_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) + { + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); + uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_A); +@@ -901,9 +1405,10 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + + { + const struct opcode_desc *desc = +- lookup_opcode_from_packed(devinfo, mul_ops, +- ARRAY_SIZE(mul_ops), +- op, mux_a, mux_b); ++ lookup_opcode_from_packed(devinfo, ++ mul_ops_v33, ++ ARRAY_SIZE(mul_ops_v33), ++ op, mux_a, mux_b, 0); + if (!desc) + return false; + +@@ -915,12 +1420,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, +- &instr->alu.mul.a_unpack)) { ++ &instr->alu.mul.a.unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, +- &instr->alu.mul.b_unpack)) { ++ &instr->alu.mul.b.unpack)) { + return false; + } + +@@ -931,7 +1436,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + ((mux_b >> 2) & 1)); + + if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3, +- &instr->alu.mul.a_unpack)) { ++ &instr->alu.mul.a.unpack)) { + return false; + } + +@@ -941,74 +1446,169 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7, +- &instr->alu.mul.a_unpack)) { ++ &instr->alu.mul.a.unpack)) { + return false; + } + +- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + + break; + + default: + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; +- instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE; +- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + break; + } + +- instr->alu.mul.a = mux_a; +- instr->alu.mul.b = mux_b; ++ instr->alu.mul.a.mux = mux_a; ++ instr->alu.mul.b.mux = mux_b; + instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); + instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; + + return true; + } + +-static const struct opcode_desc * +-lookup_opcode_from_instr(const struct v3d_device_info *devinfo, +- const struct opcode_desc *opcodes, size_t num_opcodes, +- uint8_t op) ++static bool ++v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) + { +- for (int i = 0; i < num_opcodes; i++) { +- const struct opcode_desc *op_desc = &opcodes[i]; +- +- if (op_desc->op != op) +- continue; ++ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); ++ uint32_t raddr_c = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_C); ++ uint32_t raddr_d = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_D); + +- if (opcode_invalid_in_version(devinfo, op_desc)) +- continue; ++ { ++ const struct opcode_desc *desc = ++ lookup_opcode_from_packed(devinfo, ++ 
mul_ops_v71, ++ ARRAY_SIZE(mul_ops_v71), ++ op, 0, 0, ++ raddr_d); ++ if (!desc) ++ return false; + +- return op_desc; ++ instr->alu.mul.op = desc->op; + } + +- return NULL; +-} +- ++ switch (instr->alu.mul.op) { ++ case V3D_QPU_M_FMUL: ++ instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1; ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, ++ &instr->alu.mul.a.unpack)) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, ++ &instr->alu.mul.b.unpack)) { ++ return false; ++ } ++ ++ break; ++ ++ case V3D_QPU_M_FMOV: ++ instr->alu.mul.output_pack = raddr_d & 0x3; ++ ++ if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7, ++ &instr->alu.mul.a.unpack)) { ++ return false; ++ } ++ ++ break; ++ ++ case V3D_QPU_M_VFMUL: ++ unreachable("pending v71 update"); ++ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; ++ ++ if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7, ++ &instr->alu.mul.a.unpack)) { ++ return false; ++ } ++ ++ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; ++ ++ break; ++ ++ case V3D_QPU_M_MOV: ++ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; ++ ++ if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7, ++ &instr->alu.mul.a.unpack)) { ++ return false; ++ } ++ break; ++ ++ default: ++ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; ++ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; ++ break; ++ } ++ ++ instr->alu.mul.a.raddr = raddr_c; ++ instr->alu.mul.b.raddr = raddr_d; ++ instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); ++ instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; ++ ++ return true; ++} ++ + static bool +-v3d_qpu_add_pack(const struct v3d_device_info *devinfo, +- const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_qpu_mul_unpack(devinfo, packed_inst, instr); ++ else ++ return v3d71_qpu_mul_unpack(devinfo, packed_inst, instr); ++} ++ ++static const struct opcode_desc * ++lookup_opcode_from_instr(const struct v3d_device_info *devinfo, ++ const struct opcode_desc *opcodes, size_t num_opcodes, ++ uint8_t op) ++{ ++ for (int i = 0; i < num_opcodes; i++) { ++ const struct opcode_desc *op_desc = &opcodes[i]; ++ ++ if (op_desc->op != op) ++ continue; ++ ++ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver)) ++ continue; ++ ++ return op_desc; ++ } ++ ++ return NULL; ++} ++ ++static bool ++v3d33_qpu_add_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) + { + uint32_t waddr = instr->alu.add.waddr; +- uint32_t mux_a = instr->alu.add.a; +- uint32_t mux_b = instr->alu.add.b; ++ uint32_t mux_a = instr->alu.add.a.mux; ++ uint32_t mux_b = instr->alu.add.b.mux; + int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op); + const struct opcode_desc *desc = +- lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops), ++ lookup_opcode_from_instr(devinfo, add_ops_v33, ++ ARRAY_SIZE(add_ops_v33), + instr->alu.add.op); + + if (!desc) + return false; + +- uint32_t opcode = desc->opcode_first; ++ uint32_t opcode = opcode = desc->opcode_first; + + /* If an operation doesn't use an arg, its mux values may be used to + * identify the operation type. 
+ */ + if (nsrc < 2) +- mux_b = ffs(desc->mux_b_mask) - 1; ++ mux_b = ffs(desc->mux.b_mask) - 1; + + if (nsrc < 1) +- mux_a = ffs(desc->mux_a_mask) - 1; ++ mux_a = ffs(desc->mux.a_mask) - 1; + + bool no_magic_write = false; + +@@ -1061,12 +1661,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + } + opcode |= output_pack << 4; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &a_unpack)) { + return false; + } + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, + &b_unpack)) { + return false; + } +@@ -1100,23 +1700,23 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + uint32_t a_unpack; + uint32_t b_unpack; + +- if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS || +- instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) { ++ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS || ++ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) { + return false; + } + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &a_unpack)) { + return false; + } + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, + &b_unpack)) { + return false; + } + +- opcode = (opcode & ~(1 << 2)) | (a_unpack << 2); +- opcode = (opcode & ~(1 << 0)) | (b_unpack << 0); ++ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2); ++ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0); + + break; + } +@@ -1135,13 +1735,13 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + } + mux_b |= packed; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + if (packed == 0) + return false; +- opcode = (opcode & ~(1 << 2)) | packed << 2; ++ opcode = (opcode & ~(0x3 << 2)) | packed << 2; + break; + } + +@@ -1153,7 +1753,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + return false; + + uint32_t packed; +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } +@@ -1166,11 +1766,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || +- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) { ++ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) { + return false; + } + +- if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } +@@ -1180,8 +1780,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + default: + if (instr->alu.add.op != V3D_QPU_A_NOP && + (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || +- instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE || +- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) { ++ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE || ++ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) { + return false; + } + break; +@@ -1198,15 +1798,280 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + } + + static bool +-v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, +- const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++{ ++ uint32_t waddr = instr->alu.add.waddr; ++ 
uint32_t raddr_a = instr->alu.add.a.raddr; ++ uint32_t raddr_b = instr->alu.add.b.raddr; ++ ++ int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op); ++ const struct opcode_desc *desc = ++ lookup_opcode_from_instr(devinfo, add_ops_v71, ++ ARRAY_SIZE(add_ops_v71), ++ instr->alu.add.op); ++ if (!desc) ++ return false; ++ ++ uint32_t opcode = opcode = desc->opcode_first; ++ ++ /* If an operation doesn't use an arg, its raddr values may be used to ++ * identify the operation type. ++ */ ++ if (nsrc < 2) ++ raddr_b = ffsll(desc->raddr_mask) - 1; ++ ++ bool no_magic_write = false; ++ ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_STVPMV: ++ waddr = 0; ++ no_magic_write = true; ++ break; ++ case V3D_QPU_A_STVPMD: ++ waddr = 1; ++ no_magic_write = true; ++ break; ++ case V3D_QPU_A_STVPMP: ++ waddr = 2; ++ no_magic_write = true; ++ break; ++ ++ case V3D_QPU_A_LDVPMV_IN: ++ case V3D_QPU_A_LDVPMD_IN: ++ case V3D_QPU_A_LDVPMP: ++ case V3D_QPU_A_LDVPMG_IN: ++ assert(!instr->alu.add.magic_write); ++ break; ++ ++ case V3D_QPU_A_LDVPMV_OUT: ++ case V3D_QPU_A_LDVPMD_OUT: ++ case V3D_QPU_A_LDVPMG_OUT: ++ assert(!instr->alu.add.magic_write); ++ *packed_instr |= V3D_QPU_MA; ++ break; ++ ++ default: ++ break; ++ } ++ ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_FADD: ++ case V3D_QPU_A_FADDNF: ++ case V3D_QPU_A_FSUB: ++ case V3D_QPU_A_FMIN: ++ case V3D_QPU_A_FMAX: ++ case V3D_QPU_A_FCMP: { ++ uint32_t output_pack; ++ uint32_t a_unpack; ++ uint32_t b_unpack; ++ ++ if (instr->alu.add.op != V3D_QPU_A_FCMP) { ++ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, ++ &output_pack)) { ++ return false; ++ } ++ opcode |= output_pack << 4; ++ } ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &a_unpack)) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, ++ &b_unpack)) { ++ return false; ++ } ++ ++ /* These operations with commutative operands are ++ * distinguished by the order of the operands come in. ++ */ ++ bool ordering = ++ instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a > ++ instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b; ++ if (((instr->alu.add.op == V3D_QPU_A_FMIN || ++ instr->alu.add.op == V3D_QPU_A_FADD) && ordering) || ++ ((instr->alu.add.op == V3D_QPU_A_FMAX || ++ instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) { ++ uint32_t temp; ++ ++ temp = a_unpack; ++ a_unpack = b_unpack; ++ b_unpack = temp; ++ ++ temp = raddr_a; ++ raddr_a = raddr_b; ++ raddr_b = temp; ++ ++ /* If we are swapping raddr_a/b we also need to swap ++ * small_imm_a/b. 
++ */ ++ if (instr->sig.small_imm_a || instr->sig.small_imm_b) { ++ assert(instr->sig.small_imm_a != ++ instr->sig.small_imm_b); ++ struct v3d_qpu_sig new_sig = instr->sig; ++ new_sig.small_imm_a = !instr->sig.small_imm_a; ++ new_sig.small_imm_b = !instr->sig.small_imm_b; ++ uint32_t sig; ++ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) ++ return false; ++ *packed_instr &= ~V3D_QPU_SIG_MASK; ++ *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); ++ } ++ } ++ ++ opcode |= a_unpack << 2; ++ opcode |= b_unpack << 0; ++ ++ break; ++ } ++ ++ case V3D_QPU_A_VFPACK: { ++ uint32_t a_unpack; ++ uint32_t b_unpack; ++ ++ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS || ++ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &a_unpack)) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, ++ &b_unpack)) { ++ return false; ++ } ++ ++ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2); ++ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0); ++ ++ break; ++ } ++ ++ case V3D_QPU_A_FFLOOR: ++ case V3D_QPU_A_FROUND: ++ case V3D_QPU_A_FTRUNC: ++ case V3D_QPU_A_FCEIL: ++ case V3D_QPU_A_FDX: ++ case V3D_QPU_A_FDY: { ++ uint32_t packed; ++ ++ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, ++ &packed)) { ++ return false; ++ } ++ raddr_b |= packed; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ if (packed == 0) ++ return false; ++ raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2; ++ break; ++ } ++ ++ case V3D_QPU_A_FTOIN: ++ case V3D_QPU_A_FTOIZ: ++ case V3D_QPU_A_FTOUZ: ++ case V3D_QPU_A_FTOC: ++ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE) ++ return false; ++ ++ uint32_t packed; ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ if (packed == 0) ++ return false; ++ ++ raddr_b |= (raddr_b & ~(0x3 << 2)) | packed << 2; ++ ++ break; ++ ++ case V3D_QPU_A_VFMIN: ++ case V3D_QPU_A_VFMAX: ++ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || ++ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ opcode |= packed; ++ break; ++ ++ case V3D_QPU_A_MOV: { ++ uint32_t packed; ++ ++ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE) ++ return false; ++ ++ if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ ++ raddr_b |= packed << 2; ++ break; ++ } ++ ++ case V3D_QPU_A_FMOV: { ++ uint32_t packed; ++ ++ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, ++ &packed)) { ++ return false; ++ } ++ raddr_b = packed; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ raddr_b |= packed << 2; ++ break; ++ } ++ ++ default: ++ if (instr->alu.add.op != V3D_QPU_A_NOP && ++ (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || ++ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE || ++ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) { ++ return false; ++ } ++ break; ++ } ++ ++ *packed_instr |= QPU_SET_FIELD(raddr_a, V3D_QPU_RADDR_A); ++ *packed_instr |= QPU_SET_FIELD(raddr_b, V3D_QPU_RADDR_B); ++ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_ADD); ++ *packed_instr |= QPU_SET_FIELD(waddr, V3D_QPU_WADDR_A); ++ if (instr->alu.add.magic_write && !no_magic_write) ++ *packed_instr |= V3D_QPU_MA; ++ ++ return true; ++} ++ ++static bool ++v3d33_qpu_mul_pack(const struct 
v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) + { +- uint32_t mux_a = instr->alu.mul.a; +- uint32_t mux_b = instr->alu.mul.b; ++ uint32_t mux_a = instr->alu.mul.a.mux; ++ uint32_t mux_b = instr->alu.mul.b.mux; + int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); + + const struct opcode_desc *desc = +- lookup_opcode_from_instr(devinfo, mul_ops, ARRAY_SIZE(mul_ops), ++ lookup_opcode_from_instr(devinfo, mul_ops_v33, ++ ARRAY_SIZE(mul_ops_v33), + instr->alu.mul.op); + + if (!desc) +@@ -1218,10 +2083,10 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + * that here. If mux a/b determine packing, it will be set below. + */ + if (nsrc < 2) +- mux_b = ffs(desc->mux_b_mask) - 1; ++ mux_b = ffs(desc->mux.b_mask) - 1; + + if (nsrc < 1) +- mux_a = ffs(desc->mux_a_mask) - 1; ++ mux_a = ffs(desc->mux.a_mask) - 1; + + switch (instr->alu.mul.op) { + case V3D_QPU_M_FMUL: { +@@ -1236,13 +2101,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + */ + opcode += packed << 4; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + opcode |= packed << 2; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack, + &packed)) { + return false; + } +@@ -1260,7 +2125,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + opcode |= (packed >> 1) & 1; + mux_b = (packed & 1) << 2; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } +@@ -1274,22 +2139,28 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) + return false; + +- if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack, ++ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } +- if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16) ++ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16) + opcode = 8; + else + opcode |= (packed + 4) & 7; + +- if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) ++ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) + return false; + + break; + } + + default: ++ if (instr->alu.mul.op != V3D_QPU_M_NOP && ++ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE || ++ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || ++ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) { ++ return false; ++ } + break; + } + +@@ -1304,6 +2175,150 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + return true; + } + ++static bool ++v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++{ ++ uint32_t raddr_c = instr->alu.mul.a.raddr; ++ uint32_t raddr_d = instr->alu.mul.b.raddr; ++ int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); ++ ++ const struct opcode_desc *desc = ++ lookup_opcode_from_instr(devinfo, mul_ops_v71, ++ ARRAY_SIZE(mul_ops_v71), ++ instr->alu.mul.op); ++ if (!desc) ++ return false; ++ ++ uint32_t opcode = desc->opcode_first; ++ ++ /* Some opcodes have a single valid value for their raddr_d, so set ++ * that here. If raddr_b determine packing, it will be set below. 
++ */ ++ if (nsrc < 2) ++ raddr_d = ffsll(desc->raddr_mask) - 1; ++ ++ switch (instr->alu.mul.op) { ++ case V3D_QPU_M_FMUL: { ++ uint32_t packed; ++ ++ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack, ++ &packed)) { ++ return false; ++ } ++ /* No need for a +1 because desc->opcode_first has a 1 in this ++ * field. ++ */ ++ opcode += packed << 4; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, ++ &packed)) { ++ return false; ++ } ++ opcode |= packed << 2; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack, ++ &packed)) { ++ return false; ++ } ++ opcode |= packed << 0; ++ break; ++ } ++ ++ case V3D_QPU_M_FMOV: { ++ uint32_t packed; ++ ++ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack, ++ &packed)) { ++ return false; ++ } ++ raddr_d |= packed; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, ++ &packed)) { ++ return false; ++ } ++ raddr_d |= packed << 2; ++ break; ++ } ++ ++ case V3D_QPU_M_VFMUL: { ++ unreachable("pending v71 update"); ++ uint32_t packed; ++ ++ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) ++ return false; ++ ++ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack, ++ &packed)) { ++ return false; ++ } ++ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16) ++ opcode = 8; ++ else ++ opcode |= (packed + 4) & 7; ++ ++ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) ++ return false; ++ ++ break; ++ } ++ ++ case V3D_QPU_M_MOV: { ++ uint32_t packed; ++ ++ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) ++ return false; ++ ++ if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack, ++ &packed)) { ++ return false; ++ } ++ ++ raddr_d |= packed << 2; ++ break; ++ } ++ ++ default: ++ if (instr->alu.mul.op != V3D_QPU_M_NOP && ++ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE || ++ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || ++ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) { ++ return false; ++ } ++ break; ++ } ++ ++ *packed_instr |= QPU_SET_FIELD(raddr_c, V3D_QPU_RADDR_C); ++ *packed_instr |= QPU_SET_FIELD(raddr_d, V3D_QPU_RADDR_D); ++ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_MUL); ++ *packed_instr |= QPU_SET_FIELD(instr->alu.mul.waddr, V3D_QPU_WADDR_M); ++ if (instr->alu.mul.magic_write) ++ *packed_instr |= V3D_QPU_MM; ++ ++ return true; ++} ++ ++static bool ++v3d_qpu_add_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_qpu_add_pack(devinfo, instr, packed_instr); ++ else ++ return v3d71_qpu_add_pack(devinfo, instr, packed_instr); ++} ++ ++static bool ++v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_qpu_mul_pack(devinfo, instr, packed_instr); ++ else ++ return v3d71_qpu_mul_pack(devinfo, instr, packed_instr); ++} ++ + static bool + v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, + uint64_t packed_instr, +@@ -1332,8 +2347,14 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, + return false; + } + +- instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); +- instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); ++ if (devinfo->ver <= 71) { ++ /* ++ * For v71 this will be set on add/mul unpack, as raddr are now ++ * part of v3d_qpu_input ++ */ ++ instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); ++ instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); ++ } + + if (!v3d_qpu_add_unpack(devinfo, 
packed_instr, instr)) + return false; +@@ -1419,8 +2440,14 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo, + *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); + + if (instr->type == V3D_QPU_INSTR_TYPE_ALU) { +- *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); +- *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); ++ if (devinfo->ver < 71) { ++ /* ++ * For v71 this will be set on add/mul unpack, as raddr are now ++ * part of v3d_qpu_input ++ */ ++ *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); ++ *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); ++ } + + if (!v3d_qpu_add_pack(devinfo, instr, packed_instr)) + return false; +diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c +index 2f8e19c73fed..be7b78d5ef00 100644 +--- a/src/broadcom/qpu/tests/qpu_disasm.c ++++ b/src/broadcom/qpu/tests/qpu_disasm.c +@@ -160,10 +160,10 @@ main(int argc, char **argv) + /* Swap the operands to be sure that we test + * how the QPUs distinguish between these ops. + */ +- swap_mux(&instr.alu.add.a, +- &instr.alu.add.b); +- swap_pack(&instr.alu.add.a_unpack, +- &instr.alu.add.b_unpack); ++ swap_mux(&instr.alu.add.a.mux, ++ &instr.alu.add.b.mux); ++ swap_pack(&instr.alu.add.a.unpack, ++ &instr.alu.add.b.unpack); + break; + default: + break; +diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c +index eea5d3f050ea..c3802dd78575 100644 +--- a/src/broadcom/simulator/v3d_simulator.c ++++ b/src/broadcom/simulator/v3d_simulator.c +@@ -92,6 +92,9 @@ static struct v3d_simulator_state { + /** Last performance monitor ID. */ + uint32_t last_perfid; + ++ /** Total performance counters */ ++ uint32_t perfcnt_total; ++ + struct util_dynarray bin_oom; + int refcount; + } sim_state = { +@@ -436,15 +439,15 @@ v3d_simulator_perfmon_switch(int fd, uint32_t perfid) + + perfmon = v3d_get_simulator_perfmon(fd, file->active_perfid); + if (perfmon) +- v3d41_simulator_perfmon_stop(sim_state.v3d, +- perfmon->ncounters, +- perfmon->values); ++ v3d_X_simulator(perfmon_stop)(sim_state.v3d, ++ perfmon->ncounters, ++ perfmon->values); + + perfmon = v3d_get_simulator_perfmon(fd, perfid); + if (perfmon) +- v3d41_simulator_perfmon_start(sim_state.v3d, +- perfmon->ncounters, +- perfmon->counters); ++ v3d_X_simulator(perfmon_start)(sim_state.v3d, ++ perfmon->ncounters, ++ perfmon->counters); + + file->active_perfid = perfid; + } +@@ -489,11 +492,7 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit) + bin_fd = fd; + + v3d_simulator_perfmon_switch(fd, submit->perfmon_id); +- +- if (sim_state.ver >= 41) +- v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); +- else +- v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); ++ v3d_X_simulator(submit_cl_ioctl)(sim_state.v3d, submit, file->gmp->ofs); + + util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *, + sim_bo) { +@@ -632,15 +631,6 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args) + return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args); + } + +-static int +-v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args) +-{ +- if (sim_state.ver >= 41) +- return v3d41_simulator_get_param_ioctl(sim_state.v3d, args); +- else +- return v3d33_simulator_get_param_ioctl(sim_state.v3d, args); +-} +- + static int + v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args) + { +@@ -652,10 +642,7 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct 
drm_v3d_submit_tfu *args) + v3d_simulator_copy_in_handle(file, args->bo_handles[2]); + v3d_simulator_copy_in_handle(file, args->bo_handles[3]); + +- if (sim_state.ver >= 41) +- ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args); +- else +- ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args); ++ ret = v3d_X_simulator(submit_tfu_ioctl)(sim_state.v3d, args); + + v3d_simulator_copy_out_handle(file, args->bo_handles[0]); + +@@ -682,11 +669,8 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args) + + v3d_simulator_perfmon_switch(fd, args->perfmon_id); + +- if (sim_state.ver >= 41) +- ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args, +- file->gmp->ofs); +- else +- ret = -1; ++ ret = v3d_X_simulator(submit_csd_ioctl)(sim_state.v3d, args, ++ file->gmp->ofs); + + for (int i = 0; i < args->bo_handle_count; i++) + v3d_simulator_copy_out_handle(file, bo_handles[i]); +@@ -716,7 +700,7 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args) + + perfmon->ncounters = args->ncounters; + for (int i = 0; i < args->ncounters; i++) { +- if (args->counters[i] >= V3D_PERFCNT_NUM) { ++ if (args->counters[i] >= sim_state.perfcnt_total) { + ralloc_free(perfmon); + return -EINVAL; + } else { +@@ -797,7 +781,7 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args) + return 0; + + case DRM_IOCTL_V3D_GET_PARAM: +- return v3d_simulator_get_param_ioctl(fd, args); ++ return v3d_X_simulator(get_param_ioctl)(sim_state.v3d, args); + + case DRM_IOCTL_GEM_CLOSE: + return v3d_simulator_gem_close_ioctl(fd, args); +@@ -880,10 +864,8 @@ v3d_simulator_init_global() + + util_dynarray_init(&sim_state.bin_oom, NULL); + +- if (sim_state.ver >= 41) +- v3d41_simulator_init_regs(sim_state.v3d); +- else +- v3d33_simulator_init_regs(sim_state.v3d); ++ v3d_X_simulator(init_regs)(sim_state.v3d); ++ v3d_X_simulator(get_perfcnt_total)(&sim_state.perfcnt_total); + } + + struct v3d_simulator_file * +diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h +index ddb079c14559..923056344687 100644 +--- a/src/broadcom/simulator/v3d_simulator.h ++++ b/src/broadcom/simulator/v3d_simulator.h +@@ -52,6 +52,32 @@ uint32_t v3d_simulator_get_mem_free(void); + # define v3dX(x) v3d41_##x + # include "v3dx_simulator.h" + # undef v3dX ++ ++# define v3dX(x) v3d71_##x ++# include "v3dx_simulator.h" ++# undef v3dX ++ + #endif + ++/* Helper to call simulator ver specific functions */ ++#define v3d_X_simulator(thing) ({ \ ++ __typeof(&v3d33_simulator_##thing) v3d_X_sim_thing;\ ++ switch (sim_state.ver) { \ ++ case 33: \ ++ case 40: \ ++ v3d_X_sim_thing = &v3d33_simulator_##thing; \ ++ break; \ ++ case 41: \ ++ case 42: \ ++ v3d_X_sim_thing = &v3d41_simulator_##thing; \ ++ break; \ ++ case 71: \ ++ v3d_X_sim_thing = &v3d71_simulator_##thing; \ ++ break; \ ++ default: \ ++ unreachable("Unsupported hardware generation"); \ ++ } \ ++ v3d_X_sim_thing; \ ++}) ++ + #endif +diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c +index c9322f0397b8..904cf2d1b764 100644 +--- a/src/broadcom/simulator/v3dx_simulator.c ++++ b/src/broadcom/simulator/v3dx_simulator.c +@@ -40,17 +40,23 @@ + #include "v3d_simulator.h" + #include "v3d_simulator_wrapper.h" + ++#include "common/v3d_performance_counters.h" ++ + #include "util/macros.h" + #include "util/bitscan.h" + #include "drm-uapi/v3d_drm.h" + + #define HW_REGISTER_RO(x) (x) + #define HW_REGISTER_RW(x) (x) +-#if V3D_VERSION >= 41 +-#include 
"libs/core/v3d/registers/4.1.35.0/v3d.h" ++#if V3D_VERSION == 71 ++#include "libs/core/v3d/registers/7.1.6.0/v3d.h" ++#else ++#if V3D_VERSION == 41 || V3D_VERSION == 42 ++#include "libs/core/v3d/registers/4.2.14.0/v3d.h" + #else + #include "libs/core/v3d/registers/3.3.0.0/v3d.h" + #endif ++#endif + + #define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val) + #define V3D_READ(reg) v3d_hw_read_reg(v3d, reg) +@@ -178,38 +184,48 @@ v3d_flush_caches(struct v3d_hw *v3d) + v3d_flush_l2t(v3d); + } + ++#if V3D_VERSION < 71 ++#define TFU_REG(NAME) V3D_TFU_ ## NAME ++#else ++#define TFU_REG(NAME) V3D_IFC_ ## NAME ++#endif ++ ++ + int + v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d, + struct drm_v3d_submit_tfu *args) + { +- int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET; +- +- V3D_WRITE(V3D_TFU_IIA, args->iia); +- V3D_WRITE(V3D_TFU_IIS, args->iis); +- V3D_WRITE(V3D_TFU_ICA, args->ica); +- V3D_WRITE(V3D_TFU_IUA, args->iua); +- V3D_WRITE(V3D_TFU_IOA, args->ioa); +- V3D_WRITE(V3D_TFU_IOS, args->ios); +- V3D_WRITE(V3D_TFU_COEF0, args->coef[0]); +- V3D_WRITE(V3D_TFU_COEF1, args->coef[1]); +- V3D_WRITE(V3D_TFU_COEF2, args->coef[2]); +- V3D_WRITE(V3D_TFU_COEF3, args->coef[3]); +- +- V3D_WRITE(V3D_TFU_ICFG, args->icfg); +- +- while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { ++ int last_vtct = V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET; ++ ++ V3D_WRITE(TFU_REG(IIA), args->iia); ++ V3D_WRITE(TFU_REG(IIS), args->iis); ++ V3D_WRITE(TFU_REG(ICA), args->ica); ++ V3D_WRITE(TFU_REG(IUA), args->iua); ++ V3D_WRITE(TFU_REG(IOA), args->ioa); ++#if V3D_VERSION >= 71 ++ V3D_WRITE(TFU_REG(IOC), args->v71.ioc); ++#endif ++ V3D_WRITE(TFU_REG(IOS), args->ios); ++ V3D_WRITE(TFU_REG(COEF0), args->coef[0]); ++ V3D_WRITE(TFU_REG(COEF1), args->coef[1]); ++ V3D_WRITE(TFU_REG(COEF2), args->coef[2]); ++ V3D_WRITE(TFU_REG(COEF3), args->coef[3]); ++ ++ V3D_WRITE(TFU_REG(ICFG), args->icfg); ++ ++ while ((V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { + v3d_hw_tick(v3d); + } + + return 0; + } + +-#if V3D_VERSION >= 41 + int + v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, + struct drm_v3d_submit_csd *args, + uint32_t gmp_ofs) + { ++#if V3D_VERSION >= 41 + int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) & + V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET); + g_gmp_ofs = gmp_ofs; +@@ -223,6 +239,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, + V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]); + V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]); + V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]); ++#if V3D_VERSION >= 71 ++ V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0); ++#endif + /* CFG0 kicks off the job */ + V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]); + +@@ -239,8 +258,10 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, + v3d_flush_caches(v3d); + + return 0; +-} ++#else ++ return -1; + #endif ++} + + int + v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d, +@@ -310,16 +331,17 @@ v3d_isr_core(struct v3d_hw *v3d, + return; + } + ++#if V3D_VERSION <= 42 + if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) { + fprintf(stderr, "GMP violation at 0x%08x\n", + V3D_READ(V3D_GMP_VIO_ADDR)); +- abort(); + } else { + fprintf(stderr, + "Unexpected ISR with core status 0x%08x\n", + core_status); + } + abort(); ++#endif + } + + static void +@@ -396,6 +418,18 @@ v3d_isr_hub(struct v3d_hw *v3d) + } + + handle_mmu_interruptions(v3d, hub_status); ++ ++#if V3D_VERSION == 71 ++ if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) { ++ fprintf(stderr, "GMP violation at 0x%08x\n", ++ 
V3D_READ(V3D_GMP_VIO_ADDR)); ++ } else { ++ fprintf(stderr, ++ "Unexpected ISR with status 0x%08x\n", ++ hub_status); ++ } ++ abort(); ++#endif + } + + static void +@@ -436,8 +470,11 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d) + * for tracing. Perhaps we should evaluate to do the same here and add + * some debug options. + */ +- uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET | +- V3D_CTL_0_INT_STS_INT_OUTOMEM_SET); ++ uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET; ++#if V3D_VERSION <= 42 ++ core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET; ++#endif ++ + V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts); + V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts); + +@@ -447,6 +484,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d) + V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */ + V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */ + ++#if V3D_VERSION == 71 ++ hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET; ++#endif + V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts); + V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts); + +@@ -509,7 +549,8 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d, + #define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x)) + #define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8) + #define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \ +- V3D_PCTR_0_SRC_N_SHIFT(x) + 6)) ++ V3D_PCTR_0_SRC_N_SHIFT(x) + \ ++ V3D_PCTR_0_SRC_0_3_PCTRS0_MSB)) + #endif + + void +@@ -549,4 +590,9 @@ void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d, + #endif + } + ++void v3dX(simulator_get_perfcnt_total)(uint32_t *count) ++{ ++ *count = ARRAY_SIZE(v3d_performance_counters); ++} ++ + #endif /* USE_V3D_SIMULATOR */ +diff --git a/src/broadcom/simulator/v3dx_simulator.h b/src/broadcom/simulator/v3dx_simulator.h +index f7d2cc67b03a..51fc2409d3e2 100644 +--- a/src/broadcom/simulator/v3dx_simulator.h ++++ b/src/broadcom/simulator/v3dx_simulator.h +@@ -50,3 +50,4 @@ void v3dX(simulator_perfmon_start)(struct v3d_hw *v3d, + void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d, + uint32_t ncounters, + uint64_t *values); ++void v3dX(simulator_get_perfcnt_total)(uint32_t *count); +diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build +index ad032d832ad5..182388a35b4d 100644 +--- a/src/broadcom/vulkan/meson.build ++++ b/src/broadcom/vulkan/meson.build +@@ -27,6 +27,7 @@ v3dv_entrypoints = custom_target( + '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv', + '--beta', with_vulkan_beta.to_string(), + '--device-prefix', 'ver42', ++ '--device-prefix', 'ver71', + ], + depend_files : vk_entrypoints_gen_depend_files, + ) +@@ -64,13 +65,11 @@ files_per_version = files( + 'v3dvx_pipeline.c', + 'v3dvx_meta_common.c', + 'v3dvx_pipeline.c', ++ 'v3dvx_query.c', + 'v3dvx_queue.c', + ) + +-# The vulkan driver only supports version >= 42, which is the version present in +-# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d +-# driver. 
+-v3d_versions = ['42'] ++v3d_versions = ['42', '71'] + + v3dv_flags = [] + +diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c +index 96360a96b448..609c7acfa8f9 100644 +--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c +@@ -348,6 +348,7 @@ job_compute_frame_tiling(struct v3dv_job *job, + uint32_t layers, + uint32_t render_target_count, + uint8_t max_internal_bpp, ++ uint8_t total_color_bpp, + bool msaa, + bool double_buffer) + { +@@ -360,13 +361,16 @@ job_compute_frame_tiling(struct v3dv_job *job, + tiling->render_target_count = render_target_count; + tiling->msaa = msaa; + tiling->internal_bpp = max_internal_bpp; ++ tiling->total_color_bpp = total_color_bpp; + tiling->double_buffer = double_buffer; + + /* Double-buffer is incompatible with MSAA */ + assert(!tiling->msaa || !tiling->double_buffer); + +- v3d_choose_tile_size(render_target_count, max_internal_bpp, +- tiling->msaa, tiling->double_buffer, ++ v3d_choose_tile_size(&job->device->devinfo, ++ render_target_count, ++ max_internal_bpp, total_color_bpp, msaa, ++ tiling->double_buffer, + &tiling->tile_width, &tiling->tile_height); + + tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width); +@@ -457,6 +461,7 @@ v3dv_job_start_frame(struct v3dv_job *job, + bool allocate_tile_state_now, + uint32_t render_target_count, + uint8_t max_internal_bpp, ++ uint8_t total_color_bpp, + bool msaa) + { + assert(job); +@@ -467,7 +472,7 @@ v3dv_job_start_frame(struct v3dv_job *job, + const struct v3dv_frame_tiling *tiling = + job_compute_frame_tiling(job, width, height, layers, + render_target_count, max_internal_bpp, +- msaa, false); ++ total_color_bpp, msaa, false); + + v3dv_cl_ensure_space_with_branch(&job->bcl, 256); + v3dv_return_if_oom(NULL, job); +@@ -528,6 +533,7 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer) + job->frame_tiling.layers, + job->frame_tiling.render_target_count, + job->frame_tiling.internal_bpp, ++ job->frame_tiling.total_color_bpp, + job->frame_tiling.msaa, + true); + +@@ -1374,7 +1380,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) + } + + uint32_t att_count = 0; +- VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */ ++ VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */ + + /* We only need to emit subpass clears as draw calls for color attachments + * if the render area is not aligned to tile boundaries. 
+@@ -1672,10 +1678,11 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, + + const struct v3dv_framebuffer *framebuffer = state->framebuffer; + +- uint8_t internal_bpp; ++ uint8_t max_internal_bpp, total_color_bpp; + bool msaa; + v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa) +- (framebuffer, state->attachments, subpass, &internal_bpp, &msaa); ++ (framebuffer, state->attachments, subpass, ++ &max_internal_bpp, &total_color_bpp, &msaa); + + /* From the Vulkan spec: + * +@@ -1699,7 +1706,8 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, + layers, + true, false, + subpass->color_count, +- internal_bpp, ++ max_internal_bpp, ++ total_color_bpp, + msaa); + } + +@@ -2062,6 +2070,14 @@ cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer, + } + } + ++ if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BOUNDS)) { ++ if (memcmp(&dest->depth_bounds, &src->depth_bounds, ++ sizeof(src->depth_bounds))) { ++ memcpy(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds)); ++ dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS; ++ } ++ } ++ + if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) { + if (dest->line_width != src->line_width) { + dest->line_width = src->line_width; +@@ -2131,39 +2147,6 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer, + } + } + +-/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */ +-void +-v3dv_viewport_compute_xform(const VkViewport *viewport, +- float scale[3], +- float translate[3]) +-{ +- float x = viewport->x; +- float y = viewport->y; +- float half_width = 0.5f * viewport->width; +- float half_height = 0.5f * viewport->height; +- double n = viewport->minDepth; +- double f = viewport->maxDepth; +- +- scale[0] = half_width; +- translate[0] = half_width + x; +- scale[1] = half_height; +- translate[1] = half_height + y; +- +- scale[2] = (f - n); +- translate[2] = n; +- +- /* It seems that if the scale is small enough the hardware won't clip +- * correctly so we work around this my choosing the smallest scale that +- * seems to work. +- * +- * This case is exercised by CTS: +- * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero +- */ +- const float min_abs_scale = 0.000009f; +- if (fabs(scale[2]) < min_abs_scale) +- scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale; +-} +- + /* Considers the pipeline's negative_one_to_one state and applies it to the + * current viewport transform if needed to produce the resulting Z translate + * and scale parameters. 
+@@ -2216,9 +2199,10 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer, + viewportCount * sizeof(*pViewports)); + + for (uint32_t i = firstViewport; i < total_count; i++) { +- v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i], +- state->dynamic.viewport.scale[i], +- state->dynamic.viewport.translate[i]); ++ v3dv_X(cmd_buffer->device, viewport_compute_xform) ++ (&state->dynamic.viewport.viewports[i], ++ state->dynamic.viewport.scale[i], ++ state->dynamic.viewport.translate[i]); + } + + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT; +@@ -2699,6 +2683,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer) + true, false, + old_job->frame_tiling.render_target_count, + old_job->frame_tiling.internal_bpp, ++ old_job->frame_tiling.total_color_bpp, + true /* msaa */); + + v3dv_job_destroy(old_job); +@@ -2963,6 +2948,9 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer, + if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS)) + v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer); + ++ if (*dirty & V3DV_CMD_DIRTY_DEPTH_BOUNDS) ++ v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer); ++ + if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS)) + v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer); + +@@ -3392,9 +3380,11 @@ v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, + float minDepthBounds, + float maxDepthBounds) + { +- /* We do not support depth bounds testing so we just ignore this. We are +- * already asserting that pipelines don't enable the feature anyway. +- */ ++ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); ++ ++ cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds; ++ cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds; ++ cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS; + } + + VKAPI_ATTR void VKAPI_CALL +@@ -3826,6 +3816,7 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer) + + void + v3dv_cmd_buffer_rewrite_indirect_csd_job( ++ struct v3dv_device *device, + struct v3dv_csd_indirect_cpu_job_info *info, + const uint32_t *wg_counts) + { +@@ -3845,8 +3836,15 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job( + submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + +- submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) * +- (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; ++ uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) * ++ (wg_counts[0] * wg_counts[1] * wg_counts[2]); ++ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ ++ if (device->devinfo.ver < 71 || ++ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) { ++ submit->cfg[4] = num_batches - 1; ++ } else { ++ submit->cfg[4] = num_batches; ++ } + assert(submit->cfg[4] != ~0); + + if (info->needs_wg_uniform_rewrite) { +@@ -3879,6 +3877,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t **wg_uniform_offsets_out, + uint32_t *wg_size_out) + { ++ struct v3dv_device *device = cmd_buffer->device; + struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline; + assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); + struct v3dv_shader_variant *cs_variant = +@@ -3937,18 +3936,26 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, + if (wg_size_out) + *wg_size_out = wg_size; + +- submit->cfg[4] = num_batches - 1; ++ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ ++ if (device->devinfo.ver < 71 
|| ++ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) { ++ submit->cfg[4] = num_batches - 1; ++ } else { ++ submit->cfg[4] = num_batches; ++ } + assert(submit->cfg[4] != ~0); + + assert(pipeline->shared_data->assembly_bo); + struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo; + + submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset; +- submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; + if (cs_variant->prog_data.base->single_seg) + submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; + if (cs_variant->prog_data.base->threads == 4) + submit->cfg[5] |= V3D_CSD_CFG5_THREADING; ++ /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */ ++ if (device->devinfo.ver < 71) ++ submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; + + if (cs_variant->prog_data.cs->shared_size > 0) { + job->csd.shared_memory = +diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c +index d5de35176707..97eb220f5179 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -214,7 +214,7 @@ get_features(const struct v3dv_physical_device *physical_device, + *features = (struct vk_features) { + /* Vulkan 1.0 */ + .robustBufferAccess = true, /* This feature is mandatory */ +- .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */ ++ .fullDrawIndexUint32 = physical_device->devinfo.ver >= 71, + .imageCubeArray = true, + .independentBlend = true, + .geometryShader = true, +@@ -224,10 +224,10 @@ get_features(const struct v3dv_physical_device *physical_device, + .logicOp = true, + .multiDrawIndirect = false, + .drawIndirectFirstInstance = true, +- .depthClamp = false, /* Only available since V3D 4.5.1.1 */ ++ .depthClamp = physical_device->devinfo.ver >= 71, + .depthBiasClamp = true, + .fillModeNonSolid = true, +- .depthBounds = false, /* Only available since V3D 4.3.16.2 */ ++ .depthBounds = physical_device->devinfo.ver >= 71, + .wideLines = true, + .largePoints = true, + .alphaToOne = true, +@@ -304,7 +304,7 @@ get_features(const struct v3dv_physical_device *physical_device, + * problematic, we would always have to scalarize. Overall, this would + * not lead to best performance so let's just not support it. + */ +- .scalarBlockLayout = false, ++ .scalarBlockLayout = physical_device->devinfo.ver >= 71, + /* This tells applications 2 things: + * + * 1. If they can select just one aspect for barriers. 
For us barriers +@@ -1123,8 +1123,10 @@ create_physical_device(struct v3dv_instance *instance, + device->next_program_id = 0; + + ASSERTED int len = +- asprintf(&device->name, "V3D %d.%d", +- device->devinfo.ver / 10, device->devinfo.ver % 10); ++ asprintf(&device->name, "V3D %d.%d.%d", ++ device->devinfo.ver / 10, ++ device->devinfo.ver % 10, ++ device->devinfo.rev); + assert(len != -1); + + v3dv_physical_device_init_disk_cache(device); +@@ -1279,7 +1281,8 @@ enumerate_devices(struct vk_instance *vk_instance) + if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) { + char **compat = devices[i]->deviceinfo.platform->compatible; + while (*compat) { +- if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) { ++ if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 || ++ strncmp(*compat, "brcm,2712-v3d", 13) == 0) { + v3d_idx = i; + break; + } +@@ -1288,8 +1291,9 @@ enumerate_devices(struct vk_instance *vk_instance) + } else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) { + char **compat = devices[i]->deviceinfo.platform->compatible; + while (*compat) { +- if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 || +- strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) { ++ if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 || ++ strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 || ++ strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) { + vc4_idx = i; + break; + } +@@ -1326,6 +1330,8 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev) + switch (dev->devinfo.ver) { + case 42: + return 0xBE485FD3; /* Broadcom deviceID for 2711 */ ++ case 71: ++ return 0x55701C33; /* Broadcom deviceID for 2712 */ + default: + unreachable("Unsupported V3D version"); + } +@@ -1354,6 +1360,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + const VkSampleCountFlags supported_sample_counts = + VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT; + ++ const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver); ++ + struct timespec clock_res; + clock_getres(CLOCK_MONOTONIC, &clock_res); + const float timestamp_period = +@@ -1424,7 +1432,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + .maxFragmentInputComponents = max_varying_components, + .maxFragmentOutputAttachments = 4, + .maxFragmentDualSrcAttachments = 0, +- .maxFragmentCombinedOutputResources = MAX_RENDER_TARGETS + ++ .maxFragmentCombinedOutputResources = max_rts + + MAX_STORAGE_BUFFERS + + MAX_STORAGE_IMAGES, + +@@ -1437,7 +1445,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + .subPixelPrecisionBits = V3D_COORD_SHIFT, + .subTexelPrecisionBits = 8, + .mipmapPrecisionBits = 8, +- .maxDrawIndexedIndexValue = 0x00ffffff, ++ .maxDrawIndexedIndexValue = pdevice->devinfo.ver >= 71 ? 
++ 0xffffffff : 0x00ffffff, + .maxDrawIndirectCount = 0x7fffffff, + .maxSamplerLodBias = 14.0f, + .maxSamplerAnisotropy = 16.0f, +@@ -1464,7 +1473,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + .framebufferDepthSampleCounts = supported_sample_counts, + .framebufferStencilSampleCounts = supported_sample_counts, + .framebufferNoAttachmentsSampleCounts = supported_sample_counts, +- .maxColorAttachments = MAX_RENDER_TARGETS, ++ .maxColorAttachments = max_rts, + .sampledImageColorSampleCounts = supported_sample_counts, + .sampledImageIntegerSampleCounts = supported_sample_counts, + .sampledImageDepthSampleCounts = supported_sample_counts, +@@ -2031,7 +2040,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, + v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0, + device->instance->default_pipeline_cache_enabled); + device->default_attribute_float = +- v3dv_pipeline_create_default_attribute_values(device, NULL); ++ v3dv_X(device, create_default_attribute_values)(device, NULL); + + device->device_address_mem_ctx = ralloc_context(NULL); + util_dynarray_init(&device->device_address_bo_list, +@@ -2975,7 +2984,7 @@ v3dv_CreateSampler(VkDevice _device, + } + } + +- v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info); ++ v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info); + + *pSampler = v3dv_sampler_to_handle(sampler); + +diff --git a/src/broadcom/vulkan/v3dv_image.c b/src/broadcom/vulkan/v3dv_image.c +index ebbd60e4c03c..e01e2e1bd197 100644 +--- a/src/broadcom/vulkan/v3dv_image.c ++++ b/src/broadcom/vulkan/v3dv_image.c +@@ -671,7 +671,6 @@ create_image_view(struct v3dv_device *device, + * makes sense to implement swizzle composition using VkSwizzle directly. + */ + VkFormat format; +- uint8_t image_view_swizzle[4]; + if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT && + range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { + format = VK_FORMAT_R8G8B8A8_UINT; +@@ -682,11 +681,11 @@ create_image_view(struct v3dv_device *device, + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle); + + util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle, +- image_view_swizzle); ++ iview->view_swizzle); + } else { + format = pCreateInfo->format; + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, +- image_view_swizzle); ++ iview->view_swizzle); + } + + iview->vk.view_format = format; +@@ -711,7 +710,7 @@ create_image_view(struct v3dv_device *device, + + const uint8_t *format_swizzle = + v3dv_get_format_swizzle(device, format, plane); +- util_format_compose_swizzles(format_swizzle, image_view_swizzle, ++ util_format_compose_swizzles(format_swizzle, iview->view_swizzle, + iview->planes[plane].swizzle); + + iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle); +diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h +index 9cda9f0d6d28..8ac997241058 100644 +--- a/src/broadcom/vulkan/v3dv_limits.h ++++ b/src/broadcom/vulkan/v3dv_limits.h +@@ -50,8 +50,6 @@ + #define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \ + MAX_DYNAMIC_STORAGE_BUFFERS) + +-#define MAX_RENDER_TARGETS 4 +- + #define MAX_MULTIVIEW_VIEW_COUNT 16 + + /* These are tunable parameters in the HW design, but all the V3D +diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c +index d376c179e1c2..1c0d66c977cc 100644 +--- a/src/broadcom/vulkan/v3dv_meta_clear.c ++++ b/src/broadcom/vulkan/v3dv_meta_clear.c +@@ -127,6 +127,7 @@ 
clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + + v3dv_job_start_frame(job, width, height, max_layer, + false, true, 1, internal_bpp, ++ 4 * v3d_internal_bpp_words(internal_bpp), + image->vk.samples > VK_SAMPLE_COUNT_1_BIT); + + struct v3dv_meta_framebuffer framebuffer; +@@ -747,7 +748,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx, + uint32_t bit_offset = 0; + + key |= rt_idx; +- bit_offset += 2; ++ bit_offset += 3; + + key |= ((uint64_t) format) << bit_offset; + bit_offset += 32; +@@ -1189,9 +1190,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, + { + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + +- /* We can only clear attachments in the current subpass */ +- assert(attachmentCount <= 5); /* 4 color + D/S */ ++ /* We can have at most max_color_RTs + 1 D/S attachments */ ++ assert(attachmentCount <= ++ V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1); + ++ /* We can only clear attachments in the current subpass */ + struct v3dv_render_pass *pass = cmd_buffer->state.pass; + + assert(cmd_buffer->state.subpass_idx < pass->subpass_count); +diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c +index c0ec888b8c70..2d30c611e175 100644 +--- a/src/broadcom/vulkan/v3dv_meta_copy.c ++++ b/src/broadcom/vulkan/v3dv_meta_copy.c +@@ -453,8 +453,9 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, + const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); + const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); + +- v3dv_job_start_frame(job, width, height, num_layers, false, true, +- 1, internal_bpp, false); ++ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ false); + + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, +@@ -1323,8 +1324,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); + const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); + +- v3dv_job_start_frame(job, width, height, num_layers, +- false, true, 1, internal_bpp, ++ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + src->vk.samples > VK_SAMPLE_COUNT_1_BIT); + + struct v3dv_meta_framebuffer framebuffer; +@@ -1978,8 +1979,9 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); + const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); + +- v3dv_job_start_frame(job, width, height, num_layers, false, true, +- 1, internal_bpp, false); ++ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ false); + + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, +@@ -4884,8 +4886,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + (fb_format, region->srcSubresource.aspectMask, + &internal_type, &internal_bpp); + +- v3dv_job_start_frame(job, width, height, num_layers, false, true, +- 1, internal_bpp, true); ++ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ true); + + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, 
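/* Editor's sketch -- added for this review, not part of the upstream patch.
 * Why the get_color_clear_pipeline_cache_key() hunk above widens the rt_idx
 * field from 2 to 3 bits: with V3D 7.1 a subpass can reference more than 4
 * color attachments (see the V3D_MAX_RENDER_TARGETS() assert added to
 * v3dv_CmdClearAttachments above), so the render target index has to encode
 * values up to 7. Only the two fields visible in the hunk are reproduced
 * here; the rest of the key layout is unchanged.
 */
static inline uint64_t
sketch_color_clear_key_prefix(uint32_t rt_idx, uint32_t format)
{
   uint64_t key = 0;
   uint32_t bit_offset = 0;

   key |= rt_idx;                             /* bits [0..2]: rt index 0..7 */
   bit_offset += 3;

   key |= ((uint64_t) format) << bit_offset;  /* bits [3..34]: VkFormat */
   bit_offset += 32;

   return key;
}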
fb_format, +diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c +index 20f5014268df..0583faf6f9a2 100644 +--- a/src/broadcom/vulkan/v3dv_pass.c ++++ b/src/broadcom/vulkan/v3dv_pass.c +@@ -236,11 +236,13 @@ v3dv_CreateRenderPass2(VkDevice _device, + + /* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa), + * the clear might get lost. If a subpass has this then we can't emit +- * the clear using the TLB and we have to do it as a draw call. ++ * the clear using the TLB and we have to do it as a draw call. This ++ * issue is fixed since V3D 4.3.18. + * + * FIXME: separate stencil. + */ +- if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { ++ if (device->devinfo.ver == 42 && ++ subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { + struct v3dv_render_pass_attachment *att = + &pass->attachments[subpass->ds_attachment.attachment]; + if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) { +@@ -320,11 +322,12 @@ subpass_get_granularity(struct v3dv_device *device, + /* Granularity is defined by the tile size */ + assert(subpass_idx < pass->subpass_count); + struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx]; +- const uint32_t color_attachment_count = subpass->color_count; ++ const uint32_t color_count = subpass->color_count; + + bool msaa = false; +- uint32_t max_bpp = 0; +- for (uint32_t i = 0; i < color_attachment_count; i++) { ++ uint32_t max_internal_bpp = 0; ++ uint32_t total_color_bpp = 0; ++ for (uint32_t i = 0; i < color_count; i++) { + uint32_t attachment_idx = subpass->color_attachments[i].attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + continue; +@@ -337,7 +340,8 @@ subpass_get_granularity(struct v3dv_device *device, + v3dv_X(device, get_internal_type_bpp_for_output_format) + (format->planes[0].rt_type, &internal_type, &internal_bpp); + +- max_bpp = MAX2(max_bpp, internal_bpp); ++ max_internal_bpp = MAX2(max_internal_bpp, internal_bpp); ++ total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + + if (desc->samples > VK_SAMPLE_COUNT_1_BIT) + msaa = true; +@@ -347,7 +351,8 @@ subpass_get_granularity(struct v3dv_device *device, + * heuristics so we choose a conservative granularity here, with it disabled. + */ + uint32_t width, height; +- v3d_choose_tile_size(color_attachment_count, max_bpp, msaa, ++ v3d_choose_tile_size(&device->devinfo, color_count, ++ max_internal_bpp, total_color_bpp, msaa, + false /* double-buffer */, &width, &height); + *granularity = (VkExtent2D) { + .width = width, +diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c +index 22f01bdf64bd..ba782b8268a8 100644 +--- a/src/broadcom/vulkan/v3dv_pipeline.c ++++ b/src/broadcom/vulkan/v3dv_pipeline.c +@@ -2608,13 +2608,8 @@ v3dv_dynamic_state_mask(VkDynamicState state) + return V3DV_DYNAMIC_LINE_WIDTH; + case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT: + return V3DV_DYNAMIC_COLOR_WRITE_ENABLE; +- +- /* Depth bounds testing is not available in in V3D 4.2 so here we are just +- * ignoring this dynamic state. We are already asserting at pipeline creation +- * time that depth bounds testing is not enabled. 
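/* Editor's note -- added for this review, not part of the upstream patch.
 * A worked example of the new total_color_bpp bookkeeping in
 * subpass_get_granularity() above, assuming v3d_internal_bpp_words() maps
 * V3D_INTERNAL_BPP_32/64/128 to 1/2/4 32-bit words. For a subpass with one
 * 32-bit and one 64-bit internal-format color attachment:
 *
 *   total_color_bpp  = 4 * 1 + 4 * 2 = 12 bytes of color tile buffer per pixel
 *   max_internal_bpp = V3D_INTERNAL_BPP_64
 *
 * Both values are now passed to v3d_choose_tile_size(), presumably so that
 * V3D 7.1 can size tiles against the combined footprint of all render
 * targets rather than only the widest one.
 */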
+- */ + case VK_DYNAMIC_STATE_DEPTH_BOUNDS: +- return 0; ++ return V3DV_DYNAMIC_DEPTH_BOUNDS; + + default: + unreachable("Unhandled dynamic state"); +@@ -2632,6 +2627,7 @@ pipeline_init_dynamic_state( + const VkPipelineColorWriteCreateInfoEXT *pColorWriteState) + { + /* Initialize to default values */ ++ const struct v3d_device_info *devinfo = &pipeline->device->devinfo; + struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state; + memset(dynamic, 0, sizeof(*dynamic)); + dynamic->stencil_compare_mask.front = ~0; +@@ -2639,7 +2635,9 @@ pipeline_init_dynamic_state( + dynamic->stencil_write_mask.front = ~0; + dynamic->stencil_write_mask.back = ~0; + dynamic->line_width = 1.0f; +- dynamic->color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1; ++ dynamic->color_write_enable = ++ (1ull << (4 * V3D_MAX_RENDER_TARGETS(devinfo->ver))) - 1; ++ dynamic->depth_bounds.max = 1.0f; + + /* Create a mask of enabled dynamic states */ + uint32_t dynamic_states = 0; +@@ -2661,9 +2659,10 @@ pipeline_init_dynamic_state( + pViewportState->viewportCount); + + for (uint32_t i = 0; i < dynamic->viewport.count; i++) { +- v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i], +- dynamic->viewport.scale[i], +- dynamic->viewport.translate[i]); ++ v3dv_X(pipeline->device, viewport_compute_xform) ++ (&dynamic->viewport.viewports[i], ++ dynamic->viewport.scale[i], ++ dynamic->viewport.translate[i]); + } + } + +@@ -2691,6 +2690,11 @@ pipeline_init_dynamic_state( + dynamic->stencil_reference.front = pDepthStencilState->front.reference; + dynamic->stencil_reference.back = pDepthStencilState->back.reference; + } ++ ++ if (!(dynamic_states & V3DV_DYNAMIC_DEPTH_BOUNDS)) { ++ dynamic->depth_bounds.min = pDepthStencilState->minDepthBounds; ++ dynamic->depth_bounds.max = pDepthStencilState->maxDepthBounds; ++ } + } + + if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) { +@@ -2802,62 +2806,6 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline, + } + } + +-static bool +-pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) +-{ +- for (uint8_t i = 0; i < pipeline->va_count; i++) { +- if (vk_format_is_int(pipeline->va[i].vk_format)) +- return true; +- } +- return false; +-} +- +-/* @pipeline can be NULL. We assume in that case that all the attributes have +- * a float format (we only create an all-float BO once and we reuse it with +- * all float pipelines), otherwise we look at the actual type of each +- * attribute used with the specific pipeline passed in. +- */ +-struct v3dv_bo * +-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, +- struct v3dv_pipeline *pipeline) +-{ +- uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; +- struct v3dv_bo *bo; +- +- bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); +- +- if (!bo) { +- fprintf(stderr, "failed to allocate memory for the default " +- "attribute values\n"); +- return NULL; +- } +- +- bool ok = v3dv_bo_map(device, bo, size); +- if (!ok) { +- fprintf(stderr, "failed to map default attribute values buffer\n"); +- return false; +- } +- +- uint32_t *attrs = bo->map; +- uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; +- for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { +- attrs[i * 4 + 0] = 0; +- attrs[i * 4 + 1] = 0; +- attrs[i * 4 + 2] = 0; +- VkFormat attr_format = +- pipeline != NULL ? 
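/* Editor's note -- added for this review, not part of the upstream patch.
 * The default color_write_enable mask above is now derived from
 * V3D_MAX_RENDER_TARGETS(): one bit per channel per render target. On V3D
 * 4.2 that is 4 RTs * 4 channels = 16 bits, matching the COLOR_WRITE_MASKS
 * hunk later in this patch that masks with 0xffff on <= 4.2; on 7.1 the
 * macro presumably yields 8 RTs, giving a full 32-bit mask:
 *
 *   (1ull << (4 * 4)) - 1 = 0x000000000000ffff   (V3D 4.2)
 *   (1ull << (4 * 8)) - 1 = 0x00000000ffffffff   (V3D 7.1, assumed RT count)
 */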
pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; +- if (i < va_count && vk_format_is_int(attr_format)) { +- attrs[i * 4 + 3] = 1; +- } else { +- attrs[i * 4 + 3] = fui(1.0); +- } +- } +- +- v3dv_bo_unmap(device, bo); +- +- return bo; +-} +- + static void + pipeline_set_sample_mask(struct v3dv_pipeline *pipeline, + const VkPipelineMultisampleStateCreateInfo *ms_info) +@@ -2960,7 +2908,9 @@ pipeline_init(struct v3dv_pipeline *pipeline, + /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that + * feature and it shouldn't be used by any pipeline. + */ +- assert(!ds_info || !ds_info->depthBoundsTestEnable); ++ assert(device->devinfo.ver >= 71 || ++ !ds_info || !ds_info->depthBoundsTestEnable); ++ pipeline->depth_bounds_test_enabled = ds_info && ds_info->depthBoundsTestEnable; + + enable_depth_bias(pipeline, rs_info); + +@@ -2992,9 +2942,10 @@ pipeline_init(struct v3dv_pipeline *pipeline, + + v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info); + +- if (pipeline_has_integer_vertex_attrib(pipeline)) { ++ if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) { + pipeline->default_attribute_values = +- v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline); ++ v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline); ++ + if (!pipeline->default_attribute_values) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } else { +diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h +index c67072115293..43b14ec1ade3 100644 +--- a/src/broadcom/vulkan/v3dv_private.h ++++ b/src/broadcom/vulkan/v3dv_private.h +@@ -123,6 +123,9 @@ struct v3d_simulator_file; + /* Minimum required by the Vulkan 1.1 spec */ + #define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30) + ++/* Maximum performance counters number */ ++#define V3D_MAX_PERFCNT 93 ++ + struct v3dv_physical_device { + struct vk_physical_device vk; + +@@ -581,6 +584,9 @@ struct v3dv_device { + * being float being float, allowing us to reuse the same BO for all + * pipelines matching this requirement. Pipelines that need integer + * attributes will create their own BO. ++ * ++ * Note that since v71 the default attribute values are not needed, so this ++ * can be NULL. + */ + struct v3dv_bo *default_attribute_float; + +@@ -772,6 +778,8 @@ struct v3dv_image_view { + + const struct v3dv_format *format; + ++ uint8_t view_swizzle[4]; ++ + uint8_t plane_count; + struct { + uint8_t image_plane; +@@ -782,8 +790,8 @@ struct v3dv_image_view { + uint32_t internal_type; + uint32_t offset; + +- /* Precomputed (composed from createinfo->components and formar swizzle) +- * swizzles to pass in to the shader key. ++ /* Precomputed swizzle (composed from the view swizzle and the format ++ * swizzle). + * + * This could be also included on the descriptor bo, but the shader state + * packet doesn't need it on a bo, so we can just avoid a memory copy +@@ -946,6 +954,7 @@ struct v3dv_frame_tiling { + uint32_t layers; + uint32_t render_target_count; + uint32_t internal_bpp; ++ uint32_t total_color_bpp; + bool msaa; + bool double_buffer; + uint32_t tile_width; +@@ -1040,7 +1049,8 @@ enum v3dv_dynamic_state_bits { + V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6, + V3DV_DYNAMIC_LINE_WIDTH = 1 << 7, + V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8, +- V3DV_DYNAMIC_ALL = (1 << 9) - 1, ++ V3DV_DYNAMIC_DEPTH_BOUNDS = 1 << 9, ++ V3DV_DYNAMIC_ALL = (1 << 10) - 1, + }; + + /* Flags for dirty pipeline state. 
+@@ -1065,6 +1075,7 @@ enum v3dv_cmd_dirty_bits { + V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 16, + V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 17, + V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 18, ++ V3DV_CMD_DIRTY_DEPTH_BOUNDS = 1 << 19, + }; + + struct v3dv_dynamic_state { +@@ -1101,6 +1112,11 @@ struct v3dv_dynamic_state { + float slope_factor; + } depth_bias; + ++ struct { ++ float min; ++ float max; ++ } depth_bounds; ++ + float line_width; + + uint32_t color_write_enable; +@@ -1196,7 +1212,7 @@ struct v3dv_timestamp_query_cpu_job_info { + }; + + /* Number of perfmons required to handle all supported performance counters */ +-#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \ ++#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \ + DRM_V3D_MAX_PERF_COUNTERS) + + struct v3dv_perf_query { +@@ -1369,6 +1385,7 @@ void v3dv_job_start_frame(struct v3dv_job *job, + bool allocate_tile_state_now, + uint32_t render_target_count, + uint8_t max_internal_bpp, ++ uint8_t total_color_bpp, + bool msaa); + + bool v3dv_job_type_is_gpu(struct v3dv_job *job); +@@ -1667,7 +1684,7 @@ struct v3dv_query_pool { + /* Only used with performance queries */ + struct { + uint32_t ncounters; +- uint8_t counters[V3D_PERFCNT_NUM]; ++ uint8_t counters[V3D_MAX_PERFCNT]; + + /* V3D has a limit on the number of counters we can track in a + * single performance monitor, so if too many counters are requested +@@ -1803,7 +1820,8 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer, + void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, + struct drm_v3d_submit_tfu *tfu); + +-void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info, ++void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device, ++ struct v3dv_csd_indirect_cpu_job_info *info, + const uint32_t *wg_counts); + + void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer, +@@ -2289,7 +2307,8 @@ struct v3dv_pipeline { + unsigned char sha1[20]; + + /* In general we can reuse v3dv_device->default_attribute_float, so note +- * that the following can be NULL. ++ * that the following can be NULL. In 7.x this is not used, so it will be ++ * always NULL. + * + * FIXME: the content of this BO will be small, so it could be improved to + * be uploaded to a common BO. 
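/* Editor's note -- added for this review, not part of the upstream patch.
 * With the new V3D_MAX_PERFCNT of 93 defined earlier in this hunk, and
 * assuming the DRM UAPI limit DRM_V3D_MAX_PERF_COUNTERS is 32 counters per
 * perfmon, the worst case works out to:
 *
 *   V3DV_MAX_PERFMONS = DIV_ROUND_UP(93, 32) = 3
 *
 * i.e. a performance query that enables every supported counter is spread
 * across three kernel performance monitors.
 */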
But as in most cases it will be NULL, it is +@@ -2323,6 +2342,9 @@ struct v3dv_pipeline { + bool is_z16; + } depth_bias; + ++ /* Depth bounds */ ++ bool depth_bounds_test_enabled; ++ + struct { + void *mem_ctx; + struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */ +@@ -2338,6 +2360,13 @@ struct v3dv_pipeline { + uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH]; + }; + ++static inline bool ++v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device) ++{ ++ return device->devinfo.ver > 71 || ++ (device->devinfo.ver == 71 && device->devinfo.rev >= 5); ++} ++ + static inline VkPipelineBindPoint + v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline) + { +@@ -2500,10 +2529,6 @@ void + v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline, + struct v3dv_pipeline_cache *cache); + +-struct v3dv_bo * +-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, +- struct v3dv_pipeline *pipeline); +- + VkResult + v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device, + nir_shader *nir, +@@ -2608,12 +2633,32 @@ u64_compare(const void *key1, const void *key2) + case 42: \ + v3d_X_thing = &v3d42_##thing; \ + break; \ ++ case 71: \ ++ v3d_X_thing = &v3d71_##thing; \ ++ break; \ + default: \ + unreachable("Unsupported hardware generation"); \ + } \ + v3d_X_thing; \ + }) + ++/* Helper to get hw-specific macro values */ ++#define V3DV_X(device, thing) ({ \ ++ __typeof(V3D42_##thing) V3D_X_THING; \ ++ switch (device->devinfo.ver) { \ ++ case 42: \ ++ V3D_X_THING = V3D42_##thing; \ ++ break; \ ++ case 71: \ ++ V3D_X_THING = V3D71_##thing; \ ++ break; \ ++ default: \ ++ unreachable("Unsupported hardware generation"); \ ++ } \ ++ V3D_X_THING; \ ++}) ++ ++ + + /* v3d_macros from common requires v3dX and V3DX definitions. 
Below we need to + * define v3dX for each version supported, because when we compile code that +@@ -2626,6 +2671,10 @@ u64_compare(const void *key1, const void *key2) + # define v3dX(x) v3d42_##x + # include "v3dvx_private.h" + # undef v3dX ++ ++# define v3dX(x) v3d71_##x ++# include "v3dvx_private.h" ++# undef v3dX + #endif + + #ifdef ANDROID +diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c +index 3284c467d749..deb7821f02b9 100644 +--- a/src/broadcom/vulkan/v3dv_query.c ++++ b/src/broadcom/vulkan/v3dv_query.c +@@ -23,7 +23,6 @@ + + #include "v3dv_private.h" + +-#include "common/v3d_performance_counters.h" + #include "util/timespec.h" + #include "compiler/nir/nir_builder.h" + +@@ -48,7 +47,7 @@ kperfmon_create(struct v3dv_device *device, + DRM_IOCTL_V3D_PERFMON_CREATE, + &req); + if (ret) +- fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret)); ++ fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret)); + + pool->queries[query].perf.kperfmon_ids[i] = req.id; + } +@@ -303,7 +302,6 @@ v3dv_CreateQueryPool(VkDevice _device, + QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); + + assert(pq_info); +- assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM); + + pool->perfmon.ncounters = pq_info->counterIndexCount; + for (uint32_t i = 0; i < pq_info->counterIndexCount; i++) +@@ -592,7 +590,7 @@ write_performance_query_result(struct v3dv_device *device, + assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + + struct v3dv_query *q = &pool->queries[query]; +- uint64_t counter_values[V3D_PERFCNT_NUM]; ++ uint64_t counter_values[V3D_MAX_PERFCNT]; + + for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { + struct drm_v3d_perfmon_get_values req = { +@@ -1284,40 +1282,11 @@ v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( + VkPerformanceCounterKHR *pCounters, + VkPerformanceCounterDescriptionKHR *pCounterDescriptions) + { +- uint32_t desc_count = *pCounterCount; ++ V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice); + +- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, +- out, pCounters, pCounterCount); +- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, +- out_desc, pCounterDescriptions, &desc_count); +- +- for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) { +- vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { +- counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; +- counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; +- counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; +- +- unsigned char sha1_result[20]; +- _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME], +- strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]), +- sha1_result); +- +- memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); +- } +- +- vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, +- &out_desc, desc) { +- desc->flags = 0; +- snprintf(desc->name, sizeof(desc->name), "%s", +- v3d_performance_counters[i][V3D_PERFCNT_NAME]); +- snprintf(desc->category, sizeof(desc->category), "%s", +- v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]); +- snprintf(desc->description, sizeof(desc->description), "%s", +- v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]); +- } +- } +- +- return vk_outarray_status(&out); ++ return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount, ++ pCounters, ++ pCounterDescriptions); + } + + VKAPI_ATTR void VKAPI_CALL +diff --git a/src/broadcom/vulkan/v3dv_queue.c 
b/src/broadcom/vulkan/v3dv_queue.c +index b4aae1951806..429d14a91966 100644 +--- a/src/broadcom/vulkan/v3dv_queue.c ++++ b/src/broadcom/vulkan/v3dv_queue.c +@@ -408,7 +408,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, + + if (memcmp(group_counts, info->csd_job->csd.wg_count, + sizeof(info->csd_job->csd.wg_count)) != 0) { +- v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts); ++ v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts); + } + + return VK_SUCCESS; +diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c +index 72fa9a1b39c5..6e5adc368a87 100644 +--- a/src/broadcom/vulkan/v3dv_uniforms.c ++++ b/src/broadcom/vulkan/v3dv_uniforms.c +@@ -497,7 +497,6 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect); + + struct v3dv_cl_out *uniforms = cl_start(&job->indirect); +- + for (int i = 0; i < uinfo->count; i++) { + uint32_t data = uinfo->data[i]; + +@@ -519,13 +518,17 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, + cmd_buffer, pipeline, variant->stage); + break; + +- case QUNIFORM_VIEWPORT_X_SCALE: +- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f); ++ case QUNIFORM_VIEWPORT_X_SCALE: { ++ float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); ++ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity); + break; ++ } + +- case QUNIFORM_VIEWPORT_Y_SCALE: +- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f); ++ case QUNIFORM_VIEWPORT_Y_SCALE: { ++ float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); ++ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity); + break; ++ } + + case QUNIFORM_VIEWPORT_Z_OFFSET: { + float translate_z; +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index f182b790d363..011f5c8e1010 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -56,10 +56,15 @@ v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job) + }; + config.width_in_pixels = tiling->width; + config.height_in_pixels = tiling->height; ++#if V3D_VERSION == 42 + config.number_of_render_targets = MAX2(tiling->render_target_count, 1); + config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + + uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr; + cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config); +@@ -82,10 +87,22 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job, + cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { + config.width_in_pixels = tiling->width; + config.height_in_pixels = tiling->height; ++#if V3D_VERSION == 42 + config.number_of_render_targets = MAX2(tiling->render_target_count, 1); + config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ config.log2_tile_width = log2_tile_size(tiling->tile_width); ++ config.log2_tile_height = log2_tile_size(tiling->tile_height); ++ /* FIXME: ideally we would like next assert on the packet header (as is ++ * general, so also applies to GL). 
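/* Editor's sketch -- added for this review, not part of the upstream patch.
 * How the V3DV_X() helper added to v3dv_private.h above is used: it selects
 * a per-generation macro value from devinfo.ver at run time. The
 * QUNIFORM_VIEWPORT_X/Y_SCALE hunk above does, in effect:
 *
 *   float g = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY);
 *
 * which resolves to V3D42_CLIPPER_XY_GRANULARITY on ver == 42 and to
 * V3D71_CLIPPER_XY_GRANULARITY on ver == 71. Judging by the CLIPPER_XY_SCALING
 * packets emitted later in this patch (viewport scale in 1/256th of a pixel
 * on 4.2 versus 1/64th on 7.1), those constants are presumably 256.0f and
 * 64.0f respectively, replacing the hard-coded 256.0f that was used before.
 */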
We would need to expand ++ * gen_pack_header for that. ++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); ++#endif + } + + /* There's definitely nothing in the VCD cache we want. */ +@@ -345,6 +362,11 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer, + iview->vk.base_array_layer + layer, + image_plane); + ++ /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it ++ * is broken in earlier V3D versions. ++ */ ++ assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear); ++ + cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = buffer; + store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset); +@@ -467,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + const VkImageAspectFlags aspects = + vk_format_aspects(ds_attachment->desc.format); + ++#if V3D_VERSION <= 42 ++ /* GFXH-1689: The per-buffer store command's clear buffer bit is broken ++ * for depth/stencil. ++ * ++ * There used to be some confusion regarding the Clear Tile Buffers ++ * Z/S bit also being broken, but we confirmed with Broadcom that this ++ * is not the case, it was just that some other hardware bugs (that we ++ * need to work around, such as GFXH-1461) could cause this bit to behave ++ * incorrectly. ++ * ++ * There used to be another issue where the RTs bit in the Clear Tile ++ * Buffers packet also cleared Z/S, but Broadcom confirmed this is ++ * fixed since V3D 4.1. ++ * ++ * So if we have to emit a clear of depth or stencil we don't use ++ * the per-buffer store clear bit, even if we need to store the buffers, ++ * instead we always have to use the Clear Tile Buffers Z/S bit. ++ * If we have configured the job to do early Z/S clearing, then we ++ * don't want to emit any Clear Tile Buffers command at all here. ++ * ++ * Note that GFXH-1689 is not reproduced in the simulator, where ++ * using the clear buffer bit in depth/stencil stores works fine. ++ */ ++ + /* Only clear once on the first subpass that uses the attachment */ + uint32_t ds_first_subpass = !state->pass->multiview_enabled ? + ds_attachment->first_subpass : +@@ -486,6 +532,17 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + ds_attachment->desc.stencilLoadOp, + subpass->do_stencil_clear_with_draw); + ++ use_global_zs_clear = !state->job->early_zs_clear && ++ (needs_depth_clear || needs_stencil_clear); ++#endif ++#if V3D_VERSION >= 71 ++ /* The store command's clear buffer bit cannot be used for Z/S stencil: ++ * since V3D 4.5.6 Z/S buffers are automatically cleared between tiles, ++ * so we don't want to emit redundant clears here. ++ */ ++ use_global_zs_clear = false; ++#endif ++ + /* Skip the last store if it is not required */ + uint32_t ds_last_subpass = !pass->multiview_enabled ? + ds_attachment->last_subpass : +@@ -528,30 +585,6 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + needs_stencil_store = subpass->resolve_stencil; + } + +- /* GFXH-1689: The per-buffer store command's clear buffer bit is broken +- * for depth/stencil. +- * +- * There used to be some confusion regarding the Clear Tile Buffers +- * Z/S bit also being broken, but we confirmed with Broadcom that this +- * is not the case, it was just that some other hardware bugs (that we +- * need to work around, such as GFXH-1461) could cause this bit to behave +- * incorrectly. 
+- * +- * There used to be another issue where the RTs bit in the Clear Tile +- * Buffers packet also cleared Z/S, but Broadcom confirmed this is +- * fixed since V3D 4.1. +- * +- * So if we have to emit a clear of depth or stencil we don't use +- * the per-buffer store clear bit, even if we need to store the buffers, +- * instead we always have to use the Clear Tile Buffers Z/S bit. +- * If we have configured the job to do early Z/S clearing, then we +- * don't want to emit any Clear Tile Buffers command at all here. +- * +- * Note that GFXH-1689 is not reproduced in the simulator, where +- * using the clear buffer bit in depth/stencil stores works fine. +- */ +- use_global_zs_clear = !state->job->early_zs_clear && +- (needs_depth_clear || needs_stencil_clear); + if (needs_depth_store || needs_stencil_store) { + const uint32_t zs_buffer = + v3dv_zs_buffer(needs_depth_store, needs_stencil_store); +@@ -649,10 +682,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + * bit and instead we have to emit a single clear of all tile buffers. + */ + if (use_global_zs_clear || use_global_rt_clear) { ++#if V3D_VERSION == 42 + cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = use_global_zs_clear; + clear.clear_all_render_targets = use_global_rt_clear; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(cl, CLEAR_RENDER_TARGETS, clear); ++#endif + } + } + +@@ -778,6 +816,103 @@ set_rcl_early_z_config(struct v3dv_job *job, + } + } + ++/* Note that for v71, render target cfg packets has just one field that ++ * combined the internal type and clamp mode. For simplicity we keep just one ++ * helper. ++ * ++ * Note: rt_type is in fact a "enum V3DX(Internal_Type)". ++ * ++ * FIXME: for v71 we are not returning all the possible combinations for ++ * render target internal type and clamp. For example for int types we are ++ * always using clamp int, and for 16f we are using clamp none or pos (that ++ * seems to be the equivalent for no-clamp on 4.2), but not pq or hlg. In ++ * summary right now we are just porting what we were doing on 4.2 ++ */ ++uint32_t ++v3dX(clamp_for_format_and_type)(uint32_t rt_type, ++ VkFormat vk_format) ++{ ++#if V3D_VERSION == 42 ++ if (vk_format_is_int(vk_format)) ++ return V3D_RENDER_TARGET_CLAMP_INT; ++ else if (vk_format_is_srgb(vk_format)) ++ return V3D_RENDER_TARGET_CLAMP_NORM; ++ else ++ return V3D_RENDER_TARGET_CLAMP_NONE; ++#endif ++#if V3D_VERSION >= 71 ++ switch (rt_type) { ++ case V3D_INTERNAL_TYPE_8I: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED; ++ case V3D_INTERNAL_TYPE_8UI: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED; ++ case V3D_INTERNAL_TYPE_8: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_8; ++ case V3D_INTERNAL_TYPE_16I: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED; ++ case V3D_INTERNAL_TYPE_16UI: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED; ++ case V3D_INTERNAL_TYPE_16F: ++ return vk_format_is_srgb(vk_format) ? 
++ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM : ++ V3D_RENDER_TARGET_TYPE_CLAMP_16F; ++ case V3D_INTERNAL_TYPE_32I: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED; ++ case V3D_INTERNAL_TYPE_32UI: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED; ++ case V3D_INTERNAL_TYPE_32F: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_32F; ++ default: ++ unreachable("Unknown internal render target type"); ++ } ++ ++ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID; ++#endif ++} ++ ++static void ++cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer, ++ int rt, ++ uint32_t *rt_bpp, ++#if V3D_VERSION == 42 ++ uint32_t *rt_type, ++ uint32_t *rt_clamp) ++#else ++ uint32_t *rt_type_clamp) ++#endif ++{ ++ const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; ++ ++ assert(state->subpass_idx < state->pass->subpass_count); ++ const struct v3dv_subpass *subpass = ++ &state->pass->subpasses[state->subpass_idx]; ++ ++ if (rt >= subpass->color_count) ++ return; ++ ++ struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; ++ const uint32_t attachment_idx = attachment->attachment; ++ if (attachment_idx == VK_ATTACHMENT_UNUSED) ++ return; ++ ++ assert(attachment_idx < state->framebuffer->attachment_count && ++ attachment_idx < state->attachment_alloc_count); ++ struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; ++ assert(vk_format_is_color(iview->vk.format)); ++ ++ assert(iview->plane_count == 1); ++ *rt_bpp = iview->planes[0].internal_bpp; ++#if V3D_VERSION == 42 ++ *rt_type = iview->planes[0].internal_type; ++ *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, ++ iview->vk.format); ++#endif ++#if V3D_VERSION >= 71 ++ *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, ++ iview->vk.format); ++#endif ++} ++ + void + v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + { +@@ -824,7 +959,19 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + config.number_of_render_targets = MAX2(subpass->color_count, 1); + config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; ++#if V3D_VERSION == 42 + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ config.log2_tile_width = log2_tile_size(tiling->tile_width); ++ config.log2_tile_height = log2_tile_size(tiling->tile_height); ++ /* FIXME: ideallly we would like next assert on the packet header (as is ++ * general, so also applies to GL). We would need to expand ++ * gen_pack_header for that. ++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); ++#endif + + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { + const struct v3dv_image_view *iview = +@@ -851,6 +998,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + * Early-Z/S clearing is independent of Early Z/S testing, so it is + * possible to enable one but not the other so long as their + * respective requirements are met. ++ * ++ * From V3D 4.5.6, Z/S buffers are always cleared automatically ++ * between tiles, but we still want to enable early ZS clears ++ * when Z/S are not loaded or stored. 
+ */ + struct v3dv_render_pass_attachment *ds_attachment = + &pass->attachments[ds_attachment_idx]; +@@ -858,21 +1009,33 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + const VkImageAspectFlags ds_aspects = + vk_format_aspects(ds_attachment->desc.format); + +- bool needs_depth_clear = +- check_needs_clear(state, +- ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, +- ds_attachment->first_subpass, +- ds_attachment->desc.loadOp, +- subpass->do_depth_clear_with_draw); +- + bool needs_depth_store = + v3dv_cmd_buffer_check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp) || + subpass->resolve_depth; ++#if V3D_VERSION <= 42 ++ bool needs_depth_clear = ++ check_needs_clear(state, ++ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, ++ ds_attachment->first_subpass, ++ ds_attachment->desc.loadOp, ++ subpass->do_depth_clear_with_draw); + + do_early_zs_clear = needs_depth_clear && !needs_depth_store; ++#endif ++#if V3D_VERSION >= 71 ++ bool needs_depth_load = ++ v3dv_cmd_buffer_check_needs_load(state, ++ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, ++ ds_attachment->first_subpass, ++ ds_attachment->desc.loadOp, ++ ds_attachment->last_subpass, ++ ds_attachment->desc.storeOp); ++ do_early_zs_clear = !needs_depth_load && !needs_depth_store; ++#endif ++ + if (do_early_zs_clear && + vk_format_has_stencil(ds_attachment->desc.format)) { + bool needs_stencil_load = +@@ -905,10 +1068,20 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + */ + job->early_zs_clear = do_early_zs_clear; + ++#if V3D_VERSION >= 71 ++ uint32_t base_addr = 0; ++#endif + for (uint32_t i = 0; i < subpass->color_count; i++) { + uint32_t attachment_idx = subpass->color_attachments[i].attachment; +- if (attachment_idx == VK_ATTACHMENT_UNUSED) ++ if (attachment_idx == VK_ATTACHMENT_UNUSED) { ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.render_target_number = i; ++ rt.stride = 1; /* Unused */ ++ } ++#endif + continue; ++ } + + struct v3dv_image_view *iview = + state->attachments[attachment_idx].image_view; +@@ -920,10 +1093,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + const struct v3d_resource_slice *slice = + &image->planes[plane].slices[iview->vk.base_mip_level]; + +- const uint32_t *clear_color = ++ UNUSED const uint32_t *clear_color = + &state->attachments[attachment_idx].clear_value.color[0]; + +- uint32_t clear_pad = 0; ++ UNUSED uint32_t clear_pad = 0; + if (slice->tiling == V3D_TILING_UIF_NO_XOR || + slice->tiling == V3D_TILING_UIF_XOR) { + int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2; +@@ -937,6 +1110,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + } + } + ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { + clear.clear_color_low_32_bits = clear_color[0]; + clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; +@@ -960,22 +1134,74 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + clear.render_target_number = i; + }; + } ++#endif ++ ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.clear_color_low_bits = clear_color[0]; ++ cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp, ++ &rt.internal_type_and_clamping); ++ rt.stride = ++ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, ++ v3d_internal_bpp_words(rt.internal_bpp)); ++ 
rt.base_address = base_addr; ++ rt.render_target_number = i; ++ ++ /* base_addr in multiples of 512 bits. We divide by 8 because stride ++ * is in 128-bit units, but it is packing 2 rows worth of data, so we ++ * need to divide it by 2 so it is only 1 row, and then again by 4 so ++ * it is in 512-bit units. ++ */ ++ base_addr += (tiling->tile_height * rt.stride) / 8; ++ } ++ ++ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { ++ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ ++ ((uint64_t) clear_color[1]) | ++ (((uint64_t) (clear_color[2] & 0xff)) << 32); ++ rt.render_target_number = i; ++ } ++ } ++ ++ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { ++ rt.clear_color_top_bits = /* 56 bits (24 + 32) */ ++ (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) | ++ (((uint64_t) (clear_color[3])) << 24); ++ rt.render_target_number = i; ++ } ++ } ++#endif + } + ++#if V3D_VERSION >= 71 ++ /* If we don't have any color RTs, we still need to emit one and flag ++ * it as not used using stride = 1. ++ */ ++ if (subpass->color_count == 0) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.stride = 1; ++ } ++ } ++#endif ++ ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { +- v3dX(cmd_buffer_render_pass_setup_render_target) ++ cmd_buffer_render_pass_setup_render_target + (cmd_buffer, 0, &rt.render_target_0_internal_bpp, + &rt.render_target_0_internal_type, &rt.render_target_0_clamp); +- v3dX(cmd_buffer_render_pass_setup_render_target) ++ cmd_buffer_render_pass_setup_render_target + (cmd_buffer, 1, &rt.render_target_1_internal_bpp, + &rt.render_target_1_internal_type, &rt.render_target_1_clamp); +- v3dX(cmd_buffer_render_pass_setup_render_target) ++ cmd_buffer_render_pass_setup_render_target + (cmd_buffer, 2, &rt.render_target_2_internal_bpp, + &rt.render_target_2_internal_type, &rt.render_target_2_clamp); +- v3dX(cmd_buffer_render_pass_setup_render_target) ++ cmd_buffer_render_pass_setup_render_target + (cmd_buffer, 3, &rt.render_target_3_internal_bpp, + &rt.render_target_3_internal_type, &rt.render_target_3_clamp); + } ++#endif + + /* Ends rendering mode config. 
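/* Editor's note -- added for this review, not part of the upstream patch.
 * A worked example of the V3D 7.1 per-render-target tile buffer layout set
 * up above, assuming a 64x64 tile with a 32-bit internal format (1 word per
 * pixel) and assuming v3d_compute_rt_row_row_stride_128_bits() returns two
 * rows' worth of data expressed in 128-bit units:
 *
 *   stride     = 64 px * 1 word * 32 bit * 2 rows / 128 bit = 32
 *   base_addr += tile_height * stride / 8 = 64 * 32 / 8 = 256   (512-bit units)
 *
 * Cross-check: a full 64x64 tile at 4 bytes per pixel is 16384 bytes =
 * 131072 bits = 256 * 512 bits, so the next render target starts right after
 * this one in the tile buffer, exactly as the in-line comment above explains
 * (divide by 2 to get a single row, then by 4 to reach 512-bit units).
 */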
*/ + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { +@@ -1036,10 +1262,15 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + } + if (cmd_buffer->state.tile_aligned_render_area && + (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { ++#if V3D_VERSION == 42 + cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = !job->early_zs_clear; + clear.clear_all_render_targets = true; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt); ++#endif + } + cl_emit(rcl, END_OF_TILE_MARKER, end); + } +@@ -1054,6 +1285,43 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + cl_emit(rcl, END_OF_RENDERING, end); + } + ++void ++v3dX(viewport_compute_xform)(const VkViewport *viewport, ++ float scale[3], ++ float translate[3]) ++{ ++ float x = viewport->x; ++ float y = viewport->y; ++ float half_width = 0.5f * viewport->width; ++ float half_height = 0.5f * viewport->height; ++ double n = viewport->minDepth; ++ double f = viewport->maxDepth; ++ ++ scale[0] = half_width; ++ translate[0] = half_width + x; ++ scale[1] = half_height; ++ translate[1] = half_height + y; ++ ++ scale[2] = (f - n); ++ translate[2] = n; ++ ++ /* It seems that if the scale is small enough the hardware won't clip ++ * correctly so we work around this my choosing the smallest scale that ++ * seems to work. ++ * ++ * This case is exercised by CTS: ++ * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero ++ * ++ * V3D 7.x fixes this by using the new ++ * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND. ++ */ ++#if V3D_VERSION <= 42 ++ const float min_abs_scale = 0.0005f; ++ if (fabs(scale[2]) < min_abs_scale) ++ scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale; ++#endif ++} ++ + void + v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) + { +@@ -1078,19 +1346,45 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) + v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size); + v3dv_return_if_oom(cmd_buffer, NULL); + ++#if V3D_VERSION == 42 + cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { + clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f; + clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { ++ clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f; ++ clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f; ++ } ++#endif + + float translate_z, scale_z; + v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0, + &translate_z, &scale_z); + ++#if V3D_VERSION == 42 + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { + clip.viewport_z_offset_zc_to_zs = translate_z; + clip.viewport_z_scale_zc_to_zs = scale_z; + } ++#endif ++ ++#if V3D_VERSION >= 71 ++ /* If the Z scale is too small guardband clipping may not clip correctly */ ++ if (fabsf(scale_z) < 0.01f) { ++ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) { ++ clip.viewport_z_offset_zc_to_zs = translate_z; ++ clip.viewport_z_scale_zc_to_zs = scale_z; ++ } ++ } else { ++ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { ++ clip.viewport_z_offset_zc_to_zs = translate_z; ++ clip.viewport_z_scale_zc_to_zs = scale_z; ++ } ++ } ++#endif ++ + cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) { + /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled, + * we are using OpenGL's [-1, 1] instead. 
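/* Editor's note -- added for this review, not part of the upstream patch.
 * The two Z-scale paths above handle the same corner case: a viewport with a
 * (nearly) empty depth range. With viewport_compute_xform() as defined just
 * above,
 *
 *   minDepth = maxDepth = 0.5  =>  scale[2] = f - n = 0.0
 *
 * (the situation exercised by the ...nodepthclamp_deltazero CTS test cited
 * in the comment). On V3D 4.2 the scale is forced to a magnitude of at least
 * 0.0005 before being emitted; on 7.1 the same kind of check
 * (|scale_z| < 0.01) instead selects the new
 * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND packet, which per the comment above
 * handles the tiny scale without needing the clamp.
 */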
+@@ -1205,14 +1499,48 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer) + cl_emit(&job->bcl, DEPTH_OFFSET, bias) { + bias.depth_offset_factor = dynamic->depth_bias.slope_factor; + bias.depth_offset_units = dynamic->depth_bias.constant_factor; ++#if V3D_VERSION <= 42 + if (pipeline->depth_bias.is_z16) + bias.depth_offset_units *= 256.0f; ++#endif + bias.limit = dynamic->depth_bias.depth_bias_clamp; + } + + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS; + } + ++void ++v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer) ++{ ++ /* No depthBounds support for v42, so this method is empty in that case. ++ * ++ * Note that this method is being called as v3dv_job_init flags all state ++ * as dirty. See FIXME note in v3dv_job_init. ++ */ ++ ++#if V3D_VERSION >= 71 ++ struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; ++ assert(pipeline); ++ ++ if (!pipeline->depth_bounds_test_enabled) ++ return; ++ ++ struct v3dv_job *job = cmd_buffer->state.job; ++ assert(job); ++ ++ v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS)); ++ v3dv_return_if_oom(cmd_buffer, NULL); ++ ++ struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; ++ cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) { ++ bounds.lower_test_limit = dynamic->depth_bounds.min; ++ bounds.upper_test_limit = dynamic->depth_bounds.max; ++ } ++ ++ cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BOUNDS; ++#endif ++} ++ + void + v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer) + { +@@ -1256,10 +1584,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + assert(pipeline); + ++ const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo; ++ const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver); ++ + const uint32_t blend_packets_size = + cl_packet_length(BLEND_ENABLES) + + cl_packet_length(BLEND_CONSTANT_COLOR) + +- cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS; ++ cl_packet_length(BLEND_CFG) * max_color_rts; + + v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size); + v3dv_return_if_oom(cmd_buffer, NULL); +@@ -1271,7 +1602,7 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) + } + } + +- for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { ++ for (uint32_t i = 0; i < max_color_rts; i++) { + if (pipeline->blend.enables & (1 << i)) + cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]); + } +@@ -1298,9 +1629,15 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer) + + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; ++ uint32_t color_write_mask = ~dynamic->color_write_enable | ++ pipeline->blend.color_write_masks; ++#if V3D_VERSION <= 42 ++ /* Only 4 RTs */ ++ color_write_mask &= 0xffff; ++#endif ++ + cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { +- mask.mask = (~dynamic->color_write_enable | +- pipeline->blend.color_write_masks) & 0xffff; ++ mask.mask = color_write_mask; + } + + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; +@@ -1591,15 +1928,16 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer) + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + assert(pipeline); + +- bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer); +- + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS)); + 
v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) { ++#if V3D_VERSION == 42 ++ bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer); + config.early_z_enable = enable_ez; + config.early_z_updates_enable = config.early_z_enable && + pipeline->z_updates_enable; ++#endif + } + } + +@@ -1845,7 +2183,9 @@ emit_gs_shader_state_record(struct v3dv_job *job, + gs_bin->prog_data.gs->base.threads == 4; + shader.geometry_bin_mode_shader_start_in_final_thread_section = + gs_bin->prog_data.gs->base.single_seg; ++#if V3D_VERSION <= 42 + shader.geometry_bin_mode_shader_propagate_nans = true; ++#endif + shader.geometry_bin_mode_shader_uniforms_address = + gs_bin_uniforms; + +@@ -1855,7 +2195,9 @@ emit_gs_shader_state_record(struct v3dv_job *job, + gs->prog_data.gs->base.threads == 4; + shader.geometry_render_mode_shader_start_in_final_thread_section = + gs->prog_data.gs->base.single_seg; ++#if V3D_VERSION <= 42 + shader.geometry_render_mode_shader_propagate_nans = true; ++#endif + shader.geometry_render_mode_shader_uniforms_address = + gs_render_uniforms; + } +@@ -2031,10 +2373,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) + pipeline->vpm_cfg.Gv); + } + ++#if V3D_VERSION == 42 + struct v3dv_bo *default_attribute_values = + pipeline->default_attribute_values != NULL ? + pipeline->default_attribute_values : + pipeline->device->default_attribute_float; ++#endif + + cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD, + pipeline->shader_state_record, shader) { +@@ -2060,8 +2404,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) + shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs; + shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs; + ++#if V3D_VERSION == 42 + shader.address_of_default_attribute_values = + v3dv_cl_address(default_attribute_values, 0); ++#endif + + shader.any_shader_reads_hardware_written_primitive_id = + (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid; +@@ -2370,40 +2716,3 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, + buffer->mem_offset + offset); + } + } +- +-void +-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, +- int rt, +- uint32_t *rt_bpp, +- uint32_t *rt_type, +- uint32_t *rt_clamp) +-{ +- const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; +- +- assert(state->subpass_idx < state->pass->subpass_count); +- const struct v3dv_subpass *subpass = +- &state->pass->subpasses[state->subpass_idx]; +- +- if (rt >= subpass->color_count) +- return; +- +- struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; +- const uint32_t attachment_idx = attachment->attachment; +- if (attachment_idx == VK_ATTACHMENT_UNUSED) +- return; +- +- assert(attachment_idx < state->framebuffer->attachment_count && +- attachment_idx < state->attachment_alloc_count); +- struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; +- assert(vk_format_is_color(iview->vk.format)); +- +- assert(iview->plane_count == 1); +- *rt_bpp = iview->planes[0].internal_bpp; +- *rt_type = iview->planes[0].internal_type; +- if (vk_format_is_int(iview->vk.view_format)) +- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT; +- else if (vk_format_is_srgb(iview->vk.view_format)) +- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; +- else +- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; +-} +diff --git 
a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c +index e235983864cd..1b50d51e19ff 100644 +--- a/src/broadcom/vulkan/v3dvx_device.c ++++ b/src/broadcom/vulkan/v3dvx_device.c +@@ -49,8 +49,8 @@ vk_to_v3d_compare_func[] = { + [VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS, + }; + +- + static union pipe_color_union encode_border_color( ++ const struct v3dv_device *device, + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) + { + const struct util_format_description *desc = +@@ -77,12 +77,28 @@ static union pipe_color_union encode_border_color( + * colors so we need to fix up the swizzle manually for this case. + */ + uint8_t swizzle[4]; +- if (v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && ++ const bool v3d_has_reverse_swap_rb_bits = ++ v3dv_texture_shader_state_has_rb_swap_reverse_bits(device); ++ if (!v3d_has_reverse_swap_rb_bits && ++ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && + v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) { + swizzle[0] = PIPE_SWIZZLE_W; + swizzle[1] = PIPE_SWIZZLE_X; + swizzle[2] = PIPE_SWIZZLE_Y; + swizzle[3] = PIPE_SWIZZLE_Z; ++ } ++ /* In v3d 7.x we no longer have a reverse flag for the border color. Instead ++ * we have to use the new reverse and swap_r/b flags in the texture shader ++ * state which will apply the format swizzle automatically when sampling ++ * the border color too and we should not apply it manually here. ++ */ ++ else if (v3d_has_reverse_swap_rb_bits && ++ (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) || ++ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) { ++ swizzle[0] = PIPE_SWIZZLE_X; ++ swizzle[1] = PIPE_SWIZZLE_Y; ++ swizzle[2] = PIPE_SWIZZLE_Z; ++ swizzle[3] = PIPE_SWIZZLE_W; + } else { + memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle)); + } +@@ -118,7 +134,11 @@ static union pipe_color_union encode_border_color( + (1 << (desc->channel[i].size - 1)) - 1); + } + +- /* convert from float to expected format */ ++#if V3D_VERSION <= 42 ++ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions ++ * for us. In V3D 4.x we need to manually convert floating point color ++ * values to the expected format. 
++ */ + if (vk_format_is_srgb(bc_info->format) || + vk_format_is_compressed(bc_info->format)) { + for (int i = 0; i < 4; i++) +@@ -170,12 +190,14 @@ static union pipe_color_union encode_border_color( + } + } + } ++#endif + + return border; + } + + void +-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, ++v3dX(pack_sampler_state)(const struct v3dv_device *device, ++ struct v3dv_sampler *sampler, + const VkSamplerCreateInfo *pCreateInfo, + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) + { +@@ -217,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, + s.border_color_mode = border_color_mode; + + if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) { +- union pipe_color_union border = encode_border_color(bc_info); ++ union pipe_color_union border = encode_border_color(device, bc_info); + + s.border_color_word_0 = border.ui[0]; + s.border_color_word_1 = border.ui[1]; +@@ -253,11 +275,13 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( + const struct v3dv_framebuffer *framebuffer, + const struct v3dv_cmd_buffer_attachment_state *attachments, + const struct v3dv_subpass *subpass, +- uint8_t *max_bpp, ++ uint8_t *max_internal_bpp, ++ uint8_t *total_color_bpp, + bool *msaa) + { + STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0); +- *max_bpp = V3D_INTERNAL_BPP_32; ++ *max_internal_bpp = V3D_INTERNAL_BPP_32; ++ *total_color_bpp = 0; + *msaa = false; + + if (subpass) { +@@ -270,8 +294,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( + assert(att); + assert(att->plane_count == 1); + +- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) +- *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp); ++ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { ++ const uint32_t internal_bpp = att->planes[0].internal_bpp; ++ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); ++ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); ++ } + + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) + *msaa = true; +@@ -285,7 +312,6 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) + *msaa = true; + } +- + return; + } + +@@ -295,8 +321,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( + assert(att); + assert(att->plane_count == 1); + +- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) +- *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp); ++ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { ++ const uint32_t internal_bpp = att->planes[0].internal_bpp; ++ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); ++ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); ++ } + + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) + *msaa = true; +diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c +index 80a3e5bfde86..de984e81220f 100644 +--- a/src/broadcom/vulkan/v3dvx_image.c ++++ b/src/broadcom/vulkan/v3dvx_image.c +@@ -76,8 +76,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]); + tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]); + +- tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse; +- + tex.texture_type = image_view->format->planes[plane].tex_type; + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { +@@ -110,8 +108,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + + tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64; + +- tex.srgb = vk_format_is_srgb(image_view->vk.view_format); +- + /* At 
this point we don't have the job. That's the reason the first + * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to + * add the bo to the job. This also means that we need to add manually +@@ -122,6 +118,51 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + v3dv_layer_offset(image, 0, image_view->vk.base_array_layer, + iplane); + tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); ++ ++ bool is_srgb = vk_format_is_srgb(image_view->vk.format); ++ ++ /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose ++ * the reverse and/or swap_r/b swizzle from the format table with the ++ * image view swizzle. This, however, doesn't work for border colors, ++ * for that there is the reverse_standard_border_color. ++ * ++ * In v3d 7.x, however, there is no reverse_standard_border_color bit, ++ * since the reverse and swap_r/b bits also affect border colors. It is ++ * because of this that we absolutely need to use these bits with ++ * reversed and swapped formats, since that's the only way to ensure ++ * correct border colors. In that case we don't want to program the ++ * swizzle to the composition of the format swizzle and the view ++ * swizzle like we do in v3d 4.x, since the format swizzle is applied ++ * via the reverse and swap_r/b bits. ++ */ ++#if V3D_VERSION == 42 ++ tex.srgb = is_srgb; ++ tex.reverse_standard_border_color = ++ image_view->planes[plane].channel_reverse; ++#endif ++#if V3D_VERSION >= 71 ++ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; ++ ++ tex.reverse = image_view->planes[plane].channel_reverse; ++ tex.r_b_swap = image_view->planes[plane].swap_rb; ++ ++ if (tex.reverse || tex.r_b_swap) { ++ tex.swizzle_r = ++ v3d_translate_pipe_swizzle(image_view->view_swizzle[0]); ++ tex.swizzle_g = ++ v3d_translate_pipe_swizzle(image_view->view_swizzle[1]); ++ tex.swizzle_b = ++ v3d_translate_pipe_swizzle(image_view->view_swizzle[2]); ++ tex.swizzle_a = ++ v3d_translate_pipe_swizzle(image_view->view_swizzle[3]); ++ } ++ ++ tex.chroma_offset_x = 1; ++ tex.chroma_offset_y = 1; ++ /* See comment in XML field definition for rationale of the shifts */ ++ tex.texture_base_pointer_cb = base_offset >> 6; ++ tex.texture_base_pointer_cr = base_offset >> 6; ++#endif + } + } + } +@@ -166,7 +207,14 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, + + assert(buffer_view->format->plane_count == 1); + tex.texture_type = buffer_view->format->planes[0].tex_type; +- tex.srgb = vk_format_is_srgb(buffer_view->vk_format); ++ ++ bool is_srgb = vk_format_is_srgb(buffer_view->vk_format); ++#if V3D_VERSION == 42 ++ tex.srgb = is_srgb; ++#endif ++#if V3D_VERSION >= 71 ++ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; ++#endif + + /* At this point we don't have the job. 
That's the reason the first + * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to +@@ -179,5 +227,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, + buffer_view->offset; + + tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); ++ ++#if V3D_VERSION >= 71 ++ tex.chroma_offset_x = 1; ++ tex.chroma_offset_y = 1; ++ /* See comment in XML field definition for rationale of the shifts */ ++ tex.texture_base_pointer_cb = base_offset >> 6; ++ tex.texture_base_pointer_cr = base_offset >> 6; ++#endif + } + } +diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c +index 04147b82cbd7..858096f9e4b4 100644 +--- a/src/broadcom/vulkan/v3dvx_meta_common.c ++++ b/src/broadcom/vulkan/v3dvx_meta_common.c +@@ -26,6 +26,7 @@ + + #include "broadcom/common/v3d_macros.h" + #include "broadcom/common/v3d_tfu.h" ++#include "broadcom/common/v3d_util.h" + #include "broadcom/cle/v3dx_pack.h" + #include "broadcom/compiler/v3d_compiler.h" + +@@ -58,12 +59,25 @@ emit_rcl_prologue(struct v3dv_job *job, + config.number_of_render_targets = 1; + config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; ++#if V3D_VERSION == 42 + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ config.log2_tile_width = log2_tile_size(tiling->tile_width); ++ config.log2_tile_height = log2_tile_size(tiling->tile_height); ++ /* FIXME: ideally we would like next assert on the packet header (as it is ++ * general, so it also applies to GL). We would need to expand ++ * gen_pack_header for that. ++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); ++#endif + config.internal_depth_type = fb->internal_depth_type; + } + ++ const uint32_t *color = NULL; + if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) { +- uint32_t clear_pad = 0; ++ UNUSED uint32_t clear_pad = 0; + if (clear_info->image) { + const struct v3dv_image *image = clear_info->image; + +@@ -88,7 +102,9 @@ emit_rcl_prologue(struct v3dv_job *job, + } + } + +- const uint32_t *color = &clear_info->clear_value->color[0]; ++ color = &clear_info->clear_value->color[0]; ++ ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { + clear.clear_color_low_32_bits = color[0]; + clear.clear_color_next_24_bits = color[1] & 0x00ffffff; +@@ -112,13 +128,49 @@ emit_rcl_prologue(struct v3dv_job *job, + clear.render_target_number = 0; + }; + } ++#endif + } + ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + rt.render_target_0_internal_bpp = tiling->internal_bpp; + rt.render_target_0_internal_type = fb->internal_type; + rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + } ++#endif ++ ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ if (color) ++ rt.clear_color_low_bits = color[0]; ++ rt.internal_bpp = tiling->internal_bpp; ++ rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type, ++ fb->vk_format); ++ rt.stride = ++ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, ++ v3d_internal_bpp_words(rt.internal_bpp)); ++ rt.base_address = 0; ++ rt.render_target_number = 0; ++ } ++ ++ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { ++ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ ++ ((uint64_t) color[1]) | 
++ (((uint64_t) (color[2] & 0xff)) << 32); ++ rt.render_target_number = 0; ++ } ++ } ++ ++ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { ++ rt.clear_color_top_bits = /* 56 bits (24 + 32) */ ++ (((uint64_t) (color[2] & 0xffffff00)) >> 8) | ++ (((uint64_t) (color[3])) << 24); ++ rt.render_target_number = 0; ++ } ++ } ++#endif + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { + clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f; +@@ -179,10 +231,15 @@ emit_frame_setup(struct v3dv_job *job, + */ + if (clear_value && + (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { ++#if V3D_VERSION == 42 + cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = true; + clear.clear_all_render_targets = true; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear); ++#endif + } + cl_emit(rcl, END_OF_TILE_MARKER, end); + } +@@ -893,6 +950,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + + tfu.iia |= src_offset; + ++#if V3D_VERSION <= 42 + if (src_tiling == V3D_TILING_RASTER) { + tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT; + } else { +@@ -901,12 +959,46 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + V3D33_TFU_ICFG_FORMAT_SHIFT; + } + tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT; ++#endif ++#if V3D_VERSION >= 71 ++ if (src_tiling == V3D_TILING_RASTER) { ++ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT; ++ } else { ++ tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE + ++ (src_tiling - V3D_TILING_LINEARTILE)) << ++ V3D71_TFU_ICFG_IFORMAT_SHIFT; ++ } ++ tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT; ++#endif + + tfu.ioa = dst_offset; + ++#if V3D_VERSION <= 42 + tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE + + (dst_tiling - V3D_TILING_LINEARTILE)) << + V3D33_TFU_IOA_FORMAT_SHIFT; ++#endif ++ ++#if V3D_VERSION >= 71 ++ tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE + ++ (dst_tiling - V3D_TILING_LINEARTILE)) << ++ V3D71_TFU_IOC_FORMAT_SHIFT; ++ ++ switch (dst_tiling) { ++ case V3D_TILING_UIF_NO_XOR: ++ case V3D_TILING_UIF_XOR: ++ tfu.v71.ioc |= ++ (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) << ++ V3D71_TFU_IOC_STRIDE_SHIFT; ++ break; ++ case V3D_TILING_RASTER: ++ tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) << ++ V3D71_TFU_IOC_STRIDE_SHIFT; ++ break; ++ default: ++ break; ++ } ++#endif + + switch (src_tiling) { + case V3D_TILING_UIF_NO_XOR: +@@ -923,6 +1015,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + /* The TFU can handle raster sources but always produces UIF results */ + assert(dst_tiling != V3D_TILING_RASTER); + ++#if V3D_VERSION <= 42 + /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the + * OPAD field for the destination (how many extra UIF blocks beyond + * those necessary to cover the height). 
+@@ -934,6 +1027,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + uif_block_h; + tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT; + } ++#endif + + v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu); + } +@@ -1314,8 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t width, height; + framebuffer_size_for_pixel_count(num_items, &width, &height); + +- v3dv_job_start_frame(job, width, height, 1, true, true, +- 1, internal_bpp, false); ++ v3dv_job_start_frame(job, width, height, 1, true, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ false); + + struct v3dv_meta_framebuffer framebuffer; + v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type, +@@ -1361,8 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t width, height; + framebuffer_size_for_pixel_count(num_items, &width, &height); + +- v3dv_job_start_frame(job, width, height, 1, true, true, +- 1, internal_bpp, false); ++ v3dv_job_start_frame(job, width, height, 1, true, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ false); + + struct v3dv_meta_framebuffer framebuffer; + v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT, +diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c +index 5d32d414ed86..ad22add155d8 100644 +--- a/src/broadcom/vulkan/v3dvx_pipeline.c ++++ b/src/broadcom/vulkan/v3dvx_pipeline.c +@@ -227,6 +227,45 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, + ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false; + + pipeline->z_updates_enable = config.z_updates_enable; ++ ++#if V3D_VERSION >= 71 ++ /* From the Vulkan spec: ++ * ++ * "depthClampEnable controls whether to clamp the fragment’s depth ++ * values as described in Depth Test. If the pipeline is not created ++ * with VkPipelineRasterizationDepthClipStateCreateInfoEXT present ++ * then enabling depth clamp will also disable clipping primitives to ++ * the z planes of the frustrum as described in Primitive Clipping. ++ * Otherwise depth clipping is controlled by the state set in ++ * VkPipelineRasterizationDepthClipStateCreateInfoEXT." ++ * ++ * Note: neither depth clamping nor VK_EXT_depth_clip_enable are actually ++ * supported in the driver yet, so in practice we are always enabling Z ++ * clipping for now. ++ */ ++ bool z_clamp_enable = rs_info && rs_info->depthClampEnable; ++ bool z_clip_enable = false; ++ const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info = ++ ds_info ? vk_find_struct_const(ds_info->pNext, ++ PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) : ++ NULL; ++ if (clip_info) ++ z_clip_enable = clip_info->depthClipEnable; ++ else if (!z_clamp_enable) ++ z_clip_enable = true; ++ ++ if (z_clip_enable) { ++ config.z_clipping_mode = pipeline->negative_one_to_one ? 
++ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE; ++ } else { ++ config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE; ++ } ++ ++ config.z_clamp_mode = z_clamp_enable; ++ ++ config.depth_bounds_test_enable = ++ ds_info && ds_info->depthBoundsTestEnable && has_ds_attachment; ++#endif + }; + } + +@@ -360,7 +399,7 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline, + static void + pack_shader_state_record(struct v3dv_pipeline *pipeline) + { +- assert(sizeof(pipeline->shader_state_record) == ++ assert(sizeof(pipeline->shader_state_record) >= + cl_packet_length(GL_SHADER_STATE_RECORD)); + + struct v3d_fs_prog_data *prog_data_fs = +@@ -435,15 +474,16 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) + shader.number_of_varyings_in_fragment_shader = + prog_data_fs->num_inputs; + +- shader.coordinate_shader_propagate_nans = true; +- shader.vertex_shader_propagate_nans = true; +- shader.fragment_shader_propagate_nans = true; +- + /* Note: see previous note about addresses */ + /* shader.coordinate_shader_code_address */ + /* shader.vertex_shader_code_address */ + /* shader.fragment_shader_code_address */ + ++#if V3D_VERSION == 42 ++ shader.coordinate_shader_propagate_nans = true; ++ shader.vertex_shader_propagate_nans = true; ++ shader.fragment_shader_propagate_nans = true; ++ + /* FIXME: Use combined input/output size flag in the common case (also + * on v3d, see v3dx_draw). + */ +@@ -451,13 +491,25 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) + prog_data_vs_bin->separate_segments; + shader.vertex_shader_has_separate_input_and_output_vpm_blocks = + prog_data_vs->separate_segments; +- + shader.coordinate_shader_input_vpm_segment_size = + prog_data_vs_bin->separate_segments ? + prog_data_vs_bin->vpm_input_size : 1; + shader.vertex_shader_input_vpm_segment_size = + prog_data_vs->separate_segments ? + prog_data_vs->vpm_input_size : 1; ++#endif ++ ++ /* On V3D 7.1 there isn't a specific flag to set if we are using ++ * shared/separate segments or not. We just set the value of ++ * vpm_input_size to 0, and set output to the max needed. That should be ++ * already properly set on prog_data_vs_bin ++ */ ++#if V3D_VERSION == 71 ++ shader.coordinate_shader_input_vpm_segment_size = ++ prog_data_vs_bin->vpm_input_size; ++ shader.vertex_shader_input_vpm_segment_size = ++ prog_data_vs->vpm_input_size; ++#endif + + shader.coordinate_shader_output_vpm_segment_size = + prog_data_vs_bin->vpm_output_size; +@@ -659,3 +711,76 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, + } + } + } ++ ++#if V3D_VERSION == 42 ++static bool ++pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) ++{ ++ for (uint8_t i = 0; i < pipeline->va_count; i++) { ++ if (vk_format_is_int(pipeline->va[i].vk_format)) ++ return true; ++ } ++ return false; ++} ++#endif ++ ++bool ++v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline) ++{ ++#if V3D_VERSION == 42 ++ return pipeline_has_integer_vertex_attrib(pipeline); ++#endif ++ ++ return false; ++} ++ ++/* @pipeline can be NULL. In that case we assume the most common case. For ++ * example, for v42 we assume in that case that all the attributes have a ++ * float format (we only create an all-float BO once and we reuse it with all ++ * float pipelines), otherwise we look at the actual type of each attribute ++ * used with the specific pipeline passed in. 
++ */ ++struct v3dv_bo * ++v3dX(create_default_attribute_values)(struct v3dv_device *device, ++ struct v3dv_pipeline *pipeline) ++{ ++#if V3D_VERSION >= 71 ++ return NULL; ++#endif ++ ++ uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; ++ struct v3dv_bo *bo; ++ ++ bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); ++ ++ if (!bo) { ++ fprintf(stderr, "failed to allocate memory for the default " ++ "attribute values\n"); ++ return NULL; ++ } ++ ++ bool ok = v3dv_bo_map(device, bo, size); ++ if (!ok) { ++ fprintf(stderr, "failed to map default attribute values buffer\n"); ++ return NULL; ++ } ++ ++ uint32_t *attrs = bo->map; ++ uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; ++ for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { ++ attrs[i * 4 + 0] = 0; ++ attrs[i * 4 + 1] = 0; ++ attrs[i * 4 + 2] = 0; ++ VkFormat attr_format = ++ pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; ++ if (i < va_count && vk_format_is_int(attr_format)) { ++ attrs[i * 4 + 3] = 1; ++ } else { ++ attrs[i * 4 + 3] = fui(1.0); ++ } ++ } ++ ++ v3dv_bo_unmap(device, bo); ++ ++ return bo; ++} +diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h +index ad8ddfa5731c..0f5887eab937 100644 +--- a/src/broadcom/vulkan/v3dvx_private.h ++++ b/src/broadcom/vulkan/v3dvx_private.h +@@ -54,6 +54,9 @@ v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer); + void + v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer); + ++void ++v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer); ++ + void + v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer); + +@@ -125,17 +128,11 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color, + uint32_t internal_size, + uint32_t *hw_color); + +-void +-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, +- int rt, +- uint32_t *rt_bpp, +- uint32_t *rt_type, +- uint32_t *rt_clamp); +- + /* Used at v3dv_device */ + + void +-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, ++v3dX(pack_sampler_state)(const struct v3dv_device *device, ++ struct v3dv_sampler *sampler, + const VkSamplerCreateInfo *pCreateInfo, + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info); + +@@ -143,7 +140,9 @@ void + v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer, + const struct v3dv_cmd_buffer_attachment_state *attachments, + const struct v3dv_subpass *subpass, +- uint8_t *max_bpp, bool *msaa); ++ uint8_t *max_internal_bpp, ++ uint8_t *total_color_bpp, ++ bool *msaa); + + #ifdef DEBUG + void +@@ -313,10 +312,24 @@ void + v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, + const VkPipelineVertexInputStateCreateInfo *vi_info, + const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info); ++ ++bool ++v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline); ++ ++struct v3dv_bo * ++v3dX(create_default_attribute_values)(struct v3dv_device *device, ++ struct v3dv_pipeline *pipeline); ++ + /* Used at v3dv_queue */ + void + v3dX(job_emit_noop)(struct v3dv_job *job); + ++/* Used at v3dv_query */ ++VkResult ++v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount, ++ VkPerformanceCounterKHR *pCounters, ++ VkPerformanceCounterDescriptionKHR *pCounterDescriptions); ++ + /* Used at v3dv_descriptor_set, and other descriptor set utils */ + uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type); + +@@ -325,3 +338,21 @@ uint32_t v3dX(max_descriptor_bo_size)(void); + 
uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane); + + uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane); ++ ++/* General utils */ ++ ++uint32_t ++v3dX(clamp_for_format_and_type)(uint32_t rt_type, ++ VkFormat vk_format); ++ ++#define V3D42_CLIPPER_XY_GRANULARITY 256.0f ++#define V3D71_CLIPPER_XY_GRANULARITY 64.0f ++ ++uint32_t ++v3dX(clamp_for_format_and_type)(uint32_t rt_type, ++ VkFormat vk_format); ++ ++void ++v3dX(viewport_compute_xform)(const VkViewport *viewport, ++ float scale[3], ++ float translate[3]); +diff --git a/src/broadcom/vulkan/v3dvx_query.c b/src/broadcom/vulkan/v3dvx_query.c +new file mode 100644 +index 000000000000..e59a1e84ff6c +--- /dev/null ++++ b/src/broadcom/vulkan/v3dvx_query.c +@@ -0,0 +1,67 @@ ++/* ++ * Copyright © 2023 Raspberry Pi Ltd ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#include "v3dv_private.h" ++ ++#include "common/v3d_performance_counters.h" ++ ++VkResult ++v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount, ++ VkPerformanceCounterKHR *pCounters, ++ VkPerformanceCounterDescriptionKHR *pCounterDescriptions) ++{ ++ uint32_t desc_count = *pCounterCount; ++ ++ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, ++ out, pCounters, pCounterCount); ++ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, ++ out_desc, pCounterDescriptions, &desc_count); ++ ++ for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) { ++ vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { ++ counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; ++ counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; ++ counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; ++ ++ unsigned char sha1_result[20]; ++ _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME], ++ strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]), ++ sha1_result); ++ ++ memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); ++ } ++ ++ vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, ++ &out_desc, desc) { ++ desc->flags = 0; ++ snprintf(desc->name, sizeof(desc->name), "%s", ++ v3d_performance_counters[i][V3D_PERFCNT_NAME]); ++ snprintf(desc->category, sizeof(desc->category), "%s", ++ v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]); ++ snprintf(desc->description, sizeof(desc->description), "%s", ++ v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]); ++ } ++ } ++ ++ return vk_outarray_status(&out); ++} +diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c +index efe63de425c5..6eed2de9d543 100644 +--- a/src/broadcom/vulkan/v3dvx_queue.c ++++ b/src/broadcom/vulkan/v3dvx_queue.c +@@ -29,7 +29,8 @@ + void + v3dX(job_emit_noop)(struct v3dv_job *job) + { +- v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, V3D_INTERNAL_BPP_32, false); ++ v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, ++ V3D_INTERNAL_BPP_32, 4, false); + v3dX(job_emit_binning_flush)(job); + + struct v3dv_cl *rcl = &job->rcl; +@@ -42,14 +43,29 @@ v3dX(job_emit_noop)(struct v3dv_job *job) + config.image_height_pixels = 1; + config.number_of_render_targets = 1; + config.multisample_mode_4x = false; ++#if V3D_VERSION == 42 + config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32; ++#endif ++#if V3D_VERSION >= 71 ++ config.log2_tile_width = 3; /* Tile size 64 */ ++ config.log2_tile_height = 3; /* Tile size 64 */ ++#endif + } + ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32; + rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8; + rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.internal_bpp = V3D_INTERNAL_BPP_32; ++ rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8; ++ rt.stride = 1; /* Unused RT */ ++ } ++#endif + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { + clear.z_clear_value = 1.0f; +diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build +index dfa1e88097b9..289473d2ca13 100644 +--- a/src/gallium/drivers/v3d/meson.build ++++ b/src/gallium/drivers/v3d/meson.build +@@ -34,7 +34,6 @@ files_libv3d = files( + 'v3d_query.c', + 'v3d_query.h', + 'v3d_query_pipe.c', +- 'v3d_query_perfcnt.c', + 'v3d_resource.c', + 'v3d_resource.h', + 'v3d_screen.c', +@@ -47,8 +46,10 @@ 
files_per_version = files( + 'v3dx_emit.c', + 'v3dx_format_table.c', + 'v3dx_job.c', ++ 'v3dx_query_perfcnt.c', + 'v3dx_rcl.c', + 'v3dx_state.c', ++ 'v3dx_tfu.c', + ) + + v3d_args = ['-DV3D_BUILD_NEON'] +@@ -58,7 +59,7 @@ if dep_v3dv3.found() + v3d_args += '-DUSE_V3D_SIMULATOR' + endif + +-v3d_versions = ['33', '42'] ++v3d_versions = ['33', '42', '71'] + + per_version_libs = [] + foreach ver : v3d_versions +diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c +index 0260bdde6d1c..51ddc292ff73 100644 +--- a/src/gallium/drivers/v3d/v3d_blit.c ++++ b/src/gallium/drivers/v3d/v3d_blit.c +@@ -210,140 +210,6 @@ v3d_stencil_blit(struct pipe_context *ctx, struct pipe_blit_info *info) + info->mask &= ~PIPE_MASK_S; + } + +-static bool +-v3d_tfu(struct pipe_context *pctx, +- struct pipe_resource *pdst, +- struct pipe_resource *psrc, +- unsigned int src_level, +- unsigned int base_level, +- unsigned int last_level, +- unsigned int src_layer, +- unsigned int dst_layer, +- bool for_mipmap) +-{ +- struct v3d_context *v3d = v3d_context(pctx); +- struct v3d_screen *screen = v3d->screen; +- struct v3d_resource *src = v3d_resource(psrc); +- struct v3d_resource *dst = v3d_resource(pdst); +- struct v3d_resource_slice *src_base_slice = &src->slices[src_level]; +- struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level]; +- int msaa_scale = pdst->nr_samples > 1 ? 2 : 1; +- int width = u_minify(pdst->width0, base_level) * msaa_scale; +- int height = u_minify(pdst->height0, base_level) * msaa_scale; +- enum pipe_format pformat; +- +- if (psrc->format != pdst->format) +- return false; +- if (psrc->nr_samples != pdst->nr_samples) +- return false; +- +- /* Can't write to raster. */ +- if (dst_base_slice->tiling == V3D_TILING_RASTER) +- return false; +- +- /* When using TFU for blit, we are doing exact copies (both input and +- * output format must be the same, no scaling, etc), so there is no +- * pixel format conversions. Thus we can rewrite the format to use one +- * that is TFU compatible based on its texel size. +- */ +- if (for_mipmap) { +- pformat = pdst->format; +- } else { +- switch (dst->cpp) { +- case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break; +- case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break; +- case 4: pformat = PIPE_FORMAT_R32_FLOAT; break; +- case 2: pformat = PIPE_FORMAT_R16_FLOAT; break; +- case 1: pformat = PIPE_FORMAT_R8_UNORM; break; +- default: unreachable("unsupported format bit-size"); break; +- }; +- } +- +- uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat); +- struct v3d_device_info *devinfo = &screen->devinfo; +- +- if (!v3d_X(devinfo, tfu_supports_tex_format)(tex_format, for_mipmap)) { +- assert(for_mipmap); +- return false; +- } +- +- v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false); +- v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false); +- +- struct drm_v3d_submit_tfu tfu = { +- .ios = (height << 16) | width, +- .bo_handles = { +- dst->bo->handle, +- src != dst ? 
src->bo->handle : 0 +- }, +- .in_sync = v3d->out_sync, +- .out_sync = v3d->out_sync, +- }; +- uint32_t src_offset = (src->bo->offset + +- v3d_layer_offset(psrc, src_level, src_layer)); +- tfu.iia |= src_offset; +- if (src_base_slice->tiling == V3D_TILING_RASTER) { +- tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER << +- V3D33_TFU_ICFG_FORMAT_SHIFT); +- } else { +- tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE + +- (src_base_slice->tiling - V3D_TILING_LINEARTILE)) << +- V3D33_TFU_ICFG_FORMAT_SHIFT); +- } +- +- uint32_t dst_offset = (dst->bo->offset + +- v3d_layer_offset(pdst, base_level, dst_layer)); +- tfu.ioa |= dst_offset; +- if (last_level != base_level) +- tfu.ioa |= V3D33_TFU_IOA_DIMTW; +- tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE + +- (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) << +- V3D33_TFU_IOA_FORMAT_SHIFT); +- +- tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT; +- tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT; +- +- switch (src_base_slice->tiling) { +- case V3D_TILING_UIF_NO_XOR: +- case V3D_TILING_UIF_XOR: +- tfu.iis |= (src_base_slice->padded_height / +- (2 * v3d_utile_height(src->cpp))); +- break; +- case V3D_TILING_RASTER: +- tfu.iis |= src_base_slice->stride / src->cpp; +- break; +- case V3D_TILING_LINEARTILE: +- case V3D_TILING_UBLINEAR_1_COLUMN: +- case V3D_TILING_UBLINEAR_2_COLUMN: +- break; +- } +- +- /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the +- * OPAD field for the destination (how many extra UIF blocks beyond +- * those necessary to cover the height). When filling mipmaps, the +- * miplevel 1+ tiling state is inferred. +- */ +- if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR || +- dst_base_slice->tiling == V3D_TILING_UIF_XOR) { +- int uif_block_h = 2 * v3d_utile_height(dst->cpp); +- int implicit_padded_height = align(height, uif_block_h); +- +- tfu.icfg |= (((dst_base_slice->padded_height - +- implicit_padded_height) / uif_block_h) << +- V3D33_TFU_ICFG_OPAD_SHIFT); +- } +- +- int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu); +- if (ret != 0) { +- fprintf(stderr, "Failed to submit TFU job: %d\n", ret); +- return false; +- } +- +- dst->writes++; +- +- return true; +-} +- + bool + v3d_generate_mipmap(struct pipe_context *pctx, + struct pipe_resource *prsc, +@@ -362,12 +228,16 @@ v3d_generate_mipmap(struct pipe_context *pctx, + if (first_layer != last_layer) + return false; + +- return v3d_tfu(pctx, +- prsc, prsc, +- base_level, +- base_level, last_level, +- first_layer, first_layer, +- true); ++ struct v3d_context *v3d = v3d_context(pctx); ++ struct v3d_screen *screen = v3d->screen; ++ struct v3d_device_info *devinfo = &screen->devinfo; ++ ++ return v3d_X(devinfo, tfu)(pctx, ++ prsc, prsc, ++ base_level, ++ base_level, last_level, ++ first_layer, first_layer, ++ true); + } + + static void +@@ -396,11 +266,15 @@ v3d_tfu_blit(struct pipe_context *pctx, struct pipe_blit_info *info) + if (info->dst.format != info->src.format) + return; + +- if (v3d_tfu(pctx, info->dst.resource, info->src.resource, +- info->src.level, +- info->dst.level, info->dst.level, +- info->src.box.z, info->dst.box.z, +- false)) { ++ struct v3d_context *v3d = v3d_context(pctx); ++ struct v3d_screen *screen = v3d->screen; ++ struct v3d_device_info *devinfo = &screen->devinfo; ++ ++ if (v3d_X(devinfo, tfu)(pctx, info->dst.resource, info->src.resource, ++ info->src.level, ++ info->dst.level, info->dst.level, ++ info->src.box.z, info->dst.box.z, ++ false)) { + info->mask &= ~PIPE_MASK_RGBA; + } + } +@@ -495,7 +369,7 @@ 
v3d_tlb_blit(struct pipe_context *pctx, struct pipe_blit_info *info) + bool double_buffer = V3D_DBG(DOUBLE_BUFFER) && !msaa; + + uint32_t tile_width, tile_height, max_bpp; +- v3d_get_tile_buffer_size(msaa, double_buffer, ++ v3d_get_tile_buffer_size(devinfo, msaa, double_buffer, + is_color_blit ? 1 : 0, surfaces, src_surf, + &tile_width, &tile_height, &max_bpp); + +diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c +index f12e8c92139c..1dc4bd017fe7 100644 +--- a/src/gallium/drivers/v3d/v3d_context.c ++++ b/src/gallium/drivers/v3d/v3d_context.c +@@ -220,7 +220,8 @@ v3d_flag_dirty_sampler_state(struct v3d_context *v3d, + } + + void +-v3d_get_tile_buffer_size(bool is_msaa, ++v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo, ++ bool is_msaa, + bool double_buffer, + uint32_t nr_cbufs, + struct pipe_surface **cbufs, +@@ -232,11 +233,13 @@ v3d_get_tile_buffer_size(bool is_msaa, + assert(!is_msaa || !double_buffer); + + uint32_t max_cbuf_idx = 0; ++ uint32_t total_bpp = 0; + *max_bpp = 0; + for (int i = 0; i < nr_cbufs; i++) { + if (cbufs[i]) { + struct v3d_surface *surf = v3d_surface(cbufs[i]); + *max_bpp = MAX2(*max_bpp, surf->internal_bpp); ++ total_bpp += 4 * v3d_internal_bpp_words(surf->internal_bpp); + max_cbuf_idx = MAX2(i, max_cbuf_idx); + } + } +@@ -245,9 +248,11 @@ v3d_get_tile_buffer_size(bool is_msaa, + struct v3d_surface *bsurf = v3d_surface(bbuf); + assert(bbuf->texture->nr_samples <= 1 || is_msaa); + *max_bpp = MAX2(*max_bpp, bsurf->internal_bpp); ++ total_bpp += 4 * v3d_internal_bpp_words(bsurf->internal_bpp); + } + +- v3d_choose_tile_size(max_cbuf_idx + 1, *max_bpp, ++ v3d_choose_tile_size(devinfo, max_cbuf_idx + 1, ++ *max_bpp, total_bpp, + is_msaa, double_buffer, + tile_width, tile_height); + } +diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h +index 97850b0363eb..eb184b4b2036 100644 +--- a/src/gallium/drivers/v3d/v3d_context.h ++++ b/src/gallium/drivers/v3d/v3d_context.h +@@ -265,6 +265,7 @@ struct v3d_vertex_stateobj { + unsigned num_elements; + + uint8_t attrs[16 * (V3D_MAX_VS_INPUTS / 4)]; ++ /* defaults can be NULL for some hw generation */ + struct pipe_resource *defaults; + uint32_t defaults_offset; + }; +@@ -794,7 +795,8 @@ void v3d_ensure_prim_counts_allocated(struct v3d_context *ctx); + void v3d_flag_dirty_sampler_state(struct v3d_context *v3d, + enum pipe_shader_type shader); + +-void v3d_get_tile_buffer_size(bool is_msaa, ++void v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo, ++ bool is_msaa, + bool double_buffer, + uint32_t nr_cbufs, + struct pipe_surface **cbufs, +@@ -818,16 +820,52 @@ void v3d_disk_cache_store(struct v3d_context *v3d, + + /* Helper to call hw ver specific functions */ + #define v3d_X(devinfo, thing) ({ \ +- __typeof(&v3d42_##thing) v3d_X_thing; \ +- if ((devinfo)->ver >= 42) \ +- v3d_X_thing = &v3d42_##thing; \ +- else if ((devinfo)->ver >= 33) \ ++ __typeof(&v3d33_##thing) v3d_X_thing; \ ++ switch (devinfo->ver) { \ ++ case 33: \ ++ case 40: \ + v3d_X_thing = &v3d33_##thing; \ +- else \ ++ break; \ ++ case 42: \ ++ v3d_X_thing = &v3d42_##thing; \ ++ break; \ ++ case 71: \ ++ v3d_X_thing = &v3d71_##thing; \ ++ break; \ ++ default: \ + unreachable("Unsupported hardware generation"); \ ++ } \ + v3d_X_thing; \ + }) + ++/* FIXME: The same for vulkan/opengl. Common place? define it at the ++ * v3d_packet files? 
++ */ ++#define V3D33_CLIPPER_XY_GRANULARITY 256.0f ++#define V3D42_CLIPPER_XY_GRANULARITY 256.0f ++#define V3D71_CLIPPER_XY_GRANULARITY 64.0f ++ ++/* Helper to get hw-specific macro values */ ++#define V3DV_X(devinfo, thing) ({ \ ++ __typeof(V3D33_##thing) V3D_X_THING; \ ++ switch (devinfo->ver) { \ ++ case 33: \ ++ case 40: \ ++ V3D_X_THING = V3D33_##thing; \ ++ break; \ ++ case 41: \ ++ case 42: \ ++ V3D_X_THING = V3D42_##thing; \ ++ break; \ ++ case 71: \ ++ V3D_X_THING = V3D71_##thing; \ ++ break; \ ++ default: \ ++ unreachable("Unsupported hardware generation"); \ ++ } \ ++ V3D_X_THING; \ ++}) ++ + #ifdef v3dX + # include "v3dx_context.h" + #else +@@ -838,6 +876,10 @@ void v3d_disk_cache_store(struct v3d_context *v3d, + # define v3dX(x) v3d42_##x + # include "v3dx_context.h" + # undef v3dX ++ ++# define v3dX(x) v3d71_##x ++# include "v3dx_context.h" ++# undef v3dX + #endif + + #endif /* V3D_CONTEXT_H */ +diff --git a/src/gallium/drivers/v3d/v3d_job.c b/src/gallium/drivers/v3d/v3d_job.c +index b022ed45073e..577890a06c31 100644 +--- a/src/gallium/drivers/v3d/v3d_job.c ++++ b/src/gallium/drivers/v3d/v3d_job.c +@@ -383,9 +383,11 @@ v3d_get_job_for_fbo(struct v3d_context *v3d) + job->double_buffer = false; + } + +- v3d_get_tile_buffer_size(job->msaa, job->double_buffer, ++ v3d_get_tile_buffer_size(&v3d->screen->devinfo, ++ job->msaa, job->double_buffer, + job->nr_cbufs, job->cbufs, job->bbuf, +- &job->tile_width, &job->tile_height, ++ &job->tile_width, ++ &job->tile_height, + &job->internal_bpp); + + /* The dirty flags are tracking what's been updated while v3d->job has +diff --git a/src/gallium/drivers/v3d/v3d_query.c b/src/gallium/drivers/v3d/v3d_query.c +index db98c89625f5..83f82e44a3df 100644 +--- a/src/gallium/drivers/v3d/v3d_query.c ++++ b/src/gallium/drivers/v3d/v3d_query.c +@@ -28,8 +28,11 @@ v3d_get_driver_query_group_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_group_info *info) + { + struct v3d_screen *screen = v3d_screen(pscreen); ++ struct v3d_device_info *devinfo = &screen->devinfo; + +- return v3d_get_driver_query_group_info_perfcnt(screen, index, info); ++ return v3d_X(devinfo, get_driver_query_group_info_perfcnt)(screen, ++ index, ++ info); + } + + int +@@ -37,8 +40,11 @@ v3d_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info) + { + struct v3d_screen *screen = v3d_screen(pscreen); ++ struct v3d_device_info *devinfo = &screen->devinfo; + +- return v3d_get_driver_query_info_perfcnt(screen, index, info); ++ return v3d_X(devinfo, get_driver_query_info_perfcnt)(screen, ++ index, ++ info); + } + + static struct pipe_query * +@@ -53,9 +59,13 @@ static struct pipe_query * + v3d_create_batch_query(struct pipe_context *pctx, unsigned num_queries, + unsigned *query_types) + { +- return v3d_create_batch_query_perfcnt(v3d_context(pctx), +- num_queries, +- query_types); ++ struct v3d_context *v3d = v3d_context(pctx); ++ struct v3d_screen *screen = v3d->screen; ++ struct v3d_device_info *devinfo = &screen->devinfo; ++ ++ return v3d_X(devinfo, create_batch_query_perfcnt)(v3d_context(pctx), ++ num_queries, ++ query_types); + } + + static void +diff --git a/src/gallium/drivers/v3d/v3d_query.h b/src/gallium/drivers/v3d/v3d_query.h +index 3e1426b8d867..605ed1a12f9d 100644 +--- a/src/gallium/drivers/v3d/v3d_query.h ++++ b/src/gallium/drivers/v3d/v3d_query.h +@@ -42,11 +42,5 @@ struct v3d_query + }; + + struct pipe_query *v3d_create_query_pipe(struct v3d_context *v3d, unsigned query_type, unsigned index); 
+-struct pipe_query *v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries, +- unsigned *query_types); +-int v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index, +- struct pipe_driver_query_group_info *info); +-int v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index, +- struct pipe_driver_query_info *info); + + #endif /* V3D_QUERY_H */ +diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c +index 98ca9bb69e62..53bfb28924f4 100644 +--- a/src/gallium/drivers/v3d/v3d_screen.c ++++ b/src/gallium/drivers/v3d/v3d_screen.c +@@ -255,9 +255,8 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + return V3D_MAX_ARRAY_LAYERS; + +- /* Render targets. */ + case PIPE_CAP_MAX_RENDER_TARGETS: +- return 4; ++ return V3D_MAX_RENDER_TARGETS(screen->devinfo.ver); + + case PIPE_CAP_VENDOR_ID: + return 0x14E4; +diff --git a/src/gallium/drivers/v3d/v3d_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c +index 95eb838954f1..64c217d4f6c6 100644 +--- a/src/gallium/drivers/v3d/v3d_uniforms.c ++++ b/src/gallium/drivers/v3d/v3d_uniforms.c +@@ -261,6 +261,7 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job, + struct v3d_compiled_shader *shader, + enum pipe_shader_type stage) + { ++ struct v3d_device_info *devinfo = &v3d->screen->devinfo; + struct v3d_constbuf_stateobj *cb = &v3d->constbuf[stage]; + struct v3d_texture_stateobj *texstate = &v3d->tex[stage]; + struct v3d_uniform_list *uinfo = &shader->prog_data.base->uniforms; +@@ -292,13 +293,16 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job, + case QUNIFORM_UNIFORM: + cl_aligned_u32(&uniforms, gallium_uniforms[data]); + break; +- case QUNIFORM_VIEWPORT_X_SCALE: +- cl_aligned_f(&uniforms, v3d->viewport.scale[0] * 256.0f); ++ case QUNIFORM_VIEWPORT_X_SCALE: { ++ float clipper_xy_granularity = V3DV_X(devinfo, CLIPPER_XY_GRANULARITY); ++ cl_aligned_f(&uniforms, v3d->viewport.scale[0] * clipper_xy_granularity); + break; +- case QUNIFORM_VIEWPORT_Y_SCALE: +- cl_aligned_f(&uniforms, v3d->viewport.scale[1] * 256.0f); ++ } ++ case QUNIFORM_VIEWPORT_Y_SCALE: { ++ float clipper_xy_granularity = V3DV_X(devinfo, CLIPPER_XY_GRANULARITY); ++ cl_aligned_f(&uniforms, v3d->viewport.scale[1] * clipper_xy_granularity); + break; +- ++ } + case QUNIFORM_VIEWPORT_Z_OFFSET: + cl_aligned_f(&uniforms, v3d->viewport.translate[2]); + break; +diff --git a/src/gallium/drivers/v3d/v3dx_context.h b/src/gallium/drivers/v3d/v3dx_context.h +index 03d7c244ea2b..c487ac3b9965 100644 +--- a/src/gallium/drivers/v3d/v3dx_context.h ++++ b/src/gallium/drivers/v3d/v3dx_context.h +@@ -51,3 +51,23 @@ void v3dX(get_internal_type_bpp_for_output_format)(uint32_t format, + */ + bool v3dX(tfu_supports_tex_format)(uint32_t tex_format, + bool for_mipmap); ++ ++bool v3dX(tfu)(struct pipe_context *pctx, ++ struct pipe_resource *pdst, ++ struct pipe_resource *psrc, ++ unsigned int src_level, ++ unsigned int base_level, ++ unsigned int last_level, ++ unsigned int src_layer, ++ unsigned int dst_layer, ++ bool for_mipmap); ++ ++int v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen, ++ unsigned index, ++ struct pipe_driver_query_group_info *info); ++int v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen, ++ unsigned index, ++ struct pipe_driver_query_info *info); ++struct pipe_query *v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d, ++ unsigned num_queries, ++ unsigned 
*query_types); +diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c +index 17442500ea96..4e1af41d50e0 100644 +--- a/src/gallium/drivers/v3d/v3dx_draw.c ++++ b/src/gallium/drivers/v3d/v3dx_draw.c +@@ -95,7 +95,25 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) + #endif + + assert(!job->msaa || !job->double_buffer); +-#if V3D_VERSION >= 40 ++#if V3D_VERSION >= 71 ++ cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { ++ config.width_in_pixels = job->draw_width; ++ config.height_in_pixels = job->draw_height; ++ ++ config.log2_tile_width = log2_tile_size(job->tile_width); ++ config.log2_tile_height = log2_tile_size(job->tile_height); ++ ++ /* FIXME: ideally we would like next assert on the packet header (as it is ++ * general, so it also applies to GL). We would need to expand ++ * gen_pack_header for that. ++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); ++ } ++ ++#endif ++ ++#if V3D_VERSION >= 40 && V3D_VERSION <= 42 + cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { + config.width_in_pixels = job->draw_width; + config.height_in_pixels = job->draw_height; +@@ -107,7 +125,8 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) + + config.maximum_bpp_of_all_render_targets = job->internal_bpp; + } +-#else /* V3D_VERSION < 40 */ ++#endif ++#if V3D_VERSION < 40 + /* "Binning mode lists start with a Tile Binning Mode Configuration + * item (120)" + * +@@ -134,7 +153,7 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) + + config.maximum_bpp_of_all_render_targets = job->internal_bpp; + } +-#endif /* V3D_VERSION < 40 */ ++#endif + + /* There's definitely nothing in the VCD cache we want. */ + cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin); +@@ -377,7 +396,9 @@ v3d_emit_gs_state_record(struct v3d_job *job, + gs_bin->prog_data.gs->base.threads == 4; + shader.geometry_bin_mode_shader_start_in_final_thread_section = + gs_bin->prog_data.gs->base.single_seg; ++#if V3D_VERSION <= 42 + shader.geometry_bin_mode_shader_propagate_nans = true; ++#endif + shader.geometry_bin_mode_shader_uniforms_address = + gs_bin_uniforms; + +@@ -387,7 +408,9 @@ v3d_emit_gs_state_record(struct v3d_job *job, + gs->prog_data.gs->base.threads == 4; + shader.geometry_render_mode_shader_start_in_final_thread_section = + gs->prog_data.gs->base.single_seg; ++#if V3D_VERSION <= 42 + shader.geometry_render_mode_shader_propagate_nans = true; ++#endif + shader.geometry_render_mode_shader_uniforms_address = + gs_render_uniforms; + } +@@ -638,10 +661,6 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, + shader.number_of_varyings_in_fragment_shader = + v3d->prog.fs->prog_data.fs->num_inputs; + +- shader.coordinate_shader_propagate_nans = true; +- shader.vertex_shader_propagate_nans = true; +- shader.fragment_shader_propagate_nans = true; +- + shader.coordinate_shader_code_address = + cl_address(v3d_resource(v3d->prog.cs->resource)->bo, + v3d->prog.cs->offset); +@@ -652,6 +671,11 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, + cl_address(v3d_resource(v3d->prog.fs->resource)->bo, + v3d->prog.fs->offset); + ++#if V3D_VERSION <= 42 ++ shader.coordinate_shader_propagate_nans = true; ++ shader.vertex_shader_propagate_nans = true; ++ shader.fragment_shader_propagate_nans = true; ++ + /* XXX: Use combined input/output size flag in the common + * case. 
+ */ +@@ -659,13 +683,24 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, + v3d->prog.cs->prog_data.vs->separate_segments; + shader.vertex_shader_has_separate_input_and_output_vpm_blocks = + v3d->prog.vs->prog_data.vs->separate_segments; +- + shader.coordinate_shader_input_vpm_segment_size = + v3d->prog.cs->prog_data.vs->separate_segments ? + v3d->prog.cs->prog_data.vs->vpm_input_size : 1; + shader.vertex_shader_input_vpm_segment_size = + v3d->prog.vs->prog_data.vs->separate_segments ? + v3d->prog.vs->prog_data.vs->vpm_input_size : 1; ++#endif ++ /* On V3D 7.1 there isn't a specific flag to set if we are using ++ * shared/separate segments or not. We just set the value of ++ * vpm_input_size to 0, and set output to the max needed. That should be ++ * already properly set on prog_data_vs_bin ++ */ ++#if V3D_VERSION == 71 ++ shader.coordinate_shader_input_vpm_segment_size = ++ v3d->prog.cs->prog_data.vs->vpm_input_size; ++ shader.vertex_shader_input_vpm_segment_size = ++ v3d->prog.vs->prog_data.vs->vpm_input_size; ++#endif + + shader.coordinate_shader_output_vpm_segment_size = + v3d->prog.cs->prog_data.vs->vpm_output_size; +@@ -724,9 +759,11 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, + shader.instance_id_read_by_vertex_shader = + v3d->prog.vs->prog_data.vs->uses_iid; + ++#if V3D_VERSION <= 42 + shader.address_of_default_attribute_values = + cl_address(v3d_resource(vtx->defaults)->bo, + vtx->defaults_offset); ++#endif + } + + bool cs_loaded_any = false; +@@ -1436,8 +1473,15 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) + submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT; + + +- /* Number of batches the dispatch will invoke (minus 1). */ +- submit.cfg[4] = num_batches - 1; ++ /* Number of batches the dispatch will invoke. ++ * V3D 7.1.6 and later don't subtract 1 from the number of batches ++ */ ++ if (v3d->screen->devinfo.ver < 71 || ++ (v3d->screen->devinfo.ver == 71 && v3d->screen->devinfo.rev < 6)) { ++ submit.cfg[4] = num_batches - 1; ++ } else { ++ submit.cfg[4] = num_batches; ++ } + + /* Make sure we didn't accidentally underflow. */ + assert(submit.cfg[4] != ~0); +@@ -1445,7 +1489,8 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) + v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo); + submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset + + v3d->prog.compute->offset); +- submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; ++ if (v3d->screen->devinfo.ver < 71) ++ submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; + if (v3d->prog.compute->prog_data.base->single_seg) + submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; + if (v3d->prog.compute->prog_data.base->threads == 4) +@@ -1560,9 +1605,10 @@ v3d_tlb_clear(struct v3d_job *job, unsigned buffers, + /* GFXH-1461: If we were to emit a load of just depth or just stencil, + * then the clear for the other may get lost. We need to decide now + * if it would be possible to need to emit a load of just one after +- * we've set up our TLB clears. ++ * we've set up our TLB clears. This issue is fixed since V3D 4.3.18. 
+ */ +- if (buffers & PIPE_CLEAR_DEPTHSTENCIL && ++ if (v3d->screen->devinfo.ver <= 42 && ++ buffers & PIPE_CLEAR_DEPTHSTENCIL && + (buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL && + job->zsbuf && + util_format_is_depth_and_stencil(job->zsbuf->texture->format)) { +diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c +index 0ad3fb68b1e2..ee17b935e196 100644 +--- a/src/gallium/drivers/v3d/v3dx_emit.c ++++ b/src/gallium/drivers/v3d/v3dx_emit.c +@@ -512,13 +512,17 @@ v3dX(emit_state)(struct pipe_context *pctx) + /* Note: EZ state may update based on the compiled FS, + * along with ZSA + */ ++#if V3D_VERSION <= 42 + config.early_z_updates_enable = + (job->ez_state != V3D_EZ_DISABLED); ++#endif + if (v3d->zsa->base.depth_enabled) { + config.z_updates_enable = + v3d->zsa->base.depth_writemask; ++#if V3D_VERSION <= 42 + config.early_z_enable = + config.early_z_updates_enable; ++#endif + config.depth_test_function = + v3d->zsa->base.depth_func; + } else { +@@ -535,13 +539,28 @@ v3dX(emit_state)(struct pipe_context *pctx) + v3d_line_smoothing_enabled(v3d) ? + V3D_LINE_RASTERIZATION_PERP_END_CAPS : + V3D_LINE_RASTERIZATION_DIAMOND_EXIT; +- } + ++#if V3D_VERSION >= 71 ++ /* The following follows the logic implemented in v3dv ++ * plus the definition of depth_clip_near/far and ++ * depth_clamp. ++ * ++ * Note: some extensions are not supported by v3d ++ * (like ARB_depth_clamp) that would affect this, but ++ * the values on rasterizer are taking that into ++ * account. ++ */ ++ config.z_clipping_mode = v3d->rasterizer->base.depth_clip_near || ++ v3d->rasterizer->base.depth_clip_far ? ++ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_NONE; ++#endif ++ } + } + + if (v3d->dirty & V3D_DIRTY_RASTERIZER && + v3d->rasterizer->base.offset_tri) { +- if (job->zsbuf && ++ if (v3d->screen->devinfo.ver <= 42 && ++ job->zsbuf && + job->zsbuf->format == PIPE_FORMAT_Z16_UNORM) { + cl_emit_prepacked_sized(&job->bcl, + v3d->rasterizer->depth_offset_z16, +@@ -564,12 +583,23 @@ v3dX(emit_state)(struct pipe_context *pctx) + } + + if (v3d->dirty & V3D_DIRTY_VIEWPORT) { ++#if V3D_VERSION <= 42 + cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { + clip.viewport_half_width_in_1_256th_of_pixel = + v3d->viewport.scale[0] * 256.0f; + clip.viewport_half_height_in_1_256th_of_pixel = + v3d->viewport.scale[1] * 256.0f; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { ++ clip.viewport_half_width_in_1_64th_of_pixel = ++ v3d->viewport.scale[0] * 64.0f; ++ clip.viewport_half_height_in_1_64th_of_pixel = ++ v3d->viewport.scale[1] * 64.0f; ++ } ++#endif ++ + + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { + clip.viewport_z_offset_zc_to_zs = +@@ -633,8 +663,10 @@ v3dX(emit_state)(struct pipe_context *pctx) + } + #endif + ++ const uint32_t max_rts = ++ V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver); + if (blend->base.independent_blend_enable) { +- for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) ++ for (int i = 0; i < max_rts; i++) + emit_rt_blend(v3d, job, &blend->base, i, + (1 << i), + v3d->blend_dst_alpha_one & (1 << i)); +@@ -650,16 +682,16 @@ v3dX(emit_state)(struct pipe_context *pctx) + * RTs without. 
+ */ + emit_rt_blend(v3d, job, &blend->base, 0, +- ((1 << V3D_MAX_DRAW_BUFFERS) - 1) & ++ ((1 << max_rts) - 1) & + v3d->blend_dst_alpha_one, + true); + emit_rt_blend(v3d, job, &blend->base, 0, +- ((1 << V3D_MAX_DRAW_BUFFERS) - 1) & ++ ((1 << max_rts) - 1) & + ~v3d->blend_dst_alpha_one, + false); + } else { + emit_rt_blend(v3d, job, &blend->base, 0, +- (1 << V3D_MAX_DRAW_BUFFERS) - 1, ++ (1 << max_rts) - 1, + v3d->blend_dst_alpha_one); + } + } +@@ -668,8 +700,10 @@ v3dX(emit_state)(struct pipe_context *pctx) + if (v3d->dirty & V3D_DIRTY_BLEND) { + struct pipe_blend_state *blend = &v3d->blend->base; + ++ const uint32_t max_rts = ++ V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver); + cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { +- for (int i = 0; i < 4; i++) { ++ for (int i = 0; i < max_rts; i++) { + int rt = blend->independent_blend_enable ? i : 0; + int rt_mask = blend->rt[rt].colormask; + +diff --git a/src/gallium/drivers/v3d/v3d_query_perfcnt.c b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c +similarity index 94% +rename from src/gallium/drivers/v3d/v3d_query_perfcnt.c +rename to src/gallium/drivers/v3d/v3dx_query_perfcnt.c +index e00d84e375f0..431aad14b4fa 100644 +--- a/src/gallium/drivers/v3d/v3d_query_perfcnt.c ++++ b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c +@@ -52,8 +52,8 @@ kperfmon_destroy(struct v3d_context *v3d, struct v3d_perfmon_state *perfmon) + } + + int +-v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index, +- struct pipe_driver_query_group_info *info) ++v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen, unsigned index, ++ struct pipe_driver_query_group_info *info) + { + if (!screen->has_perfmon) + return 0; +@@ -72,8 +72,8 @@ v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned inde + } + + int +-v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index, +- struct pipe_driver_query_info *info) ++v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen, unsigned index, ++ struct pipe_driver_query_info *info) + { + if (!screen->has_perfmon) + return 0; +@@ -222,8 +222,8 @@ static const struct v3d_query_funcs perfcnt_query_funcs = { + }; + + struct pipe_query * +-v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries, +- unsigned *query_types) ++v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d, unsigned num_queries, ++ unsigned *query_types) + { + struct v3d_query_perfcnt *pquery = NULL; + struct v3d_query *query; +diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c +index 82547437c252..8eabc5ea6263 100644 +--- a/src/gallium/drivers/v3d/v3dx_rcl.c ++++ b/src/gallium/drivers/v3d/v3dx_rcl.c +@@ -23,8 +23,9 @@ + + #include "util/format/u_format.h" + #include "v3d_context.h" +-#include "broadcom/common/v3d_tiling.h" + #include "broadcom/common/v3d_macros.h" ++#include "broadcom/common/v3d_tiling.h" ++#include "broadcom/common/v3d_util.h" + #include "broadcom/cle/v3dx_pack.h" + + #define PIPE_CLEAR_COLOR_BUFFERS (PIPE_CLEAR_COLOR0 | \ +@@ -419,10 +420,16 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer) + * clearing Z/S. 
+ */
+ if (job->clear) {
++#if V3D_VERSION <= 42
+ cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
+ clear.clear_z_stencil_buffer = !job->early_zs_clear;
+ clear.clear_all_render_targets = true;
+ }
++#endif
++#if V3D_VERSION >= 71
++ cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
++#endif
++
+ }
+ #endif /* V3D_VERSION >= 40 */
+ }
+@@ -483,10 +490,66 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer)
+ }
+ }
+
+-#if V3D_VERSION >= 40
++#if V3D_VERSION > 33
++/* Note that for v71, render target cfg packets have just one field that
++ * combines the internal type and clamp mode. For simplicity we keep just one
++ * helper.
++ *
++ * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
++ *
++ */
++static uint32_t
++v3dX(clamp_for_format_and_type)(uint32_t rt_type,
++ enum pipe_format format)
++{
++#if V3D_VERSION >= 40 && V3D_VERSION <= 42
++ if (util_format_is_srgb(format)) {
++ return V3D_RENDER_TARGET_CLAMP_NORM;
++#if V3D_VERSION >= 42
++ } else if (util_format_is_pure_integer(format)) {
++ return V3D_RENDER_TARGET_CLAMP_INT;
++#endif
++ } else {
++ return V3D_RENDER_TARGET_CLAMP_NONE;
++ }
++#endif
++#if V3D_VERSION >= 71
++ switch (rt_type) {
++ case V3D_INTERNAL_TYPE_8I:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
++ case V3D_INTERNAL_TYPE_8UI:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
++ case V3D_INTERNAL_TYPE_8:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_8;
++ case V3D_INTERNAL_TYPE_16I:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
++ case V3D_INTERNAL_TYPE_16UI:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
++ case V3D_INTERNAL_TYPE_16F:
++ return util_format_is_srgb(format) ?
++ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
++ V3D_RENDER_TARGET_TYPE_CLAMP_16F;
++ case V3D_INTERNAL_TYPE_32I:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
++ case V3D_INTERNAL_TYPE_32UI:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
++ case V3D_INTERNAL_TYPE_32F:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
++ default:
++ unreachable("Unknown internal render target type");
++ }
++ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
++#endif
++ return 0;
++}
++#endif
++
++#if V3D_VERSION >= 71
+ static void
+-v3d_setup_render_target(struct v3d_job *job, int cbuf,
+- uint32_t *rt_bpp, uint32_t *rt_type, uint32_t *rt_clamp)
++v3d_setup_render_target(struct v3d_job *job,
++ int cbuf,
++ uint32_t *rt_bpp,
++ uint32_t *rt_type_clamp)
+ {
+ if (!job->cbufs[cbuf])
+ return;
+@@ -497,19 +560,35 @@ v3d_setup_render_target(struct v3d_job *job, int cbuf,
+ struct v3d_surface *surf = v3d_surface(job->cbufs[cbuf]);
+ *rt_bpp = surf->internal_bpp;
+ if (job->bbuf) {
+ struct v3d_surface *bsurf = v3d_surface(job->bbuf);
+ *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp);
+ }
+- *rt_type = surf->internal_type;
+- if (util_format_is_srgb(surf->base.format))
+- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
+-#if V3D_VERSION >= 42
+- else if (util_format_is_pure_integer(surf->base.format))
+- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
+-#endif
+- else
+- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
++ *rt_type_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type,
++ surf->base.format);
+ }
++#endif
+
+-#else /* V3D_VERSION < 40 */
++#if V3D_VERSION >= 40 && V3D_VERSION <= 42
++static void
++v3d_setup_render_target(struct v3d_job *job,
++ int cbuf,
++ uint32_t *rt_bpp,
++ uint32_t *rt_type,
++ uint32_t *rt_clamp)
++{
++ if (!job->cbufs[cbuf])
++ return;
++
++ struct v3d_surface *surf = v3d_surface(job->cbufs[cbuf]);
++ *rt_bpp = surf->internal_bpp;
++ if (job->bbuf) {
++ struct v3d_surface *bsurf = v3d_surface(job->bbuf);
++ *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp);
++ }
++ *rt_type = surf->internal_type;
++ *rt_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type,
++ surf->base.format);
++}
++#endif
+
++#if V3D_VERSION < 40
+ static void
+ v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf,
+ struct v3d_resource *rsc, bool is_separate_stencil)
+@@ -656,7 +735,8 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
+ cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+ store.buffer_to_store = NONE;
+ }
+-#else
++#endif
++#if V3D_VERSION >= 40
+ for (int i = 0; i < 2; i++) {
+ if (i > 0)
+ cl_emit(&job->rcl, TILE_COORDINATES, coords);
+@@ -664,16 +744,20 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
+ cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+ store.buffer_to_store = NONE;
+ }
++
+ if (i == 0 || do_double_initial_tile_clear(job)) {
++#if V3D_VERSION < 71
+ cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
+ clear.clear_z_stencil_buffer = !job->early_zs_clear;
+ clear.clear_all_render_targets = true;
+ }
++#else
++ cl_emit(&job->rcl, CLEAR_RENDER_TARGETS, clear);
++#endif
+ }
+ cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
+ }
+ #endif
+-
+ cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
+
+ v3d_rcl_emit_generic_per_tile_list(job, layer);
+@@ -775,18 +859,52 @@ v3dX(emit_rcl)(struct v3d_job *job)
+ config.multisample_mode_4x = job->msaa;
+ config.double_buffer_in_non_ms_mode = job->double_buffer;
+
++#if V3D_VERSION <= 42
+ config.maximum_bpp_of_all_render_targets = job->internal_bpp;
++#endif
++#if V3D_VERSION >= 71
++ config.log2_tile_width = log2_tile_size(job->tile_width);
++ config.log2_tile_height = log2_tile_size(job->tile_height);
++
++ /* FIXME: ideally we would like the next assert on the packet header (as it
++ * is generic, so it also applies to GL). We would need to expand
++ * gen_pack_header for that.
++ */
++ assert(config.log2_tile_width == config.log2_tile_height ||
++ config.log2_tile_width == config.log2_tile_height + 1);
++#endif
++
+ }
+
++#if V3D_VERSION >= 71
++ uint32_t base_addr = 0;
++
++ /* If we don't have any color RTs, we still need to emit one and flag
++ * it as unused using stride = 1.
++ */
++ if (job->nr_cbufs == 0) {
++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++ rt.stride = 1; /* Unused */
++ }
++ }
++#endif
+ for (int i = 0; i < job->nr_cbufs; i++) {
+ struct pipe_surface *psurf = job->cbufs[i];
+- if (!psurf)
++ if (!psurf) {
++#if V3D_VERSION >= 71
++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++ rt.render_target_number = i;
++ rt.stride = 1; /* Unused */
++ }
++#endif
+ continue;
++ }
++
+ struct v3d_surface *surf = v3d_surface(psurf);
+ struct v3d_resource *rsc = v3d_resource(psurf->texture);
+
+ UNUSED uint32_t config_pad = 0;
+- uint32_t clear_pad = 0;
++ UNUSED uint32_t clear_pad = 0;
+
+ /* XXX: Set the pad for raster.
*/ + if (surf->tiling == V3D_TILING_UIF_NO_XOR || +@@ -819,6 +937,7 @@ v3dX(emit_rcl)(struct v3d_job *job) + } + #endif /* V3D_VERSION < 40 */ + ++#if V3D_VERSION <= 42 + cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, + clear) { + clear.clear_color_low_32_bits = job->clear_color[i][0]; +@@ -847,9 +966,42 @@ v3dX(emit_rcl)(struct v3d_job *job) + clear.render_target_number = i; + }; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.clear_color_low_bits = job->clear_color[i][0]; ++ v3d_setup_render_target(job, i, &rt.internal_bpp, ++ &rt.internal_type_and_clamping); ++ rt.stride = ++ v3d_compute_rt_row_row_stride_128_bits(job->tile_width, ++ v3d_internal_bpp_words(rt.internal_bpp)); ++ rt.base_address = base_addr; ++ rt.render_target_number = i; ++ ++ base_addr += (job->tile_height * rt.stride) / 8; ++ } ++ ++ if (surf->internal_bpp >= V3D_INTERNAL_BPP_64) { ++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { ++ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ ++ ((uint64_t) job->clear_color[i][1]) | ++ (((uint64_t) (job->clear_color[i][2] & 0xff)) << 32); ++ rt.render_target_number = i; ++ } ++ } ++ ++ if (surf->internal_bpp >= V3D_INTERNAL_BPP_128) { ++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { ++ rt.clear_color_top_bits = /* 56 bits (24 + 32) */ ++ (((uint64_t) (job->clear_color[i][2] & 0xffffff00)) >> 8) | ++ (((uint64_t) (job->clear_color[i][3])) << 24); ++ rt.render_target_number = i; ++ } ++ } ++#endif + } + +-#if V3D_VERSION >= 40 ++#if V3D_VERSION >= 40 && V3D_VERSION <= 42 + cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + v3d_setup_render_target(job, 0, + &rt.render_target_0_internal_bpp, +diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c +index 0f1735fee666..032a6643fcdc 100644 +--- a/src/gallium/drivers/v3d/v3dx_state.c ++++ b/src/gallium/drivers/v3d/v3dx_state.c +@@ -111,9 +111,10 @@ v3d_create_rasterizer_state(struct pipe_context *pctx, + #endif + } + +- /* The HW treats polygon offset units based on a Z24 buffer, so we ++ /* V3d 4.x treats polygon offset units based on a Z24 buffer, so we + * need to scale up offset_units if we're only Z16. + */ ++#if V3D_VERSION <= 42 + v3dx_pack(&so->depth_offset_z16, DEPTH_OFFSET, depth) { + depth.depth_offset_factor = cso->offset_scale; + depth.depth_offset_units = cso->offset_units * 256.0; +@@ -121,6 +122,7 @@ v3d_create_rasterizer_state(struct pipe_context *pctx, + depth.limit = cso->offset_clamp; + #endif + } ++#endif + + return so; + } +@@ -138,8 +140,9 @@ v3d_create_blend_state(struct pipe_context *pctx, + + so->base = *cso; + ++ uint32_t max_rts = V3D_MAX_RENDER_TARGETS(V3D_VERSION); + if (cso->independent_blend_enable) { +- for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { ++ for (int i = 0; i < max_rts; i++) { + so->blend_enables |= cso->rt[i].blend_enable << i; + + /* V3D 4.x is when we got independent blend enables. 
*/ +@@ -148,7 +151,7 @@ v3d_create_blend_state(struct pipe_context *pctx, + } + } else { + if (cso->rt[0].blend_enable) +- so->blend_enables = (1 << V3D_MAX_DRAW_BUFFERS) - 1; ++ so->blend_enables = (1 << max_rts) - 1; + } + + return so; +@@ -337,6 +340,20 @@ v3d_zsa_state_bind(struct pipe_context *pctx, void *hwcso) + v3d->dirty |= V3D_DIRTY_ZSA; + } + ++ ++static bool ++needs_default_attribute_values(void) ++{ ++#if V3D_VERSION <= 42 ++ /* FIXME: on vulkan we are able to refine even further, as we know in ++ * advance when we create the pipeline if we have an integer vertex ++ * attrib. Pending to check if we could do something similar here. ++ */ ++ return true; ++#endif ++ return false; ++} ++ + static void * + v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, + const struct pipe_vertex_element *elements) +@@ -414,24 +431,29 @@ v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, + } + } + +- /* Set up the default attribute values in case any of the vertex +- * elements use them. +- */ +- uint32_t *attrs; +- u_upload_alloc(v3d->state_uploader, 0, +- V3D_MAX_VS_INPUTS * sizeof(float), 16, +- &so->defaults_offset, &so->defaults, (void **)&attrs); +- +- for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) { +- attrs[i * 4 + 0] = 0; +- attrs[i * 4 + 1] = 0; +- attrs[i * 4 + 2] = 0; +- if (i < so->num_elements && +- util_format_is_pure_integer(so->pipe[i].src_format)) { +- attrs[i * 4 + 3] = 1; +- } else { +- attrs[i * 4 + 3] = fui(1.0); ++ if (needs_default_attribute_values()) { ++ /* Set up the default attribute values in case any of the vertex ++ * elements use them. ++ */ ++ uint32_t *attrs; ++ u_upload_alloc(v3d->state_uploader, 0, ++ V3D_MAX_VS_INPUTS * sizeof(float), 16, ++ &so->defaults_offset, &so->defaults, (void **)&attrs); ++ ++ for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) { ++ attrs[i * 4 + 0] = 0; ++ attrs[i * 4 + 1] = 0; ++ attrs[i * 4 + 2] = 0; ++ if (i < so->num_elements && ++ util_format_is_pure_integer(so->pipe[i].src_format)) { ++ attrs[i * 4 + 3] = 1; ++ } else { ++ attrs[i * 4 + 3] = fui(1.0); ++ } + } ++ } else { ++ so->defaults = NULL; ++ so->defaults_offset = 0; + } + + u_upload_unmap(v3d->state_uploader); +@@ -699,21 +721,22 @@ v3d_upload_sampler_state_variant(void *map, + break; + } + +- if (variant >= V3D_SAMPLER_STATE_32) { +- sampler.border_color_word_0 = border.ui[0]; +- sampler.border_color_word_1 = border.ui[1]; +- sampler.border_color_word_2 = border.ui[2]; +- sampler.border_color_word_3 = border.ui[3]; +- } else { +- sampler.border_color_word_0 = +- _mesa_float_to_half(border.f[0]); +- sampler.border_color_word_1 = +- _mesa_float_to_half(border.f[1]); +- sampler.border_color_word_2 = +- _mesa_float_to_half(border.f[2]); +- sampler.border_color_word_3 = +- _mesa_float_to_half(border.f[3]); ++#if V3D_VERSION <= 42 ++ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions ++ * for us. In V3D 4.x we need to manually convert floating point color ++ * values to the expected format. 
++ */ ++ if (variant < V3D_SAMPLER_STATE_32) { ++ border.ui[0] = _mesa_float_to_half(border.f[0]); ++ border.ui[1] = _mesa_float_to_half(border.f[1]); ++ border.ui[2] = _mesa_float_to_half(border.f[2]); ++ border.ui[3] = _mesa_float_to_half(border.f[3]); + } ++#endif ++ sampler.border_color_word_0 = border.ui[0]; ++ sampler.border_color_word_1 = border.ui[1]; ++ sampler.border_color_word_2 = border.ui[2]; ++ sampler.border_color_word_3 = border.ui[3]; + } + } + } +@@ -869,7 +892,8 @@ v3d_setup_texture_shader_state_from_buffer(struct V3DX(TEXTURE_SHADER_STATE) *te + } + + static void +-v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex, ++v3d_setup_texture_shader_state(const struct v3d_device_info *devinfo, ++ struct V3DX(TEXTURE_SHADER_STATE) *tex, + struct pipe_resource *prsc, + int base_level, int last_level, + int first_layer, int last_layer, +@@ -917,19 +941,29 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex, + } + + tex->base_level = base_level; ++ + #if V3D_VERSION >= 40 + tex->max_level = last_level; + /* Note that we don't have a job to reference the texture's sBO + * at state create time, so any time this sampler view is used + * we need to add the texture to the job. + */ +- tex->texture_base_pointer = +- cl_address(NULL, +- rsc->bo->offset + +- v3d_layer_offset(prsc, 0, first_layer)); ++ const uint32_t base_offset = rsc->bo->offset + ++ v3d_layer_offset(prsc, 0, first_layer); ++ ++ tex->texture_base_pointer = cl_address(NULL, base_offset); + #endif ++ + tex->array_stride_64_byte_aligned = rsc->cube_map_stride / 64; + ++#if V3D_VERSION >= 71 ++ tex->chroma_offset_x = 1; ++ tex->chroma_offset_y = 1; ++ /* See comment in XML field definition for rationale of the shifts */ ++ tex->texture_base_pointer_cb = base_offset >> 6; ++ tex->texture_base_pointer_cr = base_offset >> 6; ++#endif ++ + /* Since other platform devices may produce UIF images even + * when they're not big enough for V3D to assume they're UIF, + * we force images with level 0 as UIF to be always treated +@@ -977,7 +1011,8 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, + + v3dx_pack(map, TEXTURE_SHADER_STATE, tex) { + if (prsc->target != PIPE_BUFFER) { +- v3d_setup_texture_shader_state(&tex, prsc, ++ v3d_setup_texture_shader_state(&v3d->screen->devinfo, ++ &tex, prsc, + cso->u.tex.first_level, + cso->u.tex.last_level, + cso->u.tex.first_layer, +@@ -990,7 +1025,13 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, + cso->u.buf.size); + } + +- tex.srgb = util_format_is_srgb(cso->format); ++ bool is_srgb = util_format_is_srgb(cso->format); ++#if V3D_VERSION <= 42 ++ tex.srgb = is_srgb; ++#endif ++#if V3D_VERSION >= 71 ++ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; ++#endif + + #if V3D_VERSION >= 40 + tex.swizzle_r = v3d_translate_pipe_swizzle(so->swizzle[0]); +@@ -1040,7 +1081,10 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, + * shader code if we wanted to read an MSAA sRGB + * texture without sRGB decode. 
+ */
++#if V3D_VERSION <= 42
+ tex.srgb = false;
++#endif
++
+ } else {
+ tex.texture_type = v3d_get_tex_format(&screen->devinfo,
+ cso->format);
+@@ -1404,7 +1448,8 @@ v3d_create_image_view_texture_shader_state(struct v3d_context *v3d,
+
+ v3dx_pack(map, TEXTURE_SHADER_STATE, tex) {
+ if (prsc->target != PIPE_BUFFER) {
+- v3d_setup_texture_shader_state(&tex, prsc,
++ v3d_setup_texture_shader_state(&v3d->screen->devinfo,
++ &tex, prsc,
+ iview->base.u.tex.level,
+ iview->base.u.tex.level,
+ iview->base.u.tex.first_layer,
+diff --git a/src/gallium/drivers/v3d/v3dx_tfu.c b/src/gallium/drivers/v3d/v3dx_tfu.c
+new file mode 100644
+index 000000000000..f4dba0cfa485
+--- /dev/null
++++ b/src/gallium/drivers/v3d/v3dx_tfu.c
+@@ -0,0 +1,202 @@
++/*
++ * Copyright © 2022 Raspberry Pi Ltd
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "v3d_context.h"
++#include "broadcom/common/v3d_tfu.h"
++
++bool
++v3dX(tfu)(struct pipe_context *pctx,
++ struct pipe_resource *pdst,
++ struct pipe_resource *psrc,
++ unsigned int src_level,
++ unsigned int base_level,
++ unsigned int last_level,
++ unsigned int src_layer,
++ unsigned int dst_layer,
++ bool for_mipmap)
++{
++ struct v3d_context *v3d = v3d_context(pctx);
++ struct v3d_screen *screen = v3d->screen;
++ struct v3d_resource *src = v3d_resource(psrc);
++ struct v3d_resource *dst = v3d_resource(pdst);
++ struct v3d_resource_slice *src_base_slice = &src->slices[src_level];
++ struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level];
++ int msaa_scale = pdst->nr_samples > 1 ? 2 : 1;
++ int width = u_minify(pdst->width0, base_level) * msaa_scale;
++ int height = u_minify(pdst->height0, base_level) * msaa_scale;
++ enum pipe_format pformat;
++
++ if (psrc->format != pdst->format)
++ return false;
++ if (psrc->nr_samples != pdst->nr_samples)
++ return false;
++
++ if (pdst->target != PIPE_TEXTURE_2D || psrc->target != PIPE_TEXTURE_2D)
++ return false;
++
++ /* Can't write to raster. */
++ if (dst_base_slice->tiling == V3D_TILING_RASTER)
++ return false;
++
++ /* When using the TFU for blits, we are doing exact copies (both input and
++ * output formats must be the same, no scaling, etc.), so there are no
++ * pixel format conversions. Thus we can rewrite the format to use one
++ * that is TFU compatible based on its texel size.
++ */ ++ if (for_mipmap) { ++ pformat = pdst->format; ++ } else { ++ switch (dst->cpp) { ++ case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break; ++ case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break; ++ case 4: pformat = PIPE_FORMAT_R32_FLOAT; break; ++ case 2: pformat = PIPE_FORMAT_R16_FLOAT; break; ++ case 1: pformat = PIPE_FORMAT_R8_UNORM; break; ++ default: unreachable("unsupported format bit-size"); break; ++ }; ++ } ++ ++ uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat); ++ ++ if (!v3dX(tfu_supports_tex_format)(tex_format, for_mipmap)) { ++ assert(for_mipmap); ++ return false; ++ } ++ ++ v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false); ++ v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false); ++ ++ struct drm_v3d_submit_tfu tfu = { ++ .ios = (height << 16) | width, ++ .bo_handles = { ++ dst->bo->handle, ++ src != dst ? src->bo->handle : 0 ++ }, ++ .in_sync = v3d->out_sync, ++ .out_sync = v3d->out_sync, ++ }; ++ uint32_t src_offset = (src->bo->offset + ++ v3d_layer_offset(psrc, src_level, src_layer)); ++ tfu.iia |= src_offset; ++ ++ uint32_t dst_offset = (dst->bo->offset + ++ v3d_layer_offset(pdst, base_level, dst_layer)); ++ tfu.ioa |= dst_offset; ++ ++ switch (src_base_slice->tiling) { ++ case V3D_TILING_UIF_NO_XOR: ++ case V3D_TILING_UIF_XOR: ++ tfu.iis |= (src_base_slice->padded_height / ++ (2 * v3d_utile_height(src->cpp))); ++ break; ++ case V3D_TILING_RASTER: ++ tfu.iis |= src_base_slice->stride / src->cpp; ++ break; ++ case V3D_TILING_LINEARTILE: ++ case V3D_TILING_UBLINEAR_1_COLUMN: ++ case V3D_TILING_UBLINEAR_2_COLUMN: ++ break; ++ } ++ ++#if V3D_VERSION <= 42 ++ if (src_base_slice->tiling == V3D_TILING_RASTER) { ++ tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER << ++ V3D33_TFU_ICFG_FORMAT_SHIFT); ++ } else { ++ tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE + ++ (src_base_slice->tiling - V3D_TILING_LINEARTILE)) << ++ V3D33_TFU_ICFG_FORMAT_SHIFT); ++ } ++ tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT; ++ ++ if (last_level != base_level) ++ tfu.ioa |= V3D33_TFU_IOA_DIMTW; ++ ++ tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE + ++ (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) << ++ V3D33_TFU_IOA_FORMAT_SHIFT); ++ ++ tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT; ++ ++ /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the ++ * OPAD field for the destination (how many extra UIF blocks beyond ++ * those necessary to cover the height). When filling mipmaps, the ++ * miplevel 1+ tiling state is inferred. 
++ */ ++ if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR || ++ dst_base_slice->tiling == V3D_TILING_UIF_XOR) { ++ int uif_block_h = 2 * v3d_utile_height(dst->cpp); ++ int implicit_padded_height = align(height, uif_block_h); ++ ++ tfu.icfg |= (((dst_base_slice->padded_height - ++ implicit_padded_height) / uif_block_h) << ++ V3D33_TFU_ICFG_OPAD_SHIFT); ++ } ++#endif /* V3D_VERSION <= 42 */ ++ ++#if V3D_VERSION >= 71 ++ if (src_base_slice->tiling == V3D_TILING_RASTER) { ++ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT; ++ } else { ++ tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE + ++ (src_base_slice->tiling - V3D_TILING_LINEARTILE)) << ++ V3D71_TFU_ICFG_IFORMAT_SHIFT; ++ } ++ tfu.icfg |= tex_format << V3D71_TFU_ICFG_OTYPE_SHIFT; ++ ++ if (last_level != base_level) ++ tfu.v71.ioc |= V3D71_TFU_IOC_DIMTW; ++ ++ tfu.v71.ioc |= ((V3D71_TFU_IOC_FORMAT_LINEARTILE + ++ (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) << ++ V3D71_TFU_IOC_FORMAT_SHIFT); ++ ++ switch (dst_base_slice->tiling) { ++ case V3D_TILING_UIF_NO_XOR: ++ case V3D_TILING_UIF_XOR: ++ tfu.v71.ioc |= ++ (dst_base_slice->padded_height / (2 * v3d_utile_height(dst->cpp))) << ++ V3D71_TFU_IOC_STRIDE_SHIFT; ++ break; ++ case V3D_TILING_RASTER: ++ tfu.v71.ioc |= (dst_base_slice->padded_height / dst->cpp) << ++ V3D71_TFU_IOC_STRIDE_SHIFT; ++ break; ++ default: ++ break; ++ } ++ ++ tfu.v71.ioc |= (last_level - base_level) << V3D71_TFU_IOC_NUMMM_SHIFT; ++#endif /* V3D_VERSION >= 71*/ ++ ++ int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu); ++ if (ret != 0) { ++ fprintf(stderr, "Failed to submit TFU job: %d\n", ret); ++ return false; ++ } ++ ++ dst->writes++; ++ ++ return true; ++} ++ +-- +2.39.2 +