diff -u mesa-23.2.1/debian/changelog mesa-23.2.1/debian/changelog --- mesa-23.2.1/debian/changelog +++ mesa-23.2.1/debian/changelog @@ -1,3 +1,10 @@ +mesa (23.2.1-1ubuntu2) mantic; urgency=medium + + * d/p/v3d-v3dv-support-for-HW-7.1.x.patch + - [FFe] Raspberry Pi 5 (LP: #2037642) + + -- Juerg Haefliger Fri, 29 Sep 2023 08:27:32 +0200 + mesa (23.2.1-1ubuntu1) mantic; urgency=medium * Merge from Debian. diff -u mesa-23.2.1/debian/patches/series mesa-23.2.1/debian/patches/series --- mesa-23.2.1/debian/patches/series +++ mesa-23.2.1/debian/patches/series @@ -2,3 +2,4 @@ path_max.diff src_glx_dri_common.h.diff fix-clover-build-without-spirv.diff +v3d-v3dv-support-for-HW-7.1.x.patch only in patch2: unchanged: --- mesa-23.2.1.orig/debian/patches/v3d-v3dv-support-for-HW-7.1.x.patch +++ mesa-23.2.1/debian/patches/v3d-v3dv-support-for-HW-7.1.x.patch @@ -0,0 +1,11461 @@ +From 2b2fb2d7889c5cc6a624a867ef0123c7ba8cb2a1 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= +Date: Mon, 26 Apr 2021 00:02:21 +0200 +Subject: [PATCH] v3d, v3dv: support for HW 7.1.x + +Squash of MR: +https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25450 + +Signed-off-by: Juerg Haefliger +--- + include/drm-uapi/v3d_drm.h | 5 + + src/broadcom/cle/meson.build | 3 +- + src/broadcom/cle/v3d_packet_v33.xml | 386 ++++- + src/broadcom/cle/v3dx_pack.h | 2 + + src/broadcom/clif/clif_private.h | 2 + + src/broadcom/common/v3d_device_info.c | 17 +- + src/broadcom/common/v3d_device_info.h | 6 + + src/broadcom/common/v3d_limits.h | 3 +- + src/broadcom/common/v3d_macros.h | 3 + + .../common/v3d_performance_counters.h | 108 ++ + src/broadcom/common/v3d_tfu.h | 23 + + src/broadcom/common/v3d_util.c | 128 +- + src/broadcom/common/v3d_util.h | 38 +- + src/broadcom/compiler/nir_to_vir.c | 63 +- + src/broadcom/compiler/qpu_schedule.c | 813 +++++++-- + src/broadcom/compiler/qpu_validate.c | 98 +- + src/broadcom/compiler/v3d_compiler.h | 9 +- + src/broadcom/compiler/v3d_nir_lower_io.c | 10 +- + src/broadcom/compiler/vir.c | 32 +- + src/broadcom/compiler/vir_dump.c | 8 +- + src/broadcom/compiler/vir_live_variables.c | 21 +- + .../compiler/vir_opt_copy_propagate.c | 95 +- + .../compiler/vir_opt_redundant_flags.c | 8 +- + .../compiler/vir_opt_small_immediates.c | 26 +- + src/broadcom/compiler/vir_register_allocate.c | 480 ++++-- + src/broadcom/compiler/vir_to_qpu.c | 155 +- + src/broadcom/meson.build | 2 +- + src/broadcom/qpu/qpu_disasm.c | 81 +- + src/broadcom/qpu/qpu_instr.c | 121 +- + src/broadcom/qpu/qpu_instr.h | 87 +- + src/broadcom/qpu/qpu_pack.c | 1453 ++++++++++++++--- + src/broadcom/qpu/tests/qpu_disasm.c | 8 +- + src/broadcom/simulator/v3d_simulator.c | 52 +- + src/broadcom/simulator/v3d_simulator.h | 26 + + src/broadcom/simulator/v3dx_simulator.c | 94 +- + src/broadcom/simulator/v3dx_simulator.h | 1 + + src/broadcom/vulkan/meson.build | 7 +- + src/broadcom/vulkan/v3dv_cmd_buffer.c | 107 +- + src/broadcom/vulkan/v3dv_device.c | 37 +- + src/broadcom/vulkan/v3dv_image.c | 7 +- + src/broadcom/vulkan/v3dv_limits.h | 2 - + src/broadcom/vulkan/v3dv_meta_clear.c | 9 +- + src/broadcom/vulkan/v3dv_meta_copy.c | 19 +- + src/broadcom/vulkan/v3dv_pass.c | 19 +- + src/broadcom/vulkan/v3dv_pipeline.c | 89 +- + src/broadcom/vulkan/v3dv_private.h | 71 +- + src/broadcom/vulkan/v3dv_query.c | 43 +- + src/broadcom/vulkan/v3dv_queue.c | 2 +- + src/broadcom/vulkan/v3dv_uniforms.c | 13 +- + src/broadcom/vulkan/v3dvx_cmd_buffer.c | 471 +++++- + src/broadcom/vulkan/v3dvx_device.c | 53 +- + src/broadcom/vulkan/v3dvx_image.c | 66 +- + 
src/broadcom/vulkan/v3dvx_meta_common.c | 108 +- + src/broadcom/vulkan/v3dvx_pipeline.c | 137 +- + src/broadcom/vulkan/v3dvx_private.h | 49 +- + src/broadcom/vulkan/v3dvx_query.c | 67 + + src/broadcom/vulkan/v3dvx_queue.c | 18 +- + src/gallium/drivers/v3d/meson.build | 5 +- + src/gallium/drivers/v3d/v3d_blit.c | 166 +- + src/gallium/drivers/v3d/v3d_context.c | 9 +- + src/gallium/drivers/v3d/v3d_context.h | 54 +- + src/gallium/drivers/v3d/v3d_job.c | 6 +- + src/gallium/drivers/v3d/v3d_query.c | 20 +- + src/gallium/drivers/v3d/v3d_query.h | 6 - + src/gallium/drivers/v3d/v3d_screen.c | 3 +- + src/gallium/drivers/v3d/v3d_uniforms.c | 14 +- + src/gallium/drivers/v3d/v3dx_context.h | 20 + + src/gallium/drivers/v3d/v3dx_draw.c | 72 +- + src/gallium/drivers/v3d/v3dx_emit.c | 48 +- + ...d_query_perfcnt.c => v3dx_query_perfcnt.c} | 12 +- + src/gallium/drivers/v3d/v3dx_rcl.c | 190 ++- + src/gallium/drivers/v3d/v3dx_state.c | 129 +- + src/gallium/drivers/v3d/v3dx_tfu.c | 202 +++ + 73 files changed, 5451 insertions(+), 1366 deletions(-) + create mode 100644 src/broadcom/vulkan/v3dvx_query.c + rename src/gallium/drivers/v3d/{v3d_query_perfcnt.c => v3dx_query_perfcnt.c} (94%) + create mode 100644 src/gallium/drivers/v3d/v3dx_tfu.c + +diff --git a/include/drm-uapi/v3d_drm.h b/include/drm-uapi/v3d_drm.h +index 3dfc0af8756a..1a7d7a689de3 100644 +--- a/include/drm-uapi/v3d_drm.h ++++ b/include/drm-uapi/v3d_drm.h +@@ -319,6 +319,11 @@ struct drm_v3d_submit_tfu { + + /* Pointer to an array of ioctl extensions*/ + __u64 extensions; ++ ++ struct { ++ __u32 ioc; ++ __u32 pad; ++ } v71; + }; + + /* Submits a compute shader for dispatch. This job will block on any +diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build +index 31a0d5bfa94a..8ac32b313e4d 100644 +--- a/src/broadcom/cle/meson.build ++++ b/src/broadcom/cle/meson.build +@@ -23,7 +23,8 @@ v3d_versions = [ + [21, 21], + [33, 33], + [41, 33], +- [42, 33] ++ [42, 33], ++ [71, 33] + ] + + v3d_xml_files = [] +diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml +index a0242b5f1c2f..624353ca2bf2 100644 +--- a/src/broadcom/cle/v3d_packet_v33.xml ++++ b/src/broadcom/cle/v3d_packet_v33.xml +@@ -1,4 +1,4 @@ +- ++ + + + +@@ -167,13 +167,36 @@ + + + +- ++ + + + + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + ++ + + + +@@ -1099,7 +1263,7 @@ + + + +- ++ + + + +@@ -1108,6 +1272,15 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1117,7 +1290,7 @@ + + + +- ++ + + + +@@ -1126,6 +1299,19 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1135,7 +1321,7 @@ + + + +- ++ + + + +@@ -1144,6 +1330,13 @@ + + + ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1155,7 +1348,7 @@ + + + +- ++ + + + +@@ -1166,6 +1359,13 @@ + + + ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1240,7 +1440,7 @@ + + + +- ++ + + + +@@ -1299,6 +1499,63 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1543,7 +1800,7 @@ + + + +- ++ + + + +@@ -1558,6 +1815,23 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -1611,7 +1885,7 @@ + + + +- ++ + + + +@@ -1652,6 +1926,82 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h +index 5762e5aaa708..e5a1eb266983 100644 +--- a/src/broadcom/cle/v3dx_pack.h ++++ 
b/src/broadcom/cle/v3dx_pack.h +@@ -37,6 +37,8 @@ + # include "cle/v3d_packet_v41_pack.h" + #elif (V3D_VERSION == 42) + # include "cle/v3d_packet_v42_pack.h" ++#elif (V3D_VERSION == 71) ++# include "cle/v3d_packet_v71_pack.h" + #else + # error "Need to add a pack header include for this v3d version" + #endif +diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h +index 6ace62b03101..cda407a00bf4 100644 +--- a/src/broadcom/clif/clif_private.h ++++ b/src/broadcom/clif/clif_private.h +@@ -101,6 +101,8 @@ bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset, + const uint8_t *cl, uint32_t *size, bool reloc_mode); + bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset, + const uint8_t *cl, uint32_t *size, bool reloc_mode); ++bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset, ++ const uint8_t *cl, uint32_t *size, bool reloc_mode); + + static inline void + out(struct clif_dump *clif, const char *fmt, ...) +diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c +index 272190eb2e54..7bc2b662cfc7 100644 +--- a/src/broadcom/common/v3d_device_info.c ++++ b/src/broadcom/common/v3d_device_info.c +@@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i + struct drm_v3d_get_param ident1 = { + .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1, + }; ++ struct drm_v3d_get_param hub_ident3 = { ++ .param = DRM_V3D_PARAM_V3D_HUB_IDENT3, ++ }; + int ret; + + ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0); +@@ -62,10 +65,13 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i + int qups = (ident1.value >> 8) & 0xf; + devinfo->qpu_count = nslc * qups; + ++ devinfo->has_accumulators = devinfo->ver < 71; ++ + switch (devinfo->ver) { + case 33: + case 41: + case 42: ++ case 71: + break; + default: + fprintf(stderr, +@@ -75,5 +81,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i + return false; + } + +- return true; ++ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3); ++ if (ret != 0) { ++ fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n", ++ strerror(errno)); ++ return false; ++ } ++ ++ devinfo->rev = (hub_ident3.value >> 8) & 0xff; ++ ++ return true; + } +diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h +index 97abd9b8d9fc..8dfc7858727e 100644 +--- a/src/broadcom/common/v3d_device_info.h ++++ b/src/broadcom/common/v3d_device_info.h +@@ -34,11 +34,17 @@ struct v3d_device_info { + /** Simple V3D version: major * 10 + minor */ + uint8_t ver; + ++ /** V3D revision number */ ++ uint8_t rev; ++ + /** Size of the VPM, in bytes. */ + int vpm_size; + + /* NSLC * QUPS from the core's IDENT registers. */ + int qpu_count; ++ ++ /* If the hw has accumulator registers */ ++ bool has_accumulators; + }; + + typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg); +diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h +index 46f38bd74846..354c8784914c 100644 +--- a/src/broadcom/common/v3d_limits.h ++++ b/src/broadcom/common/v3d_limits.h +@@ -42,7 +42,8 @@ + + #define V3D_MAX_SAMPLES 4 + +-#define V3D_MAX_DRAW_BUFFERS 4 ++#define V3D_MAX_DRAW_BUFFERS 8 ++#define V3D_MAX_RENDER_TARGETS(ver) (ver < 71 ? 
4 : 8) + + #define V3D_MAX_POINT_SIZE 512.0f + #define V3D_MAX_LINE_WIDTH 32 +diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h +index fe89398208ab..b4291fb53500 100644 +--- a/src/broadcom/common/v3d_macros.h ++++ b/src/broadcom/common/v3d_macros.h +@@ -41,6 +41,9 @@ + #elif (V3D_VERSION == 42) + # define V3DX(x) V3D42_##x + # define v3dX(x) v3d42_##x ++#elif (V3D_VERSION == 71) ++# define V3DX(x) V3D71_##x ++# define v3dX(x) v3d71_##x + #else + # error "Need to add prefixing macros for this v3d version" + #endif +diff --git a/src/broadcom/common/v3d_performance_counters.h b/src/broadcom/common/v3d_performance_counters.h +index 08d750c2cbe7..a8f0cff8784a 100644 +--- a/src/broadcom/common/v3d_performance_counters.h ++++ b/src/broadcom/common/v3d_performance_counters.h +@@ -28,6 +28,110 @@ + #define V3D_PERFCNT_NAME 1 + #define V3D_PERFCNT_DESCRIPTION 2 + ++#ifndef V3D_VERSION ++# error "The V3D_VERSION macro must be defined" ++#endif ++ ++#if (V3D_VERSION >= 71) ++ ++static const char *v3d_performance_counters[][3] = { ++ {"CORE", "cycle-count", "[CORE] Cycle counter"}, ++ {"CORE", "core-active", "[CORE] Bin/Render/Compute active cycles"}, ++ {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"}, ++ {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"}, ++ {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, ++ {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, ++ {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"}, ++ {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"}, ++ {"FEP", "FEP-valid-quads", "[FEP] Valid quads"}, ++ {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"}, ++ {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"}, ++ {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"}, ++ {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"}, ++ {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"}, ++ {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"}, ++ {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"}, ++ {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"}, ++ {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"}, ++ {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"}, ++ {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"}, ++ {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"}, ++ {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"}, ++ {"TMU", "TMU-active-cycles", "[TMU] Active cycles"}, ++ {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"}, ++ {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"}, ++ {"TMU", "TMU-cache-x4-active-cycles", "[TMU] Cache active cycles for x4 access"}, ++ {"TMU", "TMU-cache-x4-stalled-cycles", "[TMU] Cache stalled cycles for x4 access"}, ++ {"TMU", 
"TMU-total-text-quads-x4-access", "[TMU] Total texture cache x4 access"}, ++ {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"}, ++ {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"}, ++ {"L2T", "L2T-local", "[L2T] Local mode access"}, ++ {"L2T", "L2T-writeback", "[L2T] Writeback"}, ++ {"L2T", "L2T-zero", "[L2T] Zero"}, ++ {"L2T", "L2T-merge", "[L2T] Merge"}, ++ {"L2T", "L2T-fill", "[L2T] Fill"}, ++ {"L2T", "L2T-stalls-no-wid", "[L2T] Stalls because no WID available"}, ++ {"L2T", "L2T-stalls-no-rid", "[L2T] Stalls because no RID available"}, ++ {"L2T", "L2T-stalls-queue-full", "[L2T] Stalls because internal queue full"}, ++ {"L2T", "L2T-stalls-wrightback", "[L2T] Stalls because writeback in flight"}, ++ {"L2T", "L2T-stalls-mem", "[L2T] Stalls because AXI blocks read"}, ++ {"L2T", "L2T-stalls-fill", "[L2T] Stalls because fill pending for victim cache-line"}, ++ {"L2T", "L2T-hitq", "[L2T] Sent request via hit queue"}, ++ {"L2T", "L2T-hitq-full", "[L2T] Sent request via main queue because hit queue is full"}, ++ {"L2T", "L2T-stalls-read-data", "[L2T] Stalls because waiting for data from SDRAM"}, ++ {"L2T", "L2T-TMU-read-hits", "[L2T] TMU read hits"}, ++ {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"}, ++ {"L2T", "L2T-VCD-read-hits", "[L2T] VCD read hits"}, ++ {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"}, ++ {"L2T", "L2T-SLC-read-hits", "[L2T] SLC read hits (all slices)"}, ++ {"L2T", "L2T-SLC-read-miss", "[L2T] SLC read misses (all slices)"}, ++ {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"}, ++ {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"}, ++ {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"}, ++ {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"}, ++ {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"}, ++ {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"}, ++ {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"}, ++ {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"}, ++ {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"}, ++ {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"}, ++ {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"}, ++ {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"}, ++ {"CORE", "core-memory-writes", "[CORE] Total memory writes"}, ++ {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"}, ++ {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"}, ++ {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"}, ++ {"CORE", "core-memory-reads", "[CORE] Total memory reads"}, ++ {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"}, ++ {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"}, ++ {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"}, ++ {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"}, ++ {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"}, ++ {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"}, ++ {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"}, ++ {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"}, ++ {"AXI", "AXI-read-trans", "[AXI] Read transaction count"}, ++ {"AXI", "AXI-write-trans", "[AXI] Write transaction count"}, ++ {"AXI", "AXI-read-wait-cycles", "[AXI] Read total wait cycles"}, ++ {"AXI", "AXI-write-wait-cycles", 
"[AXI] Write total wait cycles"}, ++ {"AXI", "AXI-max-outstanding-reads", "[AXI] Maximium outstanding read transactions"}, ++ {"AXI", "AXI-max-outstanding-writes", "[AXI] Maximum outstanding write transactions"}, ++ {"QPU", "QPU-wait-bubble", "[QPU] Pipeline bubble in qcycles due all threads waiting"}, ++ {"QPU", "QPU-ic-miss-bubble", "[QPU] Pipeline bubble in qcycles due instruction-cache miss"}, ++ {"QPU", "QPU-active", "[QPU] Executed shader instruction"}, ++ {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"}, ++ {"QPU", "QPU-stalls", "[QPU] Stalled qcycles executing shader instruction"}, ++ {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"}, ++ {"QPU", "QPU-stalls-TMU", "[QPU] Stalled qcycles waiting for TMU"}, ++ {"QPU", "QPU-stalls-TLB", "[QPU] Stalled qcycles waiting for TLB"}, ++ {"QPU", "QPU-stalls-VPM", "[QPU] Stalled qcycles waiting for VPM"}, ++ {"QPU", "QPU-stalls-uniforms", "[QPU] Stalled qcycles waiting for uniforms"}, ++ {"QPU", "QPU-stalls-SFU", "[QPU] Stalled qcycles waiting for SFU"}, ++ {"QPU", "QPU-stalls-other", "[QPU] Stalled qcycles waiting for any other reason (vary/W/Z)"}, ++}; ++ ++#elif (V3D_VERSION >= 41) ++ + static const char *v3d_performance_counters[][3] = { + {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, + {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"}, +@@ -118,4 +222,8 @@ static const char *v3d_performance_counters[][3] = { + {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, + }; + ++#else ++static const char *v3d_performance_counters[][3] = { }; ++#endif ++ + #endif +diff --git a/src/broadcom/common/v3d_tfu.h b/src/broadcom/common/v3d_tfu.h +index 80da224ca2d9..572d00747940 100644 +--- a/src/broadcom/common/v3d_tfu.h ++++ b/src/broadcom/common/v3d_tfu.h +@@ -48,4 +48,27 @@ + #define V3D33_TFU_ICFG_FORMAT_UIF_NO_XOR 14 + #define V3D33_TFU_ICFG_FORMAT_UIF_XOR 15 + ++/* Disable level 0 write, just write following mipmaps */ ++#define V3D71_TFU_IOC_DIMTW (1 << 0) ++#define V3D71_TFU_IOC_FORMAT_SHIFT 12 ++#define V3D71_TFU_IOC_FORMAT_LINEARTILE 3 ++#define V3D71_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4 ++#define V3D71_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5 ++#define V3D71_TFU_IOA_FORMAT_UIF_NO_XOR 6 ++#define V3D71_TFU_IOA_FORMAT_UIF_XOR 7 ++ ++#define V3D71_TFU_IOC_STRIDE_SHIFT 16 ++#define V3D71_TFU_IOC_NUMMM_SHIFT 4 ++ ++#define V3D71_TFU_ICFG_OTYPE_SHIFT 16 ++#define V3D71_TFU_ICFG_IFORMAT_SHIFT 23 ++#define V3D71_TFU_ICFG_FORMAT_RASTER 0 ++#define V3D71_TFU_ICFG_FORMAT_SAND_128 1 ++#define V3D71_TFU_ICFG_FORMAT_SAND_256 2 ++#define V3D71_TFU_ICFG_FORMAT_LINEARTILE 11 ++#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12 ++#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13 ++#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR 14 ++#define V3D71_TFU_ICFG_FORMAT_UIF_XOR 15 ++ + #endif +diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c +index 57872a923d34..8a50d2799853 100644 +--- a/src/broadcom/common/v3d_util.c ++++ b/src/broadcom/common/v3d_util.c +@@ -87,10 +87,37 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, + return best_wgs_per_sg; + } + ++#define V3D71_TLB_COLOR_SIZE (16 * 1024) ++#define V3D71_TLB_DETPH_SIZE (16 * 1024) 
++#define V3D71_TLB_AUX_DETPH_SIZE (8 * 1024) ++ ++static bool ++tile_size_valid(uint32_t pixel_count, uint32_t color_bpp, uint32_t depth_bpp) ++{ ++ /* First, we check if we can fit this tile size allocating the depth ++ * TLB memory to color. ++ */ ++ if (pixel_count * depth_bpp <= V3D71_TLB_AUX_DETPH_SIZE && ++ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE + V3D71_TLB_DETPH_SIZE) { ++ return true; ++ } ++ ++ /* Otherwise the tile must fit in the main TLB buffers */ ++ return pixel_count * depth_bpp <= V3D71_TLB_DETPH_SIZE && ++ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE; ++} ++ + void +-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, +- bool msaa, bool double_buffer, +- uint32_t *width, uint32_t *height) ++v3d_choose_tile_size(const struct v3d_device_info *devinfo, ++ uint32_t color_attachment_count, ++ /* V3D 4.x max internal bpp of all RTs */ ++ uint32_t max_internal_bpp, ++ /* V3D 7.x accumulated bpp for all RTs (in bytes) */ ++ uint32_t total_color_bpp, ++ bool msaa, ++ bool double_buffer, ++ uint32_t *width, ++ uint32_t *height) + { + static const uint8_t tile_sizes[] = { + 64, 64, +@@ -103,19 +130,65 @@ v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, + }; + + uint32_t idx = 0; +- if (color_attachment_count > 2) +- idx += 2; +- else if (color_attachment_count > 1) +- idx += 1; ++ if (devinfo->ver >= 71) { ++ /* In V3D 7.x, we use the actual bpp used by color attachments to compute ++ * the tile size instead of the maximum bpp. This may allow us to choose a ++ * larger tile size than we would in 4.x in scenarios with multiple RTs ++ * with different bpps. ++ * ++ * Also, the TLB has an auxiliary buffer of 8KB that will be automatically ++ * used for depth instead of the main 16KB depth TLB buffer when the depth ++ * tile fits in the auxiliary buffer, allowing the hardware to allocate ++ * the 16KB from the main depth TLB to the color TLB. If we can do that, ++ * then we are effectively doubling the memory we have for color and we ++ * can also select a larger tile size. This is necessary to support ++ * the most expensive configuration: 8x128bpp RTs + MSAA. ++ * ++ * FIXME: the docs state that depth TLB memory can be used for color ++ * if depth testing is not used by setting the 'depth disable' bit in the ++ * rendering configuration. However, this comes with a requirement that ++ * occlussion queries must not be active. We need to clarify if this means ++ * active at the point at which we emit a tile rendering configuration ++ * item, meaning that the we have a query spanning a full render pass ++ * (this is something we can tell before we emit the rendering ++ * configuration item) or active in the subpass for which we are enabling ++ * the bit (which we can't tell until later, when we record commands for ++ * the subpass). If it is the latter, then we cannot use this feature. ++ * ++ * FIXME: pending handling double_buffer. ++ */ ++ const uint32_t color_bpp = total_color_bpp * (msaa ? 4 : 1); ++ const uint32_t depth_bpp = 4 * (msaa ? 4 : 1); ++ do { ++ const uint32_t tile_w = tile_sizes[idx * 2]; ++ const uint32_t tile_h = tile_sizes[idx * 2 + 1]; ++ if (tile_size_valid(tile_w * tile_h, color_bpp, depth_bpp)) ++ break; ++ idx++; ++ } while (idx < ARRAY_SIZE(tile_sizes) / 2); ++ ++ /* FIXME: pending handling double_buffer */ ++ assert(!double_buffer); ++ } else { ++ /* On V3D 4.x tile size is selected based on the number of RTs, the ++ * maximum bpp across all of them and whether 4x MSAA is used. 
++ */ ++ if (color_attachment_count > 4) ++ idx += 3; ++ else if (color_attachment_count > 2) ++ idx += 2; ++ else if (color_attachment_count > 1) ++ idx += 1; + +- /* MSAA and double-buffer are mutually exclusive */ +- assert(!msaa || !double_buffer); +- if (msaa) +- idx += 2; +- else if (double_buffer) +- idx += 1; ++ /* MSAA and double-buffer are mutually exclusive */ ++ assert(!msaa || !double_buffer); ++ if (msaa) ++ idx += 2; ++ else if (double_buffer) ++ idx += 1; + +- idx += max_color_bpp; ++ idx += max_internal_bpp; ++ } + + assert(idx < ARRAY_SIZE(tile_sizes) / 2); + +@@ -170,3 +243,30 @@ v3d_hw_prim_type(enum mesa_prim prim_type) + unreachable("Unsupported primitive type"); + } + } ++ ++uint32_t ++v3d_internal_bpp_words(uint32_t internal_bpp) ++{ ++ switch (internal_bpp) { ++ case 0 /* V3D_INTERNAL_BPP_32 */: ++ return 1; ++ case 1 /* V3D_INTERNAL_BPP_64 */: ++ return 2; ++ case 2 /* V3D_INTERNAL_BPP_128 */: ++ return 4; ++ default: ++ unreachable("Unsupported internal BPP"); ++ } ++} ++ ++uint32_t ++v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width, ++ uint32_t bpp) ++{ ++ /* stride in multiples of 128 bits, and covers 2 rows. This is the ++ * reason we divide by 2 instead of 4, as we divide number of 32-bit ++ * words per row by 2. ++ */ ++ ++ return (tile_width * bpp) / 2; ++} +diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h +index eb802b77f674..d02d41dd0897 100644 +--- a/src/broadcom/common/v3d_util.h ++++ b/src/broadcom/common/v3d_util.h +@@ -24,6 +24,7 @@ + #ifndef V3D_UTIL_H + #define V3D_UTIL_H + ++#include "util/macros.h" + #include "common/v3d_device_info.h" + #include "pipe/p_defines.h" + +@@ -36,9 +37,14 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, + uint32_t wg_size); + + void +-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, +- bool msaa, bool double_buffer, +- uint32_t *width, uint32_t *height); ++v3d_choose_tile_size(const struct v3d_device_info *devinfo, ++ uint32_t color_attachment_count, ++ uint32_t max_internal_bpp, ++ uint32_t total_color_bpp, ++ bool msaa, ++ bool double_buffer, ++ uint32_t *width, ++ uint32_t *height); + + uint32_t + v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle); +@@ -46,4 +52,30 @@ v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle); + uint32_t + v3d_hw_prim_type(enum mesa_prim prim_type); + ++uint32_t ++v3d_internal_bpp_words(uint32_t internal_bpp); ++ ++/* Some configuration packets want the size on log2, but starting at 0 for ++ * size 8. 
++ */ ++static inline uint8_t ++log2_tile_size(uint32_t size) ++{ ++ switch(size) { ++ case 8: ++ return 0; ++ case 16: ++ return 1; ++ case 32: ++ return 2; ++ case 64: ++ return 3; ++ default: ++ unreachable("Unsupported tile width/height"); ++ } ++} ++ ++uint32_t ++v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width, ++ uint32_t bpp); + #endif +diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c +index ca072971f01d..bef4126c2dc2 100644 +--- a/src/broadcom/compiler/nir_to_vir.c ++++ b/src/broadcom/compiler/nir_to_vir.c +@@ -1005,32 +1005,36 @@ emit_fragcoord_input(struct v3d_compile *c, int attr) + + static struct qreg + emit_smooth_varying(struct v3d_compile *c, +- struct qreg vary, struct qreg w, struct qreg r5) ++ struct qreg vary, struct qreg w, struct qreg c_reg) + { +- return vir_FADD(c, vir_FMUL(c, vary, w), r5); ++ return vir_FADD(c, vir_FMUL(c, vary, w), c_reg); + } + + static struct qreg + emit_noperspective_varying(struct v3d_compile *c, +- struct qreg vary, struct qreg r5) ++ struct qreg vary, struct qreg c_reg) + { +- return vir_FADD(c, vir_MOV(c, vary), r5); ++ return vir_FADD(c, vir_MOV(c, vary), c_reg); + } + + static struct qreg + emit_flat_varying(struct v3d_compile *c, +- struct qreg vary, struct qreg r5) ++ struct qreg vary, struct qreg c_reg) + { + vir_MOV_dest(c, c->undef, vary); +- return vir_MOV(c, r5); ++ return vir_MOV(c, c_reg); + } + + static struct qreg + emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + int8_t input_idx, uint8_t swizzle, int array_index) + { +- struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); +- struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); ++ struct qreg c_reg; /* C coefficient */ ++ ++ if (c->devinfo->has_accumulators) ++ c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); ++ else ++ c_reg = vir_reg(QFILE_REG, 0); + + struct qinst *ldvary = NULL; + struct qreg vary; +@@ -1041,7 +1045,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + vary = vir_emit_def(c, ldvary); + } else { + vir_NOP(c)->qpu.sig.ldvary = true; +- vary = r3; ++ vary = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); + } + + /* Store the input value before interpolation so we can implement +@@ -1050,7 +1054,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + if (input_idx >= 0) { + assert(var); + c->interp[input_idx].vp = vary; +- c->interp[input_idx].C = vir_MOV(c, r5); ++ c->interp[input_idx].C = vir_MOV(c, c_reg); + c->interp[input_idx].mode = var->data.interpolation; + } + +@@ -1060,7 +1064,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + */ + if (!var) { + assert(input_idx < 0); +- return emit_smooth_varying(c, vary, c->payload_w, r5); ++ return emit_smooth_varying(c, vary, c->payload_w, c_reg); + } + + int i = c->num_inputs++; +@@ -1075,20 +1079,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + if (var->data.centroid) { + BITSET_SET(c->centroid_flags, i); + result = emit_smooth_varying(c, vary, +- c->payload_w_centroid, r5); ++ c->payload_w_centroid, c_reg); + } else { +- result = emit_smooth_varying(c, vary, c->payload_w, r5); ++ result = emit_smooth_varying(c, vary, c->payload_w, c_reg); + } + break; + + case INTERP_MODE_NOPERSPECTIVE: + BITSET_SET(c->noperspective_flags, i); +- result = emit_noperspective_varying(c, vary, r5); ++ result = emit_noperspective_varying(c, vary, c_reg); + break; + + case INTERP_MODE_FLAT: + BITSET_SET(c->flat_shade_flags, i); +- result = emit_flat_varying(c, vary, r5); ++ result = 
emit_flat_varying(c, vary, c_reg); + break; + + default: +@@ -2440,15 +2444,17 @@ ntq_setup_outputs(struct v3d_compile *c) + + switch (var->data.location) { + case FRAG_RESULT_COLOR: +- c->output_color_var[0] = var; +- c->output_color_var[1] = var; +- c->output_color_var[2] = var; +- c->output_color_var[3] = var; ++ for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) ++ c->output_color_var[i] = var; + break; + case FRAG_RESULT_DATA0: + case FRAG_RESULT_DATA1: + case FRAG_RESULT_DATA2: + case FRAG_RESULT_DATA3: ++ case FRAG_RESULT_DATA4: ++ case FRAG_RESULT_DATA5: ++ case FRAG_RESULT_DATA6: ++ case FRAG_RESULT_DATA7: + c->output_color_var[var->data.location - + FRAG_RESULT_DATA0] = var; + break; +@@ -4321,7 +4327,11 @@ nir_to_vir(struct v3d_compile *c) + { + switch (c->s->info.stage) { + case MESA_SHADER_FRAGMENT: +- c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); ++ if (c->devinfo->ver < 71) ++ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); ++ else ++ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3)); ++ + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); + c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); + +@@ -4354,8 +4364,13 @@ nir_to_vir(struct v3d_compile *c) + V3D_QPU_WADDR_SYNC)); + } + +- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); +- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); ++ if (c->devinfo->ver <= 42) { ++ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); ++ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); ++ } else if (c->devinfo->ver >= 71) { ++ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3)); ++ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); ++ } + + /* Set up the division between gl_LocalInvocationIndex and + * wg_in_mem in the payload reg. +@@ -4534,8 +4549,8 @@ vir_check_payload_w(struct v3d_compile *c) + + vir_for_each_inst_inorder(inst, c) { + for (int i = 0; i < vir_get_nsrc(inst); i++) { +- if (inst->src[i].file == QFILE_REG && +- inst->src[i].index == 0) { ++ if (inst->src[i].file == c->payload_w.file && ++ inst->src[i].index == c->payload_w.index) { + c->uses_center_w = true; + return; + } +diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c +index 3b32b48f86f0..864947063861 100644 +--- a/src/broadcom/compiler/qpu_schedule.c ++++ b/src/broadcom/compiler/qpu_schedule.c +@@ -155,12 +155,13 @@ static void + process_mux_deps(struct schedule_state *state, struct schedule_node *n, + enum v3d_qpu_mux mux) + { ++ assert(state->devinfo->ver < 71); + switch (mux) { + case V3D_QPU_MUX_A: + add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); + break; + case V3D_QPU_MUX_B: +- if (!n->inst->qpu.sig.small_imm) { ++ if (!n->inst->qpu.sig.small_imm_b) { + add_read_dep(state, + state->last_rf[n->inst->qpu.raddr_b], n); + } +@@ -171,6 +172,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n, + } + } + ++ ++static void ++process_raddr_deps(struct schedule_state *state, struct schedule_node *n, ++ uint8_t raddr, bool is_small_imm) ++{ ++ assert(state->devinfo->ver >= 71); ++ ++ if (!is_small_imm) ++ add_read_dep(state, state->last_rf[raddr], n); ++} ++ + static bool + tmu_write_is_sequence_terminator(uint32_t waddr) + { +@@ -285,6 +297,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) + /* If the input and output segments are shared, then all VPM reads to + * a location need to happen before all writes. We handle this by + * serializing all VPM operations for now. ++ * ++ * FIXME: we are assuming that the segments are shared. 
That is ++ * correct right now as we are only using shared, but technically you ++ * can choose. + */ + bool separate_vpm_segment = false; + +@@ -305,15 +321,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) + + /* XXX: LOAD_IMM */ + +- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) +- process_mux_deps(state, n, inst->alu.add.a); +- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) +- process_mux_deps(state, n, inst->alu.add.b); ++ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { ++ if (devinfo->ver < 71) { ++ process_mux_deps(state, n, inst->alu.add.a.mux); ++ } else { ++ process_raddr_deps(state, n, inst->alu.add.a.raddr, ++ inst->sig.small_imm_a); ++ } ++ } ++ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { ++ if (devinfo->ver < 71) { ++ process_mux_deps(state, n, inst->alu.add.b.mux); ++ } else { ++ process_raddr_deps(state, n, inst->alu.add.b.raddr, ++ inst->sig.small_imm_b); ++ } ++ } + +- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) +- process_mux_deps(state, n, inst->alu.mul.a); +- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) +- process_mux_deps(state, n, inst->alu.mul.b); ++ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { ++ if (devinfo->ver < 71) { ++ process_mux_deps(state, n, inst->alu.mul.a.mux); ++ } else { ++ process_raddr_deps(state, n, inst->alu.mul.a.raddr, ++ inst->sig.small_imm_c); ++ } ++ } ++ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { ++ if (devinfo->ver < 71) { ++ process_mux_deps(state, n, inst->alu.mul.b.mux); ++ } else { ++ process_raddr_deps(state, n, inst->alu.mul.b.raddr, ++ inst->sig.small_imm_d); ++ } ++ } + + switch (inst->alu.add.op) { + case V3D_QPU_A_VPMSETUP: +@@ -386,6 +426,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) + add_write_dep(state, &state->last_r[4], n); + if (v3d_qpu_writes_r5(devinfo, inst)) + add_write_dep(state, &state->last_r[5], n); ++ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) ++ add_write_dep(state, &state->last_rf[0], n); + + /* If we add any more dependencies here we should consider whether we + * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 
+@@ -500,6 +542,10 @@ struct choose_scoreboard { + int ldvary_count; + int pending_ldtmu_count; + bool first_ldtmu_after_thrsw; ++ ++ /* V3D 7.x */ ++ int last_implicit_rf0_write_tick; ++ bool has_rf0_flops_conflict; + }; + + static bool +@@ -524,7 +570,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard, + } + + static bool +-reads_too_soon_after_write(struct choose_scoreboard *scoreboard, ++reads_too_soon(struct choose_scoreboard *scoreboard, ++ const struct v3d_qpu_instr *inst, uint8_t raddr) ++{ ++ switch (raddr) { ++ case 0: /* ldvary delayed write of C coefficient to rf0 */ ++ if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) ++ return true; ++ break; ++ default: ++ break; ++ } ++ ++ return false; ++} ++ ++static bool ++reads_too_soon_after_write(const struct v3d_device_info *devinfo, ++ struct choose_scoreboard *scoreboard, + struct qinst *qinst) + { + const struct v3d_qpu_instr *inst = &qinst->qpu; +@@ -536,24 +599,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, + assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); + + if (inst->alu.add.op != V3D_QPU_A_NOP) { +- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { +- return true; ++ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { ++ if (devinfo->ver < 71) { ++ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) ++ return true; ++ } else { ++ if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr)) ++ return true; ++ } + } +- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { +- return true; ++ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { ++ if (devinfo->ver < 71) { ++ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) ++ return true; ++ } else { ++ if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr)) ++ return true; ++ } + } + } + + if (inst->alu.mul.op != V3D_QPU_M_NOP) { +- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { +- return true; ++ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { ++ if (devinfo->ver < 71) { ++ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) ++ return true; ++ } else { ++ if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr)) ++ return true; ++ } + } +- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && +- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { +- return true; ++ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { ++ if (devinfo->ver < 71) { ++ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) ++ return true; ++ } else { ++ if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr)) ++ return true; ++ } + } + } + +@@ -577,6 +660,21 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo, + v3d_qpu_writes_r4(devinfo, inst)) + return true; + ++ if (devinfo->ver <= 42) ++ return false; ++ ++ /* Don't schedule anything that writes rf0 right after ldvary, since ++ * that would clash with the ldvary's delayed rf0 write (the exception ++ * is another ldvary, since its implicit rf0 write would also have ++ * one cycle of delay and would not clash). 
++ */ ++ if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick && ++ (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || ++ (v3d_qpu_writes_rf0_implicitly(devinfo, inst) && ++ !inst->sig.ldvary))) { ++ return true; ++ } ++ + return false; + } + +@@ -604,29 +702,36 @@ pixel_scoreboard_too_soon(struct v3d_compile *c, + } + + static bool +-qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, ++qpu_instruction_uses_rf(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *inst, + uint32_t waddr) { + + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + +- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && +- inst->raddr_a == waddr) +- return true; ++ if (devinfo->ver < 71) { ++ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && ++ inst->raddr_a == waddr) ++ return true; + +- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && +- !inst->sig.small_imm && (inst->raddr_b == waddr)) +- return true; ++ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && ++ !inst->sig.small_imm_b && (inst->raddr_b == waddr)) ++ return true; ++ } else { ++ if (v3d71_qpu_reads_raddr(inst, waddr)) ++ return true; ++ } + + return false; + } + + static bool +-mux_read_stalls(struct choose_scoreboard *scoreboard, +- const struct v3d_qpu_instr *inst) ++read_stalls(const struct v3d_device_info *devinfo, ++ struct choose_scoreboard *scoreboard, ++ const struct v3d_qpu_instr *inst) + { + return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && +- qpu_instruction_uses_rf(inst, ++ qpu_instruction_uses_rf(devinfo, inst, + scoreboard->last_stallable_sfu_reg); + } + +@@ -692,7 +797,8 @@ enum { + V3D_PERIPHERAL_TMU_WAIT = (1 << 6), + V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7), + V3D_PERIPHERAL_TSY = (1 << 8), +- V3D_PERIPHERAL_TLB = (1 << 9), ++ V3D_PERIPHERAL_TLB_READ = (1 << 9), ++ V3D_PERIPHERAL_TLB_WRITE = (1 << 10), + }; + + static uint32_t +@@ -717,8 +823,10 @@ qpu_peripherals(const struct v3d_device_info *devinfo, + if (v3d_qpu_uses_sfu(inst)) + result |= V3D_PERIPHERAL_SFU; + +- if (v3d_qpu_uses_tlb(inst)) +- result |= V3D_PERIPHERAL_TLB; ++ if (v3d_qpu_reads_tlb(inst)) ++ result |= V3D_PERIPHERAL_TLB_READ; ++ if (v3d_qpu_writes_tlb(inst)) ++ result |= V3D_PERIPHERAL_TLB_WRITE; + + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if (inst->alu.add.op != V3D_QPU_A_NOP && +@@ -749,32 +857,75 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, + if (devinfo->ver < 41) + return false; + +- /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than +- * tmuc). ++ /* V3D 4.x can't do more than one peripheral access except in a ++ * few cases: + */ +- if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && +- b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { +- return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); ++ if (devinfo->ver <= 42) { ++ /* WRTMUC signal with TMU register write (other than tmuc). */ ++ if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && ++ b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { ++ return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); ++ } ++ if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && ++ a_peripherals == V3D_PERIPHERAL_TMU_WRITE) { ++ return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); ++ } ++ ++ /* TMU read with VPM read/write. 
*/ ++ if (a_peripherals == V3D_PERIPHERAL_TMU_READ && ++ (b_peripherals == V3D_PERIPHERAL_VPM_READ || ++ b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { ++ return true; ++ } ++ if (b_peripherals == V3D_PERIPHERAL_TMU_READ && ++ (a_peripherals == V3D_PERIPHERAL_VPM_READ || ++ a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { ++ return true; ++ } ++ ++ return false; + } + +- if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE && +- b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) { +- return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); ++ /* V3D 7.x can't have more than one of these restricted peripherals */ ++ const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE | ++ V3D_PERIPHERAL_TMU_WRTMUC_SIG | ++ V3D_PERIPHERAL_TSY | ++ V3D_PERIPHERAL_TLB_READ | ++ V3D_PERIPHERAL_SFU | ++ V3D_PERIPHERAL_VPM_READ | ++ V3D_PERIPHERAL_VPM_WRITE; ++ ++ const uint32_t a_restricted = a_peripherals & restricted; ++ const uint32_t b_restricted = b_peripherals & restricted; ++ if (a_restricted && b_restricted) { ++ /* WRTMUC signal with TMU register write (other than tmuc) is ++ * allowed though. ++ */ ++ if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && ++ b_restricted == V3D_PERIPHERAL_TMU_WRITE && ++ v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || ++ (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && ++ a_restricted == V3D_PERIPHERAL_TMU_WRITE && ++ v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) { ++ return false; ++ } + } + +- /* V3D 4.1+ allows TMU read with VPM read/write. */ +- if (a_peripherals == V3D_PERIPHERAL_TMU_READ && +- (b_peripherals == V3D_PERIPHERAL_VPM_READ || +- b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { +- return true; ++ /* Only one TMU read per instruction */ ++ if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) && ++ (b_peripherals & V3D_PERIPHERAL_TMU_READ)) { ++ return false; + } +- if (b_peripherals == V3D_PERIPHERAL_TMU_READ && +- (a_peripherals == V3D_PERIPHERAL_VPM_READ || +- a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { +- return true; ++ ++ /* Only one TLB access per instruction */ ++ if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE | ++ V3D_PERIPHERAL_TLB_READ)) && ++ (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE | ++ V3D_PERIPHERAL_TLB_READ))) { ++ return false; + } + +- return false; ++ return true; + } + + /* Compute a bitmask of which rf registers are used between +@@ -790,42 +941,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a, + uint64_t raddrs_used = 0; + if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) + raddrs_used |= (1ll << a->raddr_a); +- if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) ++ if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) + raddrs_used |= (1ll << a->raddr_b); + if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) + raddrs_used |= (1ll << b->raddr_a); +- if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) ++ if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) + raddrs_used |= (1ll << b->raddr_b); + + return raddrs_used; + } + +-/* Take two instructions and attempt to merge their raddr fields +- * into one merged instruction. Returns false if the two instructions +- * access more than two different rf registers between them, or more +- * than one rf register and one small immediate. ++/* Takes two instructions and attempts to merge their raddr fields (including ++ * small immediates) into one merged instruction. For V3D 4.x, returns false ++ * if the two instructions access more than two different rf registers between ++ * them, or more than one rf register and one small immediate. For 7.x returns ++ * false if both instructions use small immediates. 
+ */ + static bool + qpu_merge_raddrs(struct v3d_qpu_instr *result, + const struct v3d_qpu_instr *add_instr, +- const struct v3d_qpu_instr *mul_instr) ++ const struct v3d_qpu_instr *mul_instr, ++ const struct v3d_device_info *devinfo) + { ++ if (devinfo->ver >= 71) { ++ assert(add_instr->sig.small_imm_a + ++ add_instr->sig.small_imm_b <= 1); ++ assert(add_instr->sig.small_imm_c + ++ add_instr->sig.small_imm_d == 0); ++ assert(mul_instr->sig.small_imm_a + ++ mul_instr->sig.small_imm_b == 0); ++ assert(mul_instr->sig.small_imm_c + ++ mul_instr->sig.small_imm_d <= 1); ++ ++ result->sig.small_imm_a = add_instr->sig.small_imm_a; ++ result->sig.small_imm_b = add_instr->sig.small_imm_b; ++ result->sig.small_imm_c = mul_instr->sig.small_imm_c; ++ result->sig.small_imm_d = mul_instr->sig.small_imm_d; ++ ++ return (result->sig.small_imm_a + ++ result->sig.small_imm_b + ++ result->sig.small_imm_c + ++ result->sig.small_imm_d) <= 1; ++ } ++ ++ assert(devinfo->ver <= 42); ++ + uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); + int naddrs = util_bitcount64(raddrs_used); + + if (naddrs > 2) + return false; + +- if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { ++ if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) { + if (naddrs > 1) + return false; + +- if (add_instr->sig.small_imm && mul_instr->sig.small_imm) ++ if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b) + if (add_instr->raddr_b != mul_instr->raddr_b) + return false; + +- result->sig.small_imm = true; +- result->raddr_b = add_instr->sig.small_imm ? ++ result->sig.small_imm_b = true; ++ result->raddr_b = add_instr->sig.small_imm_b ? + add_instr->raddr_b : mul_instr->raddr_b; + } + +@@ -836,23 +1012,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, + raddrs_used &= ~(1ll << raddr_a); + result->raddr_a = raddr_a; + +- if (!result->sig.small_imm) { ++ if (!result->sig.small_imm_b) { + if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && + raddr_a == add_instr->raddr_b) { +- if (add_instr->alu.add.a == V3D_QPU_MUX_B) +- result->alu.add.a = V3D_QPU_MUX_A; +- if (add_instr->alu.add.b == V3D_QPU_MUX_B && ++ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B) ++ result->alu.add.a.mux = V3D_QPU_MUX_A; ++ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B && + v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { +- result->alu.add.b = V3D_QPU_MUX_A; ++ result->alu.add.b.mux = V3D_QPU_MUX_A; + } + } + if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && + raddr_a == mul_instr->raddr_b) { +- if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) +- result->alu.mul.a = V3D_QPU_MUX_A; +- if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && ++ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B) ++ result->alu.mul.a.mux = V3D_QPU_MUX_A; ++ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B && + v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { +- result->alu.mul.b = V3D_QPU_MUX_A; ++ result->alu.mul.b.mux = V3D_QPU_MUX_A; + } + } + } +@@ -863,20 +1039,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, + result->raddr_b = raddr_b; + if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && + raddr_b == add_instr->raddr_a) { +- if (add_instr->alu.add.a == V3D_QPU_MUX_A) +- result->alu.add.a = V3D_QPU_MUX_B; +- if (add_instr->alu.add.b == V3D_QPU_MUX_A && ++ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A) ++ result->alu.add.a.mux = V3D_QPU_MUX_B; ++ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A && + v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { +- result->alu.add.b = V3D_QPU_MUX_B; ++ result->alu.add.b.mux = V3D_QPU_MUX_B; + } + } + if 
(v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && + raddr_b == mul_instr->raddr_a) { +- if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) +- result->alu.mul.a = V3D_QPU_MUX_B; +- if (mul_instr->alu.mul.b == V3D_QPU_MUX_A && ++ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A) ++ result->alu.mul.a.mux = V3D_QPU_MUX_B; ++ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A && + v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { +- result->alu.mul.b = V3D_QPU_MUX_B; ++ result->alu.mul.b.mux = V3D_QPU_MUX_B; + } + } + +@@ -909,7 +1085,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op) + } + + static void +-qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) ++qpu_convert_add_to_mul(const struct v3d_device_info *devinfo, ++ struct v3d_qpu_instr *inst) + { + STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); + assert(inst->alu.add.op != V3D_QPU_A_NOP); +@@ -927,11 +1104,85 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) + inst->flags.auf = V3D_QPU_UF_NONE; + + inst->alu.mul.output_pack = inst->alu.add.output_pack; +- inst->alu.mul.a_unpack = inst->alu.add.a_unpack; +- inst->alu.mul.b_unpack = inst->alu.add.b_unpack; ++ ++ inst->alu.mul.a.unpack = inst->alu.add.a.unpack; ++ inst->alu.mul.b.unpack = inst->alu.add.b.unpack; + inst->alu.add.output_pack = V3D_QPU_PACK_NONE; +- inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; +- inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; ++ inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; ++ inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; ++ ++ if (devinfo->ver >= 71) { ++ assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d); ++ assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1); ++ if (inst->sig.small_imm_a) { ++ inst->sig.small_imm_c = true; ++ inst->sig.small_imm_a = false; ++ } else if (inst->sig.small_imm_b) { ++ inst->sig.small_imm_d = true; ++ inst->sig.small_imm_b = false; ++ } ++ } ++} ++ ++static bool ++can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op) ++{ ++ switch (op) { ++ case V3D_QPU_M_MOV: ++ case V3D_QPU_M_FMOV: ++ return devinfo->ver >= 71; ++ default: ++ return false; ++ } ++} ++ ++static enum v3d_qpu_mul_op ++mul_op_as_add_op(enum v3d_qpu_mul_op op) ++{ ++ switch (op) { ++ case V3D_QPU_M_MOV: ++ return V3D_QPU_A_MOV; ++ case V3D_QPU_M_FMOV: ++ return V3D_QPU_A_FMOV; ++ default: ++ unreachable("unexpected mov opcode"); ++ } ++} ++ ++static void ++qpu_convert_mul_to_add(struct v3d_qpu_instr *inst) ++{ ++ STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul)); ++ assert(inst->alu.mul.op != V3D_QPU_M_NOP); ++ assert(inst->alu.add.op == V3D_QPU_A_NOP); ++ ++ memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add)); ++ inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op); ++ inst->alu.mul.op = V3D_QPU_M_NOP; ++ ++ inst->flags.ac = inst->flags.mc; ++ inst->flags.apf = inst->flags.mpf; ++ inst->flags.auf = inst->flags.muf; ++ inst->flags.mc = V3D_QPU_COND_NONE; ++ inst->flags.mpf = V3D_QPU_PF_NONE; ++ inst->flags.muf = V3D_QPU_UF_NONE; ++ ++ inst->alu.add.output_pack = inst->alu.mul.output_pack; ++ inst->alu.add.a.unpack = inst->alu.mul.a.unpack; ++ inst->alu.add.b.unpack = inst->alu.mul.b.unpack; ++ inst->alu.mul.output_pack = V3D_QPU_PACK_NONE; ++ inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; ++ inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; ++ ++ assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b); ++ assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1); ++ if (inst->sig.small_imm_c) { ++ inst->sig.small_imm_a = true; ++ inst->sig.small_imm_c = false; ++ } else if (inst->sig.small_imm_d) { ++ 
inst->sig.small_imm_b = true; ++ inst->sig.small_imm_d = false; ++ } + } + + static bool +@@ -970,20 +1221,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + else if (a->alu.mul.op == V3D_QPU_M_NOP && + can_do_add_as_mul(b->alu.add.op)) { + mul_inst = *b; +- qpu_convert_add_to_mul(&mul_inst); ++ qpu_convert_add_to_mul(devinfo, &mul_inst); + + merge.alu.mul = mul_inst.alu.mul; + +- merge.flags.mc = b->flags.ac; +- merge.flags.mpf = b->flags.apf; +- merge.flags.muf = b->flags.auf; ++ merge.flags.mc = mul_inst.flags.mc; ++ merge.flags.mpf = mul_inst.flags.mpf; ++ merge.flags.muf = mul_inst.flags.muf; + + add_instr = a; + mul_instr = &mul_inst; + } else if (a->alu.mul.op == V3D_QPU_M_NOP && + can_do_add_as_mul(a->alu.add.op)) { + mul_inst = *a; +- qpu_convert_add_to_mul(&mul_inst); ++ qpu_convert_add_to_mul(devinfo, &mul_inst); + + merge = mul_inst; + merge.alu.add = b->alu.add; +@@ -999,22 +1250,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + } + } + ++ struct v3d_qpu_instr add_inst; + if (b->alu.mul.op != V3D_QPU_M_NOP) { +- if (a->alu.mul.op != V3D_QPU_M_NOP) +- return false; +- merge.alu.mul = b->alu.mul; ++ if (a->alu.mul.op == V3D_QPU_M_NOP) { ++ merge.alu.mul = b->alu.mul; ++ ++ merge.flags.mc = b->flags.mc; ++ merge.flags.mpf = b->flags.mpf; ++ merge.flags.muf = b->flags.muf; ++ ++ mul_instr = b; ++ add_instr = a; ++ } ++ /* If a's mul op is used but its add op is not, then see if we ++ * can convert either a's mul op or b's mul op to an add op ++ * so we can merge. ++ */ ++ else if (a->alu.add.op == V3D_QPU_A_NOP && ++ can_do_mul_as_add(devinfo, b->alu.mul.op)) { ++ add_inst = *b; ++ qpu_convert_mul_to_add(&add_inst); + +- merge.flags.mc = b->flags.mc; +- merge.flags.mpf = b->flags.mpf; +- merge.flags.muf = b->flags.muf; ++ merge.alu.add = add_inst.alu.add; + +- mul_instr = b; +- add_instr = a; ++ merge.flags.ac = add_inst.flags.ac; ++ merge.flags.apf = add_inst.flags.apf; ++ merge.flags.auf = add_inst.flags.auf; ++ ++ mul_instr = a; ++ add_instr = &add_inst; ++ } else if (a->alu.add.op == V3D_QPU_A_NOP && ++ can_do_mul_as_add(devinfo, a->alu.mul.op)) { ++ add_inst = *a; ++ qpu_convert_mul_to_add(&add_inst); ++ ++ merge = add_inst; ++ merge.alu.mul = b->alu.mul; ++ ++ merge.flags.mc = b->flags.mc; ++ merge.flags.mpf = b->flags.mpf; ++ merge.flags.muf = b->flags.muf; ++ ++ mul_instr = b; ++ add_instr = &add_inst; ++ } else { ++ return false; ++ } + } + ++ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and ++ * they have restrictions on the number of raddrs that can be adressed ++ * in a single instruction. In V3D 7.x, we don't have that restriction, ++ * but we are still limited to a single small immediate per instruction. ++ */ + if (add_instr && mul_instr && +- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) { +- return false; ++ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) { ++ return false; + } + + merge.sig.thrsw |= b->sig.thrsw; +@@ -1025,7 +1316,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, + merge.sig.ldtmu |= b->sig.ldtmu; + merge.sig.ldvary |= b->sig.ldvary; + merge.sig.ldvpm |= b->sig.ldvpm; +- merge.sig.small_imm |= b->sig.small_imm; + merge.sig.ldtlb |= b->sig.ldtlb; + merge.sig.ldtlbu |= b->sig.ldtlbu; + merge.sig.ucb |= b->sig.ucb; +@@ -1108,7 +1398,7 @@ retry: + * regfile A or B that was written to by the previous + * instruction." 
+ */ +- if (reads_too_soon_after_write(scoreboard, n->inst)) ++ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst)) + continue; + + if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst)) +@@ -1122,10 +1412,11 @@ retry: + if (pixel_scoreboard_too_soon(c, scoreboard, inst)) + continue; + +- /* ldunif and ldvary both write r5, but ldunif does so a tick +- * sooner. If the ldvary's r5 wasn't used, then ldunif might ++ /* ldunif and ldvary both write the same register (r5 for v42 ++ * and below, rf0 for v71), but ldunif does so a tick sooner. ++ * If the ldvary's register wasn't used, then ldunif might + * otherwise get scheduled so ldunif and ldvary try to update +- * r5 in the same tick. ++ * the register in the same tick. + */ + if ((inst->sig.ldunif || inst->sig.ldunifa) && + scoreboard->tick == scoreboard->last_ldvary_tick + 1) { +@@ -1204,11 +1495,20 @@ retry: + * ldvary now if the follow-up fixup would place + * it in the delay slots of a thrsw, which is not + * allowed and would prevent the fixup from being +- * successful. ++ * successful. In V3D 7.x we can allow this to happen ++ * as long as it is not the last delay slot. + */ +- if (inst->sig.ldvary && +- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) { +- continue; ++ if (inst->sig.ldvary) { ++ if (c->devinfo->ver <= 42 && ++ scoreboard->last_thrsw_tick + 2 >= ++ scoreboard->tick - 1) { ++ continue; ++ } ++ if (c->devinfo->ver >= 71 && ++ scoreboard->last_thrsw_tick + 2 == ++ scoreboard->tick - 1) { ++ continue; ++ } + } + + /* We can emit a new tmu lookup with a previous ldtmu +@@ -1243,7 +1543,7 @@ retry: + + int prio = get_instruction_priority(c->devinfo, inst); + +- if (mux_read_stalls(scoreboard, inst)) { ++ if (read_stalls(c->devinfo, scoreboard, inst)) { + /* Don't merge an instruction that stalls */ + if (prev_inst) + continue; +@@ -1340,6 +1640,62 @@ update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard, + } + } + ++static void ++set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard, ++ const struct v3d_qpu_instr *inst, ++ const struct v3d_device_info *devinfo) ++{ ++ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick && ++ v3d_qpu_sig_writes_address(devinfo, &inst->sig) && ++ !inst->sig_magic) { ++ scoreboard->has_rf0_flops_conflict = true; ++ } ++} ++ ++static void ++update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard, ++ const struct v3d_qpu_instr *inst, ++ const struct v3d_device_info *devinfo) ++{ ++ if (devinfo->ver < 71) ++ return; ++ ++ /* Thread switch restrictions: ++ * ++ * At the point of a thread switch or thread end (when the actual ++ * thread switch or thread end happens, not when the signalling ++ * instruction is processed): ++ * ++ * - If the most recent write to rf0 was from a ldunif, ldunifa, or ++ * ldvary instruction in which another signal also wrote to the ++ * register file, and the final instruction of the thread section ++ * contained a signal which wrote to the register file, then the ++ * value of rf0 is undefined at the start of the new section ++ * ++ * Here we use the scoreboard to track if our last rf0 implicit write ++ * happens at the same time that another signal writes the register ++ * file (has_rf0_flops_conflict). We will use that information when ++ * scheduling thrsw instructions to avoid putting anything in their ++ * last delay slot which has a signal that writes to the register file. ++ */ ++ ++ /* Reset tracking if we have an explicit rf0 write or we are starting ++ * a new thread section. 
++ */ ++ if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || ++ scoreboard->tick - scoreboard->last_thrsw_tick == 3) { ++ scoreboard->last_implicit_rf0_write_tick = -10; ++ scoreboard->has_rf0_flops_conflict = false; ++ } ++ ++ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) { ++ scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ? ++ scoreboard->tick + 1 : scoreboard->tick; ++ } ++ ++ set_has_rf0_flops_conflict(scoreboard, inst, devinfo); ++} ++ + static void + update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, + const struct qinst *qinst, +@@ -1383,6 +1739,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, + if (inst->sig.ldvary) + scoreboard->last_ldvary_tick = scoreboard->tick; + ++ update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo); ++ + update_scoreboard_tmu_tracking(scoreboard, qinst); + } + +@@ -1580,7 +1938,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + if (slot > 0 && qinst->uniform != ~0) + return false; + +- if (v3d_qpu_waits_vpm(inst)) ++ if (c->devinfo->ver <= 42 && v3d_qpu_waits_vpm(inst)) + return false; + + if (inst->sig.ldvary) +@@ -1588,35 +1946,67 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + /* GFXH-1625: TMUWT not allowed in the final instruction. */ +- if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) ++ if (c->devinfo->ver <= 42 && slot == 2 && ++ inst->alu.add.op == V3D_QPU_A_TMUWT) { + return false; ++ } + +- /* No writing physical registers at the end. */ +- bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; +- bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; +- if ((!add_is_nop && !inst->alu.add.magic_write) || +- (!mul_is_nop && !inst->alu.mul.magic_write)) { +- return false; ++ if (c->devinfo->ver <= 42) { ++ /* No writing physical registers at the end. */ ++ bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; ++ bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; ++ if ((!add_is_nop && !inst->alu.add.magic_write) || ++ (!mul_is_nop && !inst->alu.mul.magic_write)) { ++ return false; ++ } ++ ++ if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && ++ !inst->sig_magic) { ++ return false; ++ } + } + +- if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && +- !inst->sig_magic) { +- return false; ++ if (c->devinfo->ver >= 71) { ++ /* The thread end instruction must not write to the ++ * register file via the add/mul ALUs. ++ */ ++ if (slot == 0 && ++ (!inst->alu.add.magic_write || ++ !inst->alu.mul.magic_write)) { ++ return false; ++ } + } + + if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) + return false; + +- /* RF0-2 might be overwritten during the delay slots by +- * fragment shader setup. +- */ +- if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) +- return false; ++ if (c->devinfo->ver <= 42) { ++ /* RF0-2 might be overwritten during the delay slots by ++ * fragment shader setup. ++ */ ++ if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) ++ return false; + +- if (inst->raddr_b < 3 && +- !inst->sig.small_imm && +- v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { +- return false; ++ if (inst->raddr_b < 3 && ++ !inst->sig.small_imm_b && ++ v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { ++ return false; ++ } ++ } ++ ++ if (c->devinfo->ver >= 71) { ++ /* RF2-3 might be overwritten during the delay slots by ++ * fragment shader setup. 
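/*
 * Editorial aside, not part of the patch: the version split that
 * qpu_inst_valid_in_thrend_slot() applies above is easier to see in one
 * place. Up to V3D 4.2 the thread-end delay slots may not write the
 * register file at all and may not read rf0-rf2 (which carry W,
 * centroid W and Z); on V3D 7.1 the payload moved, so it is rf2-rf3
 * that must be left alone. Everything named toy_* below is an
 * illustrative sketch, not Mesa code.
 */
#include <stdbool.h>
#include <stdint.h>

/* Register-file addresses that fragment shader setup may clobber while
 * the thread-end delay slots execute, per hardware generation. */
static bool
toy_rf_clobbered_in_thrend_slots(int ver, uint8_t rf)
{
        if (ver <= 42)
                return rf <= 2;         /* rf0-rf2: W, centroid W, Z */
        return rf == 2 || rf == 3;      /* rf2-rf3 on V3D 7.1 */
}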
++ */ ++ if (v3d71_qpu_reads_raddr(inst, 2) || ++ v3d71_qpu_reads_raddr(inst, 3)) { ++ return false; ++ } ++ ++ if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) || ++ v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) { ++ return false; ++ } + } + } + +@@ -1632,6 +2022,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, + */ + static bool + qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, ++ struct choose_scoreboard *scoreboard, + const struct qinst *qinst, + uint32_t slot) + { +@@ -1642,8 +2033,12 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, + if (slot > 0 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu)) + return false; + +- if (slot > 0 && qinst->qpu.sig.ldvary) +- return false; ++ if (qinst->qpu.sig.ldvary) { ++ if (c->devinfo->ver <= 42 && slot > 0) ++ return false; ++ if (c->devinfo->ver >= 71 && slot == 2) ++ return false; ++ } + + /* unifa and the following 3 instructions can't overlap a + * thread switch/end. The docs further clarify that this means +@@ -1662,6 +2057,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, + if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu)) + return false; + ++ /* See comment when we set has_rf0_flops_conflict for details */ ++ if (c->devinfo->ver >= 71 && ++ slot == 2 && ++ v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) && ++ !qinst->qpu.sig_magic) { ++ if (scoreboard->has_rf0_flops_conflict) ++ return false; ++ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick) ++ return false; ++ } ++ + return true; + } + +@@ -1694,7 +2100,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, + * also apply to instructions scheduled after the thrsw that we want + * to place in its delay slots. + */ +- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) ++ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot)) + return false; + + /* TLB access is disallowed until scoreboard wait is executed, which +@@ -1767,8 +2173,10 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard + bool is_thrend) + { + for (int slot = 0; slot < instructions_in_sequence; slot++) { +- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) ++ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, ++ qinst, slot)) { + return false; ++ } + + if (is_thrend && + !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) { +@@ -1969,10 +2377,11 @@ emit_branch(struct v3d_compile *c, + assert(scoreboard->last_branch_tick + 3 < branch_tick); + assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); + +- /* Can't place a branch with msfign != 0 and cond != 0,2,3 after ++ /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after + * setmsf. 
+ */ + bool is_safe_msf_branch = ++ c->devinfo->ver >= 71 || + inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || + inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || + inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || +@@ -2056,46 +2465,72 @@ emit_branch(struct v3d_compile *c, + } + + static bool +-alu_reads_register(struct v3d_qpu_instr *inst, ++alu_reads_register(const struct v3d_device_info *devinfo, ++ struct v3d_qpu_instr *inst, + bool add, bool magic, uint32_t index) + { + uint32_t num_src; +- enum v3d_qpu_mux mux_a, mux_b; +- +- if (add) { ++ if (add) + num_src = v3d_qpu_add_op_num_src(inst->alu.add.op); +- mux_a = inst->alu.add.a; +- mux_b = inst->alu.add.b; +- } else { ++ else + num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op); +- mux_a = inst->alu.mul.a; +- mux_b = inst->alu.mul.b; +- } + +- for (int i = 0; i < num_src; i++) { +- if (magic) { +- if (i == 0 && mux_a == index) +- return true; +- if (i == 1 && mux_b == index) +- return true; ++ if (devinfo->ver <= 42) { ++ enum v3d_qpu_mux mux_a, mux_b; ++ if (add) { ++ mux_a = inst->alu.add.a.mux; ++ mux_b = inst->alu.add.b.mux; + } else { +- if (i == 0 && mux_a == V3D_QPU_MUX_A && +- inst->raddr_a == index) { +- return true; +- } +- if (i == 0 && mux_a == V3D_QPU_MUX_B && +- inst->raddr_b == index) { +- return true; +- } +- if (i == 1 && mux_b == V3D_QPU_MUX_A && +- inst->raddr_a == index) { +- return true; +- } +- if (i == 1 && mux_b == V3D_QPU_MUX_B && +- inst->raddr_b == index) { +- return true; ++ mux_a = inst->alu.mul.a.mux; ++ mux_b = inst->alu.mul.b.mux; ++ } ++ ++ for (int i = 0; i < num_src; i++) { ++ if (magic) { ++ if (i == 0 && mux_a == index) ++ return true; ++ if (i == 1 && mux_b == index) ++ return true; ++ } else { ++ if (i == 0 && mux_a == V3D_QPU_MUX_A && ++ inst->raddr_a == index) { ++ return true; ++ } ++ if (i == 0 && mux_a == V3D_QPU_MUX_B && ++ inst->raddr_b == index) { ++ return true; ++ } ++ if (i == 1 && mux_b == V3D_QPU_MUX_A && ++ inst->raddr_a == index) { ++ return true; ++ } ++ if (i == 1 && mux_b == V3D_QPU_MUX_B && ++ inst->raddr_b == index) { ++ return true; ++ } + } + } ++ ++ return false; ++ } ++ ++ assert(devinfo->ver >= 71); ++ assert(!magic); ++ ++ uint32_t raddr_a, raddr_b; ++ if (add) { ++ raddr_a = inst->alu.add.a.raddr; ++ raddr_b = inst->alu.add.b.raddr; ++ } else { ++ raddr_a = inst->alu.mul.a.raddr; ++ raddr_b = inst->alu.mul.b.raddr; ++ } ++ ++ for (int i = 0; i < num_src; i++) { ++ if (i == 0 && raddr_a == index) ++ return true; ++ if (i == 1 && raddr_b == index) ++ return true; + } + + return false; +@@ -2130,6 +2565,8 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + struct qblock *block, + struct v3d_qpu_instr *inst) + { ++ const struct v3d_device_info *devinfo = c->devinfo; ++ + /* We only call this if we have successfully merged an ldvary into a + * previous instruction. + */ +@@ -2142,9 +2579,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + * the ldvary destination, if it does, then moving the ldvary before + * it would overwrite it. 
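/*
 * Editorial aside, not part of the patch: the two paths in the
 * alu_reads_register() rewrite above reflect a change in operand
 * encoding. Up to V3D 4.2 each ALU operand is a mux that can point at
 * one of two shared read addresses (raddr_a/raddr_b); on V3D 7.1 every
 * operand carries its own register-file address. The toy_* types and
 * values below are a simplified illustration, not the real encoding.
 */
#include <stdbool.h>
#include <stdint.h>

#define TOY_MUX_A 6   /* toy mux value meaning "read raddr_a" */
#define TOY_MUX_B 7   /* toy mux value meaning "read raddr_b" */

struct toy_operand {
        uint8_t mux;    /* used on <= 4.2 */
        uint8_t raddr;  /* used on >= 7.1 */
};

struct toy_alu {
        struct toy_operand a, b;
        uint8_t raddr_a, raddr_b;   /* shared read addresses, <= 4.2 only */
};

/* Does either operand of this (non-magic) ALU op read register file
 * address 'index'? */
static bool
toy_alu_reads_rf(int ver, const struct toy_alu *alu, uint8_t index)
{
        if (ver <= 42) {
                return (alu->a.mux == TOY_MUX_A && alu->raddr_a == index) ||
                       (alu->a.mux == TOY_MUX_B && alu->raddr_b == index) ||
                       (alu->b.mux == TOY_MUX_A && alu->raddr_a == index) ||
                       (alu->b.mux == TOY_MUX_B && alu->raddr_b == index);
        }
        /* 7.1: no muxes, each operand names its own register. */
        return alu->a.raddr == index || alu->b.raddr == index;
}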
+ */ +- if (alu_reads_register(inst, true, ldvary_magic, ldvary_index)) ++ if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index)) + return false; +- if (alu_reads_register(inst, false, ldvary_magic, ldvary_index)) ++ if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index)) + return false; + + /* The implicit ldvary destination may not be written to by a signal +@@ -2180,13 +2617,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + } + + /* The previous instruction cannot have a conflicting signal */ +- if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig)) ++ if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig)) + return false; + + uint32_t sig; + struct v3d_qpu_sig new_sig = prev->qpu.sig; + new_sig.ldvary = true; +- if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig)) ++ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) + return false; + + /* The previous instruction cannot use flags since ldvary uses the +@@ -2199,9 +2636,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + + /* We can't put an ldvary in the delay slots of a thrsw. We should've + * prevented this when pairing up the ldvary with another instruction +- * and flagging it for a fixup. ++ * and flagging it for a fixup. In V3D 7.x this is limited only to the ++ * second delay slot. + */ +- assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1); ++ assert((devinfo->ver <= 42 && ++ scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) || ++ (devinfo->ver >= 71 && ++ scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1)); + + /* Move the ldvary to the previous instruction and remove it from the + * current one. +@@ -2215,14 +2656,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c, + inst->sig_magic = false; + inst->sig_addr = 0; + +- /* By moving ldvary to the previous instruction we make it update +- * r5 in the current one, so nothing else in it should write r5. +- * This should've been prevented by our dependency tracking, which ++ /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */ ++ if (devinfo->ver >= 71) { ++ scoreboard->last_implicit_rf0_write_tick = scoreboard->tick; ++ set_has_rf0_flops_conflict(scoreboard, inst, devinfo); ++ } ++ ++ /* By moving ldvary to the previous instruction we make it update r5 ++ * (rf0 for ver >= 71) in the current one, so nothing else in it ++ * should write this register. ++ * ++ * This should've been prevented by our depedency tracking, which + * would not allow ldvary to be paired up with an instruction that +- * writes r5 (since our dependency tracking doesn't know that the +- * ldvary write r5 happens in the next instruction). ++ * writes r5/rf0 (since our dependency tracking doesn't know that the ++ * ldvary write to r5/rf0 happens in the next instruction). 
+ */ +- assert(!v3d_qpu_writes_r5(c->devinfo, inst)); ++ assert(!v3d_qpu_writes_r5(devinfo, inst)); ++ assert(devinfo->ver <= 42 || ++ (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) && ++ !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0))); + + return true; + } +@@ -2313,7 +2765,7 @@ schedule_instructions(struct v3d_compile *c, + } + } + } +- if (mux_read_stalls(scoreboard, inst)) ++ if (read_stalls(c->devinfo, scoreboard, inst)) + c->qpu_inst_stalled_count++; + } + +@@ -2538,6 +2990,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) + scoreboard.last_setmsf_tick = -10; + scoreboard.last_stallable_sfu_tick = -10; + scoreboard.first_ldtmu_after_thrsw = true; ++ scoreboard.last_implicit_rf0_write_tick = - 10; + + if (debug) { + fprintf(stderr, "Pre-schedule instructions\n"); +diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c +index 2cc7a0eb0ae6..0466ee5d0b69 100644 +--- a/src/broadcom/compiler/qpu_validate.c ++++ b/src/broadcom/compiler/qpu_validate.c +@@ -41,6 +41,7 @@ struct v3d_qpu_validate_state { + int last_sfu_write; + int last_branch_ip; + int last_thrsw_ip; ++ int first_tlb_z_write; + + /* Set when we've found the last-THRSW signal, or if we were started + * in single-segment mode. +@@ -110,11 +111,58 @@ static void + qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + { + const struct v3d_device_info *devinfo = state->c->devinfo; ++ ++ if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write) ++ state->first_tlb_z_write = state->ip; ++ + const struct v3d_qpu_instr *inst = &qinst->qpu; + ++ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && ++ state->first_tlb_z_write >= 0 && ++ state->ip > state->first_tlb_z_write && ++ inst->branch.msfign != V3D_QPU_MSFIGN_NONE && ++ inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && ++ inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && ++ inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { ++ fail_instr(state, "Implicit branch MSF read after TLB Z write"); ++ } ++ + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return; + ++ if (inst->alu.add.op == V3D_QPU_A_SETMSF && ++ state->first_tlb_z_write >= 0 && ++ state->ip > state->first_tlb_z_write) { ++ fail_instr(state, "SETMSF after TLB Z write"); ++ } ++ ++ if (state->first_tlb_z_write >= 0 && ++ state->ip > state->first_tlb_z_write && ++ inst->alu.add.op == V3D_QPU_A_MSF) { ++ fail_instr(state, "MSF read after TLB Z write"); ++ } ++ ++ if (devinfo->ver < 71) { ++ if (inst->sig.small_imm_a || inst->sig.small_imm_c || ++ inst->sig.small_imm_d) { ++ fail_instr(state, "small imm a/c/d added after V3D 7.1"); ++ } ++ } else { ++ if ((inst->sig.small_imm_a || inst->sig.small_imm_b) && ++ !vir_is_add(qinst)) { ++ fail_instr(state, "small imm a/b used but no ADD inst"); ++ } ++ if ((inst->sig.small_imm_c || inst->sig.small_imm_d) && ++ !vir_is_mul(qinst)) { ++ fail_instr(state, "small imm c/d used but no MUL inst"); ++ } ++ if (inst->sig.small_imm_a + inst->sig.small_imm_b + ++ inst->sig.small_imm_c + inst->sig.small_imm_d > 1) { ++ fail_instr(state, "only one small immediate can be " ++ "enabled per instruction"); ++ } ++ } ++ + /* LDVARY writes r5 two instructions later and LDUNIF writes + * r5 one instruction later, which is illegal to have + * together. 
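/*
 * Editorial aside, not part of the patch: a standalone restatement of
 * the small-immediate rules that the new qpu_validate_inst() checks
 * above enforce. Before V3D 7.1 only small_imm_b exists; on 7.1,
 * small_imm_a/b feed the ADD ALU and small_imm_c/d feed the MUL ALU,
 * and at most one of the four signals may be set per instruction. The
 * toy_* names are illustrative only.
 */
#include <stdbool.h>

struct toy_sig {
        bool small_imm_a, small_imm_b, small_imm_c, small_imm_d;
};

static bool
toy_small_imm_sigs_valid(int ver, struct toy_sig s, bool has_add, bool has_mul)
{
        if (ver < 71)
                return !s.small_imm_a && !s.small_imm_c && !s.small_imm_d;

        if (s.small_imm_a + s.small_imm_b + s.small_imm_c + s.small_imm_d > 1)
                return false;                   /* one immediate per instruction */
        if ((s.small_imm_a || s.small_imm_b) && !has_add)
                return false;                   /* a/b require an ADD op */
        if ((s.small_imm_c || s.small_imm_d) && !has_mul)
                return false;                   /* c/d require a MUL op */
        return true;
}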
+@@ -194,8 +242,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + "SFU write started during THRSW delay slots "); + } + +- if (inst->sig.ldvary) +- fail_instr(state, "LDVARY during THRSW delay slots"); ++ if (inst->sig.ldvary) { ++ if (devinfo->ver <= 42) ++ fail_instr(state, "LDVARY during THRSW delay slots"); ++ if (devinfo->ver >= 71 && ++ state->ip - state->last_thrsw_ip == 2) { ++ fail_instr(state, "LDVARY in 2nd THRSW delay slot"); ++ } ++ } + } + + (void)qpu_magic_waddr_matches; /* XXX */ +@@ -222,7 +276,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + vpm_writes + + tlb_writes + + tsy_writes + +- inst->sig.ldtmu + ++ (devinfo->ver <= 42 ? inst->sig.ldtmu : 0) + + inst->sig.ldtlb + + inst->sig.ldvpm + + inst->sig.ldtlbu > 1) { +@@ -262,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) + inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if ((inst->alu.add.op != V3D_QPU_A_NOP && + !inst->alu.add.magic_write)) { +- fail_instr(state, "RF write after THREND"); ++ if (devinfo->ver <= 42) { ++ fail_instr(state, "RF write after THREND"); ++ } else if (devinfo->ver >= 71) { ++ if (state->last_thrsw_ip - state->ip == 0) { ++ fail_instr(state, ++ "ADD RF write at THREND"); ++ } ++ if (inst->alu.add.waddr == 2 || ++ inst->alu.add.waddr == 3) { ++ fail_instr(state, ++ "RF2-3 write after THREND"); ++ } ++ } + } + + if ((inst->alu.mul.op != V3D_QPU_M_NOP && + !inst->alu.mul.magic_write)) { +- fail_instr(state, "RF write after THREND"); ++ if (devinfo->ver <= 42) { ++ fail_instr(state, "RF write after THREND"); ++ } else if (devinfo->ver >= 71) { ++ if (state->last_thrsw_ip - state->ip == 0) { ++ fail_instr(state, ++ "MUL RF write at THREND"); ++ } ++ ++ if (inst->alu.mul.waddr == 2 || ++ inst->alu.mul.waddr == 3) { ++ fail_instr(state, ++ "RF2-3 write after THREND"); ++ } ++ } + } + + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic) { +- fail_instr(state, "RF write after THREND"); ++ if (devinfo->ver <= 42) { ++ fail_instr(state, "RF write after THREND"); ++ } else if (devinfo->ver >= 71 && ++ (inst->sig_addr == 2 || ++ inst->sig_addr == 3)) { ++ fail_instr(state, "RF2-3 write after THREND"); ++ } + } + + /* GFXH-1625: No TMUWT in the last instruction */ +@@ -321,6 +406,7 @@ qpu_validate(struct v3d_compile *c) + .last_sfu_write = -10, + .last_thrsw_ip = -10, + .last_branch_ip = -10, ++ .first_tlb_z_write = INT_MAX, + .ip = 0, + + .last_thrsw_found = !c->last_thrsw, +diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h +index eb4e692464b2..889979cdda07 100644 +--- a/src/broadcom/compiler/v3d_compiler.h ++++ b/src/broadcom/compiler/v3d_compiler.h +@@ -613,6 +613,11 @@ struct v3d_ra_node_info { + struct { + uint32_t priority; + uint8_t class_bits; ++ bool is_program_end; ++ bool unused; ++ ++ /* V3D 7.x */ ++ bool is_ldunif_dst; + } *info; + uint32_t alloc_count; + }; +@@ -1149,8 +1154,8 @@ bool vir_is_raw_mov(struct qinst *inst); + bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst); + bool vir_is_add(struct qinst *inst); + bool vir_is_mul(struct qinst *inst); +-bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst); +-bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst); ++bool vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst); ++bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst); + struct qreg 
vir_follow_movs(struct v3d_compile *c, struct qreg reg); + uint8_t vir_channels_written(struct qinst *inst); + struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i); +diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c +index 3ef0e398228a..4cdba3748a1c 100644 +--- a/src/broadcom/compiler/v3d_nir_lower_io.c ++++ b/src/broadcom/compiler/v3d_nir_lower_io.c +@@ -600,9 +600,13 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, + * The correct fix for this as recommended by Broadcom + * is to convert to .8 fixed-point with ffloor(). + */ +- pos = nir_f2i32(b, nir_ffloor(b, pos)); +- v3d_nir_store_output(b, state->vp_vpm_offset + i, +- offset_reg, pos); ++ if (c->devinfo->ver <= 42) ++ pos = nir_f2i32(b, nir_ffloor(b, pos)); ++ else ++ pos = nir_f2i32(b, nir_fround_even(b, pos)); ++ ++ v3d_nir_store_output(b, state->vp_vpm_offset + i, ++ offset_reg, pos); + } + } + +diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c +index 660b11b05776..f6965012d93c 100644 +--- a/src/broadcom/compiler/vir.c ++++ b/src/broadcom/compiler/vir.c +@@ -113,10 +113,10 @@ vir_is_raw_mov(struct qinst *inst) + return false; + } + +- if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE || +- inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE || +- inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || +- inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) { ++ if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE || ++ inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE || ++ inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || ++ inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) { + return false; + } + +@@ -156,8 +156,12 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst) + } + + bool +-vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) ++vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, ++ struct qinst *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + for (int i = 0; i < vir_get_nsrc(inst); i++) { + switch (inst->src[i].file) { + case QFILE_VPM: +@@ -178,8 +182,12 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) + } + + bool +-vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) ++vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, ++ struct qinst *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + switch (inst->dst.file) { + case QFILE_MAGIC: + switch (inst->dst.index) { +@@ -209,15 +217,15 @@ vir_set_unpack(struct qinst *inst, int src, + + if (vir_is_add(inst)) { + if (src == 0) +- inst->qpu.alu.add.a_unpack = unpack; ++ inst->qpu.alu.add.a.unpack = unpack; + else +- inst->qpu.alu.add.b_unpack = unpack; ++ inst->qpu.alu.add.b.unpack = unpack; + } else { + assert(vir_is_mul(inst)); + if (src == 0) +- inst->qpu.alu.mul.a_unpack = unpack; ++ inst->qpu.alu.mul.a.unpack = unpack; + else +- inst->qpu.alu.mul.b_unpack = unpack; ++ inst->qpu.alu.mul.b.unpack = unpack; + } + } + +@@ -737,6 +745,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c, + + /* Set us up for shared input/output segments. This is apparently + * necessary for our VCM setup to avoid varying corruption. ++ * ++ * FIXME: initial testing on V3D 7.1 seems to work fine when using ++ * separate segments. So we could try to reevaluate in the future, if ++ * there is any advantage of using separate segments. 
+ */ + prog_data->separate_segments = false; + prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size, +diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c +index 5c47bbdc1b01..ab5d40430393 100644 +--- a/src/broadcom/compiler/vir_dump.c ++++ b/src/broadcom/compiler/vir_dump.c +@@ -270,8 +270,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) + vir_print_reg(c, inst, inst->dst); + fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack)); + +- unpack[0] = instr->alu.add.a_unpack; +- unpack[1] = instr->alu.add.b_unpack; ++ unpack[0] = instr->alu.add.a.unpack; ++ unpack[1] = instr->alu.add.b.unpack; + } else { + fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op)); + fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc)); +@@ -282,8 +282,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) + vir_print_reg(c, inst, inst->dst); + fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack)); + +- unpack[0] = instr->alu.mul.a_unpack; +- unpack[1] = instr->alu.mul.b_unpack; ++ unpack[0] = instr->alu.mul.a.unpack; ++ unpack[1] = instr->alu.mul.b.unpack; + } + + for (int i = 0; i < nsrc; i++) { +diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c +index 575b0481dc81..d1f44aa9cf76 100644 +--- a/src/broadcom/compiler/vir_live_variables.c ++++ b/src/broadcom/compiler/vir_live_variables.c +@@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c) + flags_inst = NULL; + } + +- /* Payload registers: r0/1/2 contain W, centroid W, +- * and Z at program start. Register allocation will +- * force their nodes to R0/1/2. ++ /* Payload registers: for fragment shaders, W, ++ * centroid W, and Z will be initialized in r0/1/2 ++ * until v42, or r1/r2/r3 since v71. ++ * ++ * For compute shaders, payload is in r0/r2 up to v42, ++ * r2/r3 since v71. ++ * ++ * Register allocation will force their nodes to those ++ * registers. + */ + if (inst->src[0].file == QFILE_REG) { +- switch (inst->src[0].index) { +- case 0: +- case 1: +- case 2: ++ uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0; ++ uint32_t max_payload_r = c->devinfo->ver >= 71 ? 3 : 2; ++ if (inst->src[0].index >= min_payload_r || ++ inst->src[0].index <= max_payload_r) { + c->temp_start[inst->dst.index] = 0; +- break; + } + } + +diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c +index da121c2a5bd4..1260838ca056 100644 +--- a/src/broadcom/compiler/vir_opt_copy_propagate.c ++++ b/src/broadcom/compiler/vir_opt_copy_propagate.c +@@ -35,7 +35,7 @@ + #include "v3d_compiler.h" + + static bool +-is_copy_mov(struct qinst *inst) ++is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst) + { + if (!inst) + return false; +@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst) + return false; + } + +- switch (inst->src[0].file) { +- case QFILE_MAGIC: +- /* No copy propagating from R3/R4/R5 -- the MOVs from those +- * are there to register allocate values produced into R3/4/5 +- * to other regs (though hopefully r3/4/5). +- */ +- switch (inst->src[0].index) { +- case V3D_QPU_WADDR_R3: +- case V3D_QPU_WADDR_R4: +- case V3D_QPU_WADDR_R5: +- return false; ++ if (devinfo->ver <= 42) { ++ switch (inst->src[0].file) { ++ case QFILE_MAGIC: ++ /* No copy propagating from R3/R4/R5 -- the MOVs from ++ * those are there to register allocate values produced ++ * into R3/4/5 to other regs (though hopefully r3/4/5). 
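/*
 * Editorial aside, not part of the patch: the payload comment in
 * vir_setup_def_use() above and the copy-propagation exclusions below
 * both follow from where the fragment-shader payload lives. W,
 * centroid W and Z are delivered in rf0-rf2 up to V3D 4.2 and in
 * rf1-rf3 on V3D 7.1, so MOVs whose source is one of those registers
 * only exist to give the payload a temp and must not be propagated.
 * The toy_* helper is illustrative only.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
toy_is_fs_payload_rf(int ver, uint32_t rf)
{
        uint32_t lo = ver >= 71 ? 1 : 0;
        uint32_t hi = ver >= 71 ? 3 : 2;
        return rf >= lo && rf <= hi;
}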
++ */ ++ switch (inst->src[0].index) { ++ case V3D_QPU_WADDR_R3: ++ case V3D_QPU_WADDR_R4: ++ case V3D_QPU_WADDR_R5: ++ return false; ++ default: ++ break; ++ } ++ break; ++ ++ case QFILE_REG: ++ switch (inst->src[0].index) { ++ case 0: ++ case 1: ++ case 2: ++ /* MOVs from rf0/1/2 are only to track the live ++ * intervals for W/centroid W/Z. ++ */ ++ return false; ++ } ++ break; ++ + default: + break; + } +- break; +- +- case QFILE_REG: +- switch (inst->src[0].index) { +- case 0: +- case 1: +- case 2: +- /* MOVs from rf0/1/2 are only to track the live ++ } else { ++ assert(devinfo->ver >= 71); ++ switch (inst->src[0].file) { ++ case QFILE_REG: ++ switch (inst->src[0].index) { ++ /* MOVs from rf1/2/3 are only to track the live + * intervals for W/centroid W/Z. ++ * ++ * Note: rf0 can be implicitly written by ldvary ++ * (no temp involved), so it is not an SSA value and ++ * could clash with writes to other temps that are ++ * also allocated to rf0. In theory, that would mean ++ * that we can't copy propagate from it, but we handle ++ * this at register allocation time, preventing temps ++ * from being allocated to rf0 while the rf0 value from ++ * ldvary is still live. + */ +- return false; +- } +- break; ++ case 1: ++ case 2: ++ case 3: ++ return false; ++ } ++ break; + +- default: +- break; ++ default: ++ break; ++ } + } + + return true; +@@ -104,14 +133,14 @@ vir_has_unpack(struct qinst *inst, int chan) + + if (vir_is_add(inst)) { + if (chan == 0) +- return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE; ++ return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE; + else +- return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE; ++ return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE; + } else { + if (chan == 0) +- return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE; ++ return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE; + else +- return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE; ++ return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE; + } + } + +@@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) + */ + struct qinst *mov = movs[inst->src[i].index]; + if (!mov) { +- if (!is_copy_mov(c->defs[inst->src[i].index])) ++ if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index])) + continue; + mov = c->defs[inst->src[i].index]; + +@@ -161,7 +190,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) + continue; + + /* these ops can't represent abs. 
*/ +- if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) { ++ if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_VFPACK: + case V3D_QPU_A_FROUND: +@@ -189,7 +218,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) + + inst->src[i] = mov->src[0]; + if (vir_has_unpack(mov, 0)) { +- enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack; ++ enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack; + + vir_set_unpack(inst, i, unpack); + } +@@ -245,7 +274,7 @@ vir_opt_copy_propagate(struct v3d_compile *c) + + apply_kills(c, movs, inst); + +- if (is_copy_mov(inst)) ++ if (is_copy_mov(c->devinfo, inst)) + movs[inst->dst.index] = inst; + } + } +diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c +index c7896d57f2bd..6b61ed6a39ac 100644 +--- a/src/broadcom/compiler/vir_opt_redundant_flags.c ++++ b/src/broadcom/compiler/vir_opt_redundant_flags.c +@@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b) + a->qpu.flags.mpf != b->qpu.flags.mpf || + a->qpu.alu.add.op != b->qpu.alu.add.op || + a->qpu.alu.mul.op != b->qpu.alu.mul.op || +- a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack || +- a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack || ++ a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack || ++ a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack || + a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack || +- a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack || +- a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack || ++ a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack || ++ a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack || + a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) { + return false; + } +diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c +index 47d7722968d8..ed5bc0119642 100644 +--- a/src/broadcom/compiler/vir_opt_small_immediates.c ++++ b/src/broadcom/compiler/vir_opt_small_immediates.c +@@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c) + /* The small immediate value sits in the raddr B field, so we + * can't have 2 small immediates in one instruction (unless + * they're the same value, but that should be optimized away +- * elsewhere). ++ * elsewhere). Since 7.x we can encode small immediates in ++ * any raddr field, but each instruction can still only use ++ * one. 
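/*
 * Editorial aside, not part of the patch: a sketch of the signal
 * selection the optimization below performs. Before V3D 7.1 a small
 * immediate always replaces raddr_b, so only small_imm_b exists; on
 * 7.1 the signal depends on which ALU and which operand consumes the
 * immediate. The toy_* names are illustrative only.
 */
#include <stdbool.h>

enum toy_smimm_sig { TOY_SMIMM_A, TOY_SMIMM_B, TOY_SMIMM_C, TOY_SMIMM_D };

static enum toy_smimm_sig
toy_pick_small_imm_sig(int ver, bool is_add_op, int src_index)
{
        if (ver <= 42)
                return TOY_SMIMM_B;             /* only encoding available */
        if (is_add_op)
                return src_index == 0 ? TOY_SMIMM_A : TOY_SMIMM_B;
        return src_index == 0 ? TOY_SMIMM_C : TOY_SMIMM_D;
}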
+ */ + bool uses_small_imm = false; + for (int i = 0; i < vir_get_nsrc(inst); i++) { +@@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c) + */ + struct v3d_qpu_sig new_sig = inst->qpu.sig; + uint32_t sig_packed; +- new_sig.small_imm = true; ++ if (c->devinfo->ver <= 42) { ++ new_sig.small_imm_b = true; ++ } else { ++ if (vir_is_add(inst)) { ++ if (i == 0) ++ new_sig.small_imm_a = true; ++ else ++ new_sig.small_imm_b = true; ++ } else { ++ if (i == 0) ++ new_sig.small_imm_c = true; ++ else ++ new_sig.small_imm_d = true; ++ } ++ } ++ + if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed)) + continue; + +@@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c) + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } +- inst->qpu.sig.small_imm = true; ++ inst->qpu.sig.small_imm_a = new_sig.small_imm_a; ++ inst->qpu.sig.small_imm_b = new_sig.small_imm_b; ++ inst->qpu.sig.small_imm_c = new_sig.small_imm_c; ++ inst->qpu.sig.small_imm_d = new_sig.small_imm_d; + inst->qpu.raddr_b = packed; + + inst->src[i].file = QFILE_SMALL_IMM; +diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c +index b22f915d1dfc..8eac2b75bd79 100644 +--- a/src/broadcom/compiler/vir_register_allocate.c ++++ b/src/broadcom/compiler/vir_register_allocate.c +@@ -28,41 +28,73 @@ + + #define ACC_INDEX 0 + #define ACC_COUNT 6 +-#define PHYS_INDEX (ACC_INDEX + ACC_COUNT) +-#define PHYS_COUNT 64 + ++/* RA nodes used to track RF registers with implicit writes */ ++#define IMPLICIT_RF_COUNT 1 ++ ++#define PHYS_COUNT 64 ++ ++static uint8_t ++get_phys_index(const struct v3d_device_info *devinfo) ++{ ++ if (devinfo->has_accumulators) ++ return ACC_INDEX + ACC_COUNT; ++ else ++ return 0; ++} ++ ++/* ACC as accumulator */ + #define CLASS_BITS_PHYS (1 << 0) + #define CLASS_BITS_ACC (1 << 1) + #define CLASS_BITS_R5 (1 << 4) +-#define CLASS_BITS_ANY (CLASS_BITS_PHYS | \ +- CLASS_BITS_ACC | \ +- CLASS_BITS_R5) ++ ++static uint8_t ++get_class_bit_any(const struct v3d_device_info *devinfo) ++{ ++ if (devinfo->has_accumulators) ++ return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5); ++ else ++ return CLASS_BITS_PHYS; ++} ++ ++static uint8_t ++filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits) ++{ ++ if (!devinfo->has_accumulators) { ++ assert(class_bits & CLASS_BITS_PHYS); ++ class_bits = CLASS_BITS_PHYS; ++ } ++ return class_bits; ++} + + static inline uint32_t +-temp_to_node(uint32_t temp) ++temp_to_node(struct v3d_compile *c, uint32_t temp) + { +- return temp + ACC_COUNT; ++ return temp + (c->devinfo->has_accumulators ? ACC_COUNT : ++ IMPLICIT_RF_COUNT); + } + + static inline uint32_t +-node_to_temp(uint32_t node) ++node_to_temp(struct v3d_compile *c, uint32_t node) + { +- assert(node >= ACC_COUNT); +- return node - ACC_COUNT; ++ assert((c->devinfo->has_accumulators && node >= ACC_COUNT) || ++ (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT)); ++ return node - (c->devinfo->has_accumulators ? 
ACC_COUNT : ++ IMPLICIT_RF_COUNT); + } + + static inline uint8_t +-get_temp_class_bits(struct v3d_ra_node_info *nodes, ++get_temp_class_bits(struct v3d_compile *c, + uint32_t temp) + { +- return nodes->info[temp_to_node(temp)].class_bits; ++ return c->nodes.info[temp_to_node(c, temp)].class_bits; + } + + static inline void +-set_temp_class_bits(struct v3d_ra_node_info *nodes, ++set_temp_class_bits(struct v3d_compile *c, + uint32_t temp, uint8_t class_bits) + { +- nodes->info[temp_to_node(temp)].class_bits = class_bits; ++ c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits; + } + + static struct ra_class * +@@ -71,11 +103,13 @@ choose_reg_class(struct v3d_compile *c, uint8_t class_bits) + if (class_bits == CLASS_BITS_PHYS) { + return c->compiler->reg_class_phys[c->thread_index]; + } else if (class_bits == (CLASS_BITS_R5)) { ++ assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_r5[c->thread_index]; + } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) { ++ assert(c->devinfo->has_accumulators); + return c->compiler->reg_class_phys_or_acc[c->thread_index]; + } else { +- assert(class_bits == CLASS_BITS_ANY); ++ assert(class_bits == get_class_bit_any(c->devinfo)); + return c->compiler->reg_class_any[c->thread_index]; + } + } +@@ -84,7 +118,7 @@ static inline struct ra_class * + choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp) + { + assert(temp < c->num_temps && temp < c->nodes.alloc_count); +- return choose_reg_class(c, get_temp_class_bits(&c->nodes, temp)); ++ return choose_reg_class(c, get_temp_class_bits(c, temp)); + } + + static inline bool +@@ -313,7 +347,7 @@ v3d_choose_spill_node(struct v3d_compile *c) + + for (unsigned i = 0; i < c->num_temps; i++) { + if (BITSET_TEST(c->spillable, i)) { +- ra_set_node_spill_cost(c->g, temp_to_node(i), ++ ra_set_node_spill_cost(c->g, temp_to_node(c, i), + spill_costs[i]); + } + } +@@ -331,7 +365,8 @@ ensure_nodes(struct v3d_compile *c) + c->nodes.info = reralloc_array_size(c, + c->nodes.info, + sizeof(c->nodes.info[0]), +- c->nodes.alloc_count + ACC_COUNT); ++ c->nodes.alloc_count + ++ MAX2(ACC_COUNT, IMPLICIT_RF_COUNT)); + } + + /* Creates the interference node for a new temp. We use this to keep the node +@@ -343,11 +378,15 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits) + ensure_nodes(c); + + int node = ra_add_node(c->g, choose_reg_class(c, class_bits)); +- assert(node == temp + ACC_COUNT); ++ assert(c->devinfo->has_accumulators ? 
node == temp + ACC_COUNT : ++ node == temp + IMPLICIT_RF_COUNT); + + /* We fill the node priority after we are done inserting spills */ + c->nodes.info[node].class_bits = class_bits; + c->nodes.info[node].priority = 0; ++ c->nodes.info[node].is_ldunif_dst = false; ++ c->nodes.info[node].is_program_end = false; ++ c->nodes.info[node].unused = false; + } + + /* The spill offset for this thread takes a bit of setup, so do it once at +@@ -395,8 +434,10 @@ v3d_setup_spill_base(struct v3d_compile *c) + */ + if (c->spilling) { + int temp_class = CLASS_BITS_PHYS; +- if (i != c->spill_base.index) ++ if (c->devinfo->has_accumulators && ++ i != c->spill_base.index) { + temp_class |= CLASS_BITS_ACC; ++ } + add_node(c, i, temp_class); + } + } +@@ -436,7 +477,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c, + */ + assert(c->disable_ldunif_opt); + struct qreg offset = vir_uniform_ui(c, spill_offset); +- add_node(c, offset.index, CLASS_BITS_ANY); ++ add_node(c, offset.index, get_class_bit_any(c->devinfo)); + + /* We always enable per-quad on spills/fills to ensure we spill + * any channels involved with helper invocations. +@@ -455,14 +496,16 @@ v3d_emit_spill_tmua(struct v3d_compile *c, + * temp will be used immediately so just like the uniform above we + * can allow accumulators. + */ ++ int temp_class = ++ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC); + if (!fill_dst) { + struct qreg dst = vir_TMUWT(c); + assert(dst.file == QFILE_TEMP); +- add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC); ++ add_node(c, dst.index, temp_class); + } else { + *fill_dst = vir_LDTMU(c); + assert(fill_dst->file == QFILE_TEMP); +- add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC); ++ add_node(c, fill_dst->index, temp_class); + } + + /* Temps across the thread switch we injected can't be assigned to +@@ -482,7 +525,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c, + c->temp_start[i] < ip && c->temp_end[i] >= ip : + c->temp_start[i] <= ip && c->temp_end[i] > ip; + if (thrsw_cross) { +- ra_set_node_class(c->g, temp_to_node(i), ++ ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class(c, CLASS_BITS_PHYS)); + } + } +@@ -509,8 +552,7 @@ v3d_emit_tmu_spill(struct v3d_compile *c, + * same register class bits as the original. + */ + if (inst == position) { +- uint8_t class_bits = get_temp_class_bits(&c->nodes, +- inst->dst.index); ++ uint8_t class_bits = get_temp_class_bits(c, inst->dst.index); + inst->dst = vir_get_temp(c); + add_node(c, inst->dst.index, class_bits); + } else { +@@ -542,7 +584,8 @@ interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end) + } + + static void +-v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) ++v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes, ++ int spill_temp) + { + c->spill_start_num_temps = c->num_temps; + c->spilling = true; +@@ -554,8 +597,20 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + spill_offset = c->spill_size; + c->spill_size += V3D_CHANNELS * sizeof(uint32_t); + +- if (spill_offset == 0) ++ if (spill_offset == 0) { + v3d_setup_spill_base(c); ++ ++ /* Don't allocate our spill base to rf0 to avoid ++ * conflicts with instructions doing implicit writes ++ * to that register. 
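/*
 * Editorial aside, not part of the patch: a recap of the register
 * allocator node layout behind temp_to_node()/node_to_temp() above. A
 * few fixed nodes come first (six accumulator nodes on hardware with
 * accumulators, a single implicit-rf0 node on V3D 7.1), followed by one
 * node per temp, so the conversion is a constant offset. The toy_*
 * names are illustrative only.
 */
#include <stdbool.h>
#include <stdint.h>

#define TOY_ACC_COUNT          6   /* fixed nodes for r0-r5, V3D <= 4.2 */
#define TOY_IMPLICIT_RF_COUNT  1   /* fixed node for rf0, V3D 7.1 */

static uint32_t
toy_first_temp_node(bool has_accumulators)
{
        return has_accumulators ? TOY_ACC_COUNT : TOY_IMPLICIT_RF_COUNT;
}

static uint32_t
toy_temp_to_node(bool has_accumulators, uint32_t temp)
{
        return temp + toy_first_temp_node(has_accumulators);
}

static uint32_t
toy_node_to_temp(bool has_accumulators, uint32_t node)
{
        return node - toy_first_temp_node(has_accumulators);
}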
++ */ ++ if (!c->devinfo->has_accumulators) { ++ ra_add_node_interference( ++ c->g, ++ temp_to_node(c, c->spill_base.index), ++ implicit_rf_nodes[0]); ++ } ++ } + } + + struct qinst *last_thrsw = c->last_thrsw; +@@ -574,7 +629,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + reconstruct_op = orig_def->qpu.alu.add.op; + } + +- uint32_t spill_node = temp_to_node(spill_temp); ++ uint32_t spill_node = temp_to_node(c, spill_temp); + + /* We must disable the ldunif optimization if we are spilling uniforms */ + bool had_disable_ldunif_opt = c->disable_ldunif_opt; +@@ -635,7 +690,8 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + * instruction immediately after, so + * we can use any register class for it. + */ +- add_node(c, unif.index, CLASS_BITS_ANY); ++ add_node(c, unif.index, ++ get_class_bit_any(c->devinfo)); + } else if (spill_type == SPILL_TYPE_RECONSTRUCT) { + struct qreg temp = + reconstruct_temp(c, reconstruct_op); +@@ -644,8 +700,10 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + * instruction immediately after so we + * can use ACC. + */ +- add_node(c, temp.index, CLASS_BITS_PHYS | +- CLASS_BITS_ACC); ++ int temp_class = ++ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | ++ CLASS_BITS_ACC); ++ add_node(c, temp.index, temp_class); + } else { + /* If we have a postponed spill, we + * don't need a fill as the temp would +@@ -739,12 +797,12 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + * update node priorities based one new liveness data. + */ + uint32_t sb_temp =c->spill_base.index; +- uint32_t sb_node = temp_to_node(sb_temp); ++ uint32_t sb_node = temp_to_node(c, sb_temp); + for (uint32_t i = 0; i < c->num_temps; i++) { + if (c->temp_end[i] == -1) + continue; + +- uint32_t node_i = temp_to_node(i); ++ uint32_t node_i = temp_to_node(c, i); + c->nodes.info[node_i].priority = + c->temp_end[i] - c->temp_start[i]; + +@@ -752,7 +810,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + j < c->num_temps; j++) { + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { +- uint32_t node_j = temp_to_node(j); ++ uint32_t node_j = temp_to_node(c, j); + ra_add_node_interference(c->g, node_i, node_j); + } + } +@@ -771,9 +829,11 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) + } + + struct v3d_ra_select_callback_data { ++ uint32_t phys_index; + uint32_t next_acc; + uint32_t next_phys; + struct v3d_ra_node_info *nodes; ++ const struct v3d_device_info *devinfo; + }; + + /* Choosing accumulators improves chances of merging QPU instructions +@@ -785,6 +845,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, + BITSET_WORD *regs, + int priority) + { ++ if (!v3d_ra->devinfo->has_accumulators) ++ return false; ++ + /* Favor accumulators if we have less that this number of physical + * registers. 
Accumulators have more restrictions (like being + * invalidated through thrsw), so running out of physical registers +@@ -794,7 +857,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, + static const int available_rf_threshold = 5; + int available_rf = 0 ; + for (int i = 0; i < PHYS_COUNT; i++) { +- if (BITSET_TEST(regs, PHYS_INDEX + i)) ++ if (BITSET_TEST(regs, v3d_ra->phys_index + i)) + available_rf++; + if (available_rf >= available_rf_threshold) + break; +@@ -820,6 +883,9 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, + BITSET_WORD *regs, + unsigned int *out) + { ++ if (!v3d_ra->devinfo->has_accumulators) ++ return false; ++ + /* Choose r5 for our ldunifs if possible (nobody else can load to that + * reg, and it keeps the QPU cond field free from being occupied by + * ldunifrf). +@@ -849,12 +915,47 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, + + static bool + v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, ++ unsigned int node, + BITSET_WORD *regs, + unsigned int *out) + { ++ /* If this node is for an unused temp, ignore. */ ++ if (v3d_ra->nodes->info[node].unused) { ++ *out = 0; ++ return true; ++ } ++ ++ /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst ++ * so we can avoid turning them into ldunifrf (which uses the ++ * cond field to encode the dst and would prevent merge with ++ * instructions that use cond flags). ++ */ ++ if (v3d_ra->nodes->info[node].is_ldunif_dst && ++ BITSET_TEST(regs, v3d_ra->phys_index)) { ++ assert(v3d_ra->devinfo->ver >= 71); ++ *out = v3d_ra->phys_index; ++ return true; ++ } ++ ++ /* The last 3 instructions in a shader can't use some specific registers ++ * (usually early rf registers, depends on v3d version) so try to ++ * avoid allocating these to registers used by the last instructions ++ * in the shader. ++ */ ++ const uint32_t safe_rf_start = v3d_ra->devinfo->ver <= 42 ? 3 : 4; ++ if (v3d_ra->nodes->info[node].is_program_end && ++ v3d_ra->next_phys < safe_rf_start) { ++ v3d_ra->next_phys = safe_rf_start; ++ } ++ + for (int i = 0; i < PHYS_COUNT; i++) { + int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; +- int phys = PHYS_INDEX + phys_off; ++ ++ /* Try to keep rf0 available for ldunif in 7.x (see above). */ ++ if (v3d_ra->devinfo->ver >= 71 && phys_off == 0) ++ continue; ++ ++ int phys = v3d_ra->phys_index + phys_off; + + if (BITSET_TEST(regs, phys)) { + v3d_ra->next_phys = phys_off + 1; +@@ -863,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, + } + } + ++ /* If we couldn't allocate, do try to assign rf0 if it is available. */ ++ if (v3d_ra->devinfo->ver >= 71 && ++ BITSET_TEST(regs, v3d_ra->phys_index)) { ++ v3d_ra->next_phys = 1; ++ *out = v3d_ra->phys_index; ++ return true; ++ } ++ + return false; + } + +@@ -877,7 +986,7 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) + return reg; + } + +- if (v3d_ra_select_rf(v3d_ra, regs, ®)) ++ if (v3d_ra_select_rf(v3d_ra, n, regs, ®)) + return reg; + + /* If we ran out of physical registers try to assign an accumulator +@@ -896,8 +1005,9 @@ vir_init_reg_sets(struct v3d_compiler *compiler) + * register file can be divided up for fragment shader threading. + */ + int max_thread_index = (compiler->devinfo->ver >= 40 ? 
2 : 3); ++ uint8_t phys_index = get_phys_index(compiler->devinfo); + +- compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, ++ compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT, + false); + if (!compiler->regs) + return false; +@@ -905,31 +1015,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler) + for (int threads = 0; threads < max_thread_index; threads++) { + compiler->reg_class_any[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); +- compiler->reg_class_r5[threads] = +- ra_alloc_contig_reg_class(compiler->regs, 1); +- compiler->reg_class_phys_or_acc[threads] = +- ra_alloc_contig_reg_class(compiler->regs, 1); ++ if (compiler->devinfo->has_accumulators) { ++ compiler->reg_class_r5[threads] = ++ ra_alloc_contig_reg_class(compiler->regs, 1); ++ compiler->reg_class_phys_or_acc[threads] = ++ ra_alloc_contig_reg_class(compiler->regs, 1); ++ } + compiler->reg_class_phys[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + +- for (int i = PHYS_INDEX; +- i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) { +- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); ++ /* Init physical regs */ ++ for (int i = phys_index; ++ i < phys_index + (PHYS_COUNT >> threads); i++) { ++ if (compiler->devinfo->has_accumulators) ++ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->reg_class_phys[threads], i); + ra_class_add_reg(compiler->reg_class_any[threads], i); + } + +- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { +- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); +- ra_class_add_reg(compiler->reg_class_any[threads], i); ++ /* Init accumulator regs */ ++ if (compiler->devinfo->has_accumulators) { ++ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { ++ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); ++ ra_class_add_reg(compiler->reg_class_any[threads], i); ++ } ++ /* r5 can only store a single 32-bit value, so not much can ++ * use it. ++ */ ++ ra_class_add_reg(compiler->reg_class_r5[threads], ++ ACC_INDEX + 5); ++ ra_class_add_reg(compiler->reg_class_any[threads], ++ ACC_INDEX + 5); + } +- /* r5 can only store a single 32-bit value, so not much can +- * use it. +- */ +- ra_class_add_reg(compiler->reg_class_r5[threads], +- ACC_INDEX + 5); +- ra_class_add_reg(compiler->reg_class_any[threads], +- ACC_INDEX + 5); + } + + ra_set_finalize(compiler->regs, NULL); +@@ -944,7 +1061,10 @@ tmu_spilling_allowed(struct v3d_compile *c) + } + + static void +-update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, ++update_graph_and_reg_classes_for_inst(struct v3d_compile *c, ++ int *acc_nodes, ++ int *implicit_rf_nodes, ++ int last_ldvary_ip, + struct qinst *inst) + { + int32_t ip = inst->ip; +@@ -954,26 +1074,39 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * result to a temp), nothing else can be stored in r3/r4 across + * it. 
+ */ +- if (vir_writes_r3(c->devinfo, inst)) { ++ if (vir_writes_r3_implicitly(c->devinfo, inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, +- temp_to_node(i), ++ temp_to_node(c, i), + acc_nodes[3]); + } + } + } + +- if (vir_writes_r4(c->devinfo, inst)) { ++ if (vir_writes_r4_implicitly(c->devinfo, inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, +- temp_to_node(i), ++ temp_to_node(c, i), + acc_nodes[4]); + } + } + } + ++ /* If any instruction writes to a physical register implicitly ++ * nothing else can write the same register across it. ++ */ ++ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) { ++ for (int i = 0; i < c->num_temps; i++) { ++ if (c->temp_start[i] < ip && c->temp_end[i] > ip) { ++ ra_add_node_interference(c->g, ++ temp_to_node(c, i), ++ implicit_rf_nodes[0]); ++ } ++ } ++ } ++ + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_LDVPMV_IN: +@@ -987,7 +1120,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * decides whether the LDVPM is in or out) + */ + assert(inst->dst.file == QFILE_TEMP); +- set_temp_class_bits(&c->nodes, inst->dst.index, ++ set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } +@@ -1002,7 +1135,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * phys regfile. + */ + assert(inst->dst.file == QFILE_TEMP); +- set_temp_class_bits(&c->nodes, inst->dst.index, ++ set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_PHYS); + break; + } +@@ -1015,6 +1148,11 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + if (inst->src[0].file == QFILE_REG) { + switch (inst->src[0].index) { + case 0: ++ /* V3D 7.x doesn't use rf0 for thread payload */ ++ if (c->devinfo->ver >= 71) ++ break; ++ else ++ FALLTHROUGH; + case 1: + case 2: + case 3: { +@@ -1024,14 +1162,34 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + */ + assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); + assert(inst->dst.file == QFILE_TEMP); +- uint32_t node = temp_to_node(inst->dst.index); ++ uint32_t node = temp_to_node(c, inst->dst.index); + ra_set_node_reg(c->g, node, +- PHYS_INDEX + inst->src[0].index); ++ get_phys_index(c->devinfo) + ++ inst->src[0].index); + break; + } + } + } + ++ /* Don't allocate rf0 to temps that cross ranges where we have ++ * live implicit rf0 writes from ldvary. We can identify these ++ * by tracking the last ldvary instruction and explicit reads ++ * of rf0. ++ */ ++ if (c->devinfo->ver >= 71 && ++ ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) || ++ (vir_get_nsrc(inst) > 1 && ++ inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) { ++ for (int i = 0; i < c->num_temps; i++) { ++ if (c->temp_start[i] < ip && ++ c->temp_end[i] > last_ldvary_ip) { ++ ra_add_node_interference(c->g, ++ temp_to_node(c, i), ++ implicit_rf_nodes[0]); ++ } ++ } ++ } ++ + if (inst->dst.file == QFILE_TEMP) { + /* Only a ldunif gets to write to R5, which only has a + * single 32-bit channel of storage. +@@ -1041,36 +1199,95 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, + * because ldunif has usually a shorter lifespan, allowing for + * more accumulator reuse and QPU merges. 
+ */ +- if (!inst->qpu.sig.ldunif) { +- uint8_t class_bits = +- get_temp_class_bits(&c->nodes, inst->dst.index) & +- ~CLASS_BITS_R5; +- set_temp_class_bits(&c->nodes, inst->dst.index, +- class_bits); +- ++ if (c->devinfo->has_accumulators) { ++ if (!inst->qpu.sig.ldunif) { ++ uint8_t class_bits = ++ get_temp_class_bits(c, inst->dst.index) & ++ ~CLASS_BITS_R5; ++ set_temp_class_bits(c, inst->dst.index, ++ class_bits); ++ ++ } else { ++ /* Until V3D 4.x, we could only load a uniform ++ * to r5, so we'll need to spill if uniform ++ * loads interfere with each other. ++ */ ++ if (c->devinfo->ver < 40) { ++ set_temp_class_bits(c, inst->dst.index, ++ CLASS_BITS_R5); ++ } ++ } + } else { +- /* Until V3D 4.x, we could only load a uniform +- * to r5, so we'll need to spill if uniform +- * loads interfere with each other. ++ /* Make sure we don't allocate the ldvary's ++ * destination to rf0, since it would clash ++ * with its implicit write to that register. ++ */ ++ if (inst->qpu.sig.ldvary) { ++ ra_add_node_interference(c->g, ++ temp_to_node(c, inst->dst.index), ++ implicit_rf_nodes[0]); ++ } ++ /* Flag dst temps from ldunif(a) instructions ++ * so we can try to assign rf0 to them and avoid ++ * converting these to ldunif(a)rf. + */ +- if (c->devinfo->ver < 40) { +- set_temp_class_bits(&c->nodes, inst->dst.index, +- CLASS_BITS_R5); ++ if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) { ++ const uint32_t dst_n = ++ temp_to_node(c, inst->dst.index); ++ c->nodes.info[dst_n].is_ldunif_dst = true; + } + } + } + + /* All accumulators are invalidated across a thread switch. */ +- if (inst->qpu.sig.thrsw) { ++ if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { +- set_temp_class_bits(&c->nodes, i, ++ set_temp_class_bits(c, i, + CLASS_BITS_PHYS); + } + } + } + } + ++static void ++flag_program_end_nodes(struct v3d_compile *c) ++{ ++ /* Only look for registers used in this many instructions */ ++ uint32_t last_set_count = 6; ++ ++ struct qblock *last_block = vir_exit_block(c); ++ list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) { ++ if (!inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) ++ continue; ++ ++ int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op); ++ for (int i = 0; i < num_src; i++) { ++ if (inst->src[i].file == QFILE_TEMP) { ++ int node = temp_to_node(c, inst->src[i].index); ++ c->nodes.info[node].is_program_end = true; ++ } ++ } ++ ++ num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op); ++ for (int i = 0; i < num_src; i++) { ++ if (inst->src[i].file == QFILE_TEMP) { ++ int node = temp_to_node(c, inst->src[i].index); ++ c->nodes.info[node].is_program_end = true; ++ ++ } ++ } ++ ++ if (inst->dst.file == QFILE_TEMP) { ++ int node = temp_to_node(c, inst->dst.index); ++ c->nodes.info[node].is_program_end = true; ++ } ++ ++ if (--last_set_count == 0) ++ break; ++ } ++} ++ + /** + * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. 
+ * +@@ -1080,19 +1297,32 @@ struct qpu_reg * + v3d_register_allocate(struct v3d_compile *c) + { + int acc_nodes[ACC_COUNT]; ++ int implicit_rf_nodes[IMPLICIT_RF_COUNT]; ++ ++ unsigned num_ra_nodes = c->num_temps; ++ if (c->devinfo->has_accumulators) ++ num_ra_nodes += ARRAY_SIZE(acc_nodes); ++ else ++ num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes); ++ + c->nodes = (struct v3d_ra_node_info) { + .alloc_count = c->num_temps, + .info = ralloc_array_size(c, sizeof(c->nodes.info[0]), +- c->num_temps + ACC_COUNT), ++ num_ra_nodes), + }; + ++ uint32_t phys_index = get_phys_index(c->devinfo); ++ + struct v3d_ra_select_callback_data callback_data = { ++ .phys_index = phys_index, + .next_acc = 0, + /* Start at RF3, to try to keep the TLB writes from using +- * RF0-2. ++ * RF0-2. Start at RF4 in 7.x to prevent TLB writes from ++ * using RF2-3. + */ +- .next_phys = 3, ++ .next_phys = c->devinfo->ver <= 42 ? 3 : 4, + .nodes = &c->nodes, ++ .devinfo = c->devinfo, + }; + + vir_calculate_live_intervals(c); +@@ -1108,27 +1338,35 @@ v3d_register_allocate(struct v3d_compile *c) + c->thread_index--; + } + +- c->g = ra_alloc_interference_graph(c->compiler->regs, +- c->num_temps + ARRAY_SIZE(acc_nodes)); ++ c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes); + ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data); + + /* Make some fixed nodes for the accumulators, which we will need to + * interfere with when ops have implied r3/r4 writes or for the thread + * switches. We could represent these as classes for the nodes to + * live in, but the classes take up a lot of memory to set up, so we +- * don't want to make too many. ++ * don't want to make too many. We use the same mechanism on platforms ++ * without accumulators that can have implicit writes to phys regs. + */ +- for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) { +- if (i < ACC_COUNT) { ++ for (uint32_t i = 0; i < num_ra_nodes; i++) { ++ c->nodes.info[i].is_ldunif_dst = false; ++ c->nodes.info[i].is_program_end = false; ++ c->nodes.info[i].unused = false; ++ c->nodes.info[i].priority = 0; ++ c->nodes.info[i].class_bits = 0; ++ if (c->devinfo->has_accumulators && i < ACC_COUNT) { + acc_nodes[i] = i; + ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i); +- c->nodes.info[i].priority = 0; +- c->nodes.info[i].class_bits = 0; ++ } else if (!c->devinfo->has_accumulators && ++ i < ARRAY_SIZE(implicit_rf_nodes)) { ++ implicit_rf_nodes[i] = i; ++ ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i); + } else { +- uint32_t t = node_to_temp(i); ++ uint32_t t = node_to_temp(c, i); + c->nodes.info[i].priority = + c->temp_end[t] - c->temp_start[t]; +- c->nodes.info[i].class_bits = CLASS_BITS_ANY; ++ c->nodes.info[i].class_bits = ++ get_class_bit_any(c->devinfo); + } + } + +@@ -1136,25 +1374,61 @@ v3d_register_allocate(struct v3d_compile *c) + * interferences. + */ + int ip = 0; ++ int last_ldvary_ip = -1; + vir_for_each_inst_inorder(inst, c) { + inst->ip = ip++; +- update_graph_and_reg_classes_for_inst(c, acc_nodes, inst); ++ ++ /* ldunif(a) always write to a temporary, so we have ++ * liveness info available to decide if rf0 is ++ * available for them, however, ldvary is different: ++ * it always writes to rf0 directly so we don't have ++ * liveness information for its implicit rf0 write. ++ * ++ * That means the allocator may assign rf0 to a temp ++ * that is defined while an implicit rf0 write from ++ * ldvary is still live. We fix that by manually ++ * tracking rf0 live ranges from ldvary instructions. 
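/*
 * Editorial aside, not part of the patch: the rf0 tracking described in
 * the comment above, in condensed form. ldvary writes rf0 without going
 * through a temp, so there is no liveness information for that write;
 * the allocator therefore keeps any temp that is live between the last
 * ldvary and the instruction that finally reads rf0 away from rf0. The
 * toy_* helper is illustrative; the real code adds an interference edge
 * against the fixed rf0 node instead of returning a flag.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
toy_temp_conflicts_with_ldvary_rf0(int32_t temp_start, int32_t temp_end,
                                   int32_t last_ldvary_ip, int32_t rf0_read_ip)
{
        /* Overlap between the temp's live range and the implicit rf0
         * range (last_ldvary_ip, rf0_read_ip). */
        return temp_start < rf0_read_ip && temp_end > last_ldvary_ip;
}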
++ */ ++ if (inst->qpu.sig.ldvary) ++ last_ldvary_ip = ip; ++ ++ update_graph_and_reg_classes_for_inst(c, acc_nodes, ++ implicit_rf_nodes, ++ last_ldvary_ip, inst); + } + ++ /* Flag the nodes that are used in the last instructions of the program ++ * (there are some registers that cannot be used in the last 3 ++ * instructions). We only do this for fragment shaders, because the idea ++ * is that by avoiding this conflict we may be able to emit the last ++ * thread switch earlier in some cases, however, in non-fragment shaders ++ * this won't happen because the last instructions are always VPM stores ++ * with a small immediate, which conflicts with other signals, ++ * preventing us from ever moving the thrsw earlier. ++ */ ++ if (c->s->info.stage == MESA_SHADER_FRAGMENT) ++ flag_program_end_nodes(c); ++ + /* Set the register classes for all our temporaries in the graph */ + for (uint32_t i = 0; i < c->num_temps; i++) { +- ra_set_node_class(c->g, temp_to_node(i), ++ ra_set_node_class(c->g, temp_to_node(c, i), + choose_reg_class_for_temp(c, i)); + } + + /* Add register interferences based on liveness data */ + for (uint32_t i = 0; i < c->num_temps; i++) { ++ /* And while we are here, let's also flag nodes for ++ * unused temps. ++ */ ++ if (c->temp_start[i] > c->temp_end[i]) ++ c->nodes.info[temp_to_node(c, i)].unused = true; ++ + for (uint32_t j = i + 1; j < c->num_temps; j++) { + if (interferes(c->temp_start[i], c->temp_end[i], + c->temp_start[j], c->temp_end[j])) { + ra_add_node_interference(c->g, +- temp_to_node(i), +- temp_to_node(j)); ++ temp_to_node(c, i), ++ temp_to_node(c, j)); + } + } + } +@@ -1171,9 +1445,9 @@ v3d_register_allocate(struct v3d_compile *c) + if (c->spill_size < + V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { + int node = v3d_choose_spill_node(c); +- uint32_t temp = node_to_temp(node); ++ uint32_t temp = node_to_temp(c, node); + if (node != -1) { +- v3d_spill_reg(c, acc_nodes, temp); ++ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + continue; + } + } +@@ -1186,11 +1460,11 @@ v3d_register_allocate(struct v3d_compile *c) + if (node == -1) + goto spill_fail; + +- uint32_t temp = node_to_temp(node); ++ uint32_t temp = node_to_temp(c, node); + enum temp_spill_type spill_type = + get_spill_type_for_temp(c, temp); + if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) { +- v3d_spill_reg(c, acc_nodes, temp); ++ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); + if (c->spills + c->fills > c->max_tmu_spills) + goto spill_fail; + } else { +@@ -1201,14 +1475,14 @@ v3d_register_allocate(struct v3d_compile *c) + /* Allocation was successful, build the 'temp -> reg' map */ + temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); + for (uint32_t i = 0; i < c->num_temps; i++) { +- int ra_reg = ra_get_node_reg(c->g, temp_to_node(i)); +- if (ra_reg < PHYS_INDEX) { ++ int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i)); ++ if (ra_reg < phys_index) { + temp_registers[i].magic = true; + temp_registers[i].index = (V3D_QPU_WADDR_R0 + + ra_reg - ACC_INDEX); + } else { + temp_registers[i].magic = false; +- temp_registers[i].index = ra_reg - PHYS_INDEX; ++ temp_registers[i].index = ra_reg - phys_index; + } + } + +diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c +index 45e6bfa1470c..4ed184cbbcb7 100644 +--- a/src/broadcom/compiler/vir_to_qpu.c ++++ b/src/broadcom/compiler/vir_to_qpu.c +@@ -86,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst) + return q; + } + ++static void ++v3d71_set_src(struct v3d_qpu_instr 
*instr, uint8_t *raddr, struct qpu_reg src) ++{ ++ /* If we have a small immediate move it from inst->raddr_b to the ++ * corresponding raddr. ++ */ ++ if (src.smimm) { ++ assert(instr->sig.small_imm_a || instr->sig.small_imm_b || ++ instr->sig.small_imm_c || instr->sig.small_imm_d); ++ *raddr = instr->raddr_b; ++ return; ++ } ++ ++ assert(!src.magic); ++ *raddr = src.index; ++} ++ + /** + * Allocates the src register (accumulator or register file) into the RADDR + * fields of the instruction. + */ + static void +-set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) ++v3d33_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) + { + if (src.smimm) { +- assert(instr->sig.small_imm); ++ assert(instr->sig.small_imm_b); + *mux = V3D_QPU_MUX_B; + return; + } +@@ -106,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) + return; + } + +- if (instr->alu.add.a != V3D_QPU_MUX_A && +- instr->alu.add.b != V3D_QPU_MUX_A && +- instr->alu.mul.a != V3D_QPU_MUX_A && +- instr->alu.mul.b != V3D_QPU_MUX_A) { ++ if (instr->alu.add.a.mux != V3D_QPU_MUX_A && ++ instr->alu.add.b.mux != V3D_QPU_MUX_A && ++ instr->alu.mul.a.mux != V3D_QPU_MUX_A && ++ instr->alu.mul.b.mux != V3D_QPU_MUX_A) { + instr->raddr_a = src.index; + *mux = V3D_QPU_MUX_A; + } else { + if (instr->raddr_a == src.index) { + *mux = V3D_QPU_MUX_A; + } else { +- assert(!(instr->alu.add.a == V3D_QPU_MUX_B && +- instr->alu.add.b == V3D_QPU_MUX_B && +- instr->alu.mul.a == V3D_QPU_MUX_B && +- instr->alu.mul.b == V3D_QPU_MUX_B) || ++ assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B && ++ instr->alu.add.b.mux == V3D_QPU_MUX_B && ++ instr->alu.mul.a.mux == V3D_QPU_MUX_B && ++ instr->alu.mul.b.mux == V3D_QPU_MUX_B) || + src.index == instr->raddr_b); + + instr->raddr_b = src.index; +@@ -128,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) + } + } + +-static bool +-is_no_op_mov(struct qinst *qinst) ++/* ++ * The main purpose of the following wrapper is to make calling set_src ++ * cleaner. This is the reason it receives both mux and raddr pointers. Those ++ * will be filled or not based on the device version. ++ */ ++static void ++set_src(struct v3d_qpu_instr *instr, ++ enum v3d_qpu_mux *mux, ++ uint8_t *raddr, ++ struct qpu_reg src, ++ const struct v3d_device_info *devinfo) + { +- static const struct v3d_qpu_sig no_sig = {0}; +- +- /* Make sure it's just a lone MOV. */ +- if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || +- qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || +- qinst->qpu.alu.add.op != V3D_QPU_A_NOP || +- memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { +- return false; +- } ++ if (devinfo->ver < 71) ++ return v3d33_set_src(instr, mux, src); ++ else ++ return v3d71_set_src(instr, raddr, src); ++} + +- /* Check if it's a MOV from a register to itself. 
*/
++static bool
++v3d33_mov_src_and_dst_equal(struct qinst *qinst)
++{
+ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
+ if (qinst->qpu.alu.mul.magic_write) {
+ if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
+ return false;
+
+- if (qinst->qpu.alu.mul.a !=
++ if (qinst->qpu.alu.mul.a.mux !=
+ V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
+ return false;
+ }
+ } else {
+ int raddr;
+
+- switch (qinst->qpu.alu.mul.a) {
++ switch (qinst->qpu.alu.mul.a.mux) {
+ case V3D_QPU_MUX_A:
+ raddr = qinst->qpu.raddr_a;
+ break;
+@@ -168,10 +192,61 @@ is_no_op_mov(struct qinst *qinst)
+ return false;
+ }
+
++ return true;
++}
++
++static bool
++v3d71_mov_src_and_dst_equal(struct qinst *qinst)
++{
++ if (qinst->qpu.alu.mul.magic_write)
++ return false;
++
++ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
++ int raddr;
++
++ raddr = qinst->qpu.alu.mul.a.raddr;
++ if (raddr != waddr)
++ return false;
++
++ return true;
++}
++
++static bool
++mov_src_and_dst_equal(struct qinst *qinst,
++ const struct v3d_device_info *devinfo)
++{
++ if (devinfo->ver < 71)
++ return v3d33_mov_src_and_dst_equal(qinst);
++ else
++ return v3d71_mov_src_and_dst_equal(qinst);
++}
++
++
++static bool
++is_no_op_mov(struct qinst *qinst,
++ const struct v3d_device_info *devinfo)
++{
++ static const struct v3d_qpu_sig no_sig = {0};
++
++ /* Make sure it's just a lone MOV. We only check for M_MOV. Although
++ * for V3D 7.x there is also A_MOV, we don't need to check for it as
++ * we always emit using M_MOV. We could use A_MOV later in the
++ * schedule to improve performance.
++ */
++ if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
++ qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
++ qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
++ memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
++ return false;
++ }
++
++ if (!mov_src_and_dst_equal(qinst, devinfo))
++ return false;
++
+ /* No packing or flags updates, or we need to execute the
+ * instruction.
+ */ +- if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || ++ if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE || + qinst->qpu.flags.mc != V3D_QPU_COND_NONE || + qinst->qpu.flags.mpf != V3D_QPU_PF_NONE || +@@ -277,8 +352,15 @@ v3d_generate_code_block(struct v3d_compile *c, + assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP); + assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); + +- if (!dst.magic || +- dst.index != V3D_QPU_WADDR_R5) { ++ bool use_rf; ++ if (c->devinfo->has_accumulators) { ++ use_rf = !dst.magic || ++ dst.index != V3D_QPU_WADDR_R5; ++ } else { ++ use_rf = dst.magic || dst.index != 0; ++ } ++ ++ if (use_rf) { + assert(c->devinfo->ver >= 40); + + if (qinst->qpu.sig.ldunif) { +@@ -300,13 +382,18 @@ v3d_generate_code_block(struct v3d_compile *c, + qinst->qpu.sig_magic = dst.magic; + } else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) { + assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); ++ + if (nsrc >= 1) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.add.a, src[0]); ++ &qinst->qpu.alu.add.a.mux, ++ &qinst->qpu.alu.add.a.raddr, ++ src[0], c->devinfo); + } + if (nsrc >= 2) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.add.b, src[1]); ++ &qinst->qpu.alu.add.b.mux, ++ &qinst->qpu.alu.add.b.raddr, ++ src[1], c->devinfo); + } + + qinst->qpu.alu.add.waddr = dst.index; +@@ -314,17 +401,21 @@ v3d_generate_code_block(struct v3d_compile *c, + } else { + if (nsrc >= 1) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.mul.a, src[0]); ++ &qinst->qpu.alu.mul.a.mux, ++ &qinst->qpu.alu.mul.a.raddr, ++ src[0], c->devinfo); + } + if (nsrc >= 2) { + set_src(&qinst->qpu, +- &qinst->qpu.alu.mul.b, src[1]); ++ &qinst->qpu.alu.mul.b.mux, ++ &qinst->qpu.alu.mul.b.raddr, ++ src[1], c->devinfo); + } + + qinst->qpu.alu.mul.waddr = dst.index; + qinst->qpu.alu.mul.magic_write = dst.magic; + +- if (is_no_op_mov(qinst)) { ++ if (is_no_op_mov(qinst, c->devinfo)) { + vir_remove_instruction(c, qinst); + continue; + } +diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build +index 2c10e46b1882..73cb7aa05756 100644 +--- a/src/broadcom/meson.build ++++ b/src/broadcom/meson.build +@@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle') + + subdir('cle') + +-v3d_versions = ['33', '41', '42'] ++v3d_versions = ['33', '41', '42', '71'] + v3d_libs = [] + + if with_gallium_v3d or with_broadcom_vk +diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c +index 28fb2357b971..c1590a760de5 100644 +--- a/src/broadcom/qpu/qpu_disasm.c ++++ b/src/broadcom/qpu/qpu_disasm.c +@@ -56,13 +56,14 @@ pad_to(struct disasm_state *disasm, int n) + + + static void +-v3d_qpu_disasm_raddr(struct disasm_state *disasm, +- const struct v3d_qpu_instr *instr, uint8_t mux) ++v3d33_qpu_disasm_raddr(struct disasm_state *disasm, ++ const struct v3d_qpu_instr *instr, ++ enum v3d_qpu_mux mux) + { + if (mux == V3D_QPU_MUX_A) { + append(disasm, "rf%d", instr->raddr_a); + } else if (mux == V3D_QPU_MUX_B) { +- if (instr->sig.small_imm) { ++ if (instr->sig.small_imm_b) { + uint32_t val; + ASSERTED bool ok = + v3d_qpu_small_imm_unpack(disasm->devinfo, +@@ -82,6 +83,64 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm, + } + } + ++enum v3d_qpu_input_class { ++ V3D_QPU_ADD_A, ++ V3D_QPU_ADD_B, ++ V3D_QPU_MUL_A, ++ V3D_QPU_MUL_B ++}; ++ ++static void ++v3d71_qpu_disasm_raddr(struct disasm_state *disasm, ++ const struct v3d_qpu_instr *instr, ++ uint8_t raddr, ++ enum v3d_qpu_input_class input_class) ++{ ++ bool is_small_imm = false; ++ switch(input_class) { ++ case 
V3D_QPU_ADD_A: ++ is_small_imm = instr->sig.small_imm_a; ++ break; ++ case V3D_QPU_ADD_B: ++ is_small_imm = instr->sig.small_imm_b; ++ break; ++ case V3D_QPU_MUL_A: ++ is_small_imm = instr->sig.small_imm_c; ++ break; ++ case V3D_QPU_MUL_B: ++ is_small_imm = instr->sig.small_imm_d; ++ break; ++ } ++ ++ if (is_small_imm) { ++ uint32_t val; ++ ASSERTED bool ok = ++ v3d_qpu_small_imm_unpack(disasm->devinfo, ++ raddr, ++ &val); ++ ++ if ((int)val >= -16 && (int)val <= 15) ++ append(disasm, "%d", val); ++ else ++ append(disasm, "0x%08x", val); ++ assert(ok); ++ } else { ++ append(disasm, "rf%d", raddr); ++ } ++} ++ ++static void ++v3d_qpu_disasm_raddr(struct disasm_state *disasm, ++ const struct v3d_qpu_instr *instr, ++ const struct v3d_qpu_input *input, ++ enum v3d_qpu_input_class input_class) ++{ ++ if (disasm->devinfo->ver < 71) ++ v3d33_qpu_disasm_raddr(disasm, instr, input->mux); ++ else ++ v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class); ++} ++ + static void + v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic) + { +@@ -121,16 +180,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm, + if (num_src >= 1) { + if (has_dst) + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a); ++ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A); + append(disasm, "%s", +- v3d_qpu_unpack_name(instr->alu.add.a_unpack)); ++ v3d_qpu_unpack_name(instr->alu.add.a.unpack)); + } + + if (num_src >= 2) { + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b); ++ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.b, V3D_QPU_ADD_B); + append(disasm, "%s", +- v3d_qpu_unpack_name(instr->alu.add.b_unpack)); ++ v3d_qpu_unpack_name(instr->alu.add.b.unpack)); + } + } + +@@ -164,16 +223,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm, + if (num_src >= 1) { + if (has_dst) + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a); ++ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A); + append(disasm, "%s", +- v3d_qpu_unpack_name(instr->alu.mul.a_unpack)); ++ v3d_qpu_unpack_name(instr->alu.mul.a.unpack)); + } + + if (num_src >= 2) { + append(disasm, ", "); +- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b); ++ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B); + append(disasm, "%s", +- v3d_qpu_unpack_name(instr->alu.mul.b_unpack)); ++ v3d_qpu_unpack_name(instr->alu.mul.b.unpack)); + } + } + +diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c +index 60dabf74e8e0..44f20618a5a3 100644 +--- a/src/broadcom/qpu/qpu_instr.c ++++ b/src/broadcom/qpu/qpu_instr.c +@@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo, + if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU) + return "tmu"; + ++ /* V3D 7.x QUAD and REP aliases R5 and R5REPT in the table below ++ */ ++ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD) ++ return "quad"; ++ ++ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP) ++ return "rep"; ++ + static const char *waddr_magic[] = { + [V3D_QPU_WADDR_R0] = "r0", + [V3D_QPU_WADDR_R1] = "r1", +@@ -169,6 +177,12 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op) + [V3D_QPU_A_ITOF] = "itof", + [V3D_QPU_A_CLZ] = "clz", + [V3D_QPU_A_UTOF] = "utof", ++ [V3D_QPU_A_MOV] = "mov", ++ [V3D_QPU_A_FMOV] = "fmov", ++ [V3D_QPU_A_VPACK] = "vpack", ++ [V3D_QPU_A_V8PACK] = "v8pack", ++ [V3D_QPU_A_V10PACK] = "v10pack", ++ [V3D_QPU_A_V11FPACK] = "v11fpack", + }; + + if (op >= ARRAY_SIZE(op_names)) +@@ -191,6 
+205,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op) + [V3D_QPU_M_MOV] = "mov", + [V3D_QPU_M_NOP] = "nop", + [V3D_QPU_M_FMUL] = "fmul", ++ [V3D_QPU_M_FTOUNORM16] = "ftounorm16", ++ [V3D_QPU_M_FTOSNORM16] = "ftosnorm16", ++ [V3D_QPU_M_VFTOUNORM8] = "vftounorm8", ++ [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8", ++ [V3D_QPU_M_VFTOUNORM10LO] = "vftounorm10lo", ++ [V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi", + }; + + if (op >= ARRAY_SIZE(op_names)) +@@ -450,6 +470,13 @@ static const uint8_t add_op_args[] = { + [V3D_QPU_A_ITOF] = D | A, + [V3D_QPU_A_CLZ] = D | A, + [V3D_QPU_A_UTOF] = D | A, ++ ++ [V3D_QPU_A_MOV] = D | A, ++ [V3D_QPU_A_FMOV] = D | A, ++ [V3D_QPU_A_VPACK] = D | A | B, ++ [V3D_QPU_A_V8PACK] = D | A | B, ++ [V3D_QPU_A_V10PACK] = D | A | B, ++ [V3D_QPU_A_V11FPACK] = D | A | B, + }; + + static const uint8_t mul_op_args[] = { +@@ -463,6 +490,12 @@ static const uint8_t mul_op_args[] = { + [V3D_QPU_M_NOP] = 0, + [V3D_QPU_M_MOV] = D | A, + [V3D_QPU_M_FMUL] = D | A | B, ++ [V3D_QPU_M_FTOUNORM16] = D | A, ++ [V3D_QPU_M_FTOSNORM16] = D | A, ++ [V3D_QPU_M_VFTOUNORM8] = D | A, ++ [V3D_QPU_M_VFTOSNORM8] = D | A, ++ [V3D_QPU_M_VFTOUNORM10LO] = D | A, ++ [V3D_QPU_M_VFTOUNORM10HI] = D | A, + }; + + bool +@@ -636,12 +669,14 @@ v3d_qpu_add_op_writes_vpm(enum v3d_qpu_add_op op) + } + + bool +-v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ++v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) + { +- if (inst->sig.ldtlb || +- inst->sig.ldtlbu) +- return true; ++ return inst->sig.ldtlb || inst->sig.ldtlbu; ++} + ++bool ++v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ++{ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && +@@ -659,6 +694,12 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) + return false; + } + ++bool ++v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ++{ ++ return v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst); ++} ++ + bool + v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) + { +@@ -846,6 +887,9 @@ bool + v3d_qpu_writes_r3(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) + { ++ if(!devinfo->has_accumulators) ++ return false; ++ + if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3)) + return true; + +@@ -856,6 +900,9 @@ bool + v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && +@@ -886,6 +933,9 @@ bool + v3d_qpu_writes_r5(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5)) + return true; + +@@ -896,6 +946,9 @@ bool + v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) + { ++ if (!devinfo->has_accumulators) ++ return false; ++ + if (v3d_qpu_writes_r5(devinfo, inst)) + return true; + if (v3d_qpu_writes_r4(devinfo, inst)) +@@ -912,16 +965,68 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, + return false; + } + ++bool ++v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *inst) ++{ ++ if (devinfo->ver >= 71 && ++ (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) { ++ return true; ++ } ++ ++ return false; ++} ++ + bool + v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum 
v3d_qpu_mux mux) + { + int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); + int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); + +- return ((add_nsrc > 0 && inst->alu.add.a == mux) || +- (add_nsrc > 1 && inst->alu.add.b == mux) || +- (mul_nsrc > 0 && inst->alu.mul.a == mux) || +- (mul_nsrc > 1 && inst->alu.mul.b == mux)); ++ return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) || ++ (add_nsrc > 1 && inst->alu.add.b.mux == mux) || ++ (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) || ++ (mul_nsrc > 1 && inst->alu.mul.b.mux == mux)); ++} ++ ++bool ++v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr) ++{ ++ int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); ++ int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); ++ ++ return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr == raddr) || ++ (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) || ++ (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) || ++ (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr); ++} ++ ++bool ++v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *inst, ++ uint8_t waddr) ++{ ++ if (inst->type != V3D_QPU_INSTR_TYPE_ALU) ++ return false; ++ ++ if (v3d_qpu_add_op_has_dst(inst->alu.add.op) && ++ !inst->alu.add.magic_write && ++ inst->alu.add.waddr == waddr) { ++ return true; ++ } ++ ++ if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) && ++ !inst->alu.mul.magic_write && ++ inst->alu.mul.waddr == waddr) { ++ return true; ++ } ++ ++ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && ++ !inst->sig_magic && inst->sig_addr == waddr) { ++ return true; ++ } ++ ++ return false; + } + + bool +diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h +index 2e1334726987..56eee9f9cac8 100644 +--- a/src/broadcom/qpu/qpu_instr.h ++++ b/src/broadcom/qpu/qpu_instr.h +@@ -50,10 +50,13 @@ struct v3d_qpu_sig { + bool ldvpm:1; + bool ldtlb:1; + bool ldtlbu:1; +- bool small_imm:1; + bool ucb:1; + bool rotate:1; + bool wrtmuc:1; ++ bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */ ++ bool small_imm_b:1; /* raddr_b (add b) */ ++ bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */ ++ bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */ + }; + + enum v3d_qpu_cond { +@@ -88,12 +91,13 @@ enum v3d_qpu_uf { + }; + + enum v3d_qpu_waddr { +- V3D_QPU_WADDR_R0 = 0, +- V3D_QPU_WADDR_R1 = 1, +- V3D_QPU_WADDR_R2 = 2, +- V3D_QPU_WADDR_R3 = 3, +- V3D_QPU_WADDR_R4 = 4, +- V3D_QPU_WADDR_R5 = 5, ++ V3D_QPU_WADDR_R0 = 0, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R1 = 1, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_R5 = 5, /* V3D 4.x */ ++ V3D_QPU_WADDR_QUAD = 5, /* V3D 7.x */ + V3D_QPU_WADDR_NOP = 6, + V3D_QPU_WADDR_TLB = 7, + V3D_QPU_WADDR_TLBU = 8, +@@ -108,12 +112,12 @@ enum v3d_qpu_waddr { + V3D_QPU_WADDR_SYNC = 16, + V3D_QPU_WADDR_SYNCU = 17, + V3D_QPU_WADDR_SYNCB = 18, +- V3D_QPU_WADDR_RECIP = 19, +- V3D_QPU_WADDR_RSQRT = 20, +- V3D_QPU_WADDR_EXP = 21, +- V3D_QPU_WADDR_LOG = 22, +- V3D_QPU_WADDR_SIN = 23, +- V3D_QPU_WADDR_RSQRT2 = 24, ++ V3D_QPU_WADDR_RECIP = 19, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_RSQRT = 20, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_EXP = 21, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_LOG = 22, /* Reserved on V3D 7.x */ ++ V3D_QPU_WADDR_SIN = 23, /* Reserved on V3D 7.x */ ++ 
V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_TMUC = 32, + V3D_QPU_WADDR_TMUS = 33, + V3D_QPU_WADDR_TMUT = 34, +@@ -129,7 +133,8 @@ enum v3d_qpu_waddr { + V3D_QPU_WADDR_TMUHSCM = 44, + V3D_QPU_WADDR_TMUHSF = 45, + V3D_QPU_WADDR_TMUHSLOD = 46, +- V3D_QPU_WADDR_R5REP = 55, ++ V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */ ++ V3D_QPU_WADDR_REP = 55, /* V3D 7.x */ + }; + + struct v3d_qpu_flags { +@@ -222,6 +227,14 @@ enum v3d_qpu_add_op { + V3D_QPU_A_ITOF, + V3D_QPU_A_CLZ, + V3D_QPU_A_UTOF, ++ ++ /* V3D 7.x */ ++ V3D_QPU_A_FMOV, ++ V3D_QPU_A_MOV, ++ V3D_QPU_A_VPACK, ++ V3D_QPU_A_V8PACK, ++ V3D_QPU_A_V10PACK, ++ V3D_QPU_A_V11FPACK, + }; + + enum v3d_qpu_mul_op { +@@ -235,6 +248,14 @@ enum v3d_qpu_mul_op { + V3D_QPU_M_MOV, + V3D_QPU_M_NOP, + V3D_QPU_M_FMUL, ++ ++ /* V3D 7.x */ ++ V3D_QPU_M_FTOUNORM16, ++ V3D_QPU_M_FTOSNORM16, ++ V3D_QPU_M_VFTOUNORM8, ++ V3D_QPU_M_VFTOSNORM8, ++ V3D_QPU_M_VFTOUNORM10LO, ++ V3D_QPU_M_VFTOUNORM10HI, + }; + + enum v3d_qpu_output_pack { +@@ -276,6 +297,15 @@ enum v3d_qpu_input_unpack { + + /** Swap high and low 16 bits */ + V3D_QPU_UNPACK_SWAP_16, ++ ++ /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */ ++ V3D_QPU_UNPACK_UL, ++ /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */ ++ V3D_QPU_UNPACK_UH, ++ /** Convert low 16 bits from 16-bit integer to signed 32-bit int */ ++ V3D_QPU_UNPACK_IL, ++ /** Convert high 16 bits from 16-bit integer to signed 32-bit int */ ++ V3D_QPU_UNPACK_IH, + }; + + enum v3d_qpu_mux { +@@ -289,25 +319,29 @@ enum v3d_qpu_mux { + V3D_QPU_MUX_B, + }; + ++struct v3d_qpu_input { ++ union { ++ enum v3d_qpu_mux mux; /* V3D 4.x */ ++ uint8_t raddr; /* V3D 7.x */ ++ }; ++ enum v3d_qpu_input_unpack unpack; ++}; ++ + struct v3d_qpu_alu_instr { + struct { + enum v3d_qpu_add_op op; +- enum v3d_qpu_mux a, b; ++ struct v3d_qpu_input a, b; + uint8_t waddr; + bool magic_write; + enum v3d_qpu_output_pack output_pack; +- enum v3d_qpu_input_unpack a_unpack; +- enum v3d_qpu_input_unpack b_unpack; + } add; + + struct { + enum v3d_qpu_mul_op op; +- enum v3d_qpu_mux a, b; ++ struct v3d_qpu_input a, b; + uint8_t waddr; + bool magic_write; + enum v3d_qpu_output_pack output_pack; +- enum v3d_qpu_input_unpack a_unpack; +- enum v3d_qpu_input_unpack b_unpack; + } mul; + }; + +@@ -379,8 +413,8 @@ struct v3d_qpu_instr { + struct v3d_qpu_sig sig; + uint8_t sig_addr; + bool sig_magic; /* If the signal writes to a magic address */ +- uint8_t raddr_a; +- uint8_t raddr_b; ++ uint8_t raddr_a; /* V3D 4.x */ ++ uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */ + struct v3d_qpu_flags flags; + + union { +@@ -450,6 +484,8 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; + bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; + bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; + bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; ++bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; ++bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +@@ -464,6 +500,8 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; + bool v3d_qpu_writes_r5(const struct v3d_device_info 
*devinfo, + const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; ++bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; + bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +@@ -483,4 +521,9 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + + bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; ++ ++bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr); ++bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *inst, ++ uint8_t waddr); + #endif +diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c +index a875683c6f80..f09bc041e7de 100644 +--- a/src/broadcom/qpu/qpu_pack.c ++++ b/src/broadcom/qpu/qpu_pack.c +@@ -84,6 +84,9 @@ + #define V3D_QPU_MUL_A_SHIFT 18 + #define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18) + ++#define V3D_QPU_RADDR_C_SHIFT 18 ++#define V3D_QPU_RADDR_C_MASK QPU_MASK(23, 18) ++ + #define V3D_QPU_ADD_B_SHIFT 15 + #define V3D_QPU_ADD_B_MASK QPU_MASK(17, 15) + +@@ -98,6 +101,9 @@ + #define V3D_QPU_BRANCH_BDI_SHIFT 12 + #define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12) + ++#define V3D_QPU_RADDR_D_SHIFT 12 ++#define V3D_QPU_RADDR_D_MASK QPU_MASK(17, 12) ++ + #define V3D_QPU_RADDR_A_SHIFT 6 + #define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6) + +@@ -112,12 +118,15 @@ + #define LDTMU .ldtmu = true + #define LDVARY .ldvary = true + #define LDVPM .ldvpm = true +-#define SMIMM .small_imm = true + #define LDTLB .ldtlb = true + #define LDTLBU .ldtlbu = true + #define UCB .ucb = true + #define ROT .rotate = true + #define WRTMUC .wrtmuc = true ++#define SMIMM_A .small_imm_a = true ++#define SMIMM_B .small_imm_b = true ++#define SMIMM_C .small_imm_c = true ++#define SMIMM_D .small_imm_d = true + + static const struct v3d_qpu_sig v33_sig_map[] = { + /* MISC R3 R4 R5 */ +@@ -135,8 +144,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = { + [11] = { THRSW, LDVARY, LDUNIF }, + [12] = { LDVARY, LDTMU, }, + [13] = { THRSW, LDVARY, LDTMU, }, +- [14] = { SMIMM, LDVARY, }, +- [15] = { SMIMM, }, ++ [14] = { SMIMM_B, LDVARY, }, ++ [15] = { SMIMM_B, }, + [16] = { LDTLB, }, + [17] = { LDTLBU, }, + /* 18-21 reserved */ +@@ -148,8 +157,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = { + [27] = { THRSW, LDVPM, LDUNIF }, + [28] = { LDVPM, LDTMU, }, + [29] = { THRSW, LDVPM, LDTMU, }, +- [30] = { SMIMM, LDVPM, }, +- [31] = { SMIMM, }, ++ [30] = { SMIMM_B, LDVPM, }, ++ [31] = { SMIMM_B, }, + }; + + static const struct v3d_qpu_sig v40_sig_map[] = { +@@ -167,8 +176,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = { + [10] = { LDVARY, LDUNIF }, + [11] = { THRSW, LDVARY, LDUNIF }, + /* 12-13 reserved */ +- [14] = { SMIMM, LDVARY, }, +- [15] = { SMIMM, }, ++ [14] = { SMIMM_B, LDVARY, }, ++ [15] = { SMIMM_B, }, + [16] = { LDTLB, }, + [17] = { LDTLBU, }, + [18] = { WRTMUC }, +@@ -178,7 +187,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = { + [22] = { UCB, }, + [23] = { ROT, }, + /* 24-30 reserved */ +- [31] = { SMIMM, LDTMU, }, ++ [31] = { SMIMM_B, LDTMU, }, + }; + + static const struct v3d_qpu_sig v41_sig_map[] = { +@@ -197,8 +206,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = { + [11] = { THRSW, LDVARY, LDUNIF }, + [12] = { LDUNIFRF }, + [13] = { THRSW, LDUNIFRF }, +- [14] = { 
SMIMM, LDVARY, }, +- [15] = { SMIMM, }, ++ [14] = { SMIMM_B, LDVARY }, ++ [15] = { SMIMM_B, }, + [16] = { LDTLB, }, + [17] = { LDTLBU, }, + [18] = { WRTMUC }, +@@ -210,7 +219,41 @@ static const struct v3d_qpu_sig v41_sig_map[] = { + [24] = { LDUNIFA}, + [25] = { LDUNIFARF }, + /* 26-30 reserved */ +- [31] = { SMIMM, LDTMU, }, ++ [31] = { SMIMM_B, LDTMU, }, ++}; ++ ++ ++static const struct v3d_qpu_sig v71_sig_map[] = { ++ /* MISC phys RF0 */ ++ [0] = { }, ++ [1] = { THRSW, }, ++ [2] = { LDUNIF }, ++ [3] = { THRSW, LDUNIF }, ++ [4] = { LDTMU, }, ++ [5] = { THRSW, LDTMU, }, ++ [6] = { LDTMU, LDUNIF }, ++ [7] = { THRSW, LDTMU, LDUNIF }, ++ [8] = { LDVARY, }, ++ [9] = { THRSW, LDVARY, }, ++ [10] = { LDVARY, LDUNIF }, ++ [11] = { THRSW, LDVARY, LDUNIF }, ++ [12] = { LDUNIFRF }, ++ [13] = { THRSW, LDUNIFRF }, ++ [14] = { SMIMM_A, }, ++ [15] = { SMIMM_B, }, ++ [16] = { LDTLB, }, ++ [17] = { LDTLBU, }, ++ [18] = { WRTMUC }, ++ [19] = { THRSW, WRTMUC }, ++ [20] = { LDVARY, WRTMUC }, ++ [21] = { THRSW, LDVARY, WRTMUC }, ++ [22] = { UCB, }, ++ /* 23 reserved */ ++ [24] = { LDUNIFA}, ++ [25] = { LDUNIFARF }, ++ /* 26-29 reserved */ ++ [30] = { SMIMM_C, }, ++ [31] = { SMIMM_D, }, + }; + + bool +@@ -221,7 +264,9 @@ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo, + if (packed_sig >= ARRAY_SIZE(v33_sig_map)) + return false; + +- if (devinfo->ver >= 41) ++ if (devinfo->ver >= 71) ++ *sig = v71_sig_map[packed_sig]; ++ else if (devinfo->ver >= 41) + *sig = v41_sig_map[packed_sig]; + else if (devinfo->ver == 40) + *sig = v40_sig_map[packed_sig]; +@@ -240,7 +285,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo, + { + static const struct v3d_qpu_sig *map; + +- if (devinfo->ver >= 41) ++ if (devinfo->ver >= 71) ++ map = v71_sig_map; ++ else if (devinfo->ver >= 41) + map = v41_sig_map; + else if (devinfo->ver == 40) + map = v40_sig_map; +@@ -443,16 +490,26 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo, + + /* Make a mapping of the table of opcodes in the spec. The opcode is + * determined by a combination of the opcode field, and in the case of 0 or +- * 1-arg opcodes, the mux_b field as well. ++ * 1-arg opcodes, the mux (version <= 42) or raddr (version >= 71) field as ++ * well. + */ +-#define MUX_MASK(bot, top) (((1 << (top + 1)) - 1) - ((1 << (bot)) - 1)) +-#define ANYMUX MUX_MASK(0, 7) ++#define OP_MASK(val) BITFIELD64_BIT(val) ++#define OP_RANGE(bot, top) BITFIELD64_RANGE(bot, top - bot + 1) ++#define ANYMUX OP_RANGE(0, 7) ++#define ANYOPMASK OP_RANGE(0, 63) + + struct opcode_desc { + uint8_t opcode_first; + uint8_t opcode_last; +- uint8_t mux_b_mask; +- uint8_t mux_a_mask; ++ ++ union { ++ struct { ++ uint8_t b_mask; ++ uint8_t a_mask; ++ } mux; ++ uint64_t raddr_mask; ++ }; ++ + uint8_t op; + + /* first_ver == 0 if it's the same across all V3D versions. +@@ -465,122 +522,321 @@ struct opcode_desc { + uint8_t last_ver; + }; + +-static const struct opcode_desc add_ops[] = { ++static const struct opcode_desc add_ops_v33[] = { + /* FADD is FADDNF depending on the order of the mux_a/mux_b. 
*/ +- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADD }, +- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADDNF }, +- { 53, 55, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, +- { 56, 56, ANYMUX, ANYMUX, V3D_QPU_A_ADD }, +- { 57, 59, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, +- { 60, 60, ANYMUX, ANYMUX, V3D_QPU_A_SUB }, +- { 61, 63, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, +- { 64, 111, ANYMUX, ANYMUX, V3D_QPU_A_FSUB }, +- { 120, 120, ANYMUX, ANYMUX, V3D_QPU_A_MIN }, +- { 121, 121, ANYMUX, ANYMUX, V3D_QPU_A_MAX }, +- { 122, 122, ANYMUX, ANYMUX, V3D_QPU_A_UMIN }, +- { 123, 123, ANYMUX, ANYMUX, V3D_QPU_A_UMAX }, +- { 124, 124, ANYMUX, ANYMUX, V3D_QPU_A_SHL }, +- { 125, 125, ANYMUX, ANYMUX, V3D_QPU_A_SHR }, +- { 126, 126, ANYMUX, ANYMUX, V3D_QPU_A_ASR }, +- { 127, 127, ANYMUX, ANYMUX, V3D_QPU_A_ROR }, ++ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADD }, ++ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADDNF }, ++ { 53, 55, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, ++ { 56, 56, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ADD }, ++ { 57, 59, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, ++ { 60, 60, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SUB }, ++ { 61, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, ++ { 64, 111, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FSUB }, ++ { 120, 120, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MIN }, ++ { 121, 121, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MAX }, ++ { 122, 122, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMIN }, ++ { 123, 123, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMAX }, ++ { 124, 124, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHL }, ++ { 125, 125, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHR }, ++ { 126, 126, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ASR }, ++ { 127, 127, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ROR }, + /* FMIN is instead FMAX depending on the order of the mux_a/mux_b. 
*/ +- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMIN }, +- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMAX }, +- { 176, 180, ANYMUX, ANYMUX, V3D_QPU_A_VFMIN }, +- +- { 181, 181, ANYMUX, ANYMUX, V3D_QPU_A_AND }, +- { 182, 182, ANYMUX, ANYMUX, V3D_QPU_A_OR }, +- { 183, 183, ANYMUX, ANYMUX, V3D_QPU_A_XOR }, +- +- { 184, 184, ANYMUX, ANYMUX, V3D_QPU_A_VADD }, +- { 185, 185, ANYMUX, ANYMUX, V3D_QPU_A_VSUB }, +- { 186, 186, 1 << 0, ANYMUX, V3D_QPU_A_NOT }, +- { 186, 186, 1 << 1, ANYMUX, V3D_QPU_A_NEG }, +- { 186, 186, 1 << 2, ANYMUX, V3D_QPU_A_FLAPUSH }, +- { 186, 186, 1 << 3, ANYMUX, V3D_QPU_A_FLBPUSH }, +- { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLPOP }, +- { 186, 186, 1 << 5, ANYMUX, V3D_QPU_A_RECIP }, +- { 186, 186, 1 << 6, ANYMUX, V3D_QPU_A_SETMSF }, +- { 186, 186, 1 << 7, ANYMUX, V3D_QPU_A_SETREVF }, +- { 187, 187, 1 << 0, 1 << 0, V3D_QPU_A_NOP, 0 }, +- { 187, 187, 1 << 0, 1 << 1, V3D_QPU_A_TIDX }, +- { 187, 187, 1 << 0, 1 << 2, V3D_QPU_A_EIDX }, +- { 187, 187, 1 << 0, 1 << 3, V3D_QPU_A_LR }, +- { 187, 187, 1 << 0, 1 << 4, V3D_QPU_A_VFLA }, +- { 187, 187, 1 << 0, 1 << 5, V3D_QPU_A_VFLNA }, +- { 187, 187, 1 << 0, 1 << 6, V3D_QPU_A_VFLB }, +- { 187, 187, 1 << 0, 1 << 7, V3D_QPU_A_VFLNB }, +- +- { 187, 187, 1 << 1, MUX_MASK(0, 2), V3D_QPU_A_FXCD }, +- { 187, 187, 1 << 1, 1 << 3, V3D_QPU_A_XCD }, +- { 187, 187, 1 << 1, MUX_MASK(4, 6), V3D_QPU_A_FYCD }, +- { 187, 187, 1 << 1, 1 << 7, V3D_QPU_A_YCD }, +- +- { 187, 187, 1 << 2, 1 << 0, V3D_QPU_A_MSF }, +- { 187, 187, 1 << 2, 1 << 1, V3D_QPU_A_REVF }, +- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_VDWWT, 33 }, +- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_IID, 40 }, +- { 187, 187, 1 << 2, 1 << 3, V3D_QPU_A_SAMPID, 40 }, +- { 187, 187, 1 << 2, 1 << 4, V3D_QPU_A_BARRIERID, 40 }, +- { 187, 187, 1 << 2, 1 << 5, V3D_QPU_A_TMUWT }, +- { 187, 187, 1 << 2, 1 << 6, V3D_QPU_A_VPMWT }, +- { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 }, +- { 187, 187, 1 << 3, 1 << 0, V3D_QPU_A_FLNAFIRST, 41 }, +- { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, +- +- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, +- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, +- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, +- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, +- { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 }, +- { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 }, +- { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 }, +- { 188, 188, 1 << 5, ANYMUX, V3D_QPU_A_LOG, 41 }, +- { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 }, +- { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 }, +- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, +- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, ++ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMIN }, ++ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMAX }, ++ { 176, 180, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMIN }, ++ ++ { 181, 181, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_AND }, ++ { 182, 182, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_OR }, ++ { 183, 183, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_XOR }, ++ ++ { 184, 184, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VADD }, ++ { 185, 185, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VSUB }, ++ { 186, 186, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_NOT }, ++ { 186, 186, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_NEG }, ++ { 186, 186, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_FLAPUSH }, ++ { 186, 186, 
.mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FLBPUSH }, ++ { 186, 186, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_FLPOP }, ++ { 186, 186, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_RECIP }, ++ { 186, 186, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SETMSF }, ++ { 186, 186, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_SETREVF }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(1), V3D_QPU_A_TIDX }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(2), V3D_QPU_A_EIDX }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(3), V3D_QPU_A_LR }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(4), V3D_QPU_A_VFLA }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(5), V3D_QPU_A_VFLNA }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VFLB }, ++ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(7), V3D_QPU_A_VFLNB }, ++ ++ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(0, 2), V3D_QPU_A_FXCD }, ++ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(3), V3D_QPU_A_XCD }, ++ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(4, 6), V3D_QPU_A_FYCD }, ++ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(7), V3D_QPU_A_YCD }, ++ ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(0), V3D_QPU_A_MSF }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(1), V3D_QPU_A_REVF }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_VDWWT, 33 }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_IID, 40 }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(3), V3D_QPU_A_SAMPID, 40 }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(4), V3D_QPU_A_BARRIERID, 40 }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(5), V3D_QPU_A_TMUWT }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VPMWT }, ++ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(7), V3D_QPU_A_FLAFIRST, 41 }, ++ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = OP_MASK(0), V3D_QPU_A_FLNAFIRST, 41 }, ++ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, ++ ++ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMP, 40 }, ++ { 188, 188, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT, 41 }, ++ { 188, 188, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_EXP, 41 }, ++ { 188, 188, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_LOG, 41 }, ++ { 188, 188, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SIN, 41 }, ++ { 188, 188, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT2, 41 }, ++ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, ++ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, + + /* FIXME: MORE COMPLICATED */ +- /* { 190, 191, ANYMUX, ANYMUX, 
V3D_QPU_A_VFMOVABSNEGNAB }, */ ++ /* { 190, 191, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */ + +- { 192, 239, ANYMUX, ANYMUX, V3D_QPU_A_FCMP }, +- { 240, 244, ANYMUX, ANYMUX, V3D_QPU_A_VFMAX }, ++ { 192, 239, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FCMP }, ++ { 240, 244, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMAX }, + +- { 245, 245, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FROUND }, +- { 245, 245, 1 << 3, ANYMUX, V3D_QPU_A_FTOIN }, +- { 245, 245, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FTRUNC }, +- { 245, 245, 1 << 7, ANYMUX, V3D_QPU_A_FTOIZ }, +- { 246, 246, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FFLOOR }, +- { 246, 246, 1 << 3, ANYMUX, V3D_QPU_A_FTOUZ }, +- { 246, 246, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FCEIL }, +- { 246, 246, 1 << 7, ANYMUX, V3D_QPU_A_FTOC }, ++ { 245, 245, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FROUND }, ++ { 245, 245, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIN }, ++ { 245, 245, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FTRUNC }, ++ { 245, 245, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIZ }, ++ { 246, 246, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FFLOOR }, ++ { 246, 246, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOUZ }, ++ { 246, 246, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FCEIL }, ++ { 246, 246, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOC }, + +- { 247, 247, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FDX }, +- { 247, 247, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FDY }, ++ { 247, 247, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FDX }, ++ { 247, 247, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FDY }, + + /* The stvpms are distinguished by the waddr field. */ +- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMV }, +- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMD }, +- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMP }, ++ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMV }, ++ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMD }, ++ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMP }, ++ ++ { 252, 252, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_ITOF }, ++ { 252, 252, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_CLZ }, ++ { 252, 252, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_UTOF }, ++}; ++ ++static const struct opcode_desc mul_ops_v33[] = { ++ { 1, 1, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_ADD }, ++ { 2, 2, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SUB }, ++ { 3, 3, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_UMUL24 }, ++ { 4, 8, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_VFMUL }, ++ { 9, 9, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SMUL24 }, ++ { 10, 10, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_MULTOP }, ++ { 14, 14, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 }, ++ { 15, 15, .mux.b_mask = OP_RANGE(0, 3), ANYMUX, V3D_QPU_M_FMOV, 33, 42}, ++ { 15, 15, .mux.b_mask = OP_MASK(4), .mux.a_mask = OP_MASK(0), V3D_QPU_M_NOP, 33, 42 }, ++ { 15, 15, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_M_MOV, 33, 42 }, ++ ++ { 16, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMUL }, ++}; ++ ++/* Note that it would have been possible to define all the add/mul opcodes in ++ * just one table, using the first_ver/last_ver. 
But taking into account that ++ * for v71 there were a lot of changes, it was more tidy this way. Also right ++ * now we are doing a linear search on those tables, so this maintains the ++ * tables smaller. ++ * ++ * Just in case we merge the tables, we define the first_ver as 71 for those ++ * opcodes that changed on v71 ++ */ ++static const struct opcode_desc add_ops_v71[] = { ++ /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */ ++ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD }, ++ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF }, ++ { 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, ++ { 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD }, ++ { 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, ++ { 60, 60, .raddr_mask = ANYOPMASK, V3D_QPU_A_SUB }, ++ { 61, 63, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, ++ { 64, 111, .raddr_mask = ANYOPMASK, V3D_QPU_A_FSUB }, ++ { 120, 120, .raddr_mask = ANYOPMASK, V3D_QPU_A_MIN }, ++ { 121, 121, .raddr_mask = ANYOPMASK, V3D_QPU_A_MAX }, ++ { 122, 122, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMIN }, ++ { 123, 123, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMAX }, ++ { 124, 124, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHL }, ++ { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR }, ++ { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR }, ++ { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR }, ++ /* FMIN is instead FMAX depending on the raddr_a/b order. */ ++ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN }, ++ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX }, ++ { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN }, ++ ++ { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND }, ++ { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR }, ++ { 183, 183, .raddr_mask = ANYOPMASK, V3D_QPU_A_XOR }, ++ { 184, 184, .raddr_mask = ANYOPMASK, V3D_QPU_A_VADD }, ++ { 185, 185, .raddr_mask = ANYOPMASK, V3D_QPU_A_VSUB }, ++ ++ { 186, 186, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOT }, ++ { 186, 186, .raddr_mask = OP_MASK(1), V3D_QPU_A_NEG }, ++ { 186, 186, .raddr_mask = OP_MASK(2), V3D_QPU_A_FLAPUSH }, ++ { 186, 186, .raddr_mask = OP_MASK(3), V3D_QPU_A_FLBPUSH }, ++ { 186, 186, .raddr_mask = OP_MASK(4), V3D_QPU_A_FLPOP }, ++ { 186, 186, .raddr_mask = OP_MASK(5), V3D_QPU_A_CLZ }, ++ { 186, 186, .raddr_mask = OP_MASK(6), V3D_QPU_A_SETMSF }, ++ { 186, 186, .raddr_mask = OP_MASK(7), V3D_QPU_A_SETREVF }, ++ ++ { 187, 187, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 }, ++ { 187, 187, .raddr_mask = OP_MASK(1), V3D_QPU_A_TIDX }, ++ { 187, 187, .raddr_mask = OP_MASK(2), V3D_QPU_A_EIDX }, ++ { 187, 187, .raddr_mask = OP_MASK(3), V3D_QPU_A_LR }, ++ { 187, 187, .raddr_mask = OP_MASK(4), V3D_QPU_A_VFLA }, ++ { 187, 187, .raddr_mask = OP_MASK(5), V3D_QPU_A_VFLNA }, ++ { 187, 187, .raddr_mask = OP_MASK(6), V3D_QPU_A_VFLB }, ++ { 187, 187, .raddr_mask = OP_MASK(7), V3D_QPU_A_VFLNB }, ++ { 187, 187, .raddr_mask = OP_MASK(8), V3D_QPU_A_XCD }, ++ { 187, 187, .raddr_mask = OP_MASK(9), V3D_QPU_A_YCD }, ++ { 187, 187, .raddr_mask = OP_MASK(10), V3D_QPU_A_MSF }, ++ { 187, 187, .raddr_mask = OP_MASK(11), V3D_QPU_A_REVF }, ++ { 187, 187, .raddr_mask = OP_MASK(12), V3D_QPU_A_IID }, ++ { 187, 187, .raddr_mask = OP_MASK(13), V3D_QPU_A_SAMPID }, ++ { 187, 187, .raddr_mask = OP_MASK(14), V3D_QPU_A_BARRIERID }, ++ { 187, 187, .raddr_mask = OP_MASK(15), V3D_QPU_A_TMUWT }, ++ { 187, 187, .raddr_mask = OP_MASK(16), V3D_QPU_A_VPMWT }, ++ { 187, 187, .raddr_mask = OP_MASK(17), V3D_QPU_A_FLAFIRST }, ++ { 187, 187, .raddr_mask = OP_MASK(18), V3D_QPU_A_FLNAFIRST }, ++ ++ { 187, 187, .raddr_mask 
= OP_RANGE(32, 34), V3D_QPU_A_FXCD }, ++ { 187, 187, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FYCD }, ++ ++ { 188, 188, .raddr_mask = OP_MASK(0), V3D_QPU_A_LDVPMV_IN, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(1), V3D_QPU_A_LDVPMD_IN, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(2), V3D_QPU_A_LDVPMP, 71 }, ++ ++ { 188, 188, .raddr_mask = OP_MASK(32), V3D_QPU_A_RECIP, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(33), V3D_QPU_A_RSQRT, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(34), V3D_QPU_A_EXP, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(35), V3D_QPU_A_LOG, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(36), V3D_QPU_A_SIN, 71 }, ++ { 188, 188, .raddr_mask = OP_MASK(37), V3D_QPU_A_RSQRT2, 71 }, ++ ++ { 189, 189, .raddr_mask = ANYOPMASK, V3D_QPU_A_LDVPMG_IN, 71 }, + +- { 252, 252, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_ITOF }, +- { 252, 252, 1 << 3, ANYMUX, V3D_QPU_A_CLZ }, +- { 252, 252, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_UTOF }, ++ /* The stvpms are distinguished by the waddr field. */ ++ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMV, 71}, ++ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMD, 71}, ++ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMP, 71}, ++ ++ { 192, 207, .raddr_mask = ANYOPMASK, V3D_QPU_A_FCMP, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FROUND, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FROUND, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FROUND, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FROUND, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_MASK(3), V3D_QPU_A_FTOIN, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(7), V3D_QPU_A_FTOIN, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(11), V3D_QPU_A_FTOIN, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(15), V3D_QPU_A_FTOIN, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FTRUNC, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FTRUNC, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FTRUNC, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FTRUNC, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_MASK(19), V3D_QPU_A_FTOIZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(23), V3D_QPU_A_FTOIZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(27), V3D_QPU_A_FTOIZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(31), V3D_QPU_A_FTOIZ, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FFLOOR, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FFLOOR, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(40, 42), V3D_QPU_A_FFLOOR, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(44, 46), V3D_QPU_A_FFLOOR, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_MASK(35), V3D_QPU_A_FTOUZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(39), V3D_QPU_A_FTOUZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(43), V3D_QPU_A_FTOUZ, 71 }, ++ { 245, 245, .raddr_mask = OP_MASK(47), V3D_QPU_A_FTOUZ, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_RANGE(48, 50), V3D_QPU_A_FCEIL, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(52, 54), V3D_QPU_A_FCEIL, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(56, 58), V3D_QPU_A_FCEIL, 71 }, ++ { 245, 245, .raddr_mask = OP_RANGE(60, 62), V3D_QPU_A_FCEIL, 71 }, ++ ++ { 245, 245, .raddr_mask = OP_MASK(51), V3D_QPU_A_FTOC }, ++ { 245, 245, .raddr_mask = OP_MASK(55), V3D_QPU_A_FTOC }, ++ { 245, 245, .raddr_mask = OP_MASK(59), V3D_QPU_A_FTOC }, ++ { 245, 245, .raddr_mask = OP_MASK(63), V3D_QPU_A_FTOC }, ++ ++ { 246, 246, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FDX, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FDX, 71 }, ++ { 246, 
246, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FDX, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FDX, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FDY, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FDY, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FDY, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FDY, 71 }, ++ ++ { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 }, ++ { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 }, ++ ++ { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 }, ++ { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 }, ++ ++ { 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 }, ++ { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 }, ++ ++ { 249, 249, .raddr_mask = OP_MASK(3), V3D_QPU_A_MOV, 71 }, ++ { 249, 249, .raddr_mask = OP_MASK(7), V3D_QPU_A_MOV, 71 }, ++ { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 }, ++ { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 }, ++ { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 }, ++ ++ { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 }, ++ { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 }, + }; + +-static const struct opcode_desc mul_ops[] = { +- { 1, 1, ANYMUX, ANYMUX, V3D_QPU_M_ADD }, +- { 2, 2, ANYMUX, ANYMUX, V3D_QPU_M_SUB }, +- { 3, 3, ANYMUX, ANYMUX, V3D_QPU_M_UMUL24 }, +- { 4, 8, ANYMUX, ANYMUX, V3D_QPU_M_VFMUL }, +- { 9, 9, ANYMUX, ANYMUX, V3D_QPU_M_SMUL24 }, +- { 10, 10, ANYMUX, ANYMUX, V3D_QPU_M_MULTOP }, +- { 14, 14, ANYMUX, ANYMUX, V3D_QPU_M_FMOV }, +- { 15, 15, MUX_MASK(0, 3), ANYMUX, V3D_QPU_M_FMOV }, +- { 15, 15, 1 << 4, 1 << 0, V3D_QPU_M_NOP, 0 }, +- { 15, 15, 1 << 7, ANYMUX, V3D_QPU_M_MOV }, +- { 16, 63, ANYMUX, ANYMUX, V3D_QPU_M_FMUL }, ++static const struct opcode_desc mul_ops_v71[] = { ++ /* For V3D 7.1, second mask field would be ignored */ ++ { 1, 1, .raddr_mask = ANYOPMASK, V3D_QPU_M_ADD, 71 }, ++ { 2, 2, .raddr_mask = ANYOPMASK, V3D_QPU_M_SUB, 71 }, ++ { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 }, ++ { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 }, ++ { 4, 8, .raddr_mask = ANYOPMASK, V3D_QPU_M_VFMUL, 71 }, ++ { 9, 9, .raddr_mask = ANYOPMASK, V3D_QPU_M_SMUL24, 71 }, ++ { 10, 10, .raddr_mask = ANYOPMASK, V3D_QPU_M_MULTOP, 71 }, ++ ++ { 14, 14, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_M_FMOV, 71 }, ++ { 14, 14, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_M_FMOV, 71 }, ++ ++ { 14, 14, .raddr_mask = OP_MASK(3), V3D_QPU_M_MOV, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(7), V3D_QPU_M_MOV, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(11), V3D_QPU_M_MOV, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 }, ++ ++ { 14, 14, .raddr_mask = OP_MASK(32), V3D_QPU_M_FTOUNORM16, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 }, ++ { 14, 
14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 }, ++ { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 }, ++ ++ { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 }, ++ ++ { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL }, + }; + + /* Returns true if op_desc should be filtered out based on devinfo->ver +@@ -589,17 +845,23 @@ static const struct opcode_desc mul_ops[] = { + */ + static bool + opcode_invalid_in_version(const struct v3d_device_info *devinfo, +- const struct opcode_desc *op_desc) ++ const uint8_t first_ver, ++ const uint8_t last_ver) + { +- return (op_desc->first_ver != 0 && devinfo->ver < op_desc->first_ver) || +- (op_desc->last_ver != 0 && devinfo->ver > op_desc->last_ver); ++ return (first_ver != 0 && devinfo->ver < first_ver) || ++ (last_ver != 0 && devinfo->ver > last_ver); + } + ++/* Note that we pass as parameters mux_a, mux_b and raddr, even if depending ++ * on the devinfo->ver some would be ignored. We do this way just to avoid ++ * having two really similar lookup_opcode methods ++ */ + static const struct opcode_desc * + lookup_opcode_from_packed(const struct v3d_device_info *devinfo, + const struct opcode_desc *opcodes, + size_t num_opcodes, uint32_t opcode, +- uint32_t mux_a, uint32_t mux_b) ++ uint32_t mux_a, uint32_t mux_b, ++ uint32_t raddr) + { + for (int i = 0; i < num_opcodes; i++) { + const struct opcode_desc *op_desc = &opcodes[i]; +@@ -608,14 +870,19 @@ lookup_opcode_from_packed(const struct v3d_device_info *devinfo, + opcode > op_desc->opcode_last) + continue; + +- if (opcode_invalid_in_version(devinfo, op_desc)) ++ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver)) + continue; + +- if (!(op_desc->mux_b_mask & (1 << mux_b))) +- continue; ++ if (devinfo->ver < 71) { ++ if (!(op_desc->mux.b_mask & (1 << mux_b))) ++ continue; + +- if (!(op_desc->mux_a_mask & (1 << mux_a))) +- continue; ++ if (!(op_desc->mux.a_mask & (1 << mux_a))) ++ continue; ++ } else { ++ if (!(op_desc->raddr_mask & ((uint64_t) 1 << raddr))) ++ continue; ++ } + + return op_desc; + } +@@ -667,6 +934,56 @@ v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked, + } + } + ++static bool ++v3d_qpu_int32_unpack_unpack(uint32_t packed, ++ enum v3d_qpu_input_unpack *unpacked) ++{ ++ switch (packed) { ++ case 0: ++ *unpacked = V3D_QPU_UNPACK_NONE; ++ return true; ++ case 1: ++ *unpacked = V3D_QPU_UNPACK_UL; ++ return true; ++ case 2: ++ *unpacked = V3D_QPU_UNPACK_UH; ++ return true; ++ case 3: ++ *unpacked = V3D_QPU_UNPACK_IL; ++ return true; ++ case 4: ++ *unpacked = V3D_QPU_UNPACK_IH; ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static bool ++v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked, ++ uint32_t *packed) ++{ ++ switch (unpacked) { ++ case V3D_QPU_UNPACK_NONE: ++ *packed = 0; ++ return true; ++ case V3D_QPU_UNPACK_UL: ++ *packed = 1; ++ return true; ++ case V3D_QPU_UNPACK_UH: ++ *packed = 2; ++ return true; ++ case V3D_QPU_UNPACK_IL: ++ *packed = 3; ++ return true; ++ case V3D_QPU_UNPACK_IH: ++ *packed = 4; ++ return true; ++ default: ++ return false; ++ } ++} ++ + static bool + v3d_qpu_float16_unpack_unpack(uint32_t packed, + enum v3d_qpu_input_unpack *unpacked) +@@ -737,8 +1054,8 @@ v3d_qpu_float32_pack_pack(enum v3d_qpu_output_pack pack, + } + + static bool +-v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, +- struct 
v3d_qpu_instr *instr) ++v3d33_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) + { + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); + uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_A); +@@ -755,8 +1072,9 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + map_op = (map_op - 253 + 245); + + const struct opcode_desc *desc = +- lookup_opcode_from_packed(devinfo, add_ops, ARRAY_SIZE(add_ops), +- map_op, mux_a, mux_b); ++ lookup_opcode_from_packed(devinfo, add_ops_v33, ++ ARRAY_SIZE(add_ops_v33), ++ map_op, mux_a, mux_b, 0); + + if (!desc) + return false; +@@ -812,12 +1130,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, +- &instr->alu.add.a_unpack)) { ++ &instr->alu.add.a.unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, +- &instr->alu.add.b_unpack)) { ++ &instr->alu.add.b.unpack)) { + return false; + } + break; +@@ -831,7 +1149,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.add.output_pack = mux_b & 0x3; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, +- &instr->alu.add.a_unpack)) { ++ &instr->alu.add.a.unpack)) { + return false; + } + break; +@@ -843,7 +1161,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, +- &instr->alu.add.a_unpack)) { ++ &instr->alu.add.a.unpack)) { + return false; + } + break; +@@ -851,23 +1169,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + if (!v3d_qpu_float16_unpack_unpack(op & 0x7, +- &instr->alu.add.a_unpack)) { ++ &instr->alu.add.a.unpack)) { + return false; + } + + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; +- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + + default: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; +- instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; +- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + } + +- instr->alu.add.a = mux_a; +- instr->alu.add.b = mux_b; ++ instr->alu.add.a.mux = mux_a; ++ instr->alu.add.b.mux = mux_b; + instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); + + instr->alu.add.magic_write = false; +@@ -892,8 +1210,194 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + } + + static bool +-v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) ++{ ++ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); ++ uint32_t raddr_a = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_A); ++ uint32_t raddr_b = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_B); ++ uint32_t waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); ++ uint32_t map_op = op; ++ ++ const struct opcode_desc *desc = ++ lookup_opcode_from_packed(devinfo, ++ add_ops_v71, ++ ARRAY_SIZE(add_ops_v71), ++ map_op, 0, 0, ++ raddr_b); ++ if (!desc) ++ return false; ++ ++ instr->alu.add.op = desc->op; ++ ++ /* FADD/FADDNF and FMIN/FMAX are determined by the 
order of the ++ * operands. ++ */ ++ if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a > ++ instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) { ++ if (instr->alu.add.op == V3D_QPU_A_FMIN) ++ instr->alu.add.op = V3D_QPU_A_FMAX; ++ if (instr->alu.add.op == V3D_QPU_A_FADD) ++ instr->alu.add.op = V3D_QPU_A_FADDNF; ++ } ++ ++ /* Some QPU ops require a bit more than just basic opcode and mux a/b ++ * comparisons to distinguish them. ++ */ ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_STVPMV: ++ case V3D_QPU_A_STVPMD: ++ case V3D_QPU_A_STVPMP: ++ switch (waddr) { ++ case 0: ++ instr->alu.add.op = V3D_QPU_A_STVPMV; ++ break; ++ case 1: ++ instr->alu.add.op = V3D_QPU_A_STVPMD; ++ break; ++ case 2: ++ instr->alu.add.op = V3D_QPU_A_STVPMP; ++ break; ++ default: ++ return false; ++ } ++ break; ++ default: ++ break; ++ } ++ ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_FADD: ++ case V3D_QPU_A_FADDNF: ++ case V3D_QPU_A_FSUB: ++ case V3D_QPU_A_FMIN: ++ case V3D_QPU_A_FMAX: ++ case V3D_QPU_A_FCMP: ++ case V3D_QPU_A_VFPACK: ++ if (instr->alu.add.op != V3D_QPU_A_VFPACK && ++ instr->alu.add.op != V3D_QPU_A_FCMP) { ++ instr->alu.add.output_pack = (op >> 4) & 0x3; ++ } else { ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ } ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, ++ &instr->alu.add.b.unpack)) { ++ return false; ++ } ++ break; ++ ++ case V3D_QPU_A_FFLOOR: ++ case V3D_QPU_A_FROUND: ++ case V3D_QPU_A_FTRUNC: ++ case V3D_QPU_A_FCEIL: ++ case V3D_QPU_A_FDX: ++ case V3D_QPU_A_FDY: ++ instr->alu.add.output_pack = raddr_b & 0x3; ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ break; ++ ++ case V3D_QPU_A_FTOIN: ++ case V3D_QPU_A_FTOIZ: ++ case V3D_QPU_A_FTOUZ: ++ case V3D_QPU_A_FTOC: ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ ++ if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ break; ++ ++ case V3D_QPU_A_VFMIN: ++ case V3D_QPU_A_VFMAX: ++ unreachable("pending v71 update"); ++ if (!v3d_qpu_float16_unpack_unpack(op & 0x7, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; ++ break; ++ ++ case V3D_QPU_A_MOV: ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ ++ if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ break; ++ ++ case V3D_QPU_A_FMOV: ++ instr->alu.add.output_pack = raddr_b & 0x3; ++ ++ /* Mul alu FMOV has one additional variant */ ++ int32_t unpack = (raddr_b >> 2) & 0x7; ++ if (unpack == 7) ++ return false; ++ ++ if (!v3d_qpu_float32_unpack_unpack(unpack, ++ &instr->alu.add.a.unpack)) { ++ return false; ++ } ++ break; ++ ++ default: ++ instr->alu.add.output_pack = V3D_QPU_PACK_NONE; ++ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; ++ break; ++ } ++ ++ instr->alu.add.a.raddr = raddr_a; ++ instr->alu.add.b.raddr = raddr_b; ++ instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); ++ ++ instr->alu.add.magic_write = false; ++ if (packed_inst & V3D_QPU_MA) { ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_LDVPMV_IN: ++ instr->alu.add.op = V3D_QPU_A_LDVPMV_OUT; ++ break; ++ case V3D_QPU_A_LDVPMD_IN: ++ instr->alu.add.op = V3D_QPU_A_LDVPMD_OUT; ++ break; ++ case 
V3D_QPU_A_LDVPMG_IN: ++ instr->alu.add.op = V3D_QPU_A_LDVPMG_OUT; ++ break; ++ default: ++ instr->alu.add.magic_write = true; ++ break; ++ } ++ } ++ ++ return true; ++} ++ ++static bool ++v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_qpu_add_unpack(devinfo, packed_inst, instr); ++ else ++ return v3d71_qpu_add_unpack(devinfo, packed_inst, instr); ++} ++ ++static bool ++v3d33_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) + { + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); + uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_A); +@@ -901,9 +1405,10 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + + { + const struct opcode_desc *desc = +- lookup_opcode_from_packed(devinfo, mul_ops, +- ARRAY_SIZE(mul_ops), +- op, mux_a, mux_b); ++ lookup_opcode_from_packed(devinfo, ++ mul_ops_v33, ++ ARRAY_SIZE(mul_ops_v33), ++ op, mux_a, mux_b, 0); + if (!desc) + return false; + +@@ -915,12 +1420,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, +- &instr->alu.mul.a_unpack)) { ++ &instr->alu.mul.a.unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, +- &instr->alu.mul.b_unpack)) { ++ &instr->alu.mul.b.unpack)) { + return false; + } + +@@ -931,7 +1436,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + ((mux_b >> 2) & 1)); + + if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3, +- &instr->alu.mul.a_unpack)) { ++ &instr->alu.mul.a.unpack)) { + return false; + } + +@@ -941,74 +1446,169 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7, +- &instr->alu.mul.a_unpack)) { ++ &instr->alu.mul.a.unpack)) { + return false; + } + +- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + + break; + + default: + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; +- instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE; +- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + break; + } + +- instr->alu.mul.a = mux_a; +- instr->alu.mul.b = mux_b; ++ instr->alu.mul.a.mux = mux_a; ++ instr->alu.mul.b.mux = mux_b; + instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); + instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; + + return true; + } + +-static const struct opcode_desc * +-lookup_opcode_from_instr(const struct v3d_device_info *devinfo, +- const struct opcode_desc *opcodes, size_t num_opcodes, +- uint8_t op) ++static bool ++v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) + { +- for (int i = 0; i < num_opcodes; i++) { +- const struct opcode_desc *op_desc = &opcodes[i]; +- +- if (op_desc->op != op) +- continue; ++ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); ++ uint32_t raddr_c = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_C); ++ uint32_t raddr_d = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_D); + +- if (opcode_invalid_in_version(devinfo, op_desc)) +- continue; ++ { ++ const struct opcode_desc *desc = ++ lookup_opcode_from_packed(devinfo, ++ 
mul_ops_v71, ++ ARRAY_SIZE(mul_ops_v71), ++ op, 0, 0, ++ raddr_d); ++ if (!desc) ++ return false; + +- return op_desc; ++ instr->alu.mul.op = desc->op; + } + +- return NULL; +-} +- ++ switch (instr->alu.mul.op) { ++ case V3D_QPU_M_FMUL: ++ instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1; ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, ++ &instr->alu.mul.a.unpack)) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, ++ &instr->alu.mul.b.unpack)) { ++ return false; ++ } ++ ++ break; ++ ++ case V3D_QPU_M_FMOV: ++ instr->alu.mul.output_pack = raddr_d & 0x3; ++ ++ if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7, ++ &instr->alu.mul.a.unpack)) { ++ return false; ++ } ++ ++ break; ++ ++ case V3D_QPU_M_VFMUL: ++ unreachable("pending v71 update"); ++ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; ++ ++ if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7, ++ &instr->alu.mul.a.unpack)) { ++ return false; ++ } ++ ++ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; ++ ++ break; ++ ++ case V3D_QPU_M_MOV: ++ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; ++ ++ if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7, ++ &instr->alu.mul.a.unpack)) { ++ return false; ++ } ++ break; ++ ++ default: ++ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; ++ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; ++ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; ++ break; ++ } ++ ++ instr->alu.mul.a.raddr = raddr_c; ++ instr->alu.mul.b.raddr = raddr_d; ++ instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); ++ instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; ++ ++ return true; ++} ++ + static bool +-v3d_qpu_add_pack(const struct v3d_device_info *devinfo, +- const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ++ struct v3d_qpu_instr *instr) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_qpu_mul_unpack(devinfo, packed_inst, instr); ++ else ++ return v3d71_qpu_mul_unpack(devinfo, packed_inst, instr); ++} ++ ++static const struct opcode_desc * ++lookup_opcode_from_instr(const struct v3d_device_info *devinfo, ++ const struct opcode_desc *opcodes, size_t num_opcodes, ++ uint8_t op) ++{ ++ for (int i = 0; i < num_opcodes; i++) { ++ const struct opcode_desc *op_desc = &opcodes[i]; ++ ++ if (op_desc->op != op) ++ continue; ++ ++ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver)) ++ continue; ++ ++ return op_desc; ++ } ++ ++ return NULL; ++} ++ ++static bool ++v3d33_qpu_add_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) + { + uint32_t waddr = instr->alu.add.waddr; +- uint32_t mux_a = instr->alu.add.a; +- uint32_t mux_b = instr->alu.add.b; ++ uint32_t mux_a = instr->alu.add.a.mux; ++ uint32_t mux_b = instr->alu.add.b.mux; + int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op); + const struct opcode_desc *desc = +- lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops), ++ lookup_opcode_from_instr(devinfo, add_ops_v33, ++ ARRAY_SIZE(add_ops_v33), + instr->alu.add.op); + + if (!desc) + return false; + +- uint32_t opcode = desc->opcode_first; ++ uint32_t opcode = opcode = desc->opcode_first; + + /* If an operation doesn't use an arg, its mux values may be used to + * identify the operation type. 
+ */ + if (nsrc < 2) +- mux_b = ffs(desc->mux_b_mask) - 1; ++ mux_b = ffs(desc->mux.b_mask) - 1; + + if (nsrc < 1) +- mux_a = ffs(desc->mux_a_mask) - 1; ++ mux_a = ffs(desc->mux.a_mask) - 1; + + bool no_magic_write = false; + +@@ -1061,12 +1661,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + } + opcode |= output_pack << 4; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &a_unpack)) { + return false; + } + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, + &b_unpack)) { + return false; + } +@@ -1100,23 +1700,23 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + uint32_t a_unpack; + uint32_t b_unpack; + +- if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS || +- instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) { ++ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS || ++ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) { + return false; + } + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &a_unpack)) { + return false; + } + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, + &b_unpack)) { + return false; + } + +- opcode = (opcode & ~(1 << 2)) | (a_unpack << 2); +- opcode = (opcode & ~(1 << 0)) | (b_unpack << 0); ++ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2); ++ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0); + + break; + } +@@ -1135,13 +1735,13 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + } + mux_b |= packed; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + if (packed == 0) + return false; +- opcode = (opcode & ~(1 << 2)) | packed << 2; ++ opcode = (opcode & ~(0x3 << 2)) | packed << 2; + break; + } + +@@ -1153,7 +1753,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + return false; + + uint32_t packed; +- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } +@@ -1166,11 +1766,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || +- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) { ++ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) { + return false; + } + +- if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack, ++ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } +@@ -1180,8 +1780,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + default: + if (instr->alu.add.op != V3D_QPU_A_NOP && + (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || +- instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE || +- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) { ++ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE || ++ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) { + return false; + } + break; +@@ -1198,15 +1798,280 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + } + + static bool +-v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, +- const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++{ ++ uint32_t waddr = instr->alu.add.waddr; ++ 
uint32_t raddr_a = instr->alu.add.a.raddr; ++ uint32_t raddr_b = instr->alu.add.b.raddr; ++ ++ int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op); ++ const struct opcode_desc *desc = ++ lookup_opcode_from_instr(devinfo, add_ops_v71, ++ ARRAY_SIZE(add_ops_v71), ++ instr->alu.add.op); ++ if (!desc) ++ return false; ++ ++ uint32_t opcode = opcode = desc->opcode_first; ++ ++ /* If an operation doesn't use an arg, its raddr values may be used to ++ * identify the operation type. ++ */ ++ if (nsrc < 2) ++ raddr_b = ffsll(desc->raddr_mask) - 1; ++ ++ bool no_magic_write = false; ++ ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_STVPMV: ++ waddr = 0; ++ no_magic_write = true; ++ break; ++ case V3D_QPU_A_STVPMD: ++ waddr = 1; ++ no_magic_write = true; ++ break; ++ case V3D_QPU_A_STVPMP: ++ waddr = 2; ++ no_magic_write = true; ++ break; ++ ++ case V3D_QPU_A_LDVPMV_IN: ++ case V3D_QPU_A_LDVPMD_IN: ++ case V3D_QPU_A_LDVPMP: ++ case V3D_QPU_A_LDVPMG_IN: ++ assert(!instr->alu.add.magic_write); ++ break; ++ ++ case V3D_QPU_A_LDVPMV_OUT: ++ case V3D_QPU_A_LDVPMD_OUT: ++ case V3D_QPU_A_LDVPMG_OUT: ++ assert(!instr->alu.add.magic_write); ++ *packed_instr |= V3D_QPU_MA; ++ break; ++ ++ default: ++ break; ++ } ++ ++ switch (instr->alu.add.op) { ++ case V3D_QPU_A_FADD: ++ case V3D_QPU_A_FADDNF: ++ case V3D_QPU_A_FSUB: ++ case V3D_QPU_A_FMIN: ++ case V3D_QPU_A_FMAX: ++ case V3D_QPU_A_FCMP: { ++ uint32_t output_pack; ++ uint32_t a_unpack; ++ uint32_t b_unpack; ++ ++ if (instr->alu.add.op != V3D_QPU_A_FCMP) { ++ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, ++ &output_pack)) { ++ return false; ++ } ++ opcode |= output_pack << 4; ++ } ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &a_unpack)) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, ++ &b_unpack)) { ++ return false; ++ } ++ ++ /* These operations with commutative operands are ++ * distinguished by the order of the operands come in. ++ */ ++ bool ordering = ++ instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a > ++ instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b; ++ if (((instr->alu.add.op == V3D_QPU_A_FMIN || ++ instr->alu.add.op == V3D_QPU_A_FADD) && ordering) || ++ ((instr->alu.add.op == V3D_QPU_A_FMAX || ++ instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) { ++ uint32_t temp; ++ ++ temp = a_unpack; ++ a_unpack = b_unpack; ++ b_unpack = temp; ++ ++ temp = raddr_a; ++ raddr_a = raddr_b; ++ raddr_b = temp; ++ ++ /* If we are swapping raddr_a/b we also need to swap ++ * small_imm_a/b. 
++ */ ++ if (instr->sig.small_imm_a || instr->sig.small_imm_b) { ++ assert(instr->sig.small_imm_a != ++ instr->sig.small_imm_b); ++ struct v3d_qpu_sig new_sig = instr->sig; ++ new_sig.small_imm_a = !instr->sig.small_imm_a; ++ new_sig.small_imm_b = !instr->sig.small_imm_b; ++ uint32_t sig; ++ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) ++ return false; ++ *packed_instr &= ~V3D_QPU_SIG_MASK; ++ *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); ++ } ++ } ++ ++ opcode |= a_unpack << 2; ++ opcode |= b_unpack << 0; ++ ++ break; ++ } ++ ++ case V3D_QPU_A_VFPACK: { ++ uint32_t a_unpack; ++ uint32_t b_unpack; ++ ++ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS || ++ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &a_unpack)) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, ++ &b_unpack)) { ++ return false; ++ } ++ ++ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2); ++ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0); ++ ++ break; ++ } ++ ++ case V3D_QPU_A_FFLOOR: ++ case V3D_QPU_A_FROUND: ++ case V3D_QPU_A_FTRUNC: ++ case V3D_QPU_A_FCEIL: ++ case V3D_QPU_A_FDX: ++ case V3D_QPU_A_FDY: { ++ uint32_t packed; ++ ++ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, ++ &packed)) { ++ return false; ++ } ++ raddr_b |= packed; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ if (packed == 0) ++ return false; ++ raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2; ++ break; ++ } ++ ++ case V3D_QPU_A_FTOIN: ++ case V3D_QPU_A_FTOIZ: ++ case V3D_QPU_A_FTOUZ: ++ case V3D_QPU_A_FTOC: ++ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE) ++ return false; ++ ++ uint32_t packed; ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ if (packed == 0) ++ return false; ++ ++ raddr_b |= (raddr_b & ~(0x3 << 2)) | packed << 2; ++ ++ break; ++ ++ case V3D_QPU_A_VFMIN: ++ case V3D_QPU_A_VFMAX: ++ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || ++ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) { ++ return false; ++ } ++ ++ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ opcode |= packed; ++ break; ++ ++ case V3D_QPU_A_MOV: { ++ uint32_t packed; ++ ++ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE) ++ return false; ++ ++ if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ ++ raddr_b |= packed << 2; ++ break; ++ } ++ ++ case V3D_QPU_A_FMOV: { ++ uint32_t packed; ++ ++ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, ++ &packed)) { ++ return false; ++ } ++ raddr_b = packed; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, ++ &packed)) { ++ return false; ++ } ++ raddr_b |= packed << 2; ++ break; ++ } ++ ++ default: ++ if (instr->alu.add.op != V3D_QPU_A_NOP && ++ (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || ++ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE || ++ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) { ++ return false; ++ } ++ break; ++ } ++ ++ *packed_instr |= QPU_SET_FIELD(raddr_a, V3D_QPU_RADDR_A); ++ *packed_instr |= QPU_SET_FIELD(raddr_b, V3D_QPU_RADDR_B); ++ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_ADD); ++ *packed_instr |= QPU_SET_FIELD(waddr, V3D_QPU_WADDR_A); ++ if (instr->alu.add.magic_write && !no_magic_write) ++ *packed_instr |= V3D_QPU_MA; ++ ++ return true; ++} ++ ++static bool ++v3d33_qpu_mul_pack(const struct 
v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) + { +- uint32_t mux_a = instr->alu.mul.a; +- uint32_t mux_b = instr->alu.mul.b; ++ uint32_t mux_a = instr->alu.mul.a.mux; ++ uint32_t mux_b = instr->alu.mul.b.mux; + int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); + + const struct opcode_desc *desc = +- lookup_opcode_from_instr(devinfo, mul_ops, ARRAY_SIZE(mul_ops), ++ lookup_opcode_from_instr(devinfo, mul_ops_v33, ++ ARRAY_SIZE(mul_ops_v33), + instr->alu.mul.op); + + if (!desc) +@@ -1218,10 +2083,10 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + * that here. If mux a/b determine packing, it will be set below. + */ + if (nsrc < 2) +- mux_b = ffs(desc->mux_b_mask) - 1; ++ mux_b = ffs(desc->mux.b_mask) - 1; + + if (nsrc < 1) +- mux_a = ffs(desc->mux_a_mask) - 1; ++ mux_a = ffs(desc->mux.a_mask) - 1; + + switch (instr->alu.mul.op) { + case V3D_QPU_M_FMUL: { +@@ -1236,13 +2101,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + */ + opcode += packed << 4; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + opcode |= packed << 2; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack, + &packed)) { + return false; + } +@@ -1260,7 +2125,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + opcode |= (packed >> 1) & 1; + mux_b = (packed & 1) << 2; + +- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack, ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } +@@ -1274,22 +2139,28 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) + return false; + +- if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack, ++ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } +- if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16) ++ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16) + opcode = 8; + else + opcode |= (packed + 4) & 7; + +- if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) ++ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) + return false; + + break; + } + + default: ++ if (instr->alu.mul.op != V3D_QPU_M_NOP && ++ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE || ++ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || ++ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) { ++ return false; ++ } + break; + } + +@@ -1304,6 +2175,150 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + return true; + } + ++static bool ++v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++{ ++ uint32_t raddr_c = instr->alu.mul.a.raddr; ++ uint32_t raddr_d = instr->alu.mul.b.raddr; ++ int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); ++ ++ const struct opcode_desc *desc = ++ lookup_opcode_from_instr(devinfo, mul_ops_v71, ++ ARRAY_SIZE(mul_ops_v71), ++ instr->alu.mul.op); ++ if (!desc) ++ return false; ++ ++ uint32_t opcode = desc->opcode_first; ++ ++ /* Some opcodes have a single valid value for their raddr_d, so set ++ * that here. If raddr_b determine packing, it will be set below. 
++ */ ++ if (nsrc < 2) ++ raddr_d = ffsll(desc->raddr_mask) - 1; ++ ++ switch (instr->alu.mul.op) { ++ case V3D_QPU_M_FMUL: { ++ uint32_t packed; ++ ++ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack, ++ &packed)) { ++ return false; ++ } ++ /* No need for a +1 because desc->opcode_first has a 1 in this ++ * field. ++ */ ++ opcode += packed << 4; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, ++ &packed)) { ++ return false; ++ } ++ opcode |= packed << 2; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack, ++ &packed)) { ++ return false; ++ } ++ opcode |= packed << 0; ++ break; ++ } ++ ++ case V3D_QPU_M_FMOV: { ++ uint32_t packed; ++ ++ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack, ++ &packed)) { ++ return false; ++ } ++ raddr_d |= packed; ++ ++ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, ++ &packed)) { ++ return false; ++ } ++ raddr_d |= packed << 2; ++ break; ++ } ++ ++ case V3D_QPU_M_VFMUL: { ++ unreachable("pending v71 update"); ++ uint32_t packed; ++ ++ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) ++ return false; ++ ++ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack, ++ &packed)) { ++ return false; ++ } ++ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16) ++ opcode = 8; ++ else ++ opcode |= (packed + 4) & 7; ++ ++ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) ++ return false; ++ ++ break; ++ } ++ ++ case V3D_QPU_M_MOV: { ++ uint32_t packed; ++ ++ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) ++ return false; ++ ++ if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack, ++ &packed)) { ++ return false; ++ } ++ ++ raddr_d |= packed << 2; ++ break; ++ } ++ ++ default: ++ if (instr->alu.mul.op != V3D_QPU_M_NOP && ++ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE || ++ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || ++ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) { ++ return false; ++ } ++ break; ++ } ++ ++ *packed_instr |= QPU_SET_FIELD(raddr_c, V3D_QPU_RADDR_C); ++ *packed_instr |= QPU_SET_FIELD(raddr_d, V3D_QPU_RADDR_D); ++ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_MUL); ++ *packed_instr |= QPU_SET_FIELD(instr->alu.mul.waddr, V3D_QPU_WADDR_M); ++ if (instr->alu.mul.magic_write) ++ *packed_instr |= V3D_QPU_MM; ++ ++ return true; ++} ++ ++static bool ++v3d_qpu_add_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_qpu_add_pack(devinfo, instr, packed_instr); ++ else ++ return v3d71_qpu_add_pack(devinfo, instr, packed_instr); ++} ++ ++static bool ++v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, ++ const struct v3d_qpu_instr *instr, uint64_t *packed_instr) ++{ ++ if (devinfo->ver < 71) ++ return v3d33_qpu_mul_pack(devinfo, instr, packed_instr); ++ else ++ return v3d71_qpu_mul_pack(devinfo, instr, packed_instr); ++} ++ + static bool + v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, + uint64_t packed_instr, +@@ -1332,8 +2347,14 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, + return false; + } + +- instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); +- instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); ++ if (devinfo->ver <= 71) { ++ /* ++ * For v71 this will be set on add/mul unpack, as raddr are now ++ * part of v3d_qpu_input ++ */ ++ instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); ++ instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); ++ } + + if (!v3d_qpu_add_unpack(devinfo, 
packed_instr, instr)) + return false; +@@ -1419,8 +2440,14 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo, + *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); + + if (instr->type == V3D_QPU_INSTR_TYPE_ALU) { +- *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); +- *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); ++ if (devinfo->ver < 71) { ++ /* ++ * For v71 this will be set on add/mul unpack, as raddr are now ++ * part of v3d_qpu_input ++ */ ++ *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); ++ *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); ++ } + + if (!v3d_qpu_add_pack(devinfo, instr, packed_instr)) + return false; +diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c +index 2f8e19c73fed..be7b78d5ef00 100644 +--- a/src/broadcom/qpu/tests/qpu_disasm.c ++++ b/src/broadcom/qpu/tests/qpu_disasm.c +@@ -160,10 +160,10 @@ main(int argc, char **argv) + /* Swap the operands to be sure that we test + * how the QPUs distinguish between these ops. + */ +- swap_mux(&instr.alu.add.a, +- &instr.alu.add.b); +- swap_pack(&instr.alu.add.a_unpack, +- &instr.alu.add.b_unpack); ++ swap_mux(&instr.alu.add.a.mux, ++ &instr.alu.add.b.mux); ++ swap_pack(&instr.alu.add.a.unpack, ++ &instr.alu.add.b.unpack); + break; + default: + break; +diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c +index eea5d3f050ea..c3802dd78575 100644 +--- a/src/broadcom/simulator/v3d_simulator.c ++++ b/src/broadcom/simulator/v3d_simulator.c +@@ -92,6 +92,9 @@ static struct v3d_simulator_state { + /** Last performance monitor ID. */ + uint32_t last_perfid; + ++ /** Total performance counters */ ++ uint32_t perfcnt_total; ++ + struct util_dynarray bin_oom; + int refcount; + } sim_state = { +@@ -436,15 +439,15 @@ v3d_simulator_perfmon_switch(int fd, uint32_t perfid) + + perfmon = v3d_get_simulator_perfmon(fd, file->active_perfid); + if (perfmon) +- v3d41_simulator_perfmon_stop(sim_state.v3d, +- perfmon->ncounters, +- perfmon->values); ++ v3d_X_simulator(perfmon_stop)(sim_state.v3d, ++ perfmon->ncounters, ++ perfmon->values); + + perfmon = v3d_get_simulator_perfmon(fd, perfid); + if (perfmon) +- v3d41_simulator_perfmon_start(sim_state.v3d, +- perfmon->ncounters, +- perfmon->counters); ++ v3d_X_simulator(perfmon_start)(sim_state.v3d, ++ perfmon->ncounters, ++ perfmon->counters); + + file->active_perfid = perfid; + } +@@ -489,11 +492,7 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit) + bin_fd = fd; + + v3d_simulator_perfmon_switch(fd, submit->perfmon_id); +- +- if (sim_state.ver >= 41) +- v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); +- else +- v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); ++ v3d_X_simulator(submit_cl_ioctl)(sim_state.v3d, submit, file->gmp->ofs); + + util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *, + sim_bo) { +@@ -632,15 +631,6 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args) + return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args); + } + +-static int +-v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args) +-{ +- if (sim_state.ver >= 41) +- return v3d41_simulator_get_param_ioctl(sim_state.v3d, args); +- else +- return v3d33_simulator_get_param_ioctl(sim_state.v3d, args); +-} +- + static int + v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args) + { +@@ -652,10 +642,7 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct 
drm_v3d_submit_tfu *args) + v3d_simulator_copy_in_handle(file, args->bo_handles[2]); + v3d_simulator_copy_in_handle(file, args->bo_handles[3]); + +- if (sim_state.ver >= 41) +- ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args); +- else +- ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args); ++ ret = v3d_X_simulator(submit_tfu_ioctl)(sim_state.v3d, args); + + v3d_simulator_copy_out_handle(file, args->bo_handles[0]); + +@@ -682,11 +669,8 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args) + + v3d_simulator_perfmon_switch(fd, args->perfmon_id); + +- if (sim_state.ver >= 41) +- ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args, +- file->gmp->ofs); +- else +- ret = -1; ++ ret = v3d_X_simulator(submit_csd_ioctl)(sim_state.v3d, args, ++ file->gmp->ofs); + + for (int i = 0; i < args->bo_handle_count; i++) + v3d_simulator_copy_out_handle(file, bo_handles[i]); +@@ -716,7 +700,7 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args) + + perfmon->ncounters = args->ncounters; + for (int i = 0; i < args->ncounters; i++) { +- if (args->counters[i] >= V3D_PERFCNT_NUM) { ++ if (args->counters[i] >= sim_state.perfcnt_total) { + ralloc_free(perfmon); + return -EINVAL; + } else { +@@ -797,7 +781,7 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args) + return 0; + + case DRM_IOCTL_V3D_GET_PARAM: +- return v3d_simulator_get_param_ioctl(fd, args); ++ return v3d_X_simulator(get_param_ioctl)(sim_state.v3d, args); + + case DRM_IOCTL_GEM_CLOSE: + return v3d_simulator_gem_close_ioctl(fd, args); +@@ -880,10 +864,8 @@ v3d_simulator_init_global() + + util_dynarray_init(&sim_state.bin_oom, NULL); + +- if (sim_state.ver >= 41) +- v3d41_simulator_init_regs(sim_state.v3d); +- else +- v3d33_simulator_init_regs(sim_state.v3d); ++ v3d_X_simulator(init_regs)(sim_state.v3d); ++ v3d_X_simulator(get_perfcnt_total)(&sim_state.perfcnt_total); + } + + struct v3d_simulator_file * +diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h +index ddb079c14559..923056344687 100644 +--- a/src/broadcom/simulator/v3d_simulator.h ++++ b/src/broadcom/simulator/v3d_simulator.h +@@ -52,6 +52,32 @@ uint32_t v3d_simulator_get_mem_free(void); + # define v3dX(x) v3d41_##x + # include "v3dx_simulator.h" + # undef v3dX ++ ++# define v3dX(x) v3d71_##x ++# include "v3dx_simulator.h" ++# undef v3dX ++ + #endif + ++/* Helper to call simulator ver specific functions */ ++#define v3d_X_simulator(thing) ({ \ ++ __typeof(&v3d33_simulator_##thing) v3d_X_sim_thing;\ ++ switch (sim_state.ver) { \ ++ case 33: \ ++ case 40: \ ++ v3d_X_sim_thing = &v3d33_simulator_##thing; \ ++ break; \ ++ case 41: \ ++ case 42: \ ++ v3d_X_sim_thing = &v3d41_simulator_##thing; \ ++ break; \ ++ case 71: \ ++ v3d_X_sim_thing = &v3d71_simulator_##thing; \ ++ break; \ ++ default: \ ++ unreachable("Unsupported hardware generation"); \ ++ } \ ++ v3d_X_sim_thing; \ ++}) ++ + #endif +diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c +index c9322f0397b8..904cf2d1b764 100644 +--- a/src/broadcom/simulator/v3dx_simulator.c ++++ b/src/broadcom/simulator/v3dx_simulator.c +@@ -40,17 +40,23 @@ + #include "v3d_simulator.h" + #include "v3d_simulator_wrapper.h" + ++#include "common/v3d_performance_counters.h" ++ + #include "util/macros.h" + #include "util/bitscan.h" + #include "drm-uapi/v3d_drm.h" + + #define HW_REGISTER_RO(x) (x) + #define HW_REGISTER_RW(x) (x) +-#if V3D_VERSION >= 41 +-#include 
"libs/core/v3d/registers/4.1.35.0/v3d.h" ++#if V3D_VERSION == 71 ++#include "libs/core/v3d/registers/7.1.6.0/v3d.h" ++#else ++#if V3D_VERSION == 41 || V3D_VERSION == 42 ++#include "libs/core/v3d/registers/4.2.14.0/v3d.h" + #else + #include "libs/core/v3d/registers/3.3.0.0/v3d.h" + #endif ++#endif + + #define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val) + #define V3D_READ(reg) v3d_hw_read_reg(v3d, reg) +@@ -178,38 +184,48 @@ v3d_flush_caches(struct v3d_hw *v3d) + v3d_flush_l2t(v3d); + } + ++#if V3D_VERSION < 71 ++#define TFU_REG(NAME) V3D_TFU_ ## NAME ++#else ++#define TFU_REG(NAME) V3D_IFC_ ## NAME ++#endif ++ ++ + int + v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d, + struct drm_v3d_submit_tfu *args) + { +- int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET; +- +- V3D_WRITE(V3D_TFU_IIA, args->iia); +- V3D_WRITE(V3D_TFU_IIS, args->iis); +- V3D_WRITE(V3D_TFU_ICA, args->ica); +- V3D_WRITE(V3D_TFU_IUA, args->iua); +- V3D_WRITE(V3D_TFU_IOA, args->ioa); +- V3D_WRITE(V3D_TFU_IOS, args->ios); +- V3D_WRITE(V3D_TFU_COEF0, args->coef[0]); +- V3D_WRITE(V3D_TFU_COEF1, args->coef[1]); +- V3D_WRITE(V3D_TFU_COEF2, args->coef[2]); +- V3D_WRITE(V3D_TFU_COEF3, args->coef[3]); +- +- V3D_WRITE(V3D_TFU_ICFG, args->icfg); +- +- while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { ++ int last_vtct = V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET; ++ ++ V3D_WRITE(TFU_REG(IIA), args->iia); ++ V3D_WRITE(TFU_REG(IIS), args->iis); ++ V3D_WRITE(TFU_REG(ICA), args->ica); ++ V3D_WRITE(TFU_REG(IUA), args->iua); ++ V3D_WRITE(TFU_REG(IOA), args->ioa); ++#if V3D_VERSION >= 71 ++ V3D_WRITE(TFU_REG(IOC), args->v71.ioc); ++#endif ++ V3D_WRITE(TFU_REG(IOS), args->ios); ++ V3D_WRITE(TFU_REG(COEF0), args->coef[0]); ++ V3D_WRITE(TFU_REG(COEF1), args->coef[1]); ++ V3D_WRITE(TFU_REG(COEF2), args->coef[2]); ++ V3D_WRITE(TFU_REG(COEF3), args->coef[3]); ++ ++ V3D_WRITE(TFU_REG(ICFG), args->icfg); ++ ++ while ((V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { + v3d_hw_tick(v3d); + } + + return 0; + } + +-#if V3D_VERSION >= 41 + int + v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, + struct drm_v3d_submit_csd *args, + uint32_t gmp_ofs) + { ++#if V3D_VERSION >= 41 + int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) & + V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET); + g_gmp_ofs = gmp_ofs; +@@ -223,6 +239,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, + V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]); + V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]); + V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]); ++#if V3D_VERSION >= 71 ++ V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0); ++#endif + /* CFG0 kicks off the job */ + V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]); + +@@ -239,8 +258,10 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, + v3d_flush_caches(v3d); + + return 0; +-} ++#else ++ return -1; + #endif ++} + + int + v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d, +@@ -310,16 +331,17 @@ v3d_isr_core(struct v3d_hw *v3d, + return; + } + ++#if V3D_VERSION <= 42 + if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) { + fprintf(stderr, "GMP violation at 0x%08x\n", + V3D_READ(V3D_GMP_VIO_ADDR)); +- abort(); + } else { + fprintf(stderr, + "Unexpected ISR with core status 0x%08x\n", + core_status); + } + abort(); ++#endif + } + + static void +@@ -396,6 +418,18 @@ v3d_isr_hub(struct v3d_hw *v3d) + } + + handle_mmu_interruptions(v3d, hub_status); ++ ++#if V3D_VERSION == 71 ++ if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) { ++ fprintf(stderr, "GMP violation at 0x%08x\n", ++ 
V3D_READ(V3D_GMP_VIO_ADDR)); ++ } else { ++ fprintf(stderr, ++ "Unexpected ISR with status 0x%08x\n", ++ hub_status); ++ } ++ abort(); ++#endif + } + + static void +@@ -436,8 +470,11 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d) + * for tracing. Perhaps we should evaluate to do the same here and add + * some debug options. + */ +- uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET | +- V3D_CTL_0_INT_STS_INT_OUTOMEM_SET); ++ uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET; ++#if V3D_VERSION <= 42 ++ core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET; ++#endif ++ + V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts); + V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts); + +@@ -447,6 +484,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d) + V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */ + V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */ + ++#if V3D_VERSION == 71 ++ hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET; ++#endif + V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts); + V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts); + +@@ -509,7 +549,8 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d, + #define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x)) + #define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8) + #define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \ +- V3D_PCTR_0_SRC_N_SHIFT(x) + 6)) ++ V3D_PCTR_0_SRC_N_SHIFT(x) + \ ++ V3D_PCTR_0_SRC_0_3_PCTRS0_MSB)) + #endif + + void +@@ -549,4 +590,9 @@ void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d, + #endif + } + ++void v3dX(simulator_get_perfcnt_total)(uint32_t *count) ++{ ++ *count = ARRAY_SIZE(v3d_performance_counters); ++} ++ + #endif /* USE_V3D_SIMULATOR */ +diff --git a/src/broadcom/simulator/v3dx_simulator.h b/src/broadcom/simulator/v3dx_simulator.h +index f7d2cc67b03a..51fc2409d3e2 100644 +--- a/src/broadcom/simulator/v3dx_simulator.h ++++ b/src/broadcom/simulator/v3dx_simulator.h +@@ -50,3 +50,4 @@ void v3dX(simulator_perfmon_start)(struct v3d_hw *v3d, + void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d, + uint32_t ncounters, + uint64_t *values); ++void v3dX(simulator_get_perfcnt_total)(uint32_t *count); +diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build +index ad032d832ad5..182388a35b4d 100644 +--- a/src/broadcom/vulkan/meson.build ++++ b/src/broadcom/vulkan/meson.build +@@ -27,6 +27,7 @@ v3dv_entrypoints = custom_target( + '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv', + '--beta', with_vulkan_beta.to_string(), + '--device-prefix', 'ver42', ++ '--device-prefix', 'ver71', + ], + depend_files : vk_entrypoints_gen_depend_files, + ) +@@ -64,13 +65,11 @@ files_per_version = files( + 'v3dvx_pipeline.c', + 'v3dvx_meta_common.c', + 'v3dvx_pipeline.c', ++ 'v3dvx_query.c', + 'v3dvx_queue.c', + ) + +-# The vulkan driver only supports version >= 42, which is the version present in +-# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d +-# driver. 
+-v3d_versions = ['42'] ++v3d_versions = ['42', '71'] + + v3dv_flags = [] + +diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c +index 96360a96b448..609c7acfa8f9 100644 +--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c +@@ -348,6 +348,7 @@ job_compute_frame_tiling(struct v3dv_job *job, + uint32_t layers, + uint32_t render_target_count, + uint8_t max_internal_bpp, ++ uint8_t total_color_bpp, + bool msaa, + bool double_buffer) + { +@@ -360,13 +361,16 @@ job_compute_frame_tiling(struct v3dv_job *job, + tiling->render_target_count = render_target_count; + tiling->msaa = msaa; + tiling->internal_bpp = max_internal_bpp; ++ tiling->total_color_bpp = total_color_bpp; + tiling->double_buffer = double_buffer; + + /* Double-buffer is incompatible with MSAA */ + assert(!tiling->msaa || !tiling->double_buffer); + +- v3d_choose_tile_size(render_target_count, max_internal_bpp, +- tiling->msaa, tiling->double_buffer, ++ v3d_choose_tile_size(&job->device->devinfo, ++ render_target_count, ++ max_internal_bpp, total_color_bpp, msaa, ++ tiling->double_buffer, + &tiling->tile_width, &tiling->tile_height); + + tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width); +@@ -457,6 +461,7 @@ v3dv_job_start_frame(struct v3dv_job *job, + bool allocate_tile_state_now, + uint32_t render_target_count, + uint8_t max_internal_bpp, ++ uint8_t total_color_bpp, + bool msaa) + { + assert(job); +@@ -467,7 +472,7 @@ v3dv_job_start_frame(struct v3dv_job *job, + const struct v3dv_frame_tiling *tiling = + job_compute_frame_tiling(job, width, height, layers, + render_target_count, max_internal_bpp, +- msaa, false); ++ total_color_bpp, msaa, false); + + v3dv_cl_ensure_space_with_branch(&job->bcl, 256); + v3dv_return_if_oom(NULL, job); +@@ -528,6 +533,7 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer) + job->frame_tiling.layers, + job->frame_tiling.render_target_count, + job->frame_tiling.internal_bpp, ++ job->frame_tiling.total_color_bpp, + job->frame_tiling.msaa, + true); + +@@ -1374,7 +1380,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) + } + + uint32_t att_count = 0; +- VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */ ++ VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */ + + /* We only need to emit subpass clears as draw calls for color attachments + * if the render area is not aligned to tile boundaries. 
+@@ -1672,10 +1678,11 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, + + const struct v3dv_framebuffer *framebuffer = state->framebuffer; + +- uint8_t internal_bpp; ++ uint8_t max_internal_bpp, total_color_bpp; + bool msaa; + v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa) +- (framebuffer, state->attachments, subpass, &internal_bpp, &msaa); ++ (framebuffer, state->attachments, subpass, ++ &max_internal_bpp, &total_color_bpp, &msaa); + + /* From the Vulkan spec: + * +@@ -1699,7 +1706,8 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, + layers, + true, false, + subpass->color_count, +- internal_bpp, ++ max_internal_bpp, ++ total_color_bpp, + msaa); + } + +@@ -2062,6 +2070,14 @@ cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer, + } + } + ++ if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BOUNDS)) { ++ if (memcmp(&dest->depth_bounds, &src->depth_bounds, ++ sizeof(src->depth_bounds))) { ++ memcpy(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds)); ++ dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS; ++ } ++ } ++ + if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) { + if (dest->line_width != src->line_width) { + dest->line_width = src->line_width; +@@ -2131,39 +2147,6 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer, + } + } + +-/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */ +-void +-v3dv_viewport_compute_xform(const VkViewport *viewport, +- float scale[3], +- float translate[3]) +-{ +- float x = viewport->x; +- float y = viewport->y; +- float half_width = 0.5f * viewport->width; +- float half_height = 0.5f * viewport->height; +- double n = viewport->minDepth; +- double f = viewport->maxDepth; +- +- scale[0] = half_width; +- translate[0] = half_width + x; +- scale[1] = half_height; +- translate[1] = half_height + y; +- +- scale[2] = (f - n); +- translate[2] = n; +- +- /* It seems that if the scale is small enough the hardware won't clip +- * correctly so we work around this my choosing the smallest scale that +- * seems to work. +- * +- * This case is exercised by CTS: +- * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero +- */ +- const float min_abs_scale = 0.000009f; +- if (fabs(scale[2]) < min_abs_scale) +- scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale; +-} +- + /* Considers the pipeline's negative_one_to_one state and applies it to the + * current viewport transform if needed to produce the resulting Z translate + * and scale parameters. 
+@@ -2216,9 +2199,10 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer, + viewportCount * sizeof(*pViewports)); + + for (uint32_t i = firstViewport; i < total_count; i++) { +- v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i], +- state->dynamic.viewport.scale[i], +- state->dynamic.viewport.translate[i]); ++ v3dv_X(cmd_buffer->device, viewport_compute_xform) ++ (&state->dynamic.viewport.viewports[i], ++ state->dynamic.viewport.scale[i], ++ state->dynamic.viewport.translate[i]); + } + + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT; +@@ -2699,6 +2683,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer) + true, false, + old_job->frame_tiling.render_target_count, + old_job->frame_tiling.internal_bpp, ++ old_job->frame_tiling.total_color_bpp, + true /* msaa */); + + v3dv_job_destroy(old_job); +@@ -2963,6 +2948,9 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer, + if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS)) + v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer); + ++ if (*dirty & V3DV_CMD_DIRTY_DEPTH_BOUNDS) ++ v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer); ++ + if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS)) + v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer); + +@@ -3392,9 +3380,11 @@ v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, + float minDepthBounds, + float maxDepthBounds) + { +- /* We do not support depth bounds testing so we just ignore this. We are +- * already asserting that pipelines don't enable the feature anyway. +- */ ++ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); ++ ++ cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds; ++ cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds; ++ cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS; + } + + VKAPI_ATTR void VKAPI_CALL +@@ -3826,6 +3816,7 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer) + + void + v3dv_cmd_buffer_rewrite_indirect_csd_job( ++ struct v3dv_device *device, + struct v3dv_csd_indirect_cpu_job_info *info, + const uint32_t *wg_counts) + { +@@ -3845,8 +3836,15 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job( + submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + +- submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) * +- (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; ++ uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) * ++ (wg_counts[0] * wg_counts[1] * wg_counts[2]); ++ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ ++ if (device->devinfo.ver < 71 || ++ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) { ++ submit->cfg[4] = num_batches - 1; ++ } else { ++ submit->cfg[4] = num_batches; ++ } + assert(submit->cfg[4] != ~0); + + if (info->needs_wg_uniform_rewrite) { +@@ -3879,6 +3877,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t **wg_uniform_offsets_out, + uint32_t *wg_size_out) + { ++ struct v3dv_device *device = cmd_buffer->device; + struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline; + assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); + struct v3dv_shader_variant *cs_variant = +@@ -3937,18 +3936,26 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, + if (wg_size_out) + *wg_size_out = wg_size; + +- submit->cfg[4] = num_batches - 1; ++ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ ++ if (device->devinfo.ver < 71 
|| ++ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) { ++ submit->cfg[4] = num_batches - 1; ++ } else { ++ submit->cfg[4] = num_batches; ++ } + assert(submit->cfg[4] != ~0); + + assert(pipeline->shared_data->assembly_bo); + struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo; + + submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset; +- submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; + if (cs_variant->prog_data.base->single_seg) + submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; + if (cs_variant->prog_data.base->threads == 4) + submit->cfg[5] |= V3D_CSD_CFG5_THREADING; ++ /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */ ++ if (device->devinfo.ver < 71) ++ submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; + + if (cs_variant->prog_data.cs->shared_size > 0) { + job->csd.shared_memory = +diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c +index d5de35176707..97eb220f5179 100644 +--- a/src/broadcom/vulkan/v3dv_device.c ++++ b/src/broadcom/vulkan/v3dv_device.c +@@ -214,7 +214,7 @@ get_features(const struct v3dv_physical_device *physical_device, + *features = (struct vk_features) { + /* Vulkan 1.0 */ + .robustBufferAccess = true, /* This feature is mandatory */ +- .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */ ++ .fullDrawIndexUint32 = physical_device->devinfo.ver >= 71, + .imageCubeArray = true, + .independentBlend = true, + .geometryShader = true, +@@ -224,10 +224,10 @@ get_features(const struct v3dv_physical_device *physical_device, + .logicOp = true, + .multiDrawIndirect = false, + .drawIndirectFirstInstance = true, +- .depthClamp = false, /* Only available since V3D 4.5.1.1 */ ++ .depthClamp = physical_device->devinfo.ver >= 71, + .depthBiasClamp = true, + .fillModeNonSolid = true, +- .depthBounds = false, /* Only available since V3D 4.3.16.2 */ ++ .depthBounds = physical_device->devinfo.ver >= 71, + .wideLines = true, + .largePoints = true, + .alphaToOne = true, +@@ -304,7 +304,7 @@ get_features(const struct v3dv_physical_device *physical_device, + * problematic, we would always have to scalarize. Overall, this would + * not lead to best performance so let's just not support it. + */ +- .scalarBlockLayout = false, ++ .scalarBlockLayout = physical_device->devinfo.ver >= 71, + /* This tells applications 2 things: + * + * 1. If they can select just one aspect for barriers. 
For us barriers +@@ -1123,8 +1123,10 @@ create_physical_device(struct v3dv_instance *instance, + device->next_program_id = 0; + + ASSERTED int len = +- asprintf(&device->name, "V3D %d.%d", +- device->devinfo.ver / 10, device->devinfo.ver % 10); ++ asprintf(&device->name, "V3D %d.%d.%d", ++ device->devinfo.ver / 10, ++ device->devinfo.ver % 10, ++ device->devinfo.rev); + assert(len != -1); + + v3dv_physical_device_init_disk_cache(device); +@@ -1279,7 +1281,8 @@ enumerate_devices(struct vk_instance *vk_instance) + if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) { + char **compat = devices[i]->deviceinfo.platform->compatible; + while (*compat) { +- if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) { ++ if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 || ++ strncmp(*compat, "brcm,2712-v3d", 13) == 0) { + v3d_idx = i; + break; + } +@@ -1288,8 +1291,9 @@ enumerate_devices(struct vk_instance *vk_instance) + } else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) { + char **compat = devices[i]->deviceinfo.platform->compatible; + while (*compat) { +- if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 || +- strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) { ++ if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 || ++ strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 || ++ strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) { + vc4_idx = i; + break; + } +@@ -1326,6 +1330,8 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev) + switch (dev->devinfo.ver) { + case 42: + return 0xBE485FD3; /* Broadcom deviceID for 2711 */ ++ case 71: ++ return 0x55701C33; /* Broadcom deviceID for 2712 */ + default: + unreachable("Unsupported V3D version"); + } +@@ -1354,6 +1360,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + const VkSampleCountFlags supported_sample_counts = + VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT; + ++ const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver); ++ + struct timespec clock_res; + clock_getres(CLOCK_MONOTONIC, &clock_res); + const float timestamp_period = +@@ -1424,7 +1432,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + .maxFragmentInputComponents = max_varying_components, + .maxFragmentOutputAttachments = 4, + .maxFragmentDualSrcAttachments = 0, +- .maxFragmentCombinedOutputResources = MAX_RENDER_TARGETS + ++ .maxFragmentCombinedOutputResources = max_rts + + MAX_STORAGE_BUFFERS + + MAX_STORAGE_IMAGES, + +@@ -1437,7 +1445,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + .subPixelPrecisionBits = V3D_COORD_SHIFT, + .subTexelPrecisionBits = 8, + .mipmapPrecisionBits = 8, +- .maxDrawIndexedIndexValue = 0x00ffffff, ++ .maxDrawIndexedIndexValue = pdevice->devinfo.ver >= 71 ? 
++ 0xffffffff : 0x00ffffff, + .maxDrawIndirectCount = 0x7fffffff, + .maxSamplerLodBias = 14.0f, + .maxSamplerAnisotropy = 16.0f, +@@ -1464,7 +1473,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + .framebufferDepthSampleCounts = supported_sample_counts, + .framebufferStencilSampleCounts = supported_sample_counts, + .framebufferNoAttachmentsSampleCounts = supported_sample_counts, +- .maxColorAttachments = MAX_RENDER_TARGETS, ++ .maxColorAttachments = max_rts, + .sampledImageColorSampleCounts = supported_sample_counts, + .sampledImageIntegerSampleCounts = supported_sample_counts, + .sampledImageDepthSampleCounts = supported_sample_counts, +@@ -2031,7 +2040,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, + v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0, + device->instance->default_pipeline_cache_enabled); + device->default_attribute_float = +- v3dv_pipeline_create_default_attribute_values(device, NULL); ++ v3dv_X(device, create_default_attribute_values)(device, NULL); + + device->device_address_mem_ctx = ralloc_context(NULL); + util_dynarray_init(&device->device_address_bo_list, +@@ -2975,7 +2984,7 @@ v3dv_CreateSampler(VkDevice _device, + } + } + +- v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info); ++ v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info); + + *pSampler = v3dv_sampler_to_handle(sampler); + +diff --git a/src/broadcom/vulkan/v3dv_image.c b/src/broadcom/vulkan/v3dv_image.c +index ebbd60e4c03c..e01e2e1bd197 100644 +--- a/src/broadcom/vulkan/v3dv_image.c ++++ b/src/broadcom/vulkan/v3dv_image.c +@@ -671,7 +671,6 @@ create_image_view(struct v3dv_device *device, + * makes sense to implement swizzle composition using VkSwizzle directly. + */ + VkFormat format; +- uint8_t image_view_swizzle[4]; + if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT && + range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { + format = VK_FORMAT_R8G8B8A8_UINT; +@@ -682,11 +681,11 @@ create_image_view(struct v3dv_device *device, + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle); + + util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle, +- image_view_swizzle); ++ iview->view_swizzle); + } else { + format = pCreateInfo->format; + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, +- image_view_swizzle); ++ iview->view_swizzle); + } + + iview->vk.view_format = format; +@@ -711,7 +710,7 @@ create_image_view(struct v3dv_device *device, + + const uint8_t *format_swizzle = + v3dv_get_format_swizzle(device, format, plane); +- util_format_compose_swizzles(format_swizzle, image_view_swizzle, ++ util_format_compose_swizzles(format_swizzle, iview->view_swizzle, + iview->planes[plane].swizzle); + + iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle); +diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h +index 9cda9f0d6d28..8ac997241058 100644 +--- a/src/broadcom/vulkan/v3dv_limits.h ++++ b/src/broadcom/vulkan/v3dv_limits.h +@@ -50,8 +50,6 @@ + #define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \ + MAX_DYNAMIC_STORAGE_BUFFERS) + +-#define MAX_RENDER_TARGETS 4 +- + #define MAX_MULTIVIEW_VIEW_COUNT 16 + + /* These are tunable parameters in the HW design, but all the V3D +diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c +index d376c179e1c2..1c0d66c977cc 100644 +--- a/src/broadcom/vulkan/v3dv_meta_clear.c ++++ b/src/broadcom/vulkan/v3dv_meta_clear.c +@@ -127,6 +127,7 @@ 
clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + + v3dv_job_start_frame(job, width, height, max_layer, + false, true, 1, internal_bpp, ++ 4 * v3d_internal_bpp_words(internal_bpp), + image->vk.samples > VK_SAMPLE_COUNT_1_BIT); + + struct v3dv_meta_framebuffer framebuffer; +@@ -747,7 +748,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx, + uint32_t bit_offset = 0; + + key |= rt_idx; +- bit_offset += 2; ++ bit_offset += 3; + + key |= ((uint64_t) format) << bit_offset; + bit_offset += 32; +@@ -1189,9 +1190,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, + { + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + +- /* We can only clear attachments in the current subpass */ +- assert(attachmentCount <= 5); /* 4 color + D/S */ ++ /* We can have at most max_color_RTs + 1 D/S attachments */ ++ assert(attachmentCount <= ++ V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1); + ++ /* We can only clear attachments in the current subpass */ + struct v3dv_render_pass *pass = cmd_buffer->state.pass; + + assert(cmd_buffer->state.subpass_idx < pass->subpass_count); +diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c +index c0ec888b8c70..2d30c611e175 100644 +--- a/src/broadcom/vulkan/v3dv_meta_copy.c ++++ b/src/broadcom/vulkan/v3dv_meta_copy.c +@@ -453,8 +453,9 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, + const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); + const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); + +- v3dv_job_start_frame(job, width, height, num_layers, false, true, +- 1, internal_bpp, false); ++ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ false); + + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, +@@ -1323,8 +1324,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); + const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); + +- v3dv_job_start_frame(job, width, height, num_layers, +- false, true, 1, internal_bpp, ++ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + src->vk.samples > VK_SAMPLE_COUNT_1_BIT); + + struct v3dv_meta_framebuffer framebuffer; +@@ -1978,8 +1979,9 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); + const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); + +- v3dv_job_start_frame(job, width, height, num_layers, false, true, +- 1, internal_bpp, false); ++ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ false); + + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, +@@ -4884,8 +4886,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + (fb_format, region->srcSubresource.aspectMask, + &internal_type, &internal_bpp); + +- v3dv_job_start_frame(job, width, height, num_layers, false, true, +- 1, internal_bpp, true); ++ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ true); + + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, 
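/* Editor's sketch -- added for this review, not part of the upstream patch.
 * Why the get_color_clear_pipeline_cache_key() hunk above widens the rt_idx
 * field from 2 to 3 bits: with V3D 7.1 a subpass can reference more than 4
 * color attachments (see the V3D_MAX_RENDER_TARGETS() assert added to
 * v3dv_CmdClearAttachments above), so the render target index has to encode
 * values up to 7. Only the two fields visible in the hunk are reproduced
 * here; the rest of the key layout is unchanged.
 */
static inline uint64_t
sketch_color_clear_key_prefix(uint32_t rt_idx, uint32_t format)
{
   uint64_t key = 0;
   uint32_t bit_offset = 0;

   key |= rt_idx;                             /* bits [0..2]: rt index 0..7 */
   bit_offset += 3;

   key |= ((uint64_t) format) << bit_offset;  /* bits [3..34]: VkFormat */
   bit_offset += 32;

   return key;
}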
fb_format, +diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c +index 20f5014268df..0583faf6f9a2 100644 +--- a/src/broadcom/vulkan/v3dv_pass.c ++++ b/src/broadcom/vulkan/v3dv_pass.c +@@ -236,11 +236,13 @@ v3dv_CreateRenderPass2(VkDevice _device, + + /* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa), + * the clear might get lost. If a subpass has this then we can't emit +- * the clear using the TLB and we have to do it as a draw call. ++ * the clear using the TLB and we have to do it as a draw call. This ++ * issue is fixed since V3D 4.3.18. + * + * FIXME: separate stencil. + */ +- if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { ++ if (device->devinfo.ver == 42 && ++ subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { + struct v3dv_render_pass_attachment *att = + &pass->attachments[subpass->ds_attachment.attachment]; + if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) { +@@ -320,11 +322,12 @@ subpass_get_granularity(struct v3dv_device *device, + /* Granularity is defined by the tile size */ + assert(subpass_idx < pass->subpass_count); + struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx]; +- const uint32_t color_attachment_count = subpass->color_count; ++ const uint32_t color_count = subpass->color_count; + + bool msaa = false; +- uint32_t max_bpp = 0; +- for (uint32_t i = 0; i < color_attachment_count; i++) { ++ uint32_t max_internal_bpp = 0; ++ uint32_t total_color_bpp = 0; ++ for (uint32_t i = 0; i < color_count; i++) { + uint32_t attachment_idx = subpass->color_attachments[i].attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + continue; +@@ -337,7 +340,8 @@ subpass_get_granularity(struct v3dv_device *device, + v3dv_X(device, get_internal_type_bpp_for_output_format) + (format->planes[0].rt_type, &internal_type, &internal_bpp); + +- max_bpp = MAX2(max_bpp, internal_bpp); ++ max_internal_bpp = MAX2(max_internal_bpp, internal_bpp); ++ total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + + if (desc->samples > VK_SAMPLE_COUNT_1_BIT) + msaa = true; +@@ -347,7 +351,8 @@ subpass_get_granularity(struct v3dv_device *device, + * heuristics so we choose a conservative granularity here, with it disabled. + */ + uint32_t width, height; +- v3d_choose_tile_size(color_attachment_count, max_bpp, msaa, ++ v3d_choose_tile_size(&device->devinfo, color_count, ++ max_internal_bpp, total_color_bpp, msaa, + false /* double-buffer */, &width, &height); + *granularity = (VkExtent2D) { + .width = width, +diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c +index 22f01bdf64bd..ba782b8268a8 100644 +--- a/src/broadcom/vulkan/v3dv_pipeline.c ++++ b/src/broadcom/vulkan/v3dv_pipeline.c +@@ -2608,13 +2608,8 @@ v3dv_dynamic_state_mask(VkDynamicState state) + return V3DV_DYNAMIC_LINE_WIDTH; + case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT: + return V3DV_DYNAMIC_COLOR_WRITE_ENABLE; +- +- /* Depth bounds testing is not available in in V3D 4.2 so here we are just +- * ignoring this dynamic state. We are already asserting at pipeline creation +- * time that depth bounds testing is not enabled. 
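/* Editor's note -- added for this review, not part of the upstream patch.
 * A worked example of the new total_color_bpp bookkeeping in
 * subpass_get_granularity() above, assuming v3d_internal_bpp_words() maps
 * V3D_INTERNAL_BPP_32/64/128 to 1/2/4 32-bit words. For a subpass with one
 * 32-bit and one 64-bit internal-format color attachment:
 *
 *   total_color_bpp  = 4 * 1 + 4 * 2 = 12 bytes of color tile buffer per pixel
 *   max_internal_bpp = V3D_INTERNAL_BPP_64
 *
 * Both values are now passed to v3d_choose_tile_size(), presumably so that
 * V3D 7.1 can size tiles against the combined footprint of all render
 * targets rather than only the widest one.
 */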
+- */ + case VK_DYNAMIC_STATE_DEPTH_BOUNDS: +- return 0; ++ return V3DV_DYNAMIC_DEPTH_BOUNDS; + + default: + unreachable("Unhandled dynamic state"); +@@ -2632,6 +2627,7 @@ pipeline_init_dynamic_state( + const VkPipelineColorWriteCreateInfoEXT *pColorWriteState) + { + /* Initialize to default values */ ++ const struct v3d_device_info *devinfo = &pipeline->device->devinfo; + struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state; + memset(dynamic, 0, sizeof(*dynamic)); + dynamic->stencil_compare_mask.front = ~0; +@@ -2639,7 +2635,9 @@ pipeline_init_dynamic_state( + dynamic->stencil_write_mask.front = ~0; + dynamic->stencil_write_mask.back = ~0; + dynamic->line_width = 1.0f; +- dynamic->color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1; ++ dynamic->color_write_enable = ++ (1ull << (4 * V3D_MAX_RENDER_TARGETS(devinfo->ver))) - 1; ++ dynamic->depth_bounds.max = 1.0f; + + /* Create a mask of enabled dynamic states */ + uint32_t dynamic_states = 0; +@@ -2661,9 +2659,10 @@ pipeline_init_dynamic_state( + pViewportState->viewportCount); + + for (uint32_t i = 0; i < dynamic->viewport.count; i++) { +- v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i], +- dynamic->viewport.scale[i], +- dynamic->viewport.translate[i]); ++ v3dv_X(pipeline->device, viewport_compute_xform) ++ (&dynamic->viewport.viewports[i], ++ dynamic->viewport.scale[i], ++ dynamic->viewport.translate[i]); + } + } + +@@ -2691,6 +2690,11 @@ pipeline_init_dynamic_state( + dynamic->stencil_reference.front = pDepthStencilState->front.reference; + dynamic->stencil_reference.back = pDepthStencilState->back.reference; + } ++ ++ if (!(dynamic_states & V3DV_DYNAMIC_DEPTH_BOUNDS)) { ++ dynamic->depth_bounds.min = pDepthStencilState->minDepthBounds; ++ dynamic->depth_bounds.max = pDepthStencilState->maxDepthBounds; ++ } + } + + if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) { +@@ -2802,62 +2806,6 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline, + } + } + +-static bool +-pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) +-{ +- for (uint8_t i = 0; i < pipeline->va_count; i++) { +- if (vk_format_is_int(pipeline->va[i].vk_format)) +- return true; +- } +- return false; +-} +- +-/* @pipeline can be NULL. We assume in that case that all the attributes have +- * a float format (we only create an all-float BO once and we reuse it with +- * all float pipelines), otherwise we look at the actual type of each +- * attribute used with the specific pipeline passed in. +- */ +-struct v3dv_bo * +-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, +- struct v3dv_pipeline *pipeline) +-{ +- uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; +- struct v3dv_bo *bo; +- +- bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); +- +- if (!bo) { +- fprintf(stderr, "failed to allocate memory for the default " +- "attribute values\n"); +- return NULL; +- } +- +- bool ok = v3dv_bo_map(device, bo, size); +- if (!ok) { +- fprintf(stderr, "failed to map default attribute values buffer\n"); +- return false; +- } +- +- uint32_t *attrs = bo->map; +- uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; +- for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { +- attrs[i * 4 + 0] = 0; +- attrs[i * 4 + 1] = 0; +- attrs[i * 4 + 2] = 0; +- VkFormat attr_format = +- pipeline != NULL ? 
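/* Editor's note -- added for this review, not part of the upstream patch.
 * The default color_write_enable mask above is now derived from
 * V3D_MAX_RENDER_TARGETS(): one bit per channel per render target. On V3D
 * 4.2 that is 4 RTs * 4 channels = 16 bits, matching the COLOR_WRITE_MASKS
 * hunk later in this patch that masks with 0xffff on <= 4.2; on 7.1 the
 * macro presumably yields 8 RTs, giving a full 32-bit mask:
 *
 *   (1ull << (4 * 4)) - 1 = 0x000000000000ffff   (V3D 4.2)
 *   (1ull << (4 * 8)) - 1 = 0x00000000ffffffff   (V3D 7.1, assumed RT count)
 */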
pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; +- if (i < va_count && vk_format_is_int(attr_format)) { +- attrs[i * 4 + 3] = 1; +- } else { +- attrs[i * 4 + 3] = fui(1.0); +- } +- } +- +- v3dv_bo_unmap(device, bo); +- +- return bo; +-} +- + static void + pipeline_set_sample_mask(struct v3dv_pipeline *pipeline, + const VkPipelineMultisampleStateCreateInfo *ms_info) +@@ -2960,7 +2908,9 @@ pipeline_init(struct v3dv_pipeline *pipeline, + /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that + * feature and it shouldn't be used by any pipeline. + */ +- assert(!ds_info || !ds_info->depthBoundsTestEnable); ++ assert(device->devinfo.ver >= 71 || ++ !ds_info || !ds_info->depthBoundsTestEnable); ++ pipeline->depth_bounds_test_enabled = ds_info && ds_info->depthBoundsTestEnable; + + enable_depth_bias(pipeline, rs_info); + +@@ -2992,9 +2942,10 @@ pipeline_init(struct v3dv_pipeline *pipeline, + + v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info); + +- if (pipeline_has_integer_vertex_attrib(pipeline)) { ++ if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) { + pipeline->default_attribute_values = +- v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline); ++ v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline); ++ + if (!pipeline->default_attribute_values) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } else { +diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h +index c67072115293..43b14ec1ade3 100644 +--- a/src/broadcom/vulkan/v3dv_private.h ++++ b/src/broadcom/vulkan/v3dv_private.h +@@ -123,6 +123,9 @@ struct v3d_simulator_file; + /* Minimum required by the Vulkan 1.1 spec */ + #define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30) + ++/* Maximum performance counters number */ ++#define V3D_MAX_PERFCNT 93 ++ + struct v3dv_physical_device { + struct vk_physical_device vk; + +@@ -581,6 +584,9 @@ struct v3dv_device { + * being float being float, allowing us to reuse the same BO for all + * pipelines matching this requirement. Pipelines that need integer + * attributes will create their own BO. ++ * ++ * Note that since v71 the default attribute values are not needed, so this ++ * can be NULL. + */ + struct v3dv_bo *default_attribute_float; + +@@ -772,6 +778,8 @@ struct v3dv_image_view { + + const struct v3dv_format *format; + ++ uint8_t view_swizzle[4]; ++ + uint8_t plane_count; + struct { + uint8_t image_plane; +@@ -782,8 +790,8 @@ struct v3dv_image_view { + uint32_t internal_type; + uint32_t offset; + +- /* Precomputed (composed from createinfo->components and formar swizzle) +- * swizzles to pass in to the shader key. ++ /* Precomputed swizzle (composed from the view swizzle and the format ++ * swizzle). + * + * This could be also included on the descriptor bo, but the shader state + * packet doesn't need it on a bo, so we can just avoid a memory copy +@@ -946,6 +954,7 @@ struct v3dv_frame_tiling { + uint32_t layers; + uint32_t render_target_count; + uint32_t internal_bpp; ++ uint32_t total_color_bpp; + bool msaa; + bool double_buffer; + uint32_t tile_width; +@@ -1040,7 +1049,8 @@ enum v3dv_dynamic_state_bits { + V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6, + V3DV_DYNAMIC_LINE_WIDTH = 1 << 7, + V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8, +- V3DV_DYNAMIC_ALL = (1 << 9) - 1, ++ V3DV_DYNAMIC_DEPTH_BOUNDS = 1 << 9, ++ V3DV_DYNAMIC_ALL = (1 << 10) - 1, + }; + + /* Flags for dirty pipeline state. 
+@@ -1065,6 +1075,7 @@ enum v3dv_cmd_dirty_bits { + V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 16, + V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 17, + V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 18, ++ V3DV_CMD_DIRTY_DEPTH_BOUNDS = 1 << 19, + }; + + struct v3dv_dynamic_state { +@@ -1101,6 +1112,11 @@ struct v3dv_dynamic_state { + float slope_factor; + } depth_bias; + ++ struct { ++ float min; ++ float max; ++ } depth_bounds; ++ + float line_width; + + uint32_t color_write_enable; +@@ -1196,7 +1212,7 @@ struct v3dv_timestamp_query_cpu_job_info { + }; + + /* Number of perfmons required to handle all supported performance counters */ +-#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \ ++#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \ + DRM_V3D_MAX_PERF_COUNTERS) + + struct v3dv_perf_query { +@@ -1369,6 +1385,7 @@ void v3dv_job_start_frame(struct v3dv_job *job, + bool allocate_tile_state_now, + uint32_t render_target_count, + uint8_t max_internal_bpp, ++ uint8_t total_color_bpp, + bool msaa); + + bool v3dv_job_type_is_gpu(struct v3dv_job *job); +@@ -1667,7 +1684,7 @@ struct v3dv_query_pool { + /* Only used with performance queries */ + struct { + uint32_t ncounters; +- uint8_t counters[V3D_PERFCNT_NUM]; ++ uint8_t counters[V3D_MAX_PERFCNT]; + + /* V3D has a limit on the number of counters we can track in a + * single performance monitor, so if too many counters are requested +@@ -1803,7 +1820,8 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer, + void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, + struct drm_v3d_submit_tfu *tfu); + +-void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info, ++void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device, ++ struct v3dv_csd_indirect_cpu_job_info *info, + const uint32_t *wg_counts); + + void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer, +@@ -2289,7 +2307,8 @@ struct v3dv_pipeline { + unsigned char sha1[20]; + + /* In general we can reuse v3dv_device->default_attribute_float, so note +- * that the following can be NULL. ++ * that the following can be NULL. In 7.x this is not used, so it will be ++ * always NULL. + * + * FIXME: the content of this BO will be small, so it could be improved to + * be uploaded to a common BO. 
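/* Editor's note -- added for this review, not part of the upstream patch.
 * With the new V3D_MAX_PERFCNT of 93 defined earlier in this hunk, and
 * assuming the DRM UAPI limit DRM_V3D_MAX_PERF_COUNTERS is 32 counters per
 * perfmon, the worst case works out to:
 *
 *   V3DV_MAX_PERFMONS = DIV_ROUND_UP(93, 32) = 3
 *
 * i.e. a performance query that enables every supported counter is spread
 * across three kernel performance monitors.
 */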
But as in most cases it will be NULL, it is +@@ -2323,6 +2342,9 @@ struct v3dv_pipeline { + bool is_z16; + } depth_bias; + ++ /* Depth bounds */ ++ bool depth_bounds_test_enabled; ++ + struct { + void *mem_ctx; + struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */ +@@ -2338,6 +2360,13 @@ struct v3dv_pipeline { + uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH]; + }; + ++static inline bool ++v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device) ++{ ++ return device->devinfo.ver > 71 || ++ (device->devinfo.ver == 71 && device->devinfo.rev >= 5); ++} ++ + static inline VkPipelineBindPoint + v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline) + { +@@ -2500,10 +2529,6 @@ void + v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline, + struct v3dv_pipeline_cache *cache); + +-struct v3dv_bo * +-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, +- struct v3dv_pipeline *pipeline); +- + VkResult + v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device, + nir_shader *nir, +@@ -2608,12 +2633,32 @@ u64_compare(const void *key1, const void *key2) + case 42: \ + v3d_X_thing = &v3d42_##thing; \ + break; \ ++ case 71: \ ++ v3d_X_thing = &v3d71_##thing; \ ++ break; \ + default: \ + unreachable("Unsupported hardware generation"); \ + } \ + v3d_X_thing; \ + }) + ++/* Helper to get hw-specific macro values */ ++#define V3DV_X(device, thing) ({ \ ++ __typeof(V3D42_##thing) V3D_X_THING; \ ++ switch (device->devinfo.ver) { \ ++ case 42: \ ++ V3D_X_THING = V3D42_##thing; \ ++ break; \ ++ case 71: \ ++ V3D_X_THING = V3D71_##thing; \ ++ break; \ ++ default: \ ++ unreachable("Unsupported hardware generation"); \ ++ } \ ++ V3D_X_THING; \ ++}) ++ ++ + + /* v3d_macros from common requires v3dX and V3DX definitions. 
Below we need to + * define v3dX for each version supported, because when we compile code that +@@ -2626,6 +2671,10 @@ u64_compare(const void *key1, const void *key2) + # define v3dX(x) v3d42_##x + # include "v3dvx_private.h" + # undef v3dX ++ ++# define v3dX(x) v3d71_##x ++# include "v3dvx_private.h" ++# undef v3dX + #endif + + #ifdef ANDROID +diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c +index 3284c467d749..deb7821f02b9 100644 +--- a/src/broadcom/vulkan/v3dv_query.c ++++ b/src/broadcom/vulkan/v3dv_query.c +@@ -23,7 +23,6 @@ + + #include "v3dv_private.h" + +-#include "common/v3d_performance_counters.h" + #include "util/timespec.h" + #include "compiler/nir/nir_builder.h" + +@@ -48,7 +47,7 @@ kperfmon_create(struct v3dv_device *device, + DRM_IOCTL_V3D_PERFMON_CREATE, + &req); + if (ret) +- fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret)); ++ fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret)); + + pool->queries[query].perf.kperfmon_ids[i] = req.id; + } +@@ -303,7 +302,6 @@ v3dv_CreateQueryPool(VkDevice _device, + QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); + + assert(pq_info); +- assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM); + + pool->perfmon.ncounters = pq_info->counterIndexCount; + for (uint32_t i = 0; i < pq_info->counterIndexCount; i++) +@@ -592,7 +590,7 @@ write_performance_query_result(struct v3dv_device *device, + assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + + struct v3dv_query *q = &pool->queries[query]; +- uint64_t counter_values[V3D_PERFCNT_NUM]; ++ uint64_t counter_values[V3D_MAX_PERFCNT]; + + for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { + struct drm_v3d_perfmon_get_values req = { +@@ -1284,40 +1282,11 @@ v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( + VkPerformanceCounterKHR *pCounters, + VkPerformanceCounterDescriptionKHR *pCounterDescriptions) + { +- uint32_t desc_count = *pCounterCount; ++ V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice); + +- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, +- out, pCounters, pCounterCount); +- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, +- out_desc, pCounterDescriptions, &desc_count); +- +- for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) { +- vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { +- counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; +- counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; +- counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; +- +- unsigned char sha1_result[20]; +- _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME], +- strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]), +- sha1_result); +- +- memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); +- } +- +- vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, +- &out_desc, desc) { +- desc->flags = 0; +- snprintf(desc->name, sizeof(desc->name), "%s", +- v3d_performance_counters[i][V3D_PERFCNT_NAME]); +- snprintf(desc->category, sizeof(desc->category), "%s", +- v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]); +- snprintf(desc->description, sizeof(desc->description), "%s", +- v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]); +- } +- } +- +- return vk_outarray_status(&out); ++ return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount, ++ pCounters, ++ pCounterDescriptions); + } + + VKAPI_ATTR void VKAPI_CALL +diff --git a/src/broadcom/vulkan/v3dv_queue.c 
b/src/broadcom/vulkan/v3dv_queue.c +index b4aae1951806..429d14a91966 100644 +--- a/src/broadcom/vulkan/v3dv_queue.c ++++ b/src/broadcom/vulkan/v3dv_queue.c +@@ -408,7 +408,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, + + if (memcmp(group_counts, info->csd_job->csd.wg_count, + sizeof(info->csd_job->csd.wg_count)) != 0) { +- v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts); ++ v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts); + } + + return VK_SUCCESS; +diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c +index 72fa9a1b39c5..6e5adc368a87 100644 +--- a/src/broadcom/vulkan/v3dv_uniforms.c ++++ b/src/broadcom/vulkan/v3dv_uniforms.c +@@ -497,7 +497,6 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect); + + struct v3dv_cl_out *uniforms = cl_start(&job->indirect); +- + for (int i = 0; i < uinfo->count; i++) { + uint32_t data = uinfo->data[i]; + +@@ -519,13 +518,17 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, + cmd_buffer, pipeline, variant->stage); + break; + +- case QUNIFORM_VIEWPORT_X_SCALE: +- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f); ++ case QUNIFORM_VIEWPORT_X_SCALE: { ++ float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); ++ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity); + break; ++ } + +- case QUNIFORM_VIEWPORT_Y_SCALE: +- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f); ++ case QUNIFORM_VIEWPORT_Y_SCALE: { ++ float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); ++ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity); + break; ++ } + + case QUNIFORM_VIEWPORT_Z_OFFSET: { + float translate_z; +diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +index f182b790d363..011f5c8e1010 100644 +--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c ++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c +@@ -56,10 +56,15 @@ v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job) + }; + config.width_in_pixels = tiling->width; + config.height_in_pixels = tiling->height; ++#if V3D_VERSION == 42 + config.number_of_render_targets = MAX2(tiling->render_target_count, 1); + config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ unreachable("HW generation 71 not supported yet."); ++#endif + + uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr; + cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config); +@@ -82,10 +87,22 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job, + cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { + config.width_in_pixels = tiling->width; + config.height_in_pixels = tiling->height; ++#if V3D_VERSION == 42 + config.number_of_render_targets = MAX2(tiling->render_target_count, 1); + config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ config.log2_tile_width = log2_tile_size(tiling->tile_width); ++ config.log2_tile_height = log2_tile_size(tiling->tile_height); ++ /* FIXME: ideally we would like next assert on the packet header (as is ++ * general, so also applies to GL). 
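/* Editor's sketch -- added for this review, not part of the upstream patch.
 * How the V3DV_X() helper added to v3dv_private.h above is used: it selects
 * a per-generation macro value from devinfo.ver at run time. The
 * QUNIFORM_VIEWPORT_X/Y_SCALE hunk above does, in effect:
 *
 *   float g = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY);
 *
 * which resolves to V3D42_CLIPPER_XY_GRANULARITY on ver == 42 and to
 * V3D71_CLIPPER_XY_GRANULARITY on ver == 71. Judging by the CLIPPER_XY_SCALING
 * packets emitted later in this patch (viewport scale in 1/256th of a pixel
 * on 4.2 versus 1/64th on 7.1), those constants are presumably 256.0f and
 * 64.0f respectively, replacing the hard-coded 256.0f that was used before.
 */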
We would need to expand ++ * gen_pack_header for that. ++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); ++#endif + } + + /* There's definitely nothing in the VCD cache we want. */ +@@ -345,6 +362,11 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer, + iview->vk.base_array_layer + layer, + image_plane); + ++ /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it ++ * is broken in earlier V3D versions. ++ */ ++ assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear); ++ + cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = buffer; + store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset); +@@ -467,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + const VkImageAspectFlags aspects = + vk_format_aspects(ds_attachment->desc.format); + ++#if V3D_VERSION <= 42 ++ /* GFXH-1689: The per-buffer store command's clear buffer bit is broken ++ * for depth/stencil. ++ * ++ * There used to be some confusion regarding the Clear Tile Buffers ++ * Z/S bit also being broken, but we confirmed with Broadcom that this ++ * is not the case, it was just that some other hardware bugs (that we ++ * need to work around, such as GFXH-1461) could cause this bit to behave ++ * incorrectly. ++ * ++ * There used to be another issue where the RTs bit in the Clear Tile ++ * Buffers packet also cleared Z/S, but Broadcom confirmed this is ++ * fixed since V3D 4.1. ++ * ++ * So if we have to emit a clear of depth or stencil we don't use ++ * the per-buffer store clear bit, even if we need to store the buffers, ++ * instead we always have to use the Clear Tile Buffers Z/S bit. ++ * If we have configured the job to do early Z/S clearing, then we ++ * don't want to emit any Clear Tile Buffers command at all here. ++ * ++ * Note that GFXH-1689 is not reproduced in the simulator, where ++ * using the clear buffer bit in depth/stencil stores works fine. ++ */ ++ + /* Only clear once on the first subpass that uses the attachment */ + uint32_t ds_first_subpass = !state->pass->multiview_enabled ? + ds_attachment->first_subpass : +@@ -486,6 +532,17 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + ds_attachment->desc.stencilLoadOp, + subpass->do_stencil_clear_with_draw); + ++ use_global_zs_clear = !state->job->early_zs_clear && ++ (needs_depth_clear || needs_stencil_clear); ++#endif ++#if V3D_VERSION >= 71 ++ /* The store command's clear buffer bit cannot be used for Z/S stencil: ++ * since V3D 4.5.6 Z/S buffers are automatically cleared between tiles, ++ * so we don't want to emit redundant clears here. ++ */ ++ use_global_zs_clear = false; ++#endif ++ + /* Skip the last store if it is not required */ + uint32_t ds_last_subpass = !pass->multiview_enabled ? + ds_attachment->last_subpass : +@@ -528,30 +585,6 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + needs_stencil_store = subpass->resolve_stencil; + } + +- /* GFXH-1689: The per-buffer store command's clear buffer bit is broken +- * for depth/stencil. +- * +- * There used to be some confusion regarding the Clear Tile Buffers +- * Z/S bit also being broken, but we confirmed with Broadcom that this +- * is not the case, it was just that some other hardware bugs (that we +- * need to work around, such as GFXH-1461) could cause this bit to behave +- * incorrectly. 
+- * +- * There used to be another issue where the RTs bit in the Clear Tile +- * Buffers packet also cleared Z/S, but Broadcom confirmed this is +- * fixed since V3D 4.1. +- * +- * So if we have to emit a clear of depth or stencil we don't use +- * the per-buffer store clear bit, even if we need to store the buffers, +- * instead we always have to use the Clear Tile Buffers Z/S bit. +- * If we have configured the job to do early Z/S clearing, then we +- * don't want to emit any Clear Tile Buffers command at all here. +- * +- * Note that GFXH-1689 is not reproduced in the simulator, where +- * using the clear buffer bit in depth/stencil stores works fine. +- */ +- use_global_zs_clear = !state->job->early_zs_clear && +- (needs_depth_clear || needs_stencil_clear); + if (needs_depth_store || needs_stencil_store) { + const uint32_t zs_buffer = + v3dv_zs_buffer(needs_depth_store, needs_stencil_store); +@@ -649,10 +682,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + * bit and instead we have to emit a single clear of all tile buffers. + */ + if (use_global_zs_clear || use_global_rt_clear) { ++#if V3D_VERSION == 42 + cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = use_global_zs_clear; + clear.clear_all_render_targets = use_global_rt_clear; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(cl, CLEAR_RENDER_TARGETS, clear); ++#endif + } + } + +@@ -778,6 +816,103 @@ set_rcl_early_z_config(struct v3dv_job *job, + } + } + ++/* Note that for v71, render target cfg packets has just one field that ++ * combined the internal type and clamp mode. For simplicity we keep just one ++ * helper. ++ * ++ * Note: rt_type is in fact a "enum V3DX(Internal_Type)". ++ * ++ * FIXME: for v71 we are not returning all the possible combinations for ++ * render target internal type and clamp. For example for int types we are ++ * always using clamp int, and for 16f we are using clamp none or pos (that ++ * seems to be the equivalent for no-clamp on 4.2), but not pq or hlg. In ++ * summary right now we are just porting what we were doing on 4.2 ++ */ ++uint32_t ++v3dX(clamp_for_format_and_type)(uint32_t rt_type, ++ VkFormat vk_format) ++{ ++#if V3D_VERSION == 42 ++ if (vk_format_is_int(vk_format)) ++ return V3D_RENDER_TARGET_CLAMP_INT; ++ else if (vk_format_is_srgb(vk_format)) ++ return V3D_RENDER_TARGET_CLAMP_NORM; ++ else ++ return V3D_RENDER_TARGET_CLAMP_NONE; ++#endif ++#if V3D_VERSION >= 71 ++ switch (rt_type) { ++ case V3D_INTERNAL_TYPE_8I: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED; ++ case V3D_INTERNAL_TYPE_8UI: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED; ++ case V3D_INTERNAL_TYPE_8: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_8; ++ case V3D_INTERNAL_TYPE_16I: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED; ++ case V3D_INTERNAL_TYPE_16UI: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED; ++ case V3D_INTERNAL_TYPE_16F: ++ return vk_format_is_srgb(vk_format) ? 
++ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM : ++ V3D_RENDER_TARGET_TYPE_CLAMP_16F; ++ case V3D_INTERNAL_TYPE_32I: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED; ++ case V3D_INTERNAL_TYPE_32UI: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED; ++ case V3D_INTERNAL_TYPE_32F: ++ return V3D_RENDER_TARGET_TYPE_CLAMP_32F; ++ default: ++ unreachable("Unknown internal render target type"); ++ } ++ ++ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID; ++#endif ++} ++ ++static void ++cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer, ++ int rt, ++ uint32_t *rt_bpp, ++#if V3D_VERSION == 42 ++ uint32_t *rt_type, ++ uint32_t *rt_clamp) ++#else ++ uint32_t *rt_type_clamp) ++#endif ++{ ++ const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; ++ ++ assert(state->subpass_idx < state->pass->subpass_count); ++ const struct v3dv_subpass *subpass = ++ &state->pass->subpasses[state->subpass_idx]; ++ ++ if (rt >= subpass->color_count) ++ return; ++ ++ struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; ++ const uint32_t attachment_idx = attachment->attachment; ++ if (attachment_idx == VK_ATTACHMENT_UNUSED) ++ return; ++ ++ assert(attachment_idx < state->framebuffer->attachment_count && ++ attachment_idx < state->attachment_alloc_count); ++ struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; ++ assert(vk_format_is_color(iview->vk.format)); ++ ++ assert(iview->plane_count == 1); ++ *rt_bpp = iview->planes[0].internal_bpp; ++#if V3D_VERSION == 42 ++ *rt_type = iview->planes[0].internal_type; ++ *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, ++ iview->vk.format); ++#endif ++#if V3D_VERSION >= 71 ++ *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, ++ iview->vk.format); ++#endif ++} ++ + void + v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + { +@@ -824,7 +959,19 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + config.number_of_render_targets = MAX2(subpass->color_count, 1); + config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; ++#if V3D_VERSION == 42 + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ config.log2_tile_width = log2_tile_size(tiling->tile_width); ++ config.log2_tile_height = log2_tile_size(tiling->tile_height); ++ /* FIXME: ideallly we would like next assert on the packet header (as is ++ * general, so also applies to GL). We would need to expand ++ * gen_pack_header for that. ++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); ++#endif + + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { + const struct v3dv_image_view *iview = +@@ -851,6 +998,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + * Early-Z/S clearing is independent of Early Z/S testing, so it is + * possible to enable one but not the other so long as their + * respective requirements are met. ++ * ++ * From V3D 4.5.6, Z/S buffers are always cleared automatically ++ * between tiles, but we still want to enable early ZS clears ++ * when Z/S are not loaded or stored. 
+ */ + struct v3dv_render_pass_attachment *ds_attachment = + &pass->attachments[ds_attachment_idx]; +@@ -858,21 +1009,33 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + const VkImageAspectFlags ds_aspects = + vk_format_aspects(ds_attachment->desc.format); + +- bool needs_depth_clear = +- check_needs_clear(state, +- ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, +- ds_attachment->first_subpass, +- ds_attachment->desc.loadOp, +- subpass->do_depth_clear_with_draw); +- + bool needs_depth_store = + v3dv_cmd_buffer_check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp) || + subpass->resolve_depth; ++#if V3D_VERSION <= 42 ++ bool needs_depth_clear = ++ check_needs_clear(state, ++ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, ++ ds_attachment->first_subpass, ++ ds_attachment->desc.loadOp, ++ subpass->do_depth_clear_with_draw); + + do_early_zs_clear = needs_depth_clear && !needs_depth_store; ++#endif ++#if V3D_VERSION >= 71 ++ bool needs_depth_load = ++ v3dv_cmd_buffer_check_needs_load(state, ++ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, ++ ds_attachment->first_subpass, ++ ds_attachment->desc.loadOp, ++ ds_attachment->last_subpass, ++ ds_attachment->desc.storeOp); ++ do_early_zs_clear = !needs_depth_load && !needs_depth_store; ++#endif ++ + if (do_early_zs_clear && + vk_format_has_stencil(ds_attachment->desc.format)) { + bool needs_stencil_load = +@@ -905,10 +1068,20 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + */ + job->early_zs_clear = do_early_zs_clear; + ++#if V3D_VERSION >= 71 ++ uint32_t base_addr = 0; ++#endif + for (uint32_t i = 0; i < subpass->color_count; i++) { + uint32_t attachment_idx = subpass->color_attachments[i].attachment; +- if (attachment_idx == VK_ATTACHMENT_UNUSED) ++ if (attachment_idx == VK_ATTACHMENT_UNUSED) { ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.render_target_number = i; ++ rt.stride = 1; /* Unused */ ++ } ++#endif + continue; ++ } + + struct v3dv_image_view *iview = + state->attachments[attachment_idx].image_view; +@@ -920,10 +1093,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + const struct v3d_resource_slice *slice = + &image->planes[plane].slices[iview->vk.base_mip_level]; + +- const uint32_t *clear_color = ++ UNUSED const uint32_t *clear_color = + &state->attachments[attachment_idx].clear_value.color[0]; + +- uint32_t clear_pad = 0; ++ UNUSED uint32_t clear_pad = 0; + if (slice->tiling == V3D_TILING_UIF_NO_XOR || + slice->tiling == V3D_TILING_UIF_XOR) { + int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2; +@@ -937,6 +1110,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + } + } + ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { + clear.clear_color_low_32_bits = clear_color[0]; + clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; +@@ -960,22 +1134,74 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + clear.render_target_number = i; + }; + } ++#endif ++ ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.clear_color_low_bits = clear_color[0]; ++ cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp, ++ &rt.internal_type_and_clamping); ++ rt.stride = ++ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, ++ v3d_internal_bpp_words(rt.internal_bpp)); ++ 
rt.base_address = base_addr; ++ rt.render_target_number = i; ++ ++ /* base_addr in multiples of 512 bits. We divide by 8 because stride ++ * is in 128-bit units, but it is packing 2 rows worth of data, so we ++ * need to divide it by 2 so it is only 1 row, and then again by 4 so ++ * it is in 512-bit units. ++ */ ++ base_addr += (tiling->tile_height * rt.stride) / 8; ++ } ++ ++ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { ++ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ ++ ((uint64_t) clear_color[1]) | ++ (((uint64_t) (clear_color[2] & 0xff)) << 32); ++ rt.render_target_number = i; ++ } ++ } ++ ++ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { ++ rt.clear_color_top_bits = /* 56 bits (24 + 32) */ ++ (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) | ++ (((uint64_t) (clear_color[3])) << 24); ++ rt.render_target_number = i; ++ } ++ } ++#endif + } + ++#if V3D_VERSION >= 71 ++ /* If we don't have any color RTs, we still need to emit one and flag ++ * it as not used using stride = 1. ++ */ ++ if (subpass->color_count == 0) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.stride = 1; ++ } ++ } ++#endif ++ ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { +- v3dX(cmd_buffer_render_pass_setup_render_target) ++ cmd_buffer_render_pass_setup_render_target + (cmd_buffer, 0, &rt.render_target_0_internal_bpp, + &rt.render_target_0_internal_type, &rt.render_target_0_clamp); +- v3dX(cmd_buffer_render_pass_setup_render_target) ++ cmd_buffer_render_pass_setup_render_target + (cmd_buffer, 1, &rt.render_target_1_internal_bpp, + &rt.render_target_1_internal_type, &rt.render_target_1_clamp); +- v3dX(cmd_buffer_render_pass_setup_render_target) ++ cmd_buffer_render_pass_setup_render_target + (cmd_buffer, 2, &rt.render_target_2_internal_bpp, + &rt.render_target_2_internal_type, &rt.render_target_2_clamp); +- v3dX(cmd_buffer_render_pass_setup_render_target) ++ cmd_buffer_render_pass_setup_render_target + (cmd_buffer, 3, &rt.render_target_3_internal_bpp, + &rt.render_target_3_internal_type, &rt.render_target_3_clamp); + } ++#endif + + /* Ends rendering mode config. 
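/* Editor's note -- added for this review, not part of the upstream patch.
 * A worked example of the V3D 7.1 per-render-target tile buffer layout set
 * up above, assuming a 64x64 tile with a 32-bit internal format (1 word per
 * pixel) and assuming v3d_compute_rt_row_row_stride_128_bits() returns two
 * rows' worth of data expressed in 128-bit units:
 *
 *   stride     = 64 px * 1 word * 32 bit * 2 rows / 128 bit = 32
 *   base_addr += tile_height * stride / 8 = 64 * 32 / 8 = 256   (512-bit units)
 *
 * Cross-check: a full 64x64 tile at 4 bytes per pixel is 16384 bytes =
 * 131072 bits = 256 * 512 bits, so the next render target starts right after
 * this one in the tile buffer, exactly as the in-line comment above explains
 * (divide by 2 to get a single row, then by 4 to reach 512-bit units).
 */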
*/ + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { +@@ -1036,10 +1262,15 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + } + if (cmd_buffer->state.tile_aligned_render_area && + (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { ++#if V3D_VERSION == 42 + cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = !job->early_zs_clear; + clear.clear_all_render_targets = true; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt); ++#endif + } + cl_emit(rcl, END_OF_TILE_MARKER, end); + } +@@ -1054,6 +1285,43 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) + cl_emit(rcl, END_OF_RENDERING, end); + } + ++void ++v3dX(viewport_compute_xform)(const VkViewport *viewport, ++ float scale[3], ++ float translate[3]) ++{ ++ float x = viewport->x; ++ float y = viewport->y; ++ float half_width = 0.5f * viewport->width; ++ float half_height = 0.5f * viewport->height; ++ double n = viewport->minDepth; ++ double f = viewport->maxDepth; ++ ++ scale[0] = half_width; ++ translate[0] = half_width + x; ++ scale[1] = half_height; ++ translate[1] = half_height + y; ++ ++ scale[2] = (f - n); ++ translate[2] = n; ++ ++ /* It seems that if the scale is small enough the hardware won't clip ++ * correctly so we work around this my choosing the smallest scale that ++ * seems to work. ++ * ++ * This case is exercised by CTS: ++ * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero ++ * ++ * V3D 7.x fixes this by using the new ++ * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND. ++ */ ++#if V3D_VERSION <= 42 ++ const float min_abs_scale = 0.0005f; ++ if (fabs(scale[2]) < min_abs_scale) ++ scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale; ++#endif ++} ++ + void + v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) + { +@@ -1078,19 +1346,45 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) + v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size); + v3dv_return_if_oom(cmd_buffer, NULL); + ++#if V3D_VERSION == 42 + cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { + clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f; + clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { ++ clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f; ++ clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f; ++ } ++#endif + + float translate_z, scale_z; + v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0, + &translate_z, &scale_z); + ++#if V3D_VERSION == 42 + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { + clip.viewport_z_offset_zc_to_zs = translate_z; + clip.viewport_z_scale_zc_to_zs = scale_z; + } ++#endif ++ ++#if V3D_VERSION >= 71 ++ /* If the Z scale is too small guardband clipping may not clip correctly */ ++ if (fabsf(scale_z) < 0.01f) { ++ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) { ++ clip.viewport_z_offset_zc_to_zs = translate_z; ++ clip.viewport_z_scale_zc_to_zs = scale_z; ++ } ++ } else { ++ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { ++ clip.viewport_z_offset_zc_to_zs = translate_z; ++ clip.viewport_z_scale_zc_to_zs = scale_z; ++ } ++ } ++#endif ++ + cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) { + /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled, + * we are using OpenGL's [-1, 1] instead. 
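/* Editor's note -- added for this review, not part of the upstream patch.
 * The two Z-scale paths above handle the same corner case: a viewport with a
 * (nearly) empty depth range. With viewport_compute_xform() as defined just
 * above,
 *
 *   minDepth = maxDepth = 0.5  =>  scale[2] = f - n = 0.0
 *
 * (the situation exercised by the ...nodepthclamp_deltazero CTS test cited
 * in the comment). On V3D 4.2 the scale is forced to a magnitude of at least
 * 0.0005 before being emitted; on 7.1 the same kind of check
 * (|scale_z| < 0.01) instead selects the new
 * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND packet, which per the comment above
 * handles the tiny scale without needing the clamp.
 */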
+@@ -1205,14 +1499,48 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer) + cl_emit(&job->bcl, DEPTH_OFFSET, bias) { + bias.depth_offset_factor = dynamic->depth_bias.slope_factor; + bias.depth_offset_units = dynamic->depth_bias.constant_factor; ++#if V3D_VERSION <= 42 + if (pipeline->depth_bias.is_z16) + bias.depth_offset_units *= 256.0f; ++#endif + bias.limit = dynamic->depth_bias.depth_bias_clamp; + } + + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS; + } + ++void ++v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer) ++{ ++ /* No depthBounds support for v42, so this method is empty in that case. ++ * ++ * Note that this method is being called as v3dv_job_init flags all state ++ * as dirty. See FIXME note in v3dv_job_init. ++ */ ++ ++#if V3D_VERSION >= 71 ++ struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; ++ assert(pipeline); ++ ++ if (!pipeline->depth_bounds_test_enabled) ++ return; ++ ++ struct v3dv_job *job = cmd_buffer->state.job; ++ assert(job); ++ ++ v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS)); ++ v3dv_return_if_oom(cmd_buffer, NULL); ++ ++ struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; ++ cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) { ++ bounds.lower_test_limit = dynamic->depth_bounds.min; ++ bounds.upper_test_limit = dynamic->depth_bounds.max; ++ } ++ ++ cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BOUNDS; ++#endif ++} ++ + void + v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer) + { +@@ -1256,10 +1584,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + assert(pipeline); + ++ const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo; ++ const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver); ++ + const uint32_t blend_packets_size = + cl_packet_length(BLEND_ENABLES) + + cl_packet_length(BLEND_CONSTANT_COLOR) + +- cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS; ++ cl_packet_length(BLEND_CFG) * max_color_rts; + + v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size); + v3dv_return_if_oom(cmd_buffer, NULL); +@@ -1271,7 +1602,7 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) + } + } + +- for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { ++ for (uint32_t i = 0; i < max_color_rts; i++) { + if (pipeline->blend.enables & (1 << i)) + cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]); + } +@@ -1298,9 +1629,15 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer) + + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; ++ uint32_t color_write_mask = ~dynamic->color_write_enable | ++ pipeline->blend.color_write_masks; ++#if V3D_VERSION <= 42 ++ /* Only 4 RTs */ ++ color_write_mask &= 0xffff; ++#endif ++ + cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { +- mask.mask = (~dynamic->color_write_enable | +- pipeline->blend.color_write_masks) & 0xffff; ++ mask.mask = color_write_mask; + } + + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; +@@ -1591,15 +1928,16 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer) + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + assert(pipeline); + +- bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer); +- + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS)); + 
v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) { ++#if V3D_VERSION == 42 ++ bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer); + config.early_z_enable = enable_ez; + config.early_z_updates_enable = config.early_z_enable && + pipeline->z_updates_enable; ++#endif + } + } + +@@ -1845,7 +2183,9 @@ emit_gs_shader_state_record(struct v3dv_job *job, + gs_bin->prog_data.gs->base.threads == 4; + shader.geometry_bin_mode_shader_start_in_final_thread_section = + gs_bin->prog_data.gs->base.single_seg; ++#if V3D_VERSION <= 42 + shader.geometry_bin_mode_shader_propagate_nans = true; ++#endif + shader.geometry_bin_mode_shader_uniforms_address = + gs_bin_uniforms; + +@@ -1855,7 +2195,9 @@ emit_gs_shader_state_record(struct v3dv_job *job, + gs->prog_data.gs->base.threads == 4; + shader.geometry_render_mode_shader_start_in_final_thread_section = + gs->prog_data.gs->base.single_seg; ++#if V3D_VERSION <= 42 + shader.geometry_render_mode_shader_propagate_nans = true; ++#endif + shader.geometry_render_mode_shader_uniforms_address = + gs_render_uniforms; + } +@@ -2031,10 +2373,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) + pipeline->vpm_cfg.Gv); + } + ++#if V3D_VERSION == 42 + struct v3dv_bo *default_attribute_values = + pipeline->default_attribute_values != NULL ? + pipeline->default_attribute_values : + pipeline->device->default_attribute_float; ++#endif + + cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD, + pipeline->shader_state_record, shader) { +@@ -2060,8 +2404,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) + shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs; + shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs; + ++#if V3D_VERSION == 42 + shader.address_of_default_attribute_values = + v3dv_cl_address(default_attribute_values, 0); ++#endif + + shader.any_shader_reads_hardware_written_primitive_id = + (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid; +@@ -2370,40 +2716,3 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, + buffer->mem_offset + offset); + } + } +- +-void +-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, +- int rt, +- uint32_t *rt_bpp, +- uint32_t *rt_type, +- uint32_t *rt_clamp) +-{ +- const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; +- +- assert(state->subpass_idx < state->pass->subpass_count); +- const struct v3dv_subpass *subpass = +- &state->pass->subpasses[state->subpass_idx]; +- +- if (rt >= subpass->color_count) +- return; +- +- struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; +- const uint32_t attachment_idx = attachment->attachment; +- if (attachment_idx == VK_ATTACHMENT_UNUSED) +- return; +- +- assert(attachment_idx < state->framebuffer->attachment_count && +- attachment_idx < state->attachment_alloc_count); +- struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; +- assert(vk_format_is_color(iview->vk.format)); +- +- assert(iview->plane_count == 1); +- *rt_bpp = iview->planes[0].internal_bpp; +- *rt_type = iview->planes[0].internal_type; +- if (vk_format_is_int(iview->vk.view_format)) +- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT; +- else if (vk_format_is_srgb(iview->vk.view_format)) +- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; +- else +- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; +-} +diff --git 
a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c +index e235983864cd..1b50d51e19ff 100644 +--- a/src/broadcom/vulkan/v3dvx_device.c ++++ b/src/broadcom/vulkan/v3dvx_device.c +@@ -49,8 +49,8 @@ vk_to_v3d_compare_func[] = { + [VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS, + }; + +- + static union pipe_color_union encode_border_color( ++ const struct v3dv_device *device, + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) + { + const struct util_format_description *desc = +@@ -77,12 +77,28 @@ static union pipe_color_union encode_border_color( + * colors so we need to fix up the swizzle manually for this case. + */ + uint8_t swizzle[4]; +- if (v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && ++ const bool v3d_has_reverse_swap_rb_bits = ++ v3dv_texture_shader_state_has_rb_swap_reverse_bits(device); ++ if (!v3d_has_reverse_swap_rb_bits && ++ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && + v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) { + swizzle[0] = PIPE_SWIZZLE_W; + swizzle[1] = PIPE_SWIZZLE_X; + swizzle[2] = PIPE_SWIZZLE_Y; + swizzle[3] = PIPE_SWIZZLE_Z; ++ } ++ /* In v3d 7.x we no longer have a reverse flag for the border color. Instead ++ * we have to use the new reverse and swap_r/b flags in the texture shader ++ * state which will apply the format swizzle automatically when sampling ++ * the border color too and we should not apply it manually here. ++ */ ++ else if (v3d_has_reverse_swap_rb_bits && ++ (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) || ++ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) { ++ swizzle[0] = PIPE_SWIZZLE_X; ++ swizzle[1] = PIPE_SWIZZLE_Y; ++ swizzle[2] = PIPE_SWIZZLE_Z; ++ swizzle[3] = PIPE_SWIZZLE_W; + } else { + memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle)); + } +@@ -118,7 +134,11 @@ static union pipe_color_union encode_border_color( + (1 << (desc->channel[i].size - 1)) - 1); + } + +- /* convert from float to expected format */ ++#if V3D_VERSION <= 42 ++ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions ++ * for us. In V3D 4.x we need to manually convert floating point color ++ * values to the expected format. 
++ */ + if (vk_format_is_srgb(bc_info->format) || + vk_format_is_compressed(bc_info->format)) { + for (int i = 0; i < 4; i++) +@@ -170,12 +190,14 @@ static union pipe_color_union encode_border_color( + } + } + } ++#endif + + return border; + } + + void +-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, ++v3dX(pack_sampler_state)(const struct v3dv_device *device, ++ struct v3dv_sampler *sampler, + const VkSamplerCreateInfo *pCreateInfo, + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) + { +@@ -217,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, + s.border_color_mode = border_color_mode; + + if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) { +- union pipe_color_union border = encode_border_color(bc_info); ++ union pipe_color_union border = encode_border_color(device, bc_info); + + s.border_color_word_0 = border.ui[0]; + s.border_color_word_1 = border.ui[1]; +@@ -253,11 +275,13 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( + const struct v3dv_framebuffer *framebuffer, + const struct v3dv_cmd_buffer_attachment_state *attachments, + const struct v3dv_subpass *subpass, +- uint8_t *max_bpp, ++ uint8_t *max_internal_bpp, ++ uint8_t *total_color_bpp, + bool *msaa) + { + STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0); +- *max_bpp = V3D_INTERNAL_BPP_32; ++ *max_internal_bpp = V3D_INTERNAL_BPP_32; ++ *total_color_bpp = 0; + *msaa = false; + + if (subpass) { +@@ -270,8 +294,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( + assert(att); + assert(att->plane_count == 1); + +- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) +- *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp); ++ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { ++ const uint32_t internal_bpp = att->planes[0].internal_bpp; ++ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); ++ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); ++ } + + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) + *msaa = true; +@@ -285,7 +312,6 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) + *msaa = true; + } +- + return; + } + +@@ -295,8 +321,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( + assert(att); + assert(att->plane_count == 1); + +- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) +- *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp); ++ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { ++ const uint32_t internal_bpp = att->planes[0].internal_bpp; ++ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); ++ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); ++ } + + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) + *msaa = true; +diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c +index 80a3e5bfde86..de984e81220f 100644 +--- a/src/broadcom/vulkan/v3dvx_image.c ++++ b/src/broadcom/vulkan/v3dvx_image.c +@@ -76,8 +76,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]); + tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]); + +- tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse; +- + tex.texture_type = image_view->format->planes[plane].tex_type; + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { +@@ -110,8 +108,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + + tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64; + +- tex.srgb = vk_format_is_srgb(image_view->vk.view_format); +- + /* At 
this point we don't have the job. That's the reason the first + * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to + * add the bo to the job. This also means that we need to add manually +@@ -122,6 +118,51 @@ pack_texture_shader_state_helper(struct v3dv_device *device, + v3dv_layer_offset(image, 0, image_view->vk.base_array_layer, + iplane); + tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); ++ ++ bool is_srgb = vk_format_is_srgb(image_view->vk.format); ++ ++ /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose ++ * the reverse and/or swap_r/b swizzle from the format table with the ++ * image view swizzle. This, however, doesn't work for border colors, ++ * for that there is the reverse_standard_border_color. ++ * ++ * In v3d 7.x, however, there is no reverse_standard_border_color bit, ++ * since the reverse and swap_r/b bits also affect border colors. It is ++ * because of this that we absolutely need to use these bits with ++ * reversed and swapped formats, since that's the only way to ensure ++ * correct border colors. In that case we don't want to program the ++ * swizzle to the composition of the format swizzle and the view ++ * swizzle like we do in v3d 4.x, since the format swizzle is applied ++ * via the reverse and swap_r/b bits. ++ */ ++#if V3D_VERSION == 42 ++ tex.srgb = is_srgb; ++ tex.reverse_standard_border_color = ++ image_view->planes[plane].channel_reverse; ++#endif ++#if V3D_VERSION >= 71 ++ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; ++ ++ tex.reverse = image_view->planes[plane].channel_reverse; ++ tex.r_b_swap = image_view->planes[plane].swap_rb; ++ ++ if (tex.reverse || tex.r_b_swap) { ++ tex.swizzle_r = ++ v3d_translate_pipe_swizzle(image_view->view_swizzle[0]); ++ tex.swizzle_g = ++ v3d_translate_pipe_swizzle(image_view->view_swizzle[1]); ++ tex.swizzle_b = ++ v3d_translate_pipe_swizzle(image_view->view_swizzle[2]); ++ tex.swizzle_a = ++ v3d_translate_pipe_swizzle(image_view->view_swizzle[3]); ++ } ++ ++ tex.chroma_offset_x = 1; ++ tex.chroma_offset_y = 1; ++ /* See comment in XML field definition for rationale of the shifts */ ++ tex.texture_base_pointer_cb = base_offset >> 6; ++ tex.texture_base_pointer_cr = base_offset >> 6; ++#endif + } + } + } +@@ -166,7 +207,14 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, + + assert(buffer_view->format->plane_count == 1); + tex.texture_type = buffer_view->format->planes[0].tex_type; +- tex.srgb = vk_format_is_srgb(buffer_view->vk_format); ++ ++ bool is_srgb = vk_format_is_srgb(buffer_view->vk_format); ++#if V3D_VERSION == 42 ++ tex.srgb = is_srgb; ++#endif ++#if V3D_VERSION >= 71 ++ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; ++#endif + + /* At this point we don't have the job. 
That's the reason the first + * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to +@@ -179,5 +227,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, + buffer_view->offset; + + tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); ++ ++#if V3D_VERSION >= 71 ++ tex.chroma_offset_x = 1; ++ tex.chroma_offset_y = 1; ++ /* See comment in XML field definition for rationale of the shifts */ ++ tex.texture_base_pointer_cb = base_offset >> 6; ++ tex.texture_base_pointer_cr = base_offset >> 6; ++#endif + } + } +diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c +index 04147b82cbd7..858096f9e4b4 100644 +--- a/src/broadcom/vulkan/v3dvx_meta_common.c ++++ b/src/broadcom/vulkan/v3dvx_meta_common.c +@@ -26,6 +26,7 @@ + + #include "broadcom/common/v3d_macros.h" + #include "broadcom/common/v3d_tfu.h" ++#include "broadcom/common/v3d_util.h" + #include "broadcom/cle/v3dx_pack.h" + #include "broadcom/compiler/v3d_compiler.h" + +@@ -58,12 +59,25 @@ emit_rcl_prologue(struct v3dv_job *job, + config.number_of_render_targets = 1; + config.multisample_mode_4x = tiling->msaa; + config.double_buffer_in_non_ms_mode = tiling->double_buffer; ++#if V3D_VERSION == 42 + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; ++#endif ++#if V3D_VERSION >= 71 ++ config.log2_tile_width = log2_tile_size(tiling->tile_width); ++ config.log2_tile_height = log2_tile_size(tiling->tile_height); ++ /* FIXME: ideally we would like next assert on the packet header (as it is ++ * general, so it also applies to GL). We would need to expand ++ * gen_pack_header for that. ++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); ++#endif + config.internal_depth_type = fb->internal_depth_type; + } + ++ const uint32_t *color = NULL; + if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) { +- uint32_t clear_pad = 0; ++ UNUSED uint32_t clear_pad = 0; + if (clear_info->image) { + const struct v3dv_image *image = clear_info->image; + +@@ -88,7 +102,9 @@ emit_rcl_prologue(struct v3dv_job *job, + } + } + +- const uint32_t *color = &clear_info->clear_value->color[0]; ++ color = &clear_info->clear_value->color[0]; ++ ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { + clear.clear_color_low_32_bits = color[0]; + clear.clear_color_next_24_bits = color[1] & 0x00ffffff; +@@ -112,13 +128,49 @@ emit_rcl_prologue(struct v3dv_job *job, + clear.render_target_number = 0; + }; + } ++#endif + } + ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + rt.render_target_0_internal_bpp = tiling->internal_bpp; + rt.render_target_0_internal_type = fb->internal_type; + rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + } ++#endif ++ ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ if (color) ++ rt.clear_color_low_bits = color[0]; ++ rt.internal_bpp = tiling->internal_bpp; ++ rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type, ++ fb->vk_format); ++ rt.stride = ++ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, ++ v3d_internal_bpp_words(rt.internal_bpp)); ++ rt.base_address = 0; ++ rt.render_target_number = 0; ++ } ++ ++ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { ++ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ ++ ((uint64_t) color[1]) | 
++ (((uint64_t) (color[2] & 0xff)) << 32); ++ rt.render_target_number = 0; ++ } ++ } ++ ++ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) { ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { ++ rt.clear_color_top_bits = /* 56 bits (24 + 32) */ ++ (((uint64_t) (color[2] & 0xffffff00)) >> 8) | ++ (((uint64_t) (color[3])) << 24); ++ rt.render_target_number = 0; ++ } ++ } ++#endif + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { + clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f; +@@ -179,10 +231,15 @@ emit_frame_setup(struct v3dv_job *job, + */ + if (clear_value && + (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { ++#if V3D_VERSION == 42 + cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = true; + clear.clear_all_render_targets = true; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear); ++#endif + } + cl_emit(rcl, END_OF_TILE_MARKER, end); + } +@@ -893,6 +950,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + + tfu.iia |= src_offset; + ++#if V3D_VERSION <= 42 + if (src_tiling == V3D_TILING_RASTER) { + tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT; + } else { +@@ -901,12 +959,46 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + V3D33_TFU_ICFG_FORMAT_SHIFT; + } + tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT; ++#endif ++#if V3D_VERSION >= 71 ++ if (src_tiling == V3D_TILING_RASTER) { ++ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT; ++ } else { ++ tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE + ++ (src_tiling - V3D_TILING_LINEARTILE)) << ++ V3D71_TFU_ICFG_IFORMAT_SHIFT; ++ } ++ tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT; ++#endif + + tfu.ioa = dst_offset; + ++#if V3D_VERSION <= 42 + tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE + + (dst_tiling - V3D_TILING_LINEARTILE)) << + V3D33_TFU_IOA_FORMAT_SHIFT; ++#endif ++ ++#if V3D_VERSION >= 71 ++ tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE + ++ (dst_tiling - V3D_TILING_LINEARTILE)) << ++ V3D71_TFU_IOC_FORMAT_SHIFT; ++ ++ switch (dst_tiling) { ++ case V3D_TILING_UIF_NO_XOR: ++ case V3D_TILING_UIF_XOR: ++ tfu.v71.ioc |= ++ (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) << ++ V3D71_TFU_IOC_STRIDE_SHIFT; ++ break; ++ case V3D_TILING_RASTER: ++ tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) << ++ V3D71_TFU_IOC_STRIDE_SHIFT; ++ break; ++ default: ++ break; ++ } ++#endif + + switch (src_tiling) { + case V3D_TILING_UIF_NO_XOR: +@@ -923,6 +1015,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + /* The TFU can handle raster sources but always produces UIF results */ + assert(dst_tiling != V3D_TILING_RASTER); + ++#if V3D_VERSION <= 42 + /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the + * OPAD field for the destination (how many extra UIF blocks beyond + * those necessary to cover the height). 
+@@ -934,6 +1027,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + uif_block_h; + tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT; + } ++#endif + + v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu); + } +@@ -1314,8 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t width, height; + framebuffer_size_for_pixel_count(num_items, &width, &height); + +- v3dv_job_start_frame(job, width, height, 1, true, true, +- 1, internal_bpp, false); ++ v3dv_job_start_frame(job, width, height, 1, true, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ false); + + struct v3dv_meta_framebuffer framebuffer; + v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type, +@@ -1361,8 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t width, height; + framebuffer_size_for_pixel_count(num_items, &width, &height); + +- v3dv_job_start_frame(job, width, height, 1, true, true, +- 1, internal_bpp, false); ++ v3dv_job_start_frame(job, width, height, 1, true, true, 1, ++ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), ++ false); + + struct v3dv_meta_framebuffer framebuffer; + v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT, +diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c +index 5d32d414ed86..ad22add155d8 100644 +--- a/src/broadcom/vulkan/v3dvx_pipeline.c ++++ b/src/broadcom/vulkan/v3dvx_pipeline.c +@@ -227,6 +227,45 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, + ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false; + + pipeline->z_updates_enable = config.z_updates_enable; ++ ++#if V3D_VERSION >= 71 ++ /* From the Vulkan spec: ++ * ++ * "depthClampEnable controls whether to clamp the fragment’s depth ++ * values as described in Depth Test. If the pipeline is not created ++ * with VkPipelineRasterizationDepthClipStateCreateInfoEXT present ++ * then enabling depth clamp will also disable clipping primitives to ++ * the z planes of the frustrum as described in Primitive Clipping. ++ * Otherwise depth clipping is controlled by the state set in ++ * VkPipelineRasterizationDepthClipStateCreateInfoEXT." ++ * ++ * Note: neither depth clamping nor VK_EXT_depth_clip_enable are actually ++ * supported in the driver yet, so in practice we are always enabling Z ++ * clipping for now. ++ */ ++ bool z_clamp_enable = rs_info && rs_info->depthClampEnable; ++ bool z_clip_enable = false; ++ const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info = ++ ds_info ? vk_find_struct_const(ds_info->pNext, ++ PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) : ++ NULL; ++ if (clip_info) ++ z_clip_enable = clip_info->depthClipEnable; ++ else if (!z_clamp_enable) ++ z_clip_enable = true; ++ ++ if (z_clip_enable) { ++ config.z_clipping_mode = pipeline->negative_one_to_one ? 
++ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE; ++ } else { ++ config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE; ++ } ++ ++ config.z_clamp_mode = z_clamp_enable; ++ ++ config.depth_bounds_test_enable = ++ ds_info && ds_info->depthBoundsTestEnable && has_ds_attachment; ++#endif + }; + } + +@@ -360,7 +399,7 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline, + static void + pack_shader_state_record(struct v3dv_pipeline *pipeline) + { +- assert(sizeof(pipeline->shader_state_record) == ++ assert(sizeof(pipeline->shader_state_record) >= + cl_packet_length(GL_SHADER_STATE_RECORD)); + + struct v3d_fs_prog_data *prog_data_fs = +@@ -435,15 +474,16 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) + shader.number_of_varyings_in_fragment_shader = + prog_data_fs->num_inputs; + +- shader.coordinate_shader_propagate_nans = true; +- shader.vertex_shader_propagate_nans = true; +- shader.fragment_shader_propagate_nans = true; +- + /* Note: see previous note about addresses */ + /* shader.coordinate_shader_code_address */ + /* shader.vertex_shader_code_address */ + /* shader.fragment_shader_code_address */ + ++#if V3D_VERSION == 42 ++ shader.coordinate_shader_propagate_nans = true; ++ shader.vertex_shader_propagate_nans = true; ++ shader.fragment_shader_propagate_nans = true; ++ + /* FIXME: Use combined input/output size flag in the common case (also + * on v3d, see v3dx_draw). + */ +@@ -451,13 +491,25 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) + prog_data_vs_bin->separate_segments; + shader.vertex_shader_has_separate_input_and_output_vpm_blocks = + prog_data_vs->separate_segments; +- + shader.coordinate_shader_input_vpm_segment_size = + prog_data_vs_bin->separate_segments ? + prog_data_vs_bin->vpm_input_size : 1; + shader.vertex_shader_input_vpm_segment_size = + prog_data_vs->separate_segments ? + prog_data_vs->vpm_input_size : 1; ++#endif ++ ++ /* On V3D 7.1 there isn't a specific flag to set if we are using ++ * shared/separate segments or not. We just set the value of ++ * vpm_input_size to 0, and set output to the max needed. That should be ++ * already properly set on prog_data_vs_bin ++ */ ++#if V3D_VERSION == 71 ++ shader.coordinate_shader_input_vpm_segment_size = ++ prog_data_vs_bin->vpm_input_size; ++ shader.vertex_shader_input_vpm_segment_size = ++ prog_data_vs->vpm_input_size; ++#endif + + shader.coordinate_shader_output_vpm_segment_size = + prog_data_vs_bin->vpm_output_size; +@@ -659,3 +711,76 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, + } + } + } ++ ++#if V3D_VERSION == 42 ++static bool ++pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) ++{ ++ for (uint8_t i = 0; i < pipeline->va_count; i++) { ++ if (vk_format_is_int(pipeline->va[i].vk_format)) ++ return true; ++ } ++ return false; ++} ++#endif ++ ++bool ++v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline) ++{ ++#if V3D_VERSION == 42 ++ return pipeline_has_integer_vertex_attrib(pipeline); ++#endif ++ ++ return false; ++} ++ ++/* @pipeline can be NULL. In that case we assume the most common case. For ++ * example, for v42 we assume in that case that all the attributes have a ++ * float format (we only create an all-float BO once and we reuse it with all ++ * float pipelines), otherwise we look at the actual type of each attribute ++ * used with the specific pipeline passed in. 
++ */ ++struct v3dv_bo * ++v3dX(create_default_attribute_values)(struct v3dv_device *device, ++ struct v3dv_pipeline *pipeline) ++{ ++#if V3D_VERSION >= 71 ++ return NULL; ++#endif ++ ++ uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; ++ struct v3dv_bo *bo; ++ ++ bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); ++ ++ if (!bo) { ++ fprintf(stderr, "failed to allocate memory for the default " ++ "attribute values\n"); ++ return NULL; ++ } ++ ++ bool ok = v3dv_bo_map(device, bo, size); ++ if (!ok) { ++ fprintf(stderr, "failed to map default attribute values buffer\n"); ++ return NULL; ++ } ++ ++ uint32_t *attrs = bo->map; ++ uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; ++ for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { ++ attrs[i * 4 + 0] = 0; ++ attrs[i * 4 + 1] = 0; ++ attrs[i * 4 + 2] = 0; ++ VkFormat attr_format = ++ pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; ++ if (i < va_count && vk_format_is_int(attr_format)) { ++ attrs[i * 4 + 3] = 1; ++ } else { ++ attrs[i * 4 + 3] = fui(1.0); ++ } ++ } ++ ++ v3dv_bo_unmap(device, bo); ++ ++ return bo; ++} +diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h +index ad8ddfa5731c..0f5887eab937 100644 +--- a/src/broadcom/vulkan/v3dvx_private.h ++++ b/src/broadcom/vulkan/v3dvx_private.h +@@ -54,6 +54,9 @@ v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer); + void + v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer); + ++void ++v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer); ++ + void + v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer); + +@@ -125,17 +128,11 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color, + uint32_t internal_size, + uint32_t *hw_color); + +-void +-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, +- int rt, +- uint32_t *rt_bpp, +- uint32_t *rt_type, +- uint32_t *rt_clamp); +- + /* Used at v3dv_device */ + + void +-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, ++v3dX(pack_sampler_state)(const struct v3dv_device *device, ++ struct v3dv_sampler *sampler, + const VkSamplerCreateInfo *pCreateInfo, + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info); + +@@ -143,7 +140,9 @@ void + v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer, + const struct v3dv_cmd_buffer_attachment_state *attachments, + const struct v3dv_subpass *subpass, +- uint8_t *max_bpp, bool *msaa); ++ uint8_t *max_internal_bpp, ++ uint8_t *total_color_bpp, ++ bool *msaa); + + #ifdef DEBUG + void +@@ -313,10 +312,24 @@ void + v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, + const VkPipelineVertexInputStateCreateInfo *vi_info, + const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info); ++ ++bool ++v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline); ++ ++struct v3dv_bo * ++v3dX(create_default_attribute_values)(struct v3dv_device *device, ++ struct v3dv_pipeline *pipeline); ++ + /* Used at v3dv_queue */ + void + v3dX(job_emit_noop)(struct v3dv_job *job); + ++/* Used at v3dv_query */ ++VkResult ++v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount, ++ VkPerformanceCounterKHR *pCounters, ++ VkPerformanceCounterDescriptionKHR *pCounterDescriptions); ++ + /* Used at v3dv_descriptor_set, and other descriptor set utils */ + uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type); + +@@ -325,3 +338,21 @@ uint32_t v3dX(max_descriptor_bo_size)(void); + 
uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane); + + uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane); ++ ++/* General utils */ ++ ++uint32_t ++v3dX(clamp_for_format_and_type)(uint32_t rt_type, ++ VkFormat vk_format); ++ ++#define V3D42_CLIPPER_XY_GRANULARITY 256.0f ++#define V3D71_CLIPPER_XY_GRANULARITY 64.0f ++ ++uint32_t ++v3dX(clamp_for_format_and_type)(uint32_t rt_type, ++ VkFormat vk_format); ++ ++void ++v3dX(viewport_compute_xform)(const VkViewport *viewport, ++ float scale[3], ++ float translate[3]); +diff --git a/src/broadcom/vulkan/v3dvx_query.c b/src/broadcom/vulkan/v3dvx_query.c +new file mode 100644 +index 000000000000..e59a1e84ff6c +--- /dev/null ++++ b/src/broadcom/vulkan/v3dvx_query.c +@@ -0,0 +1,67 @@ ++/* ++ * Copyright © 2023 Raspberry Pi Ltd ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#include "v3dv_private.h" ++ ++#include "common/v3d_performance_counters.h" ++ ++VkResult ++v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount, ++ VkPerformanceCounterKHR *pCounters, ++ VkPerformanceCounterDescriptionKHR *pCounterDescriptions) ++{ ++ uint32_t desc_count = *pCounterCount; ++ ++ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, ++ out, pCounters, pCounterCount); ++ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, ++ out_desc, pCounterDescriptions, &desc_count); ++ ++ for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) { ++ vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { ++ counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; ++ counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; ++ counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; ++ ++ unsigned char sha1_result[20]; ++ _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME], ++ strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]), ++ sha1_result); ++ ++ memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); ++ } ++ ++ vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, ++ &out_desc, desc) { ++ desc->flags = 0; ++ snprintf(desc->name, sizeof(desc->name), "%s", ++ v3d_performance_counters[i][V3D_PERFCNT_NAME]); ++ snprintf(desc->category, sizeof(desc->category), "%s", ++ v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]); ++ snprintf(desc->description, sizeof(desc->description), "%s", ++ v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]); ++ } ++ } ++ ++ return vk_outarray_status(&out); ++} +diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c +index efe63de425c5..6eed2de9d543 100644 +--- a/src/broadcom/vulkan/v3dvx_queue.c ++++ b/src/broadcom/vulkan/v3dvx_queue.c +@@ -29,7 +29,8 @@ + void + v3dX(job_emit_noop)(struct v3dv_job *job) + { +- v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, V3D_INTERNAL_BPP_32, false); ++ v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, ++ V3D_INTERNAL_BPP_32, 4, false); + v3dX(job_emit_binning_flush)(job); + + struct v3dv_cl *rcl = &job->rcl; +@@ -42,14 +43,29 @@ v3dX(job_emit_noop)(struct v3dv_job *job) + config.image_height_pixels = 1; + config.number_of_render_targets = 1; + config.multisample_mode_4x = false; ++#if V3D_VERSION == 42 + config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32; ++#endif ++#if V3D_VERSION >= 71 ++ config.log2_tile_width = 3; /* Tile size 64 */ ++ config.log2_tile_height = 3; /* Tile size 64 */ ++#endif + } + ++#if V3D_VERSION == 42 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32; + rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8; + rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.internal_bpp = V3D_INTERNAL_BPP_32; ++ rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8; ++ rt.stride = 1; /* Unused RT */ ++ } ++#endif + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { + clear.z_clear_value = 1.0f; +diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build +index dfa1e88097b9..289473d2ca13 100644 +--- a/src/gallium/drivers/v3d/meson.build ++++ b/src/gallium/drivers/v3d/meson.build +@@ -34,7 +34,6 @@ files_libv3d = files( + 'v3d_query.c', + 'v3d_query.h', + 'v3d_query_pipe.c', +- 'v3d_query_perfcnt.c', + 'v3d_resource.c', + 'v3d_resource.h', + 'v3d_screen.c', +@@ -47,8 +46,10 @@ 
files_per_version = files( + 'v3dx_emit.c', + 'v3dx_format_table.c', + 'v3dx_job.c', ++ 'v3dx_query_perfcnt.c', + 'v3dx_rcl.c', + 'v3dx_state.c', ++ 'v3dx_tfu.c', + ) + + v3d_args = ['-DV3D_BUILD_NEON'] +@@ -58,7 +59,7 @@ if dep_v3dv3.found() + v3d_args += '-DUSE_V3D_SIMULATOR' + endif + +-v3d_versions = ['33', '42'] ++v3d_versions = ['33', '42', '71'] + + per_version_libs = [] + foreach ver : v3d_versions +diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c +index 0260bdde6d1c..51ddc292ff73 100644 +--- a/src/gallium/drivers/v3d/v3d_blit.c ++++ b/src/gallium/drivers/v3d/v3d_blit.c +@@ -210,140 +210,6 @@ v3d_stencil_blit(struct pipe_context *ctx, struct pipe_blit_info *info) + info->mask &= ~PIPE_MASK_S; + } + +-static bool +-v3d_tfu(struct pipe_context *pctx, +- struct pipe_resource *pdst, +- struct pipe_resource *psrc, +- unsigned int src_level, +- unsigned int base_level, +- unsigned int last_level, +- unsigned int src_layer, +- unsigned int dst_layer, +- bool for_mipmap) +-{ +- struct v3d_context *v3d = v3d_context(pctx); +- struct v3d_screen *screen = v3d->screen; +- struct v3d_resource *src = v3d_resource(psrc); +- struct v3d_resource *dst = v3d_resource(pdst); +- struct v3d_resource_slice *src_base_slice = &src->slices[src_level]; +- struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level]; +- int msaa_scale = pdst->nr_samples > 1 ? 2 : 1; +- int width = u_minify(pdst->width0, base_level) * msaa_scale; +- int height = u_minify(pdst->height0, base_level) * msaa_scale; +- enum pipe_format pformat; +- +- if (psrc->format != pdst->format) +- return false; +- if (psrc->nr_samples != pdst->nr_samples) +- return false; +- +- /* Can't write to raster. */ +- if (dst_base_slice->tiling == V3D_TILING_RASTER) +- return false; +- +- /* When using TFU for blit, we are doing exact copies (both input and +- * output format must be the same, no scaling, etc), so there is no +- * pixel format conversions. Thus we can rewrite the format to use one +- * that is TFU compatible based on its texel size. +- */ +- if (for_mipmap) { +- pformat = pdst->format; +- } else { +- switch (dst->cpp) { +- case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break; +- case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break; +- case 4: pformat = PIPE_FORMAT_R32_FLOAT; break; +- case 2: pformat = PIPE_FORMAT_R16_FLOAT; break; +- case 1: pformat = PIPE_FORMAT_R8_UNORM; break; +- default: unreachable("unsupported format bit-size"); break; +- }; +- } +- +- uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat); +- struct v3d_device_info *devinfo = &screen->devinfo; +- +- if (!v3d_X(devinfo, tfu_supports_tex_format)(tex_format, for_mipmap)) { +- assert(for_mipmap); +- return false; +- } +- +- v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false); +- v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false); +- +- struct drm_v3d_submit_tfu tfu = { +- .ios = (height << 16) | width, +- .bo_handles = { +- dst->bo->handle, +- src != dst ? 
src->bo->handle : 0 +- }, +- .in_sync = v3d->out_sync, +- .out_sync = v3d->out_sync, +- }; +- uint32_t src_offset = (src->bo->offset + +- v3d_layer_offset(psrc, src_level, src_layer)); +- tfu.iia |= src_offset; +- if (src_base_slice->tiling == V3D_TILING_RASTER) { +- tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER << +- V3D33_TFU_ICFG_FORMAT_SHIFT); +- } else { +- tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE + +- (src_base_slice->tiling - V3D_TILING_LINEARTILE)) << +- V3D33_TFU_ICFG_FORMAT_SHIFT); +- } +- +- uint32_t dst_offset = (dst->bo->offset + +- v3d_layer_offset(pdst, base_level, dst_layer)); +- tfu.ioa |= dst_offset; +- if (last_level != base_level) +- tfu.ioa |= V3D33_TFU_IOA_DIMTW; +- tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE + +- (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) << +- V3D33_TFU_IOA_FORMAT_SHIFT); +- +- tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT; +- tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT; +- +- switch (src_base_slice->tiling) { +- case V3D_TILING_UIF_NO_XOR: +- case V3D_TILING_UIF_XOR: +- tfu.iis |= (src_base_slice->padded_height / +- (2 * v3d_utile_height(src->cpp))); +- break; +- case V3D_TILING_RASTER: +- tfu.iis |= src_base_slice->stride / src->cpp; +- break; +- case V3D_TILING_LINEARTILE: +- case V3D_TILING_UBLINEAR_1_COLUMN: +- case V3D_TILING_UBLINEAR_2_COLUMN: +- break; +- } +- +- /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the +- * OPAD field for the destination (how many extra UIF blocks beyond +- * those necessary to cover the height). When filling mipmaps, the +- * miplevel 1+ tiling state is inferred. +- */ +- if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR || +- dst_base_slice->tiling == V3D_TILING_UIF_XOR) { +- int uif_block_h = 2 * v3d_utile_height(dst->cpp); +- int implicit_padded_height = align(height, uif_block_h); +- +- tfu.icfg |= (((dst_base_slice->padded_height - +- implicit_padded_height) / uif_block_h) << +- V3D33_TFU_ICFG_OPAD_SHIFT); +- } +- +- int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu); +- if (ret != 0) { +- fprintf(stderr, "Failed to submit TFU job: %d\n", ret); +- return false; +- } +- +- dst->writes++; +- +- return true; +-} +- + bool + v3d_generate_mipmap(struct pipe_context *pctx, + struct pipe_resource *prsc, +@@ -362,12 +228,16 @@ v3d_generate_mipmap(struct pipe_context *pctx, + if (first_layer != last_layer) + return false; + +- return v3d_tfu(pctx, +- prsc, prsc, +- base_level, +- base_level, last_level, +- first_layer, first_layer, +- true); ++ struct v3d_context *v3d = v3d_context(pctx); ++ struct v3d_screen *screen = v3d->screen; ++ struct v3d_device_info *devinfo = &screen->devinfo; ++ ++ return v3d_X(devinfo, tfu)(pctx, ++ prsc, prsc, ++ base_level, ++ base_level, last_level, ++ first_layer, first_layer, ++ true); + } + + static void +@@ -396,11 +266,15 @@ v3d_tfu_blit(struct pipe_context *pctx, struct pipe_blit_info *info) + if (info->dst.format != info->src.format) + return; + +- if (v3d_tfu(pctx, info->dst.resource, info->src.resource, +- info->src.level, +- info->dst.level, info->dst.level, +- info->src.box.z, info->dst.box.z, +- false)) { ++ struct v3d_context *v3d = v3d_context(pctx); ++ struct v3d_screen *screen = v3d->screen; ++ struct v3d_device_info *devinfo = &screen->devinfo; ++ ++ if (v3d_X(devinfo, tfu)(pctx, info->dst.resource, info->src.resource, ++ info->src.level, ++ info->dst.level, info->dst.level, ++ info->src.box.z, info->dst.box.z, ++ false)) { + info->mask &= ~PIPE_MASK_RGBA; + } + } +@@ -495,7 +369,7 @@ 
v3d_tlb_blit(struct pipe_context *pctx, struct pipe_blit_info *info) + bool double_buffer = V3D_DBG(DOUBLE_BUFFER) && !msaa; + + uint32_t tile_width, tile_height, max_bpp; +- v3d_get_tile_buffer_size(msaa, double_buffer, ++ v3d_get_tile_buffer_size(devinfo, msaa, double_buffer, + is_color_blit ? 1 : 0, surfaces, src_surf, + &tile_width, &tile_height, &max_bpp); + +diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c +index f12e8c92139c..1dc4bd017fe7 100644 +--- a/src/gallium/drivers/v3d/v3d_context.c ++++ b/src/gallium/drivers/v3d/v3d_context.c +@@ -220,7 +220,8 @@ v3d_flag_dirty_sampler_state(struct v3d_context *v3d, + } + + void +-v3d_get_tile_buffer_size(bool is_msaa, ++v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo, ++ bool is_msaa, + bool double_buffer, + uint32_t nr_cbufs, + struct pipe_surface **cbufs, +@@ -232,11 +233,13 @@ v3d_get_tile_buffer_size(bool is_msaa, + assert(!is_msaa || !double_buffer); + + uint32_t max_cbuf_idx = 0; ++ uint32_t total_bpp = 0; + *max_bpp = 0; + for (int i = 0; i < nr_cbufs; i++) { + if (cbufs[i]) { + struct v3d_surface *surf = v3d_surface(cbufs[i]); + *max_bpp = MAX2(*max_bpp, surf->internal_bpp); ++ total_bpp += 4 * v3d_internal_bpp_words(surf->internal_bpp); + max_cbuf_idx = MAX2(i, max_cbuf_idx); + } + } +@@ -245,9 +248,11 @@ v3d_get_tile_buffer_size(bool is_msaa, + struct v3d_surface *bsurf = v3d_surface(bbuf); + assert(bbuf->texture->nr_samples <= 1 || is_msaa); + *max_bpp = MAX2(*max_bpp, bsurf->internal_bpp); ++ total_bpp += 4 * v3d_internal_bpp_words(bsurf->internal_bpp); + } + +- v3d_choose_tile_size(max_cbuf_idx + 1, *max_bpp, ++ v3d_choose_tile_size(devinfo, max_cbuf_idx + 1, ++ *max_bpp, total_bpp, + is_msaa, double_buffer, + tile_width, tile_height); + } +diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h +index 97850b0363eb..eb184b4b2036 100644 +--- a/src/gallium/drivers/v3d/v3d_context.h ++++ b/src/gallium/drivers/v3d/v3d_context.h +@@ -265,6 +265,7 @@ struct v3d_vertex_stateobj { + unsigned num_elements; + + uint8_t attrs[16 * (V3D_MAX_VS_INPUTS / 4)]; ++ /* defaults can be NULL for some hw generation */ + struct pipe_resource *defaults; + uint32_t defaults_offset; + }; +@@ -794,7 +795,8 @@ void v3d_ensure_prim_counts_allocated(struct v3d_context *ctx); + void v3d_flag_dirty_sampler_state(struct v3d_context *v3d, + enum pipe_shader_type shader); + +-void v3d_get_tile_buffer_size(bool is_msaa, ++void v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo, ++ bool is_msaa, + bool double_buffer, + uint32_t nr_cbufs, + struct pipe_surface **cbufs, +@@ -818,16 +820,52 @@ void v3d_disk_cache_store(struct v3d_context *v3d, + + /* Helper to call hw ver specific functions */ + #define v3d_X(devinfo, thing) ({ \ +- __typeof(&v3d42_##thing) v3d_X_thing; \ +- if ((devinfo)->ver >= 42) \ +- v3d_X_thing = &v3d42_##thing; \ +- else if ((devinfo)->ver >= 33) \ ++ __typeof(&v3d33_##thing) v3d_X_thing; \ ++ switch (devinfo->ver) { \ ++ case 33: \ ++ case 40: \ + v3d_X_thing = &v3d33_##thing; \ +- else \ ++ break; \ ++ case 42: \ ++ v3d_X_thing = &v3d42_##thing; \ ++ break; \ ++ case 71: \ ++ v3d_X_thing = &v3d71_##thing; \ ++ break; \ ++ default: \ + unreachable("Unsupported hardware generation"); \ ++ } \ + v3d_X_thing; \ + }) + ++/* FIXME: The same for vulkan/opengl. Common place? define it at the ++ * v3d_packet files? 
++ */ ++#define V3D33_CLIPPER_XY_GRANULARITY 256.0f ++#define V3D42_CLIPPER_XY_GRANULARITY 256.0f ++#define V3D71_CLIPPER_XY_GRANULARITY 64.0f ++ ++/* Helper to get hw-specific macro values */ ++#define V3DV_X(devinfo, thing) ({ \ ++ __typeof(V3D33_##thing) V3D_X_THING; \ ++ switch (devinfo->ver) { \ ++ case 33: \ ++ case 40: \ ++ V3D_X_THING = V3D33_##thing; \ ++ break; \ ++ case 41: \ ++ case 42: \ ++ V3D_X_THING = V3D42_##thing; \ ++ break; \ ++ case 71: \ ++ V3D_X_THING = V3D71_##thing; \ ++ break; \ ++ default: \ ++ unreachable("Unsupported hardware generation"); \ ++ } \ ++ V3D_X_THING; \ ++}) ++ + #ifdef v3dX + # include "v3dx_context.h" + #else +@@ -838,6 +876,10 @@ void v3d_disk_cache_store(struct v3d_context *v3d, + # define v3dX(x) v3d42_##x + # include "v3dx_context.h" + # undef v3dX ++ ++# define v3dX(x) v3d71_##x ++# include "v3dx_context.h" ++# undef v3dX + #endif + + #endif /* V3D_CONTEXT_H */ +diff --git a/src/gallium/drivers/v3d/v3d_job.c b/src/gallium/drivers/v3d/v3d_job.c +index b022ed45073e..577890a06c31 100644 +--- a/src/gallium/drivers/v3d/v3d_job.c ++++ b/src/gallium/drivers/v3d/v3d_job.c +@@ -383,9 +383,11 @@ v3d_get_job_for_fbo(struct v3d_context *v3d) + job->double_buffer = false; + } + +- v3d_get_tile_buffer_size(job->msaa, job->double_buffer, ++ v3d_get_tile_buffer_size(&v3d->screen->devinfo, ++ job->msaa, job->double_buffer, + job->nr_cbufs, job->cbufs, job->bbuf, +- &job->tile_width, &job->tile_height, ++ &job->tile_width, ++ &job->tile_height, + &job->internal_bpp); + + /* The dirty flags are tracking what's been updated while v3d->job has +diff --git a/src/gallium/drivers/v3d/v3d_query.c b/src/gallium/drivers/v3d/v3d_query.c +index db98c89625f5..83f82e44a3df 100644 +--- a/src/gallium/drivers/v3d/v3d_query.c ++++ b/src/gallium/drivers/v3d/v3d_query.c +@@ -28,8 +28,11 @@ v3d_get_driver_query_group_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_group_info *info) + { + struct v3d_screen *screen = v3d_screen(pscreen); ++ struct v3d_device_info *devinfo = &screen->devinfo; + +- return v3d_get_driver_query_group_info_perfcnt(screen, index, info); ++ return v3d_X(devinfo, get_driver_query_group_info_perfcnt)(screen, ++ index, ++ info); + } + + int +@@ -37,8 +40,11 @@ v3d_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info) + { + struct v3d_screen *screen = v3d_screen(pscreen); ++ struct v3d_device_info *devinfo = &screen->devinfo; + +- return v3d_get_driver_query_info_perfcnt(screen, index, info); ++ return v3d_X(devinfo, get_driver_query_info_perfcnt)(screen, ++ index, ++ info); + } + + static struct pipe_query * +@@ -53,9 +59,13 @@ static struct pipe_query * + v3d_create_batch_query(struct pipe_context *pctx, unsigned num_queries, + unsigned *query_types) + { +- return v3d_create_batch_query_perfcnt(v3d_context(pctx), +- num_queries, +- query_types); ++ struct v3d_context *v3d = v3d_context(pctx); ++ struct v3d_screen *screen = v3d->screen; ++ struct v3d_device_info *devinfo = &screen->devinfo; ++ ++ return v3d_X(devinfo, create_batch_query_perfcnt)(v3d_context(pctx), ++ num_queries, ++ query_types); + } + + static void +diff --git a/src/gallium/drivers/v3d/v3d_query.h b/src/gallium/drivers/v3d/v3d_query.h +index 3e1426b8d867..605ed1a12f9d 100644 +--- a/src/gallium/drivers/v3d/v3d_query.h ++++ b/src/gallium/drivers/v3d/v3d_query.h +@@ -42,11 +42,5 @@ struct v3d_query + }; + + struct pipe_query *v3d_create_query_pipe(struct v3d_context *v3d, unsigned query_type, unsigned index); 
+-struct pipe_query *v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries, +- unsigned *query_types); +-int v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index, +- struct pipe_driver_query_group_info *info); +-int v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index, +- struct pipe_driver_query_info *info); + + #endif /* V3D_QUERY_H */ +diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c +index 98ca9bb69e62..53bfb28924f4 100644 +--- a/src/gallium/drivers/v3d/v3d_screen.c ++++ b/src/gallium/drivers/v3d/v3d_screen.c +@@ -255,9 +255,8 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + return V3D_MAX_ARRAY_LAYERS; + +- /* Render targets. */ + case PIPE_CAP_MAX_RENDER_TARGETS: +- return 4; ++ return V3D_MAX_RENDER_TARGETS(screen->devinfo.ver); + + case PIPE_CAP_VENDOR_ID: + return 0x14E4; +diff --git a/src/gallium/drivers/v3d/v3d_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c +index 95eb838954f1..64c217d4f6c6 100644 +--- a/src/gallium/drivers/v3d/v3d_uniforms.c ++++ b/src/gallium/drivers/v3d/v3d_uniforms.c +@@ -261,6 +261,7 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job, + struct v3d_compiled_shader *shader, + enum pipe_shader_type stage) + { ++ struct v3d_device_info *devinfo = &v3d->screen->devinfo; + struct v3d_constbuf_stateobj *cb = &v3d->constbuf[stage]; + struct v3d_texture_stateobj *texstate = &v3d->tex[stage]; + struct v3d_uniform_list *uinfo = &shader->prog_data.base->uniforms; +@@ -292,13 +293,16 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job, + case QUNIFORM_UNIFORM: + cl_aligned_u32(&uniforms, gallium_uniforms[data]); + break; +- case QUNIFORM_VIEWPORT_X_SCALE: +- cl_aligned_f(&uniforms, v3d->viewport.scale[0] * 256.0f); ++ case QUNIFORM_VIEWPORT_X_SCALE: { ++ float clipper_xy_granularity = V3DV_X(devinfo, CLIPPER_XY_GRANULARITY); ++ cl_aligned_f(&uniforms, v3d->viewport.scale[0] * clipper_xy_granularity); + break; +- case QUNIFORM_VIEWPORT_Y_SCALE: +- cl_aligned_f(&uniforms, v3d->viewport.scale[1] * 256.0f); ++ } ++ case QUNIFORM_VIEWPORT_Y_SCALE: { ++ float clipper_xy_granularity = V3DV_X(devinfo, CLIPPER_XY_GRANULARITY); ++ cl_aligned_f(&uniforms, v3d->viewport.scale[1] * clipper_xy_granularity); + break; +- ++ } + case QUNIFORM_VIEWPORT_Z_OFFSET: + cl_aligned_f(&uniforms, v3d->viewport.translate[2]); + break; +diff --git a/src/gallium/drivers/v3d/v3dx_context.h b/src/gallium/drivers/v3d/v3dx_context.h +index 03d7c244ea2b..c487ac3b9965 100644 +--- a/src/gallium/drivers/v3d/v3dx_context.h ++++ b/src/gallium/drivers/v3d/v3dx_context.h +@@ -51,3 +51,23 @@ void v3dX(get_internal_type_bpp_for_output_format)(uint32_t format, + */ + bool v3dX(tfu_supports_tex_format)(uint32_t tex_format, + bool for_mipmap); ++ ++bool v3dX(tfu)(struct pipe_context *pctx, ++ struct pipe_resource *pdst, ++ struct pipe_resource *psrc, ++ unsigned int src_level, ++ unsigned int base_level, ++ unsigned int last_level, ++ unsigned int src_layer, ++ unsigned int dst_layer, ++ bool for_mipmap); ++ ++int v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen, ++ unsigned index, ++ struct pipe_driver_query_group_info *info); ++int v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen, ++ unsigned index, ++ struct pipe_driver_query_info *info); ++struct pipe_query *v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d, ++ unsigned num_queries, ++ unsigned 
*query_types); +diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c +index 17442500ea96..4e1af41d50e0 100644 +--- a/src/gallium/drivers/v3d/v3dx_draw.c ++++ b/src/gallium/drivers/v3d/v3dx_draw.c +@@ -95,7 +95,25 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) + #endif + + assert(!job->msaa || !job->double_buffer); +-#if V3D_VERSION >= 40 ++#if V3D_VERSION >= 71 ++ cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { ++ config.width_in_pixels = job->draw_width; ++ config.height_in_pixels = job->draw_height; ++ ++ config.log2_tile_width = log2_tile_size(job->tile_width); ++ config.log2_tile_height = log2_tile_size(job->tile_height); ++ ++ /* FIXME: ideally we would like next assert on the packet header (as it is ++ * general, so it also applies to GL). We would need to expand ++ * gen_pack_header for that. ++ */ ++ assert(config.log2_tile_width == config.log2_tile_height || ++ config.log2_tile_width == config.log2_tile_height + 1); ++ } ++ ++#endif ++ ++#if V3D_VERSION >= 40 && V3D_VERSION <= 42 + cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { + config.width_in_pixels = job->draw_width; + config.height_in_pixels = job->draw_height; +@@ -107,7 +125,8 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) + + config.maximum_bpp_of_all_render_targets = job->internal_bpp; + } +-#else /* V3D_VERSION < 40 */ ++#endif ++#if V3D_VERSION < 40 + /* "Binning mode lists start with a Tile Binning Mode Configuration + * item (120)" + * +@@ -134,7 +153,7 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) + + config.maximum_bpp_of_all_render_targets = job->internal_bpp; + } +-#endif /* V3D_VERSION < 40 */ ++#endif + + /* There's definitely nothing in the VCD cache we want. */ + cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin); +@@ -377,7 +396,9 @@ v3d_emit_gs_state_record(struct v3d_job *job, + gs_bin->prog_data.gs->base.threads == 4; + shader.geometry_bin_mode_shader_start_in_final_thread_section = + gs_bin->prog_data.gs->base.single_seg; ++#if V3D_VERSION <= 42 + shader.geometry_bin_mode_shader_propagate_nans = true; ++#endif + shader.geometry_bin_mode_shader_uniforms_address = + gs_bin_uniforms; + +@@ -387,7 +408,9 @@ v3d_emit_gs_state_record(struct v3d_job *job, + gs->prog_data.gs->base.threads == 4; + shader.geometry_render_mode_shader_start_in_final_thread_section = + gs->prog_data.gs->base.single_seg; ++#if V3D_VERSION <= 42 + shader.geometry_render_mode_shader_propagate_nans = true; ++#endif + shader.geometry_render_mode_shader_uniforms_address = + gs_render_uniforms; + } +@@ -638,10 +661,6 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, + shader.number_of_varyings_in_fragment_shader = + v3d->prog.fs->prog_data.fs->num_inputs; + +- shader.coordinate_shader_propagate_nans = true; +- shader.vertex_shader_propagate_nans = true; +- shader.fragment_shader_propagate_nans = true; +- + shader.coordinate_shader_code_address = + cl_address(v3d_resource(v3d->prog.cs->resource)->bo, + v3d->prog.cs->offset); +@@ -652,6 +671,11 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, + cl_address(v3d_resource(v3d->prog.fs->resource)->bo, + v3d->prog.fs->offset); + ++#if V3D_VERSION <= 42 ++ shader.coordinate_shader_propagate_nans = true; ++ shader.vertex_shader_propagate_nans = true; ++ shader.fragment_shader_propagate_nans = true; ++ + /* XXX: Use combined input/output size flag in the common + * case. 
+ */ +@@ -659,13 +683,24 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, + v3d->prog.cs->prog_data.vs->separate_segments; + shader.vertex_shader_has_separate_input_and_output_vpm_blocks = + v3d->prog.vs->prog_data.vs->separate_segments; +- + shader.coordinate_shader_input_vpm_segment_size = + v3d->prog.cs->prog_data.vs->separate_segments ? + v3d->prog.cs->prog_data.vs->vpm_input_size : 1; + shader.vertex_shader_input_vpm_segment_size = + v3d->prog.vs->prog_data.vs->separate_segments ? + v3d->prog.vs->prog_data.vs->vpm_input_size : 1; ++#endif ++ /* On V3D 7.1 there isn't a specific flag to set if we are using ++ * shared/separate segments or not. We just set the value of ++ * vpm_input_size to 0, and set output to the max needed. That should be ++ * already properly set on prog_data_vs_bin ++ */ ++#if V3D_VERSION == 71 ++ shader.coordinate_shader_input_vpm_segment_size = ++ v3d->prog.cs->prog_data.vs->vpm_input_size; ++ shader.vertex_shader_input_vpm_segment_size = ++ v3d->prog.vs->prog_data.vs->vpm_input_size; ++#endif + + shader.coordinate_shader_output_vpm_segment_size = + v3d->prog.cs->prog_data.vs->vpm_output_size; +@@ -724,9 +759,11 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, + shader.instance_id_read_by_vertex_shader = + v3d->prog.vs->prog_data.vs->uses_iid; + ++#if V3D_VERSION <= 42 + shader.address_of_default_attribute_values = + cl_address(v3d_resource(vtx->defaults)->bo, + vtx->defaults_offset); ++#endif + } + + bool cs_loaded_any = false; +@@ -1436,8 +1473,15 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) + submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT; + + +- /* Number of batches the dispatch will invoke (minus 1). */ +- submit.cfg[4] = num_batches - 1; ++ /* Number of batches the dispatch will invoke. ++ * V3D 7.1.6 and later don't subtract 1 from the number of batches ++ */ ++ if (v3d->screen->devinfo.ver < 71 || ++ (v3d->screen->devinfo.ver == 71 && v3d->screen->devinfo.rev < 6)) { ++ submit.cfg[4] = num_batches - 1; ++ } else { ++ submit.cfg[4] = num_batches; ++ } + + /* Make sure we didn't accidentally underflow. */ + assert(submit.cfg[4] != ~0); +@@ -1445,7 +1489,8 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) + v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo); + submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset + + v3d->prog.compute->offset); +- submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; ++ if (v3d->screen->devinfo.ver < 71) ++ submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; + if (v3d->prog.compute->prog_data.base->single_seg) + submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; + if (v3d->prog.compute->prog_data.base->threads == 4) +@@ -1560,9 +1605,10 @@ v3d_tlb_clear(struct v3d_job *job, unsigned buffers, + /* GFXH-1461: If we were to emit a load of just depth or just stencil, + * then the clear for the other may get lost. We need to decide now + * if it would be possible to need to emit a load of just one after +- * we've set up our TLB clears. ++ * we've set up our TLB clears. This issue is fixed since V3D 4.3.18. 
+ */ +- if (buffers & PIPE_CLEAR_DEPTHSTENCIL && ++ if (v3d->screen->devinfo.ver <= 42 && ++ buffers & PIPE_CLEAR_DEPTHSTENCIL && + (buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL && + job->zsbuf && + util_format_is_depth_and_stencil(job->zsbuf->texture->format)) { +diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c +index 0ad3fb68b1e2..ee17b935e196 100644 +--- a/src/gallium/drivers/v3d/v3dx_emit.c ++++ b/src/gallium/drivers/v3d/v3dx_emit.c +@@ -512,13 +512,17 @@ v3dX(emit_state)(struct pipe_context *pctx) + /* Note: EZ state may update based on the compiled FS, + * along with ZSA + */ ++#if V3D_VERSION <= 42 + config.early_z_updates_enable = + (job->ez_state != V3D_EZ_DISABLED); ++#endif + if (v3d->zsa->base.depth_enabled) { + config.z_updates_enable = + v3d->zsa->base.depth_writemask; ++#if V3D_VERSION <= 42 + config.early_z_enable = + config.early_z_updates_enable; ++#endif + config.depth_test_function = + v3d->zsa->base.depth_func; + } else { +@@ -535,13 +539,28 @@ v3dX(emit_state)(struct pipe_context *pctx) + v3d_line_smoothing_enabled(v3d) ? + V3D_LINE_RASTERIZATION_PERP_END_CAPS : + V3D_LINE_RASTERIZATION_DIAMOND_EXIT; +- } + ++#if V3D_VERSION >= 71 ++ /* The following follows the logic implemented in v3dv ++ * plus the definition of depth_clip_near/far and ++ * depth_clamp. ++ * ++ * Note: some extensions are not supported by v3d ++ * (like ARB_depth_clamp) that would affect this, but ++ * the values on rasterizer are taking that into ++ * account. ++ */ ++ config.z_clipping_mode = v3d->rasterizer->base.depth_clip_near || ++ v3d->rasterizer->base.depth_clip_far ? ++ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_NONE; ++#endif ++ } + } + + if (v3d->dirty & V3D_DIRTY_RASTERIZER && + v3d->rasterizer->base.offset_tri) { +- if (job->zsbuf && ++ if (v3d->screen->devinfo.ver <= 42 && ++ job->zsbuf && + job->zsbuf->format == PIPE_FORMAT_Z16_UNORM) { + cl_emit_prepacked_sized(&job->bcl, + v3d->rasterizer->depth_offset_z16, +@@ -564,12 +583,23 @@ v3dX(emit_state)(struct pipe_context *pctx) + } + + if (v3d->dirty & V3D_DIRTY_VIEWPORT) { ++#if V3D_VERSION <= 42 + cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { + clip.viewport_half_width_in_1_256th_of_pixel = + v3d->viewport.scale[0] * 256.0f; + clip.viewport_half_height_in_1_256th_of_pixel = + v3d->viewport.scale[1] * 256.0f; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { ++ clip.viewport_half_width_in_1_64th_of_pixel = ++ v3d->viewport.scale[0] * 64.0f; ++ clip.viewport_half_height_in_1_64th_of_pixel = ++ v3d->viewport.scale[1] * 64.0f; ++ } ++#endif ++ + + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { + clip.viewport_z_offset_zc_to_zs = +@@ -633,8 +663,10 @@ v3dX(emit_state)(struct pipe_context *pctx) + } + #endif + ++ const uint32_t max_rts = ++ V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver); + if (blend->base.independent_blend_enable) { +- for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) ++ for (int i = 0; i < max_rts; i++) + emit_rt_blend(v3d, job, &blend->base, i, + (1 << i), + v3d->blend_dst_alpha_one & (1 << i)); +@@ -650,16 +682,16 @@ v3dX(emit_state)(struct pipe_context *pctx) + * RTs without. 
+ */ + emit_rt_blend(v3d, job, &blend->base, 0, +- ((1 << V3D_MAX_DRAW_BUFFERS) - 1) & ++ ((1 << max_rts) - 1) & + v3d->blend_dst_alpha_one, + true); + emit_rt_blend(v3d, job, &blend->base, 0, +- ((1 << V3D_MAX_DRAW_BUFFERS) - 1) & ++ ((1 << max_rts) - 1) & + ~v3d->blend_dst_alpha_one, + false); + } else { + emit_rt_blend(v3d, job, &blend->base, 0, +- (1 << V3D_MAX_DRAW_BUFFERS) - 1, ++ (1 << max_rts) - 1, + v3d->blend_dst_alpha_one); + } + } +@@ -668,8 +700,10 @@ v3dX(emit_state)(struct pipe_context *pctx) + if (v3d->dirty & V3D_DIRTY_BLEND) { + struct pipe_blend_state *blend = &v3d->blend->base; + ++ const uint32_t max_rts = ++ V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver); + cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { +- for (int i = 0; i < 4; i++) { ++ for (int i = 0; i < max_rts; i++) { + int rt = blend->independent_blend_enable ? i : 0; + int rt_mask = blend->rt[rt].colormask; + +diff --git a/src/gallium/drivers/v3d/v3d_query_perfcnt.c b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c +similarity index 94% +rename from src/gallium/drivers/v3d/v3d_query_perfcnt.c +rename to src/gallium/drivers/v3d/v3dx_query_perfcnt.c +index e00d84e375f0..431aad14b4fa 100644 +--- a/src/gallium/drivers/v3d/v3d_query_perfcnt.c ++++ b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c +@@ -52,8 +52,8 @@ kperfmon_destroy(struct v3d_context *v3d, struct v3d_perfmon_state *perfmon) + } + + int +-v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index, +- struct pipe_driver_query_group_info *info) ++v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen, unsigned index, ++ struct pipe_driver_query_group_info *info) + { + if (!screen->has_perfmon) + return 0; +@@ -72,8 +72,8 @@ v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned inde + } + + int +-v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index, +- struct pipe_driver_query_info *info) ++v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen, unsigned index, ++ struct pipe_driver_query_info *info) + { + if (!screen->has_perfmon) + return 0; +@@ -222,8 +222,8 @@ static const struct v3d_query_funcs perfcnt_query_funcs = { + }; + + struct pipe_query * +-v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries, +- unsigned *query_types) ++v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d, unsigned num_queries, ++ unsigned *query_types) + { + struct v3d_query_perfcnt *pquery = NULL; + struct v3d_query *query; +diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c +index 82547437c252..8eabc5ea6263 100644 +--- a/src/gallium/drivers/v3d/v3dx_rcl.c ++++ b/src/gallium/drivers/v3d/v3dx_rcl.c +@@ -23,8 +23,9 @@ + + #include "util/format/u_format.h" + #include "v3d_context.h" +-#include "broadcom/common/v3d_tiling.h" + #include "broadcom/common/v3d_macros.h" ++#include "broadcom/common/v3d_tiling.h" ++#include "broadcom/common/v3d_util.h" + #include "broadcom/cle/v3dx_pack.h" + + #define PIPE_CLEAR_COLOR_BUFFERS (PIPE_CLEAR_COLOR0 | \ +@@ -419,10 +420,16 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer) + * clearing Z/S. 
+ */
+ if (job->clear) {
++#if V3D_VERSION <= 42
+ cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
+ clear.clear_z_stencil_buffer = !job->early_zs_clear;
+ clear.clear_all_render_targets = true;
+ }
++#endif
++#if V3D_VERSION >= 71
++ cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
++#endif
++
+ }
+ #endif /* V3D_VERSION >= 40 */
+ }
+@@ -483,10 +490,66 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer)
+ }
+ }
+
+-#if V3D_VERSION >= 40
++#if V3D_VERSION > 33
++/* Note that for v71, render target cfg packets have just one field that
++ * combines the internal type and clamp mode. For simplicity we keep just one
++ * helper.
++ *
++ * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
++ *
++ */
++static uint32_t
++v3dX(clamp_for_format_and_type)(uint32_t rt_type,
++ enum pipe_format format)
++{
++#if V3D_VERSION >= 40 && V3D_VERSION <= 42
++ if (util_format_is_srgb(format)) {
++ return V3D_RENDER_TARGET_CLAMP_NORM;
++#if V3D_VERSION >= 42
++ } else if (util_format_is_pure_integer(format)) {
++ return V3D_RENDER_TARGET_CLAMP_INT;
++#endif
++ } else {
++ return V3D_RENDER_TARGET_CLAMP_NONE;
++ }
++#endif
++#if V3D_VERSION >= 71
++ switch (rt_type) {
++ case V3D_INTERNAL_TYPE_8I:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
++ case V3D_INTERNAL_TYPE_8UI:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
++ case V3D_INTERNAL_TYPE_8:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_8;
++ case V3D_INTERNAL_TYPE_16I:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
++ case V3D_INTERNAL_TYPE_16UI:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
++ case V3D_INTERNAL_TYPE_16F:
++ return util_format_is_srgb(format) ?
++ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
++ V3D_RENDER_TARGET_TYPE_CLAMP_16F;
++ case V3D_INTERNAL_TYPE_32I:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
++ case V3D_INTERNAL_TYPE_32UI:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
++ case V3D_INTERNAL_TYPE_32F:
++ return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
++ default:
++ unreachable("Unknown internal render target type");
++ }
++ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
++#endif
++ return 0;
++}
++#endif
++
++#if V3D_VERSION >= 71
+ static void
+-v3d_setup_render_target(struct v3d_job *job, int cbuf,
+- uint32_t *rt_bpp, uint32_t *rt_type, uint32_t *rt_clamp)
++v3d_setup_render_target(struct v3d_job *job,
++ int cbuf,
++ uint32_t *rt_bpp,
++ uint32_t *rt_type_clamp)
+ {
+ if (!job->cbufs[cbuf])
+ return;
+@@ -497,19 +560,35 @@ v3d_setup_render_target(struct v3d_job *job, int cbuf,
+ struct v3d_surface *surf = v3d_surface(job->cbufs[cbuf]);
+ *rt_bpp = surf->internal_bpp;
+ if (job->bbuf) {
+ struct v3d_surface *bsurf = v3d_surface(job->bbuf);
+ *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp);
+ }
+- *rt_type = surf->internal_type;
+- if (util_format_is_srgb(surf->base.format))
+- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
+-#if V3D_VERSION >= 42
+- else if (util_format_is_pure_integer(surf->base.format))
+- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
+-#endif
+- else
+- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
++ *rt_type_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type,
++ surf->base.format);
+ }
++#endif
+
+-#else /* V3D_VERSION < 40 */
++#if V3D_VERSION >= 40 && V3D_VERSION <= 42
++static void
++v3d_setup_render_target(struct v3d_job *job,
++ int cbuf,
++ uint32_t *rt_bpp,
++ uint32_t *rt_type,
++ uint32_t *rt_clamp)
++{
++ if (!job->cbufs[cbuf])
++ return;
++
++ struct v3d_surface *surf = v3d_surface(job->cbufs[cbuf]);
++ *rt_bpp = surf->internal_bpp;
++ if (job->bbuf) {
++ struct v3d_surface *bsurf = v3d_surface(job->bbuf);
++ *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp);
++ }
++ *rt_type = surf->internal_type;
++ *rt_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type,
++ surf->base.format);
++}
++#endif
+
++#if V3D_VERSION < 40
+ static void
+ v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf,
+ struct v3d_resource *rsc, bool is_separate_stencil)
+@@ -656,7 +735,8 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
+ cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+ store.buffer_to_store = NONE;
+ }
+-#else
++#endif
++#if V3D_VERSION >= 40
+ for (int i = 0; i < 2; i++) {
+ if (i > 0)
+ cl_emit(&job->rcl, TILE_COORDINATES, coords);
+@@ -664,16 +744,20 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
+ cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+ store.buffer_to_store = NONE;
+ }
++
+ if (i == 0 || do_double_initial_tile_clear(job)) {
++#if V3D_VERSION < 71
+ cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
+ clear.clear_z_stencil_buffer = !job->early_zs_clear;
+ clear.clear_all_render_targets = true;
+ }
++#else
++ cl_emit(&job->rcl, CLEAR_RENDER_TARGETS, clear);
++#endif
+ }
+ cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
+ }
+ #endif
+-
+ cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
+
+ v3d_rcl_emit_generic_per_tile_list(job, layer);
+@@ -775,18 +859,52 @@ v3dX(emit_rcl)(struct v3d_job *job)
+ config.multisample_mode_4x = job->msaa;
+ config.double_buffer_in_non_ms_mode = job->double_buffer;
+
++#if V3D_VERSION <= 42
+ config.maximum_bpp_of_all_render_targets = job->internal_bpp;
++#endif
++#if V3D_VERSION >= 71
++ config.log2_tile_width = log2_tile_size(job->tile_width);
++ config.log2_tile_height = log2_tile_size(job->tile_height);
++
++ /* FIXME: ideally we would like the next assert on the packet header (as it
++ * is generic, so it also applies to GL). We would need to expand
++ * gen_pack_header for that.
++ */
++ assert(config.log2_tile_width == config.log2_tile_height ||
++ config.log2_tile_width == config.log2_tile_height + 1);
++#endif
++
+ }
+
++#if V3D_VERSION >= 71
++ uint32_t base_addr = 0;
++
++ /* If we don't have any color RTs, we still need to emit one and flag
++ * it as unused using stride = 1.
++ */
++ if (job->nr_cbufs == 0) {
++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++ rt.stride = 1; /* Unused */
++ }
++ }
++#endif
+ for (int i = 0; i < job->nr_cbufs; i++) {
+ struct pipe_surface *psurf = job->cbufs[i];
+- if (!psurf)
++ if (!psurf) {
++#if V3D_VERSION >= 71
++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++ rt.render_target_number = i;
++ rt.stride = 1; /* Unused */
++ }
++#endif
+ continue;
++ }
++
+ struct v3d_surface *surf = v3d_surface(psurf);
+ struct v3d_resource *rsc = v3d_resource(psurf->texture);
+
+ UNUSED uint32_t config_pad = 0;
+- uint32_t clear_pad = 0;
++ UNUSED uint32_t clear_pad = 0;
+
+ /* XXX: Set the pad for raster.
*/ + if (surf->tiling == V3D_TILING_UIF_NO_XOR || +@@ -819,6 +937,7 @@ v3dX(emit_rcl)(struct v3d_job *job) + } + #endif /* V3D_VERSION < 40 */ + ++#if V3D_VERSION <= 42 + cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, + clear) { + clear.clear_color_low_32_bits = job->clear_color[i][0]; +@@ -847,9 +966,42 @@ v3dX(emit_rcl)(struct v3d_job *job) + clear.render_target_number = i; + }; + } ++#endif ++#if V3D_VERSION >= 71 ++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { ++ rt.clear_color_low_bits = job->clear_color[i][0]; ++ v3d_setup_render_target(job, i, &rt.internal_bpp, ++ &rt.internal_type_and_clamping); ++ rt.stride = ++ v3d_compute_rt_row_row_stride_128_bits(job->tile_width, ++ v3d_internal_bpp_words(rt.internal_bpp)); ++ rt.base_address = base_addr; ++ rt.render_target_number = i; ++ ++ base_addr += (job->tile_height * rt.stride) / 8; ++ } ++ ++ if (surf->internal_bpp >= V3D_INTERNAL_BPP_64) { ++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { ++ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ ++ ((uint64_t) job->clear_color[i][1]) | ++ (((uint64_t) (job->clear_color[i][2] & 0xff)) << 32); ++ rt.render_target_number = i; ++ } ++ } ++ ++ if (surf->internal_bpp >= V3D_INTERNAL_BPP_128) { ++ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { ++ rt.clear_color_top_bits = /* 56 bits (24 + 32) */ ++ (((uint64_t) (job->clear_color[i][2] & 0xffffff00)) >> 8) | ++ (((uint64_t) (job->clear_color[i][3])) << 24); ++ rt.render_target_number = i; ++ } ++ } ++#endif + } + +-#if V3D_VERSION >= 40 ++#if V3D_VERSION >= 40 && V3D_VERSION <= 42 + cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + v3d_setup_render_target(job, 0, + &rt.render_target_0_internal_bpp, +diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c +index 0f1735fee666..032a6643fcdc 100644 +--- a/src/gallium/drivers/v3d/v3dx_state.c ++++ b/src/gallium/drivers/v3d/v3dx_state.c +@@ -111,9 +111,10 @@ v3d_create_rasterizer_state(struct pipe_context *pctx, + #endif + } + +- /* The HW treats polygon offset units based on a Z24 buffer, so we ++ /* V3d 4.x treats polygon offset units based on a Z24 buffer, so we + * need to scale up offset_units if we're only Z16. + */ ++#if V3D_VERSION <= 42 + v3dx_pack(&so->depth_offset_z16, DEPTH_OFFSET, depth) { + depth.depth_offset_factor = cso->offset_scale; + depth.depth_offset_units = cso->offset_units * 256.0; +@@ -121,6 +122,7 @@ v3d_create_rasterizer_state(struct pipe_context *pctx, + depth.limit = cso->offset_clamp; + #endif + } ++#endif + + return so; + } +@@ -138,8 +140,9 @@ v3d_create_blend_state(struct pipe_context *pctx, + + so->base = *cso; + ++ uint32_t max_rts = V3D_MAX_RENDER_TARGETS(V3D_VERSION); + if (cso->independent_blend_enable) { +- for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { ++ for (int i = 0; i < max_rts; i++) { + so->blend_enables |= cso->rt[i].blend_enable << i; + + /* V3D 4.x is when we got independent blend enables. 
*/ +@@ -148,7 +151,7 @@ v3d_create_blend_state(struct pipe_context *pctx, + } + } else { + if (cso->rt[0].blend_enable) +- so->blend_enables = (1 << V3D_MAX_DRAW_BUFFERS) - 1; ++ so->blend_enables = (1 << max_rts) - 1; + } + + return so; +@@ -337,6 +340,20 @@ v3d_zsa_state_bind(struct pipe_context *pctx, void *hwcso) + v3d->dirty |= V3D_DIRTY_ZSA; + } + ++ ++static bool ++needs_default_attribute_values(void) ++{ ++#if V3D_VERSION <= 42 ++ /* FIXME: on vulkan we are able to refine even further, as we know in ++ * advance when we create the pipeline if we have an integer vertex ++ * attrib. Pending to check if we could do something similar here. ++ */ ++ return true; ++#endif ++ return false; ++} ++ + static void * + v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, + const struct pipe_vertex_element *elements) +@@ -414,24 +431,29 @@ v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, + } + } + +- /* Set up the default attribute values in case any of the vertex +- * elements use them. +- */ +- uint32_t *attrs; +- u_upload_alloc(v3d->state_uploader, 0, +- V3D_MAX_VS_INPUTS * sizeof(float), 16, +- &so->defaults_offset, &so->defaults, (void **)&attrs); +- +- for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) { +- attrs[i * 4 + 0] = 0; +- attrs[i * 4 + 1] = 0; +- attrs[i * 4 + 2] = 0; +- if (i < so->num_elements && +- util_format_is_pure_integer(so->pipe[i].src_format)) { +- attrs[i * 4 + 3] = 1; +- } else { +- attrs[i * 4 + 3] = fui(1.0); ++ if (needs_default_attribute_values()) { ++ /* Set up the default attribute values in case any of the vertex ++ * elements use them. ++ */ ++ uint32_t *attrs; ++ u_upload_alloc(v3d->state_uploader, 0, ++ V3D_MAX_VS_INPUTS * sizeof(float), 16, ++ &so->defaults_offset, &so->defaults, (void **)&attrs); ++ ++ for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) { ++ attrs[i * 4 + 0] = 0; ++ attrs[i * 4 + 1] = 0; ++ attrs[i * 4 + 2] = 0; ++ if (i < so->num_elements && ++ util_format_is_pure_integer(so->pipe[i].src_format)) { ++ attrs[i * 4 + 3] = 1; ++ } else { ++ attrs[i * 4 + 3] = fui(1.0); ++ } + } ++ } else { ++ so->defaults = NULL; ++ so->defaults_offset = 0; + } + + u_upload_unmap(v3d->state_uploader); +@@ -699,21 +721,22 @@ v3d_upload_sampler_state_variant(void *map, + break; + } + +- if (variant >= V3D_SAMPLER_STATE_32) { +- sampler.border_color_word_0 = border.ui[0]; +- sampler.border_color_word_1 = border.ui[1]; +- sampler.border_color_word_2 = border.ui[2]; +- sampler.border_color_word_3 = border.ui[3]; +- } else { +- sampler.border_color_word_0 = +- _mesa_float_to_half(border.f[0]); +- sampler.border_color_word_1 = +- _mesa_float_to_half(border.f[1]); +- sampler.border_color_word_2 = +- _mesa_float_to_half(border.f[2]); +- sampler.border_color_word_3 = +- _mesa_float_to_half(border.f[3]); ++#if V3D_VERSION <= 42 ++ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions ++ * for us. In V3D 4.x we need to manually convert floating point color ++ * values to the expected format. 
++ */ ++ if (variant < V3D_SAMPLER_STATE_32) { ++ border.ui[0] = _mesa_float_to_half(border.f[0]); ++ border.ui[1] = _mesa_float_to_half(border.f[1]); ++ border.ui[2] = _mesa_float_to_half(border.f[2]); ++ border.ui[3] = _mesa_float_to_half(border.f[3]); + } ++#endif ++ sampler.border_color_word_0 = border.ui[0]; ++ sampler.border_color_word_1 = border.ui[1]; ++ sampler.border_color_word_2 = border.ui[2]; ++ sampler.border_color_word_3 = border.ui[3]; + } + } + } +@@ -869,7 +892,8 @@ v3d_setup_texture_shader_state_from_buffer(struct V3DX(TEXTURE_SHADER_STATE) *te + } + + static void +-v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex, ++v3d_setup_texture_shader_state(const struct v3d_device_info *devinfo, ++ struct V3DX(TEXTURE_SHADER_STATE) *tex, + struct pipe_resource *prsc, + int base_level, int last_level, + int first_layer, int last_layer, +@@ -917,19 +941,29 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex, + } + + tex->base_level = base_level; ++ + #if V3D_VERSION >= 40 + tex->max_level = last_level; + /* Note that we don't have a job to reference the texture's sBO + * at state create time, so any time this sampler view is used + * we need to add the texture to the job. + */ +- tex->texture_base_pointer = +- cl_address(NULL, +- rsc->bo->offset + +- v3d_layer_offset(prsc, 0, first_layer)); ++ const uint32_t base_offset = rsc->bo->offset + ++ v3d_layer_offset(prsc, 0, first_layer); ++ ++ tex->texture_base_pointer = cl_address(NULL, base_offset); + #endif ++ + tex->array_stride_64_byte_aligned = rsc->cube_map_stride / 64; + ++#if V3D_VERSION >= 71 ++ tex->chroma_offset_x = 1; ++ tex->chroma_offset_y = 1; ++ /* See comment in XML field definition for rationale of the shifts */ ++ tex->texture_base_pointer_cb = base_offset >> 6; ++ tex->texture_base_pointer_cr = base_offset >> 6; ++#endif ++ + /* Since other platform devices may produce UIF images even + * when they're not big enough for V3D to assume they're UIF, + * we force images with level 0 as UIF to be always treated +@@ -977,7 +1011,8 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, + + v3dx_pack(map, TEXTURE_SHADER_STATE, tex) { + if (prsc->target != PIPE_BUFFER) { +- v3d_setup_texture_shader_state(&tex, prsc, ++ v3d_setup_texture_shader_state(&v3d->screen->devinfo, ++ &tex, prsc, + cso->u.tex.first_level, + cso->u.tex.last_level, + cso->u.tex.first_layer, +@@ -990,7 +1025,13 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, + cso->u.buf.size); + } + +- tex.srgb = util_format_is_srgb(cso->format); ++ bool is_srgb = util_format_is_srgb(cso->format); ++#if V3D_VERSION <= 42 ++ tex.srgb = is_srgb; ++#endif ++#if V3D_VERSION >= 71 ++ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; ++#endif + + #if V3D_VERSION >= 40 + tex.swizzle_r = v3d_translate_pipe_swizzle(so->swizzle[0]); +@@ -1040,7 +1081,10 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, + * shader code if we wanted to read an MSAA sRGB + * texture without sRGB decode. 
+ */
++#if V3D_VERSION <= 42
+ tex.srgb = false;
++#endif
++
+ } else {
+ tex.texture_type = v3d_get_tex_format(&screen->devinfo,
+ cso->format);
+@@ -1404,7 +1448,8 @@ v3d_create_image_view_texture_shader_state(struct v3d_context *v3d,
+
+ v3dx_pack(map, TEXTURE_SHADER_STATE, tex) {
+ if (prsc->target != PIPE_BUFFER) {
+- v3d_setup_texture_shader_state(&tex, prsc,
++ v3d_setup_texture_shader_state(&v3d->screen->devinfo,
++ &tex, prsc,
+ iview->base.u.tex.level,
+ iview->base.u.tex.level,
+ iview->base.u.tex.first_layer,
+diff --git a/src/gallium/drivers/v3d/v3dx_tfu.c b/src/gallium/drivers/v3d/v3dx_tfu.c
+new file mode 100644
+index 000000000000..f4dba0cfa485
+--- /dev/null
++++ b/src/gallium/drivers/v3d/v3dx_tfu.c
+@@ -0,0 +1,202 @@
++/*
++ * Copyright © 2022 Raspberry Pi Ltd
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "v3d_context.h"
++#include "broadcom/common/v3d_tfu.h"
++
++bool
++v3dX(tfu)(struct pipe_context *pctx,
++ struct pipe_resource *pdst,
++ struct pipe_resource *psrc,
++ unsigned int src_level,
++ unsigned int base_level,
++ unsigned int last_level,
++ unsigned int src_layer,
++ unsigned int dst_layer,
++ bool for_mipmap)
++{
++ struct v3d_context *v3d = v3d_context(pctx);
++ struct v3d_screen *screen = v3d->screen;
++ struct v3d_resource *src = v3d_resource(psrc);
++ struct v3d_resource *dst = v3d_resource(pdst);
++ struct v3d_resource_slice *src_base_slice = &src->slices[src_level];
++ struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level];
++ int msaa_scale = pdst->nr_samples > 1 ? 2 : 1;
++ int width = u_minify(pdst->width0, base_level) * msaa_scale;
++ int height = u_minify(pdst->height0, base_level) * msaa_scale;
++ enum pipe_format pformat;
++
++ if (psrc->format != pdst->format)
++ return false;
++ if (psrc->nr_samples != pdst->nr_samples)
++ return false;
++
++ if (pdst->target != PIPE_TEXTURE_2D || psrc->target != PIPE_TEXTURE_2D)
++ return false;
++
++ /* Can't write to raster. */
++ if (dst_base_slice->tiling == V3D_TILING_RASTER)
++ return false;
++
++ /* When using the TFU for blits, we are doing exact copies (both input and
++ * output formats must be the same, no scaling, etc.), so there are no
++ * pixel format conversions. Thus we can rewrite the format to use one
++ * that is TFU compatible based on its texel size.
++ */ ++ if (for_mipmap) { ++ pformat = pdst->format; ++ } else { ++ switch (dst->cpp) { ++ case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break; ++ case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break; ++ case 4: pformat = PIPE_FORMAT_R32_FLOAT; break; ++ case 2: pformat = PIPE_FORMAT_R16_FLOAT; break; ++ case 1: pformat = PIPE_FORMAT_R8_UNORM; break; ++ default: unreachable("unsupported format bit-size"); break; ++ }; ++ } ++ ++ uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat); ++ ++ if (!v3dX(tfu_supports_tex_format)(tex_format, for_mipmap)) { ++ assert(for_mipmap); ++ return false; ++ } ++ ++ v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false); ++ v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false); ++ ++ struct drm_v3d_submit_tfu tfu = { ++ .ios = (height << 16) | width, ++ .bo_handles = { ++ dst->bo->handle, ++ src != dst ? src->bo->handle : 0 ++ }, ++ .in_sync = v3d->out_sync, ++ .out_sync = v3d->out_sync, ++ }; ++ uint32_t src_offset = (src->bo->offset + ++ v3d_layer_offset(psrc, src_level, src_layer)); ++ tfu.iia |= src_offset; ++ ++ uint32_t dst_offset = (dst->bo->offset + ++ v3d_layer_offset(pdst, base_level, dst_layer)); ++ tfu.ioa |= dst_offset; ++ ++ switch (src_base_slice->tiling) { ++ case V3D_TILING_UIF_NO_XOR: ++ case V3D_TILING_UIF_XOR: ++ tfu.iis |= (src_base_slice->padded_height / ++ (2 * v3d_utile_height(src->cpp))); ++ break; ++ case V3D_TILING_RASTER: ++ tfu.iis |= src_base_slice->stride / src->cpp; ++ break; ++ case V3D_TILING_LINEARTILE: ++ case V3D_TILING_UBLINEAR_1_COLUMN: ++ case V3D_TILING_UBLINEAR_2_COLUMN: ++ break; ++ } ++ ++#if V3D_VERSION <= 42 ++ if (src_base_slice->tiling == V3D_TILING_RASTER) { ++ tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER << ++ V3D33_TFU_ICFG_FORMAT_SHIFT); ++ } else { ++ tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE + ++ (src_base_slice->tiling - V3D_TILING_LINEARTILE)) << ++ V3D33_TFU_ICFG_FORMAT_SHIFT); ++ } ++ tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT; ++ ++ if (last_level != base_level) ++ tfu.ioa |= V3D33_TFU_IOA_DIMTW; ++ ++ tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE + ++ (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) << ++ V3D33_TFU_IOA_FORMAT_SHIFT); ++ ++ tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT; ++ ++ /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the ++ * OPAD field for the destination (how many extra UIF blocks beyond ++ * those necessary to cover the height). When filling mipmaps, the ++ * miplevel 1+ tiling state is inferred. 
++ */ ++ if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR || ++ dst_base_slice->tiling == V3D_TILING_UIF_XOR) { ++ int uif_block_h = 2 * v3d_utile_height(dst->cpp); ++ int implicit_padded_height = align(height, uif_block_h); ++ ++ tfu.icfg |= (((dst_base_slice->padded_height - ++ implicit_padded_height) / uif_block_h) << ++ V3D33_TFU_ICFG_OPAD_SHIFT); ++ } ++#endif /* V3D_VERSION <= 42 */ ++ ++#if V3D_VERSION >= 71 ++ if (src_base_slice->tiling == V3D_TILING_RASTER) { ++ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT; ++ } else { ++ tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE + ++ (src_base_slice->tiling - V3D_TILING_LINEARTILE)) << ++ V3D71_TFU_ICFG_IFORMAT_SHIFT; ++ } ++ tfu.icfg |= tex_format << V3D71_TFU_ICFG_OTYPE_SHIFT; ++ ++ if (last_level != base_level) ++ tfu.v71.ioc |= V3D71_TFU_IOC_DIMTW; ++ ++ tfu.v71.ioc |= ((V3D71_TFU_IOC_FORMAT_LINEARTILE + ++ (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) << ++ V3D71_TFU_IOC_FORMAT_SHIFT); ++ ++ switch (dst_base_slice->tiling) { ++ case V3D_TILING_UIF_NO_XOR: ++ case V3D_TILING_UIF_XOR: ++ tfu.v71.ioc |= ++ (dst_base_slice->padded_height / (2 * v3d_utile_height(dst->cpp))) << ++ V3D71_TFU_IOC_STRIDE_SHIFT; ++ break; ++ case V3D_TILING_RASTER: ++ tfu.v71.ioc |= (dst_base_slice->padded_height / dst->cpp) << ++ V3D71_TFU_IOC_STRIDE_SHIFT; ++ break; ++ default: ++ break; ++ } ++ ++ tfu.v71.ioc |= (last_level - base_level) << V3D71_TFU_IOC_NUMMM_SHIFT; ++#endif /* V3D_VERSION >= 71*/ ++ ++ int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu); ++ if (ret != 0) { ++ fprintf(stderr, "Failed to submit TFU job: %d\n", ret); ++ return false; ++ } ++ ++ dst->writes++; ++ ++ return true; ++} ++ +-- +2.39.2 +