diff --git a/Common/GPU/D3D11/thin3d_d3d11.cpp b/Common/GPU/D3D11/thin3d_d3d11.cpp
index 380daffb51f0..6ac9b4e593b4 100644
--- a/Common/GPU/D3D11/thin3d_d3d11.cpp
+++ b/Common/GPU/D3D11/thin3d_d3d11.cpp
@@ -281,6 +281,8 @@ D3D11DrawContext::D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *de
 	caps_.blendMinMaxSupported = true;
 	caps_.multiSampleLevelsMask = 1;   // More could be supported with some work.
 
+	caps_.provokingVertexLast = false;  // D3D has it first, unfortunately. (and no way to change it).
+
 	caps_.presentInstantModeChange = true;
 	caps_.presentMaxInterval = 4;
 	caps_.presentModesSupported = PresentMode::FIFO | PresentMode::IMMEDIATE;
diff --git a/Common/GPU/D3D9/thin3d_d3d9.cpp b/Common/GPU/D3D9/thin3d_d3d9.cpp
index 0e12092ee036..97f0919e163b 100644
--- a/Common/GPU/D3D9/thin3d_d3d9.cpp
+++ b/Common/GPU/D3D9/thin3d_d3d9.cpp
@@ -786,6 +786,8 @@ D3D9Context::D3D9Context(IDirect3D9 *d3d, IDirect3D9Ex *d3dEx, int adapterId, ID
 	caps_.presentMaxInterval = 1;
 	caps_.presentModesSupported = PresentMode::FIFO;
 
+	caps_.provokingVertexLast = false;  // D3D has it first, unfortunately (and no way to change it).
+
 	if ((caps.RasterCaps & D3DPRASTERCAPS_ANISOTROPY) != 0 && caps.MaxAnisotropy > 1) {
 		caps_.anisoSupported = true;
 	}
diff --git a/Common/GPU/OpenGL/thin3d_gl.cpp b/Common/GPU/OpenGL/thin3d_gl.cpp
index bd615d6b648b..e8ff1d4d9cee 100644
--- a/Common/GPU/OpenGL/thin3d_gl.cpp
+++ b/Common/GPU/OpenGL/thin3d_gl.cpp
@@ -598,6 +598,9 @@ OpenGLContext::OpenGLContext(bool canChangeSwapInterval) : renderManager_(frameT
 	// GLES has no support for logic framebuffer operations. There doesn't even seem to exist any such extensions.
 	caps_.logicOpSupported = !gl_extensions.IsGLES;
 
+	// Always the case in GL (which is what we want for PSP flat shade).
+	caps_.provokingVertexLast = true;
+
 	// Interesting potential hack for emulating GL_DEPTH_CLAMP (use a separate varying, force depth in fragment shader):
 	// This will induce a performance penalty on many architectures though so a blanket enable of this
 	// is probably not a good idea.
diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp
index d03ab9eaff80..9c9eec436760 100644
--- a/Common/GPU/Vulkan/thin3d_vulkan.cpp
+++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp
@@ -913,6 +913,9 @@ VKContext::VKContext(VulkanContext *vulkan, bool useRenderThread)
 	caps_.sampleRateShadingSupported = vulkan->GetDeviceFeatures().enabled.standard.sampleRateShading != 0;
 	caps_.textureSwizzleSupported = true;
 
+	// Note that it must also be enabled on the pipelines (which we do).
+	caps_.provokingVertexLast = vulkan->GetDeviceFeatures().enabled.provokingVertex.provokingVertexLast;
+
 	// Present mode stuff
 	caps_.presentMaxInterval = 1;
 	caps_.presentInstantModeChange = false;  // TODO: Fix this with some work in VulkanContext
diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h
index dc7ba23b24d4..b1a9f9448702 100644
--- a/Common/GPU/thin3d.h
+++ b/Common/GPU/thin3d.h
@@ -612,7 +612,7 @@ struct DeviceCaps {
 	bool setMaxFrameLatencySupported;
 	bool textureSwizzleSupported;
 	bool requiresHalfPixelOffset;
-
+	bool provokingVertexLast;  // GL behavior, what the PSP does
 	bool verySlowShaderCompiler;
 
 	// From the other backends, we can detect if D3D9 support is known bad (like on Xe) and disable it.
diff --git a/GPU/Common/IndexGenerator.cpp b/GPU/Common/IndexGenerator.cpp
index e3c042f4c312..482b78f569fb 100644
--- a/GPU/Common/IndexGenerator.cpp
+++ b/GPU/Common/IndexGenerator.cpp
@@ -86,13 +86,13 @@ void IndexGenerator::AddList(int numVerts, int indexOffset, bool clockwise) {
 
 alignas(16) static const u16 offsets_clockwise[24] = {
 	0, (u16)(0 + 1), (u16)(0 + 2),
-	1, (u16)(1 + 2), (u16)(1 + 1),
+	(u16)(1 + 1), 1, (u16)(1 + 2),
 	2, (u16)(2 + 1), (u16)(2 + 2),
-	3, (u16)(3 + 2), (u16)(3 + 1),
+	(u16)(3 + 1), 3, (u16)(3 + 2),
 	4, (u16)(4 + 1), (u16)(4 + 2),
-	5, (u16)(5 + 2), (u16)(5 + 1),
+	(u16)(5 + 1), 5, (u16)(5 + 2),
 	6, (u16)(6 + 1), (u16)(6 + 2),
-	7, (u16)(7 + 2), (u16)(7 + 1),
+	(u16)(7 + 1), 7, (u16)(7 + 2),
 };
 
 alignas(16) static const uint16_t offsets_counter_clockwise[24] = {
diff --git a/GPU/Common/SoftwareTransformCommon.cpp b/GPU/Common/SoftwareTransformCommon.cpp
index fd117de69a93..8d1624000c2b 100644
--- a/GPU/Common/SoftwareTransformCommon.cpp
+++ b/GPU/Common/SoftwareTransformCommon.cpp
@@ -125,33 +125,6 @@ static bool IsReallyAClear(const TransformedVertex *transformed, int numVerts, f
 	return true;
 }
 
-static int ColorIndexOffset(int prim, GEShadeMode shadeMode, bool clearMode) {
-	if (shadeMode != GE_SHADE_FLAT || clearMode) {
-		return 0;
-	}
-
-	switch (prim) {
-	case GE_PRIM_LINES:
-	case GE_PRIM_LINE_STRIP:
-		return 1;
-
-	case GE_PRIM_TRIANGLES:
-	case GE_PRIM_TRIANGLE_STRIP:
-		return 2;
-
-	case GE_PRIM_TRIANGLE_FAN:
-		return 1;
-
-	case GE_PRIM_RECTANGLES:
-		// We already use BR color when expanding, so no need to offset.
-		return 0;
-
-	default:
-		break;
-	}
-	return 0;
-}
-
 void SoftwareTransform::SetProjMatrix(const float mtx[14], bool invertedX, bool invertedY, const Lin::Vec3 &trans, const Lin::Vec3 &scale) {
 	memcpy(&projMatrix_.m, mtx, 16 * sizeof(float));
 
@@ -202,11 +175,6 @@ void SoftwareTransform::Transform(int prim, u32 vertType, const DecVtxFormat &de
 		fog_slope = std::signbit(fog_slope) ? -65535.0f : 65535.0f;
 	}
 
-	int provokeIndOffset = 0;
-	if (params_.provokeFlatFirst) {
-		provokeIndOffset = ColorIndexOffset(prim, gstate.getShadeMode(), gstate.isModeClear());
-	}
-
 	VertexReader reader(decoded, decVtxFormat, vertType);
 	if (throughmode) {
 		const u32 materialAmbientRGBA = gstate.getMaterialAmbientRGBA();
@@ -221,13 +189,7 @@ void SoftwareTransform::Transform(int prim, u32 vertType, const DecVtxFormat &de
 			vert.pos_w = 1.0f;
 
 			if (hasColor) {
-				if (provokeIndOffset != 0 && index + provokeIndOffset < numDecodedVerts) {
-					reader.Goto(index + provokeIndOffset);
-					vert.color0_32 = reader.ReadColor0_8888();
-					reader.Goto(index);
-				} else {
-					vert.color0_32 = reader.ReadColor0_8888();
-				}
+				vert.color0_32 = reader.ReadColor0_8888();
 			} else {
 				vert.color0_32 = materialAmbientRGBA;
 			}
@@ -268,10 +230,7 @@ void SoftwareTransform::Transform(int prim, u32 vertType, const DecVtxFormat &de
 			if (reader.hasUV())
 				reader.ReadUV(ruv);
 
-			// Read all the provoking vertex values here.
 			Vec4f unlitColor;
-			if (provokeIndOffset != 0 && index + provokeIndOffset < numDecodedVerts)
-				reader.Goto(index + provokeIndOffset);
 			if (reader.hasColor0())
 				reader.ReadColor0(unlitColor.AsArray());
 			else
@@ -342,34 +301,14 @@ void SoftwareTransform::Transform(int prim, u32 vertType, const DecVtxFormat &de
 						break;
 
 					case GE_PROJMAP_NORMALIZED_NORMAL: // Use normalized normal as source
-						// Flat uses the vertex normal, not provoking.
-						if (provokeIndOffset == 0) {
-							source = normal.Normalized(cpu_info.bSSE4_1);
-						} else {
-							reader.Goto(index);
-							if (reader.hasNormal())
-								reader.ReadNrm(source.AsArray());
-							if (gstate.areNormalsReversed())
-								source = -source;
-							source.Normalize();
-						}
+						source = normal.Normalized(cpu_info.bSSE4_1);
 						if (!reader.hasNormal()) {
 							ERROR_LOG_REPORT(Log::G3D, "Normal projection mapping without normal?");
 						}
 						break;
 
 					case GE_PROJMAP_NORMAL: // Use non-normalized normal as source!
-						// Flat uses the vertex normal, not provoking.
-						if (provokeIndOffset == 0) {
-							source = normal;
-						} else {
-							// Need to read the normal for this vertex and weight it again..
-							reader.Goto(index);
-							if (reader.hasNormal())
-								reader.ReadNrm(source.AsArray());
-							if (gstate.areNormalsReversed())
-								source = -source;
-						}
+						source = normal;
 						if (!reader.hasNormal()) {
 							ERROR_LOG_REPORT(Log::G3D, "Normal projection mapping without normal?");
 						}
@@ -497,13 +436,11 @@ void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertTy
 
 	if (prim == GE_PRIM_RECTANGLES) {
 		if (!ExpandRectangles(vertexCount, numDecodedVerts, vertsSize, inds, indsSize, transformed, transformedExpanded, numTrans, throughmode, &result->pixelMapped)) {
-			result->drawIndexed = false;
 			result->drawNumTrans = 0;
 			result->pixelMapped = false;
 			return;
 		}
 		result->drawBuffer = transformedExpanded;
-		result->drawIndexed = true;
 
 		// We don't know the color until here, so we have to do it now, instead of in StateMapping.
 		// Might want to reconsider the order of things later...
@@ -521,25 +458,20 @@ void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertTy
 	} else if (prim == GE_PRIM_POINTS) {
 		result->pixelMapped = false;
 		if (!ExpandPoints(vertexCount, numDecodedVerts, vertsSize, inds, indsSize, transformed, transformedExpanded, numTrans, throughmode)) {
-			result->drawIndexed = false;
 			result->drawNumTrans = 0;
 			return;
 		}
 		result->drawBuffer = transformedExpanded;
-		result->drawIndexed = true;
 	} else if (prim == GE_PRIM_LINES) {
 		result->pixelMapped = false;
 		if (!ExpandLines(vertexCount, numDecodedVerts, vertsSize, inds, indsSize, transformed, transformedExpanded, numTrans, throughmode)) {
-			result->drawIndexed = false;
 			result->drawNumTrans = 0;
 			return;
 		}
 		result->drawBuffer = transformedExpanded;
-		result->drawIndexed = true;
 	} else {
 		// We can simply draw the unexpanded buffer.
 		numTrans = vertexCount;
-		result->drawIndexed = true;
 		result->pixelMapped = false;
 
 		// If we don't support custom cull in the shader, process it here.
@@ -635,7 +567,7 @@ void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertTy
 		gpuStats.numClears++;
 	}
 
-	result->action = SW_DRAW_PRIMITIVES;
+	result->action = SW_DRAW_INDEXED;
 	result->drawNumTrans = numTrans;
 }
 
@@ -758,6 +690,38 @@ bool SoftwareTransform::ExpandRectangles(int vertexCount, int &numDecodedVerts,
 	return true;
 }
 
+// In-place. So, better not be doing this on GPU memory!
+void IndexBufferProvokingLastToFirst(int prim, u16 *inds, int indsSize) {
+	switch (prim) {
+	case GE_PRIM_LINES:
+		// Swap every two indices.
+		for (int i = 0; i < indsSize - 1; i += 2) {
+			u16 temp = inds[i];
+			inds[i] = inds[i + 1];
+			inds[i + 1] = temp;
+		}
+		break;
+	case GE_PRIM_TRIANGLES:
+		// Rotate the triangle so the last becomes the first, without changing the winding order.
+		// This could be done with a series of pshufb.
+		for (int i = 0; i < indsSize - 2; i += 3) {
+			u16 temp = inds[i + 2];
+			inds[i + 2] = inds[i + 1];
+			inds[i + 1] = inds[i];
+			inds[i] = temp;
+		}
+		break;
+	case GE_PRIM_POINTS:
+		// Nothing to do,
+		break;
+	case GE_PRIM_RECTANGLES:
+		// Nothing to do, already using the 2nd vertex.
+		break;
+	default:
+		_dbg_assert_msg_(false, "IndexBufferProvokingFirstToLast: Only works with plain indexed primitives, no strips or fans")
+	}
+}
+
 bool SoftwareTransform::ExpandLines(int vertexCount, int &numDecodedVerts, int vertsSize, u16 *&inds, int indsSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
 	// Before we start, do a sanity check - does the output fit?
 	if ((vertexCount / 2) * 6 > indsSize) {
diff --git a/GPU/Common/SoftwareTransformCommon.h b/GPU/Common/SoftwareTransformCommon.h
index 78717cdfa3ae..457fd6a253fc 100644
--- a/GPU/Common/SoftwareTransformCommon.h
+++ b/GPU/Common/SoftwareTransformCommon.h
@@ -26,7 +26,7 @@ class TextureCacheCommon;
 
 enum SoftwareTransformAction {
 	SW_NOT_READY,
-	SW_DRAW_PRIMITIVES,
+	SW_DRAW_INDEXED,
 	SW_CLEAR,
 };
 
@@ -44,8 +44,6 @@ struct SoftwareTransformResult {
 
 	TransformedVertex *drawBuffer;
 	int drawNumTrans;
-	bool drawIndexed;
-
 	bool pixelMapped;
 };
 
@@ -57,11 +55,15 @@ struct SoftwareTransformParams {
 	TextureCacheCommon *texCache;
 	bool allowClear;
 	bool allowSeparateAlphaClear;
-	bool provokeFlatFirst;
 	bool flippedY;
 	bool usesHalfZ;
 };
 
+// Converts an index buffer to make the provoking vertex the last.
+// In-place. So, better not be doing this on GPU memory!
+// TODO: We could do this already during index decode.
+void IndexBufferProvokingLastToFirst(int prim, u16 *inds, int indsSize);
+
 class SoftwareTransform {
 public:
 	SoftwareTransform(SoftwareTransformParams &params) : params_(params) {}
diff --git a/GPU/D3D11/D3D11Util.cpp b/GPU/D3D11/D3D11Util.cpp
index b136ca57bc6a..f84b188d56b2 100644
--- a/GPU/D3D11/D3D11Util.cpp
+++ b/GPU/D3D11/D3D11Util.cpp
@@ -36,7 +36,9 @@ std::vector<uint8_t> CompileShaderToBytecodeD3D11(const char *code, size_t codeS
 				if (trimmed.find("pow(f, e) will not work for negative f") != std::string::npos) {
 					continue;
 				}
-				WARN_LOG(Log::G3D, "%.*s", (int)trimmed.length(), trimmed.data());
+				if (trimmed.size() > 1) {  // ignore single nulls, not sure how they appear.
+					WARN_LOG(Log::G3D, "%.*s", (int)trimmed.length(), trimmed.data());
+				}
 			}
 		} else {
 			ERROR_LOG(Log::G3D, "%s: %s\n\n%s", "errors", errors.c_str(), numberedCode.c_str());
diff --git a/GPU/D3D11/DrawEngineD3D11.cpp b/GPU/D3D11/DrawEngineD3D11.cpp
index f415745d7021..db0440c6c48b 100644
--- a/GPU/D3D11/DrawEngineD3D11.cpp
+++ b/GPU/D3D11/DrawEngineD3D11.cpp
@@ -382,10 +382,15 @@ void DrawEngineD3D11::DoFlush() {
 		params.texCache = textureCache_;
 		params.allowClear = true;
 		params.allowSeparateAlphaClear = false;  // D3D11 doesn't support separate alpha clears
-		params.provokeFlatFirst = true;
 		params.flippedY = false;
 		params.usesHalfZ = true;
 
+		if (gstate.getShadeMode() == GE_SHADE_FLAT) {
+			// We need to rotate the index buffer to simulate a different provoking vertex.
+			// We do this before line expansion etc.
+			IndexBufferProvokingLastToFirst(prim, inds, vertexCount);
+		}
+
 		// We need correct viewport values in gstate_c already.
 		if (gstate_c.IsDirty(DIRTY_VIEWPORTSCISSOR_STATE)) {
 			ViewportAndScissor vpAndScissor;
@@ -424,7 +429,7 @@ void DrawEngineD3D11::DoFlush() {
 
 		ApplyDrawStateLate(result.setStencil, result.stencilValue);
 
-		if (result.action == SW_DRAW_PRIMITIVES) {
+		if (result.action == SW_DRAW_INDEXED) {
 			D3D11VertexShader *vshader;
 			D3D11FragmentShader *fshader;
 			shaderManager_->GetShaders(prim, dec_, &vshader, &fshader, pipelineState_, false, false, decOptions_.expandAllWeightsToFloat, true);
@@ -452,17 +457,13 @@ void DrawEngineD3D11::DoFlush() {
 			pushVerts_->EndPush(context_);
 			ID3D11Buffer *buf = pushVerts_->Buf();
 			context_->IASetVertexBuffers(0, 1, &buf, &stride, &vOffset);
-			if (result.drawIndexed) {
-				UINT iOffset;
-				int iSize = sizeof(uint16_t) * result.drawNumTrans;
-				uint8_t *iptr = pushInds_->BeginPush(context_, &iOffset, iSize);
-				memcpy(iptr, inds, iSize);
-				pushInds_->EndPush(context_);
-				context_->IASetIndexBuffer(pushInds_->Buf(), DXGI_FORMAT_R16_UINT, iOffset);
-				context_->DrawIndexed(result.drawNumTrans, 0, 0);
-			} else {
-				context_->Draw(result.drawNumTrans, 0);
-			}
+			UINT iOffset;
+			int iSize = sizeof(uint16_t) * result.drawNumTrans;
+			uint8_t *iptr = pushInds_->BeginPush(context_, &iOffset, iSize);
+			memcpy(iptr, inds, iSize);
+			pushInds_->EndPush(context_);
+			context_->IASetIndexBuffer(pushInds_->Buf(), DXGI_FORMAT_R16_UINT, iOffset);
+			context_->DrawIndexed(result.drawNumTrans, 0, 0);
 		} else if (result.action == SW_CLEAR) {
 			u32 clearColor = result.color;
 			float clearDepth = result.depth;
diff --git a/GPU/Directx9/DrawEngineDX9.cpp b/GPU/Directx9/DrawEngineDX9.cpp
index 4d16fd6701fe..ce20f88859e3 100644
--- a/GPU/Directx9/DrawEngineDX9.cpp
+++ b/GPU/Directx9/DrawEngineDX9.cpp
@@ -334,10 +334,16 @@ void DrawEngineDX9::DoFlush() {
 		params.texCache = textureCache_;
 		params.allowClear = true;
 		params.allowSeparateAlphaClear = false;
-		params.provokeFlatFirst = true;
 		params.flippedY = false;
 		params.usesHalfZ = true;
 
+		if (gstate.getShadeMode() == GE_SHADE_FLAT) {
+			// We need to rotate the index buffer to simulate a different provoking vertex.
+			// We do this before line expansion etc.
+			int indexCount = RemainingIndices(inds);
+			IndexBufferProvokingLastToFirst(prim, inds, vertexCount);
+		}
+
 		// We need correct viewport values in gstate_c already.
 		if (gstate_c.IsDirty(DIRTY_VIEWPORTSCISSOR_STATE)) {
 			ViewportAndScissor vpAndScissor;
@@ -382,7 +388,7 @@ void DrawEngineDX9::DoFlush() {
 
 		VSShader *vshader = shaderManager_->ApplyShader(false, false, dec_, decOptions_.expandAllWeightsToFloat, true, pipelineState_);
 
-		if (result.action == SW_DRAW_PRIMITIVES) {
+		if (result.action == SW_DRAW_INDEXED) {
 			if (result.setStencil) {
 				dxstate.stencilFunc.set(D3DCMP_ALWAYS);
 				dxstate.stencilRef.set(result.stencilValue);
@@ -393,11 +399,7 @@ void DrawEngineDX9::DoFlush() {
 			// Might help for text drawing.
 
 			device_->SetVertexDeclaration(transformedVertexDecl_);
-			if (result.drawIndexed) {
-				device_->DrawIndexedPrimitiveUP(d3d_prim[prim], 0, numDecodedVerts_, D3DPrimCount(d3d_prim[prim], result.drawNumTrans), inds, D3DFMT_INDEX16, result.drawBuffer, sizeof(TransformedVertex));
-			} else {
-				device_->DrawPrimitiveUP(d3d_prim[prim], D3DPrimCount(d3d_prim[prim], result.drawNumTrans), result.drawBuffer, sizeof(TransformedVertex));
-			}
+			device_->DrawIndexedPrimitiveUP(d3d_prim[prim], 0, numDecodedVerts_, D3DPrimCount(d3d_prim[prim], result.drawNumTrans), inds, D3DFMT_INDEX16, result.drawBuffer, sizeof(TransformedVertex));
 		} else if (result.action == SW_CLEAR) {
 			u32 clearColor = result.color;
 			float clearDepth = result.depth;
diff --git a/GPU/GLES/DrawEngineGLES.cpp b/GPU/GLES/DrawEngineGLES.cpp
index d6da9070de98..e2161572d3b6 100644
--- a/GPU/GLES/DrawEngineGLES.cpp
+++ b/GPU/GLES/DrawEngineGLES.cpp
@@ -356,7 +356,6 @@ void DrawEngineGLES::DoFlush() {
 		params.texCache = textureCache_;
 		params.allowClear = true;  // Clear in OpenGL respects scissor rects, so we'll use it.
 		params.allowSeparateAlphaClear = true;
-		params.provokeFlatFirst = false;
 		params.flippedY = framebufferManager_->UseBufferedRendering();
 		params.usesHalfZ = false;
 
@@ -415,18 +414,12 @@ void DrawEngineGLES::DoFlush() {
 			goto bail;
 		}
 
-		if (result.action == SW_DRAW_PRIMITIVES) {
-			if (result.drawIndexed) {
-				vertexBufferOffset = (uint32_t)frameData.pushVertex->Push(result.drawBuffer, numDecodedVerts_ * sizeof(TransformedVertex), 4, &vertexBuffer);
-				indexBufferOffset = (uint32_t)frameData.pushIndex->Push(inds, sizeof(uint16_t) * result.drawNumTrans, 2, &indexBuffer);
-				render_->DrawIndexed(
-					softwareInputLayout_, vertexBuffer, vertexBufferOffset, indexBuffer, indexBufferOffset,
-					glprim[prim], result.drawNumTrans, GL_UNSIGNED_SHORT);
-			} else {
-				vertexBufferOffset = (uint32_t)frameData.pushVertex->Push(result.drawBuffer, result.drawNumTrans * sizeof(TransformedVertex), 4, &vertexBuffer);
-				render_->Draw(
-					softwareInputLayout_, vertexBuffer, vertexBufferOffset, glprim[prim], 0, result.drawNumTrans);
-			}
+		if (result.action == SW_DRAW_INDEXED) {
+			vertexBufferOffset = (uint32_t)frameData.pushVertex->Push(result.drawBuffer, numDecodedVerts_ * sizeof(TransformedVertex), 4, &vertexBuffer);
+			indexBufferOffset = (uint32_t)frameData.pushIndex->Push(inds, sizeof(uint16_t) * result.drawNumTrans, 2, &indexBuffer);
+			render_->DrawIndexed(
+				softwareInputLayout_, vertexBuffer, vertexBufferOffset, indexBuffer, indexBufferOffset,
+				glprim[prim], result.drawNumTrans, GL_UNSIGNED_SHORT);
 		} else if (result.action == SW_CLEAR) {
 			u32 clearColor = result.color;
 			float clearDepth = result.depth;
diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp
index c8d0250888cf..b03a8556a01b 100644
--- a/GPU/Vulkan/DrawEngineVulkan.cpp
+++ b/GPU/Vulkan/DrawEngineVulkan.cpp
@@ -391,6 +391,8 @@ void DrawEngineVulkan::DoFlush() {
 		gpuStats.numUncachedVertsDrawn += vertexCount;
 		prim = IndexGenerator::GeneralPrim((GEPrimitiveType)drawInds_[0].prim);
 
+		// At this point, the output is always an index triangle/line/point list, no strips/fans.
+
 		u16 *inds = decIndex_;
 		SoftwareTransformResult result{};
 		SoftwareTransformParams params{};
@@ -403,10 +405,13 @@ void DrawEngineVulkan::DoFlush() {
 		// do not respect scissor rects.
 		params.allowClear = framebufferManager_->UseBufferedRendering();
 		params.allowSeparateAlphaClear = false;
-		params.provokeFlatFirst = true;
-		if (renderManager->GetVulkanContext()->GetDeviceFeatures().enabled.provokingVertex.provokingVertexLast) {
-			// We can get the OpenGL behavior, no need for workarounds.
-			params.provokeFlatFirst = false;
+
+		if (gstate.getShadeMode() == GE_SHADE_FLAT) {
+			if (!renderManager->GetVulkanContext()->GetDeviceFeatures().enabled.provokingVertex.provokingVertexLast) {
+				// If we can't have the hardware do it, we need to rotate the index buffer to simulate a different provoking vertex.
+				// We do this before line expansion etc.
+				IndexBufferProvokingLastToFirst(prim, inds, vertexCount);
+			}
 		}
 		params.flippedY = true;
 		params.usesHalfZ = true;
@@ -445,7 +450,7 @@ void DrawEngineVulkan::DoFlush() {
 
 		// Only here, where we know whether to clear or to draw primitives, should we actually set the current framebuffer! Because that gives use the opportunity
 		// to use a "pre-clear" render pass, for high efficiency on tilers.
-		if (result.action == SW_DRAW_PRIMITIVES) {
+		if (result.action == SW_DRAW_INDEXED) {
 			if (textureNeedsApply) {
 				gstate_c.pixelMapped = result.pixelMapped;
 				textureCache_->ApplyTexture();
@@ -522,16 +527,10 @@ void DrawEngineVulkan::DoFlush() {
 
 			PROFILE_THIS_SCOPE("renderman_q");
 
-			if (result.drawIndexed) {
-				VkBuffer vbuf, ibuf;
-				vbOffset = (uint32_t)pushVertex_->Push(result.drawBuffer, numDecodedVerts_ * sizeof(TransformedVertex), 4, &vbuf);
-				ibOffset = (uint32_t)pushIndex_->Push(inds, sizeof(short) * result.drawNumTrans, 4, &ibuf);
-				renderManager->DrawIndexed(descSetIndex, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, ibuf, ibOffset, result.drawNumTrans, 1);
-			} else {
-				VkBuffer vbuf;
-				vbOffset = (uint32_t)pushVertex_->Push(result.drawBuffer, result.drawNumTrans * sizeof(TransformedVertex), 4, &vbuf);
-				renderManager->Draw(descSetIndex, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, result.drawNumTrans);
-			}
+			VkBuffer vbuf, ibuf;
+			vbOffset = (uint32_t)pushVertex_->Push(result.drawBuffer, numDecodedVerts_ * sizeof(TransformedVertex), 4, &vbuf);
+			ibOffset = (uint32_t)pushIndex_->Push(inds, sizeof(short) * result.drawNumTrans, 4, &ibuf);
+			renderManager->DrawIndexed(descSetIndex, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, ibuf, ibOffset, result.drawNumTrans, 1);
 		} else if (result.action == SW_CLEAR) {
 			// Note: we won't get here if the clear is alpha but not color, or color but not alpha.
 			bool clearColor = gstate.isClearModeColorMask();