RTech: cleanup ZStd pak decoder and fix bugs

* Pak_InitDecoder() now takes the output buf and mask as parameters. * Pak_InitDecoder() checks if provided masks are a power of 2 (required). * Pak_ComputeRingBufferFrame() now uses the bit mask instead of modulo to determine # bytes used. * Fixed a bug where PakDecoder_t::bufferSizeNeeded could be bigger than the file stream, causing a deadlock.
2025-02-09 19:15:03 +01:00 · 2024-01-28 01:57:11 +01:00 · 2024-01-28 01:57:11 +01:00 · 1c2b02b972
commit 1c2b02b972
parent 5ea886ab65
5 changed files with 293 additions and 276 deletions
--- a/r5dev/common/callback.cpp
+++ b/r5dev/common/callback.cpp
@ -563,7 +563,7 @@ void Pak_Decompress_f(const CCommand& args)
 	const bool usesCustomCompression = pHeader->flags & PAK_HEADER_FLAGS_ZSTREAM;

 	PakDecoder_t decoder{};
-	const uint64_t nDecompSize = Pak_InitDecoder(&decoder, pPakBuf, UINT64_MAX, nFileSize, NULL, sizeof(PakFileHeader_t), usesCustomCompression);
+	const uint64_t nDecompSize = Pak_InitDecoder(&decoder, pPakBuf, nullptr, UINT64_MAX, UINT64_MAX, nFileSize, NULL, sizeof(PakFileHeader_t), usesCustomCompression);

 	if (nDecompSize != pHeader->decompressedSize)
 	{
--- a/r5dev/public/rtech/ipakfile.h
+++ b/r5dev/public/rtech/ipakfile.h
@ -363,7 +363,7 @@ struct PakDecoder_t
 	uint64_t outputInvMask;

 	uint32_t headerOffset;
-	uint32_t dword44;
+	uint32_t padding; // unused data, available for other stuff

 	uint64_t inBufBytePos;
 	uint64_t outBufBytePos;
@ -377,7 +377,11 @@ struct PakDecoder_t
 	uint32_t dword6C;
 	uint64_t qword70;

-	size_t compressedStreamSize;
+	union
+	{
+		size_t compressedStreamSize;
+		size_t frameHeaderSize;
+	};

 	union
 	{
@ -386,6 +390,12 @@ struct PakDecoder_t
 	};
 };

+struct PakRingBufferFrame_t
+{
+	size_t bufIndex;
+	size_t frameLen;
+};
+
 struct PakFile_t;

 class PakLoadedInfo_t
--- a/r5dev/rtech/pak/pakdecode.cpp
+++ b/r5dev/rtech/pak/pakdecode.cpp
@ -130,16 +130,122 @@ static const unsigned char /*141313180*/ s_defaultDecoderLUT[] =
 	0x4C, 0x39, 0x56, 0x75, 0x42, 0x52, 0x65, 0x75, 0x70, 0x35, 0x31, 0x77, 0x4C, 0x51, 0x64, 0x61,
 };

+//-----------------------------------------------------------------------------
+// checks if we have enough output buffer room to decode the data stream
+//-----------------------------------------------------------------------------
+bool Pak_HasEnoughDecodeBufferAvailable(PakDecoder_t* const decoder, const size_t outLen)
+{
+	const uint64_t bytesWritten = (decoder->outBufBytePos & ~decoder->outputInvMask);
+	return (outLen >= decoder->outputInvMask + (bytesWritten + 1) || outLen >= decoder->decompSize);
+}
+
+//-----------------------------------------------------------------------------
+// checks if we have enough source data streamed to decode the next block
+//-----------------------------------------------------------------------------
+bool Pak_HasEnoughStreamedDataForDecode(PakDecoder_t* const decoder, const size_t inLen)
+{
+	// the decoder needs at least this many bytes to decode the current block
+	// buffer contiguously
+	return (inLen >= decoder->bufferSizeNeeded);
+}
+
+//-----------------------------------------------------------------------------
+// gets the frame for the data in the ring buffer, the frame returned is always
+// ending to the end of the ring buffer, or the end of the data itself
+//-----------------------------------------------------------------------------
+PakRingBufferFrame_t Pak_ComputeRingBufferFrame(const uint64_t bufMask, const size_t seekPos, const size_t dataLen)
+{
+	PakRingBufferFrame_t ring;
+	ring.bufIndex = seekPos & bufMask;
+
+	// the total amount of bytes used and available in this frame
+	const size_t bytesUsed = ring.bufIndex & bufMask;
+	const size_t totalAvail = bufMask+1 - bytesUsed;
+
+	// the last part of the data might be smaller than the remainder of the ring
+	// buffer; clamp it
+	ring.frameLen = Min(dataLen - seekPos, totalAvail);
+	return ring;
+}
+
+//-----------------------------------------------------------------------------
+// initializes the RTech decoder
+//-----------------------------------------------------------------------------
+size_t Pak_RStreamDecoderInit(PakDecoder_t* const decoder, const uint8_t* const fileBuffer,
+	const uint64_t inputMask, const size_t dataSize, const size_t dataOffset, const size_t headerSize)
+{
+	uint64_t frameHeader = *(_QWORD*)((inputMask & (dataOffset + headerSize)) + fileBuffer);
+	const int decompressedSizeBits = frameHeader & 0x3F;
+
+	frameHeader >>= 6;
+	decoder->decompSize = (1i64 << decompressedSizeBits) | frameHeader & ((1i64 << decompressedSizeBits) - 1);
+
+	const uint64_t bytePos = dataOffset + headerSize + 8;
+	const int64_t currByteLow = *(_QWORD*)((inputMask & bytePos) + fileBuffer) << (64 - ((unsigned __int8)decompressedSizeBits + 6));
+
+	decoder->inBufBytePos = bytePos + ((unsigned __int64)(unsigned int)(decompressedSizeBits + 6) >> 3);
+	const uint32_t bitPosFinal = ((decompressedSizeBits + 6) & 7) + 13;
+
+	const uint64_t currByte = (0xFFFFFFFFFFFFFFFFui64 >> ((decompressedSizeBits + 6) & 7)) & ((frameHeader >> decompressedSizeBits) | currByteLow);
+	const uint32_t currbits = (((_BYTE)currByte - 1) & 0x3F) + 1;
+
+	const uint64_t invMaskIn = 0xFFFFFFFFFFFFFFFFui64 >> (64 - (unsigned __int8)currbits);
+	decoder->inputInvMask = invMaskIn;
+
+	const uint64_t invMaskOut = 0xFFFFFFFFFFFFFFFFui64 >> (64 - ((((currByte >> 6) - 1) & 0x3F) + 1));
+	decoder->outputInvMask = invMaskOut;
+
+	const uint64_t finalByteFull = (currByte >> 13) | (*(_QWORD*)((inputMask & decoder->inBufBytePos) + fileBuffer) << (64 - (unsigned __int8)bitPosFinal));
+	const uint32_t finalBitOffset = bitPosFinal & 7;
+
+	decoder->inBufBytePos += bitPosFinal >> 3;
+	const uint64_t finalByte = (0xFFFFFFFFFFFFFFFFui64 >> finalBitOffset) & finalByteFull;
+
+	if (decoder->inputInvMask == 0xFFFFFFFFFFFFFFFFui64)
+	{
+		decoder->headerOffset = 0;
+		decoder->bufferSizeNeeded = dataSize;
+	}
+	else
+	{
+		const uint64_t finalPos = inputMask & decoder->inBufBytePos;
+		decoder->headerOffset = (currbits >> 3) + 1;
+		decoder->inBufBytePos += (currbits >> 3) + 1;
+		decoder->bufferSizeNeeded = *(_QWORD*)(finalPos + fileBuffer) & ((1i64 << (8 * ((unsigned __int8)(currbits >> 3) + 1))) - 1);;
+	}
+
+	decoder->bufferSizeNeeded += dataOffset;
+	decoder->currentByte = finalByte;
+	decoder->currentBit = finalBitOffset;
+	decoder->qword70 = decoder->inputInvMask + dataOffset - 6;
+	decoder->dword6C = 0;
+	decoder->compressedStreamSize = decoder->bufferSizeNeeded;
+	decoder->decompressedStreamSize = decoder->decompSize;
+
+	if ((((unsigned __int8)(currByte >> 6) - 1) & 0x3F) != -1i64 && decoder->decompSize - 1 > decoder->outputInvMask)
+	{
+		const uint64_t streamCompressedSize = decoder->bufferSizeNeeded - decoder->headerOffset;
+		decoder->compressedStreamSize = streamCompressedSize;
+
+		decoder->decompressedStreamSize = decoder->outputInvMask + 1;
+	}
+
+	return decoder->decompSize;
+}
+
+//-----------------------------------------------------------------------------
+// decodes the RTech data stream up to available buffer or data
+//-----------------------------------------------------------------------------
 bool Pak_RStreamDecode(PakDecoder_t* const decoder, const size_t inLen, const size_t outLen)
 {
 	bool result; // al
-	uint64_t m_decompBytePosition; // r15
-	uint8_t* m_outputBuf; // r11
-	uint32_t m_currentBit; // ebp
-	uint64_t m_currentByte; // rsi
-	uint64_t m_fileBytePosition; // rdi
+	uint64_t outBufBytePos; // r15
+	uint8_t* outputBuf; // r11
+	uint32_t currentBit; // ebp
+	uint64_t currentByte; // rsi
+	uint64_t inBufBytePos; // rdi
 	size_t qword70; // r12
-	const uint8_t* m_inputBuf; // r13
+	const uint8_t* inputBuf; // r13
 	uint32_t dword6C; // ecx
 	uint64_t v13; // rsi
 	unsigned __int64 i; // rax
@ -152,15 +258,15 @@ bool Pak_RStreamDecode(PakDecoder_t* const decoder, const size_t inLen, const si
 	int v21; // ecx
 	unsigned __int64 v22; // r11
 	int v23; // edx
-	uint64_t m_outputMask; // rax
+	uint64_t outputMask; // rax
 	int v25; // r8d
 	unsigned int v26; // r13d
 	uint64_t v27; // r10
 	uint8_t* v28; // rax
 	uint8_t* v29; // r10
-	size_t m_decompSize; // r9
-	uint64_t m_inputInvMask; // r10
-	uint64_t m_headerOffset; // r8
+	size_t decompSize; // r9
+	uint64_t inputInvMask; // r10
+	uint64_t headerOffset; // r8
 	uint64_t v33; // rax
 	uint64_t v34; // rax
 	uint64_t v35; // rax
@ -202,49 +308,49 @@ bool Pak_RStreamDecode(PakDecoder_t* const decoder, const size_t inLen, const si
 	uint32_t v71; // [rsp+60h] [rbp+8h]
 	const uint8_t* v74; // [rsp+78h] [rbp+20h]

-	m_decompBytePosition = decoder->outBufBytePos;
+	outBufBytePos = decoder->outBufBytePos;

-	m_outputBuf = decoder->outputBuf;
-	m_currentBit = decoder->currentBit;
-	m_currentByte = decoder->currentByte;
-	m_fileBytePosition = decoder->inBufBytePos;
+	outputBuf = decoder->outputBuf;
+	currentBit = decoder->currentBit;
+	currentByte = decoder->currentByte;
+	inBufBytePos = decoder->inBufBytePos;
 	qword70 = decoder->qword70;
-	m_inputBuf = decoder->inputBuf;
+	inputBuf = decoder->inputBuf;

 	if (decoder->compressedStreamSize < qword70)
 		qword70 = decoder->compressedStreamSize;

 	dword6C = decoder->dword6C;
-	v74 = m_inputBuf;
-	v70 = m_outputBuf;
+	v74 = inputBuf;
+	v70 = outputBuf;
 	v71 = dword6C;
-	if (!m_currentBit)
+	if (!currentBit)
 		goto LABEL_11;

-	v13 = (*(_QWORD*)&m_inputBuf[m_fileBytePosition & decoder->inputMask] << (64 - (unsigned __int8)m_currentBit)) | m_currentByte;
-	for (i = m_currentBit; ; i = m_currentBit)
+	v13 = (*(_QWORD*)&inputBuf[inBufBytePos & decoder->inputMask] << (64 - (unsigned __int8)currentBit)) | currentByte;
+	for (i = currentBit; ; i = currentBit)
 	{
-		m_currentBit &= 7u;
-		m_fileBytePosition += i >> 3;
+		currentBit &= 7u;
+		inBufBytePos += i >> 3;
 		dword6C = v71;
-		m_currentByte = (0xFFFFFFFFFFFFFFFFui64 >> m_currentBit) & v13;
+		currentByte = (0xFFFFFFFFFFFFFFFFui64 >> currentBit) & v13;
 	LABEL_11:
 		v15 = (unsigned __int64)dword6C << 8;
 		v16 = dword6C;
-		v17 = s_defaultDecoderLUT[(unsigned __int8)m_currentByte + 512 + v15];
-		v18 = (unsigned __int8)m_currentByte + v15;
-		m_currentBit += v17;
-		v19 = m_currentByte >> v17;
+		v17 = s_defaultDecoderLUT[(unsigned __int8)currentByte + 512 + v15];
+		v18 = (unsigned __int8)currentByte + v15;
+		currentBit += v17;
+		v19 = currentByte >> v17;
 		v20 = (unsigned int)(char)s_defaultDecoderLUT[v18];
 		if ((s_defaultDecoderLUT[v18] & 0x80u) != 0)
 		{
 			v56 = -(int)v20;
-			v57 = &m_inputBuf[m_fileBytePosition & decoder->inputMask];
+			v57 = &inputBuf[inBufBytePos & decoder->inputMask];
 			v71 = 1;
-			v58 = &m_outputBuf[m_decompBytePosition & decoder->outputMask];
+			v58 = &outputBuf[outBufBytePos & decoder->outputMask];
 			if (v56 == s_defaultDecoderLUT[v16 + 1248])
 			{
-				if ((~m_fileBytePosition & decoder->inputInvMask) < 0xF || (decoder->outputInvMask & ~m_decompBytePosition) < 0xF || decoder->decompSize - m_decompBytePosition < 0x10)
+				if ((~inBufBytePos & decoder->inputInvMask) < 0xF || (decoder->outputInvMask & ~outBufBytePos) < 0xF || decoder->decompSize - outBufBytePos < 0x10)
 					v56 = 1;
 				v59 = v19;
 				v60 = v19 >> 3;
@ -259,11 +365,11 @@ bool Pak_RStreamDecode(PakDecoder_t* const decoder, const size_t inLen, const si
 				{
 					v62 = v60 >> 4;
 					v65 = v60 & 0xF;
-					m_currentBit += 4;
+					currentBit += 4;
 					v63 = *(_DWORD*)&s_defaultDecoderLUT[4 * v65 + 1152];
 					v64 = s_defaultDecoderLUT[v65 + 1216];
 				}
-				m_currentBit += v64 + 3;
+				currentBit += v64 + 3;
 				v19 = v62 >> v64;
 				v66 = v63 + (v62 & ((1 << v64) - 1)) + v56;
 				for (j = v66 >> 3; j; --j)
@ -287,15 +393,15 @@ bool Pak_RStreamDecode(PakDecoder_t* const decoder, const size_t inLen, const si
 				}
 				if ((v66 & 1) != 0)
 					*v58 = *v57;
-				m_fileBytePosition += v66;
-				m_decompBytePosition += v66;
+				inBufBytePos += v66;
+				outBufBytePos += v66;
 			}
 			else
 			{
 				*(_QWORD*)v58 = *(_QWORD*)v57;
 				*((_QWORD*)v58 + 1) = *((_QWORD*)v57 + 1);
-				m_fileBytePosition += v56;
-				m_decompBytePosition += v56;
+				inBufBytePos += v56;
+				outBufBytePos += v56;
 			}
 		}
 		else
@ -304,13 +410,13 @@ bool Pak_RStreamDecode(PakDecoder_t* const decoder, const size_t inLen, const si
 			v71 = 0;
 			v22 = ((unsigned __int64)(unsigned int)v19 >> (((unsigned int)(v21 - 31) >> 3) & 6)) & 0x3F;
 			v23 = 1 << (v21 + ((v19 >> 4) & ((24 * (((unsigned int)(v21 - 31) >> 3) & 2)) >> 4)));
-			m_currentBit += (((unsigned int)(v21 - 31) >> 3) & 6) + s_defaultDecoderLUT[v22 + 1088] + v21 + ((v19 >> 4) & ((24 * (((unsigned int)(v21 - 31) >> 3) & 2)) >> 4));
-			m_outputMask = decoder->outputMask;
+			currentBit += (((unsigned int)(v21 - 31) >> 3) & 6) + s_defaultDecoderLUT[v22 + 1088] + v21 + ((v19 >> 4) & ((24 * (((unsigned int)(v21 - 31) >> 3) & 2)) >> 4));
+			outputMask = decoder->outputMask;
 			v25 = 16 * (v23 + ((v23 - 1) & (v19 >> ((((unsigned int)(v21 - 31) >> 3) & 6) + s_defaultDecoderLUT[v22 + 1088]))));
 			v19 >>= (((unsigned int)(v21 - 31) >> 3) & 6) + s_defaultDecoderLUT[v22 + 1088] + v21 + ((v19 >> 4) & ((24 * (((unsigned int)(v21 - 31) >> 3) & 2)) >> 4));
 			v26 = v25 + s_defaultDecoderLUT[v22 + 1024] - 16;
-			v27 = m_outputMask & (m_decompBytePosition - v26);
-			v28 = &v70[m_decompBytePosition & m_outputMask];
+			v27 = outputMask & (outBufBytePos - v26);
+			v28 = &v70[outBufBytePos & outputMask];
 			v29 = &v70[v27];
 			if ((_DWORD)v20 == 17)
 			{
@ -325,26 +431,26 @@ bool Pak_RStreamDecode(PakDecoder_t* const decoder, const size_t inLen, const si
 				}
 				else
 				{
-					m_currentBit += 4;
+					currentBit += 4;
 					v46 = v41 & 0xF;
 					v43 = v41 >> 4;
 					v44 = *(_DWORD*)&s_defaultDecoderLUT[4 * v46 + 1152];
 					v45 = s_defaultDecoderLUT[v46 + 1216];
-					if (v74 && m_currentBit + v45 >= 61)
+					if (v74 && currentBit + v45 >= 61)
 					{
-						v47 = m_fileBytePosition++ & decoder->inputMask;
-						v43 |= (unsigned __int64)v74[v47] << (61 - (unsigned __int8)m_currentBit);
-						m_currentBit -= 8;
+						v47 = inBufBytePos++ & decoder->inputMask;
+						v43 |= (unsigned __int64)v74[v47] << (61 - (unsigned __int8)currentBit);
+						currentBit -= 8;
 					}
 				}
-				m_currentBit += v45 + 3;
+				currentBit += v45 + 3;
 				v19 = v43 >> v45;
 				v48 = ((unsigned int)v43 & ((1 << v45) - 1)) + v44 + 17;
-				m_decompBytePosition += v48;
+				outBufBytePos += v48;
 				if (v26 < 8)
 				{
 					v50 = v48 - 13;
-					m_decompBytePosition -= 13i64;
+					outBufBytePos -= 13i64;
 					if (v26 == 1)
 					{
 						v51 = *v29;
@ -378,59 +484,59 @@ bool Pak_RStreamDecode(PakDecoder_t* const decoder, const size_t inLen, const si
 			}
 			else
 			{
-				m_decompBytePosition += v20;
+				outBufBytePos += v20;
 				*(_QWORD*)v28 = *(_QWORD*)v29;
 				*((_QWORD*)v28 + 1) = *((_QWORD*)v29 + 1);
 			}
-			m_inputBuf = v74;
+			inputBuf = v74;
 		}
-		if (m_fileBytePosition >= qword70)
+		if (inBufBytePos >= qword70)
 			break;
 	LABEL_29:
-		m_outputBuf = v70;
-		v13 = (*(_QWORD*)&m_inputBuf[m_fileBytePosition & decoder->inputMask] << (64 - (unsigned __int8)m_currentBit)) | v19;
+		outputBuf = v70;
+		v13 = (*(_QWORD*)&inputBuf[inBufBytePos & decoder->inputMask] << (64 - (unsigned __int8)currentBit)) | v19;
 	}
-	if (m_decompBytePosition != decoder->decompressedStreamSize)
+	if (outBufBytePos != decoder->decompressedStreamSize)
 		goto LABEL_25;
-	m_decompSize = decoder->decompSize;
-	if (m_decompBytePosition == m_decompSize)
+	decompSize = decoder->decompSize;
+	if (outBufBytePos == decompSize)
 	{
 		result = true;
 		goto LABEL_69;
 	}
-	m_inputInvMask = decoder->inputInvMask;
-	m_headerOffset = decoder->headerOffset;
-	v33 = m_inputInvMask & -(__int64)m_fileBytePosition;
+	inputInvMask = decoder->inputInvMask;
+	headerOffset = decoder->headerOffset;
+	v33 = inputInvMask & -(__int64)inBufBytePos;
 	v19 >>= 1;
-	++m_currentBit;
-	if (m_headerOffset > v33)
+	++currentBit;
+	if (headerOffset > v33)
 	{
-		m_fileBytePosition += v33;
+		inBufBytePos += v33;
 		v34 = decoder->qword70;
-		if (m_fileBytePosition > v34)
-			decoder->qword70 = m_inputInvMask + v34 + 1;
+		if (inBufBytePos > v34)
+			decoder->qword70 = inputInvMask + v34 + 1;
 	}
-	v35 = m_fileBytePosition & decoder->inputMask;
-	m_fileBytePosition += m_headerOffset;
-	v36 = m_decompBytePosition + decoder->outputInvMask + 1;
-	v37 = *(_QWORD*)&m_inputBuf[v35] & ((1i64 << (8 * (unsigned __int8)m_headerOffset)) - 1);
+	v35 = inBufBytePos & decoder->inputMask;
+	inBufBytePos += headerOffset;
+	v36 = outBufBytePos + decoder->outputInvMask + 1;
+	v37 = *(_QWORD*)&inputBuf[v35] & ((1i64 << (8 * (unsigned __int8)headerOffset)) - 1);
 	v38 = v37 + decoder->bufferSizeNeeded;
 	v39 = v37 + decoder->compressedStreamSize;
 	decoder->bufferSizeNeeded = v38;
 	decoder->compressedStreamSize = v39;
-	if (v36 >= m_decompSize)
+	if (v36 >= decompSize)
 	{
-		v36 = m_decompSize;
-		decoder->compressedStreamSize = m_headerOffset + v39;
+		v36 = decompSize;
+		decoder->compressedStreamSize = headerOffset + v39;
 	}
 	decoder->decompressedStreamSize = v36;
 	if (inLen >= v38 && outLen >= v36)
 	{
 	LABEL_25:
 		qword70 = decoder->qword70;
-		if (m_fileBytePosition >= qword70)
+		if (inBufBytePos >= qword70)
 		{
-			m_fileBytePosition = ~decoder->inputInvMask & (m_fileBytePosition + 7);
+			inBufBytePos = ~decoder->inputInvMask & (inBufBytePos + 7);
 			qword70 += decoder->inputInvMask + 1;
 			decoder->qword70 = qword70;
 		}
@ -439,67 +545,77 @@ bool Pak_RStreamDecode(PakDecoder_t* const decoder, const size_t inLen, const si
 		goto LABEL_29;
 	}
 	v69 = decoder->qword70;
-	if (m_fileBytePosition >= v69)
+	if (inBufBytePos >= v69)
 	{
-		m_fileBytePosition = ~m_inputInvMask & (m_fileBytePosition + 7);
-		decoder->qword70 = v69 + m_inputInvMask + 1;
+		inBufBytePos = ~inputInvMask & (inBufBytePos + 7);
+		decoder->qword70 = v69 + inputInvMask + 1;
 	}
 	decoder->dword6C = v71;
 	result = false;
 	decoder->currentByte = v19;
-	decoder->currentBit = m_currentBit;
+	decoder->currentBit = currentBit;
 LABEL_69:
-	decoder->outBufBytePos = m_decompBytePosition;
-	decoder->inBufBytePos = m_fileBytePosition;
+	decoder->outBufBytePos = outBufBytePos;
+	decoder->inBufBytePos = inBufBytePos;
 	return result;
 }

-
 //-----------------------------------------------------------------------------
-// checks if we have enough output buffer room to decode the data stream
+// initializes the ZStd decoder
 //-----------------------------------------------------------------------------
-bool Pak_HasEnoughDecodeBufferLeft(PakDecoder_t* const decoder, const size_t outLen)
+size_t Pak_ZStreamDecoderInit(PakDecoder_t* const decoder, const uint8_t* const fileBuffer,
+	const uint64_t inputMask, const size_t dataSize, const size_t dataOffset, const size_t headerSize)
 {
-	const uint64_t bytesWritten = (decoder->outBufBytePos & ~decoder->outputInvMask);
-	return (outLen >= decoder->outputInvMask + (bytesWritten + 1) || outLen >= decoder->decompSize);
+	// NOTE: on original paks, this data is passed out of the frame header,
+	// but for ZStd encoded paks we are always limiting this to the ring
+	// buffer size
+	decoder->inputInvMask = PAK_DECODE_OUT_RING_BUFFER_MASK;
+	decoder->outputInvMask = PAK_DECODE_OUT_RING_BUFFER_MASK;
+
+	ZSTD_DStream* const dctx = ZSTD_createDStream();
+	assert(dctx);
+
+	// failure
+	if (!dctx)
+		return NULL;
+
+	decoder->zstreamContext = dctx;
+
+	// this points to the first byte of the frame header, takes dataOffset
+	// into account which is the offset in the ring buffer to the patched
+	// data as we parse it contiguously after the base pak data, which
+	// might have ended somewhere in the middle of the ring buffer
+	const uint8_t* const frameHeaderData = (inputMask & (dataOffset + headerSize)) + fileBuffer;
+
+	// this is the offset to the ZStd header in the input buffer
+	decoder->headerOffset = static_cast<uint32_t>(frameHeaderData - fileBuffer);
+
+	if (ZSTD_getFrameHeader(&dctx->fParams, frameHeaderData, dataSize) != 0)
+	{
+		ZSTD_freeDStream(decoder->zstreamContext);
+		decoder->zstreamContext = nullptr;
+
+		return NULL; // content size error
+	}
+
+	// ideally the frame header of the block gets parsed first, the length
+	// thereof is returned by initDStream and thus being processed first
+	// before moving on to actual data
+	decoder->frameHeaderSize = ZSTD_initDStream(dctx);
+
+	// we need at least this many bytes of streamed data to process the frame
+	// header of the compressed block
+	decoder->bufferSizeNeeded = decoder->headerOffset + decoder->frameHeaderSize;
+
+	// must include header size
+	decoder->decompSize = dctx->fParams.frameContentSize + headerSize;
+	return decoder->decompSize;
 }

 //-----------------------------------------------------------------------------
-// checks if we have enough source data streamed to decode the next block
+// decodes the ZStd data stream up to available buffer or data, whichever ends
+// first (determined by Pak_ComputeRingBufferFrame())
 //-----------------------------------------------------------------------------
-bool Pak_HasEnoughStreamedDataForDecode(PakDecoder_t* const decoder, const size_t inLen, const bool useZStream)
-{
-	// the decoder needs at least this many bytes to decode the current block
-	// buffer contiguously
-	return (inLen >= decoder->bufferSizeNeeded);
-}
-
-struct PakRingBufferFrame_t
-{
-	uint64_t bufIndex;
-	uint64_t frameLen;
-};
-
-//-----------------------------------------------------------------------------
-// gets the frame for the data in the ring buffer, the frame returned is always
-// either the end of the ring buffer, or the end of the data itself
-//-----------------------------------------------------------------------------
-PakRingBufferFrame_t Pak_ComputeRingBufferFrame(const uint64_t bufMask, const uint64_t seekPos, const uint64_t dataLen)
-{
-	PakRingBufferFrame_t ring;
-
-	ring.bufIndex = seekPos & bufMask;
-	const size_t bufSize = bufMask + 1;
-
-	const size_t bytesUsed = ring.bufIndex % bufSize;
-	const size_t totalAvail = bufSize - bytesUsed;
-
-	// the last part of the data might be smaller than the remainder of the ring
-	// buffer; clamp it
-	ring.frameLen = Min(dataLen - seekPos, totalAvail);
-	return ring;
-}
-
 bool Pak_ZStreamDecode(PakDecoder_t* const decoder, const size_t inLen, const size_t outLen)
 {
 	// must have a zstream decoder at this point, and input seek pos may not
@ -533,9 +649,21 @@ bool Pak_ZStreamDecode(PakDecoder_t* const decoder, const size_t inLen, const si
 		return false;
 	}

+	// the first call to this function expects a buffer with at least the size
+	// of the pak file header + the ZStd frame header, after this call we
+	// should subtract it from the needed buffer as
+	if (decoder->frameHeaderSize)
+	{
+		decoder->bufferSizeNeeded -= decoder->frameHeaderSize;
+		decoder->frameHeaderSize = 0;
+	}
+
 	// on the next call, we need at least this amount of data streamed in order
 	// to decode the rest of the pak file, as this is where reading has stopped
-	decoder->bufferSizeNeeded = inBuffer.pos;
+	// this value may equal the currently streamed input size, as its possible
+	// this function is getting called to flush the remainder decoded data into
+	// the out buffer which got truncated off on the call prior due to wrapping
+	decoder->bufferSizeNeeded += inBuffer.pos;

 	// advance buffer io positions, required so the main parser could already
 	// start parsing the headers while the rest is getting decoded still
@ -555,170 +683,52 @@ bool Pak_ZStreamDecode(PakDecoder_t* const decoder, const size_t inLen, const si
 }

 //-----------------------------------------------------------------------------
-// initialize the pak decoder context
+// initializes the decoder
 //-----------------------------------------------------------------------------
-size_t Pak_InitDefaultDecoder(PakDecoder_t* const decoder, const uint8_t* const fileBuffer,
-	const uint64_t inputMask, const size_t dataSize, const size_t dataOffset, const size_t headerSize)
+size_t Pak_InitDecoder(PakDecoder_t* const decoder, const uint8_t* const inputBuf, uint8_t* const outputBuf,
+	const uint64_t inputMask, const uint64_t outputMask, const size_t dataSize, const size_t dataOffset,
+	const size_t headerSize, const bool useZStream)
 {
-	uint64_t v8; // r9
-	unsigned __int64 v9; // r11
-	unsigned __int64 v10; // r8
-	int v11; // er8
-	__int64 v12; // rbx
-	unsigned int v13; // ebp
-	unsigned __int64 v14; // rbx
-	uint64_t v15; // rax
-	unsigned int v16; // er9
-	unsigned __int64 v17; // r12
-	unsigned __int64 v18; // r11
-	unsigned __int64 v19; // r10
-	unsigned __int64 v20; // rax
-	int v21; // ebp
-	unsigned __int64 v22; // r10
-	unsigned int v23; // er9
-	uint64_t v24; // rax
-	__int64 v25; // rsi
-	uint64_t v26; // rdx
-	size_t result; // rax
-	uint64_t v28; // rdx
+	// buffer size must be power of two as we index into buffers using a bit
+	// mask rather than modulo, the mask provided must be bufferSize-1
+	assert(IsPowerOfTwo(inputMask + 1));
+	assert(IsPowerOfTwo(outputMask + 1));

-	v8 = dataOffset + headerSize + 8;
-	v9 = *(_QWORD*)((inputMask & (dataOffset + headerSize)) + fileBuffer);
-
-	v10 = v9;
-	v9 >>= 6;
-	v11 = v10 & 0x3F;
-	decoder->decompSize = (1i64 << v11) | v9 & ((1i64 << v11) - 1);
-	v12 = *(_QWORD*)((inputMask & v8) + fileBuffer) << (64 - ((unsigned __int8)v11 + 6));
-	decoder->inBufBytePos = v8 + ((unsigned __int64)(unsigned int)(v11 + 6) >> 3);
-	v13 = ((v11 + 6) & 7) + 13;
-	v14 = (0xFFFFFFFFFFFFFFFFui64 >> ((v11 + 6) & 7)) & ((v9 >> v11) | v12);
-	v15 = inputMask & decoder->inBufBytePos;
-	v16 = (((_BYTE)v14 - 1) & 0x3F) + 1;
-	v17 = 0xFFFFFFFFFFFFFFFFui64 >> (64 - (unsigned __int8)v16);
-	decoder->inputInvMask = v17;
-	v18 = 0xFFFFFFFFFFFFFFFFui64 >> (64 - ((((v14 >> 6) - 1) & 0x3F) + 1));
-	decoder->outputInvMask = v18;
-	v19 = (v14 >> 13) | (*(_QWORD*)(v15 + fileBuffer) << (64 - (unsigned __int8)v13));
-	v20 = v13;
-	v21 = v13 & 7;
-	decoder->inBufBytePos += v20 >> 3;
-	v22 = (0xFFFFFFFFFFFFFFFFui64 >> v21) & v19;
-
-	if (v17 == -1i64)
-	{
-		decoder->headerOffset = 0;
-		decoder->bufferSizeNeeded = dataSize;
-	}
-	else
-	{
-		v23 = v16 >> 3;
-		v24 = inputMask & decoder->inBufBytePos;
-		decoder->headerOffset = v23 + 1;
-		v25 = *(_QWORD*)(v24 + fileBuffer) & ((1i64 << (8 * ((unsigned __int8)v23 + 1))) - 1);
-		decoder->inBufBytePos += v23 + 1;
-		decoder->bufferSizeNeeded = v25;
-	}
-
-	decoder->bufferSizeNeeded += dataOffset;
-	v26 = decoder->bufferSizeNeeded;
-	decoder->currentByte = v22;
-	decoder->currentBit = v21;
-	decoder->qword70 = v17 + dataOffset - 6;
-	result = decoder->decompSize;
-	decoder->dword6C = 0;
-	decoder->compressedStreamSize = v26;
-	decoder->decompressedStreamSize = result;
-
-	if ((((unsigned __int8)(v14 >> 6) - 1) & 0x3F) != -1i64 && result - 1 > v18)
-	{
-		v28 = v26 - decoder->headerOffset;
-		decoder->decompressedStreamSize = v18 + 1;
-		decoder->compressedStreamSize = v28;
-	}
-
-	return result;
-}
-
-size_t Pak_InitDecoder(PakDecoder_t* const decoder, const uint8_t* const fileBuffer,
-	const uint64_t inputMask, const size_t dataSize, const size_t dataOffset, const size_t headerSize, const bool useZStream)
-{
-	decoder->inputBuf = fileBuffer;
-	decoder->outputBuf = nullptr;
+	// the absolute start address of the input and output buffers
+	decoder->inputBuf = inputBuf;
+	decoder->outputBuf = outputBuf;

+	// the actual file size, which consists of dataOffset (anything up to the
+	// frame header, like the file header) and the actual encoded data itself
 	decoder->fileSize = dataOffset + dataSize;
-	decoder->dword44 = NULL;
+	decoder->padding = NULL;

+	// buffer masks, which essentially gets used to index into the input and
+	// output buffers, similar to 'idx % bufSize', where bufSize = bufMask+1
 	decoder->inputMask = inputMask;
-	decoder->outputMask = NULL;
+	decoder->outputMask = outputMask;

+	// the current positions in the input and output buffers; if we deal with
+	// paks that are patched, the buffer positions during the init and decode
+	// call on subsequent patches may not be at the start of the buffers
 	decoder->inBufBytePos = dataOffset + headerSize;
 	decoder->outBufBytePos = headerSize;

 	if (useZStream)
-	{
-		// NOTE: on original paks, this data is passed out of the frame header,
-		// but for ZStd encoded paks we are always limiting this to the ring
-		// buffer size
-		decoder->inputInvMask = PAK_DECODE_OUT_RING_BUFFER_MASK;
-		decoder->outputInvMask = PAK_DECODE_OUT_RING_BUFFER_MASK;
+		return Pak_ZStreamDecoderInit(decoder, inputBuf, inputMask, dataSize, dataOffset, headerSize);

-		ZSTD_DStream* const dctx = ZSTD_createDStream();
-		assert(dctx);
-
-		if (!dctx)
-			return NULL;
-
-		decoder->zstreamContext = dctx;
-
-		// this is the offset to the ZSTD header in the input buffer
-		decoder->headerOffset = static_cast<uint32_t>(decoder->inBufBytePos);
-
-		// this points to the first byte of the frame header, takes dataOffset
-		// into account which is the offset in the ring buffer to the patched
-		// data as we parse it contiguously after the base pak data, which
-		// might have ended somewhere in the middle of the ring buffer
-		const uint8_t* const frameHeaderData = (inputMask & (dataOffset + headerSize)) + fileBuffer;
-
-		if (ZSTD_getFrameHeader(&dctx->fParams, frameHeaderData, dataSize) != 0)
-		{
-			if (decoder->zstreamContext)
-			{
-				ZSTD_freeDStream(decoder->zstreamContext);
-				decoder->zstreamContext = nullptr;
-			}
-
-			return NULL; // content size error
-		}
-
-		// ideally the frame header of the block gets parsed first, the length
-		// thereof is returned by initDStream and thus being processed first
-		// before moving on to actual data
-		const size_t frameMetaDataSize = ZSTD_initDStream(dctx);
-
-		// we need at least this many bytes of streamed data to process the frame
-		// header of the compressed block
-		decoder->bufferSizeNeeded = frameMetaDataSize + decoder->headerOffset;
-
-		// must include header size
-		const uint64_t decompSize = dctx->fParams.frameContentSize + headerSize;
-
-		decoder->decompSize = decompSize;
-		return decompSize;
-	}
-
-	return Pak_InitDefaultDecoder(decoder, fileBuffer, inputMask, dataSize, dataOffset, headerSize);
+	return Pak_RStreamDecoderInit(decoder, inputBuf, inputMask, dataSize, dataOffset, headerSize);
 }

 //-----------------------------------------------------------------------------
-// decode input pak data
+// decodes streamed input pak data
 //-----------------------------------------------------------------------------
 bool Pak_StreamToBufferDecode(PakDecoder_t* const decoder, const size_t inLen, const size_t outLen, const bool useZStream)
 {
-	if (!Pak_HasEnoughStreamedDataForDecode(decoder, inLen, useZStream))
+	if (!Pak_HasEnoughStreamedDataForDecode(decoder, inLen))
 		return false;

-	if (!Pak_HasEnoughDecodeBufferLeft(decoder, outLen))
+	if (!Pak_HasEnoughDecodeBufferAvailable(decoder, outLen))
 		return false;

 	if (useZStream)
--- a/r5dev/rtech/pak/pakdecode.h
+++ b/r5dev/rtech/pak/pakdecode.h
@ -2,11 +2,9 @@
 #define RTECH_PAKDECODE_H
 #include "rtech/ipakfile.h"

-extern size_t Pak_InitDefaultDecoder(PakDecoder_t* const decoder, const uint8_t* const fileBuffer,
-	const uint64_t inputMask, const size_t dataSize, const size_t dataOffset, const size_t headerSize);
-
-extern size_t Pak_InitDecoder(PakDecoder_t* const decoder, const uint8_t* const fileBuffer,
-	const uint64_t inputMask, const size_t dataSize, const size_t dataOffset, const size_t headerSize, const bool useCustom);
+extern size_t Pak_InitDecoder(PakDecoder_t* const decoder, const uint8_t* const inputBuf, uint8_t* const outputBuf,
+	const uint64_t inputMask, const uint64_t outputMask, const size_t dataSize, const size_t dataOffset,
+	const size_t headerSize, const bool useZStream);

 extern bool Pak_StreamToBufferDecode(PakDecoder_t* const decoder, const size_t inLen, const size_t outLen, const bool useCustom);

--- a/r5dev/rtech/pak/pakparse.cpp
+++ b/r5dev/rtech/pak/pakparse.cpp
@ -382,8 +382,10 @@ LABEL_18:

                decodeContext = &pak->pakDecoder;

-                decompressedSize = Pak_InitDecoder(&pak->pakDecoder, fileStream->buffer,
-                    PAK_DECODE_IN_RING_BUFFER_MASK, v22->compressedSize - (v22->dataOffset - sizeof(PakFileHeader_t)),
+                decompressedSize = Pak_InitDecoder(&pak->pakDecoder,
+                    fileStream->buffer, pak->decompBuffer,
+                    PAK_DECODE_IN_RING_BUFFER_MASK, PAK_DECODE_OUT_RING_BUFFER_MASK, 
+                    v22->compressedSize - (v22->dataOffset - sizeof(PakFileHeader_t)),
                    v22->dataOffset - sizeof(PakFileHeader_t), sizeof(PakFileHeader_t), useZStream);

                if (decompressedSize != v22->decompressedSize)
@ -392,9 +394,6 @@ LABEL_18:
                        pak->memoryData.fileName,
                        decompressedSize,
                        pak->memoryData.pakHeader.decompressedSize);
-
-                pak->pakDecoder.outputBuf = pak->decompBuffer;
-                pak->pakDecoder.outputMask = PAK_DECODE_OUT_RING_BUFFER_MASK;
            }
            else
            {