Skip to content

Implement a native PAK decoder #406

@rdw-software

Description

@rdw-software

Details:

Pretty sure the POC worked, but I haven't tested it recently. Might as well dump the code here:

local jit = require("jit")

-- local LOGFILE_HANDLE = assert(io.open("jit.log", "w+"))

local ffi = require("ffi")

-- Format constants and C struct layouts for Arcturus PAK archives.
-- The footer is read as exactly FOOTER_SIZE_IN_BYTES bytes from the end of the file, so its
-- fields must be fixed-width (4 + 4 + 1 bytes). `unsigned long` is 8 bytes on LP64 platforms,
-- which would shift every field and read past the 9-byte buffer.
local ArcturusPAK = {
	-- Number of footer bytes stored on disk. ffi.sizeof("ArcturusPAK") may be larger than this
	-- because the compiler-style layout adds trailing padding after versionTag.
	FOOTER_SIZE_IN_BYTES = 9,
	MAGIC_VERSION_TAG = 18, -- Expected value of the footer's versionTag field
	RECORD_TYPE_FILE = 1,
	RECORD_TYPE_DIRECTORY = 2,
	cdefs = [[
		typedef struct ArcturusPAK {
			uint32_t offset;
			uint32_t numRecords;
			uint8_t versionTag;
		}  ArcturusPAK;

		typedef struct ArcturusFileRecord {
			unsigned char pathSize;
			unsigned char type;
			int offset;
			int compressedSize;
			int decompressedSize;
		} ArcturusFileRecord;
	]],
}

-- Register the struct definitions with the FFI so they can be used in casts below
ffi.cdef(ArcturusPAK.cdefs)

-- Blocking load using standard Lua io library
-- Opens the archive at filePath and parses its footer.
-- Returns a table with the fields: handle (open file handle, owned by the caller),
-- offset and numRecords (taken from the footer), and versionTag.
-- Raises a string error (level 0) if the file can't be opened, is too small to contain
-- a footer, or carries an unexpected version tag. The handle is closed before raising.
function ArcturusPAK:Open(filePath)
	local pakFileHandle = io.open(filePath, "rb")
	if not pakFileHandle then
		error("Failed to open PAK file " .. filePath .. " (no such file exists)", 0)
	end

	local footerSize = self.FOOTER_SIZE_IN_BYTES

	local EOF = pakFileHandle:seek("end")
	if not EOF or EOF < footerSize then
		pakFileHandle:close()
		error("Failed to open PAK file " .. filePath .. " (not a valid PAK file)", 0)
	end
	pakFileHandle:seek("set", EOF - footerSize)

	local metadata = pakFileHandle:read(footerSize)
	if not metadata or #metadata < footerSize then
		-- A short read would make the cast below read outside of the string
		pakFileHandle:close()
		error("Failed to open PAK file " .. filePath .. " (not a valid PAK file)", 0)
	end

	-- The pointer aliases the Lua string, so copy everything into plain numbers right away
	local header = ffi.cast("ArcturusPAK*", metadata)
	local offset = tonumber(header.offset)
	local numRecords = tonumber(header.numRecords)
	local versionTag = tonumber(header.versionTag)

	if versionTag ~= ArcturusPAK.MAGIC_VERSION_TAG then
		pakFileHandle:close()
		error(
			"Invalid PAK version tag " .. versionTag .. " (" .. ArcturusPAK.MAGIC_VERSION_TAG .. " expected)",
			0
		)
	end

	return {
		handle = pakFileHandle,
		offset = offset,
		numRecords = numRecords,
		versionTag = versionTag,
	}
end

-- Reads all file records, starting at pakInfo.offset, from an archive opened via Open.
-- The returned table is also stored as pakInfo.records. Each record is a plain Lua table
-- (pathSize, type, offset, compressedSize, decompressedSize, pathString, cdata) and can be
-- looked up both by its 1-based position and by its path string.
-- Raises a string error (level 0) if the handle isn't an open file or the file ends early.
function ArcturusPAK:ReadTableOfContents(pakInfo)
	local handle = pakInfo.handle
	if io.type(handle) ~= "file" then
		error("Failed to read table of contents (invalid PAK file handle)", 0)
	end

	handle:seek("set", pakInfo.offset)

	local recordHeaderSize = ffi.sizeof("ArcturusFileRecord")
	local records = {}

	-- Even for HUGE archives this should be small enough to keep in memory
	for index = 1, pakInfo.numRecords, 1 do
		local recordHeader = handle:read(recordHeaderSize)
		if not recordHeader or #recordHeader < recordHeaderSize then
			error("Failed to read table of contents (unexpected end of file)", 0)
		end

		-- Copy into an FFI-owned struct: a pointer obtained by casting the Lua string
		-- would dangle as soon as the string is garbage collected
		local record = ffi.new("ArcturusFileRecord")
		ffi.copy(record, recordHeader, recordHeaderSize)

		-- The path is followed by one extra byte on disk, which is read but not kept
		local pathSize = tonumber(record.pathSize)
		local rawPath = handle:read(pathSize + 1)
		if not rawPath or #rawPath < pathSize then
			error("Failed to read table of contents (unexpected end of file)", 0)
		end
		local pathString = rawPath:sub(1, pathSize)

		local recordData = {
			pathSize = pathSize,
			type = tonumber(record.type),
			offset = tonumber(record.offset),
			compressedSize = tonumber(record.compressedSize),
			decompressedSize = tonumber(record.decompressedSize),
			pathString = pathString,
			cdata = record,
		}

		-- Both lookups refer to the same Lua table
		records[index] = recordData
		records[pathString] = recordData
	end

	pakInfo.records = records
	return records
end

-- Returns the raw (still compressed) bytes stored for the record named pathString.
-- Requires an open handle in pakInfo.handle and a table of contents in pakInfo.records.
-- Raises a string error (level 0) if either is missing, if there is no record for
-- pathString, or if the record is a directory.
function ArcturusPAK:GetCompressedFileContents(pakInfo, pathString)
	local errorPrefix = "Failed to get compressed file contents for record "

	local archive = pakInfo.handle
	if io.type(archive) ~= "file" then
		error(errorPrefix .. pathString .. " (invalid PAK file handle)", 0)
	end

	local tableOfContents = pakInfo.records
	if not tableOfContents then
		error(errorPrefix .. pathString .. " (table of contents not read)", 0)
	end

	local entry = tableOfContents[pathString]
	if not entry then
		error(errorPrefix .. pathString .. " (invalid path string)", 0)
	end

	if entry.type == ArcturusPAK.RECORD_TYPE_DIRECTORY then
		error(errorPrefix .. pathString .. " (it's a directory)", 0)
	end

	archive:seek("set", entry.offset)
	-- Parentheses keep this to a single return value, even if read yields extra results
	return (archive:read(entry.compressedSize))
end

-- Reads the compressed bytes of the record named pathString and returns the result of
-- passing them (and the record's decompressedSize) to DecompressFileContents.
-- Raises a string error (level 0) if the handle isn't an open file, the table of contents
-- wasn't read, there is no record for pathString, or the record is a directory.
function ArcturusPAK:GetDecompressedFileContents(pakInfo, pathString)
	local errorPrefix = "Failed to get decompressed file contents for record "

	local archive = pakInfo.handle
	if io.type(archive) ~= "file" then
		error(errorPrefix .. pathString .. " (invalid PAK file handle)", 0)
	end

	local tableOfContents = pakInfo.records
	if not tableOfContents then
		error(errorPrefix .. pathString .. " (table of contents not read)", 0)
	end

	local entry = tableOfContents[pathString]
	if not entry then
		error(errorPrefix .. pathString .. " (invalid path string)", 0)
	end

	if entry.type == ArcturusPAK.RECORD_TYPE_DIRECTORY then
		error(errorPrefix .. pathString .. " (it's a directory)", 0)
	end

	archive:seek("set", entry.offset)
	local packedBytes = archive:read(entry.compressedSize)

	return self:DecompressFileContents(packedBytes, entry.decompressedSize)
end

local bit = require("bit")

local CHUNK_SIZE = 1024

-- Pure-Lua decoder for the compressed payload of a file record.
-- The stream is a sequence of groups: one mask byte followed by up to eight items. Mask bits
-- are consumed from the least significant bit upwards. A cleared bit means the item is a
-- single literal byte; a set bit means it is a two-byte back-reference that copies
-- (high nibble of the second byte + 2) bytes, starting (low nibble * 256 + first byte)
-- positions behind the current end of the output.
-- @param size_compressed number of input bytes to consume (clamped to #compressed_data)
-- @param size_original expected output size; unused here, kept for parity with decompressFFI
-- @param compressed_data string holding the compressed bytes
-- @return string with the decoded bytes
-- Raises a string error (level 0) on a truncated or out-of-range back-reference.
local function decompress(size_compressed, size_original, compressed_data)
	-- string.char can only take a limited number of arguments, so the output is assembled
	-- in slices of this many bytes instead of a single unpack of the whole table
	local BYTES_PER_SLICE = 1024

	local string_byte, string_char = string.byte, string.char
	local band, rshift, lshift = bit.band, bit.rshift, bit.lshift

	-- Never index past the actual input, whatever size the caller claims
	local input_size = math.min(size_compressed, #compressed_data)

	local result = {}
	local result_index, bytes_read = 1, 1

	while bytes_read <= input_size do
		local mask = string_byte(compressed_data, bytes_read)
		bytes_read = bytes_read + 1

		for _ = 0, 7 do
			if bytes_read > input_size then
				break
			end

			if band(mask, 1) == 1 then
				if bytes_read + 1 > input_size then
					error("Failed to decompress file contents (truncated back-reference)", 0)
				end

				local byte1, byte2 = string_byte(compressed_data, bytes_read, bytes_read + 1)
				bytes_read = bytes_read + 2

				local match_length = rshift(byte2, 4) + 2
				local match_distance = lshift(band(byte2, 0x0F), 8) + byte1

				-- Copy byte by byte: source and destination may overlap on purpose
				for _ = 1, match_length do
					local copied_byte = result[result_index - match_distance]
					if copied_byte == nil then
						error("Failed to decompress file contents (invalid back-reference)", 0)
					end
					result[result_index] = copied_byte
					result_index = result_index + 1
				end
			else
				result[result_index] = string_byte(compressed_data, bytes_read)
				result_index = result_index + 1
				bytes_read = bytes_read + 1
			end

			mask = rshift(mask, 1)
		end
	end

	local num_bytes = result_index - 1
	local slices = {}
	for first = 1, num_bytes, BYTES_PER_SLICE do
		local last = math.min(first + BYTES_PER_SLICE - 1, num_bytes)
		slices[#slices + 1] = string_char(unpack(result, first, last))
	end

	return table.concat(slices)
end

-- FFI-buffer variant of the decoder: same stream format as decompress, but input and output
-- live in uint8_t arrays instead of Lua tables.
-- The stream is a sequence of groups: one mask byte followed by up to eight items (mask bits
-- consumed from the least significant bit). A cleared bit is a literal byte; a set bit is a
-- two-byte back-reference copying (high nibble of the second byte + 2) bytes from
-- (low nibble * 256 + first byte) positions behind the current end of the output.
-- @param size_compressed number of input bytes to consume (clamped to #compressed_data)
-- @param size_original size of the output buffer; the result always has exactly this length
-- @param compressed_data string holding the compressed bytes
-- @return string of size_original bytes (zero-filled wherever nothing was decoded)
-- Raises a string error (level 0) on a truncated or out-of-range back-reference.
local function decompressFFI(size_compressed, size_original, compressed_data)
	local band, rshift, lshift = bit.band, bit.rshift, bit.lshift

	-- FFI arrays aren't bounds-checked, so every index below has to be validated by hand
	local input_size = math.min(size_compressed, #compressed_data)

	local compressed_buffer = ffi.new("uint8_t[?]", #compressed_data)
	ffi.copy(compressed_buffer, compressed_data, #compressed_data)

	local result_buffer = ffi.new("uint8_t[?]", size_original)

	local result_index, bytes_read = 0, 0

	-- Decoding stops once the output buffer is full; anything beyond would overflow it
	while bytes_read < input_size and result_index < size_original do
		local mask = compressed_buffer[bytes_read]
		bytes_read = bytes_read + 1

		for _ = 0, 7 do
			if bytes_read >= input_size or result_index >= size_original then
				break
			end

			if band(mask, 1) == 1 then
				if bytes_read + 1 >= input_size then
					error("Failed to decompress file contents (truncated back-reference)", 0)
				end

				local byte1, byte2 = compressed_buffer[bytes_read], compressed_buffer[bytes_read + 1]
				bytes_read = bytes_read + 2

				local match_length = rshift(byte2, 4) + 2
				local match_distance = lshift(band(byte2, 0x0F), 8) + byte1

				-- Source and destination advance together, so checking the first index suffices
				if result_index - match_distance < 0 then
					error("Failed to decompress file contents (invalid back-reference)", 0)
				end

				-- Copy byte by byte: source and destination may overlap on purpose
				for _ = 1, match_length do
					if result_index >= size_original then
						break
					end
					result_buffer[result_index] = result_buffer[result_index - match_distance]
					result_index = result_index + 1
				end
			else
				result_buffer[result_index] = compressed_buffer[bytes_read]
				result_index = result_index + 1
				bytes_read = bytes_read + 1
			end

			mask = rshift(mask, 1)
		end
	end

	return ffi.string(result_buffer, size_original)
end
-- todo use string buffer as input, too - much easier to index?
-- TODO eliminate/move
-- Decodes compressedFileContents with the pure-Lua decoder and returns its result.
-- The compressed size handed to the decoder is the length of the given string.
function ArcturusPAK:DecompressFileContents(compressedFileContents, decompressedSize)
	local compressedSize = #compressedFileContents
	return decompress(compressedSize, decompressedSize, compressedFileContents)
end

-- Fixture archives used by the tests below (relative paths; presumably resolved against
-- the directory the tests are run from - TODO confirm)
local pakFilePath = "../Fixtures/data.pak"
local ZERO_SIZE_PAK = "../Fixtures/zerosize.pak"

-- Test helpers are expected to be provided as globals by whatever runs this file
local describe = _G.describe
local it = _G.it
local assertEquals = _G.assertEquals
local assertThrows = _G.assertThrows

describe("ArcturusPAK", function()
	-- Covers ArcturusPAK:Open: the three failure modes and the parsed footer fields
	describe("Open", function()
		it("should throw if passed an invalid file path", function()
			local function openNonexistentFile()
				ArcturusPAK:Open("meep.404")
			end
			assertThrows(openNonexistentFile, "Failed to open PAK file meep.404 (no such file exists)")
		end)

		it("should throw if the file is empty ", function()
			local function openEmptyFile()
				ArcturusPAK:Open(ZERO_SIZE_PAK) -- Should always exist
			end
			assertThrows(openEmptyFile, "Failed to open PAK file " .. ZERO_SIZE_PAK .. " (not a valid PAK file)")

			-- TODO assert fd is closed
		end)

		it("should throw if passed a valid non-PAK file path ", function()
			local function openNonPakFile()
				ArcturusPAK:Open("../Fixtures/invalid.pak") -- Should always exist
			end
			assertThrows(openNonPakFile, "Invalid PAK version tag 46 (18 expected)")

			-- TODO assert fd is closed
		end)

		-- Throw if not a valid PAK file

		it("should be able to read the archive metadata when given a valid PAK file path", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			-- Close right away so the handle can't leak if an assertion fails;
			-- a closed handle is still a userdata value, so the type check below is unaffected
			pakInfo.handle:close()

			assertEquals(pakInfo.versionTag, ArcturusPAK.MAGIC_VERSION_TAG)
			assertEquals(pakInfo.offset, 695226075)
			assertEquals(pakInfo.numRecords, 17743)
			assertEquals(type(pakInfo.handle), "userdata")
		end)
	end)

	-- Covers ArcturusPAK:ReadTableOfContents: closed-handle failure, record parsing, caching
	describe("ReadTableOfContents", function()
		it("should throw if the PAK file handle is already closed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			pakInfo.handle:close()
			local function readFromClosedHandle()
				ArcturusPAK:ReadTableOfContents(pakInfo)
			end
			assertThrows(readFromClosedHandle, "Failed to read table of contents (invalid PAK file handle)")
		end)

		it("should return the table of file records when passed a valid PAK file handle", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local records = ArcturusPAK:ReadTableOfContents(pakInfo)
			-- The records live in memory now, so the handle is no longer needed
			pakInfo.handle:close()

			assertEquals(#records, pakInfo.numRecords)

			local secondRecord = records["data/_tactics.scr"]

			-- TODO assert first record (records["data"]) is data folder,
			-- last (records[pakInfo.numRecords]) is X (TBD)

			assertEquals(secondRecord.pathSize, 17)
			assertEquals(secondRecord.type, ArcturusPAK.RECORD_TYPE_FILE)
			assertEquals(secondRecord.offset, 695224756)
			assertEquals(secondRecord.compressedSize, 1312)
			assertEquals(secondRecord.decompressedSize, 2472)
		end)

		it("should cache the table of contents when passed a valid PAK file handle", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local records = ArcturusPAK:ReadTableOfContents(pakInfo)
			pakInfo.handle:close()

			assertEquals(records, pakInfo.records)
		end)
	end)

	-- Covers ArcturusPAK:GetCompressedFileContents: every error path plus one known file
	describe("GetCompressedFileContents", function()
		it("should throw if the PAK file handle is already closed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			pakInfo.handle:close()
			local function readFromClosedHandle()
				ArcturusPAK:GetCompressedFileContents(pakInfo, "hello.world")
			end
			assertThrows(
				readFromClosedHandle,
				"Failed to get compressed file contents for record hello.world (invalid PAK file handle)"
			)
		end)

		-- TODO
		-- should throw if no handle was opened

		it("should throw if the table of contents wasn't yet read", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local function readWithoutTOC()
				ArcturusPAK:GetCompressedFileContents(pakInfo, "hello.world")
			end
			assertThrows(
				readWithoutTOC,
				"Failed to get compressed file contents for record hello.world (table of contents not read)"
			)
			pakInfo.handle:close()
		end)

		it("should throw if an invalid path string was passed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local function readWithInvalidPathString()
				ArcturusPAK:ReadTableOfContents(pakInfo)
				ArcturusPAK:GetCompressedFileContents(pakInfo, "hello.world")
			end
			assertThrows(
				readWithInvalidPathString,
				"Failed to get compressed file contents for record hello.world (invalid path string)"
			)
			pakInfo.handle:close()
		end)

		it("should throw if a directory path string was passed", function()
			-- Opened outside of the throwing function so that the handle can be closed afterwards
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local function attemptToReadDirectoryRecord()
				ArcturusPAK:ReadTableOfContents(pakInfo)
				ArcturusPAK:GetCompressedFileContents(pakInfo, "data/diary")
			end
			assertThrows(
				attemptToReadDirectoryRecord,
				"Failed to get compressed file contents for record data/diary (it's a directory)"
			)
			pakInfo.handle:close()
		end)

		-- TODO move
		-- local zlib = require("zlib")

		it("should return the compressed buffer if a valid path string was passed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			ArcturusPAK:ReadTableOfContents(pakInfo)

			local compressedFileContents = ArcturusPAK:GetCompressedFileContents(pakInfo, "data/global.ini")
			-- Everything needed was read into memory, so release the handle before asserting
			pakInfo.handle:close()

			assertEquals(#compressedFileContents, 187)
			-- local crc = zlib.crc32()(compressedFileContents)
			-- local adler = zlib.adler32()(compressedFileContents)
			-- assertEquals(crc, "hello world123")
			-- assertEquals(adler, "hello world123")
			-- It's not a guarantee that the file contents are 100% correct, but it's good enough for now
			assertEquals(compressedFileContents:sub(6, 8), "gnd")
			assertEquals(compressedFileContents:sub(165, 169), "@load")
		end)
	end)
	-- Close: throw if no fd, success if fd

	-- Covers ArcturusPAK:GetDecompressedFileContents: every error path plus one known file
	describe("GetDecompressedFileContents", function()
		it("should throw if the PAK file handle is already closed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			pakInfo.handle:close()
			local function readFromClosedHandle()
				ArcturusPAK:GetDecompressedFileContents(pakInfo, "hello.world")
			end
			assertThrows(
				readFromClosedHandle,
				"Failed to get decompressed file contents for record hello.world (invalid PAK file handle)"
			)
		end)

		-- TODO
		-- should throw if no handle was opened

		it("should throw if the table of contents wasn't yet read", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local function readWithoutTOC()
				ArcturusPAK:GetDecompressedFileContents(pakInfo, "hello.world")
			end
			assertThrows(
				readWithoutTOC,
				"Failed to get decompressed file contents for record hello.world (table of contents not read)"
			)
			pakInfo.handle:close()
		end)

		it("should throw if an invalid path string was passed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local function readWithInvalidPathString()
				ArcturusPAK:ReadTableOfContents(pakInfo)
				ArcturusPAK:GetDecompressedFileContents(pakInfo, "hello.world")
			end
			assertThrows(
				readWithInvalidPathString,
				"Failed to get decompressed file contents for record hello.world (invalid path string)"
			)
			pakInfo.handle:close()
		end)

		it("should throw if a directory path string was passed", function()
			-- Opened outside of the throwing function so that the handle can be closed afterwards
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local function attemptToDecompressDirectoryRecord()
				ArcturusPAK:ReadTableOfContents(pakInfo)
				ArcturusPAK:GetDecompressedFileContents(pakInfo, "data/bmp")
			end
			assertThrows(
				attemptToDecompressDirectoryRecord,
				"Failed to get decompressed file contents for record data/bmp (it's a directory)"
			)
			pakInfo.handle:close()
		end)

		-- TODO move
		-- local zlib = require("zlib")

		it("should return the decompressed buffer if a valid path string was passed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			ArcturusPAK:ReadTableOfContents(pakInfo)

			local decompressedFileContents = ArcturusPAK:GetDecompressedFileContents(pakInfo, "data/global.ini")
			-- Everything needed was read into memory, so release the handle before asserting
			pakInfo.handle:close()

			assertEquals(#decompressedFileContents, 275)
			-- local crc = zlib.crc32()(decompressedFileContents)
			-- local adler = zlib.adler32()(decompressedFileContents)
			-- assertEquals(crc, "hello world123")
			-- assertEquals(adler, "hello world123")
			-- It's not a guarantee that the file contents are 100% correct, but it's good enough for now
			assertEquals(decompressedFileContents:sub(5, 14), "gndopacity")
			assertEquals(decompressedFileContents:sub(145, 153), "wavewater")
		end)
	end)

	-- Benchmark-style check: runs both decoders on the same record and compares their output
	describe("DecompressBytes", function()
		it("should do some magic (TBD)", function()
			-- Number of decoder runs per implementation (the loops below run exactly this often)
			local NUM_ITERATIONS = 1000000

			-- Example usage and benchmark
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			ArcturusPAK:ReadTableOfContents(pakInfo)
			-- local compressedFileContents = ArcturusPAK:GetDecompressedFileContents(pakInfo, "data/arcfonth.dat")
			local compressedFileContents = ArcturusPAK:GetCompressedFileContents(pakInfo, "data/global.ini")
			local size_compressed = #compressedFileContents
			-- local size_original = pakInfo.records["data/arcfonth.dat"].decompressedSize
			local size_original = pakInfo.records["data/global.ini"].decompressedSize
			-- The input is in memory now; don't keep the file open for the whole benchmark
			pakInfo.handle:close()

			local result
			local start = os.clock() -- uv.hrtime?
			for _ = 1, NUM_ITERATIONS, 1 do
				result = decompress(size_compressed, size_original, compressedFileContents)
			end
			local elapsed = os.clock() - start

			print("Decompressed using Lua code in", elapsed, "seconds")

			start = os.clock()
			local resultFFI
			for _ = 1, NUM_ITERATIONS, 1 do
				resultFFI = decompressFFI(size_compressed, size_original, compressedFileContents)
			end
			elapsed = os.clock() - start

			print("Decompressed using FFI in", elapsed, "seconds")
			-- print(result, resultFFI)
			assert(result == resultFFI)
		end)
	end)
end)


-- TODO: test that all files in the archive can be decompressed and extracted

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions