From 8892f527f529f05fe04092f4b7ee9ebed0cdb530 Mon Sep 17 00:00:00 2001 From: barubary Date: Sat, 14 May 2011 14:19:11 +0000 Subject: [PATCH] C#: added a variation to the original LZ-10 compression algorithm that increases the compression rate, while still ensuring compatibility with the built-in decompression. --- CSharp/DSDecmp/Formats/CompressionFormat.cs | 3 +- CSharp/DSDecmp/Formats/LZOvl.cs | 6 +- CSharp/DSDecmp/Formats/Nitro/Huffman.cs | 10 +- CSharp/DSDecmp/Formats/Nitro/LZ10.cs | 197 +++++++++++++++++++- CSharp/DSDecmp/Formats/Nitro/LZ11.cs | 10 +- CSharp/DSDecmp/Formats/Nitro/RLE.cs | 10 +- CSharp/DSDecmp/NewProgram.cs | 8 +- 7 files changed, 226 insertions(+), 18 deletions(-) diff --git a/CSharp/DSDecmp/Formats/CompressionFormat.cs b/CSharp/DSDecmp/Formats/CompressionFormat.cs index e54e412..b484297 100644 --- a/CSharp/DSDecmp/Formats/CompressionFormat.cs +++ b/CSharp/DSDecmp/Formats/CompressionFormat.cs @@ -75,9 +75,10 @@ namespace DSDecmp.Formats /// input data may be read (if there is padding, for example), however never more than /// this number of bytes is read from the input stream. /// The stream to write the decompressed data to. + /// The length of the output data. /// When the given length of the input data /// is not enough to properly decompress the input. 
- public abstract void Decompress(Stream instream, long inLength, Stream outstream); + public abstract long Decompress(Stream instream, long inLength, Stream outstream); /// /// Compresses the given input file, and writes the compressed data to the given diff --git a/CSharp/DSDecmp/Formats/LZOvl.cs b/CSharp/DSDecmp/Formats/LZOvl.cs index dd6bde1..45c94b5 100644 --- a/CSharp/DSDecmp/Formats/LZOvl.cs +++ b/CSharp/DSDecmp/Formats/LZOvl.cs @@ -90,7 +90,7 @@ namespace DSDecmp.Formats } #endregion - public override void Decompress(System.IO.Stream instream, long inLength, System.IO.Stream outstream) + public override long Decompress(System.IO.Stream instream, long inLength, System.IO.Stream outstream) { #region Format description // Overlay LZ compression is basically just LZ-0x10 compression. @@ -152,6 +152,8 @@ namespace DSDecmp.Formats // make sure the input is positioned at the end of the file instream.Position += 4; + return inLength - 4; + #endregion } else @@ -268,6 +270,8 @@ namespace DSDecmp.Formats // make sure the input is positioned at the end of the file; the stream is currently // at the compression header. instream.Position += headerSize; + + return decompressedLength + (inLength - headerSize - compressedSize); } } diff --git a/CSharp/DSDecmp/Formats/Nitro/Huffman.cs b/CSharp/DSDecmp/Formats/Nitro/Huffman.cs index 13169f0..7ced9ad 100644 --- a/CSharp/DSDecmp/Formats/Nitro/Huffman.cs +++ b/CSharp/DSDecmp/Formats/Nitro/Huffman.cs @@ -24,7 +24,7 @@ namespace DSDecmp.Formats.Nitro return base.Supports(stream, inLength); } - public override void Decompress(Stream instream, long inLength, Stream outstream) + public override long Decompress(Stream instream, long inLength, Stream outstream) { #region GBATEK format specification /* @@ -178,7 +178,13 @@ namespace DSDecmp.Formats.Nitro readBytes += 4 - (readBytes % 4); if (readBytes < inLength) - throw new TooMuchInputException(readBytes, inLength); + { + // the input may be 4-byte aligned. 
+ if ((readBytes ^ (readBytes & 3)) + 4 < inLength) + throw new TooMuchInputException(readBytes, inLength); + } + + return decompressedSize; } public override int Compress(Stream instream, long inLength, Stream outstream) diff --git a/CSharp/DSDecmp/Formats/Nitro/LZ10.cs b/CSharp/DSDecmp/Formats/Nitro/LZ10.cs index afcea9b..bf5efe3 100644 --- a/CSharp/DSDecmp/Formats/Nitro/LZ10.cs +++ b/CSharp/DSDecmp/Formats/Nitro/LZ10.cs @@ -11,9 +11,27 @@ namespace DSDecmp.Formats.Nitro /// public class LZ10 : NitroCFormat { + private static bool lookAhead = false; + /// + /// Sets the flag that determines if 'look-ahead'/DP should be used when compressing + /// with the LZ-10 format. The default is false, which is what is used in the original + /// implementation. + /// + public static bool LookAhead + { + set { lookAhead = value; } + } + public LZ10() : base(0x10) { } - public override void Decompress(Stream instream, long inLength, + #region 'Original' Decompression method + /// + /// Decompress a stream that is compressed in the LZ-10 format. + /// + /// The compressed stream. + /// The length of the input stream. + /// The output stream, where the decompressed data is written to. + public override long Decompress(Stream instream, long inLength, Stream outstream) { #region format definition form GBATEK/NDSTEK @@ -138,13 +156,21 @@ namespace DSDecmp.Formats.Nitro buffer[bufferOffset] = (byte)next; bufferOffset = (bufferOffset + 1) % bufferLength; } + outstream.Flush(); } if (readBytes < inLength) - throw new TooMuchInputException(readBytes, inLength); + { + // the input may be 4-byte aligned. + if ((readBytes ^ (readBytes & 3)) + 4 < inLength) + throw new TooMuchInputException(readBytes, inLength); + } + return decompressedSize; } - + #endregion + + #region Original Compress method public unsafe override int Compress(Stream instream, long inLength, Stream outstream) { // make sure the decompressed size fits in 3 bytes. 
@@ -152,6 +178,12 @@ namespace DSDecmp.Formats.Nitro // in every game, as it may not be a built-in function. if (inLength > 0xFFFFFF) throw new InputTooLargeException(); + + // use the other method if lookahead is enabled + if (lookAhead) + { + return CompressWithLA(instream, inLength, outstream); + } // save the input data in an array to prevent having to go back and forth in a file byte[] indata = new byte[inLength]; @@ -177,6 +209,7 @@ namespace DSDecmp.Formats.Nitro int readBytes = 0; while (readBytes < inLength) { + #region If 8 blocks are buffered, write them and reset the buffer // we can only buffer 8 blocks at a time. if (bufferedBlocks == 8) { @@ -187,6 +220,7 @@ namespace DSDecmp.Formats.Nitro bufferlength = 1; bufferedBlocks = 0; } + #endregion // determine if we're dealing with a compressed or raw block. // it is a compressed block when the next 3 or more bytes can be copied from @@ -234,10 +268,153 @@ namespace DSDecmp.Formats.Nitro return compressedLength; } + #endregion + + #region Dynamic Programming compression method + /// + /// Variation of the original compression method, making use of Dynamic Programming to 'look ahead' + /// and determine the optimal 'length' values for the compressed blocks. Is not 100% optimal, + /// as the flag-bytes are not taken into account.
+ /// + private unsafe int CompressWithLA(Stream instream, long inLength, Stream outstream) + { + // save the input data in an array to prevent having to go back and forth in a file + byte[] indata = new byte[inLength]; + int numReadBytes = instream.Read(indata, 0, (int)inLength); + if (numReadBytes != inLength) + throw new StreamTooShortException(); + + // write the compression header first + outstream.WriteByte(0x10); + outstream.WriteByte((byte)(inLength & 0xFF)); + outstream.WriteByte((byte)((inLength >> 8) & 0xFF)); + outstream.WriteByte((byte)((inLength >> 16) & 0xFF)); + + int compressedLength = 4; + + fixed (byte* instart = &indata[0]) + { + // we do need to buffer the output, as the first byte indicates which blocks are compressed. + // the look-ahead is precomputed into the lengths/disps arrays below, so we still do not need to buffer more than 8 blocks at a time. + byte[] outbuffer = new byte[8 * 2 + 1]; + outbuffer[0] = 0; + int bufferlength = 1, bufferedBlocks = 0; + int readBytes = 0; + + // get the optimal choices for len and disp + int[] lengths, disps; + this.GetOptimalCompressionLengths(instart, indata.Length, out lengths, out disps); + while (readBytes < inLength) + { + // we can only buffer 8 blocks at a time.
+ if (bufferedBlocks == 8) + { + outstream.Write(outbuffer, 0, bufferlength); + compressedLength += bufferlength; + // reset the buffer + outbuffer[0] = 0; + bufferlength = 1; + bufferedBlocks = 0; + } + + + if (lengths[readBytes] == 1) + { + outbuffer[bufferlength++] = *(instart + (readBytes++)); + } + else + { + // mark the next block as compressed + outbuffer[0] |= (byte)(1 << (7 - bufferedBlocks)); + + outbuffer[bufferlength] = (byte)(((lengths[readBytes] - 3) << 4) & 0xF0); + outbuffer[bufferlength] |= (byte)(((disps[readBytes] - 1) >> 8) & 0x0F); + bufferlength++; + outbuffer[bufferlength] = (byte)((disps[readBytes] - 1) & 0xFF); + bufferlength++; + + readBytes += lengths[readBytes]; + } + + + bufferedBlocks++; + } + + // copy the remaining blocks to the output + if (bufferedBlocks > 0) + { + outstream.Write(outbuffer, 0, bufferlength); + compressedLength += bufferlength; + /*/ make the compressed file 4-byte aligned. + while ((compressedLength % 4) != 0) + { + outstream.WriteByte(0); + compressedLength++; + }/**/ + } + } + + return compressedLength; + } + #endregion + + #region DP compression helper method; GetOptimalCompressionLengths + /// + /// Gets the optimal compression lengths for each start of a compressed block using Dynamic Programming. + /// This takes O(n^2) time. + /// + /// The data to compress. + /// The length of the data to compress. + /// The optimal 'length' of the compressed blocks. For each byte in the input data, + /// this value is the optimal 'length' value. If it is 1, the block should not be compressed. + /// The 'disp' values of the compressed blocks. May be 0, in which case the + /// corresponding length will never be anything other than 1. 
+ private unsafe void GetOptimalCompressionLengths(byte* indata, int inLength, out int[] lengths, out int[] disps) + { + lengths = new int[inLength]; + disps = new int[inLength]; + int[] minLengths = new int[inLength]; + + for (int i = inLength - 1; i >= 0; i--) + { + // first get the compression length when the next byte is not compressed + minLengths[i] = int.MaxValue; + lengths[i] = 1; + if (i + 1 >= inLength) + minLengths[i] = 1; + else + minLengths[i] = 1 + minLengths[i + 1]; + // then the optimal compressed length + int oldLength = Math.Min(0x1000, i); + // get the appropriate disp while at it. Takes at most O(n) time if oldLength is considered O(n) + // be sure to bound the input length with 0x12, as that's the maximum length for LZ-10 compressed blocks. + int maxLen = GetOccurrenceLength(indata + i, Math.Min(inLength - i, 0x12), + indata + i - oldLength, oldLength, out disps[i]); + if (disps[i] > i) + throw new Exception("disp is too large"); + for (int j = 3; j <= maxLen; j++) + { + int newCompLen; + if (i + j >= inLength) + newCompLen = 2; + else + newCompLen = 2 + minLengths[i + j]; + if (newCompLen < minLengths[i]) + { + lengths[i] = j; + minLengths[i] = newCompLen; + } + } + } + + // we could optimize this further to also optimize it with regard to the flag-bytes, but that would require 8 times + // more space and time (one for each position in the block) for only a potentially tiny increase in compression ratio. + } + #endregion /// /// Determine the maximum size of a LZ-compressed block starting at newPtr, using the already compressed data - /// starting at oldPtr. + /// starting at oldPtr. Takes O(inLength * oldLength) = O(n^2) time. /// /// The start of the data that needs to be compressed. /// The number of bytes that still need to be compressed. 
@@ -251,15 +428,16 @@ namespace DSDecmp.Formats.Nitro if (newLength == 0) return 0; int maxLength = 0; - //for (int i = 1; i < oldLength; i++) + // try every possible 'disp' value (disp = oldLength - i) for (int i = 0; i < oldLength - 1; i++) { - // work from the end of the old data to the start, to mimic the original implementation's behaviour - //byte* currentOldStart = oldPtr + oldLength - i; - // WRONG: original works from start + // work from the start of the old data to the end, to mimic the original implementation's behaviour + // (and going from start to end or from end to start does not influence the compression ratio anyway) byte* currentOldStart = oldPtr + i; int currentLength = 0; - // determine the length we can copy if we go back i bytes + // determine the length we can copy if we go back (oldLength - i) bytes + // always check the next 'newLength' bytes, and not just the available 'old' bytes, + // as the copied data can also originate from what we're currently trying to compress. for (int j = 0; j < newLength; j++) { // stop when the bytes are no longer the same @@ -268,6 +446,7 @@ namespace DSDecmp.Formats.Nitro currentLength++; } + // update the optimal value if (currentLength > maxLength) { maxLength = currentLength; diff --git a/CSharp/DSDecmp/Formats/Nitro/LZ11.cs b/CSharp/DSDecmp/Formats/Nitro/LZ11.cs index e3fb99b..657e8ea 100644 --- a/CSharp/DSDecmp/Formats/Nitro/LZ11.cs +++ b/CSharp/DSDecmp/Formats/Nitro/LZ11.cs @@ -13,7 +13,7 @@ namespace DSDecmp.Formats.Nitro { public LZ11() : base(0x11) { } - public override void Decompress(Stream instream, long inLength, Stream outstream) + public override long Decompress(Stream instream, long inLength, Stream outstream) { #region Format definition in NDSTEK style /* Data header (32bit) @@ -213,7 +213,13 @@ namespace DSDecmp.Formats.Nitro } if (readBytes < inLength) - throw new TooMuchInputException(readBytes, inLength); + { + // the input may be 4-byte aligned. 
+ if ((readBytes ^ (readBytes & 3)) + 4 < inLength) + throw new TooMuchInputException(readBytes, inLength); + } + + return decompressedSize; } public override int Compress(Stream instream, long inLength, Stream outstream) diff --git a/CSharp/DSDecmp/Formats/Nitro/RLE.cs b/CSharp/DSDecmp/Formats/Nitro/RLE.cs index df2e695..c385799 100644 --- a/CSharp/DSDecmp/Formats/Nitro/RLE.cs +++ b/CSharp/DSDecmp/Formats/Nitro/RLE.cs @@ -13,7 +13,7 @@ namespace DSDecmp.Formats.Nitro { public RLE() : base(0x30) { } - public override void Decompress(Stream instream, long inLength, Stream outstream) + public override long Decompress(Stream instream, long inLength, Stream outstream) { /* Data header (32bit) @@ -119,7 +119,13 @@ namespace DSDecmp.Formats.Nitro } if (readBytes < inLength) - throw new TooMuchInputException(readBytes, inLength); + { + // the input may be 4-byte aligned. + if ((readBytes ^ (readBytes & 3)) + 4 < inLength) + throw new TooMuchInputException(readBytes, inLength); + } + + return decompressedSize; } public override int Compress(Stream instream, long inLength, Stream outstream) diff --git a/CSharp/DSDecmp/NewProgram.cs b/CSharp/DSDecmp/NewProgram.cs index 5e78a93..8b28317 100644 --- a/CSharp/DSDecmp/NewProgram.cs +++ b/CSharp/DSDecmp/NewProgram.cs @@ -27,7 +27,7 @@ namespace DSDecmp { Console.WriteLine("DSDecmp - Decompressor for compression formats used on the NDS - by Barubary"); Console.WriteLine(); - Console.WriteLine("Usage:\tDSDecmp (-c FORMAT) (-ge) input (output)"); + Console.WriteLine("Usage:\tDSDecmp (-c FORMAT (FORMATOPTS)) (-ge) input (output)"); Console.WriteLine(); Console.WriteLine("Without the -c modifier, DSDecmp will decompress the input file to the output"); Console.WriteLine("file. 
If the output file is a directory, the output file will be placed in that"); @@ -60,6 +60,12 @@ namespace DSDecmp Console.WriteLine(" gba* - The built-in compression format that gives the best compression"); Console.WriteLine(" ratio, and is also supported by the GBA."); Console.WriteLine(); + Console.WriteLine("The following format options are available:"); + Console.WriteLine(" lz10, lz11 and lzovl:"); + Console.WriteLine(" -opt : employs a better compression algorithm to boost the compression"); + Console.WriteLine(" ratio. Not using this option will result in using the algorithm"); + Console.WriteLine(" originally used to compress the game files."); + Console.WriteLine(); Console.WriteLine("Supplying the -ge modifier together with the -c modifier, the extension of the"); Console.WriteLine("compressed files will be extended with the 'FORMAT' value that always results"); Console.WriteLine("in that particualr format (so 'lz11', 'rle', etc).");