aboutsummaryrefslogtreecommitdiff
path: root/tools/rasm/decrunch/unaplib_fast.asm
diff options
context:
space:
mode:
authorJuan J. Martinez <jjm@usebox.net>2022-09-06 07:37:20 +0100
committerJuan J. Martinez <jjm@usebox.net>2022-09-06 07:37:20 +0100
commit30bf0f51335e87812ffeb54e9437f0b6a1514d67 (patch)
tree9c85a2de53b4da69fcfaa84488cc6c12ebd3e5d0 /tools/rasm/decrunch/unaplib_fast.asm
parentd8990284057e6401d0374f439df51879595d804d (diff)
downloadubox-msx-lib-30bf0f51335e87812ffeb54e9437f0b6a1514d67.tar.gz
ubox-msx-lib-30bf0f51335e87812ffeb54e9437f0b6a1514d67.zip
Updated rasm to 1.7
Diffstat (limited to 'tools/rasm/decrunch/unaplib_fast.asm')
-rw-r--r--tools/rasm/decrunch/unaplib_fast.asm266
1 files changed, 266 insertions, 0 deletions
diff --git a/tools/rasm/decrunch/unaplib_fast.asm b/tools/rasm/decrunch/unaplib_fast.asm
new file mode 100644
index 0000000..47ad16b
--- /dev/null
+++ b/tools/rasm/decrunch/unaplib_fast.asm
@@ -0,0 +1,266 @@
+;
+; Speed-optimized ApLib decompressor by spke (ver.04 spring 2020, 236 bytes)
+;
+; The original Z80 decompressors for ApLib were written by Dan Weiss (Dwedit),
+; then tweaked by Francisco Javier Pena Pareja (utopian),
+; and optimized by Jaime Tejedor Gomez (Metalbrain) and Antonio Villena.
+;
+; This is a new "implicit state" decompressor heavily optimized for speed by spke.
+; (It is 11 bytes shorter and 14% faster than the previously fastest
+; 247b decompressor by Metalbrain and Antonio Villena.)
+;
+; ver.00 by spke (21/08/2018-01/09/2018, 244 bytes, an edit of the existing 247b decompressor);
+; ver.01 by spke (12-13/11/2018, 234(-10) bytes, +3% speed using the state machine for LWM);
+; ver.02 by spke (06/08/2019, +1% speed);
+; ver.03 by spke (27/08/2019, 236(+2) bytes, +1% speed using partly expanded LDIR);
+; ver.04 by spke (spring 2020, added full revision history and support for long offsets)
+;
+; The data must be compressed using any compressor for ApLib capable of generating raw data.
+; At present, two best available compressors are:
+;
+; "APC" by Sven-Ake Dahl: https://github.com/svendahl/cap or
+; "apultra" by Emmanuel Marty: https://github.com/emmanuel-marty/apultra
+;
+; The compression can done as follows:
+;
+; apc.exe e <sourcefile> <outfile>
+; or
+; apultra.exe <sourcefile> <outfile>
+;
+; A decent compressor was written by r57shell (although it is worse than compressors above):
+; http://gendev.spritesmind.net/forum/viewtopic.php?p=32548#p32548
+; The use of the official ApLib compressor by Joergen Ibsen is not recommended.
+;
+; The decompression is done in the standard way:
+;
+; ld hl,FirstByteOfCompressedData
+; ld de,FirstByteOfMemoryForDecompressedData
+; call DecompressApLib
+;
+; The decompressor modifies AF, AF', BC, DE, HL, IXH, IY.
+; (However, note that the option "AllowSelfmodifyingCode" removes the dependency on IY.)
+;
+; Of course, ApLib compression algorithms are (c) 1998-2014 Joergen Ibsen,
+; see http://www.ibsensoftware.com/ for more information
+;
+; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; DEFINE SupportLongOffsets ; +4 bytes for long offset support. slows decompression down by 1%, but may be needed to decompress files >=32K
+
+MACRO ApUnpack
+
+ ld a,128 : jr @LWM0_CASE0
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+@LWM0: ;LWM = 0 (LWM stands for "Last Was Match"; a flag that we did not have a match)
+
+@LWM0_ReloadByteC0 ld a,(hl) : inc hl : rla
+ jr c,@LWM0_Check2ndBit
+
+;
+; case "0"+BYTE: copy a single literal
+
+@LWM0_CASE0: ldi ; first byte is always copied as literal
+
+;
+; main decompressor loop
+
+@LWM0_MainLoop: add a : jr z,@LWM0_ReloadByteC0 : jr nc,@LWM0_CASE0 ; "0"+BYTE = copy literal
+@LWM0_Check2ndBit add a : call z,@ReloadByte : jr nc,@LWM0_CASE10 ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+ add a : call z,@ReloadByte : jp c,@LWM1_CASE111 ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+; branch "110"+[oooooool]: copy two or three bytes (bit "l") with the offset -1..-127 (bits "ooooooo"), or stop
+
+@LWM0_CASE110: ; "use 7 bit offset, length = 2 or 3"
+ ; "if a zero is found here, it's EOF"
+ ld c,(hl) : rr c : ret z ; process EOF
+ inc hl
+ ld b,0
+
+ ld iyl,c : ld iyh,b ; save offset for future LWMs
+
+ push hl ; save src
+ ld h,d : ld l,e ; HL = dest
+ jr c,@LWM0_LengthIs3
+
+@LWM0_LengthIs2 sbc hl,bc
+ ldi : ldi
+ jr @LWM0_PreMainLoop
+
+@LWM0_LengthIs3 or a : sbc hl,bc
+ ldi : ldi : ldi
+ jr @LWM0_PreMainLoop
+
+;
+; branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+@LWM0_CASE10: ; "use a gamma code * 256 for offset, another gamma code for length"
+ call @GetGammaCoded
+
+ ; the original decompressor contains
+ ;
+ ; if ((LWM == 0) && (offs == 2)) { ... }
+ ; else {
+ ; if (LWM == 0) { offs -= 3; }
+ ; else { offs -= 2; }
+ ; }
+ ;
+ ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+ ; and to split the first condition by noticing that C-1 can never be zero
+ dec c : dec c : jr z,@LWM1_KickInLWM
+
+@LWM0_AfterLWM dec c : ld b,c : ld c,(hl) : inc hl ; BC = offset
+
+ ld iyl,c : ld iyh,b : push bc
+
+ call @GetGammaCoded ; BC = len*
+
+ ex (sp),hl
+
+ ; interpretation of length value is offset-dependent:
+ ; if (offs >= 32000) len++; if (offs >= 1280) len++; if (offs < 128) len+=2;
+ ; in other words,
+ ; (1 <= offs < 128) +=2
+ ; (128 <= offs < 1280) +=0
+ ; (1280 <= offs < 31999) +=1
+ ; NB offsets over 32000 need one more check, but other Z80 decompressors seem to ignore it. is it not needed?
+
+ ; interpretation of length value is offset-dependent
+ exa : ld a,h
+ IFDEF SupportLongOffsets
+ ; NB offsets over 32000 require an additional check, which is skipped in most
+ ; Z80 decompressors (seemingly as a performance optimization)
+ cp 32000>>8 : jr nc,@LWM0_Add2
+ ENDIF
+ cp 5 : jr nc,@LWM0_Add1
+ or a : jr nz,@LWM0_Add0
+ bit 7,l : jr nz,@LWM0_Add0
+@LWM0_Add2 inc bc
+@LWM0_Add1 inc bc
+@LWM0_Add0 ; for offs<128 : 4+4+7+7 + 4+7 + 8+7 + 6+6 = 60t
+ ; for offs>=1280 : 4+4+7+12 + 6 = 33t
+ ; for 128<=offs<1280 : 4+4+7+7 + 4+12 = 38t OR 4+4+7+7 + 4+7+8+12 = 53t
+; dec bc
+
+@LWM0_CopyMatch: ; this assumes that BC = len, DE = offset, HL = dest
+ ; and also that (SP) = src, while having NC
+ ld a,e : sub l : ld l,a
+ ld a,d : sbc h
+@LWM0_CopyMatchLDH ld h,a : ldi : ldir : exa
+@LWM0_PreMainLoop pop hl ; recover src
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+@LWM1: ; LWM = 1
+
+;
+; main decompressor loop
+
+@LWM1_MainLoop: add a : jr z,@LWM1_ReloadByteC0 : jr nc,@LWM0_CASE0 ; "0"+BYTE = copy literal
+@LWM1_Check2ndBit add a : call z,@ReloadByte : jr nc,@LWM1_CASE10 ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+ add a : call z,@ReloadByte : jr nc,@LWM0_CASE110 ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+; case "111"+"oooo": copy a byte with offset -1..-15, or write zero to dest
+
+@LWM1_CASE111: ld bc,%11100000
+ add a : call z,@ReloadByte : rl c ; read short offset (4 bits)
+ add a : call z,@ReloadByte : rl c ; read short offset (4 bits)
+ add a : call z,@ReloadByte : rl c ; read short offset (4 bits)
+ add a : call z,@ReloadByte : rl c ; read short offset (4 bits)
+ ex de,hl : jr z,@LWM1_WriteZero ; zero offset means "write zero" (NB: B is zero here)
+
+ ; "write a previous byte (1-15 away from dest)"
+ push hl ; BC = offset, DE = src, HL = dest
+ sbc hl,bc ; HL = dest-offset (SBC works because branching above ensured NC)
+ ld b,(hl)
+ pop hl
+
+@LWM1_WriteZero ld (hl),b : ex de,hl
+ inc de : jp @LWM0_MainLoop ; 10+4*(4+10+8)+4+7 + 11+15+7+10 + 7+4+6+10 = 179t
+
+@LWM1_ReloadByteC0 ld a,(hl) : inc hl : rla
+ jp nc,@LWM0_CASE0
+ jr @LWM1_Check2ndBit
+
+;
+; branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+@LWM1_CASE10: ; "use a gamma code * 256 for offset, another gamma code for length"
+ call @GetGammaCoded
+
+ ; the original decompressor contains
+ ;
+ ; if ((LWM == 0) && (offs == 2)) { ... }
+ ; else {
+ ; if (LWM == 0) { offs -= 3; }
+ ; else { offs -= 2; }
+ ; }
+ ;
+ ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+ ; and to split the first condition by noticing that C-1 can never be zero
+ dec c : jp @LWM0_AfterLWM
+
+;
+; the re-use of the previous offset (LWM magic)
+
+@LWM1_KickInLWM: ; "and a new gamma code for length"
+ call @GetGammaCoded ; BC = len
+ push hl
+ exa : ld a,e : sub iyl : ld l,a
+ ld a,d : sbc iyh
+ jp @LWM0_CopyMatchLDH
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+;
+; interlaced gamma code reader
+; x0 -> 1x
+; x1y0 -> 1xy
+; x1y1z0 -> 1xyz etc
+; (technically, this is a 2-based variation of Exp-Golomb-1)
+
+@GetGammaCoded: ld bc,1
+@ReadGamma add a : jr z,@ReloadByteRG1
+ rl c : rl b
+ add a : jr z,@ReloadByteRG2
+ jr c,@ReadGamma : ret
+
+@ReloadByteRG1 ld a,(hl) : inc hl : rla
+ rl c : rl b
+ add a : jr c,@ReadGamma : ret
+
+@ReloadByteRG2 ld a,(hl) : inc hl : rla
+ jr c,@ReadGamma : ret
+
+;
+; pretty usual getbit for mixed datastreams
+
+@ReloadByte: ld a,(hl) : inc hl : rla : ret
+
+MEND
+