diff options
author | Juan J. Martinez <jjm@usebox.net> | 2021-01-09 09:01:05 +0000 |
---|---|---|
committer | Juan J. Martinez <jjm@usebox.net> | 2021-01-09 09:01:05 +0000 |
commit | 9bcf1e97960c0da7322a868efdbc07e2650716fe (patch) | |
tree | de6d32ad5b0e567991bd3eb262902c15a77074d9 /tools/apultra/asm | |
parent | 3b31adf01305e522f7e28c1435fb47418ce43267 (diff) | |
download | ubox-msx-lib-9bcf1e97960c0da7322a868efdbc07e2650716fe.tar.gz ubox-msx-lib-9bcf1e97960c0da7322a868efdbc07e2650716fe.zip |
Extra libs: ap.lib
aPLib support with apultra.
Diffstat (limited to 'tools/apultra/asm')
-rw-r--r-- | tools/apultra/asm/6502/aplib_6502.asm | 257 | ||||
-rw-r--r-- | tools/apultra/asm/6502/aplib_6502_b.asm | 218 | ||||
-rw-r--r-- | tools/apultra/asm/68000/unaplib_68000.S | 117 | ||||
-rw-r--r-- | tools/apultra/asm/6809/unaplib.s | 125 | ||||
-rw-r--r-- | tools/apultra/asm/6809/unaplib_6309.s | 139 | ||||
-rw-r--r-- | tools/apultra/asm/6809/unaplib_6309_b.s | 143 | ||||
-rw-r--r-- | tools/apultra/asm/6809/unaplib_b.s | 122 | ||||
-rw-r--r-- | tools/apultra/asm/8088/aplib_8088_fast.S | 178 | ||||
-rw-r--r-- | tools/apultra/asm/8088/aplib_8088_small.S | 177 | ||||
-rw-r--r-- | tools/apultra/asm/ARM7TDMI/aplib_arm.s | 150 | ||||
-rw-r--r-- | tools/apultra/asm/Z80/unaplib_fast.asm | 339 | ||||
-rw-r--r-- | tools/apultra/asm/Z80/unaplib_small.asm | 258 | ||||
-rw-r--r-- | tools/apultra/asm/x86/aplib_x86_fast.asm | 180 | ||||
-rw-r--r-- | tools/apultra/asm/x86/aplib_x86_small.asm | 159 |
14 files changed, 2562 insertions, 0 deletions
diff --git a/tools/apultra/asm/6502/aplib_6502.asm b/tools/apultra/asm/6502/aplib_6502.asm new file mode 100644 index 0000000..1bc11b4 --- /dev/null +++ b/tools/apultra/asm/6502/aplib_6502.asm @@ -0,0 +1,257 @@ +; *************************************************************************** +; *************************************************************************** +; +; aplib_6502.s +; +; NMOS 6502 decompressor for data stored in Jorgen Ibsen's aPLib format. +; +; Includes support for Emmanuel Marty's enhancements to the aPLib format. +; +; The code is 252 bytes long for standard format, 270 for enhanced format. +; +; This code is written for the ACME assembler. +; +; Copyright John Brandwood 2019. +; +; Distributed under the Boost Software License, Version 1.0. +; (See accompanying file LICENSE_1_0.txt or copy at +; http://www.boost.org/LICENSE_1_0.txt) +; +; *************************************************************************** +; *************************************************************************** + + + +; *************************************************************************** +; *************************************************************************** +; +; Decompression Macros +; + + ; + ; Macro to increment the source pointer to the next page. + ; + + !macro APL_INC_PAGE { + inc <apl_srcptr + 1 + } + + ; + ; Macro to read a byte from the compressed source data. + ; + + !macro APL_GET_SRC { + lda (apl_srcptr),y + inc <apl_srcptr + 0 + bne .skip + +APL_INC_PAGE +.skip: + } + + + +; *************************************************************************** +; *************************************************************************** +; +; Data usage is last 12 bytes of zero-page. +; + +apl_bitbuf = $F7 ; 1 byte. +apl_offset = $F8 ; 1 word. +apl_winptr = $FA ; 1 word. +apl_srcptr = $FC ; 1 word. +apl_dstptr = $FE ; 1 word. 
+apl_length = apl_winptr + + +; *************************************************************************** +; *************************************************************************** +; +; apl_decompress - Decompress data stored in Jorgen Ibsen's aPLib format. +; +; Args: apl_srcptr = ptr to compessed data +; Args: apl_dstptr = ptr to output buffer +; Uses: lots! +; +; As an optimization, the code to handle window offsets > 64768 bytes has +; been removed, since these don't occur with a 16-bit address range. +; +; As an optimization, the code to handle window offsets > 32000 bytes can +; be commented-out, since these don't occur in typical 8-bit computer usage. +; + +apl_decompress: ldy #0 ; Initialize source index. + + lda #$80 ; Initialize an empty + sta <apl_bitbuf ; bit-buffer. + + ; + ; 0 bbbbbbbb - One byte from compressed data, i.e. a "literal". + ; + +.literal: +APL_GET_SRC + +.write_byte: ldx #0 ; LWM=0. + + sta (apl_dstptr),y ; Write the byte directly to + inc <apl_dstptr + 0 ; the output. + bne .next_tag + inc <apl_dstptr + 1 + +.next_tag: asl <apl_bitbuf ; 0 bbbbbbbb + bne .skip0 + jsr .load_bit +.skip0: bcc .literal + +.skip1: asl <apl_bitbuf ; 1 0 <offset> <length> + bne .skip2 + jsr .load_bit +.skip2: bcc .copy_large + + asl <apl_bitbuf ; 1 1 0 dddddddn + bne .skip3 + jsr .load_bit +.skip3: bcc .copy_normal + + ; 1 1 1 dddd - Copy 1 byte within 15 bytes (or zero). + +.copy_short: lda #$10 +.nibble_loop: asl <apl_bitbuf + bne .skip4 + pha + jsr .load_bit + pla +.skip4: rol + bcc .nibble_loop + beq .write_byte ; Offset=0 means write zero. + + eor #$FF ; Read the byte directly from + tay ; the destination window. + iny + dec <apl_dstptr + 1 + lda (apl_dstptr),y + inc <apl_dstptr + 1 + ldy #0 + beq .write_byte + + ; + ; 1 1 0 dddddddn - Copy 2 or 3 within 128 bytes. + ; + +.copy_normal: +APL_GET_SRC ; 1 1 0 dddddddn + lsr + beq .finished ; Offset 0 == EOF. + + sta <apl_offset + 0 ; Preserve offset. + sty <apl_offset + 1 + tya ; Y == 0. 
+ tax ; Bits 8..15 of length. + adc #2 ; Bits 0...7 of length. + bne .do_match ; NZ from previous ADC. + + ; + ; Subroutines for byte & bit handling. + ; + +.get_gamma: lda #1 ; Get a gamma-coded value. +.gamma_loop: asl <apl_bitbuf + bne .skip5 + pha + jsr .load_bit + pla +.skip5: rol + rol <apl_length + 1 + asl <apl_bitbuf + bne .skip6 + pha + jsr .load_bit + pla +.skip6: bcs .gamma_loop + +.finished: rts ; All decompressed! + + ; + ; 1 0 <offset> <length> - gamma-coded LZSS pair. + ; + +.copy_large: jsr .get_gamma ; Bits 8..15 of offset (min 2). + sty <apl_length + 1 ; Clear hi-byte of length. + + cpx #1 ; CC if LWM==0, CS if LWM==1. + sbc #2 ; -3 if LWM==0, -2 if LWM==1. + bcs .normal_pair ; CC if LWM==0 && offset==2. + + jsr .get_gamma ; Get length (A=lo-byte & CC). + ldx <apl_length + 1 + bcc .do_match ; Use previous Offset. + +.normal_pair: sta <apl_offset + 1 ; Save bits 8..15 of offset. + + +APL_GET_SRC + sta <apl_offset + 0 ; Save bits 0...7 of offset. + + jsr .get_gamma ; Get length (A=lo-byte & CC). + ldx <apl_length + 1 + + ldy <apl_offset + 1 ; If offset < 256. + beq .lt256 + cpy #$7D ; If offset >= 32000, length += 2. + bcs .match_plus2 + cpy #$05 ; If offset >= 1280, length += 1. + bcs .match_plus1 + bcc .do_match +.lt256: ldy <apl_offset + 0 ; If offset < 128, length += 2. + bmi .do_match + + sec ; aPLib gamma returns with CC. + +.match_plus2: adc #1 ; CS, so ADC #2. + bcs .match_plus256 + +.match_plus1: adc #0 ; CS, so ADC #1, or CC if fall + bcc .do_match ; through from .match_plus2. + +.match_plus256: inx + +.do_match: eor #$FF ; Negate the lo-byte of length + tay ; and check for zero. + iny + beq .calc_addr + eor #$FF + + inx ; Increment # of pages to copy. + + clc ; Calc destination for partial + adc <apl_dstptr + 0 ; page. + sta <apl_dstptr + 0 + bcs .calc_addr + dec <apl_dstptr + 1 + +.calc_addr: sec ; Calc address of match. 
+ lda <apl_dstptr + 0 + sbc <apl_offset + 0 + sta <apl_winptr + 0 + lda <apl_dstptr + 1 + sbc <apl_offset + 1 + sta <apl_winptr + 1 + +.copy_page: lda (apl_winptr),y + sta (apl_dstptr),y + iny + bne .copy_page + inc <apl_winptr + 1 + inc <apl_dstptr + 1 + dex ; Any full pages left to copy? + bne .copy_page + + inx ; LWM=1. + jmp .next_tag + + ; + ; Subroutines for byte & bit handling. + ; + +.load_bit: +APL_GET_SRC ; Reload an empty bit-buffer + rol ; from the compressed source. + sta <apl_bitbuf + rts diff --git a/tools/apultra/asm/6502/aplib_6502_b.asm b/tools/apultra/asm/6502/aplib_6502_b.asm new file mode 100644 index 0000000..7963e02 --- /dev/null +++ b/tools/apultra/asm/6502/aplib_6502_b.asm @@ -0,0 +1,218 @@ +; ----------------------------------------------------------------------------- +; aplib_6502_b.s - fast aPLib backward decompressor for 6502 - 253 bytes +; written for the ACME assembler +; +; jsr apl_decompress to unpack data backwards. +; create backwards compressed data with apultra -b or oapack -b +; +; in: +; * apl_srcptr (low and high byte) = last byte of compressed data +; * apl_dstptr (low and high byte) = last byte of decompression buffer +; +; out: +; * apl_dstptr (low and high byte) = first byte of decompressed data +; +; Copyright (C) 2020 Emmanuel Marty +; With parts of the code inspired by John Brandwood, Peter Ferrie +; +; This software is provided 'as-is', without any express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. 
If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. +; ----------------------------------------------------------------------------- + + ; Zero page locations + +apl_gamma2_hi = $F6 +apl_bitbuf = $F7 +apl_offset = $F8 +apl_winptr = $FA +apl_srcptr = $FC +apl_dstptr = $FE + + ; Read a byte from the source into A. Trashes X + + !macro APL_GET_SRC { + lda (apl_srcptr),y + ldx <apl_srcptr+0 + bne .src_page_done + dec <apl_srcptr+1 +.src_page_done: dec <apl_srcptr+0 + } + + ; Write a byte to the destinatipn + + !macro APL_PUT_DST { + sta (apl_dstptr),y + lda <apl_dstptr+0 + bne .dst_page_done + dec <apl_dstptr+1 +.dst_page_done: dec <apl_dstptr+0 + } + + ; Read one bit from the source into the carry, trash A + + !macro APL_GET_BIT { + asl <apl_bitbuf + bne .has_bits + jsr apl_load_bits +.has_bits: + } + + ; Read one bit from the source into the carry, preserve A + + !macro APL_GET_BIT_SAVEA { + asl <apl_bitbuf + bne .has_bits + pha + jsr apl_load_bits + pla +.has_bits: + } + + ; Decompress aPLib data backwards + +apl_decompress: lda #$80 ; initialize empty bit queue + sta <apl_bitbuf ; plus bit to roll into carry + ldy #$00 ; clear Y for indirect addr + +.copy_literal: +APL_GET_SRC ; read literal from source +.write_literal: +APL_PUT_DST ; write literal to destination + + ldx #$00 ; clear 'follows match' flag + +.next_token: +APL_GET_BIT ; read 'literal or match' bit + bcc .copy_literal ; if 0: literal + + +APL_GET_BIT ; read '8+n bits or other' bit + bcc .long_match ; if 10x: long 8+n bits match + + ; 11x: other type of match + + +APL_GET_BIT ; read '7+1 match or short literal' bit + bcs .short_match ; if 111: 4 bit offset for 1-byte copy + + +APL_GET_SRC ; read low byte of offset + 
length bit + lsr ; shift offset into place, len bit into carry + beq .done ; check for EOD + sta <apl_offset+0 ; store low byte of offset + sty <apl_offset+1 ; set high byte of offset to 0 + + tya ; set A to 0 + sty <apl_gamma2_hi ; set high byte of len to 0 + adc #$02 ; add 2 or 3 depending on len bit in carry + ; now, low part of len is in A + ; high part of len in apl_gamma2_hi is 0 + ; offset is written to apl_offset + bne .got_len ; go copy matched bytes + +.long_match: jsr .get_gamma2 ; 10: read gamma2 high offset bits in A + sty <apl_gamma2_hi ; zero out high byte of gamma2 + + cpx #$01 ; set carry if following literal + sbc #$02 ; substract 3 if following literal, 2 otherwise + bcs .no_repmatch + + jsr .get_gamma2 ; read repmatch length: low part in A + bcc .got_len ; go copy large match + ; (carry is always clear after .get_gamma2) + +.short_match: lda #$10 ; clear offset, load end bit into place +.read_short_offs: +APL_GET_BIT_SAVEA ; read one bit of offset into carry + rol ; shift into A, shift end bit as well + bcc .read_short_offs ; loop until end bit is shifted out into carry + + beq .write_literal ; zero offset means write a 0 + tay + lda (apl_dstptr),y ; load backreferenced byte + ldy #$00 ; clear Y again + beq .write_literal ; go write byte to destination + +.get_gamma2: lda #$01 ; 1 so it gets shifted to 2 +.gamma2_loop: +APL_GET_BIT_SAVEA ; read data bit + rol ; shift into low byte + rol <apl_gamma2_hi ; shift into high byte + +APL_GET_BIT_SAVEA ; read continuation bit + bcs .gamma2_loop ; loop until a zero continuation bit is read +.done: rts + +.no_repmatch: sta <apl_offset+1 ; write high byte of offset + +APL_GET_SRC ; read low byte of offset from source + sta <apl_offset+0 ; store low byte of offset + + jsr .get_gamma2 ; read match length: low part in A + + ldx <apl_offset+1 ; high offset byte is zero? + beq .offset_1byte ; if so, offset < 256 + + ; offset is >= 256. + + cpx #$7d ; offset >= 32000 (7d00) ? 
+ bcs .offset_incby2 ; if so, increase match len by 2 + cpx #$05 ; offset >= 1280 (0500) ? + bcs .offset_incby1 ; if so, increase match len by 1 + bcc .got_len ; length is fine, go copy + +.offset_1byte: ldx <apl_offset+0 ; offset < 128 ? + bmi .got_len ; if so, increase match len by 2 + sec ; carry must be set below + +.offset_incby2: adc #$01 ; add 1 + set carry (from bcs or sec) + bcs .len_inchi ; go add 256 to len if overflow + + ; carry clear: fall through for no-op + +.offset_incby1: adc #$00 ; add 1 + carry + bcc .got_len +.len_inchi: inc <apl_gamma2_hi ; add 256 to len if low byte overflows + +.got_len: tax ; transfer low byte of len into X + beq .add_offset + inc <apl_gamma2_hi + +.add_offset: clc ; add dest + match offset + lda <apl_dstptr+0 ; low 8 bits + adc <apl_offset+0 + sta <apl_winptr+0 ; store back reference address + lda <apl_dstptr+1 ; high 8 bits + adc <apl_offset+1 + sta <apl_winptr+1 ; store high 8 bits of address + +.copy_match_loop: lda (apl_winptr),y ; read one byte of backreference + +APL_PUT_DST ; write byte to destination + + lda <apl_winptr+0 ; decrement backreference address + bne .backref_page_done + dec <apl_winptr+1 +.backref_page_done: + dec <apl_winptr+0 + + dex ; loop to copy all matched bytes + bne .copy_match_loop + dec <apl_gamma2_hi + bne .copy_match_loop + + ; X is 0 when exiting the loop above + inx ; set 'follows match' flag + jmp .next_token ; go decode next token + +apl_load_bits: lda (apl_srcptr),y ; read 8 bits from source + rol ; shift bit queue, and high bit into carry + sta <apl_bitbuf ; save bit queue + + lda <apl_srcptr+0 + bne .bits_page_done + dec <apl_srcptr+1 +.bits_page_done: dec <apl_srcptr+0 + rts diff --git a/tools/apultra/asm/68000/unaplib_68000.S b/tools/apultra/asm/68000/unaplib_68000.S new file mode 100644 index 0000000..a60ae32 --- /dev/null +++ b/tools/apultra/asm/68000/unaplib_68000.S @@ -0,0 +1,117 @@ +; unaplib_68000.s - aPLib decompressor for 68000 - 154 bytes
+;
+; in: a0 = start of compressed data
+; a1 = start of decompression buffer
+; out: d0 = decompressed size
+;
+; Copyright (C) 2020 Emmanuel Marty
+; With parts of the code inspired by Franck "hitchhikr" Charlet
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+ ; apl_decompress - main decode loop of the 68000 aPLib decompressor.
+ ;
+ ; Register roles, as used by the code below:
+ ;   a0       = read pointer into the compressed data (post-incremented)
+ ;   a1       = write pointer into the output buffer (post-incremented)
+ ;   a2/a3/a4 = the constants 32000 / 1280 / 128 used to adjust match lengths
+ ;   a5       = copy of a1 taken on entry
+ ;   a6       = scratch read pointer for back-references
+ ;   d0       = gamma2 result, match length, or 4-bit short offset
+ ;   d1       = bit queue (low byte only)
+ ;   d2       = 3 after a literal / 1-byte copy, 2 after a match
+ ;   d3       = offset of the most recent match (reused by rep-matches)
+ ;
+ ; a2-a6/d2-d3 are pushed here; d0, d1, a0 and a1 are modified freely.
+ ; Decoding ends when a '110' token carries a zero offset (branch to .done).
+apl_decompress:
+ movem.l a2-a6/d2-d3,-(sp) ; preserve the registers used as working storage
+
+ moveq #-128,d1 ; initialize empty bit queue
+ ; plus bit to roll into carry
+ lea 32000.w,a2 ; load 32000 offset constant
+ lea 1280.w,a3 ; load 1280 offset constant
+ lea 128.w,a4 ; load 128 offset constant
+ move.l a1,a5 ; save destination pointer
+
+.literal: move.b (a0)+,(a1)+ ; copy literal byte
+.after_lit: moveq #3,d2 ; d2 = 3: previous token was a literal
+
+.next_token: bsr.s .get_bit ; read 'literal or match' bit
+ bcc.s .literal ; if 0: literal
+
+ bsr.s .get_bit ; read '8+n bits or other type' bit
+ bcs.s .other_match ; if 11x: other type of match
+
+ bsr.s .get_gamma2 ; 10: read gamma2-coded high offset bits
+ sub.l d2,d0 ; d0 >= 2 and d2 is 2 or 3, so this borrows
+ ; only when d0 == 2 right after a literal
+ bcc.s .no_repmatch ; no borrow: a regular match with a new offset
+
+ bsr.s .get_gamma2 ; read repmatch length
+ bra.s .got_len ; copy using the previous offset still in d3
+
+.no_repmatch: lsl.l #8,d0 ; shift high offset bits into place
+ move.b (a0)+,d0 ; read low offset byte
+ move.l d0,d3 ; copy offset into d3
+
+ bsr.s .get_gamma2 ; read match length
+ cmp.l a2,d3 ; offset >= 32000 ?
+ bge.s .inc_by_2 ; if so, increase match len by 2
+ cmp.l a3,d3 ; offset >= 1280 ?
+ bge.s .inc_by_1 ; if so, increase match len by 1
+ cmp.l a4,d3 ; offset >= 128 ?
+ bge.s .got_len ; if so, the length is used as read
+ ; otherwise (offset < 128) fall through: len += 2
+.inc_by_2: addq.l #1,d0 ; increase match len by 1
+.inc_by_1: addq.l #1,d0 ; increase match len by 1
+
+.got_len: move.l a1,a6 ; calculate backreference address
+ sub.l d3,a6 ; (dest - match offset)
+ subq.l #1,d0 ; dbf will loop until d0 is -1, not 0
+.copy_match: move.b (a6)+,(a1)+ ; copy matched byte
+ dbf d0,.copy_match ; loop for all matched bytes (dbf counts on the low word of d0)
+ moveq #2,d2 ; d2 = 2: previous token was a match
+ bra.s .next_token ; go decode next token
+
+.other_match: bsr.s .get_bit ; read '7+1 match or short literal' bit
+ bcs.s .short_match ; if 111: 4 bit offset for 1-byte copy
+
+ moveq #1,d0 ; 110: prepare match length
+ moveq #0,d3 ; clear high bits of offset
+ move.b (a0)+,d3 ; read low bits of offset + length bit
+ lsr.b #1,d3 ; shift offset into place, len into carry
+ beq.s .done ; check for EOD
+ addx.b d0,d0 ; len = (1 << 1) + carry bit, ie. 2 or 3
+ bra.s .got_len ; go copy match
+
+.short_match: moveq #0,d0 ; clear short offset before reading 4 bits
+ bsr.s .get_dibits ; read a data bit into d0, one into carry
+ addx.b d0,d0 ; shift second bit into d0
+ bsr.s .get_dibits ; read a data bit into d0, one into carry
+ roxl.b #1,d0 ; shift fourth bit into d0. roxl is used for this
+ ; last shift because it sets Z from the result;
+ ; addx only ever clears Z, so the zero test below
+ ; could never succeed after it
+ beq.s .write_zero ; if offset is zero, write a 0
+
+ move.l a1,a6 ; calculate backreference address
+ sub.l d0,a6 ; (dest - short offset)
+ move.b (a6),d0 ; read matched byte
+.write_zero: move.b d0,(a1)+ ; write matched byte or 0
+ bra.s .after_lit ; set d2 = 3 and go decode next token
+
+ ; End of data: return the number of bytes written in d0 and restore the
+ ; saved registers. The buffer start was copied into a5 at entry and a5
+ ; is not written anywhere else; a6 must not be used here because the
+ ; match-copy paths overwrite it with back-reference addresses.
+.done: move.l a1,d0 ; pointer to last decompressed byte + 1
+ sub.l a5,d0 ; minus start of decompression buffer = size
+ movem.l (sp)+,a2-a6/d2-d3 ; restore the registers pushed at entry
+ rts
+
+ ; .get_gamma2 - read a gamma2-coded value into d0.
+ ; Bits arrive in pairs: a data bit that is shifted into d0, followed by a
+ ; continuation bit. d0 starts at 1 and is shifted at least once, so the
+ ; result is always >= 2. Carry is clear on return (the final
+ ; continuation bit was 0). Uses d1/a0 through .get_dibits.
+.get_gamma2: moveq #1,d0 ; init to 1 so it gets shifted to 2 below
+.gamma2_loop: bsr.s .get_dibits ; read data bit, shift into d0
+ ; and read continuation bit
+ bcs.s .gamma2_loop ; loop until a 0 continuation bit is read
+ rts
+
+ ; .get_dibits - read one bit and shift it into d0, then fall through to
+ ;               .get_bit so a second bit is returned in carry/X.
+ ; .get_bit    - return the next bit of the stream in carry (and X).
+ ; d1.b is the bit queue. It always contains a marker 1 bit below the
+ ; unread data bits, so the queue becomes zero exactly when the marker
+ ; itself has just been shifted out; a new byte is then fetched from (a0)+.
+.get_dibits: bsr.s .get_bit ; read bit
+ addx.l d0,d0 ; shift into d0
+ ; fall through
+.get_bit: add.b d1,d1 ; shift bit queue, high bit into carry
+ bne.s .got_bit ; queue not empty, bits remain
+ move.b (a0)+,d1 ; read 8 new bits
+ addx.b d1,d1 ; shift bit queue, high bit into carry
+ ; and shift 1 from carry into bit queue
+.got_bit: rts
diff --git a/tools/apultra/asm/6809/unaplib.s b/tools/apultra/asm/6809/unaplib.s new file mode 100644 index 0000000..641c3f4 --- /dev/null +++ b/tools/apultra/asm/6809/unaplib.s @@ -0,0 +1,125 @@ +; unaplib.s - aPLib decompressor for 6809 - 157 bytes
+;
+; in: x = start of compressed data
+; y = start of decompression buffer
+; out: y = end of decompression buffer + 1
+;
+; Copyright (C) 2020 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+ ; apl_decompress - main decode loop of the 6809 aPLib decompressor.
+ ;
+ ; Register roles, as used by the code below:
+ ;   U = read pointer into the compressed data (copied from X on entry)
+ ;   Y = write pointer into the output buffer
+ ;   A = 'follows literal' flag between tokens: 3 after a literal or
+ ;       1-byte copy, 2 after a match
+ ;   D = gamma2 result / match length
+ ;   X = match offset while lengths are adjusted, then the copy counter
+ ;
+ ; The routine modifies its own code: the flag is written into the
+ ; immediate operand of the SUBD at aplwm, and the match offset into the
+ ; immediate operand of the LDD at aprepof, where it persists for
+ ; rep-matches.
+apl_decompress
+ lda #$80 ; initialize empty bit queue
+ sta <apbitbuf,pcr ; plus bit to roll into carry
+ leau ,x ; U = compressed data pointer
+
+apcplit ldb ,u+ ; copy literal byte
+apwtlit stb ,y+ ; write the byte in B to the output
+
+ lda #$03 ; set 'follows literal' flag
+
+aptoken bsr apgetbit ; read 'literal or match' bit
+ bcc apcplit ; if 0: literal
+
+ bsr apgetbit ; read '8+n bits or other type' bit
+ bcs apother ; if 11x: other type of match
+
+ sta <aplwm+2,pcr ; patch the flag into the low byte of the SUBD operand below
+
+ bsr apgamma2 ; 10: read gamma2-coded high offset bits
+aplwm subd #$0000 ; high offset bits == 2 when follows_literal == 3 ?
+ bcc apnorep ; if not, not a rep-match
+
+ bsr apgamma2 ; read repmatch length
+ bra apgotlen ; go copy large match (offset at aprepof is left unchanged)
+
+apnorep tfr b,a ; transfer high offset bits to A
+ ldb ,u+ ; read low offset byte in B
+ std <aprepof+1,pcr ; store match offset in the LDD operand at aprepof
+ tfr d,x ; transfer offset to X
+
+ bsr apgamma2 ; read match length
+
+ ; NOTE: bge is a signed test. An offset of $8000 or more compares as
+ ; negative, fails all three tests and falls into apincby2, which is the
+ ; same +2 adjustment the >= 32000 case takes.
+ cmpx #$7D00 ; offset >= 32000 ?
+ bge apincby2 ; if so, increase match len by 2
+ cmpx #$0500 ; offset >= 1280 ?
+ bge apincby1 ; if so, increase match len by 1
+ cmpx #$80 ; offset >= 128 ?
+ bge apgotlen ; if so, use the length as read; else fall through and add 2
+apincby2 addd #1
+apincby1 addd #1
+apgotlen pshs u ; save source compressed data pointer
+ tfr d,x ; copy match length to X
+
+aprepof ldd #$aaaa ; load match offset (operand is patched at run time)
+ nega ; reverse sign of offset in D
+ negb
+ sbca #0
+ leau d,y ; put backreference start address in U (dst + negated offset)
+
+apcpymt lda ,u+ ; copy matched byte
+ sta ,y+
+ leax -1,x ; decrement X
+ bne apcpymt ; loop until all matched bytes are copied
+
+ puls u ; restore source compressed data pointer
+
+ lda #$02 ; clear 'follows literal' flag
+ bra aptoken
+
+ ; apdibits - read one bit and rotate it into B, then fall through to
+ ;            apgetbit so a second bit is returned in carry.
+ ; apgetbit - return the next bit of the stream in carry. A is preserved.
+ ; apbitbuf holds the bit queue with a marker 1 bit below the unread data
+ ; bits; when the shift leaves the queue zero, the marker has just been
+ ; shifted out and a new byte is fetched from ,u+.
+apdibits bsr apgetbit ; read bit
+ rolb ; push into B
+apgetbit lsl <apbitbuf,pcr ; shift bit queue, and high bit into carry
+ bne apdone ; queue not empty, bits remain
+ pshs a ; save reg A
+ lda ,u+ ; read 8 new bits
+ rola ; shift bit queue, and high bit into carry
+ sta <apbitbuf,pcr ; save bit queue
+ puls a,pc ; pop reg A and return
+
+apbitbuf fcb $00 ; bit queue
+
+ ; apshort - '111' token: read a 4-bit offset into B and copy one byte.
+ ; A zero offset writes a zero byte (B is already 0); otherwise the byte
+ ; at Y - offset is loaded. Both paths rejoin apwtlit, which stores B and
+ ; sets the 'follows literal' flag.
+apshort clrb
+ bsr apdibits ; read 2 offset bits
+ rolb
+ bsr apdibits ; read 4 offset bits
+ rolb
+ beq apwtlit ; go write zero
+
+ negb ; negate offset in B
+ ldb b,y ; load backreferenced byte from dst+offset
+ bra apwtlit ; go write backreferenced byte
+
+ ; apgamma2 - read a gamma2-coded value into D.
+ ; Bits arrive in pairs: a data bit rotated into D, followed by a
+ ; continuation bit. D starts at 1 and is shifted at least once, so the
+ ; result is always >= 2. Carry is clear on return.
+ ; apdone is also the shared rts used by apgetbit and by the end-of-data
+ ; check in apother.
+apgamma2 ldd #$1 ; init to 1 so it gets shifted to 2 below
+apg2loop bsr apgetbit ; read data bit
+ rolb ; shift into D
+ rola
+ bsr apgetbit ; read continuation bit
+ bcs apg2loop ; loop until a zero continuation bit is read
+apdone rts
+
+ ; apother - '11x' tokens. '111' is handled by apshort. '110' reads one
+ ; byte holding a 7-bit offset and a length bit (length 2 or 3).
+ ; A byte of zero marks end of data: apother is reached by a branch, so
+ ; the rts at apdone returns to the caller of apl_decompress.
+ ; The zero test is made on the raw byte, before the length bit is
+ ; shifted out.
+apother bsr apgetbit ; read '7+1 match or short literal' bit
+ bcs apshort ; if 111: 4 bit offset for 1-byte copy
+
+ ldb ,u+ ; read low bits of offset + length bit in B
+ beq apdone ; check for EOD
+ clra ; clear high bits in A
+ lsrb ; shift offset in place, shift length bit into carry
+ std <aprepof+1,pcr ; store match offset in the LDD operand at aprepof
+ ldb #$01 ; len in B will be 2*1+carry:
+ rolb ; shift length, and carry into B
+ bra apgotlen ; go copy match
diff --git a/tools/apultra/asm/6809/unaplib_6309.s b/tools/apultra/asm/6809/unaplib_6309.s new file mode 100644 index 0000000..9e8ed71 --- /dev/null +++ b/tools/apultra/asm/6809/unaplib_6309.s @@ -0,0 +1,139 @@ +; unaplib_6309.s - aPLib decompressor for H6309 - 131 bytes +; +; in: x = start of compressed data +; y = start of decompression buffer +; out: y = end of decompression buffer + 1 +; +; Copyright (C) 2020 Emmanuel Marty +; +; This software is provided 'as-is', without any express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. + + +; Original M6809 version written by Emmanuel Marty with Hitachi 6309 enhancements +; added by Doug Masten. +; +; Main advantage of H6309 CPU is the "TFM" instruction which can copy one +; byte of memory in 3 clock cycles vs a traditional copy loop that takes +; 20 clock cycles. + +; Options: +; APLIB_VAR +; Define variable to point to a DP memory location for a memory space +; and speed optimization. +; ex. APLIB_VAR equ <memory location> +; +; APLIB_LONG_OFFSET_DISABLE +; Defined variable to disable long offsets >= 32000 for a speed and space +; optimization. Only enable this if you know what you are doing. +; ex. 
APLIB_LONG_OFFSET_DISABLE equ 1 + + +; define options + ifdef APLIB_VAR +apbitbuf equ APLIB_VAR ; bit queue (use DP memory for mem & space optimization) + else +apbitbuf fcb 0 ; bit queue (DEFAULT - use extended memory) + endc + + +apl_decompress + lda #$80 ; initialize empty bit queue + sta apbitbuf ; plus bit to roll into carry + tfr x,u + +apcplit ldb ,u+ ; copy literal byte +apwtlit stb ,y+ + + ldb #3 ; set 'follows literal' flag + +aptoken bsr apgetbit ; read 'literal or match' bit + bcc apcplit ; if 0: literal + + bsr apgetbit ; read '8+n bits or other type' bit + bcs apother ; if 11x: other type of match + + bsr apgamma2 ; 10: read gamma2-coded high offset bits + clra + subr d,w ; high offset bits == 2 when follows_literal == 3 ? + bcc apnorep ; if not, not a rep-match + + bsr apgamma2 ; read repmatch length + bra apgotlen ; go copy large match + +apnorep tfr f,a ; transfer high offset bits to A + ldb ,u+ ; read low offset byte in B + tfr d,x ; save match offset + + bsr apgamma2 ; read match length + + ifndef APLIB_LONG_OFFSET_DISABLE + cmpx #$7D00 ; offset >= 32000 ? + bge apincby2 ; if so, increase match len by 2 + endc + cmpx #$0500 ; offset >= 1280 ? + bge apincby1 ; if so, increase match len by 1 + cmpx #$80 ; offset < 128 ? 
+ bge apgotlen ; if so, increase match len by 2 +apincby2 incw +apincby1 incw + +apgotlen tfr y,d ; transfer dst to D + subr x,d ; put backreference start address in D (dst + offset) + tfm d+,y+ ; copy matched bytes + + ldb #2 ; clear 'follows literal' flag + bra aptoken + +apgamma2 ldw #1 ; init to 1 so it gets shifted to 2 below +loop@ bsr apgetbit ; read data bit + rolw ; shift into W + bsr apgetbit ; read continuation bit + bcs loop@ ; loop until a zero continuation bit is read + rts + +apdibits bsr apgetbit ; read bit + rolb ; push into B +apgetbit lsl apbitbuf ; shift bit queue, and high bit into carry + bne aprts ; queue not empty, bits remain + lda ,u+ ; read 8 new bits + rola ; shift bit queue, and high bit into carry + sta apbitbuf ; save bit queue +aprts rts + +apshort clrb + bsr apdibits ; read 2 offset bits + rolb + bsr apdibits ; read 4 offset bits + rolb + beq apwtlit ; if zero, go write it + + negb ; reverse offset in D + ldb b,y ; load backreferenced byte from dst+offset + bra apwtlit ; go write it + +apother bsr apgetbit ; read '7+1 match or short literal' bit + bcs apshort ; if 111: 4 bit offset for 1-byte copy + + ldb ,u+ ; read low bits of offset + length bit in B + beq aprts ; check for EOD and exit if so + clra ; clear high bits in A + lsrb ; shift offset in place, shift length bit into carry + tfr d,x ; save match offset + ldb #1 ; len in B will be 2*1+carry: + rolb ; shift length, and carry into B + tfr d,w + bra apgotlen ; go copy match diff --git a/tools/apultra/asm/6809/unaplib_6309_b.s b/tools/apultra/asm/6809/unaplib_6309_b.s new file mode 100644 index 0000000..8343edf --- /dev/null +++ b/tools/apultra/asm/6809/unaplib_6309_b.s @@ -0,0 +1,143 @@ +; unaplib_6309_b.s - aPLib backward decompressor for H6309 - 139 bytes +; +; in: x = last byte of compressed data +; y = last byte of decompression buffer +; out: y = first byte of decompressed data +; +; Copyright (C) 2020 Emmanuel Marty +; +; This software is provided 'as-is', without any 
express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. + + +; Original M6809 version written by Emmanuel Marty with Hitachi 6309 enhancements +; added by Doug Masten. +; +; Main advantage of H6309 CPU is the "TFM" instruction which can copy one +; byte of memory in 3 clock cycles vs a traditional copy loop that takes +; 20 clock cycles. + +; Options: +; APLIB_VAR +; Define variable to point to a DP memory location for a memory space +; and speed optimization. +; ex. APLIB_VAR equ <memory location> +; +; APLIB_LONG_OFFSET_DISABLE +; Defined variable to disable long offsets >= 32000 for a speed and space +; optimization. Only enable this if you know what you are doing. +; ex. 
APLIB_LONG_OFFSET_DISABLE equ 1 + + +; define options + ifdef APLIB_VAR +apbitbuf equ APLIB_VAR ; bit queue (use DP memory for mem & space optimization) + else +apbitbuf fcb 0 ; bit queue (DEFAULT - use extended memory) + endc + + +apl_decompress + lda #$80 ; initialize empty bit queue + sta apbitbuf ; plus bit to roll into carry + leau 1,x + leay 1,y + +apcplit ldb ,-u ; copy literal byte +apwtlit stb ,-y + + ldb #3 ; set 'follows literal' flag + +aptoken bsr apgetbit ; read 'literal or match' bit + bcc apcplit ; if 0: literal + + bsr apgetbit ; read '8+n bits or other type' bit + bcs apother ; if 11x: other type of match + + bsr apgamma2 ; 10: read gamma2-coded high offset bits + clra + subr d,w ; high offset bits == 2 when follows_literal == 3 ? + bcc apnorep ; if not, not a rep-match + + bsr apgamma2 ; read repmatch length + bra apgotlen ; go copy large match + +apnorep tfr f,a ; transfer high offset bits to A + ldb ,-u ; read low offset byte in B + tfr d,x ; save match offset + + bsr apgamma2 ; read match length + + ifndef APLIB_LONG_OFFSET_DISABLE + cmpx #$7D00 ; offset >= 32000 ? + bge apincby2 ; if so, increase match len by 2 + endc + cmpx #$0500 ; offset >= 1280 ? + bge apincby1 ; if so, increase match len by 1 + cmpx #$80 ; offset < 128 ? 
+ bge apgotlen ; if so, increase match len by 2 +apincby2 incw +apincby1 incw + +apgotlen tfr y,d ; transfer dst to D + addr x,d ; put backreference start address in D (dst + offset) + decd + leay -1,y + tfm d-,y- ; copy matched bytes + leay 1,y + + ldb #2 ; clear 'follows literal' flag + bra aptoken + +apgamma2 ldw #1 ; init to 1 so it gets shifted to 2 below +loop@ bsr apgetbit ; read data bit + rolw ; shift into W + bsr apgetbit ; read continuation bit + bcs loop@ ; loop until a zero continuation bit is read + rts + +apdibits bsr apgetbit ; read bit + rolb ; push into B +apgetbit lsl apbitbuf ; shift bit queue, and high bit into carry + bne aprts ; queue not empty, bits remain + lda ,-u ; read 8 new bits + rola ; shift bit queue, and high bit into carry + sta apbitbuf ; save bit queue +aprts rts + +apshort clrb + bsr apdibits ; read 2 offset bits + rolb + bsr apdibits ; read 4 offset bits + rolb + beq apwtlit ; if zero, go write it + + decb ; we load below without predecrement, adjust here + ldb b,y ; load backreferenced byte from dst+offset + bra apwtlit ; go write it + +apother bsr apgetbit ; read '7+1 match or short literal' bit + bcs apshort ; if 111: 4 bit offset for 1-byte copy + + ldb ,-u ; read low bits of offset + length bit in B + beq aprts ; check for EOD and exit if so + clra ; clear high bits in A + lsrb ; shift offset in place, shift length bit into carry + tfr d,x ; save match offset + ldb #1 ; len in B will be 2*1+carry: + rolb ; shift length, and carry into B + tfr d,w + bra apgotlen ; go copy match diff --git a/tools/apultra/asm/6809/unaplib_b.s b/tools/apultra/asm/6809/unaplib_b.s new file mode 100644 index 0000000..02f943c --- /dev/null +++ b/tools/apultra/asm/6809/unaplib_b.s @@ -0,0 +1,122 @@ +; unaplib_b.s - aPLib backward decompressor for 6809 - 154 bytes
+;
+; in: x = last byte of compressed data
+; y = last byte of decompression buffer
+; out: y = first byte of decompressed data
+;
+; Copyright (C) 2020 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; apl_decompress: entry point and main token loop of the backward decompressor.
+; On entry X = last byte of compressed data, Y = last byte of the output buffer
+; (see the file header). Register roles while running:
+;   U = compressed-data read pointer, always read with pre-decrement
+;   Y = output write pointer, always written with pre-decrement
+;   A = 'follows literal' flag: 3 after a literal, 2 after a match
+;   D = gamma2 result / match length, X = match offset, then copy counter
+; NOTE: self-modifying - the operands of the instructions at aplwm and aprepof
+; are patched at run time, so this code has to sit in writable memory.
+apl_decompress
+ lda #$80 ; start with an empty bit queue: marker bit only,
+ sta <apbitbuf,pcr ; so the first apgetbit call loads a fresh byte
+ leau 1,x ; U = X+1 / Y = Y+1: every access below pre-decrements,
+ leay 1,y ; so the first read/write hits the last byte
+
+apcplit ldb ,-u ; copy literal byte
+apwtlit stb ,-y ; (also entered from apshort with the byte to write in B)
+
+ lda #$03 ; set 'follows literal' flag
+
+aptoken bsr apgetbit ; read 'literal or match' bit
+ bcc apcplit ; if 0: literal
+
+ bsr apgetbit ; read '8+n bits or other type' bit
+ bcs apother ; if 11x: other type of match
+
+ sta <aplwm+2,pcr ; patch 'follows literal' flag into the subd operand below
+
+ bsr apgamma2 ; 10: read gamma2-coded high offset bits
+aplwm subd #$0000 ; (operand patched above) borrows only if gamma2 == 2 and flag == 3
+ bcc apnorep ; no borrow: not a rep-match
+
+ bsr apgamma2 ; read repmatch length
+ bra apgotlen ; go copy large match (offset already in aprepof is reused)
+
+apnorep tfr b,a ; transfer high offset bits to A
+ ldb ,-u ; read low offset byte in B
+ std <aprepof+2,pcr ; patch match offset into the leau operand at aprepof
+ tfr d,x ; transfer offset to X
+
+ bsr apgamma2 ; read match length
+
+; length adjustment depends on the offset: +2 for >= 32000, +1 for >= 1280,
+; +0 for 128..1279, +2 for < 128
+; NOTE(review): bge is a signed test; an offset >= $8000 fails all three
+; tests and falls into apincby2, which matches the >= 32000 case - confirm
+; this is intended.
+ cmpx #$7D00 ; offset >= 32000 ?
+ bge apincby2 ; if so, increase match len by 2
+ cmpx #$0500 ; offset >= 1280 ?
+ bge apincby1 ; if so, increase match len by 1
+ cmpx #$80 ; offset >= 128 ?
+ bge apgotlen ; if so, length is final; otherwise (offset < 128) add 2
+apincby2 addd #1
+apincby1 addd #1
+apgotlen pshs u ; save source compressed data pointer
+ tfr d,x ; copy match length to X
+
+aprepof leau $aaaa,y ; (operand patched) put backreference start address in U (dst+offset)
+
+apcpymt lda ,-u ; copy matched byte
+ sta ,-y
+ leax -1,x ; decrement X
+ bne apcpymt ; loop until all matched bytes are copied
+
+ puls u ; restore source compressed data pointer
+
+ lda #$02 ; clear 'follows literal' flag
+ bra aptoken
+
+; apdibits: read one bit and rotate it into B, then fall through to apgetbit
+; to read a second bit, which is returned in carry (the caller rotates it in).
+; apgetbit: return the next bit of the compressed stream in carry.
+; apbitbuf holds the unread bits followed by a single marker bit; when the
+; shift leaves the byte at zero, the bit just shifted out was the marker,
+; so a new byte is fetched and the marker (still in carry) is rotated back in.
+; A is preserved on the refill path; U is decremented by one on a refill.
+apdibits bsr apgetbit ; read bit
+ rolb ; push into B
+apgetbit lsl <apbitbuf,pcr ; shift bit queue, and high bit into carry
+ bne apdone ; queue not empty, bits remain (apdone is a plain rts)
+ pshs a ; push reg A
+ lda ,-u ; read 8 new bits
+ rola ; first data bit into carry, marker bit into bit 0
+ sta <apbitbuf,pcr ; save bit queue
+ puls a,pc ; pop reg A and return
+
+apbitbuf fcb $00 ; bit queue (set to $80 by apl_decompress)
+
+; apshort: '111' token - 4-bit offset, single byte copy.
+; Reads four bits into B; a zero value means "write a zero byte", otherwise
+; the byte at that distance from the write position is copied.
+; Continues at apwtlit with the byte to write in B.
+apshort clrb
+ bsr apdibits ; read 2 offset bits
+ rolb
+ bsr apdibits ; read 2 more offset bits (4 in total)
+ rolb
+ beq apwtlit ; offset 0: B is zero, go write a zero
+
+ decb ; we load below without predecrement, adjust here
+ ldb b,y ; load backreferenced byte from dst+offset
+ bra apwtlit ; go write backreferenced byte
+
+; apgamma2: read a gamma2-coded value into D.
+; Bits arrive as (data bit, continuation bit) pairs; D starts at 1, so the
+; result is always >= 2. Clobbers A, which is why the caller saves the
+; 'follows literal' flag before calling.
+apgamma2 ldd #$1 ; init to 1 so it gets shifted to 2 below
+apg2loop bsr apgetbit ; read data bit
+ rolb ; shift into D
+ rola
+ bsr apgetbit ; read continuation bit
+ bcs apg2loop ; loop until a zero continuation bit is read
+apdone rts ; shared return, also used by apgetbit and the EOD exit
+
+; apother: '11x' tokens. '111' is handled by apshort; '110' is a match with a
+; 7-bit offset and a 1-bit length (2 or 3 bytes) packed into one byte.
+; A zero byte here marks the end of data and returns to the caller of
+; apl_decompress through the rts at apdone.
+apother bsr apgetbit ; read '7+1 match or short literal' bit
+ bcs apshort ; if 111: 4 bit offset for 1-byte copy
+
+ ldb ,-u ; read low bits of offset + length bit in B
+ beq apdone ; check for EOD
+ clra ; clear high bits in A
+ lsrb ; shift offset in place, shift length bit into carry
+ std <aprepof+2,pcr ; patch match offset into the leau operand at aprepof
+ ldb #$01 ; len in B will be 2*1+carry:
+ rolb ; shift length, and carry into B (std/ldb leave carry intact)
+ bra apgotlen ; go copy match
diff --git a/tools/apultra/asm/8088/aplib_8088_fast.S b/tools/apultra/asm/8088/aplib_8088_fast.S new file mode 100644 index 0000000..c535234 --- /dev/null +++ b/tools/apultra/asm/8088/aplib_8088_fast.S @@ -0,0 +1,178 @@ +; aplib_8088_fast.S - speed-optimized aPLib decompressor for 8088 - 188 bytes
+;
+; Copyright (C) 2019 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+ segment .text
+ bits 16
+
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * ds:si: compressed aPLib data
+; * es:di: output buffer
+; output:
+; * ax: decompressed size
+; ---------------------------------------------------------------------------
+
+; apl_get_bit: inline bit reader, returns the next stream bit in CF.
+; al holds the unread bits followed by one marker bit; when the shift
+; leaves al at zero the bit shifted out was the marker, so a new byte is
+; loaded from ds:si and the marker (still in CF) is shifted back in.
+; Clobbers: al, flags; advances si by one on a refill.
+%macro apl_get_bit 0 ; read bit into carry
+ add al,al ; shift bit queue, and high bit into carry
+ jnz %%gotbit ; queue not empty, bits remain
+ lodsb ; read 8 new bits
+ adc al,al ; shift bit queue, and high bit into carry
+%%gotbit:
+%endmacro
+
+; apl_decompress (speed-optimized): entry point, literal handling, the
+; '10' (8+n bits) match decoder and the shared match copy at .got_len.
+; Inputs/outputs are listed in the header above; bx, cx, dx, si, di and bp
+; are modified and not restored.
+apl_decompress:
+ push di ; remember decompression offset
+ cld ; make string operations go forward
+
+ ; === register map ===
+ ; al: bit queue
+ ; ah: unused, but value is trashed
+ ; bx: follows_literal
+ ; cx: scratch register for reading gamma2 codes and storing copy length
+ ; dx: match offset (and rep-offset)
+ ; si: input (compressed data) pointer
+ ; di: output (decompressed data) pointer
+ ; bp: holds si while a match is copied, trashed
+
+ mov al,080H ; clear bit queue(al) and set high bit to move into carry
+ xor dx,dx ; invalidate rep offset
+
+.literal:
+ movsb ; read and write literal byte
+.next_command_after_literal:
+ mov bx,03H ; set follows_literal(bx) to 3
+
+.next_command:
+ apl_get_bit ; read 'literal or match' bit
+ jnc .literal ; if 0: literal
+
+ ; 1x: match
+
+ apl_get_bit ; read '8+n bits or other type' bit
+ jc .other ; 11x: other type of match
+
+ ; 10: 8+n bits match
+ call .get_gamma2 ; read gamma2-coded high offset bits
+ sub cx,bx ; high offset bits == 2 when follows_literal == 3 ?
+ ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+ ; is == 2 never borrows; the only borrow is gamma2 == 2 with follows_literal == 3)
+ jae .not_repmatch ; no borrow: not a rep-match
+
+ call .get_gamma2 ; read match length
+ jmp short .got_len ; go copy, reusing the previous offset in dx
+
+.not_repmatch:
+ mov dh,cl ; transfer high offset bits to dh
+ mov dl,[si] ; read low offset byte in dl
+ inc si
+
+ call .get_gamma2 ; read match length
+ ; length adjustment: +2 for offset >= 32000, +1 for >= 1280,
+ ; +0 for 128..1279, +2 for < 128
+ cmp dh,07DH ; offset >= 32000 ?
+ jae .increase_len_by2 ; if so, increase match len by 2
+ cmp dh,05H ; offset >= 1280 ?
+ jae .increase_len_by1 ; if so, increase match len by 1
+ cmp dx,0080H ; offset >= 128 ?
+ jae .got_len ; if so, length is final; offsets < 128 get +2 (otherwise it would be a 7+1 copy)
+.increase_len_by2:
+ inc cx ; increase length
+.increase_len_by1:
+ inc cx ; increase length
+
+ ; copy cx bytes from match offset dx
+
+.got_len:
+ push ds ; save ds:si (current pointer to compressed data)
+ mov bp,si
+
+ push es
+ pop ds
+ mov si,di ; point to destination in es:di - offset in dx
+ sub si,dx
+ rep movsb ; copy matched bytes
+
+ mov si,bp ; restore ds:si
+ pop ds
+
+ mov bl,02H ; set follows_literal to 2 (bx is unmodified by match commands)
+ jmp short .next_command
+
+ ; read gamma2-coded value into cx
+
+ ; .get_gamma2: returns the decoded value in cx (always >= 2).
+ ; The stream supplies (data bit, continuation bit) pairs; decoding stops
+ ; at the first zero continuation bit. Clobbers al and flags, may advance si.
+.get_gamma2:
+ xor cx,cx ; initialize to 1 so that value will start at 2
+ inc cx ; when shifted left in the adc below
+
+.gamma2_loop:
+ apl_get_bit ; read data bit
+ adc cx,cx ; shift into cx
+ apl_get_bit ; read continuation bit
+ jc .gamma2_loop ; loop until a zero continuation bit is read
+
+ ret
+
+ ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+
+ ; .other: '11x' tokens. '111' goes to .short_literal; '110' is decoded
+ ; here into dx = 7-bit offset and cx = length (2 or 3), then copied at
+ ; .got_len. A zero offset marks the end of the data.
+.other:
+ xor cx,cx ; cx = 0 for both paths below
+ apl_get_bit ; read '7+1 match or short literal' bit
+ jc .short_literal ; 111: 4 bit offset for 1-byte copy
+
+ ; 110: 7 bits offset + 1 bit length
+
+ mov dl,[si] ; read offset + length in dl
+ inc si
+
+ inc cx ; prepare cx for length below
+ shr dl,1 ; shift len bit into carry, and offset in place
+ je .done ; if zero offset: EOD
+ adc cx,cx ; len in cx: 1*2 + carry bit = 2 or 3 (carry still from shr)
+
+ xor dh,dh ; clear high bits of offset
+ jmp short .got_len
+
+ ; 4 bits offset / 1 byte copy
+
+ ; .short_literal: '111' token. Reads a 4-bit offset into cl (cx is zero
+ ; on entry from .other). Offset 0 writes a zero byte; 1-15 copies the byte
+ ; that many positions behind di. The bit queue is parked in cx meanwhile.
+.short_literal:
+ apl_get_bit ; read 4 offset bits
+ adc cl,cl
+ apl_get_bit
+ adc cl,cl
+ apl_get_bit
+ adc cl,cl
+ apl_get_bit
+ adc cl,cl ; ZF now tells whether the offset is zero
+ xchg ax,cx ; preserve bit queue in cx, put offset in ax (flags untouched)
+ jz .write_zero ; if offset is 0, write a zero byte (al is 0)
+
+ ; short offset 1-15
+ mov bx,di ; point to destination in es:di - offset in ax
+ sub bx,ax ; we trash bx, it will be reset to 3 when we loop
+ mov al,[es:bx] ; read byte from short offset
+.write_zero:
+ stosb ; copy matched byte
+ mov ax,cx ; restore bit queue in al
+ jmp .next_command_after_literal
+
+ ; .done: end of data. Pops the start offset pushed on entry and returns
+ ; ax = number of bytes written; di is left holding the start offset.
+.done:
+ pop ax ; retrieve the original decompression offset
+ xchg di,ax ; ax = end of output, di = start of output
+ sub ax,di ; decompressed size = end - start
+ ret
diff --git a/tools/apultra/asm/8088/aplib_8088_small.S b/tools/apultra/asm/8088/aplib_8088_small.S new file mode 100644 index 0000000..542991e --- /dev/null +++ b/tools/apultra/asm/8088/aplib_8088_small.S @@ -0,0 +1,177 @@ +; aplib_8088_small.S - size-optimized aPLib decompressor for 8088 - 145 bytes
+;
+; Copyright (C) 2019 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+ segment .text
+ bits 16
+
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * ds:si: compressed aPLib data
+; * es:di: output buffer
+; output:
+; * ax: decompressed size
+; ---------------------------------------------------------------------------
+
+; apl_decompress (size-optimized): entry point, literal handling, the
+; '10' (8+n bits) match decoder and the shared match copy at .got_len.
+; Bits are read through "call bp", with bp preloaded with the address of
+; .get_bit. Inputs/outputs are listed in the header above; bx, cx, dx, si,
+; di and bp are modified and not restored.
+apl_decompress:
+ push di ; remember decompression offset
+ cld ; make string operations go forward
+
+ ; === register map ===
+ ; al: bit queue
+ ; ah: unused, but value is trashed
+ ; bx: follows_literal
+ ; cx: scratch register for reading gamma2 codes and storing copy length
+ ; dx: match offset (and rep-offset)
+ ; si: input (compressed data) pointer
+ ; di: output (decompressed data) pointer
+ ; bp: offset of .get_bit
+
+ mov al,080H ; clear bit queue(al) and set high bit to move into carry
+ xor dx,dx ; invalidate rep offset
+ mov bp,.get_bit ; load offset of .get_bit, to be used with call bp
+
+.literal:
+ movsb ; read and write literal byte
+.next_command_after_literal:
+ mov bx,03H ; set follows_literal(bx) to 3
+
+.next_command:
+ call bp ; read 'literal or match' bit
+ jnc .literal ; if 0: literal
+
+ ; 1x: match
+
+ call bp ; read '8+n bits or other type' bit
+ jc .other ; 11x: other type of match
+
+ ; 10: 8+n bits match
+ call .get_gamma2 ; read gamma2-coded high offset bits
+ sub cx,bx ; high offset bits == 2 when follows_literal == 3 ?
+ ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+ ; is == 2 will never result in a negative value)
+ jae .not_repmatch ; no borrow: not a rep-match
+
+ call .get_gamma2 ; read match length
+ jmp short .got_len ; go copy, reusing the previous offset in dx
+
+.not_repmatch:
+ mov dh,cl ; transfer high offset bits to dh
+ mov dl,[si] ; read low offset byte in dl
+ inc si
+
+ call .get_gamma2 ; read match length
+ ; length adjustment: +2 for offset >= 32000, +1 for >= 1280,
+ ; +0 for 128..1279, +2 for < 128
+ cmp dh,07DH ; offset >= 32000 ?
+ jae .increase_len_by2 ; if so, increase match len by 2
+ cmp dh,05H ; offset >= 1280 ?
+ jae .increase_len_by1 ; if so, increase match len by 1
+ cmp dx,0080H ; offset >= 128 ?
+ jae .got_len ; if so, length is final; offsets < 128 get +2 (otherwise it would be a 7+1 copy)
+.increase_len_by2:
+ inc cx ; increase length
+.increase_len_by1:
+ inc cx ; increase length
+
+ ; copy cx bytes from match offset dx
+
+.got_len:
+ push ds ; save ds:si (current pointer to compressed data)
+ push si
+
+ push es
+ pop ds
+ mov si,di ; point to destination in es:di - offset in dx
+ sub si,dx
+ rep movsb ; copy matched bytes
+
+ pop si ; restore ds:si
+ pop ds
+
+ mov bl,02H ; set follows_literal to 2 (bx is unmodified by match commands)
+ jmp short .next_command
+
+ ; read gamma2-coded value into cx
+
+ ; .get_gamma2: returns the decoded value in cx (always >= 2).
+ ; Each .get_dibits call shifts one data bit into cx and returns the
+ ; following continuation bit in carry.
+.get_gamma2:
+ xor cx,cx ; initialize to 1 so that value will start at 2
+ inc cx ; when shifted left in the adc below
+
+.gamma2_loop:
+ call .get_dibits ; read data bit, shift into cx, read continuation bit
+ jc .gamma2_loop ; loop until a zero continuation bit is read
+
+ ret
+
+ ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+
+ ; .other: '11x' tokens. '111' goes to .short_literal; '110' is decoded
+ ; here into dx = 7-bit offset and cx = length (2 or 3), then copied at
+ ; .got_len. A zero offset marks the end of the data.
+.other:
+ xor cx,cx ; cx = 0 for both paths below
+ call bp ; read '7+1 match or short literal' bit
+ jc .short_literal ; 111: 4 bit offset for 1-byte copy
+
+ ; 110: 7 bits offset + 1 bit length
+
+ mov dl,[si] ; read offset + length in dl
+ inc si
+
+ inc cx ; prepare cx for length below
+ shr dl,1 ; shift len bit into carry, and offset in place
+ je .done ; if zero offset: EOD
+ adc cx,cx ; len in cx: 1*2 + carry bit = 2 or 3 (carry still from shr)
+
+ xor dh,dh ; clear high bits of offset
+ jmp short .got_len
+
+ ; 4 bits offset / 1 byte copy
+
+ ; .short_literal: '111' token. Reads a 4-bit offset into cx (zero on
+ ; entry from .other). Offset 0 writes a zero byte; 1-15 copies the byte
+ ; that many positions behind di. The bit queue is parked in cx meanwhile.
+.short_literal:
+ call .get_dibits ; read 2 offset bits (second one returned in carry)
+ adc cx,cx
+ call .get_dibits ; read 2 more offset bits
+ adc cx,cx ; ZF now tells whether the offset is zero
+ xchg ax,cx ; preserve bit queue in cx, put offset in ax (flags untouched)
+ jz .write_zero ; if offset is 0, write a zero byte (al is 0)
+
+ ; short offset 1-15
+ mov bx,di ; point to destination in es:di - offset in ax
+ sub bx,ax ; we trash bx, it will be reset to 3 when we loop
+ mov al,[es:bx] ; read byte from short offset
+.write_zero:
+ stosb ; copy matched byte
+ xchg ax,cx ; restore bit queue in al
+ jmp .next_command_after_literal
+
+ ; .done: end of data. Pops the start offset pushed on entry and returns
+ ; ax = number of bytes written; di is left holding the start offset.
+.done:
+ pop ax ; retrieve the original decompression offset
+ xchg di,ax ; ax = end of output, di = start of output
+ sub ax,di ; decompressed size = end - start
+ ret
+
+ ; .get_dibits: shift one stream bit into cx, then fall through to
+ ; .get_bit so that a second bit is returned in carry.
+ ; .get_bit: return the next stream bit in carry. al holds the unread
+ ; bits followed by one marker bit; when the shift leaves al at zero the
+ ; marker was shifted out, so a new byte is loaded from ds:si and the
+ ; marker (still in carry) is shifted back in.
+.get_dibits:
+ call bp ; read data bit
+ adc cx,cx ; shift into cx
+
+.get_bit:
+ add al,al ; shift bit queue, and high bit into carry
+ jnz .got_bit ; queue not empty, bits remain
+ lodsb ; read 8 new bits
+ adc al,al ; shift bit queue, and high bit into carry
+.got_bit:
+ ret
diff --git a/tools/apultra/asm/ARM7TDMI/aplib_arm.s b/tools/apultra/asm/ARM7TDMI/aplib_arm.s new file mode 100644 index 0000000..b6d0cef --- /dev/null +++ b/tools/apultra/asm/ARM7TDMI/aplib_arm.s @@ -0,0 +1,150 @@
+@APlib ARM7 decompressor by Dan Weiss, based on the original C version
+@Takes in raw apacked data, NOT data created by the 'safe' compressor.
+@Code is from the PocketNES NES Emulator for GBA
+
+@Code is formatted for GNU Assembler
+
+ src .req r0
+ dest .req r1
+ byte .req r2
+ mask .req r3
+ gamma .req r4
+ lwm .req r6
+ recentoff .req r7
+ temp .req r8
+
+.global depack
+.type depack STT_FUNC
+
+@r0 = src
+@r1 = dest
+@r2 = byte
+@r3 = rotating bit mask
+@r4 = increasing gamma
+@r6 = lwm
+@r7 = recentoff
+@r8 = lr copy/scratch
+
+@GETBIT: test the next stream bit; Z flag clear (ne) means the bit is 1.
+@The mask rotates right one position per call; when the rotate sets carry
+@a new byte is loaded from src.
+ .macro GETBIT @3 instructions
+ movs mask,mask,ror #1
+ ldrcsb byte,[src],#1
+ tst byte,mask
+ .endm
+
+@GETBITGAMMA: shift gamma left by one and add the next stream bit.
+ .macro GETBITGAMMA @5 instructions
+ mov gamma,gamma,lsl #1
+ GETBIT
+ addne gamma,gamma,#1
+ .endm
+
+@This initialization code can go into slow memory
+
+@depack: r0 = src, r1 = dest. Saves r4-r10 and lr, copies the first byte
+@as a literal, then enters the main loop.
+depack:
+ stmfd sp!,{r4-r10,lr}
+ ldrb temp,[src],#1
+ strb temp,[dest],#1
+ ldr mask,=0x01010101
+ b aploop_nolwm
+
+@This inner-loop code should be placed into fast memory
+
+ @depack enters here
+aploop_nolwm:
+ mov lwm,#0
+aploop:
+ GETBIT
+ bne apbranch1
+ @bit 0: copy one literal byte
+ ldrb temp,[src],#1
+ strb temp,[dest],#1
+ b aploop_nolwm
+apbranch1:
+ GETBIT
+ beq apbranch2
+ GETBIT
+ beq apbranch3
+ @get an offset
+ @(4 bits; zero means "write a zero byte", otherwise copy one byte)
+ mov gamma,#0
+ GETBIT
+ addne gamma,gamma,#1
+ GETBITGAMMA
+ GETBITGAMMA
+ GETBITGAMMA
+ cmp gamma,#0
+ ldrneb gamma,[dest,-gamma]
+ strb gamma,[dest],#1
+ b aploop_nolwm
+apbranch3:
+ @use 7 bit offset, length = 2 or 3
+ @if a zero is encountered here, it's EOF
+ ldrb gamma,[src],#1
+ movs recentoff,gamma,lsr #1
+ beq done
+ @carry holds the length bit: copy a third byte only when it is set
+ ldrcsb temp,[dest,-recentoff]
+ strcsb temp,[dest],#1
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ mov lwm,#1
+ b aploop
+apbranch2:
+ @use a gamma code * 256 for offset, another gamma code for length
+
+ bl ap_getgamma
+ sub gamma,gamma,#2
+ cmp lwm,#0
+ bne ap_is_lwm
+ mov lwm,#1
+ cmp gamma,#0
+ bne ap_not_zero_gamma
+
+ @if gamma code is 2, use old recent offset, and a new gamma code for length
+ bl ap_getgamma
+copyloop1:
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ subs gamma,gamma,#1
+ bne copyloop1
+ b aploop
+
+ap_not_zero_gamma:
+ sub gamma,gamma,#1
+ap_is_lwm:
+ ldrb temp,[src],#1
+ add recentoff,temp,gamma,lsl #8
+ bl ap_getgamma
+ @gamma=length
+ @length adjustment: +1 for offset >= 32000, +1 for offset >= 1280, +2 for offset < 128
+ cmp recentoff,#32000
+ addge gamma,gamma,#1
+ cmp recentoff,#1280
+ addge gamma,gamma,#1
+ cmp recentoff,#128
+ addlt gamma,gamma,#2
+copyloop2:
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ subs gamma,gamma,#1
+ bne copyloop2
+ b aploop
+
+@ap_getgamma: read a gamma-coded value into gamma (starts at 1, so the
+@result is always >= 2); pairs of (data bit, continuation bit) are read
+@until a zero continuation bit. Called with bl, so lr is overwritten.
+ap_getgamma:
+ mov gamma,#1
+ap_getgammaloop:
+ GETBITGAMMA
+ GETBIT
+ bne ap_getgammaloop
+ bx lr
+
+@done: restore the registers saved on entry and return to the caller
+done:
+ ldmfd sp!,{r4-r10,lr}
+ bx lr
+
+.unreq src
+.unreq dest
+.unreq byte
+.unreq mask
+.unreq gamma
+.unreq lwm
+.unreq recentoff
+.unreq temp
+
diff --git a/tools/apultra/asm/Z80/unaplib_fast.asm b/tools/apultra/asm/Z80/unaplib_fast.asm new file mode 100644 index 0000000..c21eb5d --- /dev/null +++ b/tools/apultra/asm/Z80/unaplib_fast.asm @@ -0,0 +1,339 @@ +;
+; Speed-optimized ApLib decompressor by spke & uniabis (ver.06 01-05/06/2020, 235 bytes)
+;
+; The original Z80 decompressors for ApLib were written by Dan Weiss (Dwedit),
+; then tweaked by Francisco Javier Pena Pareja (utopian),
+; and optimized by Jaime Tejedor Gomez (Metalbrain) and Antonio Villena.
+;
+; This is a new "implicit state" decompressor heavily optimized for speed by spke.
+; (It is 12 bytes shorter and 18% faster than the previously fastest
+; 247b decompressor by Metalbrain and Antonio Villena.)
+;
+; ver.00 by spke (21/08/2018-01/09/2018, 244 bytes, an edit of the existing 247b decompressor);
+; ver.01 by spke (12-13/11/2018, 234(-10) bytes, +3% speed using the state machine for LWM);
+; ver.02 by spke (06/08/2019, +1% speed);
+; ver.03 by spke (27/08/2019, 236(+2) bytes, +1% speed using partly expanded LDIR);
+; ver.04 by spke (spring 2020, added full revision history and support for long offsets)
+; ver.05 by spke (17-31/05/2020, 230(-6) bytes, +3% speed, added support for backward compression) <- BROKEN, DO NOT USE
+; ver.06 by uniabis & spke (01-07/06/2020, 235(+5) bytes, +1% speed, added support for HD64180)
+;
+; The data must be compressed using any compressor for ApLib capable of generating raw data.
+; At present, two best available compressors are:
+;
+; "APC" by Sven-Ake Dahl: https://github.com/svendahl/cap or
+; "apultra" by Emmanuel Marty: https://github.com/emmanuel-marty/apultra
+;
+; The compression can be done as follows:
+;
+; apc.exe e <sourcefile> <outfile>
+; or
+; apultra.exe <sourcefile> <outfile>
+;
+; A decent compressor was written by r57shell (although it is worse than compressors above):
+; http://gendev.spritesmind.net/forum/viewtopic.php?p=32548#p32548
+; The use of the official ApLib compressor by Joergen Ibsen is not recommended.
+;
+; The decompression is done in the standard way:
+;
+; ld hl,FirstByteOfCompressedData
+; ld de,FirstByteOfMemoryForDecompressedData
+; call Decompress
+;
+; Backward decompression is also supported; you can compress files backward using:
+;
+; apultra.exe -b <sourcefile> <outfile>
+;
+; uncomment option "DEFINE BackwardDecompression" and decompress the resulting files using:
+;
+; ld hl,LastByteOfCompressedData
+; ld de,LastByteOfMemoryForDecompressedData
+; call Decompress
+;
+; The decompressor modifies AF, AF', BC, DE, HL, IX.
+;
+; Of course, ApLib compression algorithms are (c) 1998-2014 Joergen Ibsen,
+; see http://www.ibsensoftware.com/ for more information
+;
+; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; DEFINE SupportLongOffsets ; +4 bytes for long offset support. slows decompression down by 1%, but may be needed to decompress files >=32K
+; DEFINE BackwardDecompression ; decompress data compressed backwards, -10 bytes, speeds decompression up by 3%
+; DEFINE HD64180 ; -2 bytes for HD64180/Z180 support, slows decompression down by 1%
+
+; Direction-dependent helper macros. Forward decompression (the default)
+; walks memory upwards; with BackwardDecompression defined the same macros
+; walk downwards.
+;   NEXT_HL  - step HL to the next byte
+;   COPY_1   - copy one byte from (HL) to (DE) and step both (also decrements BC)
+;   COPY_BC  - copy BC bytes from (HL) to (DE)
+ IFNDEF BackwardDecompression
+
+ MACRO NEXT_HL
+ inc hl
+ ENDM
+
+ MACRO COPY_1
+ ldi
+ ENDM
+
+ MACRO COPY_BC
+ ldir
+ ENDM
+
+ ELSE
+
+ MACRO NEXT_HL
+ dec hl
+ ENDM
+
+ MACRO COPY_1
+ ldd
+ ENDM
+
+ MACRO COPY_BC
+ lddr
+ ENDM
+
+ ENDIF
+
+; RELOAD_A - refill the bit buffer in A from (HL) and step HL.
+; RLA moves the first data bit into carry and rotates the incoming carry
+; into bit 0, where it serves as the end-of-byte marker; every use in this
+; file enters with carry set.
+ MACRO RELOAD_A
+ ld a,(hl) : NEXT_HL : rla
+ ENDM
+
+; Entry point: HL = compressed data, DE = destination (see the usage notes above).
+; Execution continues straight into LWM0.ReloadByteC0; carry is set here so
+; that the first RELOAD_A installs the end-of-byte marker bit.
+@Decompress: COPY_1 : scf
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+; LWM0: token dispatcher used while the previous token was NOT a match.
+; A is the bit buffer: "add a" shifts the next bit into carry. The buffer
+; carries a marker bit behind the data bits, so C together with Z means the
+; marker was shifted out and the buffer must be reloaded, while NC is always
+; a genuine 0 bit.
+LWM0: ;LWM = 0 (LWM stands for "Last Was Match"; a flag that we did not have a match)
+
+.ReloadByteC0 RELOAD_A : jr c,.Check2ndBit
+
+;
+; case "0"+BYTE: copy a single literal
+
+.CASE0: COPY_1 ; first byte is always copied as literal
+
+;
+; main decompressor loop
+
+.MainLoop: add a : jr nc,.CASE0 : jr z,.ReloadByteC0 ; "0"+BYTE = copy literal
+.Check2ndBit add a : jr nc,.CASE10 : jr z,.ReloadByteC1 ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+.Check3rdBit add a : call z,ReloadByte : jp c,LWM1.CASE111 ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+; branch "110"+[oooooool]: copy two or three bytes (bit "l") with the offset -1..-127 (bits "ooooooo"), or stop
+
+; Reached with carry clear, so "rr c" acts as a plain right shift: the
+; length bit ends up in carry and Z is set when the 7-bit offset is zero.
+; The offset is kept in IX for a later offset re-use (LWM1.KickInLWM).
+.CASE110: ; "use 7 bit offset, length = 2 or 3"
+ ; "if a zero is found here, it's EOF"
+ ld c,(hl) : rr c : ret z ; process EOF
+ NEXT_HL
+ ld b,0
+
+ IFNDEF HD64180
+ ld ixl,c : ld ixh,b ; save offset for future LWMs
+ ELSE
+ push bc : pop ix
+ ENDIF
+
+ push hl ; save src
+ ld h,d : ld l,e ; HL = dest
+ jr c,.LengthIs3 ; carry still holds the length bit from "rr c"
+
+.LengthIs2
+ IFNDEF BackwardDecompression
+ sbc hl,bc ; carry is clear on this path, so this is HL = dest-offset
+ ELSE
+ add hl,bc
+ ENDIF
+ COPY_1 : COPY_1
+ jr .PreMainLoop
+
+.LengthIs3
+ IFNDEF BackwardDecompression
+ or a : sbc hl,bc ; "or a" clears the carry first, then HL = dest-offset
+ ELSE
+ add hl,bc
+ ENDIF
+ COPY_1 : COPY_1 : COPY_1
+ jr .PreMainLoop
+
+.ReloadByteC1 RELOAD_A : jr c,.Check3rdBit ; reload for the 2nd token bit; a 0 bit falls into .CASE10
+
+;
+; branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+.CASE10: ; "use a gamma code * 256 for offset, another gamma code for length"
+ call GetGammaCoded
+
+ ; the original decompressor contains
+ ;
+ ; if ((LWM == 0) && (offs == 2)) { ... }
+ ; else {
+ ; if (LWM == 0) { offs -= 3; }
+ ; else { offs -= 2; }
+ ; }
+ ;
+ ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+ ; i.e. a value >= 2: the first "dec c" can never reach zero, so Z after the
+ ; second "dec c" means the value was exactly 2 (re-use the previous offset)
+ dec c : dec c : jr z,LWM1.KickInLWM
+
+.AfterLWM dec c : ld b,c : ld c,(hl) : NEXT_HL ; BC = offset
+
+ IFNDEF HD64180
+ ld ixl,c : ld ixh,b : push bc
+ ELSE
+ push bc : push bc : pop ix
+ ENDIF
+
+ call GetGammaCoded ; BC = len*
+
+ ex (sp),hl ; HL = offset, (SP) = src
+
+ ; interpretation of length value is offset-dependent:
+ ; if (offs >= 32000) len++; if (offs >= 1280) len++; if (offs < 128) len+=2;
+ ; in other words,
+ ; (1 <= offs < 128) +=2
+ ; (128 <= offs < 1280) +=0
+ ; (1280 <= offs < 32000) +=1
+ ; (offs >= 32000) +=2, only checked when SupportLongOffsets is defined
+
+ ; A' keeps the bit buffer while A is used for the comparisons
+ exa : ld a,h
+ IFDEF SupportLongOffsets
+ ; NB offsets over 32000 require an additional check, which is skipped in most
+ ; Z80 decompressors (seemingly as a performance optimization)
+ cp 32000/256 : jr nc,.Add2
+ ENDIF
+ cp 5 : jr nc,.Add1
+ or a : jr nz,.Add0
+ bit 7,l : jr nz,.Add0
+.Add2 inc bc
+.Add1 inc bc
+.Add0 ; for offs<128 : 4+4+7+7 + 4+7 + 8+7 + 6+6 = 60t
+ ; for offs>=1280 : 4+4+7+12 + 6 = 33t
+ ; for 128<=offs<1280 : 4+4+7+7 + 4+12 = 38t OR 4+4+7+7 + 4+7+8+12 = 53t
+
+.CopyMatch: ; this assumes that BC = len, DE = dest, HL = offset
+ ; and also that (SP) = src, while having NC
+ ; (the bit buffer is in A' here; the exa below brings it back)
+ IFNDEF BackwardDecompression
+ ld a,e : sub l : ld l,a
+ ld a,d : sbc h
+ ld h,a : exa ; HL = dest-offset
+ ELSE
+ exa
+.CopyMatchLDH add hl,de
+ ENDIF
+ COPY_1 : COPY_BC ; one byte, then the remaining BC bytes (len is at least 2)
+.PreMainLoop pop hl ; recover src
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+; LWM1: token dispatcher used right after a match. It mirrors LWM0.MainLoop;
+; literals and 7-bit matches jump back into the LWM0 code, while "10" tokens
+; use the local .CASE10, which skips the offset re-use test.
+LWM1: ; LWM = 1
+
+;
+; main decompressor loop
+
+.MainLoop: add a : jr nc,LWM0.CASE0 : jr z,.ReloadByteC0 ; "0"+BYTE = copy literal
+.Check2ndBit add a : jr nc,.CASE10 : jr z,.ReloadByteC1 ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+.Check3rdBit add a : call z,ReloadByte : jr nc,LWM0.CASE110 ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+; case "111"+"oooo": copy a byte with offset -1..-15, or write zero to dest
+
+; Shared by both dispatchers (LWM0 jumps here as well). Four stream bits are
+; rotated into C; the three set bits and the clear bit 4 of the initial
+; constant are rotated out, so C ends up as the 4-bit offset with carry clear
+; and Z set when the offset is zero.
+.CASE111: ld bc,%11100000
+ DUP 4
+ add a : call z,ReloadByte : rl c ; read short offset (4 bits)
+ EDUP
+ ex de,hl : jr z,.WriteZero ; zero offset means "write zero" (NB: B is zero here)
+
+ ; "write a previous byte (1-15 away from dest)"
+ push hl ; BC = offset, DE = src, HL = dest
+ IFNDEF BackwardDecompression
+ sbc hl,bc ; HL = dest-offset (carry is clear: the 4th "rl c" rotated out the 0 in bit 4 of the constant)
+ ELSE
+ add hl,bc
+ ENDIF
+ ld c,(hl)
+ pop hl
+
+.WriteZero ld (hl),c : NEXT_HL
+ ex de,hl : jp LWM0.MainLoop ; 10+4*(4+10+8)+4+7 + 11+15+7+10 + 7+4+6+10 = 179t
+
+.ReloadByteC0 RELOAD_A : jp nc,LWM0.CASE0 ; reload for the 1st token bit
+ jr .Check2ndBit
+
+.ReloadByteC1 RELOAD_A : jr c,.Check3rdBit ; reload for the 2nd token bit; a 0 bit falls into .CASE10
+
+;
+; branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+.CASE10: ; "use a gamma code * 256 for offset, another gamma code for length"
+ call GetGammaCoded
+
+ ; the original decompressor contains
+ ;
+ ; if ((LWM == 0) && (offs == 2)) { ... }
+ ; else {
+ ; if (LWM == 0) { offs -= 3; }
+ ; else { offs -= 2; }
+ ; }
+ ;
+ ; GetGammaCoded returns (offset/256)+2. With LWM == 1 only "offs -= 2"
+ ; applies: one "dec c" here, the second one at LWM0.AfterLWM
+ dec c : jr LWM0.AfterLWM
+
+;
+; the re-use of the previous offset (LWM magic)
+
+; Entered from LWM0.CASE10 with BC = 0; "inc c" makes BC = 1 so that the
+; gamma reader can be joined after its own initialisation. The previous
+; offset comes from IX, and src is left on the stack for .PreMainLoop.
+.KickInLWM: ; "and a new gamma code for length"
+ inc c : call GetGammaCoded.ReadGamma ; BC = len
+
+ IFNDEF BackwardDecompression
+ push ix : ex (sp),hl : exa ; HL = previous offset, (SP) = src, bit buffer to A'
+ jr LWM0.CopyMatch
+ ELSE
+ push ix : ex (sp),hl ; HL = previous offset, (SP) = src
+ jr LWM0.CopyMatchLDH
+ ENDIF
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+;
+; interlaced gamma code reader
+; x0 -> 1x
+; x1y0 -> 1xy
+; x1y1z0 -> 1xyz etc
+; (technically, this is a 2-based variation of Exp-Golomb-1)
+
+; GetGammaCoded: read an interlaced gamma code into BC (result >= 2).
+; Returns with carry clear. .ReadGamma is a second entry point for callers
+; that have already set BC to 1. The two reload paths handle running out
+; of bits before a data bit (RG1) and before a continuation bit (RG2).
+GetGammaCoded: ld bc,1
+.ReadGamma add a : jr z,.ReloadByteRG1
+ rl c : rl b
+ add a : ret nc ; NB: flag NC immediately says we do not need to reload our byte...
+ jr nz,.ReadGamma ; ...even better, flag NZ then automatically means flag C :)
+
+.ReloadByteRG2 RELOAD_A : ret nc : jr .ReadGamma
+
+.ReloadByteRG1 RELOAD_A : rl c : rl b
+ add a : ret nc : jr .ReadGamma
+
+;
+; pretty usual getbit for mixed datastreams
+
+; ReloadByte: out-of-line bit buffer refill, used as "call z,ReloadByte"
+; after "add a". Returns the first bit of the new byte in carry.
+ReloadByte: RELOAD_A : ret
+
diff --git a/tools/apultra/asm/Z80/unaplib_small.asm b/tools/apultra/asm/Z80/unaplib_small.asm new file mode 100644 index 0000000..280de15 --- /dev/null +++ b/tools/apultra/asm/Z80/unaplib_small.asm @@ -0,0 +1,258 @@ +;
+; Size-optimized ApLib decompressor by spke & uniabis (ver.04 01-07/06/2020, 139 bytes)
+;
+; The original Z80 decompressor for ApLib was written by Dan Weiss (Dwedit),
+; then tweaked by Francisco Javier Pena Pareja (utopian),
+; and optimized by Jaime Tejedor Gomez (Metalbrain).
+;
+; This version was heavily re-optimized for size by spke.
+; (It is 17 bytes shorter and 22% faster than the 156b version by Metalbrain.)
+;
+; ver.00 by spke (21/08/2018-01/09/2018, 141 bytes);
+; ver.01 by spke (spring 2019, 140(-1) bytes, slightly faster);
+; ver.02 by spke (05-07/01/2020, added full revision history, support for long offsets
+; and an option to use self-modifying code instead of IY)
+; ver.03 by spke (18-29/05/2020, +0.5% speed, added support for backward compression)
+; ver.04 by uniabis (01-07/06/2020, 139(-1) bytes, +1% speed, added support for HD64180)
+;
+; The data must be compressed using any compressor for ApLib capable of generating raw data.
+; At present, two best available compressors are:
+;
+; "APC" by Sven-Ake Dahl: https://github.com/svendahl/cap or
+; "apultra" by Emmanuel Marty: https://github.com/emmanuel-marty/apultra
+;
+; The compression can be done as follows:
+;
+; apc.exe e <sourcefile> <outfile>
+; or
+; apultra.exe <sourcefile> <outfile>
+;
+; A decent compressor was written by r57shell (although it is worse than compressors above):
+; http://gendev.spritesmind.net/forum/viewtopic.php?p=32548#p32548
+; The use of the official ApLib compressor by Joergen Ibsen is not recommended.
+;
+; The decompression is done in the standard way:
+;
+; ld hl,FirstByteOfCompressedData
+; ld de,FirstByteOfMemoryForDecompressedData
+; call DecompressApLib
+;
+; Backward decompression is also supported; you can compress files backward using:
+;
+; apultra.exe -b <sourcefile> <outfile>
+;
+; uncomment option "DEFINE BackwardDecompression" and decompress the resulting files using:
+;
+; ld hl,LastByteOfCompressedData
+; ld de,LastByteOfMemoryForDecompressedData
+; call DecompressApLib
+;
+; The decompressor modifies AF, AF', BC, DE, HL, IX.
+;
+; Of course, ApLib compression algorithms are (c) 1998-2014 Joergen Ibsen,
+; see http://www.ibsensoftware.com/ for more information
+;
+; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; DEFINE FasterGetBit ; 16% speed-up at the cost of extra 4 bytes
+; DEFINE SupportLongOffsets ; +4 bytes for long offset support. slows decompression down by 1%, but may be needed to decompress files >=32K
+; DEFINE BackwardDecompression ; decompress data compressed backwards, -5 bytes, speeds decompression up by 3%
+
+
+ IFDEF FasterGetBit
+ MACRO GET_BIT
+ add a : call z,ReloadByte ; inline form: shift the bit buffer in A, top bit -> carry; A == 0 means only the marker bit was left, so refill
+ ENDM
+ ELSE
+ MACRO GET_BIT
+ call GetOneBit ; compact form: the same shift-and-refill is done inside the GetOneBit subroutine
+ ENDM
+ ENDIF
+
+ IFNDEF BackwardDecompression
+
+ MACRO NEXT_HL
+ inc hl ; forward mode: advance the pointer in HL by one byte
+ ENDM
+
+ MACRO COPY_1
+ ldi ; forward mode: copy one byte (HL)->(DE), both pointers move up, BC is decremented
+ ENDM
+
+ MACRO COPY_BC
+ ldir ; forward mode: copy BC bytes (HL)->(DE) upwards; BC is 0 afterwards
+ ENDM
+
+ ELSE
+
+ MACRO NEXT_HL
+ dec hl ; backward mode: step the pointer in HL down by one byte
+ ENDM
+
+ MACRO COPY_1
+ ldd ; backward mode: copy one byte (HL)->(DE), both pointers move down, BC is decremented
+ ENDM
+
+ MACRO COPY_BC
+ lddr ; backward mode: copy BC bytes (HL)->(DE) downwards; BC is 0 afterwards
+ ENDM
+
+ ENDIF
+
+@DecompressApLib: ld a,128 ; A = bit buffer holding only the marker bit, so the first GET_BIT fetches a byte from (HL)
+
+;
+; case "0"+BYTE: copy a single literal
+
+CASE0: COPY_1 ; first byte is always copied as literal
+ResetLWM: ld b,-1 ; LWM = 0 (LWM stands for "Last Was Match"; a flag that we did not have a match); CASE10 reads B to tell the two states apart
+
+;
+; main decompressor loop
+
+MainLoop: GET_BIT : jr nc,CASE0 ; "0"+BYTE = copy literal
+ GET_BIT : jr nc,CASE10 ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+
+ ld bc,%11100000 ; B = 0 (used by both branches below); the three 1s in C make "rl c" in ReadFourBits stop after four bits
+ GET_BIT : jr nc,CASE110 ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+; case "111"+"oooo": copy a byte with offset -1..-15, or write zero to dest
+
+CASE111:
+ReadFourBits GET_BIT ; read short offset (4 bits)
+ rl c : jr c,ReadFourBits ; shift the bit into C; carry out stays set until the three guard 1s have left C
+ ex de,hl : jr z,WriteZero ; zero offset means "write zero" (NB: B is zero here); Z still comes from the last "rl c"
+
+ ; "write a previous byte (1-15 away from dest)"
+ push hl ; BC = offset, DE = src, HL = dest
+ IFNDEF BackwardDecompression
+ sbc hl,bc ; HL = dest-offset (SBC works because branching above ensured NC)
+ ELSE
+ add hl,bc ; HL = dest+offset (backward mode: already written data lies above dest)
+ ENDIF
+ ld c,(hl) : pop hl ; C = byte to repeat, HL = dest again
+
+WriteZero ld (hl),c : NEXT_HL ; store C (zero when entered via the "write zero" branch) and step dest
+ ex de,hl : jr ResetLWM ; write one byte, reset LWM
+
+;
+; branch "110"+[oooooool]: copy two or three bytes (bit "l") with the offset -1..-127 (bits "ooooooo"), or stop
+
+CASE110: ; "use 7 bit offset, length = 2 or 3"
+ ; "if a zero is found here, it's EOF"
+ ld c,(hl) : rr c : ret z ; process EOF; carry is clear on entry, so C = offset (7 bits) and carry = length bit
+ NEXT_HL
+
+ push hl ; save src
+ ld h,b : ld l,c ; HL = offset (B is still 0 from "ld bc,%11100000" in MainLoop)
+
+ ; flag NC means len=2, flag C means len=3
+ ld c,1 : rl c : jr SaveLWMOffset ; C = 2 + length bit, so BC = len; "rl c" leaves NC, as CopyMatch expects
+
+;
+; branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+CASE10: ; save state of LWM into A'
+ exa : ld a,b : exa ; A' = B: -1 when arriving via ResetLWM, 0 after a match (COPY_BC leaves BC = 0)
+
+ ; "use a gamma code * 256 for offset, another gamma code for length"
+ call GetGammaCoded ; BC = gamma value, always >= 2
+
+ ; the original decompressor contains
+ ;
+ ; if ((LWM == 0) && (offs == 2)) { ... }
+ ; else {
+ ; if (LWM == 0) { offs -= 3; }
+ ; else { offs -= 2; }
+ ; }
+ ;
+ ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+ ; and to split the first condition by noticing that C-1 can never be zero
+ exa : add c : ld c,a : exa ; C = gamma + A', i.e. gamma-1 when LWM == 0 and gamma otherwise
+
+ ; "if gamma code is 2, use old r0 offset"
+ dec c : jr z,KickInLWM ; zero only for gamma == 2 with LWM == 0: repeat the previous offset
+ dec c ; C = high byte of the new offset
+ ld b,c : ld c,(hl) : NEXT_HL ; BC = offset
+
+ push bc ; (SP) = offset
+ call GetGammaCoded ; BC = len*
+ ex (sp),hl ; HL = offset, (SP) = src
+
+ ; interpretation of length value is offset-dependent
+ exa : ld a,h ; bit buffer parked in A'; A = high byte of the offset for the range tests
+ IFDEF SupportLongOffsets
+ ; NB offsets over 32000 require an additional check, which is skipped in most
+ ; Z80 decompressors (seemingly as a performance optimization)
+ cp 32000/256 : jr nc,.Add2 ; offset >= 32000: len += 2
+ ENDIF
+ cp 5 : jr nc,.Add1 ; offset >= 1280: len += 1
+ or a : jr nz,.Add0 ; 256 <= offset < 1280: len unchanged
+ bit 7,l : jr nz,.Add0 ; 128 <= offset < 256: len unchanged
+.Add2 inc bc ; reached for offset < 128 (or via the long-offset test): first of two increments
+.Add1 inc bc
+.Add0 exa ; bit buffer back in A; the flags swapped back in hold NC from the last GetGammaCoded
+
+SaveLWMOffset:
+ push hl : pop ix ; save offset for future LWMs
+
+CopyMatch: ; this assumes that BC = len, DE = dest, HL = offset
+ ; and also that (SP) = src, while having NC
+ IFNDEF BackwardDecompression
+ push de ; keep dest while DE/HL are swapped for the subtraction
+ ex de,hl : sbc hl,de ; HL = dest-offset (a plain subtraction only because carry is clear)
+ pop de ; DE = dest
+ ELSE
+ add hl,de ; HL = dest+offset
+ ENDIF
+
+ COPY_BC ; copy len bytes from the earlier output to dest; BC = 0 afterwards
+ pop hl ; recover src
+ jr MainLoop ; B = 0 here, which CASE10 reads as "last command was a match"
+
+;
+; the re-use of the previous offset (LWM magic)
+
+KickInLWM: ; "and a new gamma code for length"
+ call GetGammaCoded ; BC = len; returns with NC, as CopyMatch requires
+ push ix : ex (sp),hl ; DE = dest, HL = prev offset, (SP) = src
+ jr CopyMatch ; IX already holds this offset, so SaveLWMOffset is skipped
+
+;
+; interlaced gamma code reader
+; x0 -> 1x
+; x1y0 -> 1xy
+; x1y1z0 -> 1xyz etc
+; (technically, this is a 2-based variation of Exp-Golomb-1)
+
+GetGammaCoded: ld bc,1 ; start from the implicit leading 1, so the result is always >= 2
+ReadGamma GET_BIT : rl c : rl b ; shift one data bit into the 16-bit value in BC
+ GET_BIT : ret nc ; continuation bit 0 = done; returns BC = value with carry clear
+ jr ReadGamma ; continuation bit 1 = another data bit follows
+
+;
+; pretty usual getbit for mixed datastreams
+
+ IFNDEF FasterGetBit
+GetOneBit: add a : ret nz ; shift the bit buffer: top bit -> carry; non-zero means data bits remain
+ ENDIF
+ReloadByte: ld a,(hl) : NEXT_HL ; buffer exhausted (carry holds the shifted-out marker): fetch the next source byte
+ rla : ret ; top bit of the new byte -> carry, old carry enters bit 0 as the new marker
+
diff --git a/tools/apultra/asm/x86/aplib_x86_fast.asm b/tools/apultra/asm/x86/aplib_x86_fast.asm new file mode 100644 index 0000000..9e41d31 --- /dev/null +++ b/tools/apultra/asm/x86/aplib_x86_fast.asm @@ -0,0 +1,180 @@ +; aplib_x86_fast.asm - speed-optimized aPLib decompressor for x86 - 188 bytes
+;
+; Copyright (C) 2019 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+ segment .text
+ bits 32
+
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * esi: compressed aPLib data
+; * edi: output buffer
+; output:
+; * eax: decompressed size
+; ---------------------------------------------------------------------------
+ %ifndef BIN
+ global apl_decompress
+ global _apl_decompress
+ %endif
+
+ ; uint32_t apl_decompress(const void *Source, void *Destination);
+
+%macro apl_get_bit 0 ; read bit into carry (al = bit queue, esi = input pointer)
+ add al,al ; shift bit queue, and high bit into carry
+ jnz %%gotbit ; queue not empty, bits remain
+ lodsb ; queue held only the marker bit (now in carry): read 8 new bits
+ adc al,al ; shift bit queue, and high bit into carry; the old carry becomes the new marker in bit 0
+%%gotbit:
+%endmacro
+
+apl_decompress:
+_apl_decompress:
+ pushad ; save all eight general registers (32 bytes); .done patches the saved eax slot
+
+ %ifdef CDECL
+ mov esi, [esp+32+4] ; esi = aPLib compressed data
+ mov edi, [esp+32+8] ; edi = output
+ %endif
+
+ ; === register map ===
+ ; al: bit queue
+ ; ah: unused, but value is trashed
+ ; bx: follows_literal (3 after a literal or short copy, 2 after a match)
+ ; cx: scratch register for reading gamma2 codes and storing copy length
+ ; dx: match offset (and rep-offset)
+ ; si: input (compressed data) pointer
+ ; di: output (decompressed data) pointer
+ ; bp: not referenced by this routine
+
+ mov al,080H ; clear bit queue(al) and set high bit to move into carry (the first apl_get_bit therefore loads a byte)
+ xor edx, edx ; invalidate rep offset
+.literal:
+ movsb ; read and write literal byte (also reached at start-up: the first byte is copied as a literal)
+.next_command_after_literal:
+ mov ebx,03H ; set follows_literal(bx) to 3
+
+.next_command:
+ apl_get_bit ; read 'literal or match' bit
+ jnc .literal ; if 0: literal
+
+ ; 1x: match
+
+ apl_get_bit ; read '8+n bits or other type' bit
+ jc .other ; 11x: other type of match
+
+ ; 10: 8+n bits match
+ call .get_gamma2 ; read gamma2-coded high offset bits
+ sub ecx,ebx ; high offset bits == 2 when follows_literal == 3 ?
+ ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+ ; is == 2 will never borrow; only 2 - 3 sets the carry tested below)
+ jae .not_repmatch ; if not, not a rep-match
+
+ call .get_gamma2 ; read match length
+ jmp .got_len ; go copy (edx still holds the previous match offset)
+
+.not_repmatch:
+ mov edx,ecx ; transfer high offset bits to dh (ecx = gamma value minus follows_literal)
+ shl edx, 8
+ mov dl,[esi] ; read low offset byte in dl
+ inc esi
+
+ call .get_gamma2 ; read match length
+ cmp edx,07D00H ; offset >= 32000 ?
+ jae .increase_len_by2 ; if so, increase match len by 2
+ cmp edx,0500H ; offset >= 1280 ?
+ jae .increase_len_by1 ; if so, increase match len by 1
+ cmp edx,0080H ; offset >= 128 ?
+ jae .got_len ; if so, use the length as read; below 128 fall through and add 2 (lengths 2-3 there would be a 7+1 copy)
+.increase_len_by2:
+ inc ecx ; increase length
+.increase_len_by1:
+ inc ecx ; increase length
+
+ ; copy cx bytes from match offset dx
+
+.got_len:
+ push esi ; keep the compressed-data pointer while esi serves as the match source
+ mov esi,edi ; point to destination in es:di - offset in dx
+ sub esi,edx
+ rep movsb ; copy matched bytes (NOTE(review): assumes the direction flag is clear; no cld is issued here)
+ pop esi
+ mov bl,02H ; set follows_literal to 2 (bx is unmodified by match commands)
+ jmp .next_command
+
+ ; read gamma2-coded value into cx
+
+.get_gamma2:
+ xor ecx,ecx ; initialize to 1 so that value will start at 2
+ inc ecx ; when shifted left in the adc below
+
+.gamma2_loop:
+ apl_get_bit ; read data bit
+ adc ecx,ecx ; shift into cx
+ apl_get_bit ; read continuation bit
+ jc .gamma2_loop ; loop until a zero continuation bit is read
+
+ ret ; ecx = decoded value (>= 2), carry clear
+
+ ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+
+.other:
+ xor ecx,ecx ; ecx = 0: accumulator for the 4 offset bits or base for the 7+1 length
+ apl_get_bit ; read '7+1 match or short literal' bit
+ jc .short_literal ; 111: 4 bit offset for 1-byte copy
+
+ ; 110: 7 bits offset + 1 bit length
+
+ movzx edx,byte[esi] ; read offset + length in dl (upper bits of edx cleared)
+ inc esi
+
+ inc ecx ; prepare cx for length below
+ shr dl,1 ; shift len bit into carry, and offset in place
+ je .done ; if zero offset: EOD
+ adc ecx,ecx ; len in cx: 1*2 + carry bit = 2 or 3
+ jmp .got_len ; edx also becomes the offset a later rep-match reuses
+
+ ; 4 bits offset / 1 byte copy
+
+.short_literal:
+ apl_get_bit ; read 4 offset bits
+ adc ecx,ecx
+ apl_get_bit
+ adc ecx,ecx
+ apl_get_bit
+ adc ecx,ecx
+ apl_get_bit
+ adc ecx,ecx ; ecx = 4-bit offset; ZF set when it is 0
+ xchg eax,ecx ; preserve bit queue in cx, put offset in ax (xchg leaves the flags alone)
+ jz .write_zero ; if offset is 0, write a zero byte
+
+ ; short offset 1-15
+ mov ebx,edi ; point to destination in es:di - offset in ax
+ sub ebx,eax ; we trash bx, it will be reset to 3 when we loop
+ mov al,[ebx] ; read byte from short offset
+.write_zero:
+ stosb ; copy matched byte (al is 0 when arriving via the zero-offset branch)
+ mov eax,ecx ; restore bit queue in al
+ jmp .next_command_after_literal
+
+.done:
+ ; End of data: return the decompressed size (final output pointer minus the
+ ; start of the output buffer) in eax. Where the start address can be found
+ ; depends on how the arguments were passed.
+ %ifdef CDECL
+ sub edi, [esp+32+8] ; cdecl: Destination argument, above the 32-byte pushad frame and the return address
+ %else
+ sub edi, [esp] ; register call: pushad pushed the entry value of edi last, so it sits at [esp]
+ %endif
+ mov [esp+28], edi ; store into the saved-eax slot of the pushad frame so popad returns the size in eax
+ popad
+ ret
diff --git a/tools/apultra/asm/x86/aplib_x86_small.asm b/tools/apultra/asm/x86/aplib_x86_small.asm new file mode 100644 index 0000000..ada00f6 --- /dev/null +++ b/tools/apultra/asm/x86/aplib_x86_small.asm @@ -0,0 +1,159 @@ +; aplib_x86_small.asm - size-optimized aPLib decompressor for x86 - 185 bytes
+;
+; Copyright (C) 2019 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+ segment .text
+ bits 32
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * esi: compressed aPLib data
+; * edi: output buffer
+; output:
+; * eax: decompressed size
+; ---------------------------------------------------------------------------
+ %ifndef BIN
+ global apl_decompress
+ global _apl_decompress
+ %endif
+
+apl_decompress:
+_apl_decompress:
+ pushad ; save all eight general registers (32 bytes); .done patches the saved eax slot
+
+ %ifdef CDECL
+ mov esi, [esp+32+4] ; esi = aPLib compressed data
+ mov edi, [esp+32+8] ; edi = output
+ %endif
+
+ ; === register map ===
+ ; al: bit queue
+ ; ah: unused, but value is trashed
+ ; ebx: follows_literal (3 after a literal or short copy, 2 after a match)
+ ; ecx: scratch register for reading gamma2 codes and storing copy length
+ ; edx: match offset (and rep-offset)
+ ; esi: input (compressed data) pointer
+ ; edi: output (decompressed data) pointer
+ ; ebp: address of .get_bit, so that "call ebp" reads one bit
+
+ mov al,080H ; clear bit queue(al) and set high bit to move into carry (the first bit read therefore loads a byte)
+ xor edx, edx ; invalidate rep offset in edx
+
+ call .init_get_bit ; pushes the address of .get_dibits as the return address; .init_get_bit pops it instead of returning
+.get_dibits:
+ call ebp ; read data bit
+ adc ecx,ecx ; shift into cx, then fall through to read one more bit, returned in carry
+.get_bit:
+ add al,al ; shift bit queue, and high bit into carry
+ jnz .got_bit ; queue not empty, bits remain
+ lodsb ; queue held only the marker bit (now in carry): read 8 new bits
+ adc al,al ; shift bit queue, and high bit into carry; the old carry becomes the new marker in bit 0
+.got_bit:
+ ret
+.init_get_bit:
+ pop ebp ; load offset of .get_bit, to be used with call ebp (popped value is the address of .get_dibits)
+ add ebp, .get_bit - .get_dibits ; adjust to .get_bit; execution then falls into .literal to copy the first byte
+.literal:
+ movsb ; read and write literal byte (also reached at start-up: the first byte is copied as a literal)
+.next_command_after_literal:
+ push 03H ; push/pop pair used to load the constant into ebx
+ pop ebx ; set follows_literal(bx) to 3
+
+.next_command:
+ call ebp ; read 'literal or match' bit
+ jnc .literal ; if 0: literal
+
+ ; 1x: match
+ call ebp ; read '8+n bits or other type' bit
+ jc .other ; 11x: other type of match
+ ; 10: 8+n bits match
+ call .get_gamma2 ; read gamma2-coded high offset bits
+ sub ecx,ebx ; high offset bits == 2 when follows_literal == 3 ?
+ ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+ ; is == 2 will never result in a negative value)
+ jae .not_repmatch ; if not, not a rep-match
+ call .get_gamma2 ; read match length
+ jmp .got_len ; go copy (edx still holds the previous match offset)
+.not_repmatch:
+ mov edx,ecx ; transfer high offset bits to dh (ecx = gamma value minus follows_literal)
+ shl edx,8
+ mov dl,[esi] ; read low offset byte in dl
+ inc esi
+ call .get_gamma2 ; read match length
+ cmp edx,7D00H ; offset >= 32000 ?
+ jae .increase_len_by2 ; if so, increase match len by 2
+ cmp edx,0500H ; offset >= 1280 ?
+ jae .increase_len_by1 ; if so, increase match len by 1
+ cmp edx,0080H ; offset >= 128 ?
+ jae .got_len ; if so, use the length as read; below 128 fall through and add 2 (lengths 2-3 there would be a 7+1 copy)
+.increase_len_by2:
+ inc ecx ; increase length
+.increase_len_by1:
+ inc ecx ; increase length
+ ; copy ecx bytes from match offset edx
+.got_len:
+ push esi ; save esi (current pointer to compressed data)
+ mov esi,edi ; point to destination in edi - offset in edx
+ sub esi,edx
+ rep movsb ; copy matched bytes (NOTE(review): assumes the direction flag is clear; no cld is issued here)
+ pop esi ; restore esi
+ mov bl,02H ; set follows_literal to 2 (ebx is unmodified by match commands)
+ jmp .next_command
+ ; read gamma2-coded value into ecx
+.get_gamma2:
+ xor ecx,ecx ; initialize to 1 so that value will start at 2
+ inc ecx ; when shifted left in the adc below
+.gamma2_loop:
+ call .get_dibits ; read data bit, shift into cx, read continuation bit
+ jc .gamma2_loop ; loop until a zero continuation bit is read
+ ret ; ecx = decoded value (>= 2), carry clear
+ ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+.other:
+ xor ecx,ecx ; ecx = 0: accumulator for the 4 offset bits or base for the 7+1 length
+ call ebp ; read '7+1 match or short literal' bit
+ jc .short_literal ; 111: 4 bit offset for 1-byte copy
+ ; 110: 7 bits offset + 1 bit length
+
+ movzx edx,byte[esi] ; read offset + length in dl (upper bits of edx cleared)
+ inc esi
+ inc ecx ; prepare cx for length below
+ shr dl,1 ; shift len bit into carry, and offset in place
+ je .done ; if zero offset: EOD
+ adc ecx,ecx ; len in cx: 1*2 + carry bit = 2 or 3
+ jmp .got_len ; edx also becomes the offset a later rep-match reuses
+ ; 4 bits offset / 1 byte copy
+.short_literal:
+ call .get_dibits ; read 2 offset bits (first one already shifted into ecx, second returned in carry)
+ adc ecx,ecx
+ call .get_dibits ; read 2 offset bits
+ adc ecx,ecx ; ecx = 4-bit offset; ZF set when it is 0
+ xchg eax,ecx ; preserve bit queue in cx, put offset in ax (xchg leaves the flags alone)
+ jz .write_zero ; if offset is 0, write a zero byte
+ ; short offset 1-15
+ mov ebx,edi ; point to destination in es:di - offset in ax
+ sub ebx,eax ; we trash bx, it will be reset to 3 when we loop
+ mov al,[ebx] ; read byte from short offset
+.write_zero:
+ stosb ; copy matched byte (al is 0 when arriving via the zero-offset branch)
+ xchg eax,ecx ; restore bit queue in al
+ jmp .next_command_after_literal
+.done:
+ ; End of data: return the decompressed size (final output pointer minus the
+ ; start of the output buffer) in eax. Where the start address can be found
+ ; depends on how the arguments were passed.
+ %ifdef CDECL
+ sub edi, [esp+32+8] ; cdecl: Destination argument, above the 32-byte pushad frame and the return address
+ %else
+ sub edi, [esp] ; register call: pushad pushed the entry value of edi last, so it sits at [esp]
+ %endif
+ mov [esp+28], edi ; store into the saved-eax slot of the pushad frame so popad returns the size in eax
+ popad
+ ret
|