path: root/tools/apultra/asm
diff options
authorJuan J. Martinez <jjm@usebox.net>2021-01-09 09:01:05 +0000
committerJuan J. Martinez <jjm@usebox.net>2021-01-09 09:01:05 +0000
commit9bcf1e97960c0da7322a868efdbc07e2650716fe (patch)
treede6d32ad5b0e567991bd3eb262902c15a77074d9 /tools/apultra/asm
parent3b31adf01305e522f7e28c1435fb47418ce43267 (diff)
Extra libs: ap.lib
aPLib support with apultra.
Diffstat (limited to 'tools/apultra/asm')
14 files changed, 2562 insertions, 0 deletions
diff --git a/tools/apultra/asm/6502/aplib_6502.asm b/tools/apultra/asm/6502/aplib_6502.asm
new file mode 100644
index 0000000..1bc11b4
--- /dev/null
+++ b/tools/apultra/asm/6502/aplib_6502.asm
@@ -0,0 +1,257 @@
+; ***************************************************************************
+; ***************************************************************************
+; aplib_6502.s
+; NMOS 6502 decompressor for data stored in Jorgen Ibsen's aPLib format.
+; Includes support for Emmanuel Marty's enhancements to the aPLib format.
+; The code is 252 bytes long for standard format, 270 for enhanced format.
+; This code is written for the ACME assembler.
+; Copyright John Brandwood 2019.
+; Distributed under the Boost Software License, Version 1.0.
+; (See accompanying file LICENSE_1_0.txt or copy at
+; http://www.boost.org/LICENSE_1_0.txt)
+; ***************************************************************************
+; ***************************************************************************
+; ***************************************************************************
+; ***************************************************************************
+; Decompression Macros
+ ;
+ ; Macro to advance the source pointer to the next 256-byte page by
+ ; incrementing the hi-byte of apl_srcptr.
+ !macro APL_INC_PAGE {
+ inc <apl_srcptr + 1 ; Next page of compressed data.
+ }
+ ;
+ ; Macro to read a byte from the compressed source data into A and
+ ; advance apl_srcptr by one. Call sites keep Y == 0.
+ ;
+ !macro APL_GET_SRC {
+ lda (apl_srcptr),y
+ inc <apl_srcptr + 0
+ bne .skip ; Lo-byte did not wrap: same page.
+ +APL_INC_PAGE ; Lo-byte wrapped: step to next page.
+.skip:
+ }
+; ***************************************************************************
+; ***************************************************************************
+; Data usage is at the top of zero-page (the symbols below occupy $F7-$FF).
+apl_bitbuf = $F7 ; 1 byte. Bit-buffer for tag bits.
+apl_offset = $F8 ; 1 word. Offset of the current/previous match.
+apl_winptr = $FA ; 1 word. Address of the match data being copied.
+apl_srcptr = $FC ; 1 word. Read pointer into the compressed data.
+apl_dstptr = $FE ; 1 word. Write pointer into the output buffer.
+apl_length = apl_winptr ; Alias: shares its storage with apl_winptr.
+; ***************************************************************************
+; ***************************************************************************
+; apl_decompress - Decompress data stored in Jorgen Ibsen's aPLib format.
+; Args: apl_srcptr = ptr to compressed data
+; Args: apl_dstptr = ptr to output buffer
+; Uses: A, X, Y and the apl_* zero-page variables (X carries the LWM flag).
+; As an optimization, the code to handle window offsets > 64768 bytes has
+; been removed, since these don't occur with a 16-bit address range.
+; As an optimization, the code to handle window offsets > 32000 bytes can
+; be commented-out, since these don't occur in typical 8-bit computer usage.
+apl_decompress: ldy #0 ; Initialize source index.
+ lda #$80 ; Initialize an empty
+ sta <apl_bitbuf ; bit-buffer.
+ ;
+ ; 0 bbbbbbbb - One byte from compressed data, i.e. a "literal".
+ ;
+.literal: +APL_GET_SRC ; A = next byte of compressed data.
+.write_byte: ldx #0 ; LWM=0.
+ sta (apl_dstptr),y ; Write the byte directly to
+ inc <apl_dstptr + 0 ; the output.
+ bne .next_tag
+ inc <apl_dstptr + 1
+.next_tag: asl <apl_bitbuf ; 0 bbbbbbbb
+ bne .skip0 ; Bits remain in the buffer.
+ jsr .load_bit ; Buffer ran dry, refill it.
+.skip0: bcc .literal
+.skip1: asl <apl_bitbuf ; 1 0 <offset> <length>
+ bne .skip2
+ jsr .load_bit
+.skip2: bcc .copy_large
+ asl <apl_bitbuf ; 1 1 0 dddddddn
+ bne .skip3
+ jsr .load_bit
+.skip3: bcc .copy_normal
+ ; 1 1 1 dddd - Copy 1 byte within 15 bytes (or zero).
+.copy_short: lda #$10 ; Marker bit reaches C after 4 ROLs.
+.nibble_loop: asl <apl_bitbuf
+ bne .skip4
+ pha ; Keep the partial offset across
+ jsr .load_bit ; the bit-buffer reload.
+ pla
+.skip4: rol
+ bcc .nibble_loop
+ beq .write_byte ; Offset=0 means write zero.
+ eor #$FF ; Read the byte directly from
+ tay ; the destination window.
+ iny ; Y = 256 - offset.
+ dec <apl_dstptr + 1 ; Step back one page so that
+ lda (apl_dstptr),y ; (ptr - 256) + Y == ptr - offset.
+ inc <apl_dstptr + 1
+ ldy #0
+ beq .write_byte ; Always taken (LDY #0 sets Z).
+ ;
+ ; 1 1 0 dddddddn - Copy 2 or 3 within 128 bytes.
+ ;
+.copy_normal: +APL_GET_SRC ; 1 1 0 dddddddn
+ lsr ; A = offset, C = length bit n.
+ beq .finished ; Offset 0 == EOF.
+ sta <apl_offset + 0 ; Preserve offset.
+ sty <apl_offset + 1
+ tya ; Y == 0.
+ tax ; Bits 8..15 of length.
+ adc #2 ; Bits 0...7 of length.
+ bne .do_match ; NZ from previous ADC.
+ ;
+ ; Subroutines for byte & bit handling.
+ ;
+.get_gamma: lda #1 ; Get a gamma-coded value.
+.gamma_loop: asl <apl_bitbuf ; Data bit -> C.
+ bne .skip5
+ pha
+ jsr .load_bit
+ pla
+.skip5: rol ; Shift the data bit into the value:
+ rol <apl_length + 1 ; A = lo-byte, apl_length+1 = hi-byte.
+ asl <apl_bitbuf ; Continuation bit -> C.
+ bne .skip6
+ pha
+ jsr .load_bit
+ pla
+.skip6: bcs .gamma_loop
+.finished: rts ; All decompressed!
+ ;
+ ; 1 0 <offset> <length> - gamma-coded LZSS pair.
+ ;
+ ; The gamma value gives bits 8..15 of the offset (biased by 2 or 3,
+ ; depending on LWM); the low 8 bits follow as a plain source byte.
+ ; A result below zero selects "repeat the previous offset" instead.
+ ;
+.copy_large: jsr .get_gamma ; Bits 8..15 of offset (min 2).
+ sty <apl_length + 1 ; Clear hi-byte of length.
+ cpx #1 ; CC if LWM==0, CS if LWM==1.
+ sbc #2 ; -3 if LWM==0, -2 if LWM==1.
+ bcs .normal_pair ; CC if LWM==0 && offset==2.
+ jsr .get_gamma ; Get length (A=lo-byte & CC).
+ ldx <apl_length + 1
+ bcc .do_match ; Use previous Offset.
+.normal_pair: sta <apl_offset + 1 ; Save bits 8..15 of offset.
+ +APL_GET_SRC ; Fetch lo-byte of offset (Y == 0).
+ sta <apl_offset + 0 ; Save bits 0...7 of offset.
+ jsr .get_gamma ; Get length (A=lo-byte & CC).
+ ldx <apl_length + 1
+ ldy <apl_offset + 1 ; If offset < 256.
+ beq .lt256
+ cpy #$7D ; If offset >= 32000, length += 2.
+ bcs .match_plus2
+ cpy #$05 ; If offset >= 1280, length += 1.
+ bcs .match_plus1
+ bcc .do_match
+.lt256: ldy <apl_offset + 0 ; If offset < 128, length += 2.
+ bmi .do_match ; Offset >= 128: length unchanged.
+ sec ; aPLib gamma returns with CC.
+.match_plus2: adc #1 ; CS, so ADC #2.
+ bcs .match_plus256
+.match_plus1: adc #0 ; CS, so ADC #1, or CC if fall
+ bcc .do_match ; through from .match_plus2.
+.match_plus256: inx
+.do_match: eor #$FF ; Negate the lo-byte of length
+ tay ; and check for zero.
+ iny
+ beq .calc_addr ; Lo-byte is 0: whole pages only.
+ eor #$FF ; Restore lo-byte of length in A.
+ inx ; Increment # of pages to copy.
+ clc ; Calc destination for partial
+ adc <apl_dstptr + 0 ; page.
+ sta <apl_dstptr + 0
+ bcs .calc_addr
+ dec <apl_dstptr + 1
+.calc_addr: sec ; Calc address of match.
+ lda <apl_dstptr + 0
+ sbc <apl_offset + 0
+ sta <apl_winptr + 0
+ lda <apl_dstptr + 1
+ sbc <apl_offset + 1
+ sta <apl_winptr + 1
+.copy_page: lda (apl_winptr),y ; Copy bytes until Y wraps to 0,
+ sta (apl_dstptr),y ; i.e. up to the end of the page.
+ iny
+ bne .copy_page
+ inc <apl_winptr + 1 ; Move both pointers on a page.
+ inc <apl_dstptr + 1
+ dex ; Any full pages left to copy?
+ bne .copy_page
+ inx ; LWM=1.
+ jmp .next_tag
+ ;
+ ; Subroutine to refill the bit-buffer; returns the first new bit in C.
+ ;
+.load_bit: +APL_GET_SRC ; Reload an empty bit-buffer
+ rol ; from the compressed source.
+ sta <apl_bitbuf ; Incoming C becomes the end marker bit.
+ rts
diff --git a/tools/apultra/asm/6502/aplib_6502_b.asm b/tools/apultra/asm/6502/aplib_6502_b.asm
new file mode 100644
index 0000000..7963e02
--- /dev/null
+++ b/tools/apultra/asm/6502/aplib_6502_b.asm
@@ -0,0 +1,218 @@
+; -----------------------------------------------------------------------------
+; aplib_6502_b.s - fast aPLib backward decompressor for 6502 - 253 bytes
+; written for the ACME assembler
+; jsr apl_decompress to unpack data backwards.
+; create backwards compressed data with apultra -b or oapack -b
+; in:
+; * apl_srcptr (low and high byte) = last byte of compressed data
+; * apl_dstptr (low and high byte) = last byte of decompression buffer
+; out:
+; * apl_dstptr (low and high byte) = first byte of decompressed data
+; Copyright (C) 2020 Emmanuel Marty
+; With parts of the code inspired by John Brandwood, Peter Ferrie
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+; -----------------------------------------------------------------------------
+ ; Zero page locations used by the backward decompressor
+apl_gamma2_hi = $F6 ; high byte of gamma2 value / match length
+apl_bitbuf = $F7 ; bit queue
+apl_offset = $F8 ; match offset (low, high)
+apl_winptr = $FA ; backreference address (low, high)
+apl_srcptr = $FC ; compressed data pointer (low, high)
+apl_dstptr = $FE ; output pointer (low, high)
+ ; Read a byte from the source into A, then step apl_srcptr back by one. Trashes X
+ !macro APL_GET_SRC {
+ lda (apl_srcptr),y
+ ldx <apl_srcptr+0 ; low byte zero? then the decrement borrows
+ bne .src_page_done
+ dec <apl_srcptr+1 ; borrow from the high byte
+.src_page_done: dec <apl_srcptr+0
+ }
+ ; Write the byte in A to the destination, then step apl_dstptr back by one. Trashes A
+ !macro APL_PUT_DST {
+ sta (apl_dstptr),y
+ lda <apl_dstptr+0 ; low byte zero? then the decrement borrows
+ bne .dst_page_done
+ dec <apl_dstptr+1 ; borrow from the high byte
+.dst_page_done: dec <apl_dstptr+0
+ }
+ ; Read one bit from the source into the carry, trash A
+ ; (A is only trashed when the bit queue has to be refilled)
+ !macro APL_GET_BIT {
+ asl <apl_bitbuf ; shift next bit into carry
+ bne .has_bits ; queue not empty: carry holds the bit
+ jsr apl_load_bits ; queue empty: refill, carry = first new bit
+.has_bits:
+ }
+ ; Read one bit from the source into the carry, preserve A
+ !macro APL_GET_BIT_SAVEA {
+ asl <apl_bitbuf ; shift next bit into carry
+ bne .has_bits ; queue not empty: carry holds the bit
+ pha ; queue empty: keep A across the refill
+ jsr apl_load_bits ; carry = first bit of the new byte
+ pla ; PLA leaves the carry untouched
+.has_bits:
+ }
+ ; Decompress aPLib data backwards (X holds the 'follows match' flag between tokens)
+apl_decompress: lda #$80 ; initialize empty bit queue
+ sta <apl_bitbuf ; plus bit to roll into carry
+ ldy #$00 ; clear Y for indirect addr
+.copy_literal: +APL_GET_SRC ; read literal from source
+.write_literal: +APL_PUT_DST ; write literal to destination
+ ldx #$00 ; clear 'follows match' flag
+.next_token: +APL_GET_BIT ; read 'literal or match' bit
+ bcc .copy_literal ; if 0: literal
+ +APL_GET_BIT ; read '8+n bits or other' bit
+ bcc .long_match ; if 10x: long 8+n bits match
+ ; 11x: other type of match
+ +APL_GET_BIT ; read '7+1 match or short literal' bit
+ bcs .short_match ; if 111: 4 bit offset for 1-byte copy
+ +APL_GET_SRC ; read low byte of offset + length bit
+ lsr ; shift offset into place, len bit into carry
+ beq .done ; check for EOD
+ sta <apl_offset+0 ; store low byte of offset
+ sty <apl_offset+1 ; set high byte of offset to 0
+ tya ; set A to 0
+ sty <apl_gamma2_hi ; set high byte of len to 0
+ adc #$02 ; add 2 or 3 depending on len bit in carry
+ ; now, low part of len is in A
+ ; high part of len in apl_gamma2_hi is 0
+ ; offset is written to apl_offset
+ bne .got_len ; go copy matched bytes (always taken: A is 2 or 3)
+.long_match: jsr .get_gamma2 ; 10: read gamma2 high offset bits in A
+ sty <apl_gamma2_hi ; zero out high byte of gamma2
+ cpx #$01 ; carry set if following a match (X=1), clear after a literal (X=0)
+ sbc #$02 ; subtract 3 if following literal, 2 otherwise
+ bcs .no_repmatch ; no borrow: regular match with a new offset
+ jsr .get_gamma2 ; read repmatch length: low part in A
+ bcc .got_len ; go copy large match
+ ; (carry is always clear after .get_gamma2)
+.short_match: lda #$10 ; clear offset, load end bit into place
+.read_short_offs: +APL_GET_BIT_SAVEA ; read one bit of offset into carry
+ rol ; shift into A, shift end bit as well
+ bcc .read_short_offs ; loop until end bit is shifted out into carry
+ beq .write_literal ; zero offset means write a 0
+ tay ; use the offset as index above apl_dstptr
+ lda (apl_dstptr),y ; load backreferenced byte
+ ldy #$00 ; clear Y again
+ beq .write_literal ; go write byte to destination
+.get_gamma2: lda #$01 ; 1 so it gets shifted to 2
+.gamma2_loop: +APL_GET_BIT_SAVEA ; read data bit
+ rol ; shift into low byte
+ rol <apl_gamma2_hi ; shift into high byte
+ +APL_GET_BIT_SAVEA ; read continuation bit
+ bcs .gamma2_loop ; loop until a zero continuation bit is read
+.done: rts
+.no_repmatch: sta <apl_offset+1 ; write high byte of offset
+ +APL_GET_SRC ; read low byte of offset from source
+ sta <apl_offset+0 ; store low byte of offset
+ jsr .get_gamma2 ; read match length: low part in A
+ ldx <apl_offset+1 ; high offset byte is zero?
+ beq .offset_1byte ; if so, offset < 256
+ ; offset is >= 256.
+ cpx #$7d ; offset >= 32000 (7d00) ?
+ bcs .offset_incby2 ; if so, increase match len by 2
+ cpx #$05 ; offset >= 1280 (0500) ?
+ bcs .offset_incby1 ; if so, increase match len by 1
+ bcc .got_len ; length is fine, go copy
+.offset_1byte: ldx <apl_offset+0 ; offset < 128 ?
+ bmi .got_len ; offset >= 128: length is fine; otherwise fall through and add 2
+ sec ; carry must be set below
+.offset_incby2: adc #$01 ; add 1 + carry (set by bcs or sec), i.e. 2
+ bcs .len_inchi ; go add 256 to len if overflow
+ ; carry clear: fall through for no-op
+.offset_incby1: adc #$00 ; add the carry: 1 when entered via bcs, 0 on fall through
+ bcc .got_len
+.len_inchi: inc <apl_gamma2_hi ; add 256 to len if low byte overflows
+.got_len: tax ; transfer low byte of len into X
+ beq .add_offset ; low byte 0: high byte already counts the outer loops
+ inc <apl_gamma2_hi ; otherwise one extra outer loop for the partial count
+.add_offset: clc ; add dest + match offset
+ lda <apl_dstptr+0 ; low 8 bits
+ adc <apl_offset+0
+ sta <apl_winptr+0 ; store back reference address
+ lda <apl_dstptr+1 ; high 8 bits
+ adc <apl_offset+1
+ sta <apl_winptr+1 ; store high 8 bits of address
+ ; Copy the match one byte at a time, walking both the backreference
+ ; and the destination pointers downwards. X counts bytes within the
+ ; current pass, apl_gamma2_hi counts the passes.
+.copy_match_loop: lda (apl_winptr),y ; read one byte of backreference
+ +APL_PUT_DST ; write byte to destination
+ lda <apl_winptr+0 ; decrement backreference address
+ bne .backref_page_done ; low byte non-zero: no borrow needed
+ dec <apl_winptr+1 ; low byte wraps: borrow from high byte
+.backref_page_done: dec <apl_winptr+0
+ dex ; loop to copy all matched bytes
+ bne .copy_match_loop
+ dec <apl_gamma2_hi
+ bne .copy_match_loop
+ ; X is 0 when exiting the loop above
+ inx ; set 'follows match' flag
+ jmp .next_token ; go decode next token
+apl_load_bits: lda (apl_srcptr),y ; read 8 bits from source
+ rol ; shift bit queue, and high bit into carry
+ sta <apl_bitbuf ; save bit queue
+ lda <apl_srcptr+0 ; step the source pointer back by one
+ bne .bits_page_done ; (LDA/DEC leave the carry untouched)
+ dec <apl_srcptr+1
+.bits_page_done: dec <apl_srcptr+0
+ rts
diff --git a/tools/apultra/asm/68000/unaplib_68000.S b/tools/apultra/asm/68000/unaplib_68000.S
new file mode 100644
index 0000000..a60ae32
--- /dev/null
+++ b/tools/apultra/asm/68000/unaplib_68000.S
@@ -0,0 +1,117 @@
+; unaplib_68000.s - aPLib decompressor for 68000 - 154 bytes
+; in: a0 = start of compressed data
+; a1 = start of decompression buffer
+; out: d0 = decompressed size
+; Copyright (C) 2020 Emmanuel Marty
+; With parts of the code inspired by Franck "hitchhikr" Charlet
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+ movem.l a2-a6/d2-d3,-(sp) ; save the registers used below
+ moveq #-128,d1 ; initialize empty bit queue
+ ; plus bit to roll into carry
+ lea 32000.w,a2 ; load 32000 offset constant
+ lea 1280.w,a3 ; load 1280 offset constant
+ lea 128.w,a4 ; load 128 offset constant
+ move.l a1,a5 ; save destination pointer
+.literal: move.b (a0)+,(a1)+ ; copy literal byte
+.after_lit: moveq #3,d2 ; set LWM flag
+.next_token: bsr.s .get_bit ; read 'literal or match' bit
+ bcc.s .literal ; if 0: literal
+ bsr.s .get_bit ; read '8+n bits or other type' bit
+ bcs.s .other_match ; if 11x: other type of match
+ bsr.s .get_gamma2 ; 10: read gamma2-coded high offset bits
+ sub.l d2,d0 ; high offset bits == 2 when LWM == 3 ?
+ bcc.s .no_repmatch ; if not, not a rep-match
+ bsr.s .get_gamma2 ; read repmatch length
+ bra.s .got_len ; go copy large match, reusing the offset still in d3
+.no_repmatch: lsl.l #8,d0 ; shift high offset bits into place
+ move.b (a0)+,d0 ; read low offset byte
+ move.l d0,d3 ; copy offset into d3
+ bsr.s .get_gamma2 ; read match length
+ cmp.l a2,d3 ; offset >= 32000 ?
+ bge.s .inc_by_2 ; if so, increase match len by 2
+ cmp.l a3,d3 ; offset >= 1280 ?
+ bge.s .inc_by_1 ; if so, increase match len by 1
+ cmp.l a4,d3 ; offset >= 128 ?
+ bge.s .got_len ; if so, keep the length; offsets < 128 fall through and add 2
+.inc_by_2: addq.l #1,d0 ; increase match len by 1
+.inc_by_1: addq.l #1,d0 ; increase match len by 1
+.got_len: move.l a1,a6 ; calculate backreference address
+ sub.l d3,a6 ; (dest - match offset)
+ subq.l #1,d0 ; dbf will loop until d0 is -1, not 0
+.copy_match: move.b (a6)+,(a1)+ ; copy matched byte
+ dbf d0,.copy_match ; loop for all matched bytes
+ moveq #2,d2 ; clear LWM flag
+ bra.s .next_token ; go decode next token
+.other_match: bsr.s .get_bit ; read '7+1 match or short literal' bit
+ bcs.s .short_match ; if 111: 4 bit offset for 1-byte copy
+ moveq #1,d0 ; 110: prepare match length
+ moveq #0,d3 ; clear high bits of offset
+ move.b (a0)+,d3 ; read low bits of offset + length bit
+ lsr.b #1,d3 ; shift offset into place, len into carry
+ beq.s .done ; check for EOD
+ addx.b d0,d0 ; len = (1 << 1) + carry bit, ie. 2 or 3
+ bra.s .got_len ; go copy match
+.short_match: moveq #0,d0 ; clear short offset before reading 4 bits
+ bsr.s .get_dibits ; read a data bit into d0, one into carry
+ addx.b d0,d0 ; shift second bit into d0
+ bsr.s .get_dibits ; read a data bit into d0, one into carry
+ addx.b d0,d0 ; shift second bit into d0
+ beq.s .write_zero ; if offset is zero, write a 0
+ move.l a1,a6 ; calculate backreference address
+ sub.l d0,a6 ; (dest - short offset)
+ move.b (a6),d0 ; read matched byte
+.write_zero: move.b d0,(a1)+ ; write matched byte or 0
+ bra.s .after_lit ; set LWM flag and go decode next token
+.done: move.l a1,d0 ; pointer to last decompressed byte + 1
+ sub.l a5,d0 ; minus start of decompression buffer (saved in a5 on entry) = size
+ ; (a6 is only the match copy scratch pointer, so it cannot be used here)
+ movem.l (sp)+,a2-a6/d2-d3 ; restore saved registers; d0 carries the result
+ rts
+.get_gamma2: moveq #1,d0 ; init to 1 so it gets shifted to 2 below
+.gamma2_loop: bsr.s .get_dibits ; read data bit, shift into d0
+ ; and read continuation bit
+ bcs.s .gamma2_loop ; loop until a 0 continuation bit is read
+ rts ; d0 = gamma2 value
+.get_dibits: bsr.s .get_bit ; read bit
+ addx.l d0,d0 ; shift into d0
+ ; fall through to read a second bit, returned in carry only
+.get_bit: add.b d1,d1 ; shift bit queue, high bit into carry
+ bne.s .got_bit ; queue not empty, bits remain
+ move.b (a0)+,d1 ; read 8 new bits
+ addx.b d1,d1 ; shift bit queue, high bit into carry
+ ; and shift 1 from carry into bit queue
+.got_bit: rts
diff --git a/tools/apultra/asm/6809/unaplib.s b/tools/apultra/asm/6809/unaplib.s
new file mode 100644
index 0000000..641c3f4
--- /dev/null
+++ b/tools/apultra/asm/6809/unaplib.s
@@ -0,0 +1,125 @@
+; unaplib.s - aPLib decompressor for 6809 - 157 bytes
+; in: x = start of compressed data
+; y = start of decompression buffer
+; out: y = end of decompression buffer + 1
+; Copyright (C) 2020 Emmanuel Marty
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+ lda #$80 ; initialize empty bit queue
+ sta <apbitbuf,pcr ; plus bit to roll into carry
+ leau ,x ; U = compressed data pointer (X is reused below)
+apcplit ldb ,u+ ; copy literal byte
+apwtlit stb ,y+
+ lda #$03 ; set 'follows literal' flag
+aptoken bsr apgetbit ; read 'literal or match' bit
+ bcc apcplit ; if 0: literal
+ bsr apgetbit ; read '8+n bits or other type' bit
+ bcs apother ; if 11x: other type of match
+ sta <aplwm+2,pcr ; patch 'follows literal' flag into the SUBD operand at aplwm
+ bsr apgamma2 ; 10: read gamma2-coded high offset bits
+aplwm subd #$0000 ; high offset bits == 2 when follows_literal == 3 ? (operand patched above)
+ bcc apnorep ; if not, not a rep-match
+ bsr apgamma2 ; read repmatch length
+ bra apgotlen ; go copy large match
+apnorep tfr b,a ; transfer high offset bits to A
+ ldb ,u+ ; read low offset byte in B
+ std <aprepof+1,pcr ; patch match offset into the LDD operand at aprepof
+ tfr d,x ; transfer offset to X
+ bsr apgamma2 ; read match length
+ cmpx #$7D00 ; offset >= 32000 ?
+ bge apincby2 ; if so, increase match len by 2
+ cmpx #$0500 ; offset >= 1280 ?
+ bge apincby1 ; if so, increase match len by 1
+ cmpx #$80 ; offset >= 128 ?
+ bge apgotlen ; if so, keep the length; otherwise fall through and add 2
+apincby2 addd #1
+apincby1 addd #1
+apgotlen pshs u ; save source compressed data pointer
+ tfr d,x ; copy match length to X
+aprepof ldd #$aaaa ; load match offset (operand is patched at run time)
+ nega ; reverse sign of offset in D
+ negb
+ sbca #0
+ leau d,y ; put backreference start address in U (dst+offset)
+apcpymt lda ,u+ ; copy matched byte
+ sta ,y+
+ leax -1,x ; decrement X
+ bne apcpymt ; loop until all matched bytes are copied
+ puls u ; restore source compressed data pointer
+ lda #$02 ; clear 'follows literal' flag
+ bra aptoken
+apdibits bsr apgetbit ; read bit
+ rolb ; push into B, then fall through to read a second bit into carry
+apgetbit lsl <apbitbuf,pcr ; shift bit queue, and high bit into carry
+ bne apdone ; queue not empty, bits remain
+ pshs a ; save reg A
+ lda ,u+ ; read 8 new bits
+ rola ; shift bit queue, and high bit into carry
+ sta <apbitbuf,pcr ; save bit queue
+ puls a,pc ; pop reg A and return
+apbitbuf fcb $00 ; bit queue (data byte stored between the routines)
+apshort clrb ; start with an empty 4-bit offset
+ bsr apdibits ; read 2 offset bits
+ rolb
+ bsr apdibits ; read 4 offset bits
+ rolb
+ beq apwtlit ; go write zero
+ negb ; negate offset in B so that b,y addresses dst - offset
+ ldb b,y ; load backreferenced byte from dst+offset
+ bra apwtlit ; go write backreferenced byte
+apgamma2 ldd #$1 ; init to 1 so it gets shifted to 2 below
+apg2loop bsr apgetbit ; read data bit
+ rolb ; shift into D
+ rola
+ bsr apgetbit ; read continuation bit
+ bcs apg2loop ; loop until a zero continuation bit is read
+apdone rts ; shared return, also used by apgetbit and for EOD
+apother bsr apgetbit ; read '7+1 match or short literal' bit
+ bcs apshort ; if 111: 4 bit offset for 1-byte copy
+ ldb ,u+ ; read low bits of offset + length bit in B
+ beq apdone ; check for EOD (zero byte) and return if so
+ clra ; clear high bits in A
+ lsrb ; shift offset in place, shift length bit into carry
+ std <aprepof+1,pcr ; patch match offset into the LDD operand at aprepof
+ ldb #$01 ; len in B will be 2*1+carry:
+ rolb ; shift length, and carry into B
+ bra apgotlen ; go copy match
diff --git a/tools/apultra/asm/6809/unaplib_6309.s b/tools/apultra/asm/6809/unaplib_6309.s
new file mode 100644
index 0000000..9e8ed71
--- /dev/null
+++ b/tools/apultra/asm/6809/unaplib_6309.s
@@ -0,0 +1,139 @@
+; unaplib_6309.s - aPLib decompressor for H6309 - 131 bytes
+; in: x = start of compressed data
+; y = start of decompression buffer
+; out: y = end of decompression buffer + 1
+; Copyright (C) 2020 Emmanuel Marty
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+; Original M6809 version written by Emmanuel Marty with Hitachi 6309 enhancements
+; added by Doug Masten.
+; Main advantage of H6309 CPU is the "TFM" instruction which can copy one
+; byte of memory in 3 clock cycles vs a traditional copy loop that takes
+; 20 clock cycles.
+; Options:
+; Define variable to point to a DP memory location for a memory space
+; and speed optimization.
+; ex. APLIB_VAR equ <memory location>
+; Defined variable to disable long offsets >= 32000 for a speed and space
+; optimization. Only enable this if you know what you are doing.
+; define options
+ ifdef APLIB_VAR
+apbitbuf equ APLIB_VAR ; bit queue lives at the caller-supplied location (use DP memory for mem & space optimization)
+ else
+apbitbuf fcb 0 ; bit queue (DEFAULT - one byte reserved here, ahead of the code)
+ endc
+; Main decompressor entry point. The default apbitbuf byte may sit directly
+; in front of this code, so callers must enter through this label.
+; in: x = start of compressed data, y = start of decompression buffer
+; out: y = end of decompression buffer + 1
+; Define APLIB_LONG_OFFSET_DISABLE to assemble without the check for
+; match offsets >= 32000.
+apl_decompress
+ lda #$80 ; initialize empty bit queue
+ sta apbitbuf ; plus bit to roll into carry
+ tfr x,u ; U = compressed data pointer (X is reused for the match offset)
+apcplit ldb ,u+ ; copy literal byte
+apwtlit stb ,y+
+ ldb #3 ; set 'follows literal' flag
+aptoken bsr apgetbit ; read 'literal or match' bit
+ bcc apcplit ; if 0: literal
+ bsr apgetbit ; read '8+n bits or other type' bit
+ bcs apother ; if 11x: other type of match
+ bsr apgamma2 ; 10: read gamma2-coded high offset bits
+ clra ; D = 'follows literal' flag (B) with the high byte cleared
+ subr d,w ; high offset bits == 2 when follows_literal == 3 ?
+ bcc apnorep ; if not, not a rep-match
+ bsr apgamma2 ; read repmatch length
+ bra apgotlen ; go copy large match, reusing the offset still in X
+apnorep tfr f,a ; transfer high offset bits to A
+ ldb ,u+ ; read low offset byte in B
+ tfr d,x ; save match offset
+ bsr apgamma2 ; read match length
+ ifndef APLIB_LONG_OFFSET_DISABLE
+ cmpx #$7D00 ; offset >= 32000 ?
+ bge apincby2 ; if so, increase match len by 2
+ endc
+ cmpx #$0500 ; offset >= 1280 ?
+ bge apincby1 ; if so, increase match len by 1
+ cmpx #$80 ; offset >= 128 ?
+ bge apgotlen ; if so, keep the length; otherwise fall through and add 2
+apincby2 incw
+apincby1 incw
+apgotlen tfr y,d ; transfer dst to D
+ subr x,d ; put backreference start address in D (dst - offset)
+ tfm d+,y+ ; copy matched bytes (W = match length)
+ ldb #2 ; clear 'follows literal' flag
+ bra aptoken
+apgamma2 ldw #1 ; init to 1 so it gets shifted to 2 below
+loop@ bsr apgetbit ; read data bit
+ rolw ; shift into W
+ bsr apgetbit ; read continuation bit
+ bcs loop@ ; loop until a zero continuation bit is read
+ rts ; W = gamma2 value
+apdibits bsr apgetbit ; read bit
+ rolb ; push into B, then fall through to read a second bit into carry
+apgetbit lsl apbitbuf ; shift bit queue, and high bit into carry
+ bne aprts ; queue not empty, bits remain
+ lda ,u+ ; read 8 new bits (overwrites A)
+ rola ; shift bit queue, and high bit into carry
+ sta apbitbuf ; save bit queue
+aprts rts
+apshort clrb ; start with an empty 4-bit offset
+ bsr apdibits ; read 2 offset bits
+ rolb
+ bsr apdibits ; read 4 offset bits
+ rolb
+ beq apwtlit ; if zero, go write it
+ negb ; negate offset in B so that b,y addresses dst - offset
+ ldb b,y ; load backreferenced byte from dst+offset
+ bra apwtlit ; go write it
+apother bsr apgetbit ; read '7+1 match or short literal' bit
+ bcs apshort ; if 111: 4 bit offset for 1-byte copy
+ ldb ,u+ ; read low bits of offset + length bit in B
+ beq aprts ; check for EOD and exit if so
+ clra ; clear high bits in A
+ lsrb ; shift offset in place, shift length bit into carry
+ tfr d,x ; save match offset
+ ldb #1 ; len in B will be 2*1+carry:
+ rolb ; shift length, and carry into B
+ tfr d,w ; W = match length (A is still 0)
+ bra apgotlen ; go copy match
diff --git a/tools/apultra/asm/6809/unaplib_6309_b.s b/tools/apultra/asm/6809/unaplib_6309_b.s
new file mode 100644
index 0000000..8343edf
--- /dev/null
+++ b/tools/apultra/asm/6809/unaplib_6309_b.s
@@ -0,0 +1,143 @@
+; unaplib_6309_b.s - aPLib backward decompressor for H6309 - 139 bytes
+; in: x = last byte of compressed data
+; y = last byte of decompression buffer
+; out: y = first byte of decompressed data
+; Copyright (C) 2020 Emmanuel Marty
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+; Original M6809 version written by Emmanuel Marty with Hitachi 6309 enhancements
+; added by Doug Masten.
+; Main advantage of H6309 CPU is the "TFM" instruction which can copy one
+; byte of memory in 3 clock cycles vs a traditional copy loop that takes
+; 20 clock cycles.
+; Options:
+; Define variable to point to a DP memory location for a memory space
+; and speed optimization.
+; ex. APLIB_VAR equ <memory location>
+; Defined variable to disable long offsets >= 32000 for a speed and space
+; optimization. Only enable this if you know what you are doing.
+; define options
+ ifdef APLIB_VAR
+apbitbuf equ APLIB_VAR ; bit queue lives at the caller-supplied location (use DP memory for mem & space optimization)
+ else
+apbitbuf fcb 0 ; bit queue (DEFAULT - one byte reserved here, ahead of the code)
+ endc
+; Main backward decompressor entry point. The default apbitbuf byte may sit
+; directly in front of this code, so callers must enter through this label.
+; in: x = last byte of compressed data, y = last byte of decompression buffer
+; out: y = first byte of decompressed data
+; Define APLIB_LONG_OFFSET_DISABLE to assemble without the check for
+; match offsets >= 32000.
+apl_decompress
+ lda #$80 ; initialize empty bit queue
+ sta apbitbuf ; plus bit to roll into carry
+ leau 1,x ; U = source pointer, biased by 1 for the pre-decrement reads
+ leay 1,y ; Y = destination pointer, biased by 1 for the pre-decrement writes
+apcplit ldb ,-u ; copy literal byte
+apwtlit stb ,-y
+ ldb #3 ; set 'follows literal' flag
+aptoken bsr apgetbit ; read 'literal or match' bit
+ bcc apcplit ; if 0: literal
+ bsr apgetbit ; read '8+n bits or other type' bit
+ bcs apother ; if 11x: other type of match
+ bsr apgamma2 ; 10: read gamma2-coded high offset bits
+ clra ; D = 'follows literal' flag (B) with the high byte cleared
+ subr d,w ; high offset bits == 2 when follows_literal == 3 ?
+ bcc apnorep ; if not, not a rep-match
+ bsr apgamma2 ; read repmatch length
+ bra apgotlen ; go copy large match, reusing the offset still in X
+apnorep tfr f,a ; transfer high offset bits to A
+ ldb ,-u ; read low offset byte in B
+ tfr d,x ; save match offset
+ bsr apgamma2 ; read match length
+ ifndef APLIB_LONG_OFFSET_DISABLE
+ cmpx #$7D00 ; offset >= 32000 ?
+ bge apincby2 ; if so, increase match len by 2
+ endc
+ cmpx #$0500 ; offset >= 1280 ?
+ bge apincby1 ; if so, increase match len by 1
+ cmpx #$80 ; offset >= 128 ?
+ bge apgotlen ; if so, keep the length; otherwise fall through and add 2
+apincby2 incw
+apincby1 incw
+apgotlen tfr y,d ; transfer dst to D
+ addr x,d ; put backreference start address in D (dst + offset)
+ decd ; TFM does not pre-decrement: point D at the first byte to read
+ leay -1,y ; and Y at the first byte to write
+ tfm d-,y- ; copy matched bytes (W = match length)
+ leay 1,y ; restore the pre-decrement bias on Y
+ ldb #2 ; clear 'follows literal' flag
+ bra aptoken
+apgamma2 ldw #1 ; init to 1 so it gets shifted to 2 below
+loop@ bsr apgetbit ; read data bit
+ rolw ; shift into W
+ bsr apgetbit ; read continuation bit
+ bcs loop@ ; loop until a zero continuation bit is read
+ rts ; W = gamma2 value
+apdibits bsr apgetbit ; read bit
+ rolb ; push into B, then fall through to read a second bit into carry
+apgetbit lsl apbitbuf ; shift bit queue, and high bit into carry
+ bne aprts ; queue not empty, bits remain
+ lda ,-u ; read 8 new bits, walking the source backwards (overwrites A)
+ rola ; shift bit queue, and high bit into carry
+ sta apbitbuf ; save bit queue
+aprts rts
+apshort clrb ; start with an empty 4-bit offset
+ bsr apdibits ; read 2 offset bits
+ rolb
+ bsr apdibits ; read 4 offset bits
+ rolb
+ beq apwtlit ; if zero, go write it
+ decb ; we load below without predecrement, adjust here
+ ldb b,y ; load backreferenced byte from dst+offset-1 (Y is at the last written byte)
+ bra apwtlit ; go write it
+apother bsr apgetbit ; read '7+1 match or short literal' bit
+ bcs apshort ; if 111: 4 bit offset for 1-byte copy
+ ldb ,-u ; read low bits of offset + length bit in B
+ beq aprts ; check for EOD and exit if so
+ clra ; clear high bits in A
+ lsrb ; shift offset in place, shift length bit into carry
+ tfr d,x ; save match offset
+ ldb #1 ; len in B will be 2*1+carry:
+ rolb ; shift length, and carry into B
+ tfr d,w ; W = match length (A is still 0)
+ bra apgotlen ; go copy match
diff --git a/tools/apultra/asm/6809/unaplib_b.s b/tools/apultra/asm/6809/unaplib_b.s
new file mode 100644
index 0000000..02f943c
--- /dev/null
+++ b/tools/apultra/asm/6809/unaplib_b.s
@@ -0,0 +1,122 @@
+; unaplib_b.s - aPLib backward decompressor for 6809 - 154 bytes
+; in: x = last byte of compressed data
+; y = last byte of decompression buffer
+; out: y = first byte of decompressed data
+; Copyright (C) 2020 Emmanuel Marty
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+; Main decode loop of the backward decompressor.
+; Entry (per the file header): X = last byte of compressed data,
+; Y = last byte of the decompression buffer. Both streams are walked
+; downwards with pre-decrement addressing: U reads the source, Y writes the
+; destination. On reaching aptoken, A holds the 'follows literal' flag
+; (3 after a literal, 2 after a match).
+; NOTE: the code stores into its own operands (aplwm+2, aprepof+2) and keeps
+; apbitbuf inline, so it has to execute from writable memory.
+ lda #$80 ; initialize empty bit queue
+ sta <apbitbuf,pcr ; plus bit to roll into carry
+ leau 1,x ; U = X+1, so the first ,-u read fetches the byte at X
+ leay 1,y ; Y = Y+1, so the first ,-y write lands on the byte at the original Y
+apcplit ldb ,-u ; copy literal byte
+apwtlit stb ,-y ; write the byte in B to the output (apshort also enters here)
+ lda #$03 ; set 'follows literal' flag
+aptoken bsr apgetbit ; read 'literal or match' bit
+ bcc apcplit ; if 0: literal
+ bsr apgetbit ; read '8+n bits or other type' bit
+ bcs apother ; if 11x: other type of match
+ sta <aplwm+2,pcr ; store 'follows literal' flag into the low byte of the subd operand at aplwm
+ bsr apgamma2 ; 10: read gamma2-coded high offset bits
+aplwm subd #$0000 ; high offset bits == 2 when follows_literal == 3 ? (operand patched just above)
+ bcc apnorep ; if not, not a rep-match
+ bsr apgamma2 ; read repmatch length
+ bra apgotlen ; go copy large match, reusing the offset still stored at aprepof
+apnorep tfr b,a ; transfer high offset bits to A (B = gamma value minus the flag)
+ ldb ,-u ; read low offset byte in B
+ std <aprepof+2,pcr ; store match offset into the displacement of the leau at aprepof
+ tfr d,x ; transfer offset to X
+ bsr apgamma2 ; read match length
+ cmpx #$7D00 ; offset >= 32000 ?
+ bge apincby2 ; if so, increase match len by 2
+ cmpx #$0500 ; offset >= 1280 ?
+ bge apincby1 ; if so, increase match len by 1
+ cmpx #$80 ; offset < 128 ?
+ bge apgotlen ; offset >= 128: no adjustment; otherwise fall through and add 2
+ ; NOTE(review): bge is a signed compare, so offsets >= $8000 appear to fail
+ ; all three tests and reach apincby2 by fall-through — confirm this is intended.
+apincby2 addd #1 ; length += 1, then falls into the second increment
+apincby1 addd #1 ; length += 1
+apgotlen pshs u ; save source compressed data pointer
+ tfr d,x ; copy match length to X
+aprepof leau $aaaa,y ; put backreference start address in U (dst+offset); $aaaa is a placeholder patched at run time
+apcpymt lda ,-u ; copy matched byte
+ sta ,-y
+ leax -1,x ; decrement X
+ bne apcpymt ; loop until all matched bytes are copied
+ puls u ; restore source compressed data pointer
+ lda #$02 ; clear 'follows literal' flag
+ bra aptoken
+; apdibits: read two stream bits; the first is rotated into B here, the
+; second is left in carry by falling through into apgetbit.
+; apgetbit: shift the next bit of the queue in apbitbuf into carry. When the
+; shift leaves the queue zero, a new byte is fetched through U (pre-decrement),
+; rotated left through carry and stored as the new queue. A is saved and
+; restored around the refill, so callers keep their A value.
+apdibits bsr apgetbit ; read bit
+ rolb ; push into B
+apgetbit lsl <apbitbuf,pcr ; shift bit queue, and high bit into carry
+ bne apdone ; queue not empty, bits remain (apdone is the rts at the end of apgamma2)
+ pshs a ; push reg A
+ lda ,-u ; read 8 new bits
+ rola ; shift bit queue, and high bit into carry
+ sta <apbitbuf,pcr ; save bit queue
+ puls a,pc ; pop reg A and return
+apbitbuf fcb $00 ; bit queue (data byte stored inline with the code)
+; apshort: 4-bit offset, single-byte copy. Four bits are collected in B by two
+; apdibits calls (each call rotates in one bit and returns the next in carry,
+; which the following rolb rotates in). A zero result goes straight to apwtlit
+; and writes a zero byte; otherwise the byte at (offset-1),Y is loaded into B
+; and written.
+apshort clrb
+ bsr apdibits ; read 2 offset bits
+ rolb
+ bsr apdibits ; read 2 more offset bits (4 in total)
+ rolb
+ beq apwtlit ; go write a zero
+ decb ; we load below without predecrement, adjust here
+ ldb b,y ; load backreferenced byte from dst+offset
+ bra apwtlit ; go write backreferenced byte
+; apgamma2: read an interlaced gamma2-coded value into D.
+; D starts at 1; each pass shifts one data bit into D (rolb/rola) and then
+; reads a continuation bit, repeating while that bit is 1. At least one data
+; bit is always shifted in, so the result is >= 2. A is overwritten.
+apgamma2 ldd #$1 ; init to 1 so it gets shifted to 2 below
+apg2loop bsr apgetbit ; read data bit
+ rolb ; shift into D
+ rola
+ bsr apgetbit ; read continuation bit
+ bcs apg2loop ; loop until a zero continuation bit is read
+apdone rts ; shared return: also the branch target of apgetbit and the EOD exit of apother
+; apother: handle the token types reached after a '11' prefix.
+; If the next bit is set, control goes to apshort. Otherwise one byte is read:
+; a zero byte means end of data (return through apdone); else bit 0 is the
+; length bit and the remaining seven bits, shifted right, form the match
+; offset, which is stored into the leau displacement at aprepof. The copy then
+; runs at apgotlen with the length (2 or 3) in D.
+apother bsr apgetbit ; read '7+1 match or short literal' bit
+ bcs apshort ; if 111: 4 bit offset for 1-byte copy
+ ldb ,-u ; read low bits of offset + length bit in B
+ beq apdone ; check for EOD
+ clra ; clear high bits in A
+ lsrb ; shift offset in place, shift length bit into carry
+ std <aprepof+2,pcr ; store match offset
+ ldb #$01 ; len in B will be 2*1+carry:
+ rolb ; shift length, and carry into B
+ bra apgotlen ; go copy match
diff --git a/tools/apultra/asm/8088/aplib_8088_fast.S b/tools/apultra/asm/8088/aplib_8088_fast.S
new file mode 100644
index 0000000..c535234
--- /dev/null
+++ b/tools/apultra/asm/8088/aplib_8088_fast.S
@@ -0,0 +1,178 @@
+; aplib_8088_fast.S - speed-optimized aPLib decompressor for 8088 - 188 bytes
+; Copyright (C) 2019 Emmanuel Marty
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+ segment .text
+ bits 16
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * ds:si: compressed aPLib data
+; * es:di: output buffer
+; output:
+; * ax: decompressed size
+; ---------------------------------------------------------------------------
+; apl_get_bit: move the next stream bit from the bit queue in al into carry.
+; If the shift empties the queue (al becomes zero), lodsb fetches a new byte
+; from ds:si and adc shifts it left while adding the carry just produced.
+; NOTE(review): the %%gotbit label and the %endmacro line are not visible in
+; this listing — confirm against the original file.
+%macro apl_get_bit 0 ; read bit into carry
+ add al,al ; shift bit queue, and high bit into carry
+ jnz %%gotbit ; queue not empty, bits remain
+ lodsb ; read 8 new bits
+ adc al,al ; shift bit queue, and high bit into carry
+; Decompressor body. Inputs per the header above: ds:si = compressed data,
+; es:di = output buffer. The starting di is pushed here so that the
+; decompressed size can be computed from it on exit.
+ push di ; remember decompression offset
+ cld ; make string operations go forward
+ ; === register map ===
+ ; al: bit queue
+ ; ah: unused, but value is trashed
+ ; bx: follows_literal
+ ; cx: scratch register for reading gamma2 codes and storing copy length
+ ; dx: match offset (and rep-offset)
+ ; si: input (compressed data) pointer
+ ; di: output (decompressed data) pointer
+ ; bp: temporary value, trashed
+ mov al,080H ; clear bit queue(al) and set high bit to move into carry
+ xor dx,dx ; invalidate rep offset
+ movsb ; read and write literal byte
+ mov bx,03H ; set follows_literal(bx) to 3
+ apl_get_bit ; read 'literal or match' bit
+ jnc .literal ; if 0: literal
+ ; 1x: match
+ apl_get_bit ; read '8+n bits or other type' bit
+ jc .other ; 11x: other type of match
+ ; 10: 8+n bits match
+ call .get_gamma2 ; read gamma2-coded high offset bits
+ sub cx,bx ; high offset bits == 2 when follows_literal == 3 ?
+ ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+ ; is == 2 can never borrow; only 2 - 3 sets carry and selects a rep-match)
+ jae .not_repmatch ; if not, not a rep-match
+ call .get_gamma2 ; read match length
+ jmp short .got_len ; go copy
+ mov dh,cl ; transfer high offset bits to dh
+ mov dl,[si] ; read low offset byte in dl
+ inc si
+ call .get_gamma2 ; read match length
+ cmp dh,07DH ; offset >= 32000 ?
+ jae .increase_len_by2 ; if so, increase match len by 2
+ cmp dh,05H ; offset >= 1280 ?
+ jae .increase_len_by1 ; if so, increase match len by 1
+ cmp dx,0080H ; offset < 128 ?
+ jae .got_len ; offset >= 128: no adjustment; below 128 fall through and add 2 (otherwise it would be a 7+1 copy)
+ inc cx ; increase length
+ inc cx ; increase length
+ ; copy cx bytes from match offset dx
+ push ds ; save ds:si (current pointer to compressed data)
+ mov bp,si
+ push es
+ pop ds ; ds = es, so the copy source is read from the output buffer
+ mov si,di ; point to destination in es:di - offset in dx
+ sub si,dx
+ rep movsb ; copy matched bytes
+ mov si,bp ; restore ds:si
+ pop ds
+ mov bl,02H ; set follows_literal to 2 (bx is unmodified by match commands)
+ jmp short .next_command
+ ; read gamma2-coded value into cx
+ ; cx starts at 1; each pass shifts in one data bit and then reads a
+ ; continuation bit, so the returned value is always >= 2.
+ ; Reads bits through apl_get_bit, which updates al and may advance si.
+ xor cx,cx ; initialize to 1 so that value will start at 2
+ inc cx ; when shifted left in the adc below
+ apl_get_bit ; read data bit
+ adc cx,cx ; shift into cx
+ apl_get_bit ; read continuation bit
+ jc .gamma2_loop ; loop until a zero continuation bit is read
+ ret
+ ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+ ; cx is cleared first: it becomes the length (2 or 3) on the 7+1 path and
+ ; the 4-bit offset accumulator on the short-literal path.
+ xor cx,cx
+ apl_get_bit ; read '7+1 match or short literal' bit
+ jc .short_literal ; 111: 4 bit offset for 1-byte copy
+ ; 110: 7 bits offset + 1 bit length
+ mov dl,[si] ; read offset + length in dl
+ inc si
+ inc cx ; prepare cx for length below
+ shr dl,1 ; shift len bit into carry, and offset in place
+ je .done ; if zero offset: EOD
+ adc cx,cx ; len in cx: 1*2 + carry bit = 2 or 3
+ xor dh,dh ; clear high bits of offset
+ jmp short .got_len
+ ; 4 bits offset / 1 byte copy
+ ; cl is zero on entry (cleared before the dispatch bit was read); four
+ ; stream bits are shifted into it to form an offset of 0..15.
+ apl_get_bit ; read 4 offset bits
+ adc cl,cl
+ apl_get_bit
+ adc cl,cl
+ apl_get_bit
+ adc cl,cl
+ apl_get_bit
+ adc cl,cl
+ xchg ax,cx ; preserve bit queue in cx, put offset in ax
+ jz .write_zero ; if offset is 0, write a zero byte (flags still come from the last adc)
+ ; short offset 1-15
+ mov bx,di ; point to destination in es:di - offset in ax
+ sub bx,ax ; we trash bx, it will be reset to 3 when we loop
+ mov al,[es:bx] ; read byte from short offset
+ stosb ; copy matched byte
+ mov ax,cx ; restore bit queue in al
+ jmp .next_command_after_literal
+ ; exit path: return the decompressed size in ax
+ pop ax ; retrieve the original decompression offset
+ xchg di,ax ; compute decompressed size
+ sub ax,di ; ax = final output pointer - original output pointer
+ ret
diff --git a/tools/apultra/asm/8088/aplib_8088_small.S b/tools/apultra/asm/8088/aplib_8088_small.S
new file mode 100644
index 0000000..542991e
--- /dev/null
+++ b/tools/apultra/asm/8088/aplib_8088_small.S
@@ -0,0 +1,177 @@
+; aplib_8088_small.S - size-optimized aPLib decompressor for 8088 - 145 bytes
+; Copyright (C) 2019 Emmanuel Marty
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+ segment .text
+ bits 16
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * ds:si: compressed aPLib data
+; * es:di: output buffer
+; output:
+; * ax: decompressed size
+; ---------------------------------------------------------------------------
+; Decompressor body. Inputs per the header above: ds:si = compressed data,
+; es:di = output buffer. The starting di is pushed here so that the
+; decompressed size can be computed from it on exit. bp is loaded with the
+; address of .get_bit so single bits can be read with 'call bp'.
+ push di ; remember decompression offset
+ cld ; make string operations go forward
+ ; === register map ===
+ ; al: bit queue
+ ; ah: unused, but value is trashed
+ ; bx: follows_literal
+ ; cx: scratch register for reading gamma2 codes and storing copy length
+ ; dx: match offset (and rep-offset)
+ ; si: input (compressed data) pointer
+ ; di: output (decompressed data) pointer
+ ; bp: offset of .get_bit
+ mov al,080H ; clear bit queue(al) and set high bit to move into carry
+ xor dx,dx ; invalidate rep offset
+ mov bp,.get_bit ; load offset of .get_bit, to be used with call bp
+ movsb ; read and write literal byte
+ mov bx,03H ; set follows_literal(bx) to 3
+ call bp ; read 'literal or match' bit
+ jnc .literal ; if 0: literal
+ ; 1x: match
+ call bp ; read '8+n bits or other type' bit
+ jc .other ; 11x: other type of match
+ ; 10: 8+n bits match
+ call .get_gamma2 ; read gamma2-coded high offset bits
+ sub cx,bx ; high offset bits == 2 when follows_literal == 3 ?
+ ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+ ; is == 2 will never result in a negative value)
+ jae .not_repmatch ; if not, not a rep-match
+ call .get_gamma2 ; read match length
+ jmp short .got_len ; go copy
+ mov dh,cl ; transfer high offset bits to dh
+ mov dl,[si] ; read low offset byte in dl
+ inc si
+ call .get_gamma2 ; read match length
+ cmp dh,07DH ; offset >= 32000 ?
+ jae .increase_len_by2 ; if so, increase match len by 2
+ cmp dh,05H ; offset >= 1280 ?
+ jae .increase_len_by1 ; if so, increase match len by 1
+ cmp dx,0080H ; offset < 128 ?
+ jae .got_len ; offset >= 128: no adjustment; below 128 fall through and add 2 (otherwise it would be a 7+1 copy)
+ inc cx ; increase length
+ inc cx ; increase length
+ ; copy cx bytes from match offset dx
+ push ds ; save ds:si (current pointer to compressed data)
+ push si
+ push es
+ pop ds ; ds = es, so the copy source is read from the output buffer
+ mov si,di ; point to destination in es:di - offset in dx
+ sub si,dx
+ rep movsb ; copy matched bytes
+ pop si ; restore ds:si
+ pop ds
+ mov bl,02H ; set follows_literal to 2 (bx is unmodified by match commands)
+ jmp short .next_command
+ ; read gamma2-coded value into cx
+ ; cx starts at 1; each .get_dibits call shifts one data bit into cx and
+ ; returns the continuation bit in carry, so the result is always >= 2.
+ xor cx,cx ; initialize to 1 so that value will start at 2
+ inc cx ; when shifted left in the adc below
+ call .get_dibits ; read data bit, shift into cx, read continuation bit
+ jc .gamma2_loop ; loop until a zero continuation bit is read
+ ret
+ ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+ ; cx is cleared first: it becomes the length (2 or 3) on the 7+1 path and
+ ; the 4-bit offset accumulator on the short-literal path.
+ xor cx,cx
+ call bp ; read '7+1 match or short literal' bit
+ jc .short_literal ; 111: 4 bit offset for 1-byte copy
+ ; 110: 7 bits offset + 1 bit length
+ mov dl,[si] ; read offset + length in dl
+ inc si
+ inc cx ; prepare cx for length below
+ shr dl,1 ; shift len bit into carry, and offset in place
+ je .done ; if zero offset: EOD
+ adc cx,cx ; len in cx: 1*2 + carry bit = 2 or 3
+ xor dh,dh ; clear high bits of offset
+ jmp short .got_len
+ ; 4 bits offset / 1 byte copy
+ ; cx is zero on entry (cleared before the dispatch bit was read). Each
+ ; .get_dibits call shifts one bit into cx and returns a second bit in
+ ; carry, which the following adc shifts in: four bits in total.
+ call .get_dibits ; read 2 offset bits
+ adc cx,cx
+ call .get_dibits ; read 2 offset bits
+ adc cx,cx
+ xchg ax,cx ; preserve bit queue in cx, put offset in ax
+ jz .write_zero ; if offset is 0, write a zero byte (flags still come from the last adc)
+ ; short offset 1-15
+ mov bx,di ; point to destination in es:di - offset in ax
+ sub bx,ax ; we trash bx, it will be reset to 3 when we loop
+ mov al,[es:bx] ; read byte from short offset
+ stosb ; copy matched byte
+ xchg ax,cx ; restore bit queue in al
+ jmp .next_command_after_literal
+ ; exit path: return the decompressed size in ax
+ pop ax ; retrieve the original decompression offset
+ xchg di,ax ; compute decompressed size
+ sub ax,di ; ax = final output pointer - original output pointer
+ ret
+ ; read two bits: the first is shifted into cx, the second is returned in
+ ; carry by falling through into the single-bit reader below
+ call bp ; read data bit
+ adc cx,cx ; shift into cx
+ ; single-bit reader: shift the bit queue in al; when it empties, fetch a
+ ; new byte from ds:si and shift that instead
+ add al,al ; shift bit queue, and high bit into carry
+ jnz .got_bit ; queue not empty, bits remain
+ lodsb ; read 8 new bits
+ adc al,al ; shift bit queue, and high bit into carry
+ ret
diff --git a/tools/apultra/asm/ARM7TDMI/aplib_arm.s b/tools/apultra/asm/ARM7TDMI/aplib_arm.s
new file mode 100644
index 0000000..b6d0cef
--- /dev/null
+++ b/tools/apultra/asm/ARM7TDMI/aplib_arm.s
@@ -0,0 +1,150 @@
+@APlib ARM7 decompressor by Dan Weiss, based on the original C version
+@Takes in raw apacked data, NOT data created by the 'safe' compressor.
+@Code is from the PocketNES NES Emulator for GBA
+@Code is formatted for GNU Assembler
+ src .req r0
+ dest .req r1
+ byte .req r2
+ mask .req r3
+ gamma .req r4
+ lwm .req r6
+ recentoff .req r7
+ temp .req r8
+.global depack
+.type depack STT_FUNC
+@r0 = src
+@r1 = dest
+@r2 = byte
+@r3 = rotating bit mask
+@r4 = increasing gamma
+@r6 = lwm
+@r7 = recentoff
+@r8 = lr copy/scratch
+ .macro GETBIT @3 instructions
+ movs mask,mask,ror #1
+ ldrcsb byte,[src],#1
+ tst byte,mask
+ .endm
+ .macro GETBITGAMMA @5 instructions
+ mov gamma,gamma,lsl #1
+ addne gamma,gamma,#1
+ .endm
+@This initialization code can go into slow memory
+ stmfd sp!,{r4-r10,lr}
+ ldrb temp,[src],#1
+ strb temp,[dest],#1
+ ldr mask,=0x01010101
+ b aploop_nolwm
+@This inner-loop code should be placed into fast memory
+ @depack enters here
+ mov lwm,#0
+ bne apbranch1
+ ldrb temp,[src],#1
+ strb temp,[dest],#1
+ b aploop_nolwm
+ beq apbranch2
+ beq apbranch3
+ @get an offset
+ mov gamma,#0
+ addne gamma,gamma,#1
+ cmp gamma,#0
+ ldrneb gamma,[dest,-gamma]
+ strb gamma,[dest],#1
+ b aploop_nolwm
+ @use 7 bit offset, length = 2 or 3
+ @if a zero is encountered here, it's EOF
+ ldrb gamma,[src],#1
+ movs recentoff,gamma,lsr #1
+ beq done
+ ldrcsb temp,[dest,-recentoff]
+ strcsb temp,[dest],#1
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ mov lwm,#1
+ b aploop
+ @use a gamma code * 256 for offset, another gamma code for length
+ bl ap_getgamma
+ sub gamma,gamma,#2
+ cmp lwm,#0
+ bne ap_is_lwm
+ mov lwm,#1
+ cmp gamma,#0
+ bne ap_not_zero_gamma
+ @if gamma code is 2, use old recent offset, and a new gamma code for length
+ bl ap_getgamma
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ subs gamma,gamma,#1
+ bne copyloop1
+ b aploop
+ sub gamma,gamma,#1
+ ldrb temp,[src],#1
+ add recentoff,temp,gamma,lsl #8
+ bl ap_getgamma
+ @gamma=length
+ cmp recentoff,#32000
+ addge gamma,gamma,#1
+ cmp recentoff,#1280
+ addge gamma,gamma,#1
+ cmp recentoff,#128
+ addlt gamma,gamma,#2
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ subs gamma,gamma,#1
+ bne copyloop2
+ b aploop
+ mov gamma,#1
+ bne ap_getgammaloop
+ bx lr
+ ldmfd sp!,{r4-r10,lr}
+ bx lr
+.unreq src
+.unreq dest
+.unreq byte
+.unreq mask
+.unreq gamma
+.unreq lwm
+.unreq recentoff
+.unreq temp
diff --git a/tools/apultra/asm/Z80/unaplib_fast.asm b/tools/apultra/asm/Z80/unaplib_fast.asm
new file mode 100644
index 0000000..c21eb5d
--- /dev/null
+++ b/tools/apultra/asm/Z80/unaplib_fast.asm
@@ -0,0 +1,339 @@
+; Speed-optimized ApLib decompressor by spke & uniabis (ver.06 01-05/06/2020, 235 bytes)
+; The original Z80 decompressors for ApLib were written by Dan Weiss (Dwedit),
+; then tweaked by Francisco Javier Pena Pareja (utopian),
+; and optimized by Jaime Tejedor Gomez (Metalbrain) and Antonio Villena.
+; This is a new "implicit state" decompressor heavily optimized for speed by spke.
+; (It is 12 bytes shorter and 18% faster than the previously fastest
+; 247b decompressor by Metalbrain and Antonio Villena.)
+; ver.00 by spke (21/08/2018-01/09/2018, 244 bytes, an edit of the existing 247b decompressor);
+; ver.01 by spke (12-13/11/2018, 234(-10) bytes, +3% speed using the state machine for LWM);
+; ver.02 by spke (06/08/2019, +1% speed);
+; ver.03 by spke (27/08/2019, 236(+2) bytes, +1% speed using partly expanded LDIR);
+; ver.04 by spke (spring 2020, added full revision history and support for long offsets)
+; ver.05 by spke (17-31/05/2020, 230(-6) bytes, +3% speed, added support for backward compression) <- BROKEN, DO NOT USE
+; ver.06 by uniabis & spke (01-07/06/2020, 235(+5) bytes, +1% speed, added support for HD64180)
+; The data must be compressed using any compressor for ApLib capable of generating raw data.
+; At present, two best available compressors are:
+; "APC" by Sven-Ake Dahl: https://github.com/svendahl/cap or
+; "apultra" by Emmanuel Marty: https://github.com/emmanuel-marty/apultra
+; The compression can be done as follows:
+; apc.exe e <sourcefile> <outfile>
+; or
+; apultra.exe <sourcefile> <outfile>
+; A decent compressor was written by r57shell (although it is worse than compressors above):
+; http://gendev.spritesmind.net/forum/viewtopic.php?p=32548#p32548
+; The use of the official ApLib compressor by Joergen Ibsen is not recommended.
+; The decompression is done in the standard way:
+; ld hl,FirstByteOfCompressedData
+; ld de,FirstByteOfMemoryForDecompressedData
+; call DecompressApLib
+; Backward decompression is also supported; you can compress files backward using:
+; apultra.exe -b <sourcefile> <outfile>
+; uncomment option "DEFINE BackwardDecompression" and decompress the resulting files using:
+; ld hl,LastByteOfCompressedData
+; ld de,LastByteOfMemoryForDecompressedData
+; call DecompressApLib
+; The decompressor modifies AF, AF', BC, DE, HL, IX.
+; Of course, ApLib compression algorithms are (c) 1998-2014 Joergen Ibsen,
+; see http://www.ibsensoftware.com/ for more information
+; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+; DEFINE SupportLongOffsets ; +4 bytes for long offset support. slows decompression down by 1%, but may be needed to decompress files >=32K
+; DEFINE BackwardDecompression ; decompress data compressed backwards, -10 bytes, speeds decompression up by 3%
+; DEFINE HD64180 ; -2 bytes for HD64180/Z180 support, slows decompression down by 1%
+ IFNDEF BackwardDecompression
+ inc hl
+ ldi
+ ldir
+ dec hl
+ ldd
+ lddr
+ ld a,(hl) : NEXT_HL : rla
+@Decompress: COPY_1 : scf
+LWM0: ;LWM = 0 (LWM stands for "Last Was Match"; a flag that we did not have a match)
+.ReloadByteC0 RELOAD_A : jr c,.Check2ndBit
+; case "0"+BYTE: copy a single literal
+.CASE0: COPY_1 ; first byte is always copied as literal
+; main decompressor loop
+.MainLoop: add a : jr nc,.CASE0 : jr z,.ReloadByteC0 ; "0"+BYTE = copy literal
+.Check2ndBit add a : jr nc,.CASE10 : jr z,.ReloadByteC1 ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+.Check3rdBit add a : call z,ReloadByte : jp c,LWM1.CASE111 ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+; branch "110"+[oooooool]: copy two or three bytes (bit "l") with the offset -1..-127 (bits "ooooooo"), or stop
+.CASE110: ; "use 7 bit offset, length = 2 or 3"
+ ; "if a zero is found here, it's EOF"
+ ld c,(hl) : rr c : ret z ; process EOF
+ ld b,0
+ IFNDEF HD64180
+ ld ixl,c : ld ixh,b ; save offset for future LWMs
+ push bc : pop ix
+ push hl ; save src
+ ld h,d : ld l,e ; HL = dest
+ jr c,.LengthIs3
+ IFNDEF BackwardDecompression
+ sbc hl,bc
+ add hl,bc
+ COPY_1 : COPY_1
+ jr .PreMainLoop
+ IFNDEF BackwardDecompression
+ or a : sbc hl,bc
+ add hl,bc
+ COPY_1 : COPY_1 : COPY_1
+ jr .PreMainLoop
+.ReloadByteC1 RELOAD_A : jr c,.Check3rdBit
+; branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+.CASE10: ; "use a gamma code * 256 for offset, another gamma code for length"
+ call GetGammaCoded
+ ; the original decompressor contains
+ ;
+ ; if ((LWM == 0) && (offs == 2)) { ... }
+ ; else {
+ ; if (LWM == 0) { offs -= 3; }
+ ; else { offs -= 2; }
+ ; }
+ ;
+ ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+ ; and to split the first condition by noticing that C-1 can never be zero
+ dec c : dec c : jr z,LWM1.KickInLWM
+.AfterLWM dec c : ld b,c : ld c,(hl) : NEXT_HL ; BC = offset
+ IFNDEF HD64180
+ ld ixl,c : ld ixh,b : push bc
+ push bc : push bc : pop ix
+ call GetGammaCoded ; BC = len*
+ ex (sp),hl
+ ; interpretation of length value is offset-dependent:
+ ; if (offs >= 32000) len++; if (offs >= 1280) len++; if (offs < 128) len+=2;
+ ; in other words,
+ ; (1 <= offs < 128) +=2
+ ; (128 <= offs < 1280) +=0
+ ; (1280 <= offs < 31999) +=1
+ ; NB offsets over 32000 need one more check, but other Z80 decompressors seem to ignore it. is it not needed?
+ ; interpretation of length value is offset-dependent
+ exa : ld a,h
+ IFDEF SupportLongOffsets
+ ; NB offsets over 32000 require an additional check, which is skipped in most
+ ; Z80 decompressors (seemingly as a performance optimization)
+ cp 32000/256 : jr nc,.Add2
+ cp 5 : jr nc,.Add1
+ or a : jr nz,.Add0
+ bit 7,l : jr nz,.Add0
+.Add2 inc bc
+.Add1 inc bc
+.Add0 ; for offs<128 : 4+4+7+7 + 4+7 + 8+7 + 6+6 = 60t
+ ; for offs>=1280 : 4+4+7+12 + 6 = 33t
+ ; for 128<=offs<1280 : 4+4+7+7 + 4+12 = 38t OR 4+4+7+7 + 4+7+8+12 = 53t
+.CopyMatch: ; this assumes that BC = len, DE = dest, HL = offset
+ ; and also that (SP) = src, while having NC
+ IFNDEF BackwardDecompression
+ ld a,e : sub l : ld l,a
+ ld a,d : sbc h
+ ld h,a : exa
+ exa
+.CopyMatchLDH add hl,de
+.PreMainLoop pop hl ; recover src
+LWM1: ; LWM = 1
+; main decompressor loop
+.MainLoop: add a : jr nc,LWM0.CASE0 : jr z,.ReloadByteC0 ; "0"+BYTE = copy literal
+.Check2ndBit add a : jr nc,.CASE10 : jr z,.ReloadByteC1 ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+.Check3rdBit add a : call z,ReloadByte : jr nc,LWM0.CASE110 ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+; case "111"+"oooo": copy a byte with offset -1..-15, or write zero to dest
+.CASE111: ld bc,%11100000
+ DUP 4
+ add a : call z,ReloadByte : rl c ; read short offset (4 bits)
+ ex de,hl : jr z,.WriteZero ; zero offset means "write zero" (NB: B is zero here)
+ ; "write a previous byte (1-15 away from dest)"
+ push hl ; BC = offset, DE = src, HL = dest
+ IFNDEF BackwardDecompression
+ sbc hl,bc ; HL = dest-offset (SBC works because branching above ensured NC)
+ add hl,bc
+ ld c,(hl)
+ pop hl
+.WriteZero ld (hl),c : NEXT_HL
+ ex de,hl : jp LWM0.MainLoop ; 10+4*(4+10+8)+4+7 + 11+15+7+10 + 7+4+6+10 = 179t
+.ReloadByteC0 RELOAD_A : jp nc,LWM0.CASE0
+ jr .Check2ndBit
+.ReloadByteC1 RELOAD_A : jr c,.Check3rdBit
+; branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+.CASE10: ; "use a gamma code * 256 for offset, another gamma code for length"
+ call GetGammaCoded
+ ; the original decompressor contains
+ ;
+ ; if ((LWM == 0) && (offs == 2)) { ... }
+ ; else {
+ ; if (LWM == 0) { offs -= 3; }
+ ; else { offs -= 2; }
+ ; }
+ ;
+ ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+ ; and to split the first condition by noticing that C-1 can never be zero
+ dec c : jr LWM0.AfterLWM
+; the re-use of the previous offset (LWM magic)
+.KickInLWM: ; "and a new gamma code for length"
+ inc c : call GetGammaCoded.ReadGamma ; BC = len
+ IFNDEF BackwardDecompression
+ push ix : ex (sp),hl : exa
+ jr LWM0.CopyMatch
+ push ix : ex (sp),hl
+ jr LWM0.CopyMatchLDH
+; interlaced gamma code reader
+; x0 -> 1x
+; x1y0 -> 1xy
+; x1y1z0 -> 1xyz etc
+; (technically, this is a 2-based variation of Exp-Golomb-1)
+; GetGammaCoded: read an interlaced gamma-coded value into BC, starting from 1.
+; .ReadGamma is a second entry point that keeps the caller's BC as the seed.
+; A holds the bit queue; 'add a' shifts the next bit into carry, and a zero
+; result means the queue is exhausted and must be refilled.
+; NOTE(review): RELOAD_A is a macro defined outside this listing; presumably
+; it refills A from (HL) and leaves the fetched bit in carry — confirm.
+GetGammaCoded: ld bc,1
+.ReadGamma add a : jr z,.ReloadByteRG1 ; data bit; queue empty -> reload variant below
+ rl c : rl b ; shift the data bit into BC
+ add a : ret nc ; NB: flag NC immediately says we do not need to reload our byte...
+ jr nz,.ReadGamma ; ...even better, flag NZ then automatically means flag C :)
+.ReloadByteRG2 RELOAD_A : ret nc : jr .ReadGamma ; continuation bit came from a fresh byte
+.ReloadByteRG1 RELOAD_A : rl c : rl b ; data bit came from a fresh byte
+ add a : ret nc : jr .ReadGamma ; continuation bit: 0 ends the code, 1 loops
+; pretty usual getbit for mixed datastreams
+; ReloadByte: subroutine wrapper around the RELOAD_A macro, reached through
+; 'call z,ReloadByte' when an 'add a' has emptied the bit queue.
+; NOTE(review): RELOAD_A itself is defined outside this listing.
+ReloadByte: RELOAD_A : ret
diff --git a/tools/apultra/asm/Z80/unaplib_small.asm b/tools/apultra/asm/Z80/unaplib_small.asm
new file mode 100644
index 0000000..280de15
--- /dev/null
+++ b/tools/apultra/asm/Z80/unaplib_small.asm
@@ -0,0 +1,258 @@
+; Size-optimized ApLib decompressor by spke & uniabis (ver.04 01-07/06/2020, 139 bytes)
+; The original Z80 decompressor for ApLib was written by Dan Weiss (Dwedit),
+; then tweaked by Francisco Javier Pena Pareja (utopian),
+; and optimized by Jaime Tejedor Gomez (Metalbrain).
+; This version was heavily re-optimized for size by spke.
+; (It is 17 bytes shorter and 22% faster than the 156b version by Metalbrain.)
+; ver.00 by spke (21/08/2018-01/09/2018, 141 bytes);
+; ver.01 by spke (spring 2019, 140(-1) bytes, slightly faster);
+; ver.02 by spke (05-07/01/2020, added full revision history, support for long offsets
+; and an option to use self-modifying code instead of IY)
+; ver.03 by spke (18-29/05/2020, +0.5% speed, added support for backward compression)
+; ver.04 by uniabis (01-07/06/2020, 139(-1) bytes, +1% speed, added support for HD64180)
+; The data must be compressed using any compressor for ApLib capable of generating raw data.
+; At present, two best available compressors are:
+; "APC" by Sven-Ake Dahl: https://github.com/svendahl/cap or
+; "apultra" by Emmanuel Marty: https://github.com/emmanuel-marty/apultra
+; The compression can be done as follows:
+; apc.exe e <sourcefile> <outfile>
+; or
+; apultra.exe <sourcefile> <outfile>
+; A decent compressor was written by r57shell (although it is worse than compressors above):
+; http://gendev.spritesmind.net/forum/viewtopic.php?p=32548#p32548
+; The use of the official ApLib compressor by Joergen Ibsen is not recommended.
+; The decompression is done in the standard way:
+; ld hl,FirstByteOfCompressedData
+; ld de,FirstByteOfMemoryForDecompressedData
+; call DecompressApLib
+; Backward decompression is also supported; you can compress files backward using:
+; apultra.exe -b <sourcefile> <outfile>
+; uncomment option "DEFINE BackwardDecompression" and decompress the resulting files using:
+; ld hl,LastByteOfCompressedData
+; ld de,LastByteOfMemoryForDecompressedData
+; call DecompressApLib
+; The decompressor modifies AF, AF', BC, DE, HL, IX.
+; Of course, ApLib compression algorithms are (c) 1998-2014 Joergen Ibsen,
+; see http://www.ibsensoftware.com/ for more information
+; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+; DEFINE FasterGetBit ; 16% speed-up at the cost of extra 4 bytes
+; DEFINE SupportLongOffsets ; +4 bytes for long offset support. slows decompression down by 1%, but may be needed to decompress files >=32K
+; DEFINE BackwardDecompression ; decompress data compressed backwards, -5 bytes, speeds decompression up by 3%
+ IFDEF FasterGetBit
+ add a : call z,ReloadByte
+ call GetOneBit
+ IFNDEF BackwardDecompression
+ inc hl
+ ldi
+ ldir
+ dec hl
+ ldd
+ lddr
+@DecompressApLib: ld a,128
+; case "0"+BYTE: copy a single literal
+CASE0: COPY_1 ; first byte is always copied as literal
+ResetLWM: ld b,-1 ; LWM = 0 (LWM stands for "Last Was Match"; a flag that we did not have a match)
+; main decompressor loop
+MainLoop: GET_BIT : jr nc,CASE0 ; "0"+BYTE = copy literal
+ GET_BIT : jr nc,CASE10 ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+ ld bc,%11100000
+ GET_BIT : jr nc,CASE110 ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+; case "111"+"oooo": copy a byte with offset -1..-15, or write zero to dest
+; Reads a 4-bit offset into C. The three leading 1 bits preloaded into C by
+; 'ld bc,%11100000' act as the loop counter: 'rl c' shifts one of them out
+; into carry on each of the first three passes and a 0 on the fourth.
+ReadFourBits GET_BIT ; read short offset (4 bits)
+ rl c : jr c,ReadFourBits
+ ex de,hl : jr z,WriteZero ; zero offset means "write zero" (NB: B is zero here)
+ ; "write a previous byte (1-15 away from dest)"
+ push hl ; BC = offset, DE = src, HL = dest
+ IFNDEF BackwardDecompression
+ sbc hl,bc ; HL = dest-offset (SBC works because branching above ensured NC)
+ add hl,bc ; HL = dest+offset (presumably the BackwardDecompression branch; the ELSE/ENDIF lines are not visible in this listing - confirm)
+ ld c,(hl) : pop hl ; C = referenced byte, HL = dest again
+WriteZero ld (hl),c : NEXT_HL
+ ex de,hl : jr ResetLWM ; write one byte, reset LWM
+; branch "110"+[oooooool]: copy two or three bytes (bit "l") with the offset -1..-127 (bits "ooooooo"), or stop
+CASE110: ; "use 7 bit offset, length = 2 or 3"
+ ; "if a zero is found here, it's EOF"
+ ld c,(hl) : rr c : ret z ; process EOF
+ push hl ; save src
+ ld h,b : ld l,c ; HL = offset
+ ; flag NC means len=2, flag C means len=3
+ ld c,1 : rl c : jr SaveLWMOffset
+; branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+CASE10: ; save state of LWM into A'
+ exa : ld a,b : exa
+ ; "use a gamma code * 256 for offset, another gamma code for length"
+ call GetGammaCoded
+ ; the original decompressor contains
+ ;
+ ; if ((LWM == 0) && (offs == 2)) { ... }
+ ; else {
+ ; if (LWM == 0) { offs -= 3; }
+ ; else { offs -= 2; }
+ ; }
+ ;
+ ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+ ; and to split the first condition by noticing that C-1 can never be zero
+ exa : add c : ld c,a : exa
+ ; "if gamma code is 2, use old r0 offset"
+ dec c : jr z,KickInLWM
+ dec c
+ ld b,c : ld c,(hl) : NEXT_HL ; BC = offset
+ push bc ; (SP) = offset
+ call GetGammaCoded ; BC = len*
+ ex (sp),hl ; HL = offset, (SP) = src
+ ; interpretation of length value is offset-dependent
+ exa : ld a,h
+ IFDEF SupportLongOffsets
+ ; NB offsets over 32000 require an additional check, which is skipped in most
+ ; Z80 decompressors (seemingly as a performance optimization)
+ cp 32000/256 : jr nc,.Add2
+ cp 5 : jr nc,.Add1
+ or a : jr nz,.Add0
+ bit 7,l : jr nz,.Add0
+.Add2 inc bc
+.Add1 inc bc
+.Add0 exa
+ push hl : pop ix ; save offset for future LWMs
+CopyMatch: ; this assumes that BC = len, DE = dest, HL = offset
+ ; and also that (SP) = src, while having NC
+ IFNDEF BackwardDecompression
+ push de
+ ex de,hl : sbc hl,de ; HL = dest-offset
+ pop de ; DE = dest
+ add hl,de ; HL = dest+offset
+ pop hl ; recover src
+ jr MainLoop
+; the re-use of the previous offset (LWM magic)
+KickInLWM: ; "and a new gamma code for length"
+ call GetGammaCoded ; BC = len
+ push ix : ex (sp),hl ; DE = dest, HL = prev offset
+ jr CopyMatch
+; interlaced gamma code reader
+; x0 -> 1x
+; x1y0 -> 1xy
+; x1y1z0 -> 1xyz etc
+; (technically, this is a 2-based variation of Exp-Golomb-1)
+; GetGammaCoded: read an interlaced gamma-coded value into BC, starting from 1.
+; Each pass shifts one data bit into BC, then reads a continuation bit and
+; returns as soon as that bit is 0, so the result is always >= 2.
+; NOTE(review): GET_BIT is a macro defined outside this listing.
+GetGammaCoded: ld bc,1
+ReadGamma GET_BIT : rl c : rl b ; data bit -> BC
+ GET_BIT : ret nc ; continuation bit: 0 ends the code
+ jr ReadGamma
+; pretty usual getbit for mixed datastreams
+ IFNDEF FasterGetBit
+; GetOneBit: shift the next bit of the queue in A into carry; return at once
+; while the queue is non-empty.
+; ReloadByte: refill A from (HL), step HL with NEXT_HL, and rotate the new
+; byte left through carry.
+GetOneBit: add a : ret nz
+ReloadByte: ld a,(hl) : NEXT_HL
+ rla : ret
diff --git a/tools/apultra/asm/x86/aplib_x86_fast.asm b/tools/apultra/asm/x86/aplib_x86_fast.asm
new file mode 100644
index 0000000..9e41d31
--- /dev/null
+++ b/tools/apultra/asm/x86/aplib_x86_fast.asm
@@ -0,0 +1,180 @@
+; aplib_x86_fast.asm - speed-optimized aPLib decompressor for x86 - 188 bytes
+; Copyright (C) 2019 Emmanuel Marty
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+ segment .text
+ bits 32
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * esi: compressed aPLib data
+; * edi: output buffer
+; output:
+; * eax: decompressed size
+; ---------------------------------------------------------------------------
+ %ifndef BIN
+ global apl_decompress
+ global _apl_decompress
+ %endif
+ ; uint32_t apl_decompress(const void *Source, void *Destination);
+%macro apl_get_bit 0 ; read the next bit of the bit queue (al) into carry, refilling al from [esi] when it runs empty
+ add al,al ; shift bit queue, and high bit into carry
+ jnz %%gotbit ; queue not empty, bits remain (the %%gotbit label itself is not visible in this excerpt)
+ lodsb ; read 8 new bits (lodsb leaves the flags alone, so carry still holds the marker bit shifted out above)
+ adc al,al ; shift bit queue: marker bit re-enters at bit 0, and high bit goes into carry
+ pushad ; save all general registers (restored by popad on exit)
+ %ifdef CDECL
+ mov esi, [esp+32+4] ; esi = aPLib compressed data (1st stack argument: above the 32 bytes of pushad plus the return address)
+ mov edi, [esp+32+8] ; edi = output (2nd stack argument)
+ %endif
+ ; === register map ===
+ ; al: bit queue
+ ; ah: unused, but value is trashed
+ ; bx: follows_literal
+ ; cx: scratch register for reading gamma2 codes and storing copy length
+ ; dx: match offset (and rep-offset)
+ ; si: input (compressed data) pointer
+ ; di: output (decompressed data) pointer
+ ; bp: temporary value, trashed
+ mov al,080H ; clear bit queue(al) and set high bit to move into carry
+ xor edx, edx ; invalidate rep offset
+ movsb ; read and write literal byte ([esi] -> [edi], both pointers advance)
+ mov ebx,03H ; set follows_literal(bx) to 3
+ apl_get_bit ; read 'literal or match' bit
+ jnc .literal ; if 0: literal
+ ; 1x: match
+ apl_get_bit ; read '8+n bits or other type' bit
+ jc .other ; 11x: other type of match
+ ; 10: 8+n bits match
+ call .get_gamma2 ; read gamma2-coded high offset bits into ecx
+ sub ecx,ebx ; high offset bits == 2 when follows_literal == 3 ?
+ ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+ ; is == 2 will never borrow, i.e. never result in a negative value)
+ jae .not_repmatch ; no borrow: not a rep-match
+ call .get_gamma2 ; read match length
+ jmp .got_len ; go copy, re-using the rep offset still held in edx
+ mov edx,ecx ; transfer high offset bits to edx (moved up into bits 8+ by the shl below)
+ shl edx, 8 ; make room for the low offset byte in dl
+ mov dl,[esi] ; read low offset byte in dl
+ inc esi ; consume that byte
+ call .get_gamma2 ; read match length
+ cmp edx,07D00H ; offset >= 32000 ?
+ jae .increase_len_by2 ; if so, increase match len by 2
+ cmp edx,0500H ; offset >= 1280 ?
+ jae .increase_len_by1 ; if so, increase match len by 1
+ cmp edx,0080H ; offset < 128 ?
+ jae .got_len ; offset >= 128: length is used as read; below 128 falls through and gets +2, otherwise it would be a 7+1 copy
+ inc ecx ; increase length
+ inc ecx ; increase length
+ ; copy ecx bytes from match offset edx
+ push esi ; save the compressed-data pointer across the copy
+ mov esi,edi ; point to destination in edi - offset in edx
+ sub esi,edx ; esi = match source inside the output already written
+ rep movsb ; copy matched bytes
+ pop esi ; restore the compressed-data pointer
+ mov bl,02H ; set follows_literal to 2 (bx is unmodified by match commands)
+ jmp .next_command
+ ; read gamma2-coded value into ecx
+ xor ecx,ecx ; initialize to 1 so that value will start at 2
+ inc ecx ; when shifted left in the adc below
+ apl_get_bit ; read data bit
+ adc ecx,ecx ; shift into ecx
+ apl_get_bit ; read continuation bit
+ jc .gamma2_loop ; loop until a zero continuation bit is read
+ ret
+ ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+ xor ecx,ecx ; ecx = 0: accumulator for the length or the 4-bit offset read below
+ apl_get_bit ; read '7+1 match or short literal' bit
+ jc .short_literal ; 111: 4 bit offset for 1-byte copy
+ ; 110: 7 bits offset + 1 bit length
+ movzx edx,byte[esi] ; read offset + length in dl, upper bits of edx cleared
+ inc esi ; consume that byte
+ inc ecx ; prepare ecx for length below
+ shr dl,1 ; shift len bit into carry, and offset in place
+ je .done ; if zero offset: EOD
+ adc ecx,ecx ; len in ecx: 1*2 + carry bit = 2 or 3
+ jmp .got_len
+ ; 4 bits offset / 1 byte copy
+ apl_get_bit ; read 4 offset bits
+ adc ecx,ecx
+ apl_get_bit
+ adc ecx,ecx
+ apl_get_bit
+ adc ecx,ecx
+ apl_get_bit
+ adc ecx,ecx
+ xchg eax,ecx ; preserve bit queue in ecx, put offset in eax (xchg leaves the flags from the last adc intact)
+ jz .write_zero ; if offset is 0, write a zero byte
+ ; short offset 1-15
+ mov ebx,edi ; point to destination in edi - offset in eax
+ sub ebx,eax ; we trash ebx, it will be reset to 3 when we loop
+ mov al,[ebx] ; read byte from short offset
+ stosb ; copy matched byte
+ mov eax,ecx ; restore bit queue in al
+ jmp .next_command_after_literal
+ sub edi, [esp+32+8] ; compute decompressed size = end pointer - Destination stack argument; NOTE(review): without CDECL the start pointer arrived in edi, not on the stack - confirm this slot is valid in that build
+ mov [esp+28], edi ; overwrite the eax slot saved by pushad, so popad hands the size back in eax
+ popad
+ ret
diff --git a/tools/apultra/asm/x86/aplib_x86_small.asm b/tools/apultra/asm/x86/aplib_x86_small.asm
new file mode 100644
index 0000000..ada00f6
--- /dev/null
+++ b/tools/apultra/asm/x86/aplib_x86_small.asm
@@ -0,0 +1,159 @@
+; aplib_x86_small.asm - size-optimized aPLib decompressor for x86 - 185 bytes
+; Copyright (C) 2019 Emmanuel Marty
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+ segment .text
+ bits 32
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * esi: compressed aPLib data
+; * edi: output buffer
+; output:
+; * eax: decompressed size
+; ---------------------------------------------------------------------------
+ %ifndef BIN
+ global apl_decompress
+ global _apl_decompress
+ %endif
+ pushad ; save all general registers (restored by popad on exit)
+ %ifdef CDECL
+ mov esi, [esp+32+4] ; esi = aPLib compressed data (1st stack argument: above the 32 bytes of pushad plus the return address)
+ mov edi, [esp+32+8] ; edi = output (2nd stack argument)
+ %endif
+ ; === register map ===
+ ; al: bit queue
+ ; ah: unused, but value is trashed
+ ; ebx: follows_literal
+ ; ecx: scratch register for reading gamma2 codes and storing copy length
+ ; edx: match offset (and rep-offset)
+ ; esi: input (compressed data) pointer
+ ; edi: output (decompressed data) pointer
+ ; ebp: offset of .get_bit
+ mov al,080H ; clear bit queue(al) and set high bit to move into carry
+ xor edx, edx ; invalidate rep offset in edx
+ call .init_get_bit ; pushes the address of the code that follows; presumably never returned to, but popped into ebp (see "pop ebp" below)
+ call ebp ; read data bit
+ adc ecx,ecx ; shift into ecx
+ add al,al ; shift bit queue, and high bit into carry
+ jnz .got_bit ; queue not empty, bits remain
+ lodsb ; read 8 new bits (flags untouched, carry still holds the marker bit shifted out above)
+ adc al,al ; shift bit queue: marker bit re-enters at bit 0, and high bit goes into carry
+ ret
+ pop ebp ; load offset of .get_bit, to be used with call ebp
+ add ebp, .get_bit - .get_dibits ; adjust from .get_dibits (presumably the popped address) to .get_bit
+ movsb ; read and write literal byte ([esi] -> [edi], both pointers advance)
+ push 03H ; push/pop pair instead of mov ebx,3: 3 bytes of code instead of 5
+ pop ebx ; set follows_literal(ebx) to 3
+ call ebp ; read 'literal or match' bit
+ jnc .literal ; if 0: literal
+ ; 1x: match
+ call ebp ; read '8+n bits or other type' bit
+ jc .other ; 11x: other type of match
+ ; 10: 8+n bits match
+ call .get_gamma2 ; read gamma2-coded high offset bits into ecx
+ sub ecx,ebx ; high offset bits == 2 when follows_literal == 3 ?
+ ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+ ; is == 2 will never result in a negative value)
+ jae .not_repmatch ; no borrow: not a rep-match
+ call .get_gamma2 ; read match length
+ jmp .got_len ; go copy, re-using the rep offset still held in edx
+ mov edx,ecx ; transfer high offset bits to edx (moved up into bits 8+ by the shl below)
+ shl edx,8 ; make room for the low offset byte in dl
+ mov dl,[esi] ; read low offset byte in dl
+ inc esi ; consume that byte
+ call .get_gamma2 ; read match length
+ cmp edx,7D00H ; offset >= 32000 ?
+ jae .increase_len_by2 ; if so, increase match len by 2
+ cmp edx,0500H ; offset >= 1280 ?
+ jae .increase_len_by1 ; if so, increase match len by 1
+ cmp edx,0080H ; offset < 128 ?
+ jae .got_len ; offset >= 128: length is used as read; below 128 falls through and gets +2, otherwise it would be a 7+1 copy
+ inc ecx ; increase length
+ inc ecx ; increase length
+ ; copy ecx bytes from match offset edx
+ push esi ; save esi (current pointer to compressed data)
+ mov esi,edi ; point to destination in edi - offset in edx
+ sub esi,edx ; esi = match source inside the output already written
+ rep movsb ; copy matched bytes
+ pop esi ; restore esi
+ mov bl,02H ; set follows_literal to 2 (ebx is unmodified by match commands)
+ jmp .next_command
+ ; read gamma2-coded value into ecx
+ xor ecx,ecx ; initialize to 1 so that value will start at 2
+ inc ecx ; when shifted left in the adc below
+ call .get_dibits ; read data bit, shift into ecx, read continuation bit
+ jc .gamma2_loop ; loop until a zero continuation bit is read
+ ret
+ ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+ xor ecx,ecx ; ecx = 0: accumulator for the length or the 4-bit offset read below
+ call ebp ; read '7+1 match or short literal' bit
+ jc .short_literal ; 111: 4 bit offset for 1-byte copy
+ ; 110: 7 bits offset + 1 bit length
+ movzx edx,byte[esi] ; read offset + length in dl, upper bits of edx cleared
+ inc esi ; consume that byte
+ inc ecx ; prepare ecx for length below
+ shr dl,1 ; shift len bit into carry, and offset in place
+ je .done ; if zero offset: EOD
+ adc ecx,ecx ; len in ecx: 1*2 + carry bit = 2 or 3
+ jmp .got_len
+ ; 4 bits offset / 1 byte copy
+ call .get_dibits ; read 2 offset bits (the first is shifted into ecx inside the call, the second is returned in carry)
+ adc ecx,ecx ; shift the second bit into ecx
+ call .get_dibits ; read 2 offset bits
+ adc ecx,ecx ; shift the fourth bit into ecx; ZF now tells whether the offset is 0
+ xchg eax,ecx ; preserve bit queue in ecx, put offset in eax (xchg leaves the flags from the adc intact)
+ jz .write_zero ; if offset is 0, write a zero byte
+ ; short offset 1-15
+ mov ebx,edi ; point to destination in edi - offset in eax
+ sub ebx,eax ; we trash ebx, it will be reset to 3 when we loop
+ mov al,[ebx] ; read byte from short offset
+ stosb ; copy matched byte
+ xchg eax,ecx ; restore bit queue in al
+ jmp .next_command_after_literal
+ sub edi, [esp+32+8] ; compute decompressed size = end pointer - Destination stack argument; NOTE(review): without CDECL the start pointer arrived in edi, not on the stack - confirm this slot is valid in that build
+ mov [esp+28], edi ; overwrite the eax slot saved by pushad, so popad hands the size back in eax
+ popad
+ ret