summaryrefslogtreecommitdiff
path: root/tools/apultra/asm
diff options
context:
space:
mode:
authorJuan J. Martinez <jjm@usebox.net>2021-01-09 09:01:05 +0000
committerJuan J. Martinez <jjm@usebox.net>2021-01-09 09:01:05 +0000
commit9bcf1e97960c0da7322a868efdbc07e2650716fe (patch)
treede6d32ad5b0e567991bd3eb262902c15a77074d9 /tools/apultra/asm
parent3b31adf01305e522f7e28c1435fb47418ce43267 (diff)
downloadubox-msx-lib-9bcf1e97960c0da7322a868efdbc07e2650716fe.tar.gz
ubox-msx-lib-9bcf1e97960c0da7322a868efdbc07e2650716fe.zip
Extra libs: ap.lib
aPLib support with apultra.
Diffstat (limited to 'tools/apultra/asm')
-rw-r--r--tools/apultra/asm/6502/aplib_6502.asm257
-rw-r--r--tools/apultra/asm/6502/aplib_6502_b.asm218
-rw-r--r--tools/apultra/asm/68000/unaplib_68000.S117
-rw-r--r--tools/apultra/asm/6809/unaplib.s125
-rw-r--r--tools/apultra/asm/6809/unaplib_6309.s139
-rw-r--r--tools/apultra/asm/6809/unaplib_6309_b.s143
-rw-r--r--tools/apultra/asm/6809/unaplib_b.s122
-rw-r--r--tools/apultra/asm/8088/aplib_8088_fast.S178
-rw-r--r--tools/apultra/asm/8088/aplib_8088_small.S177
-rw-r--r--tools/apultra/asm/ARM7TDMI/aplib_arm.s150
-rw-r--r--tools/apultra/asm/Z80/unaplib_fast.asm339
-rw-r--r--tools/apultra/asm/Z80/unaplib_small.asm258
-rw-r--r--tools/apultra/asm/x86/aplib_x86_fast.asm180
-rw-r--r--tools/apultra/asm/x86/aplib_x86_small.asm159
14 files changed, 2562 insertions, 0 deletions
diff --git a/tools/apultra/asm/6502/aplib_6502.asm b/tools/apultra/asm/6502/aplib_6502.asm
new file mode 100644
index 0000000..1bc11b4
--- /dev/null
+++ b/tools/apultra/asm/6502/aplib_6502.asm
@@ -0,0 +1,257 @@
+; ***************************************************************************
+; ***************************************************************************
+;
+; aplib_6502.s
+;
+; NMOS 6502 decompressor for data stored in Jorgen Ibsen's aPLib format.
+;
+; Includes support for Emmanuel Marty's enhancements to the aPLib format.
+;
+; The code is 252 bytes long for standard format, 270 for enhanced format.
+;
+; This code is written for the ACME assembler.
+;
+; Copyright John Brandwood 2019.
+;
+; Distributed under the Boost Software License, Version 1.0.
+; (See accompanying file LICENSE_1_0.txt or copy at
+; http://www.boost.org/LICENSE_1_0.txt)
+;
+; ***************************************************************************
+; ***************************************************************************
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; Decompression Macros
+;
+
+ ;
+ ; Macro to increment the source pointer to the next page.
+ ;
+
+ !macro APL_INC_PAGE {
+ inc <apl_srcptr + 1 ; Step the hi-byte of the source pointer to the next page.
+ }
+
+ ;
+ ; Macro to read a byte from the compressed source data.
+ ;
+
+ !macro APL_GET_SRC {
+ lda (apl_srcptr),y ; A = source byte (invoked with Y == 0 in this file).
+ inc <apl_srcptr + 0 ; Advance the lo-byte; INC leaves carry untouched.
+ bne .skip ; Lo-byte did not wrap: pointer is up to date.
+ +APL_INC_PAGE ; Lo-byte wrapped to 0: move to the next page.
+.skip:
+ }
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; Data usage is last 12 bytes of zero-page.
+;
+
+apl_bitbuf = $F7 ; 1 byte: bit-buffer, consumed one bit at a time by ASL.
+apl_offset = $F8 ; 1 word: offset of the most recent match.
+apl_winptr = $FA ; 1 word: address a match is copied from.
+apl_srcptr = $FC ; 1 word: next byte of compressed data.
+apl_dstptr = $FE ; 1 word: next byte of output.
+apl_length = apl_winptr ; Alias: the length hi-byte reuses apl_winptr + 1.
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; apl_decompress - Decompress data stored in Jorgen Ibsen's aPLib format.
+;
+; Args: apl_srcptr = ptr to compessed data
+; Args: apl_dstptr = ptr to output buffer
+; Uses: lots!
+;
+; As an optimization, the code to handle window offsets > 64768 bytes has
+; been removed, since these don't occur with a 16-bit address range.
+;
+; As an optimization, the code to handle window offsets > 32000 bytes can
+; be commented-out, since these don't occur in typical 8-bit computer usage.
+;
+
+apl_decompress: ldy #0 ; Initialize source index.
+
+ lda #$80 ; Initialize an empty
+ sta <apl_bitbuf ; bit-buffer.
+
+ ;
+ ; 0 bbbbbbbb - One byte from compressed data, i.e. a "literal".
+ ;
+
+.literal: +APL_GET_SRC
+
+.write_byte: ldx #0 ; LWM=0.
+
+ sta (apl_dstptr),y ; Write the byte directly to
+ inc <apl_dstptr + 0 ; the output.
+ bne .next_tag ; Lo-byte did not wrap.
+ inc <apl_dstptr + 1 ; Lo-byte wrapped: next output page.
+
+.next_tag: asl <apl_bitbuf ; 0 bbbbbbbb
+ bne .skip0 ; Bits remain: carry holds the tag bit.
+ jsr .load_bit ; Buffer empty: refill, tag bit returned in carry.
+.skip0: bcc .literal
+
+.skip1: asl <apl_bitbuf ; 1 0 <offset> <length>
+ bne .skip2 ; Bits remain: carry holds the tag bit.
+ jsr .load_bit ; Buffer empty: refill, tag bit returned in carry.
+.skip2: bcc .copy_large
+
+ asl <apl_bitbuf ; 1 1 0 dddddddn
+ bne .skip3 ; Bits remain: carry holds the tag bit.
+ jsr .load_bit ; Buffer empty: refill, tag bit returned in carry.
+.skip3: bcc .copy_normal
+
+ ; 1 1 1 dddd - Copy 1 byte within 15 bytes (or zero).
+
+.copy_short: lda #$10 ; Marker bit: falls into carry after 4 ROLs.
+.nibble_loop: asl <apl_bitbuf
+ bne .skip4 ; Bits remain: carry holds the offset bit.
+ pha ; .load_bit overwrites A, so preserve it.
+ jsr .load_bit
+ pla ; PLA does not alter carry.
+.skip4: rol ; Shift the offset bit into A.
+ bcc .nibble_loop ; Loop until the marker bit is shifted out.
+ beq .write_byte ; Offset=0 means write zero.
+
+ eor #$FF ; Read the byte directly from
+ tay ; the destination window.
+ iny ; Y = 256 - offset.
+ dec <apl_dstptr + 1 ; Temporarily point one page lower ...
+ lda (apl_dstptr),y ; ... so this reads dstptr - offset.
+ inc <apl_dstptr + 1 ; Restore the output pointer.
+ ldy #0 ; Restore Y == 0.
+ beq .write_byte ; Always taken (Z set by LDY #0).
+
+ ;
+ ; 1 1 0 dddddddn - Copy 2 or 3 within 128 bytes.
+ ;
+
+.copy_normal: +APL_GET_SRC ; 1 1 0 dddddddn
+ lsr ; A = 7-bit offset, carry = length bit n.
+ beq .finished ; Offset 0 == EOF.
+
+ sta <apl_offset + 0 ; Preserve offset.
+ sty <apl_offset + 1 ; Hi-byte of offset = 0 (Y == 0).
+ tya ; Y == 0.
+ tax ; Bits 8..15 of length.
+ adc #2 ; Bits 0...7 of length (2 + n).
+ bne .do_match ; NZ from previous ADC.
+
+ ;
+ ; Subroutines for byte & bit handling.
+ ;
+
+.get_gamma: lda #1 ; Get a gamma-coded value.
+.gamma_loop: asl <apl_bitbuf ; Fetch the next data bit into carry.
+ bne .skip5 ; Bits remain in the buffer.
+ pha ; .load_bit overwrites A, so preserve it.
+ jsr .load_bit
+ pla ; PLA does not alter carry.
+.skip5: rol ; Shift the data bit into the lo-byte (A)
+ rol <apl_length + 1 ; and onwards into the hi-byte.
+ asl <apl_bitbuf ; Fetch the continuation bit into carry.
+ bne .skip6 ; Bits remain in the buffer.
+ pha ; .load_bit overwrites A, so preserve it.
+ jsr .load_bit
+ pla ; PLA does not alter carry.
+.skip6: bcs .gamma_loop ; Continuation bit set: read another data bit.
+
+.finished: rts ; All decompressed! (Also the return of .get_gamma.)
+
+ ;
+ ; 1 0 <offset> <length> - gamma-coded LZSS pair.
+ ;
+
+.copy_large: jsr .get_gamma ; Bits 8..15 of offset (min 2).
+ sty <apl_length + 1 ; Clear hi-byte of length.
+
+ cpx #1 ; CC if LWM==0, CS if LWM==1.
+ sbc #2 ; -3 if LWM==0, -2 if LWM==1.
+ bcs .normal_pair ; CC if LWM==0 && offset==2.
+
+ jsr .get_gamma ; Get length (A=lo-byte & CC).
+ ldx <apl_length + 1 ; X = hi-byte of length.
+ bcc .do_match ; Use previous Offset.
+
+.normal_pair: sta <apl_offset + 1 ; Save bits 8..15 of offset.
+
+ +APL_GET_SRC
+ sta <apl_offset + 0 ; Save bits 0...7 of offset.
+
+ jsr .get_gamma ; Get length (A=lo-byte & CC).
+ ldx <apl_length + 1 ; X = hi-byte of length.
+
+ ldy <apl_offset + 1 ; If offset < 256.
+ beq .lt256
+ cpy #$7D ; If offset >= 32000, length += 2.
+ bcs .match_plus2
+ cpy #$05 ; If offset >= 1280, length += 1.
+ bcs .match_plus1
+ bcc .do_match ; Always taken: 256 <= offset < 1280, no adjustment.
+.lt256: ldy <apl_offset + 0 ; If offset < 128, length += 2.
+ bmi .do_match ; Offset 128..255: no adjustment.
+
+ sec ; aPLib gamma returns with CC.
+
+.match_plus2: adc #1 ; CS, so ADC #2.
+ bcs .match_plus256 ; Lo-byte overflowed: bump the hi-byte.
+
+.match_plus1: adc #0 ; CS, so ADC #1, or CC if fall
+ bcc .do_match ; through from .match_plus2.
+
+.match_plus256: inx ; Propagate the lo-byte overflow into the hi-byte.
+
+.do_match: eor #$FF ; Negate the lo-byte of length
+ tay ; and check for zero.
+ iny ; Y = -(lo-byte of length).
+ beq .calc_addr ; Lo-byte is 0: only whole pages to copy.
+ eor #$FF ; Restore A = lo-byte of length.
+
+ inx ; Increment # of pages to copy.
+
+ clc ; Calc destination for partial
+ adc <apl_dstptr + 0 ; page.
+ sta <apl_dstptr + 0
+ bcs .calc_addr ; Carry stands in for the hi-byte decrement.
+ dec <apl_dstptr + 1 ; Net: dstptr += lo(length) - 256, so dstptr + Y = old dstptr.
+
+.calc_addr: sec ; Calc address of match.
+ lda <apl_dstptr + 0
+ sbc <apl_offset + 0
+ sta <apl_winptr + 0
+ lda <apl_dstptr + 1
+ sbc <apl_offset + 1
+ sta <apl_winptr + 1 ; winptr = dstptr - offset.
+
+.copy_page: lda (apl_winptr),y ; Copy one byte of the match.
+ sta (apl_dstptr),y
+ iny
+ bne .copy_page ; Until Y wraps to 0 (end of this page).
+ inc <apl_winptr + 1 ; Advance both pointers by one page.
+ inc <apl_dstptr + 1
+ dex ; Any full pages left to copy?
+ bne .copy_page
+
+ inx ; LWM=1.
+ jmp .next_tag
+
+ ;
+ ; Subroutines for byte & bit handling.
+ ;
+
+.load_bit: +APL_GET_SRC ; Reload an empty bit-buffer
+ rol ; from the compressed source.
+ sta <apl_bitbuf ; Carry-in becomes the new end marker; carry-out is the bit read.
+ rts ; Returns with the new bit in carry and A overwritten.
diff --git a/tools/apultra/asm/6502/aplib_6502_b.asm b/tools/apultra/asm/6502/aplib_6502_b.asm
new file mode 100644
index 0000000..7963e02
--- /dev/null
+++ b/tools/apultra/asm/6502/aplib_6502_b.asm
@@ -0,0 +1,218 @@
+; -----------------------------------------------------------------------------
+; aplib_6502_b.s - fast aPLib backward decompressor for 6502 - 253 bytes
+; written for the ACME assembler
+;
+; jsr apl_decompress to unpack data backwards.
+; create backwards compressed data with apultra -b or oapack -b
+;
+; in:
+; * apl_srcptr (low and high byte) = last byte of compressed data
+; * apl_dstptr (low and high byte) = last byte of decompression buffer
+;
+; out:
+; * apl_dstptr (low and high byte) = first byte of decompressed data
+;
+; Copyright (C) 2020 Emmanuel Marty
+; With parts of the code inspired by John Brandwood, Peter Ferrie
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+; -----------------------------------------------------------------------------
+
+ ; Zero page locations
+
+apl_gamma2_hi = $F6 ; 1 byte: high byte of the gamma2 value / match length
+apl_bitbuf = $F7 ; 1 byte: bit queue
+apl_offset = $F8 ; 2 bytes: offset of the most recent match
+apl_winptr = $FA ; 2 bytes: address a match is copied from
+apl_srcptr = $FC ; 2 bytes: next byte of compressed data (moves downwards)
+apl_dstptr = $FE ; 2 bytes: next byte of output (moves downwards)
+
+ ; Read a byte from the source into A. Trashes X
+
+ !macro APL_GET_SRC {
+ lda (apl_srcptr),y ; A = byte at the source pointer
+ ldx <apl_srcptr+0 ; test the low byte (this is what trashes X)
+ bne .src_page_done ; low byte non-zero: no borrow from the high byte
+ dec <apl_srcptr+1 ; low byte is 0: step to the previous page
+.src_page_done: dec <apl_srcptr+0 ; move the source pointer down by one
+ }
+
+ ; Write a byte to the destinatipn
+
+ !macro APL_PUT_DST {
+ sta (apl_dstptr),y ; store A at the destination pointer
+ lda <apl_dstptr+0 ; test the low byte (A is overwritten here)
+ bne .dst_page_done ; low byte non-zero: no borrow from the high byte
+ dec <apl_dstptr+1 ; low byte is 0: step to the previous page
+.dst_page_done: dec <apl_dstptr+0 ; move the destination pointer down by one
+ }
+
+ ; Read one bit from the source into the carry, trash A
+
+ !macro APL_GET_BIT {
+ asl <apl_bitbuf ; shift the next bit of the queue into carry
+ bne .has_bits ; queue still holds bits: carry is the result
+ jsr apl_load_bits ; queue ran empty: refill it (overwrites A)
+.has_bits:
+ }
+
+ ; Read one bit from the source into the carry, preserve A
+
+ !macro APL_GET_BIT_SAVEA {
+ asl <apl_bitbuf ; shift the next bit of the queue into carry
+ bne .has_bits ; queue still holds bits: carry is the result
+ pha ; apl_load_bits overwrites A, so save it
+ jsr apl_load_bits ; queue ran empty: refill it
+ pla ; restore A; PLA leaves carry untouched
+.has_bits:
+ }
+
+ ; Decompress aPLib data backwards
+
+apl_decompress: lda #$80 ; initialize empty bit queue
+ sta <apl_bitbuf ; plus bit to roll into carry
+ ldy #$00 ; clear Y for indirect addr
+
+.copy_literal: +APL_GET_SRC ; read literal from source
+.write_literal: +APL_PUT_DST ; write literal to destination
+
+ ldx #$00 ; clear 'follows match' flag
+
+.next_token: +APL_GET_BIT ; read 'literal or match' bit
+ bcc .copy_literal ; if 0: literal
+
+ +APL_GET_BIT ; read '8+n bits or other' bit
+ bcc .long_match ; if 10x: long 8+n bits match
+
+ ; 11x: other type of match
+
+ +APL_GET_BIT ; read '7+1 match or short literal' bit
+ bcs .short_match ; if 111: 4 bit offset for 1-byte copy
+
+ +APL_GET_SRC ; read low byte of offset + length bit
+ lsr ; shift offset into place, len bit into carry
+ beq .done ; offset of 0 marks end of data: return
+ sta <apl_offset+0 ; store low byte of offset
+ sty <apl_offset+1 ; set high byte of offset to 0
+
+ tya ; set A to 0
+ sty <apl_gamma2_hi ; set high byte of len to 0
+ adc #$02 ; A = 2 + len bit from carry, i.e. 2 or 3
+ ; now, low part of len is in A
+ ; high part of len in apl_gamma2_hi is 0
+ ; offset is written to apl_offset
+ bne .got_len ; always taken (A is 2 or 3): copy matched bytes
+
+.long_match: jsr .get_gamma2 ; 10: read gamma2 high offset bits in A
+ sty <apl_gamma2_hi ; zero out high byte of gamma2
+
+ cpx #$01 ; carry set after a match (X=1), clear after a literal (X=0)
+ sbc #$02 ; subtract 3 if following literal, 2 otherwise
+ bcs .no_repmatch ; no borrow: regular match with a new offset
+
+ jsr .get_gamma2 ; read repmatch length: low part in A
+ bcc .got_len ; go copy large match
+ ; (carry is always clear after .get_gamma2)
+
+.short_match: lda #$10 ; clear offset, load end bit into place
+.read_short_offs: +APL_GET_BIT_SAVEA ; read one bit of offset into carry
+ rol ; shift into A, shift end bit as well
+ bcc .read_short_offs ; loop until end bit is shifted out into carry
+
+ beq .write_literal ; zero offset means write a 0
+ tay ; Y = offset (1..15)
+ lda (apl_dstptr),y ; load backreferenced byte from dstptr + offset
+ ldy #$00 ; clear Y again
+ beq .write_literal ; always taken (Z set by LDY): write the byte
+
+.get_gamma2: lda #$01 ; 1 so it gets shifted to 2
+.gamma2_loop: +APL_GET_BIT_SAVEA ; read data bit
+ rol ; shift into low byte
+ rol <apl_gamma2_hi ; shift into high byte
+ +APL_GET_BIT_SAVEA ; read continuation bit
+ bcs .gamma2_loop ; loop until a zero continuation bit is read
+.done: rts ; shared return: end of .get_gamma2 and end of decompression
+
+.no_repmatch: sta <apl_offset+1 ; write high byte of offset
+ +APL_GET_SRC ; read low byte of offset from source
+ sta <apl_offset+0 ; store low byte of offset
+
+ jsr .get_gamma2 ; read match length: low part in A
+
+ ldx <apl_offset+1 ; high offset byte is zero?
+ beq .offset_1byte ; if so, offset < 256
+
+ ; offset is >= 256.
+
+ cpx #$7d ; offset >= 32000 (7d00) ?
+ bcs .offset_incby2 ; if so, increase match len by 2
+ cpx #$05 ; offset >= 1280 (0500) ?
+ bcs .offset_incby1 ; if so, increase match len by 1
+ bcc .got_len ; length is fine, go copy
+
+.offset_1byte: ldx <apl_offset+0 ; offset < 128 ?
+ bmi .got_len ; offset is 128..255: length is fine, go copy
+ sec ; offset < 128: carry must be set below to add 2
+
+.offset_incby2: adc #$01 ; add 1 + set carry (from bcs or sec)
+ bcs .len_inchi ; go add 256 to len if overflow
+
+ ; carry clear: fall through for no-op
+
+.offset_incby1: adc #$00 ; add 1 + carry
+ bcc .got_len ; no overflow of the low byte: length is final
+.len_inchi: inc <apl_gamma2_hi ; add 256 to len if low byte overflows
+
+.got_len: tax ; transfer low byte of len into X
+ beq .add_offset ; low byte is 0: the high byte alone counts the passes
+ inc <apl_gamma2_hi ; otherwise one extra pass for the partial count in X
+
+.add_offset: clc ; add dest + match offset
+ lda <apl_dstptr+0 ; low 8 bits
+ adc <apl_offset+0
+ sta <apl_winptr+0 ; store back reference address
+ lda <apl_dstptr+1 ; high 8 bits
+ adc <apl_offset+1
+ sta <apl_winptr+1 ; store high 8 bits of address
+
+.copy_match_loop: lda (apl_winptr),y ; read one byte of backreference
+ +APL_PUT_DST ; write byte to destination
+
+ lda <apl_winptr+0 ; decrement backreference address
+ bne .backref_page_done ; low byte non-zero: no borrow from the high byte
+ dec <apl_winptr+1 ; low byte is 0: step to the previous page
+.backref_page_done:
+ dec <apl_winptr+0
+
+ dex ; loop to copy all matched bytes
+ bne .copy_match_loop
+ dec <apl_gamma2_hi ; inner count done: any passes left?
+ bne .copy_match_loop ; X is 0 here, so the next pass copies 256 bytes
+
+ ; X is 0 when exiting the loop above
+ inx ; set 'follows match' flag
+ jmp .next_token ; go decode next token
+
+apl_load_bits: lda (apl_srcptr),y ; read 8 bits from source
+ rol ; shift bit queue, and high bit into carry
+ sta <apl_bitbuf ; save bit queue
+
+ lda <apl_srcptr+0 ; test the low byte of the source pointer
+ bne .bits_page_done ; low byte non-zero: no borrow from the high byte
+ dec <apl_srcptr+1 ; low byte is 0: step to the previous page
+.bits_page_done: dec <apl_srcptr+0 ; move source pointer down; LDA/DEC leave carry intact
+ rts ; returns the bit in carry; A is overwritten
diff --git a/tools/apultra/asm/68000/unaplib_68000.S b/tools/apultra/asm/68000/unaplib_68000.S
new file mode 100644
index 0000000..a60ae32
--- /dev/null
+++ b/tools/apultra/asm/68000/unaplib_68000.S
@@ -0,0 +1,117 @@
+; unaplib_68000.s - aPLib decompressor for 68000 - 154 bytes
+;
+; in: a0 = start of compressed data
+; a1 = start of decompression buffer
+; out: d0 = decompressed size
+;
+; Copyright (C) 2020 Emmanuel Marty
+; With parts of the code inspired by Franck "hitchhikr" Charlet
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+apl_decompress:
+ movem.l a2-a6/d2-d3,-(sp) ; save every register clobbered below
+
+ moveq #-128,d1 ; initialize empty bit queue
+ ; plus bit to roll into carry
+ lea 32000.w,a2 ; load 32000 offset constant
+ lea 1280.w,a3 ; load 1280 offset constant
+ lea 128.w,a4 ; load 128 offset constant
+ move.l a1,a5 ; save start of destination (used at .done)
+
+.literal: move.b (a0)+,(a1)+ ; copy literal byte
+.after_lit: moveq #3,d2 ; d2 = 3: previous token was a literal
+
+.next_token: bsr.s .get_bit ; read 'literal or match' bit
+ bcc.s .literal ; if 0: literal
+
+ bsr.s .get_bit ; read '8+n bits or other type' bit
+ bcs.s .other_match ; if 11x: other type of match
+
+ bsr.s .get_gamma2 ; 10: read gamma2-coded high offset bits
+ sub.l d2,d0 ; borrows only when d0 == 2 and d2 == 3
+ bcc.s .no_repmatch ; no borrow: not a rep-match
+
+ bsr.s .get_gamma2 ; read repmatch length
+ bra.s .got_len ; go copy, reusing the previous offset in d3
+
+.no_repmatch: lsl.l #8,d0 ; shift high offset bits into place
+ move.b (a0)+,d0 ; read low offset byte
+ move.l d0,d3 ; copy offset into d3
+
+ bsr.s .get_gamma2 ; read match length
+ cmp.l a2,d3 ; offset >= 32000 ?
+ bge.s .inc_by_2 ; if so, increase match len by 2
+ cmp.l a3,d3 ; offset >= 1280 ?
+ bge.s .inc_by_1 ; if so, increase match len by 1
+ cmp.l a4,d3 ; offset >= 128 ?
+ bge.s .got_len ; if so, length is final; else (< 128) add 2
+.inc_by_2: addq.l #1,d0 ; increase match len by 1
+.inc_by_1: addq.l #1,d0 ; increase match len by 1
+
+.got_len: move.l a1,a6 ; calculate backreference address
+ sub.l d3,a6 ; (dest - match offset)
+ subq.l #1,d0 ; dbf will loop until d0 is -1, not 0
+.copy_match: move.b (a6)+,(a1)+ ; copy matched byte
+ dbf d0,.copy_match ; loop for all matched bytes
+ moveq #2,d2 ; d2 = 2: previous token was a match
+ bra.s .next_token ; go decode next token
+
+.other_match: bsr.s .get_bit ; read '7+1 match or short literal' bit
+ bcs.s .short_match ; if 111: 4 bit offset for 1-byte copy
+
+ moveq #1,d0 ; 110: prepare match length
+ moveq #0,d3 ; clear high bits of offset
+ move.b (a0)+,d3 ; read low bits of offset + length bit
+ lsr.b #1,d3 ; shift offset into place, len into carry
+ beq.s .done ; check for EOD
+ addx.b d0,d0 ; len = (1 << 1) + carry bit, ie. 2 or 3
+ bra.s .got_len ; go copy match
+
+.short_match: moveq #0,d0 ; clear short offset before reading 4 bits
+ bsr.s .get_dibits ; read a data bit into d0, one into carry
+ addx.b d0,d0 ; shift second bit into d0
+ bsr.s .get_dibits ; read a data bit into d0, one into carry
+ roxl.b #1,d0 ; shift 4th bit in; unlike addx, Z is set on a zero result
+ beq.s .write_zero ; if offset is zero, write a 0
+
+ move.l a1,a6 ; calculate backreference address
+ sub.l d0,a6 ; (dest - short offset)
+ move.b (a6),d0 ; read matched byte
+.write_zero: move.b d0,(a1)+ ; write matched byte or 0
+ bra.s .after_lit ; mark as literal and go decode next token
+
+.done: move.l a1,d0 ; pointer to last decompressed byte + 1
+ sub.l a5,d0 ; minus start of buffer saved in a5 = size
+ movem.l (sp)+,a2-a6/d2-d3 ; restore saved registers (movem leaves d0 intact)
+ rts
+
+.get_gamma2: moveq #1,d0 ; init to 1 so it gets shifted to 2 below
+.gamma2_loop: bsr.s .get_dibits ; read data bit, shift into d0
+ ; and read continuation bit
+ bcs.s .gamma2_loop ; loop until a 0 continuation bit is read
+ rts
+
+.get_dibits: bsr.s .get_bit ; read bit
+ addx.l d0,d0 ; shift into d0
+ ; fall through
+.get_bit: add.b d1,d1 ; shift bit queue, high bit into carry
+ bne.s .got_bit ; queue not empty, bits remain
+ move.b (a0)+,d1 ; read 8 new bits
+ addx.b d1,d1 ; shift bit queue, high bit into carry
+ ; and shift 1 from carry into bit queue
+.got_bit: rts
diff --git a/tools/apultra/asm/6809/unaplib.s b/tools/apultra/asm/6809/unaplib.s
new file mode 100644
index 0000000..641c3f4
--- /dev/null
+++ b/tools/apultra/asm/6809/unaplib.s
@@ -0,0 +1,125 @@
+; unaplib.s - aPLib decompressor for 6809 - 157 bytes
+;
+; in: x = start of compressed data
+; y = start of decompression buffer
+; out: y = end of decompression buffer + 1
+;
+; Copyright (C) 2020 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+apl_decompress
+ lda #$80 ; initialize empty bit queue
+ sta <apbitbuf,pcr ; plus bit to roll into carry
+ leau ,x ; U = compressed data pointer (X is reused below)
+
+apcplit ldb ,u+ ; copy literal byte
+apwtlit stb ,y+ ; write B to the output and advance
+
+ lda #$03 ; set 'follows literal' flag
+
+aptoken bsr apgetbit ; read 'literal or match' bit
+ bcc apcplit ; if 0: literal
+
+ bsr apgetbit ; read '8+n bits or other type' bit
+ bcs apother ; if 11x: other type of match
+
+ sta <aplwm+2,pcr ; patch the flag into the operand of the subd at aplwm
+
+ bsr apgamma2 ; 10: read gamma2-coded high offset bits
+aplwm subd #$0000 ; high offset bits == 2 when follows_literal == 3 ?
+ bcc apnorep ; if not, not a rep-match
+
+ bsr apgamma2 ; read repmatch length
+ bra apgotlen ; go copy large match
+
+apnorep tfr b,a ; transfer high offset bits to A
+ ldb ,u+ ; read low offset byte in B
+ std <aprepof+1,pcr ; patch the offset into the operand of the ldd at aprepof
+ tfr d,x ; transfer offset to X
+
+ bsr apgamma2 ; read match length
+
+ cmpx #$7D00 ; offset >= 32000 ?
+ bge apincby2 ; if so, increase match len by 2
+ cmpx #$0500 ; offset >= 1280 ?
+ bge apincby1 ; if so, increase match len by 1
+ cmpx #$80 ; offset >= 128 ?
+ bge apgotlen ; if so, length is final; otherwise increase it by 2
+apincby2 addd #1
+apincby1 addd #1
+apgotlen pshs u ; save source compressed data pointer
+ tfr d,x ; copy match length to X
+
+aprepof ldd #$aaaa ; load match offset (operand is patched at run time)
+ nega ; reverse sign of offset in D
+ negb
+ sbca #0
+ leau d,y ; U = dst + D, where D now holds -offset
+
+apcpymt lda ,u+ ; copy matched byte
+ sta ,y+
+ leax -1,x ; decrement X
+ bne apcpymt ; loop until all matched bytes are copied
+
+ puls u ; restore source compressed data pointer
+
+ lda #$02 ; clear 'follows literal' flag
+ bra aptoken
+
+apdibits bsr apgetbit ; read bit
+ rolb ; push into B, then fall through to read one more bit
+apgetbit lsl <apbitbuf,pcr ; shift bit queue, and high bit into carry
+ bne apdone ; queue not empty, bits remain
+ pshs a ; save reg A
+ lda ,u+ ; read 8 new bits
+ rola ; shift bit queue, and high bit into carry
+ sta <apbitbuf,pcr ; save bit queue
+ puls a,pc ; pop reg A and return
+
+apbitbuf fcb $00 ; bit queue (stored inline, addressed PC-relative)
+
+apshort clrb ; B accumulates the 4-bit offset
+ bsr apdibits ; read 2 offset bits
+ rolb ; shift the second of them into B
+ bsr apdibits ; read 4 offset bits
+ rolb ; shift the fourth into B; Z is set if the offset is 0
+ beq apwtlit ; go write zero
+
+ negb ; negate the offset in B
+ ldb b,y ; load backreferenced byte from dst - offset
+ bra apwtlit ; go write backreferenced byte
+
+apgamma2 ldd #$1 ; init to 1 so it gets shifted to 2 below
+apg2loop bsr apgetbit ; read data bit
+ rolb ; shift into D
+ rola
+ bsr apgetbit ; read continuation bit
+ bcs apg2loop ; loop until a zero continuation bit is read
+apdone rts ; shared return, also branched to by apgetbit and apother
+
+apother bsr apgetbit ; read '7+1 match or short literal' bit
+ bcs apshort ; if 111: 4 bit offset for 1-byte copy
+
+ ldb ,u+ ; read low bits of offset + length bit in B
+ beq apdone ; a zero byte marks end of data: return
+ clra ; clear high bits in A
+ lsrb ; shift offset in place, shift length bit into carry
+ std <aprepof+1,pcr ; patch the offset into the operand of the ldd at aprepof
+ ldb #$01 ; len in B will be 2*1+carry:
+ rolb ; shift length, and carry into B
+ bra apgotlen ; go copy match
diff --git a/tools/apultra/asm/6809/unaplib_6309.s b/tools/apultra/asm/6809/unaplib_6309.s
new file mode 100644
index 0000000..9e8ed71
--- /dev/null
+++ b/tools/apultra/asm/6809/unaplib_6309.s
@@ -0,0 +1,139 @@
+; unaplib_6309.s - aPLib decompressor for H6309 - 131 bytes
+;
+; in: x = start of compressed data
+; y = start of decompression buffer
+; out: y = end of decompression buffer + 1
+;
+; Copyright (C) 2020 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+
+; Original M6809 version written by Emmanuel Marty with Hitachi 6309 enhancements
+; added by Doug Masten.
+;
+; Main advantage of H6309 CPU is the "TFM" instruction which can copy one
+; byte of memory in 3 clock cycles vs a traditional copy loop that takes
+; 20 clock cycles.
+
+; Options:
+; APLIB_VAR
+; Define variable to point to a DP memory location for a memory space
+; and speed optimization.
+; ex. APLIB_VAR equ <memory location>
+;
+; APLIB_LONG_OFFSET_DISABLE
+; Defined variable to disable long offsets >= 32000 for a speed and space
+; optimization. Only enable this if you know what you are doing.
+; ex. APLIB_LONG_OFFSET_DISABLE equ 1
+
+
+; define options
+ ifdef APLIB_VAR
+apbitbuf equ APLIB_VAR ; bit queue at the caller-supplied location (DP memory saves space and time)
+ else
+apbitbuf fcb 0 ; bit queue (DEFAULT - one byte assembled here, use extended memory)
+ endc
+
+
+apl_decompress
+ lda #$80 ; initialize empty bit queue
+ sta apbitbuf ; plus bit to roll into carry
+ tfr x,u ; U = compressed data pointer (X is reused for the offset)
+
+apcplit ldb ,u+ ; copy literal byte
+apwtlit stb ,y+ ; write B to the output and advance
+
+ ldb #3 ; set 'follows literal' flag
+
+aptoken bsr apgetbit ; read 'literal or match' bit
+ bcc apcplit ; if 0: literal
+
+ bsr apgetbit ; read '8+n bits or other type' bit
+ bcs apother ; if 11x: other type of match
+
+ bsr apgamma2 ; 10: read gamma2-coded high offset bits
+ clra ; D = 0:B, i.e. the 'follows literal' flag value
+ subr d,w ; high offset bits == 2 when follows_literal == 3 ?
+ bcc apnorep ; if not, not a rep-match
+
+ bsr apgamma2 ; read repmatch length
+ bra apgotlen ; go copy large match
+
+apnorep tfr f,a ; transfer high offset bits to A
+ ldb ,u+ ; read low offset byte in B
+ tfr d,x ; save match offset
+
+ bsr apgamma2 ; read match length
+
+ ifndef APLIB_LONG_OFFSET_DISABLE
+ cmpx #$7D00 ; offset >= 32000 ?
+ bge apincby2 ; if so, increase match len by 2
+ endc
+ cmpx #$0500 ; offset >= 1280 ?
+ bge apincby1 ; if so, increase match len by 1
+ cmpx #$80 ; offset >= 128 ?
+ bge apgotlen ; if so, length is final; otherwise increase it by 2
+apincby2 incw
+apincby1 incw
+
+apgotlen tfr y,d ; transfer dst to D
+ subr x,d ; put backreference start address in D (dst - offset)
+ tfm d+,y+ ; copy matched bytes (W holds the length)
+
+ ldb #2 ; clear 'follows literal' flag
+ bra aptoken
+
+apgamma2 ldw #1 ; init to 1 so it gets shifted to 2 below
+loop@ bsr apgetbit ; read data bit
+ rolw ; shift into W
+ bsr apgetbit ; read continuation bit
+ bcs loop@ ; loop until a zero continuation bit is read
+ rts ; result in W; apgetbit may have overwritten A
+
+apdibits bsr apgetbit ; read bit
+ rolb ; push into B, then fall through to read one more bit
+apgetbit lsl apbitbuf ; shift bit queue, and high bit into carry
+ bne aprts ; queue not empty, bits remain
+ lda ,u+ ; read 8 new bits (A is overwritten, not saved)
+ rola ; shift bit queue, and high bit into carry
+ sta apbitbuf ; save bit queue
+aprts rts ; shared return, also used by apother at end of data
+
+apshort clrb ; B accumulates the 4-bit offset
+ bsr apdibits ; read 2 offset bits
+ rolb ; shift the second of them into B
+ bsr apdibits ; read 4 offset bits
+ rolb ; shift the fourth into B; Z is set if the offset is 0
+ beq apwtlit ; if zero, go write it
+
+ negb ; negate the offset in B
+ ldb b,y ; load backreferenced byte from dst - offset
+ bra apwtlit ; go write it
+
+apother bsr apgetbit ; read '7+1 match or short literal' bit
+ bcs apshort ; if 111: 4 bit offset for 1-byte copy
+
+ ldb ,u+ ; read low bits of offset + length bit in B
+ beq aprts ; check for EOD and exit if so
+ clra ; clear high bits in A
+ lsrb ; shift offset in place, shift length bit into carry
+ tfr d,x ; save match offset
+ ldb #1 ; len in B will be 2*1+carry:
+ rolb ; shift length, and carry into B
+ tfr d,w ; W = match length (A is still 0)
+ bra apgotlen ; go copy match
diff --git a/tools/apultra/asm/6809/unaplib_6309_b.s b/tools/apultra/asm/6809/unaplib_6309_b.s
new file mode 100644
index 0000000..8343edf
--- /dev/null
+++ b/tools/apultra/asm/6809/unaplib_6309_b.s
@@ -0,0 +1,143 @@
+; unaplib_6309_b.s - aPLib backward decompressor for H6309 - 139 bytes
+;
+; in: x = last byte of compressed data
+; y = last byte of decompression buffer
+; out: y = first byte of decompressed data
+;
+; Copyright (C) 2020 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+
+; Original M6809 version written by Emmanuel Marty with Hitachi 6309 enhancements
+; added by Doug Masten.
+;
+; Main advantage of H6309 CPU is the "TFM" instruction which can copy one
+; byte of memory in 3 clock cycles vs a traditional copy loop that takes
+; 20 clock cycles.
+
+; Options:
+; APLIB_VAR
+; Define variable to point to a DP memory location for a memory space
+; and speed optimization.
+; ex. APLIB_VAR equ <memory location>
+;
+; APLIB_LONG_OFFSET_DISABLE
+; Defined variable to disable long offsets >= 32000 for a speed and space
+; optimization. Only enable this if you know what you are doing.
+; ex. APLIB_LONG_OFFSET_DISABLE equ 1
+
+
+; define options
+ ifdef APLIB_VAR
+apbitbuf equ APLIB_VAR ; bit queue at the caller-supplied location (DP memory saves space and time)
+ else
+apbitbuf fcb 0 ; bit queue (DEFAULT - one byte assembled here, use extended memory)
+ endc
+
+
+apl_decompress
+ lda #$80 ; initialize empty bit queue
+ sta apbitbuf ; plus bit to roll into carry
+ leau 1,x ; U = source + 1: every read below pre-decrements
+ leay 1,y ; Y = dest + 1: every write below pre-decrements
+
+apcplit ldb ,-u ; copy literal byte
+apwtlit stb ,-y ; step the output down by one and write B
+
+ ldb #3 ; set 'follows literal' flag
+
+aptoken bsr apgetbit ; read 'literal or match' bit
+ bcc apcplit ; if 0: literal
+
+ bsr apgetbit ; read '8+n bits or other type' bit
+ bcs apother ; if 11x: other type of match
+
+ bsr apgamma2 ; 10: read gamma2-coded high offset bits
+ clra ; D = 0:B, i.e. the 'follows literal' flag value
+ subr d,w ; high offset bits == 2 when follows_literal == 3 ?
+ bcc apnorep ; if not, not a rep-match
+
+ bsr apgamma2 ; read repmatch length
+ bra apgotlen ; go copy large match
+
+apnorep tfr f,a ; transfer high offset bits to A
+ ldb ,-u ; read low offset byte in B
+ tfr d,x ; save match offset
+
+ bsr apgamma2 ; read match length
+
+ ifndef APLIB_LONG_OFFSET_DISABLE
+ cmpx #$7D00 ; offset >= 32000 ?
+ bge apincby2 ; if so, increase match len by 2
+ endc
+ cmpx #$0500 ; offset >= 1280 ?
+ bge apincby1 ; if so, increase match len by 1
+ cmpx #$80 ; offset >= 128 ?
+ bge apgotlen ; if so, length is final; otherwise increase it by 2
+apincby2 incw
+apincby1 incw
+
+apgotlen tfr y,d ; transfer dst to D
+ addr x,d ; put backreference start address in D (dst + offset)
+ decd ; tfm reads before decrementing, so start one byte lower
+ leay -1,y ; likewise point Y at the next byte to write
+ tfm d-,y- ; copy matched bytes (W holds the length)
+ leay 1,y ; back to the pre-decrement convention used by stb ,-y
+
+ ldb #2 ; clear 'follows literal' flag
+ bra aptoken
+
+apgamma2 ldw #1 ; init to 1 so it gets shifted to 2 below
+loop@ bsr apgetbit ; read data bit
+ rolw ; shift into W
+ bsr apgetbit ; read continuation bit
+ bcs loop@ ; loop until a zero continuation bit is read
+ rts ; result in W; apgetbit may have overwritten A
+
+apdibits bsr apgetbit ; read bit
+ rolb ; push into B, then fall through to read one more bit
+apgetbit lsl apbitbuf ; shift bit queue, and high bit into carry
+ bne aprts ; queue not empty, bits remain
+ lda ,-u ; read 8 new bits (A is overwritten, not saved)
+ rola ; shift bit queue, and high bit into carry
+ sta apbitbuf ; save bit queue
+aprts rts ; shared return, also used by apother at end of data
+
+apshort clrb ; B accumulates the 4-bit offset
+ bsr apdibits ; read 2 offset bits
+ rolb ; shift the second of them into B
+ bsr apdibits ; read 4 offset bits
+ rolb ; shift the fourth into B; Z is set if the offset is 0
+ beq apwtlit ; if zero, go write it
+
+ decb ; we load below without predecrement, adjust here
+ ldb b,y ; load backreferenced byte from Y + (offset - 1)
+ bra apwtlit ; go write it
+
+apother bsr apgetbit ; read '7+1 match or short literal' bit
+ bcs apshort ; if 111: 4 bit offset for 1-byte copy
+
+ ldb ,-u ; read low bits of offset + length bit in B
+ beq aprts ; check for EOD and exit if so
+ clra ; clear high bits in A
+ lsrb ; shift offset in place, shift length bit into carry
+ tfr d,x ; save match offset
+ ldb #1 ; len in B will be 2*1+carry:
+ rolb ; shift length, and carry into B
+ tfr d,w ; W = match length (A is still 0)
+ bra apgotlen ; go copy match
diff --git a/tools/apultra/asm/6809/unaplib_b.s b/tools/apultra/asm/6809/unaplib_b.s
new file mode 100644
index 0000000..02f943c
--- /dev/null
+++ b/tools/apultra/asm/6809/unaplib_b.s
@@ -0,0 +1,122 @@
+; unaplib_b.s - aPLib backward decompressor for 6809 - 154 bytes
+;
+; in: x = last byte of compressed data
+; y = last byte of decompression buffer
+; out: y = first byte of decompressed data
+;
+; Copyright (C) 2020 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+apl_decompress
+ lda #$80 ; initialize empty bit queue
+ sta <apbitbuf,pcr ; plus bit to roll into carry
+ leau 1,x ; U = one past the last compressed byte (input is read with pre-decrement)
+ leay 1,y ; Y = one past the end of the output buffer (output is written with pre-decrement)
+
+apcplit ldb ,-u ; copy literal byte (entered by fall-through for the very first byte)
+apwtlit stb ,-y ; write the byte in B to the output
+
+ lda #$03 ; set 'follows literal' flag
+
+aptoken bsr apgetbit ; read 'literal or match' bit
+ bcc apcplit ; if 0: literal
+
+ bsr apgetbit ; read '8+n bits or other type' bit
+ bcs apother ; if 11x: other type of match
+
+ sta <aplwm+2,pcr ; patch 'follows literal' flag (A = 2 or 3) into the low operand byte of the subd below
+
+ bsr apgamma2 ; 10: read gamma2-coded high offset bits
+aplwm subd #$0000 ; D -= flag (operand patched above); borrows only for gamma == 2 with flag == 3
+ bcc apnorep ; no borrow: B holds the high offset bits, not a rep-match
+
+ bsr apgamma2 ; read repmatch length (the offset already stored at aprepof is reused)
+ bra apgotlen ; go copy large match
+
+apnorep tfr b,a ; transfer high offset bits to A
+ ldb ,-u ; read low offset byte in B
+ std <aprepof+2,pcr ; store match offset into the operand of the leau at aprepof
+ tfr d,x ; transfer offset to X
+
+ bsr apgamma2 ; read match length
+
+ cmpx #$7D00 ; offset >= 32000 ?
+ bge apincby2 ; if so, increase match len by 2 (signed test: offsets >= $8000 fail all three tests and still reach apincby2)
+ cmpx #$0500 ; offset >= 1280 ?
+ bge apincby1 ; if so, increase match len by 1
+ cmpx #$80 ; offset >= 128 ?
+ bge apgotlen ; if so, length is final; otherwise (offset < 128) increase match len by 2
+apincby2 addd #1 ; first of two increments, falls through
+apincby1 addd #1 ; length += 1
+apgotlen pshs u ; save source compressed data pointer
+ tfr d,x ; copy match length to X
+
+aprepof leau $aaaa,y ; U = dst+offset; the $aaaa operand is overwritten with the match offset at run time
+
+apcpymt lda ,-u ; copy matched byte
+ sta ,-y ; store it to the output
+ leax -1,x ; decrement X (remaining length)
+ bne apcpymt ; loop until all matched bytes are copied
+
+ puls u ; restore source compressed data pointer
+
+ lda #$02 ; clear 'follows literal' flag
+ bra aptoken ; go decode the next token
+
+apdibits bsr apgetbit ; read first bit into carry
+ rolb ; push into B, then fall through to read a second bit
+apgetbit lsl <apbitbuf,pcr ; shift bit queue, and high bit into carry
+ bne apdone ; queue not empty, carry holds the next data bit
+ pshs a ; preserve A while it is used to refill the queue
+ lda ,-u ; read 8 new bits
+ rola ; carry from the lsl above enters bit 0, top data bit moves into carry
+ sta <apbitbuf,pcr ; save bit queue
+ puls a,pc ; pop reg A and return
+
+apbitbuf fcb $00 ; bit queue, set to $80 on entry to apl_decompress
+
+apshort clrb ; B accumulates the 4-bit offset
+ bsr apdibits ; read 2 offset bits (the second one comes back in carry)
+ rolb ; shift the second bit into B
+ bsr apdibits ; read 4 offset bits
+ rolb ; B = 4-bit offset, Z set when it is zero
+ beq apwtlit ; offset 0: B is 0, go write a zero
+
+ decb ; we load below without predecrement, adjust here
+ ldb b,y ; load backreferenced byte from dst+offset-1
+ bra apwtlit ; go write backreferenced byte
+
+apgamma2 ldd #$1 ; read a gamma2-coded value into D: start at 1 so the first shift yields 2 or 3
+apg2loop bsr apgetbit ; read data bit into carry
+ rolb ; shift into D (low byte)
+ rola ; carry out of B goes into the high byte
+ bsr apgetbit ; read continuation bit
+ bcs apg2loop ; loop until a zero continuation bit is read
+apdone rts ; shared return point (also used by apgetbit and for EOD in apother)
+
+apother bsr apgetbit ; read '7+1 match or short literal' bit
+ bcs apshort ; if 111: 4 bit offset for 1-byte copy
+
+ ldb ,-u ; 110: read 7 offset bits + length bit in B
+ beq apdone ; a zero byte is the EOD marker: return through the rts at apdone
+ clra ; clear high bits in A
+ lsrb ; shift offset in place, shift length bit into carry
+ std <aprepof+2,pcr ; store match offset (carry is not changed by std or the ldb below)
+ ldb #$01 ; len in B will be 2*1+carry:
+ rolb ; B = 2 or 3 (A is still 0, so D = length)
+ bra apgotlen ; go copy match
diff --git a/tools/apultra/asm/8088/aplib_8088_fast.S b/tools/apultra/asm/8088/aplib_8088_fast.S
new file mode 100644
index 0000000..c535234
--- /dev/null
+++ b/tools/apultra/asm/8088/aplib_8088_fast.S
@@ -0,0 +1,178 @@
+; aplib_8088_fast.S - speed-optimized aPLib decompressor for 8088 - 188 bytes
+;
+; Copyright (C) 2019 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+ segment .text
+ bits 16
+
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * ds:si: compressed aPLib data
+; * es:di: output buffer
+; output:
+; * ax: decompressed size
+; ---------------------------------------------------------------------------
+
+%macro apl_get_bit 0 ; read next bit of the bit queue (al) into carry, refilling from ds:si when empty
+ add al,al ; shift bit queue, and high bit into carry
+ jnz %%gotbit ; queue not empty, carry holds the next data bit
+ lodsb ; read 8 new bits
+ adc al,al ; carry from the add above enters bit 0, top data bit moves into carry
+%%gotbit:
+%endmacro
+
+apl_decompress:
+ push di ; remember start of output; popped in .done to compute the size
+ cld ; make string operations go forward
+
+ ; === register map ===
+ ; al: bit queue
+ ; ah: unused, but value is trashed
+ ; bx: follows_literal
+ ; cx: scratch register for reading gamma2 codes and storing copy length
+ ; dx: match offset (and rep-offset)
+ ; si: input (compressed data) pointer
+ ; di: output (decompressed data) pointer
+ ; bp: holds si while a match is copied, trashed
+
+ mov al,080H ; clear bit queue(al) and set high bit to move into carry
+ xor dx,dx ; invalidate rep offset
+
+.literal:
+ movsb ; read and write literal byte (also the very first byte, by fall-through)
+.next_command_after_literal:
+ mov bx,03H ; set follows_literal(bx) to 3
+
+.next_command:
+ apl_get_bit ; read 'literal or match' bit
+ jnc .literal ; if 0: literal
+
+ ; 1x: match
+
+ apl_get_bit ; read '8+n bits or other type' bit
+ jc .other ; 11x: other type of match
+
+ ; 10: 8+n bits match
+ call .get_gamma2 ; read gamma2-coded high offset bits
+ sub cx,bx ; high offset bits == 2 when follows_literal == 3 ?
+ ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+ ; is == 2 will never result in a negative value)
+ jae .not_repmatch ; no borrow: cx holds the high offset bits, not a rep-match
+
+ call .get_gamma2 ; rep-match: read match length, dx keeps the previous offset
+ jmp short .got_len ; go copy
+
+.not_repmatch:
+ mov dh,cl ; transfer high offset bits to dh
+ mov dl,[si] ; read low offset byte in dl
+ inc si ; advance input (al holds the bit queue, so lodsb is not used here)
+
+ call .get_gamma2 ; read match length
+ cmp dh,07DH ; offset >= 32000 ?
+ jae .increase_len_by2 ; if so, increase match len by 2
+ cmp dh,05H ; offset >= 1280 ?
+ jae .increase_len_by1 ; if so, increase match len by 1
+ cmp dx,0080H ; offset >= 128 ?
+ jae .got_len ; if so, length is final; otherwise (offset < 128) increase match len by 2, as it would be a 7+1 copy
+.increase_len_by2:
+ inc cx ; increase length (falls through for the second increment)
+.increase_len_by1:
+ inc cx ; increase length
+
+ ; copy cx bytes from match offset dx (dx is left intact and serves as the rep-offset)
+
+.got_len:
+ push ds ; save ds:si (current pointer to compressed data)
+ mov bp,si ; si is kept in bp rather than on the stack
+
+ push es ; ds = es: the match source lies in the output buffer
+ pop ds
+ mov si,di ; point to destination in es:di - offset in dx
+ sub si,dx
+ rep movsb ; copy matched bytes
+
+ mov si,bp ; restore ds:si
+ pop ds
+
+ mov bl,02H ; set follows_literal to 2 (bx is unmodified by match commands)
+ jmp short .next_command
+
+ ; read gamma2-coded value into cx (result is always >= 2)
+
+.get_gamma2:
+ xor cx,cx ; initialize to 1 so that value will start at 2
+ inc cx ; when shifted left in the adc below
+
+.gamma2_loop:
+ apl_get_bit ; read data bit
+ adc cx,cx ; cx = cx*2 + data bit
+ apl_get_bit ; read continuation bit
+ jc .gamma2_loop ; loop until a zero continuation bit is read
+
+ ret
+
+ ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+
+.other:
+ xor cx,cx ; cx = 0: length seed for 110, offset accumulator for 111
+ apl_get_bit ; read '7+1 match or short literal' bit
+ jc .short_literal ; 111: 4 bit offset for 1-byte copy
+
+ ; 110: 7 bits offset + 1 bit length
+
+ mov dl,[si] ; read offset + length in dl
+ inc si ; advance input (al holds the bit queue, so lodsb is not used here)
+
+ inc cx ; prepare cx for length below
+ shr dl,1 ; shift len bit into carry, and offset in place
+ je .done ; if zero offset: EOD
+ adc cx,cx ; len in cx: 1*2 + carry bit = 2 or 3
+
+ xor dh,dh ; clear high bits of offset
+ jmp short .got_len ; go copy; dx also becomes the new rep-offset
+
+ ; 4 bits offset / 1 byte copy (cx was zeroed at .other)
+
+.short_literal:
+ apl_get_bit ; read 4 offset bits
+ adc cl,cl
+ apl_get_bit
+ adc cl,cl
+ apl_get_bit
+ adc cl,cl
+ apl_get_bit
+ adc cl,cl ; cl = 4-bit offset, ZF set when it is zero
+ xchg ax,cx ; preserve bit queue in cx, put offset in ax (xchg leaves ZF untouched)
+ jz .write_zero ; if offset is 0, write a zero byte
+
+ ; short offset 1-15
+ mov bx,di ; point to destination in es:di - offset in ax
+ sub bx,ax ; we trash bx, it will be reset to 3 when we loop
+ mov al,[es:bx] ; read byte from short offset
+.write_zero:
+ stosb ; copy matched byte (or the zero in al)
+ mov ax,cx ; restore bit queue in al
+ jmp .next_command_after_literal
+
+.done:
+ pop ax ; retrieve the original decompression offset
+ xchg di,ax ; ax = end of output, di = start of output
+ sub ax,di ; ax = decompressed size
+ ret
diff --git a/tools/apultra/asm/8088/aplib_8088_small.S b/tools/apultra/asm/8088/aplib_8088_small.S
new file mode 100644
index 0000000..542991e
--- /dev/null
+++ b/tools/apultra/asm/8088/aplib_8088_small.S
@@ -0,0 +1,177 @@
+; aplib_8088_small.S - size-optimized aPLib decompressor for 8088 - 145 bytes
+;
+; Copyright (C) 2019 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+ segment .text
+ bits 16
+
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * ds:si: compressed aPLib data
+; * es:di: output buffer
+; output:
+; * ax: decompressed size
+; ---------------------------------------------------------------------------
+
+apl_decompress:
+ push di ; remember start of output; popped in .done to compute the size
+ cld ; make string operations go forward
+
+ ; === register map ===
+ ; al: bit queue
+ ; ah: unused, but value is trashed
+ ; bx: follows_literal
+ ; cx: scratch register for reading gamma2 codes and storing copy length
+ ; dx: match offset (and rep-offset)
+ ; si: input (compressed data) pointer
+ ; di: output (decompressed data) pointer
+ ; bp: offset of .get_bit
+
+ mov al,080H ; clear bit queue(al) and set high bit to move into carry
+ xor dx,dx ; invalidate rep offset
+ mov bp,.get_bit ; load offset of .get_bit, to be used with call bp
+
+.literal:
+ movsb ; read and write literal byte (also the very first byte, by fall-through)
+.next_command_after_literal:
+ mov bx,03H ; set follows_literal(bx) to 3
+
+.next_command:
+ call bp ; read 'literal or match' bit
+ jnc .literal ; if 0: literal
+
+ ; 1x: match
+
+ call bp ; read '8+n bits or other type' bit
+ jc .other ; 11x: other type of match
+
+ ; 10: 8+n bits match
+ call .get_gamma2 ; read gamma2-coded high offset bits
+ sub cx,bx ; high offset bits == 2 when follows_literal == 3 ?
+ ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+ ; is == 2 will never result in a negative value)
+ jae .not_repmatch ; no borrow: cx holds the high offset bits, not a rep-match
+
+ call .get_gamma2 ; rep-match: read match length, dx keeps the previous offset
+ jmp short .got_len ; go copy
+
+.not_repmatch:
+ mov dh,cl ; transfer high offset bits to dh
+ mov dl,[si] ; read low offset byte in dl
+ inc si ; advance input (al holds the bit queue, so lodsb is not used here)
+
+ call .get_gamma2 ; read match length
+ cmp dh,07DH ; offset >= 32000 ?
+ jae .increase_len_by2 ; if so, increase match len by 2
+ cmp dh,05H ; offset >= 1280 ?
+ jae .increase_len_by1 ; if so, increase match len by 1
+ cmp dx,0080H ; offset >= 128 ?
+ jae .got_len ; if so, length is final; otherwise (offset < 128) increase match len by 2, as it would be a 7+1 copy
+.increase_len_by2:
+ inc cx ; increase length (falls through for the second increment)
+.increase_len_by1:
+ inc cx ; increase length
+
+ ; copy cx bytes from match offset dx (dx is left intact and serves as the rep-offset)
+
+.got_len:
+ push ds ; save ds:si (current pointer to compressed data)
+ push si
+
+ push es ; ds = es: the match source lies in the output buffer
+ pop ds
+ mov si,di ; point to destination in es:di - offset in dx
+ sub si,dx
+ rep movsb ; copy matched bytes
+
+ pop si ; restore ds:si
+ pop ds
+
+ mov bl,02H ; set follows_literal to 2 (bx is unmodified by match commands)
+ jmp short .next_command
+
+ ; read gamma2-coded value into cx (result is always >= 2)
+
+.get_gamma2:
+ xor cx,cx ; initialize to 1 so that value will start at 2
+ inc cx ; when shifted left in the adc inside .get_dibits
+
+.gamma2_loop:
+ call .get_dibits ; read data bit, shift into cx, read continuation bit
+ jc .gamma2_loop ; loop until a zero continuation bit is read
+
+ ret
+
+ ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+
+.other:
+ xor cx,cx ; cx = 0: length seed for 110, offset accumulator for 111
+ call bp ; read '7+1 match or short literal' bit
+ jc .short_literal ; 111: 4 bit offset for 1-byte copy
+
+ ; 110: 7 bits offset + 1 bit length
+
+ mov dl,[si] ; read offset + length in dl
+ inc si ; advance input (al holds the bit queue, so lodsb is not used here)
+
+ inc cx ; prepare cx for length below
+ shr dl,1 ; shift len bit into carry, and offset in place
+ je .done ; if zero offset: EOD
+ adc cx,cx ; len in cx: 1*2 + carry bit = 2 or 3
+
+ xor dh,dh ; clear high bits of offset
+ jmp short .got_len ; go copy; dx also becomes the new rep-offset
+
+ ; 4 bits offset / 1 byte copy (cx was zeroed at .other)
+
+.short_literal:
+ call .get_dibits ; read 2 offset bits (first is shifted in there, second returns in carry)
+ adc cx,cx ; shift the second bit in
+ call .get_dibits ; read 2 offset bits
+ adc cx,cx ; cx = 4-bit offset, ZF set when it is zero
+ xchg ax,cx ; preserve bit queue in cx, put offset in ax (xchg leaves ZF untouched)
+ jz .write_zero ; if offset is 0, write a zero byte
+
+ ; short offset 1-15
+ mov bx,di ; point to destination in es:di - offset in ax
+ sub bx,ax ; we trash bx, it will be reset to 3 when we loop
+ mov al,[es:bx] ; read byte from short offset
+.write_zero:
+ stosb ; copy matched byte (or the zero in al)
+ xchg ax,cx ; restore bit queue in al
+ jmp .next_command_after_literal
+
+.done:
+ pop ax ; retrieve the original decompression offset
+ xchg di,ax ; ax = end of output, di = start of output
+ sub ax,di ; ax = decompressed size
+ ret
+
+.get_dibits:
+ call bp ; read data bit
+ adc cx,cx ; shift into cx, then fall through to read a second bit into carry
+
+.get_bit:
+ add al,al ; shift bit queue, and high bit into carry
+ jnz .got_bit ; queue not empty, carry holds the next data bit
+ lodsb ; read 8 new bits
+ adc al,al ; carry from the add above enters bit 0, top data bit moves into carry
+.got_bit:
+ ret
diff --git a/tools/apultra/asm/ARM7TDMI/aplib_arm.s b/tools/apultra/asm/ARM7TDMI/aplib_arm.s
new file mode 100644
index 0000000..b6d0cef
--- /dev/null
+++ b/tools/apultra/asm/ARM7TDMI/aplib_arm.s
@@ -0,0 +1,150 @@
+@APlib ARM7 decompressor by Dan Weiss, based on the original C version
+@Takes in raw apacked data, NOT data created by the 'safe' compressor.
+@Code is from the PocketNES NES Emulator for GBA
+
+@Code is formatted for GNU Assembler
+
+ src .req r0
+ dest .req r1
+ byte .req r2
+ mask .req r3
+ gamma .req r4
+ lwm .req r6
+ recentoff .req r7
+ temp .req r8
+
+.global depack
+.type depack STT_FUNC
+
+@r0 = src
+@r1 = dest
+@r2 = byte
+@r3 = rotating bit mask
+@r4 = increasing gamma
+@r6 = lwm
+@r7 = recentoff
+@r8 = lr copy/scratch
+
+ .macro GETBIT @3 instructions; result in Z flag (NE = bit is 1, EQ = bit is 0)
+ movs mask,mask,ror #1 @ rotate the bit mask; carry is set when the mask wraps around
+ ldrcsb byte,[src],#1 @ on wrap-around, fetch the next byte of bit data
+ tst byte,mask @ test the current bit of the byte
+ .endm
+
+ .macro GETBITGAMMA @5 instructions; gamma = gamma*2 + next bit
+ mov gamma,gamma,lsl #1
+ GETBIT
+ addne gamma,gamma,#1
+ .endm
+
+@This initialization code can go into slow memory
+
+depack:
+ stmfd sp!,{r4-r10,lr} @ save working registers and the return address
+ ldrb temp,[src],#1 @ the first byte of the stream is copied as a literal
+ strb temp,[dest],#1
+ ldr mask,=0x01010101 @ start state of the bit mask: the first GETBIT wraps it and fetches a byte
+ b aploop_nolwm @ explicit branch, as the loop may be placed in a different memory region
+
+@This inner-loop code should be placed into fast memory
+
+ @depack enters here
+aploop_nolwm:
+ mov lwm,#0 @ previous token was not a match
+aploop:
+ GETBIT
+ bne apbranch1 @ 1: some kind of match
+ ldrb temp,[src],#1 @ 0: copy one literal byte
+ strb temp,[dest],#1
+ b aploop_nolwm
+apbranch1:
+ GETBIT
+ beq apbranch2 @ 10: gamma-coded offset and length
+ GETBIT
+ beq apbranch3 @ 110: 7 bit offset, length 2 or 3
+ @111: get a 4 bit offset
+ mov gamma,#0
+ GETBIT
+ addne gamma,gamma,#1
+ GETBITGAMMA
+ GETBITGAMMA
+ GETBITGAMMA
+ cmp gamma,#0 @ offset 0 means "write a zero byte" (gamma is 0)
+ ldrneb gamma,[dest,-gamma] @ otherwise fetch the byte 1-15 positions back
+ strb gamma,[dest],#1
+ b aploop_nolwm
+apbranch3:
+ @use 7 bit offset, length = 2 or 3
+ @if a zero is encountered here, it's EOF
+ ldrb gamma,[src],#1
+ movs recentoff,gamma,lsr #1 @ recentoff = offset, carry = length bit
+ beq done @ zero offset: end of data
+ ldrcsb temp,[dest,-recentoff] @ length bit set: copy a third byte first
+ strcsb temp,[dest],#1
+ ldrb temp,[dest,-recentoff] @ the two bytes that are always copied
+ strb temp,[dest],#1
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ mov lwm,#1 @ previous token was a match
+ b aploop
+apbranch2:
+ @use a gamma code * 256 for offset, another gamma code for length
+
+ bl ap_getgamma @ lr is free to clobber, the return address was saved at entry
+ sub gamma,gamma,#2
+ cmp lwm,#0
+ bne ap_is_lwm @ after a match: high offset bits = gamma code - 2
+ mov lwm,#1
+ cmp gamma,#0
+ bne ap_not_zero_gamma @ after a literal: high offset bits = gamma code - 3
+
+ @if gamma code is 2, use old recent offset, and a new gamma code for length
+ bl ap_getgamma
+copyloop1:
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ subs gamma,gamma,#1 @ gamma = bytes left to copy
+ bne copyloop1
+ b aploop
+
+ap_not_zero_gamma:
+ sub gamma,gamma,#1 @ total adjustment of 3 when the previous token was a literal
+ap_is_lwm:
+ ldrb temp,[src],#1 @ low offset byte
+ add recentoff,temp,gamma,lsl #8 @ recentoff = gamma*256 + low byte
+ bl ap_getgamma
+ @gamma=length
+ cmp recentoff,#32000
+ addge gamma,gamma,#1 @ offset >= 32000: +2 in total with the next test
+ cmp recentoff,#1280
+ addge gamma,gamma,#1 @ offset >= 1280: +1
+ cmp recentoff,#128
+ addlt gamma,gamma,#2 @ offset < 128: +2
+copyloop2:
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ subs gamma,gamma,#1 @ gamma = bytes left to copy
+ bne copyloop2
+ b aploop
+
+ap_getgamma:
+ mov gamma,#1 @ start at 1 so the first data bit yields 2 or 3
+ap_getgammaloop:
+ GETBITGAMMA @ shift in one data bit
+ GETBIT @ continuation bit
+ bne ap_getgammaloop @ 1: another data bit follows
+ bx lr @ gamma = decoded value
+
+done:
+ ldmfd sp!,{r4-r10,lr} @ restore the registers saved at entry
+ bx lr @ return to the caller of depack
+
+.unreq src
+.unreq dest
+.unreq byte
+.unreq mask
+.unreq gamma
+.unreq lwm
+.unreq recentoff
+.unreq temp
+
diff --git a/tools/apultra/asm/Z80/unaplib_fast.asm b/tools/apultra/asm/Z80/unaplib_fast.asm
new file mode 100644
index 0000000..c21eb5d
--- /dev/null
+++ b/tools/apultra/asm/Z80/unaplib_fast.asm
@@ -0,0 +1,339 @@
+;
+; Speed-optimized ApLib decompressor by spke & uniabis (ver.06 01-05/06/2020, 235 bytes)
+;
+; The original Z80 decompressors for ApLib were written by Dan Weiss (Dwedit),
+; then tweaked by Francisco Javier Pena Pareja (utopian),
+; and optimized by Jaime Tejedor Gomez (Metalbrain) and Antonio Villena.
+;
+; This is a new "implicit state" decompressor heavily optimized for speed by spke.
+; (It is 12 bytes shorter and 18% faster than the previously fastest
+; 247b decompressor by Metalbrain and Antonio Villena.)
+;
+; ver.00 by spke (21/08/2018-01/09/2018, 244 bytes, an edit of the existing 247b decompressor);
+; ver.01 by spke (12-13/11/2018, 234(-10) bytes, +3% speed using the state machine for LWM);
+; ver.02 by spke (06/08/2019, +1% speed);
+; ver.03 by spke (27/08/2019, 236(+2) bytes, +1% speed using partly expanded LDIR);
+; ver.04 by spke (spring 2020, added full revision history and support for long offsets)
+; ver.05 by spke (17-31/05/2020, 230(-6) bytes, +3% speed, added support for backward compression) <- BROKEN, DO NOT USE
+; ver.06 by uniabis & spke (01-07/06/2020, 235(+5) bytes, +1% speed, added support for HD64180)
+;
+; The data must be compressed using any compressor for ApLib capable of generating raw data.
+; At present, two best available compressors are:
+;
+; "APC" by Sven-Ake Dahl: https://github.com/svendahl/cap or
+; "apultra" by Emmanuel Marty: https://github.com/emmanuel-marty/apultra
+;
+; The compression can be done as follows:
+;
+; apc.exe e <sourcefile> <outfile>
+; or
+; apultra.exe <sourcefile> <outfile>
+;
+; A decent compressor was written by r57shell (although it is worse than compressors above):
+; http://gendev.spritesmind.net/forum/viewtopic.php?p=32548#p32548
+; The use of the official ApLib compressor by Joergen Ibsen is not recommended.
+;
+; The decompression is done in the standard way:
+;
+; ld hl,FirstByteOfCompressedData
+; ld de,FirstByteOfMemoryForDecompressedData
+; call DecompressApLib
+;
+; Backward decompression is also supported; you can compress files backward using:
+;
+; apultra.exe -b <sourcefile> <outfile>
+;
+; uncomment option "DEFINE BackwardDecompression" and decompress the resulting files using:
+;
+; ld hl,LastByteOfCompressedData
+; ld de,LastByteOfMemoryForDecompressedData
+; call DecompressApLib
+;
+; The decompressor modifies AF, AF', BC, DE, HL, IX.
+;
+; Of course, ApLib compression algorithms are (c) 1998-2014 Joergen Ibsen,
+; see http://www.ibsensoftware.com/ for more information
+;
+; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; DEFINE SupportLongOffsets ; +4 bytes for long offset support. slows decompression down by 1%, but may be needed to decompress files >=32K
+; DEFINE BackwardDecompression ; decompress data compressed backwards, -10 bytes, speeds decompression up by 3%
+; DEFINE HD64180 ; -2 bytes for HD64180/Z180 support, slows decompression down by 1%
+
+ IFNDEF BackwardDecompression
+
+ MACRO NEXT_HL
+ inc hl
+ ENDM
+
+ MACRO COPY_1
+ ldi
+ ENDM
+
+ MACRO COPY_BC
+ ldir
+ ENDM
+
+ ELSE
+
+ MACRO NEXT_HL
+ dec hl
+ ENDM
+
+ MACRO COPY_1
+ ldd
+ ENDM
+
+ MACRO COPY_BC
+ lddr
+ ENDM
+
+ ENDIF
+
+ MACRO RELOAD_A
+ ld a,(hl) : NEXT_HL : rla
+ ENDM
+
+@Decompress: COPY_1 : scf ; first byte is a literal; set carry so the RELOAD_A just below shifts a 1 into bit 0 of the fresh bit buffer
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+LWM0: ;LWM = 0 (LWM stands for "Last Was Match"; a flag that we did not have a match)
+
+.ReloadByteC0 RELOAD_A : jr c,.Check2ndBit
+
+;
+; case "0"+BYTE: copy a single literal
+
+.CASE0: COPY_1 ; first byte is always copied as literal
+
+;
+; main decompressor loop
+
+.MainLoop: add a : jr nc,.CASE0 : jr z,.ReloadByteC0 ; "0"+BYTE = copy literal
+.Check2ndBit add a : jr nc,.CASE10 : jr z,.ReloadByteC1 ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+.Check3rdBit add a : call z,ReloadByte : jp c,LWM1.CASE111 ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+; branch "110"+[oooooool]: copy two or three bytes (bit "l") with the offset -1..-127 (bits "ooooooo"), or stop
+
+.CASE110: ; "use 7 bit offset, length = 2 or 3"
+ ; "if a zero is found here, it's EOF"
+ ld c,(hl) : rr c : ret z ; C = offset, carry = length bit (both jumps here are taken with NC, so bit 7 becomes 0); zero offset = EOF
+ NEXT_HL
+ ld b,0 ; BC = offset
+
+ IFNDEF HD64180
+ ld ixl,c : ld ixh,b ; save offset for future LWMs
+ ELSE
+ push bc : pop ix ; same, without using the IX half registers
+ ENDIF
+
+ push hl ; save src
+ ld h,d : ld l,e ; HL = dest
+ jr c,.LengthIs3 ; carry still holds the length bit from rr c
+
+.LengthIs2
+ IFNDEF BackwardDecompression
+ sbc hl,bc ; carry is clear on this path, so this is a plain HL = dest-offset
+ ELSE
+ add hl,bc ; backward: the match source lies above dest
+ ENDIF
+ COPY_1 : COPY_1
+ jr .PreMainLoop
+
+.LengthIs3
+ IFNDEF BackwardDecompression
+ or a : sbc hl,bc ; carry is set on this path, clear it before subtracting
+ ELSE
+ add hl,bc ; backward: the match source lies above dest
+ ENDIF
+ COPY_1 : COPY_1 : COPY_1
+ jr .PreMainLoop
+
+.ReloadByteC1 RELOAD_A : jr c,.Check3rdBit
+
+;
+; branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+.CASE10: ; "use a gamma code * 256 for offset, another gamma code for length"
+ call GetGammaCoded
+
+ ; the original decompressor contains
+ ;
+ ; if ((LWM == 0) && (offs == 2)) { ... }
+ ; else {
+ ; if (LWM == 0) { offs -= 3; }
+ ; else { offs -= 2; }
+ ; }
+ ;
+ ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+ ; and to split the first condition by noticing that C-1 can never be zero
+ dec c : dec c : jr z,LWM1.KickInLWM ; gamma code was 2 with LWM == 0: re-use the previous offset
+
+.AfterLWM dec c : ld b,c : ld c,(hl) : NEXT_HL ; BC = offset
+
+ IFNDEF HD64180
+ ld ixl,c : ld ixh,b : push bc ; IX = offset for future LWMs, and keep a copy on the stack
+ ELSE
+ push bc : push bc : pop ix ; same, without using the IX half registers
+ ENDIF
+
+ call GetGammaCoded ; BC = len*
+
+ ex (sp),hl ; HL = offset, (SP) = src
+
+ ; interpretation of length value is offset-dependent:
+ ; if (offs >= 32000) len++; if (offs >= 1280) len++; if (offs < 128) len+=2;
+ ; in other words,
+ ; (1 <= offs < 128) +=2
+ ; (128 <= offs < 1280) +=0
+ ; (1280 <= offs < 32000) +=1
+ ; NB offs >= 32000 needs +=2; that check is only assembled when SupportLongOffsets is defined
+
+ ; interpretation of length value is offset-dependent
+ exa : ld a,h ; park the bit buffer in AF', A = high byte of the offset
+ IFDEF SupportLongOffsets
+ ; NB offsets over 32000 require an additional check, which is skipped in most
+ ; Z80 decompressors (seemingly as a performance optimization)
+ cp 32000/256 : jr nc,.Add2
+ ENDIF
+ cp 5 : jr nc,.Add1
+ or a : jr nz,.Add0
+ bit 7,l : jr nz,.Add0
+.Add2 inc bc
+.Add1 inc bc
+.Add0 ; for offs<128 : 4+4+7+7 + 4+7 + 8+7 + 6+6 = 60t
+ ; for offs>=1280 : 4+4+7+12 + 6 = 33t
+ ; for 128<=offs<1280 : 4+4+7+7 + 4+12 = 38t OR 4+4+7+7 + 4+7+8+12 = 53t
+
+.CopyMatch: ; this assumes that BC = len, DE = dest, HL = offset
+ ; and also that (SP) = src, while having NC
+ IFNDEF BackwardDecompression
+ ld a,e : sub l : ld l,a ; HL = DE - HL (dest - offset), low byte first
+ ld a,d : sbc h ; high byte, with the borrow from the low byte
+ ld h,a : exa ; bring the bit buffer back from AF'
+ ELSE
+ exa ; bring the bit buffer back from AF'
+.CopyMatchLDH add hl,de ; HL = dest + offset
+ ENDIF
+ COPY_1 : COPY_BC ; COPY_1 moves one byte and decrements BC, COPY_BC moves the rest
+.PreMainLoop pop hl ; recover src
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+LWM1: ; LWM = 1
+
+;
+; main decompressor loop
+
+.MainLoop: add a : jr nc,LWM0.CASE0 : jr z,.ReloadByteC0 ; "0"+BYTE = copy literal
+.Check2ndBit add a : jr nc,.CASE10 : jr z,.ReloadByteC1 ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+.Check3rdBit add a : call z,ReloadByte : jr nc,LWM0.CASE110 ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+; case "111"+"oooo": copy a byte with offset -1..-15, or write zero to dest
+
+.CASE111: ld bc,%11100000 ; B = 0; the 0 in bit 4 of C is the last bit shifted out below
+ DUP 4
+ add a : call z,ReloadByte : rl c ; read short offset (4 bits)
+ EDUP
+ ex de,hl : jr z,.WriteZero ; zero offset means "write zero" (NB: B is zero here)
+
+ ; "write a previous byte (1-15 away from dest)"
+ push hl ; BC = offset, DE = src, HL = dest
+ IFNDEF BackwardDecompression
+ sbc hl,bc ; HL = dest-offset (SBC works: the 4th rl c above shifted a 0 into carry)
+ ELSE
+ add hl,bc ; backward: the referenced byte lies above dest
+ ENDIF
+ ld c,(hl) ; C = byte to write
+ pop hl ; HL = dest again
+
+.WriteZero ld (hl),c : NEXT_HL ; C is either the fetched byte or the zero offset itself
+ ex de,hl : jp LWM0.MainLoop ; 10+4*(4+10+8)+4+7 + 11+15+7+10 + 7+4+6+10 = 179t
+
+.ReloadByteC0 RELOAD_A : jp nc,LWM0.CASE0
+ jr .Check2ndBit
+
+.ReloadByteC1 RELOAD_A : jr c,.Check3rdBit
+
+;
+; branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+.CASE10: ; "use a gamma code * 256 for offset, another gamma code for length"
+ call GetGammaCoded
+
+ ; the original decompressor contains
+ ;
+ ; if ((LWM == 0) && (offs == 2)) { ... }
+ ; else {
+ ; if (LWM == 0) { offs -= 3; }
+ ; else { offs -= 2; }
+ ; }
+ ;
+ ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+ ; and to split the first condition by noticing that C-1 can never be zero
+ dec c : jr LWM0.AfterLWM
+
+;
+; the re-use of the previous offset (LWM magic)
+
+.KickInLWM: ; "and a new gamma code for length"
+ inc c : call GetGammaCoded.ReadGamma ; BC = len (BC is 0 on entry here, inc c recreates the BC = 1 start state)
+
+ IFNDEF BackwardDecompression
+ push ix : ex (sp),hl : exa ; HL = saved offset, (SP) = src; exa pairs with the one at the end of .CopyMatch
+ jr LWM0.CopyMatch
+ ELSE
+ push ix : ex (sp),hl ; HL = saved offset, (SP) = src
+ jr LWM0.CopyMatchLDH ; enters after the exa there, so none is needed here
+ ENDIF
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+;
+; interlaced gamma code reader
+; x0 -> 1x
+; x1y0 -> 1xy
+; x1y1z0 -> 1xyz etc
+; (technically, this is a 2-based variation of Exp-Golomb-1)
+
+GetGammaCoded: ld bc,1 ; BC = 1, the first data bit makes it 2 or 3
+.ReadGamma add a : jr z,.ReloadByteRG1 ; data bit into carry; Z means the bit buffer ran out
+ rl c : rl b ; BC = BC*2 + data bit
+ add a : ret nc ; NB: flag NC immediately says we do not need to reload our byte...
+ jr nz,.ReadGamma ; ...even better, flag NZ then automatically means flag C :)
+
+.ReloadByteRG2 RELOAD_A : ret nc : jr .ReadGamma ; buffer ran out on the continuation bit: refill, then decide
+
+.ReloadByteRG1 RELOAD_A : rl c : rl b ; buffer ran out on the data bit: refill, then shift the bit in
+ add a : ret nc : jr .ReadGamma ; continuation bit straight from the refilled buffer
+
+;
+; pretty usual getbit for mixed datastreams: refill A from (HL), first bit of the new byte goes to carry
+
+ReloadByte: RELOAD_A : ret
+
diff --git a/tools/apultra/asm/Z80/unaplib_small.asm b/tools/apultra/asm/Z80/unaplib_small.asm
new file mode 100644
index 0000000..280de15
--- /dev/null
+++ b/tools/apultra/asm/Z80/unaplib_small.asm
@@ -0,0 +1,258 @@
+;
+; Size-optimized ApLib decompressor by spke & uniabis (ver.04 01-07/06/2020, 139 bytes)
+;
+; The original Z80 decompressor for ApLib was written by Dan Weiss (Dwedit),
+; then tweaked by Francisco Javier Pena Pareja (utopian),
+; and optimized by Jaime Tejedor Gomez (Metalbrain).
+;
+; This version was heavily re-optimized for size by spke.
+; (It is 17 bytes shorter and 22% faster than the 156b version by Metalbrain.)
+;
+; ver.00 by spke (21/08/2018-01/09/2018, 141 bytes);
+; ver.01 by spke (spring 2019, 140(-1) bytes, slightly faster);
+; ver.02 by spke (05-07/01/2020, added full revision history, support for long offsets
+; and an option to use self-modifying code instead of IY)
+; ver.03 by spke (18-29/05/2020, +0.5% speed, added support for backward compression)
+; ver.04 by uniabis (01-07/06/2020, 139(-1) bytes, +1% speed, added support for HD64180)
+;
+; The data must be compressed using any compressor for ApLib capable of generating raw data.
+; At present, the two best available compressors are:
+;
+; "APC" by Sven-Ake Dahl: https://github.com/svendahl/cap or
+; "apultra" by Emmanuel Marty: https://github.com/emmanuel-marty/apultra
+;
+; The compression can be done as follows:
+;
+; apc.exe e <sourcefile> <outfile>
+; or
+; apultra.exe <sourcefile> <outfile>
+;
+; A decent compressor was written by r57shell (although it is worse than compressors above):
+; http://gendev.spritesmind.net/forum/viewtopic.php?p=32548#p32548
+; The use of the official ApLib compressor by Joergen Ibsen is not recommended.
+;
+; The decompression is done in the standard way:
+;
+; ld hl,FirstByteOfCompressedData
+; ld de,FirstByteOfMemoryForDecompressedData
+; call DecompressApLib
+;
+; Backward decompression is also supported; you can compress files backward using:
+;
+; apultra.exe -b <sourcefile> <outfile>
+;
+; uncomment option "DEFINE BackwardDecompression" and decompress the resulting files using:
+;
+; ld hl,LastByteOfCompressedData
+; ld de,LastByteOfMemoryForDecompressedData
+; call DecompressApLib
+;
+; The decompressor modifies AF, AF', BC, DE, HL, IX.
+;
+; Of course, ApLib compression algorithms are (c) 1998-2014 Joergen Ibsen,
+; see http://www.ibsensoftware.com/ for more information
+;
+; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; DEFINE FasterGetBit ; 16% speed-up at the cost of extra 4 bytes
+; DEFINE SupportLongOffsets ; +4 bytes for long offset support. slows decompression down by 1%, but may be needed to decompress files >=32K
+; DEFINE BackwardDecompression ; decompress data compressed backwards, -5 bytes, speeds decompression up by 3%
+
+
+ ; GET_BIT: shift the next bit of the compressed stream out of the bit buffer in A
+ ; into the carry flag. The expansion depends on the FasterGetBit option.
+ MACRO GET_BIT
+ IFDEF FasterGetBit
+ add a : call z,ReloadByte
+ ELSE
+ call GetOneBit
+ ENDIF
+ ENDM
+
+ ; Direction-dependent primitives. Forward decompression walks memory upwards
+ ; (inc / ldi / ldir); with BackwardDecompression defined it walks downwards
+ ; (dec / ldd / lddr).
+
+ ; NEXT_HL: advance HL by one byte in the decompression direction
+ MACRO NEXT_HL
+ IFNDEF BackwardDecompression
+ inc hl
+ ELSE
+ dec hl
+ ENDIF
+ ENDM
+
+ ; COPY_1: copy a single byte from (HL) to (DE)
+ MACRO COPY_1
+ IFNDEF BackwardDecompression
+ ldi
+ ELSE
+ ldd
+ ENDIF
+ ENDM
+
+ ; COPY_BC: copy BC bytes from (HL) to (DE)
+ MACRO COPY_BC
+ IFNDEF BackwardDecompression
+ ldir
+ ELSE
+ lddr
+ ENDIF
+ ENDM
+
+@DecompressApLib: ld a,128 ; A = bit buffer holding only the end marker, so the first GET_BIT reloads from (HL)
+
+;
+; case "0"+BYTE: copy a single literal
+
+CASE0: COPY_1 ; first byte is always copied as literal
+ResetLWM: ld b,-1 ; LWM = 0 (LWM stands for "Last Was Match"; a flag that we did not have a match); CASE10 adds this B to the gamma code
+
+;
+; main decompressor loop (HL = src, DE = dest, A = bit buffer, B = LWM state)
+
+MainLoop: GET_BIT : jr nc,CASE0 ; "0"+BYTE = copy literal
+ GET_BIT : jr nc,CASE10 ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+
+ ld bc,%11100000 ; B = 0; C = three marker bits that make CASE111 stop after exactly 4 bits
+ GET_BIT : jr nc,CASE110 ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+; case "111"+"oooo": copy a byte with offset -1..-15, or write zero to dest
+
+CASE111:
+ReadFourBits GET_BIT ; read short offset (4 bits)
+ rl c : jr c,ReadFourBits ; C entered as %11100000: three 1s shift out, the 0 ends the loop after 4 bits
+ ex de,hl : jr z,WriteZero ; zero offset means "write zero" (NB: B is zero here)
+
+ ; "write a previous byte (1-15 away from dest)"
+ push hl ; BC = offset, DE = src, HL = dest
+ IFNDEF BackwardDecompression
+ sbc hl,bc ; HL = dest-offset (SBC works because branching above ensured NC)
+ ELSE
+ add hl,bc ; HL = dest+offset (backward mode: already written bytes lie at higher addresses)
+ ENDIF
+ ld c,(hl) : pop hl ; C = the byte to repeat, HL = dest again
+
+WriteZero ld (hl),c : NEXT_HL ; when reached via JR Z, C = 0, so a zero byte is written
+ ex de,hl : jr ResetLWM ; write one byte, reset LWM
+
+;
+; branch "110"+[oooooool]: copy two or three bytes (bit "l") with the offset -1..-127 (bits "ooooooo"), or stop
+
+CASE110: ; "use 7 bit offset, length = 2 or 3"
+ ; "if a zero is found here, it's EOF"
+ ld c,(hl) : rr c : ret z ; C = 7-bit offset, carry = length bit; zero offset = EOF, return to the caller
+ NEXT_HL
+
+ push hl ; save src
+ ld h,b : ld l,c ; HL = offset (B is still 0 from "ld bc,%11100000" in MainLoop)
+
+ ; flag NC means len=2, flag C means len=3
+ ld c,1 : rl c : jr SaveLWMOffset ; BC = 2 or 3; RL C also leaves NC, which CopyMatch requires
+
+;
+; branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+CASE10: ; save state of LWM into A'
+ exa : ld a,b : exa ; B is -1 after a literal (ResetLWM) and 0 after a match (COPY_BC leaves BC = 0)
+
+ ; "use a gamma code * 256 for offset, another gamma code for length"
+ call GetGammaCoded ; BC = gamma code, always >= 2
+
+ ; the original decompressor contains
+ ;
+ ; if ((LWM == 0) && (offs == 2)) { ... }
+ ; else {
+ ; if (LWM == 0) { offs -= 3; }
+ ; else { offs -= 2; }
+ ; }
+ ;
+ ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+ ; and to split the first condition by noticing that C-1 can never be zero
+ exa : add c : ld c,a : exa ; C += saved LWM state (-1 or 0); the main AF (bit buffer) is untouched
+
+ ; "if gamma code is 2, use old r0 offset"
+ dec c : jr z,KickInLWM ; zero only when the gamma code was 2 and the last command was not a match
+ dec c ; C = high byte of the offset
+ ld b,c : ld c,(hl) : NEXT_HL ; BC = offset
+
+ push bc ; (SP) = offset
+ call GetGammaCoded ; BC = len*
+ ex (sp),hl ; HL = offset, (SP) = src
+
+ ; interpretation of length value is offset-dependent
+ exa : ld a,h ; work in the alternate AF so the bit buffer and the NC left by GetGammaCoded survive
+ IFDEF SupportLongOffsets
+ ; NB offsets over 32000 require an additional check, which is skipped in most
+ ; Z80 decompressors (seemingly as a performance optimization)
+ cp 32000/256 : jr nc,.Add2 ; offset >= 32000: len += 2
+ ENDIF
+ cp 5 : jr nc,.Add1 ; offset >= 1280: len += 1
+ or a : jr nz,.Add0 ; 256 <= offset < 1280: len unchanged
+ bit 7,l : jr nz,.Add0 ; 128 <= offset < 256: len unchanged
+.Add2 inc bc ; offset < 128 (or the long-offset case above): len += 2 in total
+.Add1 inc bc
+.Add0 exa ; back to the main AF: A = bit buffer, carry = NC
+
+SaveLWMOffset:
+ push hl : pop ix ; save offset for future LWMs
+
+CopyMatch: ; this assumes that BC = len, DE = dest, HL = offset
+ ; and also that (SP) = src, while having NC
+ IFNDEF BackwardDecompression
+ push de
+ ex de,hl : sbc hl,de ; HL = dest-offset
+ pop de ; DE = dest
+ ELSE
+ add hl,de ; HL = dest+offset
+ ENDIF
+
+ COPY_BC ; copy the match; BC = 0 afterwards, which CASE10 reads as "last was match"
+ pop hl ; recover src
+ jr MainLoop
+
+;
+; the re-use of the previous offset (LWM magic)
+
+KickInLWM: ; "and a new gamma code for length"
+ call GetGammaCoded ; BC = len (returns with NC, as CopyMatch requires)
+ push ix : ex (sp),hl ; DE = dest, HL = prev offset, (SP) = src
+ jr CopyMatch ; IX already holds this offset, so SaveLWMOffset is skipped
+
+; GetGammaCoded: reads one gamma code via GET_BIT and returns it in BC (always >= 2), with flag NC
+; interlaced gamma code reader
+; x0 -> 1x
+; x1y0 -> 1xy
+; x1y1z0 -> 1xyz etc
+; (technically, this is a 2-based variation of Exp-Golomb-1)
+
+GetGammaCoded: ld bc,1 ; BC = 1, the implicit leading "1" of the result
+ReadGamma GET_BIT : rl c : rl b ; data bit -> carry -> shifted into BC
+ GET_BIT : ret nc ; continuation bit: 0 ends the code
+ jr ReadGamma ; 1 = another data/continuation pair follows
+
+; GetOneBit / ReloadByte: return the next stream bit in carry; A keeps the remaining bits plus an end marker
+; pretty usual getbit for mixed datastreams
+
+ IFNDEF FasterGetBit
+GetOneBit: add a : ret nz ; shift the next bit into carry; NZ = the end marker is still inside A
+ ENDIF
+ReloadByte: ld a,(hl) : NEXT_HL ; buffer empty (carry = the marker that just fell out): fetch the next source byte
+ rla : ret ; top bit -> carry, old carry (1) -> bit 0 as the new end marker
+
diff --git a/tools/apultra/asm/x86/aplib_x86_fast.asm b/tools/apultra/asm/x86/aplib_x86_fast.asm
new file mode 100644
index 0000000..9e41d31
--- /dev/null
+++ b/tools/apultra/asm/x86/aplib_x86_fast.asm
@@ -0,0 +1,180 @@
+; aplib_x86_fast.asm - speed-optimized aPLib decompressor for x86 - 188 bytes
+;
+; Copyright (C) 2019 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+ segment .text
+ bits 32
+
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * esi: compressed aPLib data
+; * edi: output buffer
+; output:
+; * eax: decompressed size
+; ---------------------------------------------------------------------------
+ %ifndef BIN
+ global apl_decompress
+ global _apl_decompress
+ %endif
+
+ ; uint32_t apl_decompress(const void *Source, void *Destination);
+
+%macro apl_get_bit 0 ; read bit into carry (uses al as bit queue; advances esi on refill)
+ add al,al ; shift bit queue, and high bit into carry
+ jnz %%gotbit ; queue not empty, bits remain
+ lodsb ; read 8 new bits (carry is still 1 here: the end marker just fell out)
+ adc al,al ; shift first new bit into carry; the old carry becomes the new end marker in bit 0
+%%gotbit:
+%endmacro
+
+apl_decompress:
+_apl_decompress:
+ pushad ; save all 8 general registers (32 bytes); .done stores the result into the saved eax slot
+
+ %ifdef CDECL
+ mov esi, [esp+32+4] ; esi = aPLib compressed data (first stack argument, above pushad frame + return address)
+ mov edi, [esp+32+8] ; edi = output (second stack argument)
+ %endif
+
+ ; === register map ===
+ ; al: bit queue
+ ; ah: unused, but value is trashed
+ ; ebx: follows_literal (3 after a literal or 1-byte copy, 2 after a match)
+ ; ecx: scratch register for reading gamma2 codes and storing copy length
+ ; edx: match offset (and rep-offset)
+ ; esi: input (compressed data) pointer
+ ; edi: output (decompressed data) pointer
+ ; ebp: not used by this version
+
+ mov al,080H ; clear bit queue(al) and set high bit to move into carry
+ xor edx, edx ; invalidate rep offset
+.literal: ; also the entry path: the first stream byte is copied as a literal
+ movsb ; read and write literal byte
+.next_command_after_literal:
+ mov ebx,03H ; set follows_literal(ebx) to 3
+
+.next_command:
+ apl_get_bit ; read 'literal or match' bit
+ jnc .literal ; if 0: literal
+
+ ; 1x: match
+
+ apl_get_bit ; read '8+n bits or other type' bit
+ jc .other ; 11x: other type of match
+
+ ; 10: 8+n bits match
+ call .get_gamma2 ; read gamma2-coded high offset bits
+ sub ecx,ebx ; high offset bits == 2 when follows_literal == 3 ?
+ ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+ ; is == 2 never borrows; only gamma2 == 2 with follows_literal == 3 sets carry)
+ jae .not_repmatch ; no borrow: not a rep-match, ecx = high offset bits
+
+ call .get_gamma2 ; read match length
+ jmp .got_len ; go copy, reusing the previous offset still held in edx
+
+.not_repmatch:
+ mov edx,ecx ; transfer high offset bits to edx
+ shl edx, 8 ; move them above the low byte
+ mov dl,[esi] ; read low offset byte in dl
+ inc esi
+
+ call .get_gamma2 ; read match length
+ cmp edx,07D00H ; offset >= 32000 ?
+ jae .increase_len_by2 ; if so, increase match len by 2
+ cmp edx,0500H ; offset >= 1280 ?
+ jae .increase_len_by1 ; if so, increase match len by 1
+ cmp edx,0080H ; offset >= 128 ?
+ jae .got_len ; if so, len is final; offsets < 128 fall through and get +2 (otherwise it would be a 7+1 copy)
+.increase_len_by2:
+ inc ecx ; increase length
+.increase_len_by1:
+ inc ecx ; increase length
+
+ ; copy ecx bytes from match offset edx
+
+.got_len:
+ push esi ; save the compressed-data pointer
+ mov esi,edi ; match source = output pointer (edi) - offset (edx)
+ sub esi,edx
+ rep movsb ; copy matched bytes
+ pop esi ; restore the compressed-data pointer
+ mov bl,02H ; set follows_literal to 2 (ebx is unmodified by match commands)
+ jmp .next_command
+
+ ; read gamma2-coded value into ecx (result is always >= 2; returns with carry clear)
+
+.get_gamma2:
+ xor ecx,ecx ; initialize to 1 so that value will start at 2
+ inc ecx ; when shifted left in the adc below
+
+.gamma2_loop:
+ apl_get_bit ; read data bit
+ adc ecx,ecx ; shift into ecx
+ apl_get_bit ; read continuation bit
+ jc .gamma2_loop ; loop until a zero continuation bit is read
+
+ ret
+
+ ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+
+.other:
+ xor ecx,ecx ; ecx = 0: accumulator for the length / 4-bit offset below
+ apl_get_bit ; read '7+1 match or short literal' bit
+ jc .short_literal ; 111: 4 bit offset for 1-byte copy
+
+ ; 110: 7 bits offset + 1 bit length
+
+ movzx edx,byte[esi] ; read offset + length in dl (upper bits of edx cleared)
+ inc esi
+
+ inc ecx ; prepare ecx for length below
+ shr dl,1 ; shift len bit into carry, and offset in place
+ je .done ; if zero offset: EOD
+ adc ecx,ecx ; len in ecx: 1*2 + carry bit = 2 or 3
+ jmp .got_len
+
+ ; 4 bits offset / 1 byte copy
+
+.short_literal:
+ apl_get_bit ; read 4 offset bits
+ adc ecx,ecx
+ apl_get_bit
+ adc ecx,ecx
+ apl_get_bit
+ adc ecx,ecx
+ apl_get_bit
+ adc ecx,ecx ; ZF set here if the 4-bit offset is 0
+ xchg eax,ecx ; preserve bit queue in ecx, put offset in eax (xchg leaves the flags alone)
+ jz .write_zero ; if offset is 0, write a zero byte (al is already 0)
+
+ ; short offset 1-15
+ mov ebx,edi ; ebx = output pointer - offset (in eax)
+ sub ebx,eax ; we trash ebx, it will be reset to 3 when we loop
+ mov al,[ebx] ; read byte from short offset
+.write_zero:
+ stosb ; copy matched byte
+ mov eax,ecx ; restore bit queue in al
+ jmp .next_command_after_literal
+
+ ; End of data: return the decompressed size in eax.
+ ; size = final output pointer (edi) - initial output pointer. Where the initial
+ ; pointer lives depends on the calling convention:
+ ; * CDECL: it is the Destination stack argument, above the 32-byte
+ ; pushad frame and the 4-byte return address;
+ ; * register call: it is the caller's edi, which pushad saved at [esp]
+ ; (pushad stores edi last, i.e. at the lowest address).
+ ; The previous code always read [esp+32+8], which is unrelated caller stack
+ ; in the register-call build and produced a garbage size.
+.done:
+ %ifdef CDECL
+ sub edi, [esp+32+8] ; compute decompressed size (edi - Destination argument)
+ %else
+ sub edi, [esp] ; compute decompressed size (edi - edi saved by pushad)
+ %endif
+ mov [esp+28], edi ; overwrite the saved eax so popad loads the size into eax
+ popad
+ ret
diff --git a/tools/apultra/asm/x86/aplib_x86_small.asm b/tools/apultra/asm/x86/aplib_x86_small.asm
new file mode 100644
index 0000000..ada00f6
--- /dev/null
+++ b/tools/apultra/asm/x86/aplib_x86_small.asm
@@ -0,0 +1,159 @@
+; aplib_x86_small.asm - size-optimized aPLib decompressor for x86 - 185 bytes
+;
+; Copyright (C) 2019 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+ segment .text
+ bits 32
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * esi: compressed aPLib data
+; * edi: output buffer
+; output:
+; * eax: decompressed size
+; ---------------------------------------------------------------------------
+ %ifndef BIN
+ global apl_decompress
+ global _apl_decompress
+ %endif
+
+apl_decompress:
+_apl_decompress:
+ pushad ; save all 8 general registers (32 bytes); .done stores the result into the saved eax slot
+
+ %ifdef CDECL
+ mov esi, [esp+32+4] ; esi = aPLib compressed data (first stack argument, above pushad frame + return address)
+ mov edi, [esp+32+8] ; edi = output (second stack argument)
+ %endif
+
+ ; === register map ===
+ ; al: bit queue
+ ; ah: unused, but value is trashed
+ ; ebx: follows_literal (3 after a literal or 1-byte copy, 2 after a match)
+ ; ecx: scratch register for reading gamma2 codes and storing copy length
+ ; edx: match offset (and rep-offset)
+ ; esi: input (compressed data) pointer
+ ; edi: output (decompressed data) pointer
+ ; ebp: address of .get_bit, so that a bit read is a short "call ebp"
+
+ mov al,080H ; clear bit queue(al) and set high bit to move into carry
+ xor edx, edx ; invalidate rep offset in edx
+
+ call .init_get_bit ; pushes the address of .get_dibits; .init_get_bit pops it instead of returning
+.get_dibits: ; read a data bit into ecx, then fall through to read one more bit into carry
+ call ebp ; read data bit
+ adc ecx,ecx ; shift into ecx
+.get_bit: ; read one bit into carry (also the tail of .get_dibits)
+ add al,al ; shift bit queue, and high bit into carry
+ jnz .got_bit ; queue not empty, bits remain
+ lodsb ; read 8 new bits (carry is still 1 here: the end marker just fell out)
+ adc al,al ; shift first new bit into carry; the old carry becomes the new end marker in bit 0
+.got_bit:
+ ret
+.init_get_bit:
+ pop ebp ; load address of .get_dibits (pushed by the call above)
+ add ebp, .get_bit - .get_dibits ; ebp = run-time address of .get_bit, to be used with call ebp
+.literal: ; also the entry path: the first stream byte is copied as a literal
+ movsb ; read and write literal byte
+.next_command_after_literal:
+ push 03H
+ pop ebx ; set follows_literal(ebx) to 3
+
+.next_command:
+ call ebp ; read 'literal or match' bit
+ jnc .literal ; if 0: literal
+
+ ; 1x: match
+ call ebp ; read '8+n bits or other type' bit
+ jc .other ; 11x: other type of match
+ ; 10: 8+n bits match
+ call .get_gamma2 ; read gamma2-coded high offset bits
+ sub ecx,ebx ; high offset bits == 2 when follows_literal == 3 ?
+ ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+ ; is == 2 will never result in a negative value)
+ jae .not_repmatch ; no borrow: not a rep-match, ecx = high offset bits
+ call .get_gamma2 ; read match length
+ jmp .got_len ; go copy, reusing the previous offset still held in edx
+.not_repmatch:
+ mov edx,ecx ; transfer high offset bits to edx
+ shl edx,8 ; move them above the low byte
+ mov dl,[esi] ; read low offset byte in dl
+ inc esi
+ call .get_gamma2 ; read match length
+ cmp edx,7D00H ; offset >= 32000 ?
+ jae .increase_len_by2 ; if so, increase match len by 2
+ cmp edx,0500H ; offset >= 1280 ?
+ jae .increase_len_by1 ; if so, increase match len by 1
+ cmp edx,0080H ; offset >= 128 ?
+ jae .got_len ; if so, len is final; offsets < 128 fall through and get +2 (otherwise it would be a 7+1 copy)
+.increase_len_by2:
+ inc ecx ; increase length
+.increase_len_by1:
+ inc ecx ; increase length
+ ; copy ecx bytes from match offset edx
+.got_len:
+ push esi ; save esi (current pointer to compressed data)
+ mov esi,edi ; match source = output pointer (edi) - offset (edx)
+ sub esi,edx
+ rep movsb ; copy matched bytes
+ pop esi ; restore esi
+ mov bl,02H ; set follows_literal to 2 (ebx is unmodified by match commands)
+ jmp .next_command
+ ; read gamma2-coded value into ecx (result is always >= 2; returns with carry clear)
+.get_gamma2:
+ xor ecx,ecx ; initialize to 1 so that value will start at 2
+ inc ecx ; when shifted left in the adc below
+.gamma2_loop:
+ call .get_dibits ; read data bit, shift into ecx, read continuation bit
+ jc .gamma2_loop ; loop until a zero continuation bit is read
+ ret
+ ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+.other:
+ xor ecx,ecx ; ecx = 0: accumulator for the length / 4-bit offset below
+ call ebp ; read '7+1 match or short literal' bit
+ jc .short_literal ; 111: 4 bit offset for 1-byte copy
+ ; 110: 7 bits offset + 1 bit length
+
+ movzx edx,byte[esi] ; read offset + length in dl (upper bits of edx cleared)
+ inc esi
+ inc ecx ; prepare ecx for length below
+ shr dl,1 ; shift len bit into carry, and offset in place
+ je .done ; if zero offset: EOD
+ adc ecx,ecx ; len in ecx: 1*2 + carry bit = 2 or 3
+ jmp .got_len
+ ; 4 bits offset / 1 byte copy
+.short_literal:
+ call .get_dibits ; read 2 offset bits (the second one comes back in carry)
+ adc ecx,ecx
+ call .get_dibits ; read 2 offset bits
+ adc ecx,ecx ; ZF set here if the 4-bit offset is 0
+ xchg eax,ecx ; preserve bit queue in ecx, put offset in eax (xchg leaves the flags alone)
+ jz .write_zero ; if offset is 0, write a zero byte (al is already 0)
+ ; short offset 1-15
+ mov ebx,edi ; ebx = output pointer - offset (in eax)
+ sub ebx,eax ; we trash ebx, it will be reset to 3 when we loop
+ mov al,[ebx] ; read byte from short offset
+.write_zero:
+ stosb ; copy matched byte
+ xchg eax,ecx ; restore bit queue in al
+ jmp .next_command_after_literal
+ ; End of data: return the decompressed size in eax.
+ ; size = final output pointer (edi) - initial output pointer. Where the initial
+ ; pointer lives depends on the calling convention:
+ ; * CDECL: it is the Destination stack argument, above the 32-byte
+ ; pushad frame and the 4-byte return address;
+ ; * register call: it is the caller's edi, which pushad saved at [esp]
+ ; (pushad stores edi last, i.e. at the lowest address).
+ ; The previous code always read [esp+32+8], which is unrelated caller stack
+ ; in the register-call build and produced a garbage size.
+.done:
+ %ifdef CDECL
+ sub edi, [esp+32+8] ; compute decompressed size (edi - Destination argument)
+ %else
+ sub edi, [esp] ; compute decompressed size (edi - edi saved by pushad)
+ %endif
+ mov [esp+28], edi ; overwrite the saved eax so popad loads the size into eax
+ popad
+ ret