From 9bcf1e97960c0da7322a868efdbc07e2650716fe Mon Sep 17 00:00:00 2001 From: "Juan J. Martinez" Date: Sat, 9 Jan 2021 09:01:05 +0000 Subject: Extra libs: ap.lib aPLib support with apultra. --- tools/apultra/asm/6502/aplib_6502.asm | 257 ++++++++++++++++++++++ tools/apultra/asm/6502/aplib_6502_b.asm | 218 +++++++++++++++++++ tools/apultra/asm/68000/unaplib_68000.S | 117 +++++++++++ tools/apultra/asm/6809/unaplib.s | 125 +++++++++++ tools/apultra/asm/6809/unaplib_6309.s | 139 ++++++++++++ tools/apultra/asm/6809/unaplib_6309_b.s | 143 +++++++++++++ tools/apultra/asm/6809/unaplib_b.s | 122 +++++++++++ tools/apultra/asm/8088/aplib_8088_fast.S | 178 ++++++++++++++++ tools/apultra/asm/8088/aplib_8088_small.S | 177 ++++++++++++++++ tools/apultra/asm/ARM7TDMI/aplib_arm.s | 150 +++++++++++++ tools/apultra/asm/Z80/unaplib_fast.asm | 339 ++++++++++++++++++++++++++++++ tools/apultra/asm/Z80/unaplib_small.asm | 258 +++++++++++++++++++++++ tools/apultra/asm/x86/aplib_x86_fast.asm | 180 ++++++++++++++++ tools/apultra/asm/x86/aplib_x86_small.asm | 159 ++++++++++++++ 14 files changed, 2562 insertions(+) create mode 100644 tools/apultra/asm/6502/aplib_6502.asm create mode 100644 tools/apultra/asm/6502/aplib_6502_b.asm create mode 100644 tools/apultra/asm/68000/unaplib_68000.S create mode 100644 tools/apultra/asm/6809/unaplib.s create mode 100644 tools/apultra/asm/6809/unaplib_6309.s create mode 100644 tools/apultra/asm/6809/unaplib_6309_b.s create mode 100644 tools/apultra/asm/6809/unaplib_b.s create mode 100644 tools/apultra/asm/8088/aplib_8088_fast.S create mode 100644 tools/apultra/asm/8088/aplib_8088_small.S create mode 100644 tools/apultra/asm/ARM7TDMI/aplib_arm.s create mode 100644 tools/apultra/asm/Z80/unaplib_fast.asm create mode 100644 tools/apultra/asm/Z80/unaplib_small.asm create mode 100644 tools/apultra/asm/x86/aplib_x86_fast.asm create mode 100644 tools/apultra/asm/x86/aplib_x86_small.asm (limited to 'tools/apultra/asm') diff --git 
a/tools/apultra/asm/6502/aplib_6502.asm b/tools/apultra/asm/6502/aplib_6502.asm new file mode 100644 index 0000000..1bc11b4 --- /dev/null +++ b/tools/apultra/asm/6502/aplib_6502.asm @@ -0,0 +1,257 @@ +; *************************************************************************** +; *************************************************************************** +; +; aplib_6502.s +; +; NMOS 6502 decompressor for data stored in Jorgen Ibsen's aPLib format. +; +; Includes support for Emmanuel Marty's enhancements to the aPLib format. +; +; The code is 252 bytes long for standard format, 270 for enhanced format. +; +; This code is written for the ACME assembler. +; +; Copyright John Brandwood 2019. +; +; Distributed under the Boost Software License, Version 1.0. +; (See accompanying file LICENSE_1_0.txt or copy at +; http://www.boost.org/LICENSE_1_0.txt) +; +; *************************************************************************** +; *************************************************************************** + + + +; *************************************************************************** +; *************************************************************************** +; +; Decompression Macros +; + + ; + ; Macro to increment the source pointer to the next page. + ; + + !macro APL_INC_PAGE { + inc 64768 bytes has +; been removed, since these don't occur with a 16-bit address range. +; +; As an optimization, the code to handle window offsets > 32000 bytes can +; be commented-out, since these don't occur in typical 8-bit computer usage. +; + +apl_decompress: ldy #0 ; Initialize source index. + + lda #$80 ; Initialize an empty + sta + bne .skip2 + jsr .load_bit +.skip2: bcc .copy_large + + asl - gamma-coded LZSS pair. + ; + +.copy_large: jsr .get_gamma ; Bits 8..15 of offset (min 2). + sty = 32000, length += 2. + bcs .match_plus2 + cpy #$05 ; If offset >= 1280, length += 1. + bcs .match_plus1 + bcc .do_match +.lt256: ldy = 256. + + cpx #$7d ; offset >= 32000 (7d00) ? 
+ bcs .offset_incby2 ; if so, increase match len by 2 + cpx #$05 ; offset >= 1280 (0500) ? + bcs .offset_incby1 ; if so, increase match len by 1 + bcc .got_len ; length is fine, go copy + +.offset_1byte: ldx = 32000 ? + bge.s .inc_by_2 ; if so, increase match len by 2 + cmp.l a3,d3 ; offset >= 1280 ? + bge.s .inc_by_1 ; if so, increase match len by 1 + cmp.l a4,d3 ; offset < 128 ? + bge.s .got_len ; if so, increase match len by 2 +.inc_by_2: addq.l #1,d0 ; increase match len by 1 +.inc_by_1: addq.l #1,d0 ; increase match len by 1 + +.got_len: move.l a1,a6 ; calculate backreference address + sub.l d3,a6 ; (dest - match offset) + subq.l #1,d0 ; dbf will loop until d0 is -1, not 0 +.copy_match: move.b (a6)+,(a1)+ ; copy matched byte + dbf d0,.copy_match ; loop for all matched bytes + moveq #2,d2 ; clear LWM flag + bra.s .next_token ; go decode next token + +.other_match: bsr.s .get_bit ; read '7+1 match or short literal' bit + bcs.s .short_match ; if 111: 4 bit offset for 1-byte copy + + moveq #1,d0 ; 110: prepare match length + moveq #0,d3 ; clear high bits of offset + move.b (a0)+,d3 ; read low bits of offset + length bit + lsr.b #1,d3 ; shift offset into place, len into carry + beq.s .done ; check for EOD + addx.b d0,d0 ; len = (1 << 1) + carry bit, ie. 
2 or 3 + bra.s .got_len ; go copy match + +.short_match: moveq #0,d0 ; clear short offset before reading 4 bits + bsr.s .get_dibits ; read a data bit into d0, one into carry + addx.b d0,d0 ; shift second bit into d0 + bsr.s .get_dibits ; read a data bit into d0, one into carry + addx.b d0,d0 ; shift second bit into d0 + beq.s .write_zero ; if offset is zero, write a 0 + + move.l a1,a6 ; calculate backreference address + sub.l d0,a6 ; (dest - short offset) + move.b (a6),d0 ; read matched byte +.write_zero: move.b d0,(a1)+ ; write matched byte or 0 + bra.s .after_lit ; set LWM flag and go decode next token + +.done: move.l a1,d0 ; pointer to last decompressed byte + 1 + sub.l a6,d0 ; minus start of decompression buffer = size + movem.l (sp)+,a2-a6/d2-d3 + rts + +.get_gamma2: moveq #1,d0 ; init to 1 so it gets shifted to 2 below +.gamma2_loop: bsr.s .get_dibits ; read data bit, shift into d0 + ; and read continuation bit + bcs.s .gamma2_loop ; loop until a 0 continuation bit is read + rts + +.get_dibits: bsr.s .get_bit ; read bit + addx.l d0,d0 ; shift into d0 + ; fall through +.get_bit: add.b d1,d1 ; shift bit queue, high bit into carry + bne.s .got_bit ; queue not empty, bits remain + move.b (a0)+,d1 ; read 8 new bits + addx.b d1,d1 ; shift bit queue, high bit into carry + ; and shift 1 from carry into bit queue +.got_bit: rts diff --git a/tools/apultra/asm/6809/unaplib.s b/tools/apultra/asm/6809/unaplib.s new file mode 100644 index 0000000..641c3f4 --- /dev/null +++ b/tools/apultra/asm/6809/unaplib.s @@ -0,0 +1,125 @@ +; unaplib.s - aPLib decompressor for 6809 - 157 bytes +; +; in: x = start of compressed data +; y = start of decompression buffer +; out: y = end of decompression buffer + 1 +; +; Copyright (C) 2020 Emmanuel Marty +; +; This software is provided 'as-is', without any express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. 
+; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. + +apl_decompress + lda #$80 ; initialize empty bit queue + sta = 32000 ? + bge apincby2 ; if so, increase match len by 2 + cmpx #$0500 ; offset >= 1280 ? + bge apincby1 ; if so, increase match len by 1 + cmpx #$80 ; offset < 128 ? + bge apgotlen ; if so, increase match len by 2 +apincby2 addd #1 +apincby1 addd #1 +apgotlen pshs u ; save source compressed data pointer + tfr d,x ; copy match length to X + +aprepof ldd #$aaaa ; load match offset + nega ; reverse sign of offset in D + negb + sbca #0 + leau d,y ; put backreference start address in U (dst+offset) + +apcpymt lda ,u+ ; copy matched byte + sta ,y+ + leax -1,x ; decrement X + bne apcpymt ; loop until all matched bytes are copied + + puls u ; restore source compressed data pointer + + lda #$02 ; clear 'follows literal' flag + bra aptoken + +apdibits bsr apgetbit ; read bit + rolb ; push into B +apgetbit lsl +; +; APLIB_LONG_OFFSET_DISABLE +; Defined variable to disable long offsets >= 32000 for a speed and space +; optimization. Only enable this if you know what you are doing. +; ex. 
APLIB_LONG_OFFSET_DISABLE equ 1 + + +; define options + ifdef APLIB_VAR +apbitbuf equ APLIB_VAR ; bit queue (use DP memory for mem & space optimization) + else +apbitbuf fcb 0 ; bit queue (DEFAULT - use extended memory) + endc + + +apl_decompress + lda #$80 ; initialize empty bit queue + sta apbitbuf ; plus bit to roll into carry + tfr x,u + +apcplit ldb ,u+ ; copy literal byte +apwtlit stb ,y+ + + ldb #3 ; set 'follows literal' flag + +aptoken bsr apgetbit ; read 'literal or match' bit + bcc apcplit ; if 0: literal + + bsr apgetbit ; read '8+n bits or other type' bit + bcs apother ; if 11x: other type of match + + bsr apgamma2 ; 10: read gamma2-coded high offset bits + clra + subr d,w ; high offset bits == 2 when follows_literal == 3 ? + bcc apnorep ; if not, not a rep-match + + bsr apgamma2 ; read repmatch length + bra apgotlen ; go copy large match + +apnorep tfr f,a ; transfer high offset bits to A + ldb ,u+ ; read low offset byte in B + tfr d,x ; save match offset + + bsr apgamma2 ; read match length + + ifndef APLIB_LONG_OFFSET_DISABLE + cmpx #$7D00 ; offset >= 32000 ? + bge apincby2 ; if so, increase match len by 2 + endc + cmpx #$0500 ; offset >= 1280 ? + bge apincby1 ; if so, increase match len by 1 + cmpx #$80 ; offset < 128 ? 
+ bge apgotlen ; if so, increase match len by 2 +apincby2 incw +apincby1 incw + +apgotlen tfr y,d ; transfer dst to D + subr x,d ; put backreference start address in D (dst + offset) + tfm d+,y+ ; copy matched bytes + + ldb #2 ; clear 'follows literal' flag + bra aptoken + +apgamma2 ldw #1 ; init to 1 so it gets shifted to 2 below +loop@ bsr apgetbit ; read data bit + rolw ; shift into W + bsr apgetbit ; read continuation bit + bcs loop@ ; loop until a zero continuation bit is read + rts + +apdibits bsr apgetbit ; read bit + rolb ; push into B +apgetbit lsl apbitbuf ; shift bit queue, and high bit into carry + bne aprts ; queue not empty, bits remain + lda ,u+ ; read 8 new bits + rola ; shift bit queue, and high bit into carry + sta apbitbuf ; save bit queue +aprts rts + +apshort clrb + bsr apdibits ; read 2 offset bits + rolb + bsr apdibits ; read 4 offset bits + rolb + beq apwtlit ; if zero, go write it + + negb ; reverse offset in D + ldb b,y ; load backreferenced byte from dst+offset + bra apwtlit ; go write it + +apother bsr apgetbit ; read '7+1 match or short literal' bit + bcs apshort ; if 111: 4 bit offset for 1-byte copy + + ldb ,u+ ; read low bits of offset + length bit in B + beq aprts ; check for EOD and exit if so + clra ; clear high bits in A + lsrb ; shift offset in place, shift length bit into carry + tfr d,x ; save match offset + ldb #1 ; len in B will be 2*1+carry: + rolb ; shift length, and carry into B + tfr d,w + bra apgotlen ; go copy match diff --git a/tools/apultra/asm/6809/unaplib_6309_b.s b/tools/apultra/asm/6809/unaplib_6309_b.s new file mode 100644 index 0000000..8343edf --- /dev/null +++ b/tools/apultra/asm/6809/unaplib_6309_b.s @@ -0,0 +1,143 @@ +; unaplib_6309_b.s - aPLib backward decompressor for H6309 - 139 bytes +; +; in: x = last byte of compressed data +; y = last byte of decompression buffer +; out: y = first byte of decompressed data +; +; Copyright (C) 2020 Emmanuel Marty +; +; This software is provided 'as-is', without any 
express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. + + +; Original M6809 version written by Emmanuel Marty with Hitachi 6309 enhancements +; added by Doug Masten. +; +; Main advantage of H6309 CPU is the "TFM" instruction which can copy one +; byte of memory in 3 clock cycles vs a traditional copy loop that takes +; 20 clock cycles. + +; Options: +; APLIB_VAR +; Define variable to point to a DP memory location for a memory space +; and speed optimization. +; ex. APLIB_VAR equ <DP location> +; +; APLIB_LONG_OFFSET_DISABLE +; Define variable to disable long offsets >= 32000 for a speed and space +; optimization. Only enable this if you know what you are doing. +; ex. 
APLIB_LONG_OFFSET_DISABLE equ 1 + + +; define options + ifdef APLIB_VAR +apbitbuf equ APLIB_VAR ; bit queue (use DP memory for mem & space optimization) + else +apbitbuf fcb 0 ; bit queue (DEFAULT - use extended memory) + endc + + +apl_decompress + lda #$80 ; initialize empty bit queue + sta apbitbuf ; plus bit to roll into carry + leau 1,x + leay 1,y + +apcplit ldb ,-u ; copy literal byte +apwtlit stb ,-y + + ldb #3 ; set 'follows literal' flag + +aptoken bsr apgetbit ; read 'literal or match' bit + bcc apcplit ; if 0: literal + + bsr apgetbit ; read '8+n bits or other type' bit + bcs apother ; if 11x: other type of match + + bsr apgamma2 ; 10: read gamma2-coded high offset bits + clra + subr d,w ; high offset bits == 2 when follows_literal == 3 ? + bcc apnorep ; if not, not a rep-match + + bsr apgamma2 ; read repmatch length + bra apgotlen ; go copy large match + +apnorep tfr f,a ; transfer high offset bits to A + ldb ,-u ; read low offset byte in B + tfr d,x ; save match offset + + bsr apgamma2 ; read match length + + ifndef APLIB_LONG_OFFSET_DISABLE + cmpx #$7D00 ; offset >= 32000 ? + bge apincby2 ; if so, increase match len by 2 + endc + cmpx #$0500 ; offset >= 1280 ? + bge apincby1 ; if so, increase match len by 1 + cmpx #$80 ; offset < 128 ? 
+ bge apgotlen ; if so, increase match len by 2 +apincby2 incw +apincby1 incw + +apgotlen tfr y,d ; transfer dst to D + addr x,d ; put backreference start address in D (dst + offset) + decd + leay -1,y + tfm d-,y- ; copy matched bytes + leay 1,y + + ldb #2 ; clear 'follows literal' flag + bra aptoken + +apgamma2 ldw #1 ; init to 1 so it gets shifted to 2 below +loop@ bsr apgetbit ; read data bit + rolw ; shift into W + bsr apgetbit ; read continuation bit + bcs loop@ ; loop until a zero continuation bit is read + rts + +apdibits bsr apgetbit ; read bit + rolb ; push into B +apgetbit lsl apbitbuf ; shift bit queue, and high bit into carry + bne aprts ; queue not empty, bits remain + lda ,-u ; read 8 new bits + rola ; shift bit queue, and high bit into carry + sta apbitbuf ; save bit queue +aprts rts + +apshort clrb + bsr apdibits ; read 2 offset bits + rolb + bsr apdibits ; read 4 offset bits + rolb + beq apwtlit ; if zero, go write it + + decb ; we load below without predecrement, adjust here + ldb b,y ; load backreferenced byte from dst+offset + bra apwtlit ; go write it + +apother bsr apgetbit ; read '7+1 match or short literal' bit + bcs apshort ; if 111: 4 bit offset for 1-byte copy + + ldb ,-u ; read low bits of offset + length bit in B + beq aprts ; check for EOD and exit if so + clra ; clear high bits in A + lsrb ; shift offset in place, shift length bit into carry + tfr d,x ; save match offset + ldb #1 ; len in B will be 2*1+carry: + rolb ; shift length, and carry into B + tfr d,w + bra apgotlen ; go copy match diff --git a/tools/apultra/asm/6809/unaplib_b.s b/tools/apultra/asm/6809/unaplib_b.s new file mode 100644 index 0000000..02f943c --- /dev/null +++ b/tools/apultra/asm/6809/unaplib_b.s @@ -0,0 +1,122 @@ +; unaplib_b.s - aPLib backward decompressor for 6809 - 154 bytes +; +; in: x = last byte of compressed data +; y = last byte of decompression buffer +; out: y = first byte of decompressed data +; +; Copyright (C) 2020 Emmanuel Marty +; +; This 
software is provided 'as-is', without any express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. + +apl_decompress + lda #$80 ; initialize empty bit queue + sta = 32000 ? + bge apincby2 ; if so, increase match len by 2 + cmpx #$0500 ; offset >= 1280 ? + bge apincby1 ; if so, increase match len by 1 + cmpx #$80 ; offset < 128 ? 
+ bge apgotlen ; if so, increase match len by 2 +apincby2 addd #1 +apincby1 addd #1 +apgotlen pshs u ; save source compressed data pointer + tfr d,x ; copy match length to X + +aprepof leau $aaaa,y ; put backreference start address in U (dst+offset) + +apcpymt lda ,-u ; copy matched byte + sta ,-y + leax -1,x ; decrement X + bne apcpymt ; loop until all matched bytes are copied + + puls u ; restore source compressed data pointer + + lda #$02 ; clear 'follows literal' flag + bra aptoken + +apdibits bsr apgetbit ; read bit + rolb ; push into B +apgetbit lsl = 2, so substracting follows_literal when it + ; is == 2 will never result in zero) + jae .not_repmatch ; if not, not a rep-match + + call .get_gamma2 ; read match length + jmp short .got_len ; go copy + +.not_repmatch: + mov dh,cl ; transfer high offset bits to dh + mov dl,[si] ; read low offset byte in dl + inc si + + call .get_gamma2 ; read match length + cmp dh,07DH ; offset >= 32000 ? + jae .increase_len_by2 ; if so, increase match len by 2 + cmp dh,05H ; offset >= 1280 ? + jae .increase_len_by1 ; if so, increase match len by 1 + cmp dx,0080H ; offset < 128 ? 
+ jae .got_len ; if so, increase match len by 2, otherwise it would be a 7+1 copy +.increase_len_by2: + inc cx ; increase length +.increase_len_by1: + inc cx ; increase length + + ; copy cx bytes from match offset dx + +.got_len: + push ds ; save ds:si (current pointer to compressed data) + mov bp,si + + push es + pop ds + mov si,di ; point to destination in es:di - offset in dx + sub si,dx + rep movsb ; copy matched bytes + + mov si,bp ; restore ds:si + pop ds + + mov bl,02H ; set follows_literal to 2 (bx is unmodified by match commands) + jmp short .next_command + + ; read gamma2-coded value into cx + +.get_gamma2: + xor cx,cx ; initialize to 1 so that value will start at 2 + inc cx ; when shifted left in the adc below + +.gamma2_loop: + apl_get_bit ; read data bit + adc cx,cx ; shift into cx + apl_get_bit ; read continuation bit + jc .gamma2_loop ; loop until a zero continuation bit is read + + ret + + ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy + +.other: + xor cx,cx + apl_get_bit ; read '7+1 match or short literal' bit + jc .short_literal ; 111: 4 bit offset for 1-byte copy + + ; 110: 7 bits offset + 1 bit length + + mov dl,[si] ; read offset + length in dl + inc si + + inc cx ; prepare cx for length below + shr dl,1 ; shift len bit into carry, and offset in place + je .done ; if zero offset: EOD + adc cx,cx ; len in cx: 1*2 + carry bit = 2 or 3 + + xor dh,dh ; clear high bits of offset + jmp short .got_len + + ; 4 bits offset / 1 byte copy + +.short_literal: + apl_get_bit ; read 4 offset bits + adc cl,cl + apl_get_bit + adc cl,cl + apl_get_bit + adc cl,cl + apl_get_bit + adc cl,cl + xchg ax,cx ; preserve bit queue in cx, put offset in ax + jz .write_zero ; if offset is 0, write a zero byte + + ; short offset 1-15 + mov bx,di ; point to destination in es:di - offset in ax + sub bx,ax ; we trash bx, it will be reset to 3 when we loop + mov al,[es:bx] ; read byte from short offset +.write_zero: + stosb ; copy matched byte + mov ax,cx ; 
restore bit queue in al + jmp .next_command_after_literal + +.done: + pop ax ; retrieve the original decompression offset + xchg di,ax ; compute decompressed size + sub ax,di + ret diff --git a/tools/apultra/asm/8088/aplib_8088_small.S b/tools/apultra/asm/8088/aplib_8088_small.S new file mode 100644 index 0000000..542991e --- /dev/null +++ b/tools/apultra/asm/8088/aplib_8088_small.S @@ -0,0 +1,177 @@ +; aplib_8088_small.S - size-optimized aPLib decompressor for 8088 - 145 bytes +; +; Copyright (C) 2019 Emmanuel Marty +; +; This software is provided 'as-is', without any express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. 
+ + segment .text + bits 16 + +; --------------------------------------------------------------------------- +; Decompress aPLib data +; inputs: +; * ds:si: compressed aPLib data +; * es:di: output buffer +; output: +; * ax: decompressed size +; --------------------------------------------------------------------------- + +apl_decompress: + push di ; remember decompression offset + cld ; make string operations go forward + + ; === register map === + ; al: bit queue + ; ah: unused, but value is trashed + ; bx: follows_literal + ; cx: scratch register for reading gamma2 codes and storing copy length + ; dx: match offset (and rep-offset) + ; si: input (compressed data) pointer + ; di: output (decompressed data) pointer + ; bp: offset of .get_bit + + mov al,080H ; clear bit queue(al) and set high bit to move into carry + xor dx,dx ; invalidate rep offset + mov bp,.get_bit ; load offset of .get_bit, to be used with call bp + +.literal: + movsb ; read and write literal byte +.next_command_after_literal: + mov bx,03H ; set follows_literal(bx) to 3 + +.next_command: + call bp ; read 'literal or match' bit + jnc .literal ; if 0: literal + + ; 1x: match + + call bp ; read '8+n bits or other type' bit + jc .other ; 11x: other type of match + + ; 10: 8+n bits match + call .get_gamma2 ; read gamma2-coded high offset bits + sub cx,bx ; high offset bits == 2 when follows_literal == 3 ? + ; (a gamma2 value is always >= 2, so substracting follows_literal when it + ; is == 2 will never result in a negative value) + jae .not_repmatch ; if not, not a rep-match + + call .get_gamma2 ; read match length + jmp short .got_len ; go copy + +.not_repmatch: + mov dh,cl ; transfer high offset bits to dh + mov dl,[si] ; read low offset byte in dl + inc si + + call .get_gamma2 ; read match length + cmp dh,07DH ; offset >= 32000 ? + jae .increase_len_by2 ; if so, increase match len by 2 + cmp dh,05H ; offset >= 1280 ? 
+ jae .increase_len_by1 ; if so, increase match len by 1 + cmp dx,0080H ; offset < 128 ? + jae .got_len ; if so, increase match len by 2, otherwise it would be a 7+1 copy +.increase_len_by2: + inc cx ; increase length +.increase_len_by1: + inc cx ; increase length + + ; copy cx bytes from match offset dx + +.got_len: + push ds ; save ds:si (current pointer to compressed data) + push si + + push es + pop ds + mov si,di ; point to destination in es:di - offset in dx + sub si,dx + rep movsb ; copy matched bytes + + pop si ; restore ds:si + pop ds + + mov bl,02H ; set follows_literal to 2 (bx is unmodified by match commands) + jmp short .next_command + + ; read gamma2-coded value into cx + +.get_gamma2: + xor cx,cx ; initialize to 1 so that value will start at 2 + inc cx ; when shifted left in the adc below + +.gamma2_loop: + call .get_dibits ; read data bit, shift into cx, read continuation bit + jc .gamma2_loop ; loop until a zero continuation bit is read + + ret + + ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy + +.other: + xor cx,cx + call bp ; read '7+1 match or short literal' bit + jc .short_literal ; 111: 4 bit offset for 1-byte copy + + ; 110: 7 bits offset + 1 bit length + + mov dl,[si] ; read offset + length in dl + inc si + + inc cx ; prepare cx for length below + shr dl,1 ; shift len bit into carry, and offset in place + je .done ; if zero offset: EOD + adc cx,cx ; len in cx: 1*2 + carry bit = 2 or 3 + + xor dh,dh ; clear high bits of offset + jmp short .got_len + + ; 4 bits offset / 1 byte copy + +.short_literal: + call .get_dibits ; read 2 offset bits + adc cx,cx + call .get_dibits ; read 2 offset bits + adc cx,cx + xchg ax,cx ; preserve bit queue in cx, put offset in ax + jz .write_zero ; if offset is 0, write a zero byte + + ; short offset 1-15 + mov bx,di ; point to destination in es:di - offset in ax + sub bx,ax ; we trash bx, it will be reset to 3 when we loop + mov al,[es:bx] ; read byte from short offset +.write_zero: + stosb ; 
copy matched byte + xchg ax,cx ; restore bit queue in al + jmp .next_command_after_literal + +.done: + pop ax ; retrieve the original decompression offset + xchg di,ax ; compute decompressed size + sub ax,di + ret + +.get_dibits: + call bp ; read data bit + adc cx,cx ; shift into cx + +.get_bit: + add al,al ; shift bit queue, and high bit into carry + jnz .got_bit ; queue not empty, bits remain + lodsb ; read 8 new bits + adc al,al ; shift bit queue, and high bit into carry +.got_bit: + ret diff --git a/tools/apultra/asm/ARM7TDMI/aplib_arm.s b/tools/apultra/asm/ARM7TDMI/aplib_arm.s new file mode 100644 index 0000000..b6d0cef --- /dev/null +++ b/tools/apultra/asm/ARM7TDMI/aplib_arm.s @@ -0,0 +1,150 @@ +@APlib ARM7 decompressor by Dan Weiss, based on the original C version +@Takes in raw apacked data, NOT data created by the 'safe' compressor. +@Code is from the PocketNES NES Emulator for GBA + +@Code is formatted for GNU Assembler + + src .req r0 + dest .req r1 + byte .req r2 + mask .req r3 + gamma .req r4 + lwm .req r6 + recentoff .req r7 + temp .req r8 + +.global depack +.type depack STT_FUNC + +@r0 = src +@r1 = dest +@r2 = byte +@r3 = rotating bit mask +@r4 = increasing gamma +@r6 = lwm +@r7 = recentoff +@r8 = lr copy/scratch + + .macro GETBIT @3 instructions + movs mask,mask,ror #1 + ldrcsb byte,[src],#1 + tst byte,mask + .endm + + .macro GETBITGAMMA @5 instructions + mov gamma,gamma,lsl #1 + GETBIT + addne gamma,gamma,#1 + .endm + +@This initilaiztion code can go into slow memory + +depack: + stmfd sp!,{r4-r10,lr} + ldrb temp,[src],#1 + strb temp,[dest],#1 + ldr mask,=0x01010101 + b aploop_nolwm + +@This inner-loop code should be placed into fast memory + + @depack enters here +aploop_nolwm: + mov lwm,#0 +aploop: + GETBIT + bne apbranch1 + ldrb temp,[src],#1 + strb temp,[dest],#1 + b aploop_nolwm +apbranch1: + GETBIT + beq apbranch2 + GETBIT + beq apbranch3 + @get an offset + mov gamma,#0 + GETBIT + addne gamma,gamma,#1 + GETBITGAMMA + GETBITGAMMA + GETBITGAMMA 
+ cmp gamma,#0 + ldrneb gamma,[dest,-gamma] + strb gamma,[dest],#1 + b aploop_nolwm +apbranch3: + @use 7 bit offset, length = 2 or 3 + @if a zero is encountered here, it's EOF + ldrb gamma,[src],#1 + movs recentoff,gamma,lsr #1 + beq done + ldrcsb temp,[dest,-recentoff] + strcsb temp,[dest],#1 + ldrb temp,[dest,-recentoff] + strb temp,[dest],#1 + ldrb temp,[dest,-recentoff] + strb temp,[dest],#1 + mov lwm,#1 + b aploop +apbranch2: + @use a gamma code * 256 for offset, another gamma code for length + + bl ap_getgamma + sub gamma,gamma,#2 + cmp lwm,#0 + bne ap_is_lwm + mov lwm,#1 + cmp gamma,#0 + bne ap_not_zero_gamma + + @if gamma code is 2, use old recent offset, and a new gamma code for length + bl ap_getgamma +copyloop1: + ldrb temp,[dest,-recentoff] + strb temp,[dest],#1 + subs gamma,gamma,#1 + bne copyloop1 + b aploop + +ap_not_zero_gamma: + sub gamma,gamma,#1 +ap_is_lwm: + ldrb temp,[src],#1 + add recentoff,temp,gamma,lsl #8 + bl ap_getgamma + @gamma=length + cmp recentoff,#32000 + addge gamma,gamma,#1 + cmp recentoff,#1280 + addge gamma,gamma,#1 + cmp recentoff,#128 + addlt gamma,gamma,#2 +copyloop2: + ldrb temp,[dest,-recentoff] + strb temp,[dest],#1 + subs gamma,gamma,#1 + bne copyloop2 + b aploop + +ap_getgamma: + mov gamma,#1 +ap_getgammaloop: + GETBITGAMMA + GETBIT + bne ap_getgammaloop + bx lr + +done: + ldmfd sp!,{r4-r10,lr} + bx lr + +.unreq src +.unreq dest +.unreq byte +.unreq mask +.unreq gamma +.unreq lwm +.unreq recentoff +.unreq temp + diff --git a/tools/apultra/asm/Z80/unaplib_fast.asm b/tools/apultra/asm/Z80/unaplib_fast.asm new file mode 100644 index 0000000..c21eb5d --- /dev/null +++ b/tools/apultra/asm/Z80/unaplib_fast.asm @@ -0,0 +1,339 @@ +; +; Speed-optimized ApLib decompressor by spke & uniabis (ver.06 01-05/06/2020, 235 bytes) +; +; The original Z80 decompressors for ApLib were written by Dan Weiss (Dwedit), +; then tweaked by Francisco Javier Pena Pareja (utopian), +; and optimized by Jaime Tejedor Gomez (Metalbrain) and Antonio 
Villena. +; +; This is a new "implicit state" decompressor heavily optimized for speed by spke. +; (It is 12 bytes shorter and 18% faster than the previously fastest +; 247b decompressor by Metalbrain and Antonio Villena.) +; +; ver.00 by spke (21/08/2018-01/09/2018, 244 bytes, an edit of the existing 247b decompressor); +; ver.01 by spke (12-13/11/2018, 234(-10) bytes, +3% speed using the state machine for LWM); +; ver.02 by spke (06/08/2019, +1% speed); +; ver.03 by spke (27/08/2019, 236(+2) bytes, +1% speed using partly expanded LDIR); +; ver.04 by spke (spring 2020, added full revision history and support for long offsets) +; ver.05 by spke (17-31/05/2020, 230(-6) bytes, +3% speed, added support for backward compression) <- BROKEN, DO NOT USE +; ver.06 by uniabis & spke (01-07/06/2020, 235(+5) bytes, +1% speed, added support for HD64180) +; +; The data must be compressed using any compressor for ApLib capable of generating raw data. +; At present, the two best available compressors are: +; +; "APC" by Sven-Ake Dahl: https://github.com/svendahl/cap or +; "apultra" by Emmanuel Marty: https://github.com/emmanuel-marty/apultra +; +; The compression can be done as follows: +; +; apc.exe e <sourcefile> <outfile> +; or +; apultra.exe <sourcefile> <outfile> +; +; A decent compressor was written by r57shell (although it is worse than compressors above): +; http://gendev.spritesmind.net/forum/viewtopic.php?p=32548#p32548 +; The use of the official ApLib compressor by Joergen Ibsen is not recommended. +; +; The decompression is done in the standard way: +; +; ld hl,FirstByteOfCompressedData +; ld de,FirstByteOfMemoryForDecompressedData +; call DecompressApLib +; +; Backward decompression is also supported; you can compress files backward using: +; +; apultra.exe -b <sourcefile> <outfile> +; +; uncomment option "DEFINE BackwardDecompression" and decompress the resulting files using: +; +; ld hl,LastByteOfCompressedData +; ld de,LastByteOfMemoryForDecompressedData +; call DecompressApLib +; +; The decompressor modifies AF, AF', BC, DE, HL, IX. 
+; +; Of course, ApLib compression algorithms are (c) 1998-2014 Joergen Ibsen, +; see http://www.ibsensoftware.com/ for more information +; +; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com +; +; This software is provided 'as-is', without any express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. + +; DEFINE SupportLongOffsets ; +4 bytes for long offset support. 
slows decompression down by 1%, but may be needed to decompress files >=32K
+;      DEFINE  BackwardDecompression                   ; decompress data compressed backwards, -10 bytes, speeds decompression up by 3%
+;      DEFINE  HD64180                                 ; -2 bytes for HD64180/Z180 support, slows decompression down by 1%
+
+; Direction-dependent primitives. With BackwardDecompression undefined the
+; pointers move upwards (inc/ldi/ldir); with it defined they move downwards
+; (dec/ldd/lddr).
+;   NEXT_HL  steps HL by one byte in the decompression direction
+;   COPY_1   copies one byte from (HL) to (DE) and steps both (ldi/ldd also decrement BC)
+;   COPY_BC  copies BC bytes from (HL) to (DE)
+        IFNDEF  BackwardDecompression
+
+                MACRO   NEXT_HL
+                inc hl
+                ENDM
+
+                MACRO   COPY_1
+                ldi
+                ENDM
+
+                MACRO   COPY_BC
+                ldir
+                ENDM
+
+        ELSE
+
+                MACRO   NEXT_HL
+                dec hl
+                ENDM
+
+                MACRO   COPY_1
+                ldd
+                ENDM
+
+                MACRO   COPY_BC
+                lddr
+                ENDM
+
+        ENDIF
+
+; RELOAD_A fetches the next byte of the bit stream into A and rotates it left
+; through carry: the incoming carry becomes bit 0 and bit 7 leaves in carry.
+                MACRO   RELOAD_A
+                ld a,(hl) : NEXT_HL : rla
+                ENDM
+
+; Entry point. In: HL = compressed data, DE = destination (see the usage notes
+; in the file header). The first byte is always a literal and is copied here;
+; SCF makes the RELOAD_A that follows insert a 1 into bit 0 of the bit queue,
+; which is the marker the "jr z"/"call z" tests below use to detect an empty queue.
+; NOTE(review): the header's usage example calls "DecompressApLib", but the
+; label defined here is "Decompress" - confirm which name callers use.
+@Decompress:            COPY_1 : scf
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+; The LWM flag is not stored anywhere: it is implied by which copy of the main
+; loop is running (LWM0 after a literal or a one-byte write, LWM1 after a match).
+LWM0:                   ;LWM = 0 (LWM stands for "Last Was Match"; a flag that we did not have a match)
+
+.ReloadByteC0           RELOAD_A : jr c,.Check2ndBit
+
+;
+;  case "0"+BYTE: copy a single literal
+
+.CASE0:                 COPY_1                                          ; copy one literal byte from src to dest
+
+;
+;  main decompressor loop
+
+.MainLoop:              add a : jr nc,.CASE0 : jr z,.ReloadByteC0       ; "0"+BYTE = copy literal
+.Check2ndBit            add a : jr nc,.CASE10 : jr z,.ReloadByteC1      ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+.Check3rdBit            add a : call z,ReloadByte : jp c,LWM1.CASE111   ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+;  branch "110"+[oooooool]: copy two or three bytes (bit "l") with the offset -1..-127 (bits "ooooooo"), or stop
+
+.CASE110:               ; "use 7 bit offset, length = 2 or 3"
+                        ; "if a zero is found here, it's EOF"
+                        ld c,(hl) : rr c : ret z                        ; process EOF
+                        NEXT_HL
+                        ld b,0
+
+                        IFNDEF  HD64180
+                        ld ixl,c : ld ixh,b                             ; save offset for future LWMs
+                        ELSE
+                        push bc : pop ix                                ; save offset for future LWMs without IXL/IXH accesses
+                        ENDIF
+
+                        push hl                                         ; save src
+                        ld h,d : ld l,e                                 ; HL = dest
+                        jr c,.LengthIs3
+
+.LengthIs2
+                        IFNDEF  BackwardDecompression
+                        sbc hl,bc                                       ; HL = dest-offset (carry is clear: "jr c" above was not taken)
+                        ELSE
+                        add hl,bc                                       ; HL = dest+offset
+                        ENDIF
+                        COPY_1 : COPY_1
+                        jr .PreMainLoop
+
+.LengthIs3
+                        IFNDEF  BackwardDecompression
+                        or a : sbc hl,bc                                ; clear carry first, then HL = dest-offset
+                        ELSE
+                        add hl,bc                                       ; HL = dest+offset
+                        ENDIF
+                        COPY_1 : COPY_1 : COPY_1
+                        jr .PreMainLoop
+
+.ReloadByteC1           RELOAD_A : jr c,.Check3rdBit
+
+;
+;  branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+.CASE10:                ; "use a gamma code * 256 for offset, another gamma code for length"
+                        call GetGammaCoded
+
+                        ; the original decompressor contains
+                        ;
+                        ; if ((LWM == 0) && (offs == 2)) { ... }
+                        ; else {
+                        ;       if (LWM == 0) { offs -= 3; }
+                        ;       else { offs -= 2; }
+                        ; }
+                        ;
+                        ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+                        ; and to split the first condition by noticing that C-1 can never be zero
+                        dec c : dec c : jr z,LWM1.KickInLWM
+
+.AfterLWM               dec c : ld b,c : ld c,(hl) : NEXT_HL    ; BC = offset
+
+                        IFNDEF  HD64180
+                        ld ixl,c : ld ixh,b : push bc
+                        ELSE
+                        push bc : push bc : pop ix
+                        ENDIF
+
+                        call GetGammaCoded                      ; BC = len*
+
+                        ex (sp),hl                              ; HL = offset, (SP) = src
+
+                        ; interpretation of length value is offset-dependent:
+                        ; if (offs >= 32000) len++; if (offs >= 1280) len++; if (offs < 128) len+=2;
+                        ; in other words,
+                        ; (1 <= offs < 128) +=2
+                        ; (128 <= offs < 1280) +=0
+                        ; (1280 <= offs < 32000) +=1
+                        ; (32000 <= offs) +=2, applied only when SupportLongOffsets is defined
+                        ; NB offsets of 32000 and above need one more check, which other Z80 decompressors seem to ignore.
+
+                        ; interpretation of length value is offset-dependent
+                        exa : ld a,h
+                        IFDEF   SupportLongOffsets
+                        ; NB offsets over 32000 require an additional check, which is skipped in most
+                        ; Z80 decompressors (seemingly as a performance optimization)
+                        cp 32000/256 : jr nc,.Add2
+                        ENDIF
+                        cp 5 : jr nc,.Add1
+                        or a : jr nz,.Add0
+                        bit 7,l : jr nz,.Add0
+.Add2                   inc bc
+.Add1                   inc bc
+.Add0                   ; for offs<128 : 4+4+7+7 + 4+7 + 8+7 + 6+6 = 60t
+                        ; for offs>=1280 : 4+4+7+12 + 6 = 33t
+                        ; for 128<=offs<1280 : 4+4+7+7 + 4+12 = 38t OR 4+4+7+7 + 4+7+8+12 = 53t
+
+.CopyMatch:             ; this assumes that BC = len, DE = dest, HL = offset
+                        ; and also that (SP) = src, while having NC
+                        IFNDEF  BackwardDecompression
+                        ld a,e : sub l : ld l,a
+                        ld a,d : sbc h
+                        ld h,a : exa                            ; HL = dest-offset, then swap the bit queue back into A
+                        ELSE
+                        exa
+.CopyMatchLDH           add hl,de                               ; HL = dest+offset
+                        ENDIF
+                        COPY_1 : COPY_BC
+.PreMainLoop            pop hl                                  ; recover src
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+LWM1:                   ; LWM = 1
+
+;
+;  main decompressor loop
+
+.MainLoop:              add a : jr nc,LWM0.CASE0 : jr z,.ReloadByteC0           ; "0"+BYTE = copy literal
+.Check2ndBit            add a : jr nc,.CASE10 : jr z,.ReloadByteC1              ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+.Check3rdBit            add a : call z,ReloadByte : jr nc,LWM0.CASE110          ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+;  case "111"+"oooo": copy a byte with offset -1..-15, or write zero to dest
+
+.CASE111:               ld bc,%11100000
+                        DUP 4
+                        add a : call z,ReloadByte : rl c        ; read short offset (4 bits)
+                        EDUP
+                        ex de,hl : jr z,.WriteZero              ; zero offset means "write zero" (NB: B is zero here)
+
+                        ; "write a previous byte (1-15 away from dest)"
+                        push hl                                 ; BC = offset, DE = src, HL = dest
+                        IFNDEF  BackwardDecompression
+                        sbc hl,bc                               ; HL = dest-offset (SBC works because branching above ensured NC)
+                        ELSE
+                        add hl,bc                               ; HL = dest+offset
+                        ENDIF
+                        ld c,(hl)
+                        pop hl
+
+.WriteZero              ld (hl),c : NEXT_HL
+                        ex de,hl : jp LWM0.MainLoop             ; 10+4*(4+10+8)+4+7 + 11+15+7+10 + 7+4+6+10 = 179t
+
+.ReloadByteC0           RELOAD_A : jp nc,LWM0.CASE0
+                        jr .Check2ndBit
+
+.ReloadByteC1           RELOAD_A : jr c,.Check3rdBit
+
+;
+;  branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+.CASE10:                ; "use a gamma code * 256 for offset, another gamma code for length"
+                        call GetGammaCoded
+
+                        ; the original decompressor contains
+                        ;
+                        ; if ((LWM == 0) && (offs == 2)) { ... }
+                        ; else {
+                        ;       if (LWM == 0) { offs -= 3; }
+                        ;       else { offs -= 2; }
+                        ; }
+                        ;
+                        ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+                        ; and to split the first condition by noticing that C-1 can never be zero
+                        dec c : jr LWM0.AfterLWM                ; LWM = 1: only 2 is subtracted in total (one here, one at .AfterLWM)
+
+;
+;  the re-use of the previous offset (LWM magic)
+
+.KickInLWM:             ; "and a new gamma code for length"
+                        inc c : call GetGammaCoded.ReadGamma    ; BC = len
+
+                        IFNDEF  BackwardDecompression
+                        push ix : ex (sp),hl : exa              ; HL = previous offset, (SP) = src
+                        jr LWM0.CopyMatch
+                        ELSE
+                        push ix : ex (sp),hl                    ; HL = previous offset, (SP) = src
+                        jr LWM0.CopyMatchLDH
+                        ENDIF
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+;
+;  interlaced gamma code reader
+;  x0 -> 1x
+;  x1y0 -> 1xy
+;  x1y1z0 -> 1xyz etc
+;  (technically, this is a 2-based variation of Exp-Golomb-1)
+
+GetGammaCoded:          ld bc,1
+.ReadGamma              add a : jr z,.ReloadByteRG1
+                        rl c : rl b
+                        add a : ret nc                          ; NB: flag NC immediately says we do not need to reload our byte...
+                        jr nz,.ReadGamma                        ; ...even better, flag NZ then automatically means flag C :)
+
+.ReloadByteRG2          RELOAD_A : ret nc : jr .ReadGamma
+
+.ReloadByteRG1          RELOAD_A : rl c : rl b
+                        add a : ret nc : jr .ReadGamma
+
+;
+;  pretty usual getbit for mixed datastreams
+
+ReloadByte:             RELOAD_A : ret
+
diff --git a/tools/apultra/asm/Z80/unaplib_small.asm b/tools/apultra/asm/Z80/unaplib_small.asm
new file mode 100644
index 0000000..280de15
--- /dev/null
+++ b/tools/apultra/asm/Z80/unaplib_small.asm
@@ -0,0 +1,258 @@
+;
+;  Size-optimized ApLib decompressor by spke & uniabis (ver.04 01-07/06/2020, 139 bytes)
+;
+;  The original Z80 decompressor for ApLib was written by Dan Weiss (Dwedit),
+;  then tweaked by Francisco Javier Pena Pareja (utopian),
+;  and optimized by Jaime Tejedor Gomez (Metalbrain).
+;
+;  This version was heavily re-optimized for size by spke.
+;  (It is 17 bytes shorter and 22% faster than the 156b version by Metalbrain.)
+;
+;  ver.00 by spke (21/08/2018-01/09/2018, 141 bytes);
+;  ver.01 by spke (spring 2019, 140(-1) bytes, slightly faster);
+;  ver.02 by spke (05-07/01/2020, added full revision history, support for long offsets
+;                  and an option to use self-modifying code instead of IY)
+;  ver.03 by spke (18-29/05/2020, +0.5% speed, added support for backward compression)
+;  ver.04 by uniabis (01-07/06/2020, 139(-1) bytes, +1% speed, added support for HD64180)
+;
+;  The data must be compressed using any compressor for ApLib capable of generating raw data.
+;  At present, two best available compressors are:
+;
+;  "APC" by Sven-Ake Dahl: https://github.com/svendahl/cap or
+;  "apultra" by Emmanuel Marty: https://github.com/emmanuel-marty/apultra
+;
+;  The compression can be done as follows:
+;
+;  apc.exe e
+;  or
+;  apultra.exe
+;
+;  A decent compressor was written by r57shell (although it is worse than compressors above):
+;  http://gendev.spritesmind.net/forum/viewtopic.php?p=32548#p32548
+;  The use of the official ApLib compressor by Joergen Ibsen is not recommended.
+; +; The decompression is done in the standard way: +; +; ld hl,FirstByteOfCompressedData +; ld de,FirstByteOfMemoryForDecompressedData +; call DecompressApLib +; +; Backward decompression is also supported; you can compress files backward using: +; +; apultra.exe -b +; +; uncomment option "DEFINE BackwardDecompression" and decompress the resulting files using: +; +; ld hl,LastByteOfCompressedData +; ld de,LastByteOfMemoryForDecompressedData +; call DecompressApLib +; +; The decompressor modifies AF, AF', BC, DE, HL, IX. +; +; Of course, ApLib compression algorithms are (c) 1998-2014 Joergen Ibsen, +; see http://www.ibsensoftware.com/ for more information +; +; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com +; +; This software is provided 'as-is', without any express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. + +; DEFINE FasterGetBit ; 16% speed-up at the cost of extra 4 bytes +; DEFINE SupportLongOffsets ; +4 bytes for long offset support. 
slows decompression down by 1%, but may be needed to decompress files >=32K
+;      DEFINE  BackwardDecompression                           ; decompress data compressed backwards, -5 bytes, speeds decompression up by 3%
+
+
+; GET_BIT moves the next bit of the bit stream into the carry flag.
+; The FasterGetBit variant shifts the queue inline and only calls out when the
+; queue is empty; the default variant always calls GetOneBit.
+        IFDEF FasterGetBit
+                MACRO   GET_BIT
+                add a : call z,ReloadByte
+                ENDM
+        ELSE
+                MACRO   GET_BIT
+                call GetOneBit
+                ENDM
+        ENDIF
+
+; Direction-dependent primitives: upwards (inc/ldi/ldir) by default,
+; downwards (dec/ldd/lddr) when BackwardDecompression is defined.
+        IFNDEF  BackwardDecompression
+
+                MACRO   NEXT_HL
+                inc hl
+                ENDM
+
+                MACRO   COPY_1
+                ldi
+                ENDM
+
+                MACRO   COPY_BC
+                ldir
+                ENDM
+
+        ELSE
+
+                MACRO   NEXT_HL
+                dec hl
+                ENDM
+
+                MACRO   COPY_1
+                ldd
+                ENDM
+
+                MACRO   COPY_BC
+                lddr
+                ENDM
+
+        ENDIF
+
+; Entry point. In: HL = compressed data, DE = destination (see the usage notes
+; in the file header). A is the bit queue; 128 holds only the end marker, so
+; the first GET_BIT shifts it out and reloads A from the stream.
+@DecompressApLib:       ld a,128
+
+;
+;  case "0"+BYTE: copy a single literal
+
+CASE0:                  COPY_1                                  ; first byte is always copied as literal
+ResetLWM:               ld b,-1                                 ; LWM = 0 (LWM stands for "Last Was Match"; a flag that we did not have a match)
+
+;
+;  main decompressor loop
+
+MainLoop:               GET_BIT : jr nc,CASE0                   ; "0"+BYTE = copy literal
+                        GET_BIT : jr nc,CASE10                  ; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+
+                        ld bc,%11100000                         ; B = 0; the three set bits of C end the ReadFourBits loop after 4 reads
+                        GET_BIT : jr nc,CASE110                 ; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+;  case "111"+"oooo": copy a byte with offset -1..-15, or write zero to dest
+
+CASE111:
+ReadFourBits            GET_BIT                                 ; read short offset (4 bits)
+                        rl c : jr c,ReadFourBits
+                        ex de,hl : jr z,WriteZero               ; zero offset means "write zero" (NB: B is zero here)
+
+                        ; "write a previous byte (1-15 away from dest)"
+                        push hl                                 ; BC = offset, DE = src, HL = dest
+                        IFNDEF  BackwardDecompression
+                        sbc hl,bc                               ; HL = dest-offset (SBC works because branching above ensured NC)
+                        ELSE
+                        add hl,bc                               ; HL = dest+offset (ADD does not depend on the carry flag)
+                        ENDIF
+                        ld c,(hl) : pop hl
+
+WriteZero               ld (hl),c : NEXT_HL
+                        ex de,hl : jr ResetLWM                  ; write one byte, reset LWM
+
+;
+;  branch "110"+[oooooool]: copy two or three bytes (bit "l") with the offset -1..-127 (bits "ooooooo"), or stop
+
+CASE110:                ; "use 7 bit offset, length = 2 or 3"
+                        ; "if a zero is found here, it's EOF"
+                        ld c,(hl) : rr c : ret z                ; process EOF
+                        NEXT_HL
+
+                        push hl                                 ; save src
+                        ld h,b : ld l,c                         ; HL = offset
+
+                        ; flag NC means len=2, flag C means len=3
+                        ld c,1 : rl c : jr SaveLWMOffset
+
+;
+;  branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+CASE10:                 ; save state of LWM into A'
+                        exa : ld a,b : exa
+
+                        ; "use a gamma code * 256 for offset, another gamma code for length"
+                        call GetGammaCoded
+
+                        ; the original decompressor contains
+                        ;
+                        ; if ((LWM == 0) && (offs == 2)) { ... }
+                        ; else {
+                        ;       if (LWM == 0) { offs -= 3; }
+                        ;       else { offs -= 2; }
+                        ; }
+                        ;
+                        ; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+                        ; and to split the first condition by noticing that C-1 can never be zero
+                        exa : add c : ld c,a : exa              ; C += saved B (B is -1 when reached via ResetLWM)
+
+                        ; "if gamma code is 2, use old r0 offset"
+                        dec c : jr z,KickInLWM
+                        dec c
+                        ld b,c : ld c,(hl) : NEXT_HL            ; BC = offset
+
+                        push bc                                 ; (SP) = offset
+                        call GetGammaCoded                      ; BC = len*
+                        ex (sp),hl                              ; HL = offset, (SP) = src
+
+                        ; interpretation of length value is offset-dependent
+                        exa : ld a,h
+                        IFDEF   SupportLongOffsets
+                        ; NB offsets over 32000 require an additional check, which is skipped in most
+                        ; Z80 decompressors (seemingly as a performance optimization)
+                        cp 32000/256 : jr nc,.Add2
+                        ENDIF
+                        cp 5 : jr nc,.Add1                      ; offset >= 1280: len += 1
+                        or a : jr nz,.Add0                      ; 256 <= offset < 1280: len unchanged
+                        bit 7,l : jr nz,.Add0                   ; 128 <= offset < 256: len unchanged
+.Add2                   inc bc                                  ; offset < 128 (or >= 32000 with SupportLongOffsets): len += 2
+.Add1                   inc bc
+.Add0                   exa
+
+SaveLWMOffset:
+                        push hl : pop ix                        ; save offset for future LWMs
+
+CopyMatch:              ; this assumes that BC = len, DE = dest, HL = offset
+                        ; and also that (SP) = src, while having NC
+                        IFNDEF  BackwardDecompression
+                        push de
+                        ex de,hl : sbc hl,de                    ; HL = dest-offset
+                        pop de                                  ; DE = dest
+                        ELSE
+                        add hl,de                               ; HL = dest+offset
+                        ENDIF
+
+                        COPY_BC
+                        pop hl                                  ; recover src
+                        jr MainLoop
+
+;
+;  the re-use of the previous offset (LWM magic)
+
+KickInLWM:              ; "and a new gamma code for length"
+                        call GetGammaCoded                      ; BC = len
+                        push ix : ex (sp),hl                    ; DE = dest, HL = prev offset
+                        jr CopyMatch
+
+;
+;  interlaced gamma code reader
+;  x0 -> 1x
+;  x1y0 -> 1xy
+;  x1y1z0 -> 1xyz etc
+;  (technically, this is a 2-based variation of Exp-Golomb-1)
+
+GetGammaCoded:          ld bc,1
+ReadGamma               GET_BIT : rl c : rl b                   ; shift the next data bit into BC
+                        GET_BIT : ret nc                        ; continuation bit: 0 ends the code
+                        jr ReadGamma
+
+;
+;  pretty usual getbit for mixed datastreams
+
+        IFNDEF FasterGetBit
+GetOneBit:              add a : ret nz                          ; NZ: bits remain in the queue, carry holds the bit
+        ENDIF
+ReloadByte:             ld a,(hl) : NEXT_HL                     ; queue empty: fetch the next stream byte
+                        rla : ret                               ; carry in becomes the new end marker, bit 7 goes to carry
+
diff --git a/tools/apultra/asm/x86/aplib_x86_fast.asm b/tools/apultra/asm/x86/aplib_x86_fast.asm
new file mode 100644
index 0000000..9e41d31
--- /dev/null
+++ b/tools/apultra/asm/x86/aplib_x86_fast.asm
@@ -0,0 +1,180 @@
+; aplib_x86_fast.asm - speed-optimized aPLib decompressor for x86 - 188 bytes
+;
+; Copyright (C) 2019 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+;    claim that you wrote the original software. If you use this software
+;    in a product, an acknowledgment in the product documentation would be
+;    appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+;    misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+        segment .text
+        bits 32
+
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * esi: compressed aPLib data
+; * edi: output buffer
+; output:
+; * eax: decompressed size
+;
+; When CDECL is defined the two inputs are read from the stack instead
+; (first argument = compressed data, second argument = output buffer).
+; Every register except eax is restored on return (pushad/popad).
+; ---------------------------------------------------------------------------
+        %ifndef BIN
+          global apl_decompress
+          global _apl_decompress
+        %endif
+
+        ; uint32_t apl_decompress(const void *Source, void *Destination);
+
+%macro apl_get_bit 0            ; read bit into carry
+        add al,al               ; shift bit queue, and high bit into carry
+        jnz %%gotbit            ; queue not empty, bits remain
+        lodsb                   ; read 8 new bits
+        adc al,al               ; shift bit queue, and high bit into carry
+%%gotbit:
+%endmacro
+
+apl_decompress:
+_apl_decompress:
+        pushad                  ; saved registers: [esp]=edi, [esp+4]=esi, ..., [esp+28]=eax
+
+        %ifdef CDECL
+          mov esi, [esp+32+4]   ; esi = aPLib compressed data
+          mov edi, [esp+32+8]   ; edi = output
+        %endif
+
+        ; === register map ===
+        ; al: bit queue
+        ; ah: unused, but value is trashed
+        ; ebx: follows_literal (3 after a literal, 2 after a match)
+        ; ecx: scratch register for reading gamma2 codes and storing copy length
+        ; edx: match offset (and rep-offset)
+        ; esi: input (compressed data) pointer
+        ; edi: output (decompressed data) pointer
+
+        mov al,080H             ; clear bit queue(al) and set high bit to move into carry
+        xor edx, edx            ; invalidate rep offset
+.literal:
+        movsb                   ; read and write literal byte
+.next_command_after_literal:
+        mov ebx,03H             ; set follows_literal(ebx) to 3
+
+.next_command:
+        apl_get_bit             ; read 'literal or match' bit
+        jnc .literal            ; if 0: literal
+
+                                ; 1x: match
+
+        apl_get_bit             ; read '8+n bits or other type' bit
+        jc .other               ; 11x: other type of match
+
+                                ; 10: 8+n bits match
+        call .get_gamma2        ; read gamma2-coded high offset bits
+        sub ecx,ebx             ; high offset bits == 2 when follows_literal == 3 ?
+                                ; (a gamma2 value is always >= 2, so the subtraction borrows only for
+                                ; the value 2 with follows_literal == 3; when follows_literal is 2 it
+                                ; can never borrow, so a rep-match never directly follows a match)
+        jae .not_repmatch       ; no borrow: not a rep-match
+
+        call .get_gamma2        ; read match length
+        jmp .got_len            ; go copy, re-using the offset still held in edx
+
+.not_repmatch:
+        mov edx,ecx             ; transfer high offset bits to dh
+        shl edx, 8
+        mov dl,[esi]            ; read low offset byte in dl
+        inc esi
+
+        call .get_gamma2        ; read match length
+        cmp edx,07D00H          ; offset >= 32000 ?
+        jae .increase_len_by2   ; if so, increase match len by 2
+        cmp edx,0500H           ; offset >= 1280 ?
+        jae .increase_len_by1   ; if so, increase match len by 1
+        cmp edx,0080H           ; offset >= 128 ?
+        jae .got_len            ; if so, length is used as-is; below 128 it is increased by 2
+                                ; (otherwise it would be a 7+1 copy)
+.increase_len_by2:
+        inc ecx                 ; increase length
+.increase_len_by1:
+        inc ecx                 ; increase length
+
+        ; copy ecx bytes from match offset edx
+
+.got_len:
+        push esi                ; save compressed data pointer
+        mov esi,edi             ; point to destination in edi - offset in edx
+        sub esi,edx
+        rep movsb               ; copy matched bytes
+        pop esi                 ; restore compressed data pointer
+        mov bl,02H              ; set follows_literal to 2 (ebx is unmodified by match commands)
+        jmp .next_command
+
+        ; read gamma2-coded value into ecx
+
+.get_gamma2:
+        xor ecx,ecx             ; initialize to 1 so that value will start at 2
+        inc ecx                 ; when shifted left in the adc below
+
+.gamma2_loop:
+        apl_get_bit             ; read data bit
+        adc ecx,ecx             ; shift into ecx
+        apl_get_bit             ; read continuation bit
+        jc .gamma2_loop         ; loop until a zero continuation bit is read
+
+        ret
+
+        ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+
+.other:
+        xor ecx,ecx
+        apl_get_bit             ; read '7+1 match or short literal' bit
+        jc .short_literal       ; 111: 4 bit offset for 1-byte copy
+
+                                ; 110: 7 bits offset + 1 bit length
+
+        movzx edx,byte[esi]     ; read offset + length in dl
+        inc esi
+
+        inc ecx                 ; prepare ecx for length below
+        shr dl,1                ; shift len bit into carry, and offset in place
+        je .done                ; if zero offset: EOD
+        adc ecx,ecx             ; len in ecx: 1*2 + carry bit = 2 or 3
+        jmp .got_len
+
+        ; 4 bits offset / 1 byte copy
+
+.short_literal:
+        apl_get_bit             ; read 4 offset bits
+        adc ecx,ecx
+        apl_get_bit
+        adc ecx,ecx
+        apl_get_bit
+        adc ecx,ecx
+        apl_get_bit
+        adc ecx,ecx             ; ZF is set here when all four offset bits are zero
+        xchg eax,ecx            ; preserve bit queue in ecx, put offset in eax (flags unchanged)
+        jz .write_zero          ; if offset is 0, write a zero byte
+
+                                ; short offset 1-15
+        mov ebx,edi             ; point to destination in edi - offset in eax
+        sub ebx,eax             ; we trash ebx, it will be reset to 3 when we loop
+        mov al,[ebx]            ; read byte from short offset
+.write_zero:
+        stosb                   ; copy matched byte
+        mov eax,ecx             ; restore bit queue in al
+        jmp .next_command_after_literal
+
+.done:
+        ; compute decompressed size = final output pointer - start of output buffer
+        %ifdef CDECL
+          sub edi, [esp+32+8]   ; start of output = Destination argument on the caller's stack
+        %else
+          sub edi, [esp]        ; start of output = caller's edi, saved by pushad at [esp]
+        %endif
+        mov [esp+28], edi       ; store the size in the saved-eax slot so popad returns it in eax
+        popad
+        ret
diff --git a/tools/apultra/asm/x86/aplib_x86_small.asm b/tools/apultra/asm/x86/aplib_x86_small.asm
new file mode 100644
index 0000000..ada00f6
--- /dev/null
+++ b/tools/apultra/asm/x86/aplib_x86_small.asm
@@ -0,0 +1,159 @@
+; aplib_x86_small.asm - size-optimized aPLib decompressor for x86 - 185 bytes
+;
+; Copyright (C) 2019 Emmanuel Marty
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+;    claim that you wrote the original software. If you use this software
+;    in a product, an acknowledgment in the product documentation would be
+;    appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+;    misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+        segment .text
+        bits 32
+; ---------------------------------------------------------------------------
+; Decompress aPLib data
+; inputs:
+; * esi: compressed aPLib data
+; * edi: output buffer
+; output:
+; * eax: decompressed size
+;
+; When CDECL is defined the two inputs are read from the stack instead
+; (first argument = compressed data, second argument = output buffer).
+; Every register except eax is restored on return (pushad/popad).
+; ---------------------------------------------------------------------------
+        %ifndef BIN
+          global apl_decompress
+          global _apl_decompress
+        %endif
+
+apl_decompress:
+_apl_decompress:
+        pushad                  ; saved registers: [esp]=edi, [esp+4]=esi, ..., [esp+28]=eax
+
+        %ifdef CDECL
+          mov esi, [esp+32+4]   ; esi = aPLib compressed data
+          mov edi, [esp+32+8]   ; edi = output
+        %endif
+
+        ; === register map ===
+        ; al: bit queue
+        ; ah: unused, but value is trashed
+        ; ebx: follows_literal (3 after a literal, 2 after a match)
+        ; ecx: scratch register for reading gamma2 codes and storing copy length
+        ; edx: match offset (and rep-offset)
+        ; esi: input (compressed data) pointer
+        ; edi: output (decompressed data) pointer
+        ; ebp: offset of .get_bit
+
+        mov al,080H             ; clear bit queue(al) and set high bit to move into carry
+        xor edx, edx            ; invalidate rep offset in edx
+
+        call .init_get_bit      ; pushes the address of .get_dibits, popped into ebp below
+.get_dibits:
+        call ebp                ; read data bit
+        adc ecx,ecx             ; shift into ecx
+                                ; fall through: read one more bit, left in carry for the caller
+.get_bit:
+        add al,al               ; shift bit queue, and high bit into carry
+        jnz .got_bit            ; queue not empty, bits remain
+        lodsb                   ; read 8 new bits
+        adc al,al               ; shift bit queue, and high bit into carry
+.got_bit:
+        ret
+.init_get_bit:
+        pop ebp                 ; load offset of .get_bit, to be used with call ebp
+        add ebp, .get_bit - .get_dibits
+.literal:
+        movsb                   ; read and write literal byte
+.next_command_after_literal:
+        push 03H
+        pop ebx                 ; set follows_literal(ebx) to 3
+
+.next_command:
+        call ebp                ; read 'literal or match' bit
+        jnc .literal            ; if 0: literal
+
+                                ; 1x: match
+        call ebp                ; read '8+n bits or other type' bit
+        jc .other               ; 11x: other type of match
+                                ; 10: 8+n bits match
+        call .get_gamma2        ; read gamma2-coded high offset bits
+        sub ecx,ebx             ; high offset bits == 2 when follows_literal == 3 ?
+                                ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+                                ; is == 2 will never result in a negative value)
+        jae .not_repmatch       ; no borrow: not a rep-match
+        call .get_gamma2        ; read match length
+        jmp .got_len            ; go copy, re-using the offset still held in edx
+.not_repmatch:
+        mov edx,ecx             ; transfer high offset bits to dh
+        shl edx,8
+        mov dl,[esi]            ; read low offset byte in dl
+        inc esi
+        call .get_gamma2        ; read match length
+        cmp edx,7D00H           ; offset >= 32000 ?
+        jae .increase_len_by2   ; if so, increase match len by 2
+        cmp edx,0500H           ; offset >= 1280 ?
+        jae .increase_len_by1   ; if so, increase match len by 1
+        cmp edx,0080H           ; offset >= 128 ?
+        jae .got_len            ; if so, length is used as-is; below 128 it is increased by 2
+                                ; (otherwise it would be a 7+1 copy)
+.increase_len_by2:
+        inc ecx                 ; increase length
+.increase_len_by1:
+        inc ecx                 ; increase length
+        ; copy ecx bytes from match offset edx
+.got_len:
+        push esi                ; save esi (current pointer to compressed data)
+        mov esi,edi             ; point to destination in edi - offset in edx
+        sub esi,edx
+        rep movsb               ; copy matched bytes
+        pop esi                 ; restore esi
+        mov bl,02H              ; set follows_literal to 2 (ebx is unmodified by match commands)
+        jmp .next_command
+        ; read gamma2-coded value into ecx
+.get_gamma2:
+        xor ecx,ecx             ; initialize to 1 so that value will start at 2
+        inc ecx                 ; when shifted left in the adc below
+.gamma2_loop:
+        call .get_dibits        ; read data bit, shift into ecx, read continuation bit
+        jc .gamma2_loop         ; loop until a zero continuation bit is read
+        ret
+        ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+.other:
+        xor ecx,ecx
+        call ebp                ; read '7+1 match or short literal' bit
+        jc .short_literal       ; 111: 4 bit offset for 1-byte copy
+                                ; 110: 7 bits offset + 1 bit length
+
+        movzx edx,byte[esi]     ; read offset + length in dl
+        inc esi
+        inc ecx                 ; prepare ecx for length below
+        shr dl,1                ; shift len bit into carry, and offset in place
+        je .done                ; if zero offset: EOD
+        adc ecx,ecx             ; len in ecx: 1*2 + carry bit = 2 or 3
+        jmp .got_len
+        ; 4 bits offset / 1 byte copy
+.short_literal:
+        call .get_dibits        ; read 2 offset bits
+        adc ecx,ecx
+        call .get_dibits        ; read 2 offset bits
+        adc ecx,ecx             ; ZF is set here when all four offset bits are zero
+        xchg eax,ecx            ; preserve bit queue in ecx, put offset in eax (flags unchanged)
+        jz .write_zero          ; if offset is 0, write a zero byte
+                                ; short offset 1-15
+        mov ebx,edi             ; point to destination in edi - offset in eax
+        sub ebx,eax             ; we trash ebx, it will be reset to 3 when we loop
+        mov al,[ebx]            ; read byte from short offset
+.write_zero:
+        stosb                   ; copy matched byte
+        xchg eax,ecx            ; restore bit queue in al
+        jmp .next_command_after_literal
+.done:
+        ; compute decompressed size = final output pointer - start of output buffer
+        %ifdef CDECL
+          sub edi, [esp+32+8]   ; start of output = Destination argument on the caller's stack
+        %else
+          sub edi, [esp]        ; start of output = caller's edi, saved by pushad at [esp]
+        %endif
+        mov [esp+28], edi       ; store the size in the saved-eax slot so popad returns it in eax
+        popad
+        ret
--
cgit v1.2.3