author     Juan J. Martinez <jjm@usebox.net>   2023-11-05 11:22:55 +0000
committer  Juan J. Martinez <jjm@usebox.net>   2023-11-05 11:31:28 +0000
commit     2fbdf974338bde8576efdae40a819a76b2391033 (patch)
tree       64d41a37470143f142344f9a439d96de3e7918c2 /tools/apultra_src/asm
Initial import of the open source release
Diffstat (limited to 'tools/apultra_src/asm')
-rw-r--r--   tools/apultra_src/asm/6502/aplib_6502.asm    371
-rw-r--r--   tools/apultra_src/asm/ARM7TDMI/aplib_arm.s   150
-rw-r--r--   tools/apultra_src/asm/Z80/aplib_z80.asm      190
3 files changed, 711 insertions, 0 deletions
diff --git a/tools/apultra_src/asm/6502/aplib_6502.asm b/tools/apultra_src/asm/6502/aplib_6502.asm
new file mode 100644
index 0000000..219ac86
--- /dev/null
+++ b/tools/apultra_src/asm/6502/aplib_6502.asm
@@ -0,0 +1,371 @@
+; ***************************************************************************
+; ***************************************************************************
+;
+; aplib_6502.s
+;
+; NMOS 6502 decompressor for data stored in Jorgen Ibsen's aPLib format.
+;
+; Includes support for Emmanuel Marty's enhancements to the aPLib format.
+;
+; The code is 252 bytes long for standard format, 270 for enhanced format.
+;
+; This code is written for the ACME assembler.
+;
+; Copyright John Brandwood 2019.
+;
+; Distributed under the Boost Software License, Version 1.0.
+; (See accompanying file LICENSE_1_0.txt or copy at
+;  http://www.boost.org/LICENSE_1_0.txt)
+;
+; ***************************************************************************
+; ***************************************************************************
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; Decompression Options & Macros
+;
+
+        ;
+        ; Use the enhanced format from Emmanuel Marty's APULTRA?
+        ;
+        ; The enhancements speed up decompression on an 8-bit CPU.
+        ;
+        ; This gives an 11% improvement in decompression speed, but
+        ; breaks compatibility with standard aPLib encoders.
+        ;
+
+APL_ENHANCED    =       0
+
+        ;
+        ; Assume that we're decompressing from a large multi-bank
+        ; compressed data file, and that the next bank may need to
+        ; be paged in when a page-boundary is crossed.
+        ;
+
+APL_FROM_BANK   =       0
+
+        ;
+        ; Macro to increment the source pointer to the next page.
+        ;
+
+        !if APL_FROM_BANK {
+        !macro APL_INC_PAGE {
+                jsr     .next_page
+        }
+        } else {
+        !macro APL_INC_PAGE {
+                inc     <apl_srcptr + 1
+        }
+        }
+
+        ;
+        ; Macro to read a byte from the compressed source data.
+        ;
+
+        !macro APL_GET_SRC {
+                lda     (apl_srcptr),y
+                inc     <apl_srcptr + 0
+                bne     .skip
+                +APL_INC_PAGE
+.skip:
+        }
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; Data usage is last 12 bytes of zero-page.
+;
+
+        !if APL_ENHANCED {
+apl_nibflg      =       $F4             ; 1 byte.
+apl_nibble      =       $F5             ; 1 byte.
+apl_egamma      =       $F6             ; 1 byte.
+        }
+
+apl_bitbuf      =       $F7             ; 1 byte.
+apl_offset      =       $F8             ; 1 word.
+apl_winptr      =       $FA             ; 1 word.
+apl_srcptr      =       $FC             ; 1 word.
+apl_dstptr      =       $FE             ; 1 word.
+apl_length      =       apl_winptr
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; apl_decompress - Decompress data stored in Jorgen Ibsen's aPLib format.
+;
+; Args: apl_srcptr = ptr to compressed data
+; Args: apl_dstptr = ptr to output buffer
+; Uses: lots!
+;
+; If compiled with APL_FROM_BANK, then apl_srcptr should be within the bank
+; window range.
+;
+; As an optimization, the code to handle window offsets > 64768 bytes has
+; been removed, since these don't occur with a 16-bit address range.
+;
+; As an optimization, the code to handle window offsets > 32000 bytes can
+; be commented-out, since these don't occur in typical 8-bit computer usage.
+;
+
+apl_decompress: ldy     #0              ; Initialize source index.
+
+                lda     #$80            ; Initialize an empty
+                sta     <apl_bitbuf     ; bit-buffer.
+
+        !if APL_ENHANCED {
+                sta     <apl_egamma     ; Bit-buffer for gamma pairs.
+                sty     <apl_nibflg     ; Reset the flag.
+        }
+
+        ;
+        ; 0 bbbbbbbb - One byte from compressed data, i.e. a "literal".
+        ;
+
+.literal:       +APL_GET_SRC
+
+.write_byte:    ldx     #0              ; LWM=0.
+
+                sta     (apl_dstptr),y  ; Write the byte directly to
+                inc     <apl_dstptr + 0 ; the output.
+                bne     .next_tag
+                inc     <apl_dstptr + 1
+
+.next_tag:      asl     <apl_bitbuf     ; 0 bbbbbbbb
+                bne     .skip0
+                jsr     .load_bit
+.skip0:         bcc     .literal
+
+.skip1:         asl     <apl_bitbuf     ; 1 0 <offset> <length>
+                bne     .skip2
+                jsr     .load_bit
+.skip2:         bcc     .copy_large
+
+                asl     <apl_bitbuf     ; 1 1 0 dddddddn
+                bne     .skip3
+                jsr     .load_bit
+.skip3:         bcc     .copy_normal
+
+        ; 1 1 1 dddd - Copy 1 byte within 15 bytes (or zero).
+
+        !if APL_ENHANCED {
+
+.copy_short:    lsr     <apl_nibflg     ; Is there a nibble waiting?
+                lda     <apl_nibble     ; Extract the lo-nibble.
+                bcs     .skip4
+
+                inc     <apl_nibflg     ; Reset the flag.
+                +APL_GET_SRC
+                sta     <apl_nibble     ; Preserve for next time.
+                lsr                     ; Extract the hi-nibble.
+                lsr
+                lsr
+                lsr
+
+.skip4:         and     #$0F            ; Current nibble.
+                beq     .write_byte     ; Offset=0 means write zero.
+
+        } else {
+
+.copy_short:    lda     #$10
+.nibble_loop:   asl     <apl_bitbuf
+                bne     .skip4
+                pha
+                jsr     .load_bit
+                pla
+.skip4:         rol
+                bcc     .nibble_loop
+                beq     .write_byte     ; Offset=0 means write zero.
+
+        }
+
+                eor     #$FF            ; Read the byte directly from
+                tay                     ; the destination window.
+                iny
+                dec     <apl_dstptr + 1
+                lda     (apl_dstptr),y
+                inc     <apl_dstptr + 1
+                ldy     #0
+                beq     .write_byte
+
+        ;
+        ; 1 1 0 dddddddn - Copy 2 or 3 within 128 bytes.
+        ;
+
+.copy_normal:   +APL_GET_SRC            ; 1 1 0 dddddddn
+                lsr
+                beq     .finished       ; Offset 0 == EOF.
+
+                sta     <apl_offset + 0 ; Preserve offset.
+                sty     <apl_offset + 1
+                tya                     ; Y == 0.
+                tax                     ; Bits 8..15 of length.
+                adc     #2              ; Bits 0...7 of length.
+                bne     .do_match       ; NZ from previous ADC.
+
+        ;
+        ; Subroutines for byte & bit handling.
+        ;
+
+        !if APL_ENHANCED {
+
+.get_gamma:     lda     #1              ; Get a gamma-coded value.
+.gamma_loop:    asl     <apl_egamma
+                bne     .rotate_gamma
+                pha
+                +APL_GET_SRC            ; Reload an empty bit-buffer
+                rol                     ; from the compressed source.
+                sta     <apl_egamma
+                pla
+.rotate_gamma:  rol
+                bcs     .big_gamma      ; Got 8 bits, now read rest.
+                asl     <apl_egamma
+                bcc     .gamma_loop
+                rts                     ; Always returns CS.
+
+.big_gamma:     pha                     ; Read remaining bits of length
+                tya                     ; larger than 255. This is very
+                jsr     .rotate_gamma   ; rare, so it saves cycles on
+                tax                     ; the 6502 to do it this way.
+                pla
+
+.finished:      rts                     ; All decompressed!
+
+        } else {
+
+.get_gamma:     lda     #1              ; Get a gamma-coded value.
+.gamma_loop:    asl     <apl_bitbuf
+                bne     .skip5
+                pha
+                jsr     .load_bit
+                pla
+.skip5:         rol
+                rol     <apl_length + 1
+                asl     <apl_bitbuf
+                bne     .skip6
+                pha
+                jsr     .load_bit
+                pla
+.skip6:         bcs     .gamma_loop
+
+.finished:      rts                     ; All decompressed!
+
+        }
+
+        ;
+        ; 1 0 <offset> <length> - gamma-coded LZSS pair.
+        ;
+
+        !if APL_ENHANCED {
+
+.copy_large:    jsr     .get_gamma      ; Bits 8..15 of offset (min 2).
+
+                cpx     #1              ; CC if LWM==0, CS if LWM==1.
+                ldx     #0              ; Clear hi-byte of length.
+                sbc     #2              ; -3 if LWM==0, -2 if LWM==1.
+                bcs     .normal_pair    ; CC if LWM==0 && offset==2.
+
+                jsr     .get_gamma      ; Get length (A=lo-byte & CS).
+                bcs     .do_match       ; Use previous Offset.
+
+.normal_pair:   sta     <apl_offset + 1 ; Save bits 8..15 of offset.
+
+                +APL_GET_SRC
+                sta     <apl_offset + 0 ; Save bits 0...7 of offset.
+
+                jsr     .get_gamma      ; Get length (A=lo-byte & CS).
+
+        } else {
+
+.copy_large:    jsr     .get_gamma      ; Bits 8..15 of offset (min 2).
+                sty     <apl_length + 1 ; Clear hi-byte of length.
+
+                cpx     #1              ; CC if LWM==0, CS if LWM==1.
+                sbc     #2              ; -3 if LWM==0, -2 if LWM==1.
+                bcs     .normal_pair    ; CC if LWM==0 && offset==2.
+
+                jsr     .get_gamma      ; Get length (A=lo-byte & CC).
+                ldx     <apl_length + 1
+                bcc     .do_match       ; Use previous Offset.
+
+.normal_pair:   sta     <apl_offset + 1 ; Save bits 8..15 of offset.
+
+                +APL_GET_SRC
+                sta     <apl_offset + 0 ; Save bits 0...7 of offset.
+
+                jsr     .get_gamma      ; Get length (A=lo-byte & CC).
+                ldx     <apl_length + 1
+
+        }
+
+                ldy     <apl_offset + 1 ; If offset < 256.
+                beq     .lt256
+                cpy     #$7D            ; If offset >= 32000, length += 2.
+                bcs     .match_plus2
+                cpy     #$05            ; If offset >= 1280, length += 1.
+                bcs     .match_plus1
+                bcc     .do_match
+.lt256:         ldy     <apl_offset + 0 ; If offset < 128, length += 2.
+                bmi     .do_match
+
+        !if APL_ENHANCED {
+        } else {
+                sec                     ; aPLib gamma returns with CC.
+        }
+
+.match_plus2:   adc     #1              ; CS, so ADC #2.
+                bcs     .match_plus256
+
+.match_plus1:   adc     #0              ; CS, so ADC #1, or CC if fall
+                bcc     .do_match       ; through from .match_plus2.
+
+.match_plus256: inx
+
+.do_match:      eor     #$FF            ; Negate the lo-byte of length
+                tay                     ; and check for zero.
+                iny
+                beq     .calc_addr
+                eor     #$FF
+
+                inx                     ; Increment # of pages to copy.
+
+                clc                     ; Calc destination for partial
+                adc     <apl_dstptr + 0 ; page.
+                sta     <apl_dstptr + 0
+                bcs     .calc_addr
+                dec     <apl_dstptr + 1
+
+.calc_addr:     sec                     ; Calc address of match.
+                lda     <apl_dstptr + 0
+                sbc     <apl_offset + 0
+                sta     <apl_winptr + 0
+                lda     <apl_dstptr + 1
+                sbc     <apl_offset + 1
+                sta     <apl_winptr + 1
+
+.copy_page:     lda     (apl_winptr),y
+                sta     (apl_dstptr),y
+                iny
+                bne     .copy_page
+                inc     <apl_winptr + 1
+                inc     <apl_dstptr + 1
+                dex                     ; Any full pages left to copy?
+                bne     .copy_page
+
+                inx                     ; LWM=1.
+                jmp     .next_tag
+
+        ;
+        ; Subroutines for byte & bit handling.
+        ;
+
+.load_bit:      +APL_GET_SRC            ; Reload an empty bit-buffer
+                rol                     ; from the compressed source.
+                sta     <apl_bitbuf
+                rts
diff --git a/tools/apultra_src/asm/ARM7TDMI/aplib_arm.s b/tools/apultra_src/asm/ARM7TDMI/aplib_arm.s
new file mode 100644
index 0000000..b6d0cef
--- /dev/null
+++ b/tools/apultra_src/asm/ARM7TDMI/aplib_arm.s
@@ -0,0 +1,150 @@
+@APlib ARM7 decompressor by Dan Weiss, based on the original C version
+@Takes in raw apacked data, NOT data created by the 'safe' compressor.
+@Code is from the PocketNES NES Emulator for GBA
+
+@Code is formatted for GNU Assembler
+
+        src         .req    r0
+        dest        .req    r1
+        byte        .req    r2
+        mask        .req    r3
+        gamma       .req    r4
+        lwm         .req    r6
+        recentoff   .req    r7
+        temp        .req    r8
+
+.global depack
+.type depack STT_FUNC
+
+@r0 = src
+@r1 = dest
+@r2 = byte
+@r3 = rotating bit mask
+@r4 = increasing gamma
+@r6 = lwm
+@r7 = recentoff
+@r8 = lr copy/scratch
+
+        .macro GETBIT           @3 instructions
+        movs    mask,mask,ror #1
+        ldrcsb  byte,[src],#1
+        tst     byte,mask
+        .endm
+
+        .macro GETBITGAMMA      @5 instructions
+        mov     gamma,gamma,lsl #1
+        GETBIT
+        addne   gamma,gamma,#1
+        .endm
+
+@This initialization code can go into slow memory
+
+depack:
+        stmfd   sp!,{r4-r10,lr}
+        ldrb    temp,[src],#1
+        strb    temp,[dest],#1
+        ldr     mask,=0x01010101
+        b       aploop_nolwm
+
+@This inner-loop code should be placed into fast memory
+
+        @depack enters here
+aploop_nolwm:
+        mov     lwm,#0
+aploop:
+        GETBIT
+        bne     apbranch1
+        ldrb    temp,[src],#1
+        strb    temp,[dest],#1
+        b       aploop_nolwm
+apbranch1:
+        GETBIT
+        beq     apbranch2
+        GETBIT
+        beq     apbranch3
+        @get an offset
+        mov     gamma,#0
+        GETBIT
+        addne   gamma,gamma,#1
+        GETBITGAMMA
+        GETBITGAMMA
+        GETBITGAMMA
+        cmp     gamma,#0
+        ldrneb  gamma,[dest,-gamma]
+        strb    gamma,[dest],#1
+        b       aploop_nolwm
+apbranch3:
+        @use 7 bit offset, length = 2 or 3
+        @if a zero is encountered here, it's EOF
+        ldrb    gamma,[src],#1
+        movs    recentoff,gamma,lsr #1
+        beq     done
+        ldrcsb  temp,[dest,-recentoff]
+        strcsb  temp,[dest],#1
+        ldrb    temp,[dest,-recentoff]
+        strb    temp,[dest],#1
+        ldrb    temp,[dest,-recentoff]
+        strb    temp,[dest],#1
+        mov     lwm,#1
+        b       aploop
+apbranch2:
+        @use a gamma code * 256 for offset, another gamma code for length
+
+        bl      ap_getgamma
+        sub     gamma,gamma,#2
+        cmp     lwm,#0
+        bne     ap_is_lwm
+        mov     lwm,#1
+        cmp     gamma,#0
+        bne     ap_not_zero_gamma
+
+        @if gamma code is 2, use old recent offset, and a new gamma code for length
+        bl      ap_getgamma
+copyloop1:
+        ldrb    temp,[dest,-recentoff]
+        strb    temp,[dest],#1
+        subs    gamma,gamma,#1
+        bne     copyloop1
+        b       aploop
+
+ap_not_zero_gamma:
+        sub     gamma,gamma,#1
+ap_is_lwm:
+        ldrb    temp,[src],#1
+        add     recentoff,temp,gamma,lsl #8
+        bl      ap_getgamma
+        @gamma=length
+        cmp     recentoff,#32000
+        addge   gamma,gamma,#1
+        cmp     recentoff,#1280
+        addge   gamma,gamma,#1
+        cmp     recentoff,#128
+        addlt   gamma,gamma,#2
+copyloop2:
+        ldrb    temp,[dest,-recentoff]
+        strb    temp,[dest],#1
+        subs    gamma,gamma,#1
+        bne     copyloop2
+        b       aploop
+
+ap_getgamma:
+        mov     gamma,#1
+ap_getgammaloop:
+        GETBITGAMMA
+        GETBIT
+        bne     ap_getgammaloop
+        bx      lr
+
+done:
+        ldmfd   sp!,{r4-r10,lr}
+        bx      lr
+
+.unreq src
+.unreq dest
+.unreq byte
+.unreq mask
+.unreq gamma
+.unreq lwm
+.unreq recentoff
+.unreq temp
+
diff --git a/tools/apultra_src/asm/Z80/aplib_z80.asm b/tools/apultra_src/asm/Z80/aplib_z80.asm
new file mode 100644
index 0000000..6843a14
--- /dev/null
+++ b/tools/apultra_src/asm/Z80/aplib_z80.asm
@@ -0,0 +1,190 @@
+;Z80 Version by Dan Weiss
+;Call depack.
+;hl = source
+;de = dest
+
+ap_bits:   .db 0
+ap_byte:   .db 0
+lwm:       .db 0
+r0:        .dw 0
+
+ap_getbit:
+        push    bc
+        ld      bc,(ap_bits)
+        rrc     c
+        jr      nc,ap_getbit_continue
+        ld      b,(hl)
+        inc     hl
+ap_getbit_continue:
+        ld      a,c
+        and     b
+        ld      (ap_bits),bc
+        pop     bc
+        ret
+
+ap_getbitbc:    ;doubles BC and adds the read bit
+        sla     c
+        rl      b
+        call    ap_getbit
+        ret     z
+        inc     bc
+        ret
+
+ap_getgamma:
+        ld      bc,1
+ap_getgammaloop:
+        call    ap_getbitbc
+        call    ap_getbit
+        jr      nz,ap_getgammaloop
+        ret
+
+
+depack:
+        ;hl = source
+        ;de = dest
+        ldi
+        xor     a
+        ld      (lwm),a
+        inc     a
+        ld      (ap_bits),a
+
+aploop:
+        call    ap_getbit
+        jp      z, apbranch1
+        call    ap_getbit
+        jr      z, apbranch2
+        call    ap_getbit
+        jr      z, apbranch3
+        ;LWM = 0
+        xor     a
+        ld      (lwm),a
+        ;get an offset
+        ld      bc,0
+        call    ap_getbitbc
+        call    ap_getbitbc
+        call    ap_getbitbc
+        call    ap_getbitbc
+        ld      a,b
+        or      c
+        jr      nz,apbranch4
+        xor     a               ;write a 0
+        ld      (de),a
+        inc     de
+        jr      aploop
+apbranch4:
+        ex      de,hl           ;write a previous byte (1-15 away from dest)
+        push    hl
+        sbc     hl,bc
+        ld      a,(hl)
+        pop     hl
+        ld      (hl),a
+        inc     hl
+        ex      de,hl
+        jr      aploop
+apbranch3:
+        ;use 7 bit offset, length = 2 or 3
+        ;if a zero is encountered here, it's EOF
+        ld      c,(hl)
+        inc     hl
+        rr      c
+        ret     z
+        ld      b,2
+        jr      nc,ap_dont_inc_b
+        inc     b
+ap_dont_inc_b:
+        ;LWM = 1
+        ld      a,1
+        ld      (lwm),a
+
+        push    hl
+        ld      a,b
+        ld      b,0
+        ;R0 = c
+        ld      (r0),bc
+        ld      h,d
+        ld      l,e
+        or      a
+        sbc     hl,bc
+        ld      c,a
+        ldir
+        pop     hl
+        jr      aploop
+apbranch2:
+        ;use a gamma code * 256 for offset, another gamma code for length
+        call    ap_getgamma
+        dec     bc
+        dec     bc
+        ld      a,(lwm)
+        or      a
+        jr      nz,ap_not_lwm
+        ;bc = 2?
+        ld      a,b
+        or      c
+        jr      nz,ap_not_zero_gamma
+        ;if gamma code is 2, use old r0 offset, and a new gamma code for length
+        call    ap_getgamma
+        push    hl
+        ld      h,d
+        ld      l,e
+        push    bc
+        ld      bc,(r0)
+        sbc     hl,bc
+        pop     bc
+        ldir
+        pop     hl
+        jr      ap_finishup
+
+ap_not_zero_gamma:
+        dec     bc
+ap_not_lwm:
+        ;do I even need this code?
+        ;bc=bc*256+(hl), lazy 16bit way
+        ld      b,c
+        ld      c,(hl)
+        inc     hl
+        ld      (r0),bc
+        push    bc
+        call    ap_getgamma
+        ex      (sp),hl
+        ;bc = len, hl=offs
+        push    de
+        ex      de,hl
+        ;some comparison junk for some reason
+        ld      hl,31999
+        or      a
+        sbc     hl,de
+        jr      nc,skip1
+        inc     bc
+skip1:
+        ld      hl,1279
+        or      a
+        sbc     hl,de
+        jr      nc,skip2
+        inc     bc
+skip2:
+        ld      hl,127
+        or      a
+        sbc     hl,de
+        jr      c,skip3
+        inc     bc
+        inc     bc
+skip3:
+        ;bc = len, de = offs, hl=junk
+        pop     hl
+        push    hl
+        or      a
+        sbc     hl,de
+        pop     de
+        ;hl=dest-offs, bc=len, de = dest
+        ldir
+        pop     hl
+ap_finishup:
+        ld      a,1
+        ld      (lwm),a
+        jp      aploop
+
+apbranch1:
+        ldi
+        xor     a
+        ld      (lwm),a
+        jp      aploop
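
For readers comparing the three decompressors above, this is a minimal C sketch of the aPLib token scheme they all implement: the first byte is copied verbatim, then tag bits select literals, gamma-coded LZSS pairs, 7-bit-offset matches, and 4-bit-offset single bytes. It is distilled from the routines in this import rather than copied from any of them; the names (apl_depack_sketch, get_bit, get_gamma) are invented for the example, the bit-reader state is kept in globals for brevity, and there is no bounds checking.

#include <stddef.h>
#include <stdint.h>

/* Sketch only: mirrors the branch structure of the 6502/ARM/Z80 routines. */
static const uint8_t *g_src;
static uint8_t g_tag;                    /* current tag byte                 */
static uint8_t g_mask;                   /* 0 forces a reload, as in the asm */

static int get_bit(void)
{
    if (g_mask == 0) { g_tag = *g_src++; g_mask = 0x80; }
    int bit = (g_tag & g_mask) != 0;
    g_mask >>= 1;
    return bit;
}

static uint32_t get_gamma(void)          /* same code as ap_getgamma, >= 2 */
{
    uint32_t v = 1;
    do { v = (v << 1) + get_bit(); } while (get_bit());
    return v;
}

size_t apl_depack_sketch(const uint8_t *src, uint8_t *dst)
{
    uint8_t *out = dst;
    uint32_t offs = 0, lwm = 0;

    g_src = src; g_mask = 0;
    *out++ = *g_src++;                   /* first byte is always literal */

    for (;;) {
        if (!get_bit()) {                /* 0: literal byte              */
            *out++ = *g_src++;
            lwm = 0;
        } else if (!get_bit()) {         /* 10: gamma offset + length    */
            uint32_t hi = get_gamma(), len;
            if (lwm == 0 && hi == 2) {   /* reuse the previous offset    */
                len = get_gamma();
            } else {
                hi -= (lwm == 0) ? 3 : 2;
                offs = (hi << 8) + *g_src++;
                len = get_gamma();
                if (offs >= 32000) len++;
                if (offs >= 1280)  len++;
                if (offs < 128)    len += 2;
            }
            while (len--) { *out = out[-(ptrdiff_t)offs]; out++; }
            lwm = 1;
        } else if (!get_bit()) {         /* 110: 7-bit offset, len 2..3  */
            uint32_t c = *g_src++;
            if ((c >> 1) == 0) break;    /* offset 0 terminates stream   */
            offs = c >> 1;
            uint32_t len = 2 + (c & 1);
            while (len--) { *out = out[-(ptrdiff_t)offs]; out++; }
            lwm = 1;
        } else {                         /* 111: 4-bit offset, one byte  */
            uint32_t o = 0;
            for (int i = 0; i < 4; i++) o = (o << 1) + get_bit();
            *out = o ? out[-(ptrdiff_t)o] : 0;
            out++;
            lwm = 0;
        }
    }
    return (size_t)(out - dst);
}

The length adjustments for offsets >= 32000, >= 1280 and < 128 correspond to the cpy #$7D / cpy #$05 tests in the 6502 version, the cmp recentoff,#32000 chain in the ARM version, and the 31999/1279/127 comparisons in the Z80 version.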