aboutsummaryrefslogtreecommitdiff
path: root/tools/apultra_src/asm
diff options
context:
space:
mode:
authorJuan J. Martinez <jjm@usebox.net>2023-11-05 11:22:55 +0000
committerJuan J. Martinez <jjm@usebox.net>2023-11-05 11:31:28 +0000
commit2fbdf974338bde8576efdae40a819a76b2391033 (patch)
tree64d41a37470143f142344f9a439d96de3e7918c2 /tools/apultra_src/asm
downloadkitsunes-curse-2fbdf974338bde8576efdae40a819a76b2391033.tar.gz
kitsunes-curse-2fbdf974338bde8576efdae40a819a76b2391033.zip
Initial import of the open source release
Diffstat (limited to 'tools/apultra_src/asm')
-rw-r--r--tools/apultra_src/asm/6502/aplib_6502.asm371
-rw-r--r--tools/apultra_src/asm/ARM7TDMI/aplib_arm.s150
-rw-r--r--tools/apultra_src/asm/Z80/aplib_z80.asm190
3 files changed, 711 insertions, 0 deletions
diff --git a/tools/apultra_src/asm/6502/aplib_6502.asm b/tools/apultra_src/asm/6502/aplib_6502.asm
new file mode 100644
index 0000000..219ac86
--- /dev/null
+++ b/tools/apultra_src/asm/6502/aplib_6502.asm
@@ -0,0 +1,371 @@
+; ***************************************************************************
+; ***************************************************************************
+;
+; aplib_6502.s
+;
+; NMOS 6502 decompressor for data stored in Jorgen Ibsen's aPLib format.
+;
+; Includes support for Emmanuel Marty's enhancements to the aPLib format.
+;
+; The code is 252 bytes long for standard format, 270 for enhanced format.
+;
+; This code is written for the ACME assembler.
+;
+; Copyright John Brandwood 2019.
+;
+; Distributed under the Boost Software License, Version 1.0.
+; (See accompanying file LICENSE_1_0.txt or copy at
+; http://www.boost.org/LICENSE_1_0.txt)
+;
+; ***************************************************************************
+; ***************************************************************************
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; Decompression Options & Macros
+;
+
+ ;
+ ; Use the enhanced format from Emmanuel Marty's APULTRA?
+ ;
+ ; The enhancements speed up decompression on an 8-bit CPU.
+ ;
+ ; This gives an 11% improvement in decompresison speed, but
+ ; breaks compatibility with standard aPLib encoders.
+ ;
+
+APL_ENHANCED = 0
+
+ ;
+ ; Assume that we're decompessing from a large multi-bank
+ ; compressed data file, and that the next bank may need to
+ ; paged in when a page-boundary is crossed.
+ ;
+
+APL_FROM_BANK = 0
+
+ ;
+ ; Macro to increment the source pointer to the next page.
+ ;
+
+ !if APL_FROM_BANK {
+ !macro APL_INC_PAGE {
+ jsr .next_page
+ }
+ } else {
+ !macro APL_INC_PAGE {
+ inc <apl_srcptr + 1
+ }
+ }
+
+ ;
+ ; Macro to read a byte from the compressed source data.
+ ;
+
+ !macro APL_GET_SRC {
+ lda (apl_srcptr),y
+ inc <apl_srcptr + 0
+ bne .skip
+ +APL_INC_PAGE
+.skip:
+ }
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; Data usage is last 12 bytes of zero-page.
+;
+
+ !if APL_ENHANCED {
+apl_nibflg = $F4 ; 1 byte.
+apl_nibble = $F5 ; 1 byte.
+apl_egamma = $F6 ; 1 byte.
+ }
+
+apl_bitbuf = $F7 ; 1 byte.
+apl_offset = $F8 ; 1 word.
+apl_winptr = $FA ; 1 word.
+apl_srcptr = $FC ; 1 word.
+apl_dstptr = $FE ; 1 word.
+apl_length = apl_winptr
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; apl_decompress - Decompress data stored in Jorgen Ibsen's aPLib format.
+;
+; Args: apl_srcptr = ptr to compessed data
+; Args: apl_dstptr = ptr to output buffer
+; Uses: lots!
+;
+; If compiled with APL_FROM_BANK, then apl_srcptr should be within the bank
+; window range.
+;
+; As an optimization, the code to handle window offsets > 64768 bytes has
+; been removed, since these don't occur with a 16-bit address range.
+;
+; As an optimization, the code to handle window offsets > 32000 bytes can
+; be commented-out, since these don't occur in typical 8-bit computer usage.
+;
+
+apl_decompress: ldy #0 ; Initialize source index.
+
+ lda #$80 ; Initialize an empty
+ sta <apl_bitbuf ; bit-buffer.
+
+ !if APL_ENHANCED {
+ sta <apl_egamma ; Bit-buffer for gamma pairs.
+ sty <apl_nibflg ; Reset the flag.
+ }
+
+ ;
+ ; 0 bbbbbbbb - One byte from compressed data, i.e. a "literal".
+ ;
+
+.literal: +APL_GET_SRC
+
+.write_byte: ldx #0 ; LWM=0.
+
+ sta (apl_dstptr),y ; Write the byte directly to
+ inc <apl_dstptr + 0 ; the output.
+ bne .next_tag
+ inc <apl_dstptr + 1
+
+.next_tag: asl <apl_bitbuf ; 0 bbbbbbbb
+ bne .skip0
+ jsr .load_bit
+.skip0: bcc .literal
+
+.skip1: asl <apl_bitbuf ; 1 0 <offset> <length>
+ bne .skip2
+ jsr .load_bit
+.skip2: bcc .copy_large
+
+ asl <apl_bitbuf ; 1 1 0 dddddddn
+ bne .skip3
+ jsr .load_bit
+.skip3: bcc .copy_normal
+
+ ; 1 1 1 dddd - Copy 1 byte within 15 bytes (or zero).
+
+ !if APL_ENHANCED {
+
+.copy_short: lsr <apl_nibflg ; Is there a nibble waiting?
+ lda <apl_nibble ; Extract the lo-nibble.
+ bcs .skip4
+
+ inc <apl_nibflg ; Reset the flag.
+ +APL_GET_SRC
+ sta <apl_nibble ; Preserve for next time.
+ lsr ; Extract the hi-nibble.
+ lsr
+ lsr
+ lsr
+
+.skip4: and #$0F ; Current nibble.
+ beq .write_byte ; Offset=0 means write zero.
+
+ } else {
+
+.copy_short: lda #$10
+.nibble_loop: asl <apl_bitbuf
+ bne .skip4
+ pha
+ jsr .load_bit
+ pla
+.skip4: rol
+ bcc .nibble_loop
+ beq .write_byte ; Offset=0 means write zero.
+
+ }
+
+ eor #$FF ; Read the byte directly from
+ tay ; the destination window.
+ iny
+ dec <apl_dstptr + 1
+ lda (apl_dstptr),y
+ inc <apl_dstptr + 1
+ ldy #0
+ beq .write_byte
+
+ ;
+ ; 1 1 0 dddddddn - Copy 2 or 3 within 128 bytes.
+ ;
+
+.copy_normal: +APL_GET_SRC ; 1 1 0 dddddddn
+ lsr
+ beq .finished ; Offset 0 == EOF.
+
+ sta <apl_offset + 0 ; Preserve offset.
+ sty <apl_offset + 1
+ tya ; Y == 0.
+ tax ; Bits 8..15 of length.
+ adc #2 ; Bits 0...7 of length.
+ bne .do_match ; NZ from previous ADC.
+
+ ;
+ ; Subroutines for byte & bit handling.
+ ;
+
+ !if APL_ENHANCED {
+
+.get_gamma: lda #1 ; Get a gamma-coded value.
+.gamma_loop: asl <apl_egamma
+ bne .rotate_gamma
+ pha
+ +APL_GET_SRC ; Reload an empty bit-buffer
+ rol ; from the compressed source.
+ sta <apl_egamma
+ pla
+.rotate_gamma: rol
+ bcs .big_gamma ; Got 8 bits, now read rest.
+ asl <apl_egamma
+ bcc .gamma_loop
+ rts ; Always returns CS.
+
+.big_gamma: pha ; Read remaining bits of length
+ tya ; larger than 255. This is very
+ jsr .rotate_gamma ; rare, so it saves cycles on
+ tax ; the 6502 to do it this way.
+ pla
+
+.finished: rts ; All decompressed!
+
+ } else {
+
+.get_gamma: lda #1 ; Get a gamma-coded value.
+.gamma_loop: asl <apl_bitbuf
+ bne .skip5
+ pha
+ jsr .load_bit
+ pla
+.skip5: rol
+ rol <apl_length + 1
+ asl <apl_bitbuf
+ bne .skip6
+ pha
+ jsr .load_bit
+ pla
+.skip6: bcs .gamma_loop
+
+.finished: rts ; All decompressed!
+
+ }
+
+ ;
+ ; 1 0 <offset> <length> - gamma-coded LZSS pair.
+ ;
+
+ !if APL_ENHANCED {
+
+.copy_large: jsr .get_gamma ; Bits 8..15 of offset (min 2).
+
+ cpx #1 ; CC if LWM==0, CS if LWM==1.
+ ldx #0 ; Clear hi-byte of length.
+ sbc #2 ; -3 if LWM==0, -2 if LWM==1.
+ bcs .normal_pair ; CC if LWM==0 && offset==2.
+
+ jsr .get_gamma ; Get length (A=lo-byte & CS).
+ bcs .do_match ; Use previous Offset.
+
+.normal_pair: sta <apl_offset + 1 ; Save bits 8..15 of offset.
+
+ +APL_GET_SRC
+ sta <apl_offset + 0 ; Save bits 0...7 of offset.
+
+ jsr .get_gamma ; Get length (A=lo-byte & CS).
+
+ } else {
+
+.copy_large: jsr .get_gamma ; Bits 8..15 of offset (min 2).
+ sty <apl_length + 1 ; Clear hi-byte of length.
+
+ cpx #1 ; CC if LWM==0, CS if LWM==1.
+ sbc #2 ; -3 if LWM==0, -2 if LWM==1.
+ bcs .normal_pair ; CC if LWM==0 && offset==2.
+
+ jsr .get_gamma ; Get length (A=lo-byte & CC).
+ ldx <apl_length + 1
+ bcc .do_match ; Use previous Offset.
+
+.normal_pair: sta <apl_offset + 1 ; Save bits 8..15 of offset.
+
+ +APL_GET_SRC
+ sta <apl_offset + 0 ; Save bits 0...7 of offset.
+
+ jsr .get_gamma ; Get length (A=lo-byte & CC).
+ ldx <apl_length + 1
+
+ }
+
+ ldy <apl_offset + 1 ; If offset < 256.
+ beq .lt256
+ cpy #$7D ; If offset >= 32000, length += 2.
+ bcs .match_plus2
+ cpy #$05 ; If offset >= 1280, length += 1.
+ bcs .match_plus1
+ bcc .do_match
+.lt256: ldy <apl_offset + 0 ; If offset < 128, length += 2.
+ bmi .do_match
+
+ !if APL_ENHANCED {
+ } else {
+ sec ; aPLib gamma returns with CC.
+ }
+
+.match_plus2: adc #1 ; CS, so ADC #2.
+ bcs .match_plus256
+
+.match_plus1: adc #0 ; CS, so ADC #1, or CC if fall
+ bcc .do_match ; through from .match_plus2.
+
+.match_plus256: inx
+
+.do_match: eor #$FF ; Negate the lo-byte of length
+ tay ; and check for zero.
+ iny
+ beq .calc_addr
+ eor #$FF
+
+ inx ; Increment # of pages to copy.
+
+ clc ; Calc destination for partial
+ adc <apl_dstptr + 0 ; page.
+ sta <apl_dstptr + 0
+ bcs .calc_addr
+ dec <apl_dstptr + 1
+
+.calc_addr: sec ; Calc address of match.
+ lda <apl_dstptr + 0
+ sbc <apl_offset + 0
+ sta <apl_winptr + 0
+ lda <apl_dstptr + 1
+ sbc <apl_offset + 1
+ sta <apl_winptr + 1
+
+.copy_page: lda (apl_winptr),y
+ sta (apl_dstptr),y
+ iny
+ bne .copy_page
+ inc <apl_winptr + 1
+ inc <apl_dstptr + 1
+ dex ; Any full pages left to copy?
+ bne .copy_page
+
+ inx ; LWM=1.
+ jmp .next_tag
+
+ ;
+ ; Subroutines for byte & bit handling.
+ ;
+
+.load_bit: +APL_GET_SRC ; Reload an empty bit-buffer
+ rol ; from the compressed source.
+ sta <apl_bitbuf
+ rts
diff --git a/tools/apultra_src/asm/ARM7TDMI/aplib_arm.s b/tools/apultra_src/asm/ARM7TDMI/aplib_arm.s
new file mode 100644
index 0000000..b6d0cef
--- /dev/null
+++ b/tools/apultra_src/asm/ARM7TDMI/aplib_arm.s
@@ -0,0 +1,150 @@
+@APlib ARM7 decompressor by Dan Weiss, based on the original C version
+@Takes in raw apacked data, NOT data created by the 'safe' compressor.
+@Code is from the PocketNES NES Emulator for GBA
+
+@Code is formatted for GNU Assembler
+
+ src .req r0
+ dest .req r1
+ byte .req r2
+ mask .req r3
+ gamma .req r4
+ lwm .req r6
+ recentoff .req r7
+ temp .req r8
+
+.global depack
+.type depack STT_FUNC
+
+@r0 = src
+@r1 = dest
+@r2 = byte
+@r3 = rotating bit mask
+@r4 = increasing gamma
+@r6 = lwm
+@r7 = recentoff
+@r8 = lr copy/scratch
+
+ .macro GETBIT @3 instructions
+ movs mask,mask,ror #1
+ ldrcsb byte,[src],#1
+ tst byte,mask
+ .endm
+
+ .macro GETBITGAMMA @5 instructions
+ mov gamma,gamma,lsl #1
+ GETBIT
+ addne gamma,gamma,#1
+ .endm
+
+@This initilaiztion code can go into slow memory
+
+depack:
+ stmfd sp!,{r4-r10,lr}
+ ldrb temp,[src],#1
+ strb temp,[dest],#1
+ ldr mask,=0x01010101
+ b aploop_nolwm
+
+@This inner-loop code should be placed into fast memory
+
+ @depack enters here
+aploop_nolwm:
+ mov lwm,#0
+aploop:
+ GETBIT
+ bne apbranch1
+ ldrb temp,[src],#1
+ strb temp,[dest],#1
+ b aploop_nolwm
+apbranch1:
+ GETBIT
+ beq apbranch2
+ GETBIT
+ beq apbranch3
+ @get an offset
+ mov gamma,#0
+ GETBIT
+ addne gamma,gamma,#1
+ GETBITGAMMA
+ GETBITGAMMA
+ GETBITGAMMA
+ cmp gamma,#0
+ ldrneb gamma,[dest,-gamma]
+ strb gamma,[dest],#1
+ b aploop_nolwm
+apbranch3:
+ @use 7 bit offset, length = 2 or 3
+ @if a zero is encountered here, it's EOF
+ ldrb gamma,[src],#1
+ movs recentoff,gamma,lsr #1
+ beq done
+ ldrcsb temp,[dest,-recentoff]
+ strcsb temp,[dest],#1
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ mov lwm,#1
+ b aploop
+apbranch2:
+ @use a gamma code * 256 for offset, another gamma code for length
+
+ bl ap_getgamma
+ sub gamma,gamma,#2
+ cmp lwm,#0
+ bne ap_is_lwm
+ mov lwm,#1
+ cmp gamma,#0
+ bne ap_not_zero_gamma
+
+ @if gamma code is 2, use old recent offset, and a new gamma code for length
+ bl ap_getgamma
+copyloop1:
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ subs gamma,gamma,#1
+ bne copyloop1
+ b aploop
+
+ap_not_zero_gamma:
+ sub gamma,gamma,#1
+ap_is_lwm:
+ ldrb temp,[src],#1
+ add recentoff,temp,gamma,lsl #8
+ bl ap_getgamma
+ @gamma=length
+ cmp recentoff,#32000
+ addge gamma,gamma,#1
+ cmp recentoff,#1280
+ addge gamma,gamma,#1
+ cmp recentoff,#128
+ addlt gamma,gamma,#2
+copyloop2:
+ ldrb temp,[dest,-recentoff]
+ strb temp,[dest],#1
+ subs gamma,gamma,#1
+ bne copyloop2
+ b aploop
+
+ap_getgamma:
+ mov gamma,#1
+ap_getgammaloop:
+ GETBITGAMMA
+ GETBIT
+ bne ap_getgammaloop
+ bx lr
+
+done:
+ ldmfd sp!,{r4-r10,lr}
+ bx lr
+
+.unreq src
+.unreq dest
+.unreq byte
+.unreq mask
+.unreq gamma
+.unreq lwm
+.unreq recentoff
+.unreq temp
+
diff --git a/tools/apultra_src/asm/Z80/aplib_z80.asm b/tools/apultra_src/asm/Z80/aplib_z80.asm
new file mode 100644
index 0000000..6843a14
--- /dev/null
+++ b/tools/apultra_src/asm/Z80/aplib_z80.asm
@@ -0,0 +1,190 @@
+;Z80 Version by Dan Weiss
+;Call depack.
+;hl = source
+;de = dest
+
+ap_bits: .db 0
+ap_byte: .db 0
+lwm: .db 0
+r0: .dw 0
+
+ap_getbit:
+ push bc
+ ld bc,(ap_bits)
+ rrc c
+ jr nc,ap_getbit_continue
+ ld b,(hl)
+ inc hl
+ap_getbit_continue:
+ ld a,c
+ and b
+ ld (ap_bits),bc
+ pop bc
+ ret
+
+ap_getbitbc: ;doubles BC and adds the read bit
+ sla c
+ rl b
+ call ap_getbit
+ ret z
+ inc bc
+ ret
+
+ap_getgamma:
+ ld bc,1
+ap_getgammaloop:
+ call ap_getbitbc
+ call ap_getbit
+ jr nz,ap_getgammaloop
+ ret
+
+
+depack:
+ ;hl = source
+ ;de = dest
+ ldi
+ xor a
+ ld (lwm),a
+ inc a
+ ld (ap_bits),a
+
+aploop:
+ call ap_getbit
+ jp z, apbranch1
+ call ap_getbit
+ jr z, apbranch2
+ call ap_getbit
+ jr z, apbranch3
+ ;LWM = 0
+ xor a
+ ld (lwm),a
+ ;get an offset
+ ld bc,0
+ call ap_getbitbc
+ call ap_getbitbc
+ call ap_getbitbc
+ call ap_getbitbc
+ ld a,b
+ or c
+ jr nz,apbranch4
+ xor a ;write a 0
+ ld (de),a
+ inc de
+ jr aploop
+apbranch4:
+ ex de,hl ;write a previous bit (1-15 away from dest)
+ push hl
+ sbc hl,bc
+ ld a,(hl)
+ pop hl
+ ld (hl),a
+ inc hl
+ ex de,hl
+ jr aploop
+apbranch3:
+ ;use 7 bit offset, length = 2 or 3
+ ;if a zero is encountered here, it's EOF
+ ld c,(hl)
+ inc hl
+ rr c
+ ret z
+ ld b,2
+ jr nc,ap_dont_inc_b
+ inc b
+ap_dont_inc_b:
+ ;LWM = 1
+ ld a,1
+ ld (lwm),a
+
+ push hl
+ ld a,b
+ ld b,0
+ ;R0 = c
+ ld (r0),bc
+ ld h,d
+ ld l,e
+ or a
+ sbc hl,bc
+ ld c,a
+ ldir
+ pop hl
+ jr aploop
+apbranch2:
+ ;use a gamma code * 256 for offset, another gamma code for length
+ call ap_getgamma
+ dec bc
+ dec bc
+ ld a,(lwm)
+ or a
+ jr nz,ap_not_lwm
+ ;bc = 2?
+ ld a,b
+ or c
+ jr nz,ap_not_zero_gamma
+ ;if gamma code is 2, use old r0 offset, and a new gamma code for length
+ call ap_getgamma
+ push hl
+ ld h,d
+ ld l,e
+ push bc
+ ld bc,(r0)
+ sbc hl,bc
+ pop bc
+ ldir
+ pop hl
+ jr ap_finishup
+
+ap_not_zero_gamma:
+ dec bc
+ap_not_lwm:
+ ;do I even need this code?
+ ;bc=bc*256+(hl), lazy 16bit way
+ ld b,c
+ ld c,(hl)
+ inc hl
+ ld (r0),bc
+ push bc
+ call ap_getgamma
+ ex (sp),hl
+ ;bc = len, hl=offs
+ push de
+ ex de,hl
+ ;some comparison junk for some reason
+ ld hl,31999
+ or a
+ sbc hl,de
+ jr nc,skip1
+ inc bc
+skip1:
+ ld hl,1279
+ or a
+ sbc hl,de
+ jr nc,skip2
+ inc bc
+skip2:
+ ld hl,127
+ or a
+ sbc hl,de
+ jr c,skip3
+ inc bc
+ inc bc
+skip3:
+ ;bc = len, de = offs, hl=junk
+ pop hl
+ push hl
+ or a
+ sbc hl,de
+ pop de
+ ;hl=dest-offs, bc=len, de = dest
+ ldir
+ pop hl
+ap_finishup:
+ ld a,1
+ ld (lwm),a
+ jp aploop
+
+apbranch1:
+ ldi
+ xor a
+ ld (lwm),a
+ jp aploop