Extra libs: ap.lib

aPLib support with apultra.
author: Juan J. Martinez <jjm@usebox.net> 2021-01-09 09:01:05 +0000
committer: Juan J. Martinez <jjm@usebox.net> 2021-01-09 09:01:05 +0000
commit: 9bcf1e97960c0da7322a868efdbc07e2650716fe (patch)
tree: de6d32ad5b0e567991bd3eb262902c15a77074d9 /tools
parent: 3b31adf01305e522f7e28c1435fb47418ce43267 (diff)
download: ubox-msx-lib-9bcf1e97960c0da7322a868efdbc07e2650716fe.tar.gz
ubox-msx-lib-9bcf1e97960c0da7322a868efdbc07e2650716fe.zip
58 files changed, 9672 insertions, 1 deletions
diff --git a/tools/Makefile b/tools/Makefile
index 6223252..1c72392 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -1,4 +1,4 @@
-BIN=../bin/hex2bin ../bin/rasm
+BIN=../bin/hex2bin ../bin/rasm ../bin/apultra
 
 all: $(BIN)
 
@@ -11,6 +11,9 @@ all: $(BIN)
 ../bin/rasm:
 	make -C rasm
 
+../bin/apultra:
+	make -C apultra
+
 .PHONY: all clean
 clean:
 	make -C hex2bin-2.0 cleanall
diff --git a/tools/apultra/LICENSE b/tools/apultra/LICENSE
new file mode 100644
index 0000000..213be1a
--- /dev/null
+++ b/tools/apultra/LICENSE
@@ -0,0 +1,3 @@
+The apultra code is available under the Zlib license, except for src/matchfinder.c which is placed under the Creative Commons CC0 license.
+
+Please consult LICENSE.zlib.md and LICENSE.cc0.md for more information.
diff --git a/tools/apultra/LICENSE.cc0.md b/tools/apultra/LICENSE.cc0.md
new file mode 100644
index 0000000..139c68e
--- /dev/null
+++ b/tools/apultra/LICENSE.cc0.md
@@ -0,0 +1,43 @@
+## creative commons
+
+# CC0 1.0 Universal
+
+CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER.
+
+### Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. __Copyright and Related Rights.__ A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following:
+
+    i. the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work;
+
+    ii. moral rights retained by the original author(s) and/or performer(s);
+
+    iii. publicity and privacy rights pertaining to a person's image or likeness depicted in a Work;
+
+    iv. rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below;
+
+    v. rights protecting the extraction, dissemination, use and reuse of data in a Work;
+
+    vi. database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and
+
+    vii. other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof.
+
+2. __Waiver.__ To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose.
+
+3. __Public License Fallback.__ Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose.
+
+4. __Limitations and Disclaimers.__
+
+    a. No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document.
+
+    b. Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law.
+
+    c. Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work.
+
+    d. Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work.
diff --git a/tools/apultra/LICENSE.zlib.md b/tools/apultra/LICENSE.zlib.md
new file mode 100644
index 0000000..e1296a1
--- /dev/null
+++ b/tools/apultra/LICENSE.zlib.md
@@ -0,0 +1,19 @@
+Copyright (c) 2019 Emmanuel Marty
+
+This software is provided 'as-is', without any express or implied warranty. In
+no event will the authors be held liable for any damages arising from the use of
+this software.
+
+Permission is granted to anyone to use this software for any purpose, including
+commercial applications, and to alter it and redistribute it freely, subject to
+the following restrictions:
+
+1.  The origin of this software must not be misrepresented; you must not claim
+    that you wrote the original software. If you use this software in a product,
+    an acknowledgment in the product documentation would be appreciated but is
+    not required.
+
+2.  Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+3.  This notice may not be removed or altered from any source distribution.
diff --git a/tools/apultra/Makefile b/tools/apultra/Makefile
new file mode 100644
index 0000000..308c9aa
--- /dev/null
+++ b/tools/apultra/Makefile
@@ -0,0 +1,31 @@
+#CC=clang
+CC=gcc
+CFLAGS=-O3 -g -fomit-frame-pointer -Isrc/libdivsufsort/include -Isrc
+OBJDIR=obj
+LDFLAGS=
+
+$(OBJDIR)/%.o: src/../%.c
+	@mkdir -p '$(@D)'
+	$(CC) $(CFLAGS) -c $< -o $@
+
+APP := apultra
+
+OBJS += $(OBJDIR)/src/apultra.o
+OBJS += $(OBJDIR)/src/expand.o
+OBJS += $(OBJDIR)/src/matchfinder.o
+OBJS += $(OBJDIR)/src/shrink.o
+OBJS += $(OBJDIR)/src/libdivsufsort/lib/divsufsort.o
+OBJS += $(OBJDIR)/src/libdivsufsort/lib/divsufsort_utils.o
+OBJS += $(OBJDIR)/src/libdivsufsort/lib/sssort.o
+OBJS += $(OBJDIR)/src/libdivsufsort/lib/trsort.o
+
+all: $(APP)
+
+$(APP): $(OBJS)
+	$(CC) $^ $(LDFLAGS) -o $(APP)
+	cp $(APP) ../../bin
+	@rm -rf $(APP) $(OBJDIR)
+
+clean:
+	@rm -rf $(APP) $(OBJDIR)
+.PHONY: all clean # neither target names a real file; always run their recipes
diff --git a/tools/apultra/README.md b/tools/apultra/README.md
new file mode 100644
index 0000000..9dc1e74
--- /dev/null
+++ b/tools/apultra/README.md
@@ -0,0 +1,44 @@
+apultra -- a new, opensource optimal compressor for the apLib format
+====================================================================
+
+apultra is a command-line tool and a library that compresses bitstreams in the apLib format. 
+
+The tool produces files that are 5 to 7% smaller on average than appack, the apLib compressor. Unlike the similar [cap](https://github.com/svendahl/cap) compressor, apultra can compress files larger than 64K.
+
+apultra is written in portable C. It is fully open-source under a liberal license. You can continue to use the regular apLib decompression libraries for your target environment. You can do whatever you like with it.
+
+    Example compression with vmlinux-5.3.0-1-amd64
+
+    original       27923676 (100,00%)
+    appack         7370129 (26,39%)
+    gzip 1.8       7166179 (25,66%)
+    apultra 1.3.5  6910729 (24,75%)
+
+
+The output is fully compatible with the original [aPLib](http://ibsensoftware.com/products_aPLib.html) by Jørgen Ibsen.
+
+Inspirations:
+
+ * [cap](https://github.com/svendahl/cap) by Sven-Åke Dahl. 
+ * [Charles Bloom](http://cbloomrants.blogspot.com/)'s compression blog. 
+ * [LZ4](https://github.com/lz4/lz4) by Yann Collet. 
+ * spke for help and support
+
+Some projects that use apultra for compression:
+ * [Brick Rick](https://www.usebox.net/jjm/brick-rick/), a new game for the Amstrad CPC 464/6128 by usebox.net. A physical copy can be ordered from [Polyplay](https://www.polyplay.xyz/navi.php?suche=Brick+Rick&lang=eng)
+ * [Kitsune's Curse](https://www.usebox.net/jjm/kitsunes-curse/), another new title for the CPC line by usebox.net.
+ * [Sgt. Helmet's Training Day](https://www.mojontwins.com/juegos_mojonos/sgt-helmet-training-day-2020-cpc/), a new game for the Amstrad CPC by the Mojon Twins (using their MK1 engine).
+ * [Prince Dastan - Sokoban Within](https://www.pouet.net/prod.php?which=87382), a CPCRetroDev 2020 game for the Amstrad CPC by Euphoria Design 
+ * [Petris](https://github.com/bbbbbr/Petris), a homebrew game for the Gameboy.
+ * [Mr Palot](https://github.com/graelx/mrpalot), a ZX Spectrum game made with the Mojon Twins MK1 engine.
+ * [rasm](https://github.com/EdouardBERGE/rasm), a popular Z80 assembler, features built-in support for apultra-compressed data sections.
+
+Also of interest:
+ * [oapack](https://gitlab.com/eugene77/oapack) by Eugene Larchenko, a brute-force (exhaustive) optimal packer for the aPLib format. 
+ * [i8080 decompressors](https://gitlab.com/ivagor/unapack) for aPLib by Ivan Gorodetsky
+ * [Gameboy decompressor](https://github.com/untoxa/UnaPACK.GBZ80) by untoxa
+
+License:
+
+* The apultra code is available under the Zlib license.
+* The match finder (matchfinder.c) is available under the CC0 license due to using portions of code from Eric Biggers' wimlib in the suffix array-based matchfinder.
diff --git a/tools/apultra/VS2017/apultra.sln b/tools/apultra/VS2017/apultra.sln
new file mode 100644
index 0000000..4a644c7
--- /dev/null
+++ b/tools/apultra/VS2017/apultra.sln
@@ -0,0 +1,31 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.489
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "apultra", "apultra.vcxproj", "{3F30FEE8-63C5-4D39-A175-EDD7EA93E9B8}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Debug|x86 = Debug|x86
+		Release|x64 = Release|x64
+		Release|x86 = Release|x86
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{3F30FEE8-63C5-4D39-A175-EDD7EA93E9B8}.Debug|x64.ActiveCfg = Debug|x64
+		{3F30FEE8-63C5-4D39-A175-EDD7EA93E9B8}.Debug|x64.Build.0 = Debug|x64
+		{3F30FEE8-63C5-4D39-A175-EDD7EA93E9B8}.Debug|x86.ActiveCfg = Debug|Win32
+		{3F30FEE8-63C5-4D39-A175-EDD7EA93E9B8}.Debug|x86.Build.0 = Debug|Win32
+		{3F30FEE8-63C5-4D39-A175-EDD7EA93E9B8}.Release|x64.ActiveCfg = Release|x64
+		{3F30FEE8-63C5-4D39-A175-EDD7EA93E9B8}.Release|x64.Build.0 = Release|x64
+		{3F30FEE8-63C5-4D39-A175-EDD7EA93E9B8}.Release|x86.ActiveCfg = Release|Win32
+		{3F30FEE8-63C5-4D39-A175-EDD7EA93E9B8}.Release|x86.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {A1E1655C-AA9F-41F0-80C9-18DD0B859D7C}
+	EndGlobalSection
+EndGlobal
diff --git a/tools/apultra/VS2017/apultra.vcxproj b/tools/apultra/VS2017/apultra.vcxproj
new file mode 100644
index 0000000..5ae0b83
--- /dev/null
+++ b/tools/apultra/VS2017/apultra.vcxproj
@@ -0,0 +1,203 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>15.0</VCProjectVersion>
+    <ProjectGuid>{3F30FEE8-63C5-4D39-A175-EDD7EA93E9B8}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>apultra</RootNamespace>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v141</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v141</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v141</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v141</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+    <OutDir>$(ProjectDir)bin\</OutDir>
+    <TargetName>$(ProjectName)_debug</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <OutDir>$(ProjectDir)bin\</OutDir>
+    <TargetName>$(ProjectName)_debug</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+    <OutDir>$(ProjectDir)bin\</OutDir>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <OutDir>$(ProjectDir)bin\</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <PrecompiledHeaderFile>
+      </PrecompiledHeaderFile>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <AdditionalIncludeDirectories>..\src\libdivsufsort\include;..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <OutputFile>$(ProjectDir)bin\$(TargetName)$(TargetExt)</OutputFile>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <PrecompiledHeaderFile>
+      </PrecompiledHeaderFile>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <AdditionalIncludeDirectories>..\src\libdivsufsort\include;..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <OutputFile>$(ProjectDir)bin\$(TargetName)$(TargetExt)</OutputFile>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <PrecompiledHeaderFile>
+      </PrecompiledHeaderFile>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <AdditionalIncludeDirectories>..\src\libdivsufsort\include;..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <OmitFramePointers>true</OmitFramePointers>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <OutputFile>$(ProjectDir)bin\$(TargetName)$(TargetExt)</OutputFile>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <PrecompiledHeaderFile>
+      </PrecompiledHeaderFile>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <AdditionalIncludeDirectories>..\src\libdivsufsort\include;..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <OmitFramePointers>true</OmitFramePointers>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <OutputFile>$(ProjectDir)bin\$(TargetName)$(TargetExt)</OutputFile>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClInclude Include="..\src\format.h" />
+    <ClInclude Include="..\src\expand.h" />
+    <ClInclude Include="..\src\libapultra.h" />
+    <ClInclude Include="..\src\libdivsufsort\include\divsufsort_config.h" />
+    <ClInclude Include="..\src\libdivsufsort\include\divsufsort.h" />
+    <ClInclude Include="..\src\libdivsufsort\include\divsufsort_private.h" />
+    <ClInclude Include="..\src\matchfinder.h" />
+    <ClInclude Include="..\src\shrink.h" />
+    <ClInclude Include="pch.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\src\expand.c" />
+    <ClCompile Include="..\src\libdivsufsort\lib\divsufsort.c" />
+    <ClCompile Include="..\src\libdivsufsort\lib\sssort.c" />
+    <ClCompile Include="..\src\libdivsufsort\lib\trsort.c" />
+    <ClCompile Include="..\src\libdivsufsort\lib\divsufsort_utils.c" />
+    <ClCompile Include="..\src\apultra.c" />
+    <ClCompile Include="..\src\matchfinder.c" />
+    <ClCompile Include="..\src\shrink.c" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
+\ No newline at end of file
diff --git a/tools/apultra/VS2017/apultra.vcxproj.filters b/tools/apultra/VS2017/apultra.vcxproj.filters
new file mode 100644
index 0000000..3e88982
--- /dev/null
+++ b/tools/apultra/VS2017/apultra.vcxproj.filters
@@ -0,0 +1,81 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Fichiers sources">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="Fichiers d%27en-tête">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;hm;inl;inc;ipp;xsd</Extensions>
+    </Filter>
+    <Filter Include="Fichiers de ressources">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+    <Filter Include="Fichiers sources\libdivsufsort">
+      <UniqueIdentifier>{5ec09c0d-19f7-4a6f-b524-f405fb99e48c}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Fichiers sources\libdivsufsort\lib">
+      <UniqueIdentifier>{a922f475-1322-496d-8a6d-7f1c6b92423d}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Fichiers sources\libdivsufsort\include">
+      <UniqueIdentifier>{bd05c6e8-af92-4ab8-8916-0424cd8d186b}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="pch.h">
+      <Filter>Fichiers d%27en-tête</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\format.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\libdivsufsort\include\divsufsort.h">
+      <Filter>Fichiers sources\libdivsufsort\include</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\libdivsufsort\include\divsufsort_private.h">
+      <Filter>Fichiers sources\libdivsufsort\include</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\matchfinder.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\libdivsufsort\include\divsufsort_config.h">
+      <Filter>Fichiers sources\libdivsufsort\include</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\shrink.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\expand.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\libapultra.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\src\libdivsufsort\lib\divsufsort.c">
+      <Filter>Fichiers sources\libdivsufsort\lib</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\libdivsufsort\lib\sssort.c">
+      <Filter>Fichiers sources\libdivsufsort\lib</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\libdivsufsort\lib\trsort.c">
+      <Filter>Fichiers sources\libdivsufsort\lib</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\matchfinder.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\apultra.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\libdivsufsort\lib\divsufsort_utils.c">
+      <Filter>Fichiers sources\libdivsufsort\lib</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\shrink.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\expand.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+  </ItemGroup>
+</Project>
+\ No newline at end of file
diff --git a/tools/apultra/VS2017/apultra.vcxproj.user b/tools/apultra/VS2017/apultra.vcxproj.user
new file mode 100644
index 0000000..969fd88
--- /dev/null
+++ b/tools/apultra/VS2017/apultra.vcxproj.user
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LocalDebuggerCommand>$(TargetPath)</LocalDebuggerCommand>
+    <LocalDebuggerCommandArguments>bad.bin bad.lz3 -stats</LocalDebuggerCommandArguments>
+    <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
+    <LocalDebuggerWorkingDirectory>$(ProjectDir)..\</LocalDebuggerWorkingDirectory>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LocalDebuggerCommand>$(TargetPath)</LocalDebuggerCommand>
+    <LocalDebuggerCommandArguments>bad.bin bad.lz3 -stats</LocalDebuggerCommandArguments>
+    <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
+    <LocalDebuggerWorkingDirectory>$(ProjectDir)..\</LocalDebuggerWorkingDirectory>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LocalDebuggerCommand>$(TargetPath)</LocalDebuggerCommand>
+    <LocalDebuggerCommandArguments>bad.bin bad.lz3 -stats</LocalDebuggerCommandArguments>
+    <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
+    <LocalDebuggerWorkingDirectory>$(ProjectDir)..\</LocalDebuggerWorkingDirectory>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LocalDebuggerCommand>$(TargetPath)</LocalDebuggerCommand>
+    <LocalDebuggerCommandArguments>bad.bin bad.lz3 -stats</LocalDebuggerCommandArguments>
+    <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
+    <LocalDebuggerWorkingDirectory>$(ProjectDir)..\</LocalDebuggerWorkingDirectory>
+  </PropertyGroup>
+</Project>
+\ No newline at end of file
diff --git a/tools/apultra/asm/6502/aplib_6502.asm b/tools/apultra/asm/6502/aplib_6502.asm
new file mode 100644
index 0000000..1bc11b4
--- /dev/null
+++ b/tools/apultra/asm/6502/aplib_6502.asm
@@ -0,0 +1,257 @@
+; ***************************************************************************
+; ***************************************************************************
+;
+; aplib_6502.s
+;
+; NMOS 6502 decompressor for data stored in Jorgen Ibsen's aPLib format.
+;
+; Includes support for Emmanuel Marty's enhancements to the aPLib format.
+;
+; The code is 252 bytes long for standard format, 270 for enhanced format.
+;
+; This code is written for the ACME assembler.
+;
+; Copyright John Brandwood 2019.
+;
+; Distributed under the Boost Software License, Version 1.0.
+; (See accompanying file LICENSE_1_0.txt or copy at
+;  http://www.boost.org/LICENSE_1_0.txt)
+;
+; ***************************************************************************
+; ***************************************************************************
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; Decompression Macros
+;
+
+                ;
+                ; Macro to increment the source pointer to the next page.
+                ;
+
+                !macro  APL_INC_PAGE {                  ; Expanded by APL_GET_SRC when the lo-byte wraps.
+                        inc     <apl_srcptr + 1         ; Bump hi-byte of the source pointer.
+                }
+
+                ;
+                ; Macro to read a byte from the compressed source data.
+                ;
+
+                !macro   APL_GET_SRC {                  ; Out: A = byte read; source pointer advanced by 1.
+                lda     (apl_srcptr),y                  ; Y is used as the index (apl_decompress keeps it 0 here).
+                inc     <apl_srcptr + 0                 ; Advance lo-byte of the pointer.
+                bne     .skip                           ; No wrap: hi-byte is unchanged.
+                +APL_INC_PAGE                           ; Lo-byte wrapped to 0: step to the next page.
+.skip:
+                }
+
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; Data usage is last 9 bytes of zero-page ($F7-$FF).
+;
+
+apl_bitbuf      =       $F7                     ; 1 byte.  Tag-bit shift register.
+apl_offset      =       $F8                     ; 1 word.  Offset of the most recent match.
+apl_winptr      =       $FA                     ; 1 word.  Source address of the match being copied.
+apl_srcptr      =       $FC                     ; 1 word.  Compressed data read pointer.
+apl_dstptr      =       $FE                     ; 1 word.  Output write pointer.
+apl_length      =       apl_winptr              ; Alias: same storage as apl_winptr.
+
+
+; ***************************************************************************
+; ***************************************************************************
+;
+; apl_decompress - Decompress data stored in Jorgen Ibsen's aPLib format.
+;
+; Args: apl_srcptr = ptr to compressed data
+; Args: apl_dstptr = ptr to output buffer
+; Uses: lots!
+;
+; As an optimization, the code to handle window offsets > 64768 bytes has
+; been removed, since these don't occur with a 16-bit address range.
+;
+; As an optimization, the code to handle window offsets > 32000 bytes can
+; be commented-out, since these don't occur in typical 8-bit computer usage.
+;
+
+apl_decompress: ldy     #0                      ; Initialize source index.
+
+                lda     #$80                    ; Initialize an empty
+                sta     <apl_bitbuf             ; bit-buffer.
+
+                ;
+                ; 0 bbbbbbbb - One byte from compressed data, i.e. a "literal".
+                ;
+
+.literal:       +APL_GET_SRC                    ; A = literal byte.
+
+.write_byte:    ldx     #0                      ; LWM=0.
+
+                sta     (apl_dstptr),y          ; Write the byte directly to
+                inc     <apl_dstptr + 0         ; the output.
+                bne     .next_tag
+                inc     <apl_dstptr + 1         ; Lo-byte wrapped: next page.
+
+.next_tag:      asl     <apl_bitbuf             ; 0 bbbbbbbb
+                bne     .skip0                  ; Non-zero: C holds a tag bit.
+                jsr     .load_bit               ; Zero: buffer exhausted, refill.
+.skip0:         bcc     .literal
+
+.skip1:         asl     <apl_bitbuf             ; 1 0 <offset> <length>
+                bne     .skip2
+                jsr     .load_bit
+.skip2:         bcc     .copy_large
+
+                asl     <apl_bitbuf             ; 1 1 0 dddddddn
+                bne     .skip3
+                jsr     .load_bit
+.skip3:         bcc     .copy_normal
+
+                ; 1 1 1 dddd - Copy 1 byte within 15 bytes (or zero).
+
+.copy_short:    lda     #$10                    ; Marker bit: reaches C after 4 ROLs.
+.nibble_loop:   asl     <apl_bitbuf
+                bne     .skip4
+                pha                             ; .load_bit overwrites A.
+                jsr     .load_bit
+                pla
+.skip4:         rol                             ; Shift the offset bit into A.
+                bcc     .nibble_loop            ; Loop until the marker falls out.
+                beq     .write_byte             ; Offset=0 means write zero.
+
+                eor     #$FF                    ; Read the byte directly from
+                tay                             ; the destination window.
+                iny                             ; Y = 256 - offset.
+                dec     <apl_dstptr + 1         ; Temporarily point 256 bytes back.
+                lda     (apl_dstptr),y          ; A = byte at dstptr - offset.
+                inc     <apl_dstptr + 1         ; Restore the output pointer.
+                ldy     #0
+                beq     .write_byte             ; Always taken (Y == 0).
+
+                ;
+                ; 1 1 0 dddddddn - Copy 2 or 3 within 128 bytes.
+                ;
+
+.copy_normal:   +APL_GET_SRC                    ; 1 1 0 dddddddn
+                lsr                             ; A = offset, C = length bit.
+                beq     .finished               ; Offset 0 == EOF.
+
+                sta     <apl_offset + 0         ; Preserve offset.
+                sty     <apl_offset + 1
+                tya                             ; Y == 0.
+                tax                             ; Bits 8..15 of length.
+                adc     #2                      ; Bits 0...7 of length.
+                bne     .do_match               ; NZ from previous ADC.
+
+                ;
+                ; Subroutines for byte & bit handling.
+                ;
+
+.get_gamma:     lda     #1                      ; Get a gamma-coded value.
+.gamma_loop:    asl     <apl_bitbuf             ; Read a data bit.
+                bne     .skip5
+                pha
+                jsr     .load_bit
+                pla
+.skip5:         rol                             ; Shift it into the lo-byte (A)
+                rol     <apl_length + 1         ; and on into the hi-byte.
+                asl     <apl_bitbuf             ; Read the continuation bit.
+                bne     .skip6
+                pha
+                jsr     .load_bit
+                pla
+.skip6:         bcs     .gamma_loop             ; 1 = more data bits follow.
+
+.finished:      rts                             ; All decompressed! (Also the RTS of .get_gamma.)
+
+                ;
+                ; 1 0 <offset> <length> - gamma-coded LZSS pair.
+                ;
+
+.copy_large:    jsr     .get_gamma              ; Bits 8..15 of offset (min 2).
+                sty     <apl_length + 1         ; Clear hi-byte of length.
+
+                cpx     #1                      ; CC if LWM==0, CS if LWM==1.
+                sbc     #2                      ; -3 if LWM==0, -2 if LWM==1.
+                bcs     .normal_pair            ; CC if LWM==0 && offset==2.
+
+                jsr     .get_gamma              ; Get length (A=lo-byte & CC).
+                ldx     <apl_length + 1
+                bcc     .do_match               ; Use previous Offset.
+
+.normal_pair:   sta     <apl_offset + 1         ; Save bits 8..15 of offset.
+
+                +APL_GET_SRC
+                sta     <apl_offset + 0         ; Save bits 0...7 of offset.
+
+                jsr     .get_gamma              ; Get length (A=lo-byte & CC).
+                ldx     <apl_length + 1
+
+                ldy     <apl_offset + 1         ; If offset <    256.
+                beq     .lt256
+                cpy     #$7D                    ; If offset >= 32000, length += 2.
+                bcs     .match_plus2
+                cpy     #$05                    ; If offset >=  1280, length += 1.
+                bcs     .match_plus1
+                bcc     .do_match               ; Always taken: no adjustment.
+.lt256:         ldy     <apl_offset + 0         ; If offset <    128, length += 2.
+                bmi     .do_match               ; Offset >= 128: no adjustment.
+
+                sec                             ; aPLib gamma returns with CC.
+
+.match_plus2:   adc     #1                      ; CS, so ADC #2.
+                bcs     .match_plus256
+
+.match_plus1:   adc     #0                      ; CS, so ADC #1, or CC if fall
+                bcc     .do_match               ; through from .match_plus2.
+
+.match_plus256: inx                             ; Lo-byte overflowed: length += 256.
+
+.do_match:      eor     #$FF                    ; Negate the lo-byte of length
+                tay                             ; and check for zero.
+                iny
+                beq     .calc_addr              ; Whole pages only.
+                eor     #$FF                    ; Recover the lo-byte of length.
+
+                inx                             ; Increment # of pages to copy.
+
+                clc                             ; Calc destination for partial
+                adc     <apl_dstptr + 0         ; page.
+                sta     <apl_dstptr + 0
+                bcs     .calc_addr
+                dec     <apl_dstptr + 1         ; No carry: dstptr + length - 256 is in the previous page.
+
+.calc_addr:     sec                             ; Calc address of match.
+                lda     <apl_dstptr + 0
+                sbc     <apl_offset + 0
+                sta     <apl_winptr + 0
+                lda     <apl_dstptr + 1
+                sbc     <apl_offset + 1
+                sta     <apl_winptr + 1
+
+.copy_page:     lda     (apl_winptr),y          ; Copy until Y wraps to 0.
+                sta     (apl_dstptr),y
+                iny
+                bne     .copy_page
+                inc     <apl_winptr + 1         ; Step both pointers to the
+                inc     <apl_dstptr + 1         ; next page.
+                dex                             ; Any full pages left to copy?
+                bne     .copy_page
+
+                inx                             ; LWM=1.
+                jmp     .next_tag
+
+                ;
+                ; Subroutines for byte & bit handling.
+                ;
+
+.load_bit:      +APL_GET_SRC                    ; Reload an empty bit-buffer
+                rol                             ; from the compressed source.
+                sta     <apl_bitbuf             ; C (set by the caller's ASL) becomes the new end marker.
+                rts
diff --git a/tools/apultra/asm/6502/aplib_6502_b.asm b/tools/apultra/asm/6502/aplib_6502_b.asm
new file mode 100644
index 0000000..7963e02
--- /dev/null
+++ b/tools/apultra/asm/6502/aplib_6502_b.asm
@@ -0,0 +1,218 @@
+; -----------------------------------------------------------------------------
+; aplib_6502_b.s - fast aPLib backward decompressor for 6502 - 253 bytes
+; written for the ACME assembler
+;
+; jsr apl_decompress to unpack data backwards.
+; create backwards compressed data with apultra -b or oapack -b
+;
+; in:
+; * apl_srcptr (low and high byte) = last byte of compressed data
+; * apl_dstptr (low and high byte) = last byte of decompression buffer
+;
+; out:
+; * apl_dstptr (low and high byte) = first byte of decompressed data
+;
+;  Copyright (C) 2020 Emmanuel Marty
+;  With parts of the code inspired by John Brandwood, Peter Ferrie
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+; -----------------------------------------------------------------------------
+
+                  ; Zero page locations
+
+apl_gamma2_hi     = $F6                         ; 1 byte: high byte of gamma2 value / match length
+apl_bitbuf        = $F7                         ; 1 byte: bit queue
+apl_offset        = $F8                         ; 2 bytes: match offset (low, high)
+apl_winptr        = $FA                         ; 2 bytes: backreference read pointer
+apl_srcptr        = $FC                         ; 2 bytes: compressed data read pointer
+apl_dstptr        = $FE                         ; 2 bytes: output write pointer
+
+                  ; Read a byte from the source into A, then step the source pointer back by one. Trashes X
+
+                  !macro   APL_GET_SRC {
+                  lda (apl_srcptr),y
+                  ldx <apl_srcptr+0             ; low byte is 0: decrement will borrow
+                  bne .src_page_done
+                  dec <apl_srcptr+1             ; borrow from the high byte
+.src_page_done:   dec <apl_srcptr+0
+                  }
+
+                  ; Write the byte in A to the destination, then step the destination pointer back by one. Trashes A
+
+                  !macro   APL_PUT_DST {
+                  sta (apl_dstptr),y
+                  lda <apl_dstptr+0             ; low byte is 0: decrement will borrow
+                  bne .dst_page_done
+                  dec <apl_dstptr+1             ; borrow from the high byte
+.dst_page_done:   dec <apl_dstptr+0
+                  }
+
+                  ; Read one bit from the source into the carry, trash A
+
+                  !macro   APL_GET_BIT {
+                  asl <apl_bitbuf               ; next bit into carry; zero result means queue is empty
+                  bne .has_bits
+                  jsr apl_load_bits             ; refill the queue (overwrites A)
+.has_bits:
+                  }
+
+                  ; Read one bit from the source into the carry, preserve A
+
+                  !macro   APL_GET_BIT_SAVEA {
+                  asl <apl_bitbuf               ; next bit into carry; zero result means queue is empty
+                  bne .has_bits
+                  pha                           ; apl_load_bits overwrites A
+                  jsr apl_load_bits
+                  pla
+.has_bits:
+                  }
+
+                  ; Decompress aPLib data backwards
+
+apl_decompress:   lda #$80                      ; initialize empty bit queue
+                  sta <apl_bitbuf               ; plus bit to roll into carry
+                  ldy #$00                      ; clear Y for indirect addr
+
+.copy_literal:    +APL_GET_SRC                  ; read literal from source
+.write_literal:   +APL_PUT_DST                  ; write literal to destination
+
+                  ldx #$00                      ; clear 'follows match' flag
+
+.next_token:      +APL_GET_BIT                  ; read 'literal or match' bit
+                  bcc .copy_literal             ; if 0: literal
+
+                  +APL_GET_BIT                  ; read '8+n bits or other' bit
+                  bcc .long_match               ; if 10x: long 8+n bits match
+                                                
+                                                ; 11x: other type of match
+
+                  +APL_GET_BIT                  ; read '7+1 match or short literal' bit
+                  bcs .short_match              ; if 111: 4 bit offset for 1-byte copy
+
+                  +APL_GET_SRC                  ; read low byte of offset + length bit
+                  lsr                           ; shift offset into place, len bit into carry
+                  beq .done                     ; check for EOD
+                  sta <apl_offset+0             ; store low byte of offset
+                  sty <apl_offset+1             ; set high byte of offset to 0
+
+                  tya                           ; set A to 0
+                  sty <apl_gamma2_hi            ; set high byte of len to 0
+                  adc #$02                      ; add 2 or 3 depending on len bit in carry
+                                                ; now, low part of len is in A
+                                                ; high part of len in apl_gamma2_hi is 0
+                                                ; offset is written to apl_offset
+                  bne .got_len                  ; go copy matched bytes
+
+.long_match:      jsr .get_gamma2               ; 10: read gamma2 high offset bits in A
+                  sty <apl_gamma2_hi            ; zero out high byte of gamma2
+
+                  cpx #$01                      ; set carry if following a match (X = 1)
+                  sbc #$02                      ; subtract 3 if following literal, 2 otherwise
+                  bcs .no_repmatch              ; borrow only when following literal and value == 2
+
+                  jsr .get_gamma2               ; read repmatch length: low part in A
+                  bcc .got_len                  ; go copy large match
+                                                ; (carry is always clear after .get_gamma2)
+
+.short_match:     lda #$10                      ; clear offset, load end bit into place
+.read_short_offs: +APL_GET_BIT_SAVEA            ; read one bit of offset into carry
+                  rol                           ; shift into A, shift end bit as well
+                  bcc .read_short_offs          ; loop until end bit is shifted out into carry
+
+                  beq .write_literal            ; zero offset means write a 0
+                  tay                           ; use offset as index from the write pointer
+                  lda (apl_dstptr),y            ; load backreferenced byte
+                  ldy #$00                      ; clear Y again
+                  beq .write_literal            ; go write byte to destination
+
+.get_gamma2:      lda #$01                      ; 1 so it gets shifted to 2
+.gamma2_loop:     +APL_GET_BIT_SAVEA            ; read data bit
+                  rol                           ; shift into low byte
+                  rol <apl_gamma2_hi            ; shift into high byte
+                  +APL_GET_BIT_SAVEA            ; read continuation bit
+                  bcs .gamma2_loop              ; loop until a zero continuation bit is read
+.done:            rts                           ; shared return: end of .get_gamma2 and EOD exit
+
+.no_repmatch:     sta <apl_offset+1             ; write high byte of offset
+                  +APL_GET_SRC                  ; read low byte of offset from source
+                  sta <apl_offset+0             ; store low byte of offset
+
+                  jsr .get_gamma2               ; read match length: low part in A
+
+                  ldx <apl_offset+1             ; high offset byte is zero?
+                  beq .offset_1byte             ; if so, offset < 256
+
+                                                ; offset is >= 256.
+
+                  cpx #$7d                      ; offset >= 32000 (7d00) ?
+                  bcs .offset_incby2            ; if so, increase match len by 2
+                  cpx #$05                      ; offset >= 1280 (0500) ?
+                  bcs .offset_incby1            ; if so, increase match len by 1
+                  bcc .got_len                  ; length is fine, go copy
+
+.offset_1byte:    ldx <apl_offset+0             ; offset >= 128 ?
+                  bmi .got_len                  ; if so, length is fine; otherwise increase match len by 2
+                  sec                           ; carry must be set below
+
+.offset_incby2:   adc #$01                      ; add 1 + set carry (from bcs or sec)
+                  bcs .len_inchi                ; go add 256 to len if overflow
+
+                                                ; carry clear: fall through for no-op
+
+.offset_incby1:   adc #$00                      ; add 1 + carry
+                  bcc .got_len
+.len_inchi:       inc <apl_gamma2_hi            ; add 256 to len if low byte overflows
+
+.got_len:         tax                           ; transfer low byte of len into X
+                  beq .add_offset               ; low byte 0: high byte already counts the passes
+                  inc <apl_gamma2_hi            ; otherwise one extra pass for the partial count
+
+.add_offset:      clc                           ; add dest + match offset
+                  lda <apl_dstptr+0             ; low 8 bits
+                  adc <apl_offset+0
+                  sta <apl_winptr+0             ; store back reference address
+                  lda <apl_dstptr+1             ; high 8 bits
+                  adc <apl_offset+1
+                  sta <apl_winptr+1             ; store high 8 bits of address
+
+.copy_match_loop: lda (apl_winptr),y            ; read one byte of backreference
+                  +APL_PUT_DST                  ; write byte to destination
+
+                  lda <apl_winptr+0             ; decrement backreference address
+                  bne .backref_page_done
+                  dec <apl_winptr+1
+.backref_page_done:
+                  dec <apl_winptr+0
+
+                  dex                           ; loop to copy all matched bytes
+                  bne .copy_match_loop
+                  dec <apl_gamma2_hi
+                  bne .copy_match_loop
+
+                                                ; X is 0 when exiting the loop above
+                  inx                           ; set 'follows match' flag
+                  jmp .next_token               ; go decode next token
+
+apl_load_bits:    lda (apl_srcptr),y            ; read 8 bits from source
+                  rol                           ; shift bit queue, and high bit into carry
+                  sta <apl_bitbuf               ; save bit queue
+                  
+                  lda <apl_srcptr+0             ; decrement source pointer (lda/sta/dec leave carry intact)
+                  bne .bits_page_done
+                  dec <apl_srcptr+1
+.bits_page_done:  dec <apl_srcptr+0
+                  rts
diff --git a/tools/apultra/asm/68000/unaplib_68000.S b/tools/apultra/asm/68000/unaplib_68000.S
new file mode 100644
index 0000000..a60ae32
--- /dev/null
+++ b/tools/apultra/asm/68000/unaplib_68000.S
@@ -0,0 +1,117 @@
+;  unaplib_68000.s - aPLib decompressor for 68000 - 154 bytes
+;
+;  in:  a0 = start of compressed data
+;       a1 = start of decompression buffer
+;  out: d0 = decompressed size
+;
+;  Copyright (C) 2020 Emmanuel Marty
+;  With parts of the code inspired by Franck "hitchhikr" Charlet
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+apl_decompress:
+               movem.l a2-a6/d2-d3,-(sp) ; preserve every register used as scratch
+
+               moveq #-128,d1       ; initialize empty bit queue
+                                    ; plus bit to roll into carry
+               lea 32000.w,a2       ; load 32000 offset constant
+               lea 1280.w,a3        ; load 1280 offset constant
+               lea 128.w,a4         ; load 128 offset constant
+               move.l a1,a5         ; save start of decompression buffer (for size)
+
+.literal:      move.b (a0)+,(a1)+   ; copy literal byte
+.after_lit:    moveq #3,d2          ; follows literal: rep-match test subtracts 3
+
+.next_token:   bsr.s .get_bit       ; read 'literal or match' bit
+               bcc.s .literal       ; if 0: literal
+
+               bsr.s .get_bit       ; read '8+n bits or other type' bit
+               bcs.s .other_match   ; if 11x: other type of match
+
+               bsr.s .get_gamma2    ; 10: read gamma2-coded high offset bits
+               sub.l d2,d0          ; high offset bits == 2 when d2 == 3 ?
+               bcc.s .no_repmatch   ; if not, not a rep-match
+
+               bsr.s .get_gamma2    ; read repmatch length
+               bra.s .got_len       ; go copy large match (offset still in d3)
+
+.no_repmatch:  lsl.l #8,d0          ; shift high offset bits into place
+               move.b (a0)+,d0      ; read low offset byte
+               move.l d0,d3         ; copy offset into d3
+
+               bsr.s .get_gamma2    ; read match length
+               cmp.l a2,d3          ; offset >= 32000 ?
+               bge.s .inc_by_2      ; if so, increase match len by 2
+               cmp.l a3,d3          ; offset >= 1280 ?
+               bge.s .inc_by_1      ; if so, increase match len by 1
+               cmp.l a4,d3          ; offset >= 128 ?
+               bge.s .got_len       ; if so, len is final; else (< 128) add 2
+.inc_by_2:     addq.l #1,d0         ; increase match len by 1
+.inc_by_1:     addq.l #1,d0         ; increase match len by 1
+
+.got_len:      move.l a1,a6         ; calculate backreference address
+               sub.l d3,a6          ; (dest - match offset)
+               subq.l #1,d0         ; dbf will loop until d0 is -1, not 0
+.copy_match:   move.b (a6)+,(a1)+   ; copy matched byte
+               dbf d0,.copy_match   ; loop for all matched bytes
+               moveq #2,d2          ; follows match: rep-match test subtracts 2
+               bra.s .next_token    ; go decode next token
+
+.other_match:  bsr.s .get_bit       ; read '7+1 match or short literal' bit
+               bcs.s .short_match   ; if 111: 4 bit offset for 1-byte copy
+
+               moveq #1,d0          ; 110: prepare match length
+               moveq #0,d3          ; clear high bits of offset
+               move.b (a0)+,d3      ; read low bits of offset + length bit
+               lsr.b #1,d3          ; shift offset into place, len into carry
+               beq.s .done          ; check for EOD
+               addx.b d0,d0         ; len = (1 << 1) + carry bit, ie. 2 or 3
+               bra.s .got_len       ; go copy match
+
+.short_match:  moveq #0,d0          ; clear short offset before reading 4 bits
+               bsr.s .get_dibits    ; read a data bit into d0, one into carry
+               addx.b d0,d0         ; shift second bit into d0
+               bsr.s .get_dibits    ; read a data bit into d0, one into carry
+               roxl.b #1,d0         ; shift 4th bit in; unlike addx, sets Z on zero
+               beq.s .write_zero    ; if offset is zero, write a 0
+
+               move.l a1,a6         ; calculate backreference address
+               sub.l d0,a6          ; (dest - short offset)
+               move.b (a6),d0       ; read matched byte
+.write_zero:   move.b d0,(a1)+      ; write matched byte or 0
+               bra.s .after_lit     ; treat as literal and go decode next token
+
+.done:         move.l a1,d0         ; pointer to last decompressed byte + 1
+               sub.l a5,d0          ; minus start of decompression buffer = size
+               movem.l (sp)+,a2-a6/d2-d3
+               rts
+
+.get_gamma2:   moveq #1,d0          ; init to 1 so it gets shifted to 2 below
+.gamma2_loop:  bsr.s .get_dibits    ; read data bit, shift into d0
+                                    ; and read continuation bit
+               bcs.s .gamma2_loop   ; loop until a 0 continuation bit is read
+               rts
+
+.get_dibits:   bsr.s .get_bit       ; read bit
+               addx.l d0,d0         ; shift into d0
+                                    ; fall through
+.get_bit:      add.b d1,d1          ; shift bit queue, high bit into carry
+               bne.s .got_bit       ; queue not empty, bits remain
+               move.b (a0)+,d1      ; read 8 new bits
+               addx.b d1,d1         ; shift bit queue, high bit into carry
+                                    ; and shift 1 from carry into bit queue
+.got_bit:      rts
diff --git a/tools/apultra/asm/6809/unaplib.s b/tools/apultra/asm/6809/unaplib.s
new file mode 100644
index 0000000..641c3f4
--- /dev/null
+++ b/tools/apultra/asm/6809/unaplib.s
@@ -0,0 +1,125 @@
+;  unaplib.s - aPLib decompressor for 6809 - 157 bytes
+;
+;  in:  x = start of compressed data
+;       y = start of decompression buffer
+;  out: y = end of decompression buffer + 1
+;
+;  Copyright (C) 2020 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+apl_decompress
+         lda #$80          ; initialize empty bit queue
+         sta <apbitbuf,pcr ; plus bit to roll into carry
+         leau ,x           ; U = compressed data read pointer (X is reused below)
+
+apcplit  ldb ,u+           ; copy literal byte
+apwtlit  stb ,y+
+
+         lda #$03          ; set 'follows literal' flag
+
+aptoken  bsr apgetbit      ; read 'literal or match' bit
+         bcc apcplit       ; if 0: literal
+
+         bsr apgetbit      ; read '8+n bits or other type' bit
+         bcs apother       ; if 11x: other type of match
+
+         sta <aplwm+2,pcr  ; store 'follows literal' flag (patches low byte of the subd operand)
+
+         bsr apgamma2      ; 10: read gamma2-coded high offset bits
+aplwm    subd #$0000       ; high offset bits == 2 when follows_literal == 3 ?
+         bcc apnorep       ; if not, not a rep-match
+
+         bsr apgamma2      ; read repmatch length
+         bra apgotlen      ; go copy large match
+
+apnorep  tfr b,a           ; transfer high offset bits to A
+         ldb ,u+           ; read low offset byte in B
+         std <aprepof+1,pcr ; store match offset (patches the ldd operand at aprepof)
+         tfr d,x           ; transfer offset to X
+
+         bsr apgamma2      ; read match length
+
+         cmpx #$7D00       ; offset >= 32000 ?
+         bge apincby2      ; if so, increase match len by 2 (NOTE(review): bge is signed; offsets >= $8000 fail all three tests and still reach apincby2 - confirm intended)
+         cmpx #$0500       ; offset >= 1280 ?
+         bge apincby1      ; if so, increase match len by 1
+         cmpx #$80         ; offset >= 128 ?
+         bge apgotlen      ; if so, len is final; otherwise (offset < 128) add 2
+apincby2 addd #1           ; increase match len by 1
+apincby1 addd #1           ; increase match len by 1
+apgotlen pshs u            ; save source compressed data pointer
+         tfr d,x           ; copy match length to X
+
+aprepof  ldd #$aaaa        ; load match offset (operand is overwritten at run time)
+         nega              ; reverse sign of offset in D
+         negb
+         sbca #0           ; propagate the borrow from negb into A
+         leau d,y          ; put backreference start address in U (dst+offset)
+
+apcpymt  lda ,u+           ; copy matched byte
+         sta ,y+
+         leax -1,x         ; decrement X
+         bne apcpymt       ; loop until all matched bytes are copied
+
+         puls u            ; restore source compressed data pointer
+
+         lda #$02          ; clear 'follows literal' flag
+         bra aptoken
+
+apdibits bsr apgetbit      ; read bit
+         rolb              ; push into B
+apgetbit lsl <apbitbuf,pcr ; shift bit queue, and high bit into carry
+         bne apdone        ; queue not empty, bits remain
+         pshs a            ; save reg A
+         lda ,u+           ; read 8 new bits
+         rola              ; shift bit queue, and high bit into carry
+         sta <apbitbuf,pcr ; save bit queue
+         puls a,pc         ; pop reg A and return
+
+apbitbuf fcb $00           ; bit queue
+
+apshort  clrb
+         bsr apdibits      ; read 2 offset bits
+         rolb
+         bsr apdibits      ; read 4 offset bits
+         rolb
+         beq apwtlit       ; go write zero
+
+         negb              ; negate offset in B (used as the index below)
+         ldb b,y           ; load backreferenced byte from dst+offset
+         bra apwtlit       ; go write backreferenced byte
+
+apgamma2 ldd #$1           ; init to 1 so it gets shifted to 2 below
+apg2loop bsr apgetbit      ; read data bit
+         rolb              ; shift into D
+         rola
+         bsr apgetbit      ; read continuation bit
+         bcs apg2loop      ; loop until a zero continuation bit is read
+apdone   rts               ; shared return: apgetbit, apgamma2 and the EOD exit
+
+apother  bsr apgetbit      ; read '7+1 match or short literal' bit
+         bcs apshort       ; if 111: 4 bit offset for 1-byte copy
+
+         ldb ,u+           ; read low bits of offset + length bit in B
+         beq apdone        ; check for EOD
+         clra              ; clear high bits in A
+         lsrb              ; shift offset in place, shift length bit into carry
+         std <aprepof+1,pcr ; store match offset
+         ldb #$01          ; len in B will be 2*1+carry:
+         rolb              ; shift length, and carry into B
+         bra apgotlen      ; go copy match
diff --git a/tools/apultra/asm/6809/unaplib_6309.s b/tools/apultra/asm/6809/unaplib_6309.s
new file mode 100644
index 0000000..9e8ed71
--- /dev/null
+++ b/tools/apultra/asm/6809/unaplib_6309.s
@@ -0,0 +1,139 @@
+;  unaplib_6309.s - aPLib decompressor for H6309 - 131 bytes
+;
+;  in:  x = start of compressed data
+;       y = start of decompression buffer
+;  out: y = end of decompression buffer + 1
+;
+;  Copyright (C) 2020 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+
+; Original M6809 version written by Emmanuel Marty with Hitachi 6309 enhancements
+; added by Doug Masten.
+;
+; Main advantage of H6309 CPU is the "TFM" instruction which can copy one
+; byte of memory in 3 clock cycles vs a traditional copy loop that takes
+; 20 clock cycles.
+
+; Options:
+;   APLIB_VAR
+;     Define variable to point to a DP memory location for a memory space
+;     and speed optimization.
+;     ex. APLIB_VAR equ <memory location>
+;
+;   APLIB_LONG_OFFSET_DISABLE
+;     Define this variable to disable long offsets >= 32000 for a speed and space
+;     optimization. Only enable this if you know what you are doing.
+;     ex. APLIB_LONG_OFFSET_DISABLE equ 1
+
+
+; define options
+         ifdef APLIB_VAR
+apbitbuf equ APLIB_VAR     ; bit queue (use DP memory for mem & space optimization)
+         else
+apbitbuf fcb 0             ; bit queue (DEFAULT - use extended memory)
+         endc
+
+
+apl_decompress
+         lda #$80          ; initialize empty bit queue
+         sta apbitbuf      ; plus bit to roll into carry
+         tfr x,u           ; U = compressed data read pointer (X is reused for the match offset)
+
+apcplit  ldb ,u+           ; copy literal byte
+apwtlit  stb ,y+           ; write byte in B to output (apshort also enters here)
+
+         ldb #3            ; set 'follows literal' flag (B = 3)
+
+aptoken  bsr apgetbit      ; read 'literal or match' bit
+         bcc apcplit       ; if 0: literal
+
+         bsr apgetbit      ; read '8+n bits or other type' bit
+         bcs apother       ; if 11x: other type of match
+
+         bsr apgamma2      ; 10: read gamma2-coded high offset bits
+         clra              ; D = 'follows literal' flag (A cleared, flag in B)
+         subr d,w          ; W = gamma2 - flag; borrows only for gamma2 == 2 with flag == 3
+         bcc apnorep       ; no borrow: not a rep-match, W holds the high offset bits
+
+         bsr apgamma2      ; read repmatch length
+         bra apgotlen      ; go copy match, reusing the offset still held in X
+
+apnorep  tfr f,a           ; transfer high offset bits (low byte of W) to A
+         ldb ,u+           ; read low offset byte in B
+         tfr d,x           ; save match offset
+
+         bsr apgamma2      ; read match length
+
+         ifndef APLIB_LONG_OFFSET_DISABLE
+         cmpx #$7D00       ; offset >= 32000 ?
+         bge apincby2      ; if so, increase match len by 2
+         endc
+         cmpx #$0500       ; offset >= 1280 ?
+         bge apincby1      ; if so, increase match len by 1
+         cmpx #$80         ; offset < 128 ?
+         bge apgotlen      ; if not, len is final; else fall through and add 2 (bge is signed: offsets >= $8000 also land here)
+apincby2 incw              ; length + 1 (falls through for a total of + 2)
+apincby1 incw              ; length + 1
+
+apgotlen tfr y,d           ; transfer dst to D
+         subr x,d          ; put backreference start address in D (dst - offset)
+         tfm d+,y+         ; copy matched bytes (count is in W)
+
+         ldb #2            ; clear 'follows literal' flag (B = 2)
+         bra aptoken
+
+apgamma2 ldw #1            ; init to 1 so it gets shifted to 2 below
+loop@    bsr apgetbit      ; read data bit
+         rolw              ; shift into W
+         bsr apgetbit      ; read continuation bit
+         bcs loop@         ; loop until a zero continuation bit is read
+         rts               ; gamma2 value (always >= 2) is returned in W
+
+apdibits bsr apgetbit      ; read first bit of a pair
+         rolb              ; push into B, then fall through to read the second bit
+apgetbit lsl apbitbuf      ; shift bit queue, and high bit into carry
+         bne aprts         ; queue not empty, bits remain: return with bit in carry
+         lda ,u+           ; read 8 new bits (A is clobbered)
+         rola              ; shift bit queue, and high bit into carry
+         sta apbitbuf      ; save bit queue
+aprts    rts               ; shared return, also used by apother on end of data
+
+apshort  clrb              ; start with a zero offset
+         bsr apdibits      ; read 2 offset bits
+         rolb              ; shift second bit of the pair into B
+         bsr apdibits      ; read 2 more offset bits (4 in total)
+         rolb              ; shift last bit into B; Z is set if the offset is 0
+         beq apwtlit       ; if zero, go write it
+
+         negb              ; negate offset in B
+         ldb b,y           ; load backreferenced byte from y plus the negated offset
+         bra apwtlit       ; go write it
+
+apother  bsr apgetbit      ; read '7+1 match or short literal' bit
+         bcs apshort       ; if 111: 4 bit offset for 1-byte copy
+
+         ldb ,u+           ; read low bits of offset + length bit in B
+         beq aprts         ; check for EOD and exit if so
+         clra              ; clear high bits in A
+         lsrb              ; shift offset in place, shift length bit into carry
+         tfr d,x           ; save match offset
+         ldb #1            ; len in B will be 2*1+carry:
+         rolb              ; shift length, and carry into B
+         tfr d,w           ; copy length (2 or 3, A is still 0) to W for the copy
+         bra apgotlen      ; go copy match
diff --git a/tools/apultra/asm/6809/unaplib_6309_b.s b/tools/apultra/asm/6809/unaplib_6309_b.s
new file mode 100644
index 0000000..8343edf
--- /dev/null
+++ b/tools/apultra/asm/6809/unaplib_6309_b.s
@@ -0,0 +1,143 @@
+;  unaplib_6309_b.s - aPLib backward decompressor for H6309 - 139 bytes
+;
+;  in:  x = last byte of compressed data
+;       y = last byte of decompression buffer
+;  out: y = first byte of decompressed data
+;
+;  Copyright (C) 2020 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+
+; Original M6809 version written by Emmanuel Marty with Hitachi 6309 enhancements
+; added by Doug Masten.
+;
+; Main advantage of H6309 CPU is the "TFM" instruction which can copy one
+; byte of memory in 3 clock cycles vs a traditional copy loop that takes
+; 20 clock cycles.
+
+; Options:
+;   APLIB_VAR
+;     Define variable to point to a DP memory location for a memory space
+;     and speed optimization.
+;     ex. APLIB_VAR equ <memory location>
+;
+;   APLIB_LONG_OFFSET_DISABLE
+;     Define this variable to disable long offsets >= 32000 for a speed and space
+;     optimization. Only enable this if you know what you are doing.
+;     ex. APLIB_LONG_OFFSET_DISABLE equ 1
+
+
+; define options
+         ifdef APLIB_VAR
+apbitbuf equ APLIB_VAR     ; bit queue (use DP memory for mem & space optimization)
+         else
+apbitbuf fcb 0             ; bit queue (DEFAULT - use extended memory)
+         endc
+
+
+apl_decompress
+         lda #$80          ; initialize empty bit queue
+         sta apbitbuf      ; plus bit to roll into carry
+         leau 1,x          ; U = read pointer, one past the last compressed byte (reads pre-decrement)
+         leay 1,y          ; Y = one past the last output byte (writes pre-decrement)
+
+apcplit  ldb ,-u           ; copy literal byte
+apwtlit  stb ,-y           ; write byte in B to output (apshort also enters here)
+
+         ldb #3            ; set 'follows literal' flag (B = 3)
+
+aptoken  bsr apgetbit      ; read 'literal or match' bit
+         bcc apcplit       ; if 0: literal
+
+         bsr apgetbit      ; read '8+n bits or other type' bit
+         bcs apother       ; if 11x: other type of match
+
+         bsr apgamma2      ; 10: read gamma2-coded high offset bits
+         clra              ; D = 'follows literal' flag (A cleared, flag in B)
+         subr d,w          ; W = gamma2 - flag; borrows only for gamma2 == 2 with flag == 3
+         bcc apnorep       ; no borrow: not a rep-match, W holds the high offset bits
+
+         bsr apgamma2      ; read repmatch length
+         bra apgotlen      ; go copy match, reusing the offset still held in X
+
+apnorep  tfr f,a           ; transfer high offset bits (low byte of W) to A
+         ldb ,-u           ; read low offset byte in B
+         tfr d,x           ; save match offset
+
+         bsr apgamma2      ; read match length
+
+         ifndef APLIB_LONG_OFFSET_DISABLE
+         cmpx #$7D00       ; offset >= 32000 ?
+         bge apincby2      ; if so, increase match len by 2
+         endc
+         cmpx #$0500       ; offset >= 1280 ?
+         bge apincby1      ; if so, increase match len by 1
+         cmpx #$80         ; offset < 128 ?
+         bge apgotlen      ; if not, len is final; else fall through and add 2 (bge is signed: offsets >= $8000 also land here)
+apincby2 incw              ; length + 1 (falls through for a total of + 2)
+apincby1 incw              ; length + 1
+
+apgotlen tfr y,d           ; transfer dst to D
+         addr x,d          ; put backreference start address in D (dst + offset)
+         decd              ; step back to the first source byte (Y is one above the next write)
+         leay -1,y         ; point Y at the first destination byte
+         tfm d-,y-         ; copy matched bytes downwards (count is in W)
+         leay 1,y          ; Y back onto the last byte written
+
+         ldb #2            ; clear 'follows literal' flag (B = 2)
+         bra aptoken
+
+apgamma2 ldw #1            ; init to 1 so it gets shifted to 2 below
+loop@    bsr apgetbit      ; read data bit
+         rolw              ; shift into W
+         bsr apgetbit      ; read continuation bit
+         bcs loop@         ; loop until a zero continuation bit is read
+         rts               ; gamma2 value (always >= 2) is returned in W
+
+apdibits bsr apgetbit      ; read first bit of a pair
+         rolb              ; push into B, then fall through to read the second bit
+apgetbit lsl apbitbuf      ; shift bit queue, and high bit into carry
+         bne aprts         ; queue not empty, bits remain: return with bit in carry
+         lda ,-u           ; read 8 new bits (A is clobbered)
+         rola              ; shift bit queue, and high bit into carry
+         sta apbitbuf      ; save bit queue
+aprts    rts               ; shared return, also used by apother on end of data
+
+apshort  clrb              ; start with a zero offset
+         bsr apdibits      ; read 2 offset bits
+         rolb              ; shift second bit of the pair into B
+         bsr apdibits      ; read 2 more offset bits (4 in total)
+         rolb              ; shift last bit into B; Z is set if the offset is 0
+         beq apwtlit       ; if zero, go write it
+
+         decb              ; we load below without predecrement, adjust here
+         ldb b,y           ; load backreferenced byte from dst+offset
+         bra apwtlit       ; go write it
+
+apother  bsr apgetbit      ; read '7+1 match or short literal' bit
+         bcs apshort       ; if 111: 4 bit offset for 1-byte copy
+
+         ldb ,-u           ; read low bits of offset + length bit in B
+         beq aprts         ; check for EOD and exit if so
+         clra              ; clear high bits in A
+         lsrb              ; shift offset in place, shift length bit into carry
+         tfr d,x           ; save match offset
+         ldb #1            ; len in B will be 2*1+carry:
+         rolb              ; shift length, and carry into B
+         tfr d,w           ; copy length (2 or 3, A is still 0) to W for the copy
+         bra apgotlen      ; go copy match
diff --git a/tools/apultra/asm/6809/unaplib_b.s b/tools/apultra/asm/6809/unaplib_b.s
new file mode 100644
index 0000000..02f943c
--- /dev/null
+++ b/tools/apultra/asm/6809/unaplib_b.s
@@ -0,0 +1,122 @@
+;  unaplib_b.s - aPLib backward decompressor for 6809 - 154 bytes
+;
+;  in:  x = last byte of compressed data
+;       y = last byte of decompression buffer
+;  out: y = first byte of decompressed data
+;
+;  Copyright (C) 2020 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+apl_decompress
+         lda #$80          ; initialize empty bit queue
+         sta <apbitbuf,pcr ; plus bit to roll into carry
+         leau 1,x          ; U = read pointer, one past the last compressed byte (reads pre-decrement)
+         leay 1,y          ; Y = one past the last output byte (writes pre-decrement)
+
+apcplit  ldb ,-u           ; copy literal byte
+apwtlit  stb ,-y           ; write byte in B to output (apshort also enters here)
+
+         lda #$03          ; set 'follows literal' flag (A = 3)
+
+aptoken  bsr apgetbit      ; read 'literal or match' bit
+         bcc apcplit       ; if 0: literal
+
+         bsr apgetbit      ; read '8+n bits or other type' bit
+         bcs apother       ; if 11x: other type of match
+
+         sta <aplwm+2,pcr  ; store 'follows literal' flag into the subd operand below (self-modifying)
+
+         bsr apgamma2      ; 10: read gamma2-coded high offset bits
+aplwm    subd #$0000       ; high offset bits == 2 when follows_literal == 3 ? (operand patched above)
+         bcc apnorep       ; if not, not a rep-match
+
+         bsr apgamma2      ; read repmatch length
+         bra apgotlen      ; go copy match, reusing the offset already patched into aprepof
+
+apnorep  tfr b,a           ; transfer high offset bits to A
+         ldb ,-u           ; read low offset byte in B
+         std <aprepof+2,pcr ; store match offset into the leau operand at aprepof (self-modifying)
+         tfr d,x           ; transfer offset to X
+
+         bsr apgamma2      ; read match length
+
+         cmpx #$7D00       ; offset >= 32000 ?
+         bge apincby2      ; if so, increase match len by 2
+         cmpx #$0500       ; offset >= 1280 ?
+         bge apincby1      ; if so, increase match len by 1
+         cmpx #$80         ; offset < 128 ?
+         bge apgotlen      ; if not, len is final; else fall through and add 2 (bge is signed: offsets >= $8000 also land here)
+apincby2 addd #1           ; length + 1 (falls through for a total of + 2)
+apincby1 addd #1           ; length + 1
+apgotlen pshs u            ; save source compressed data pointer
+         tfr d,x           ; copy match length to X
+
+aprepof  leau $aaaa,y      ; put backreference start address in U (dst+offset); $aaaa is patched at run time
+
+apcpymt  lda ,-u           ; copy matched byte
+         sta ,-y
+         leax -1,x         ; decrement X
+         bne apcpymt       ; loop until all matched bytes are copied
+
+         puls u            ; restore source compressed data pointer
+
+         lda #$02          ; clear 'follows literal' flag (A = 2)
+         bra aptoken
+
+apdibits bsr apgetbit      ; read first bit of a pair
+         rolb              ; push into B, then fall through to read the second bit
+apgetbit lsl <apbitbuf,pcr ; shift bit queue, and high bit into carry
+         bne apdone        ; queue not empty, bits remain: return with bit in carry
+         pshs a            ; push reg A (it holds the 'follows literal' flag in the main loop)
+         lda ,-u           ; read 8 new bits
+         rola              ; shift bit queue, and high bit into carry
+         sta <apbitbuf,pcr ; save bit queue
+         puls a,pc         ; pop reg A and return
+
+apbitbuf fcb $00           ; bit queue
+
+apshort  clrb              ; start with a zero offset
+         bsr apdibits      ; read 2 offset bits
+         rolb              ; shift second bit of the pair into B
+         bsr apdibits      ; read 2 more offset bits (4 in total)
+         rolb              ; shift last bit into B; Z is set if the offset is 0
+         beq apwtlit       ; go write a zero
+
+         decb              ; we load below without predecrement, adjust here
+         ldb b,y           ; load backreferenced byte from dst+offset
+         bra apwtlit       ; go write backreferenced byte
+
+apgamma2 ldd #$1           ; init to 1 so it gets shifted to 2 below
+apg2loop bsr apgetbit      ; read data bit
+         rolb              ; shift into D (low byte first)
+         rola              ; carry out of B goes into the high byte
+         bsr apgetbit      ; read continuation bit
+         bcs apg2loop      ; loop until a zero continuation bit is read
+apdone   rts               ; shared return, also used by apgetbit and apother
+
+apother  bsr apgetbit      ; read '7+1 match or short literal' bit
+         bcs apshort       ; if 111: 4 bit offset for 1-byte copy
+
+         ldb ,-u           ; read low bits of offset + length bit in B
+         beq apdone        ; a zero byte marks end of data: return
+         clra              ; clear high bits in A
+         lsrb              ; shift offset in place, shift length bit into carry
+         std <aprepof+2,pcr ; store match offset into the leau operand at aprepof (self-modifying)
+         ldb #$01          ; len in B will be 2*1+carry:
+         rolb              ; shift length, and carry into B (A is still 0, so D = 2 or 3)
+         bra apgotlen      ; go copy match
diff --git a/tools/apultra/asm/8088/aplib_8088_fast.S b/tools/apultra/asm/8088/aplib_8088_fast.S
new file mode 100644
index 0000000..c535234
--- /dev/null
+++ b/tools/apultra/asm/8088/aplib_8088_fast.S
@@ -0,0 +1,178 @@
+;  aplib_8088_fast.S - speed-optimized aPLib decompressor for 8088 - 188 bytes
+;
+;  Copyright (C) 2019 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+        segment .text
+        bits 16
+
+;  ---------------------------------------------------------------------------
+;  Decompress aPLib data
+;  inputs:
+;  * ds:si: compressed aPLib data
+;  * es:di: output buffer
+;  output:
+;  * ax:    decompressed size
+;  ---------------------------------------------------------------------------
+
+%macro apl_get_bit 0            ; read next bit of the bit queue in al into carry
+        add     al,al           ; shift bit queue, and high bit into carry
+        jnz     %%gotbit        ; queue not empty, bits remain
+        lodsb                   ; read 8 new bits
+        adc     al,al           ; shift bit queue, and high bit into carry; carry-in becomes the new end marker
+%%gotbit:
+%endmacro
+
+apl_decompress:
+        push    di              ; remember decompression offset
+        cld                     ; make string operations go forward
+
+        ; === register map ===
+        ; al: bit queue
+        ; ah: unused, but value is trashed
+        ; bx: follows_literal
+        ; cx: scratch register for reading gamma2 codes and storing copy length
+        ; dx: match offset (and rep-offset)
+        ; si: input (compressed data) pointer
+        ; di: output (decompressed data) pointer
+        ; bp: holds si while a match is copied, trashed
+
+        mov     al,080H         ; clear bit queue(al) and set high bit to move into carry
+        xor     dx,dx           ; invalidate rep offset
+
+.literal:
+        movsb                   ; read and write literal byte
+.next_command_after_literal:
+        mov     bx,03H          ; set follows_literal(bx) to 3
+
+.next_command:
+        apl_get_bit             ; read 'literal or match' bit
+        jnc     .literal        ; if 0: literal
+                                
+                                ; 1x: match
+
+        apl_get_bit             ; read '8+n bits or other type' bit
+        jc      .other          ; 11x: other type of match
+
+                                ; 10: 8+n bits match
+        call    .get_gamma2     ; read gamma2-coded high offset bits
+        sub     cx,bx           ; high offset bits == 2 when follows_literal == 3 ?
+                                ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+                                ; is == 2 will never borrow; only 2 - 3 sets carry and means rep-match)
+        jae     .not_repmatch   ; if not, not a rep-match
+
+        call    .get_gamma2     ; read match length
+        jmp     short .got_len  ; go copy, reusing the rep-offset still in dx
+
+.not_repmatch:
+        mov     dh,cl           ; transfer high offset bits to dh
+        mov     dl,[si]         ; read low offset byte in dl
+        inc     si
+
+        call    .get_gamma2     ; read match length
+        cmp     dh,07DH         ; offset >= 32000 ? (07DH is the high byte of 07D00H)
+        jae     .increase_len_by2 ; if so, increase match len by 2
+        cmp     dh,05H          ; offset >= 1280 ? (05H is the high byte of 0500H)
+        jae     .increase_len_by1 ; if so, increase match len by 1
+        cmp     dx,0080H        ; offset < 128 ?
+        jae     .got_len        ; if not, len is final; if so, fall through and add 2 (otherwise it would be a 7+1 copy)
+.increase_len_by2:
+        inc     cx              ; increase length
+.increase_len_by1:
+        inc     cx              ; increase length
+
+        ; copy cx bytes from match offset dx
+
+.got_len:
+        push    ds              ; save ds:si (current pointer to compressed data)
+        mov     bp,si
+
+        push    es
+        pop     ds
+        mov     si,di           ; point to destination in es:di - offset in dx
+        sub     si,dx
+        rep     movsb           ; copy matched bytes
+
+        mov     si,bp           ; restore ds:si
+        pop     ds
+
+        mov     bl,02H          ; set follows_literal to 2 (bx is unmodified by match commands)
+        jmp     short .next_command
+
+        ; read gamma2-coded value into cx
+
+.get_gamma2:
+        xor     cx,cx           ; initialize to 1 so that value will start at 2
+        inc     cx              ; when shifted left in the adc below
+
+.gamma2_loop:
+        apl_get_bit             ; read data bit
+        adc     cx,cx           ; shift into cx
+        apl_get_bit             ; read continuation bit
+        jc      .gamma2_loop    ; loop until a zero continuation bit is read
+
+        ret
+
+        ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+
+.other:
+        xor     cx,cx           ; cx = 0: base for the length or the 4-bit offset built below
+        apl_get_bit             ; read '7+1 match or short literal' bit
+        jc      .short_literal  ; 111: 4 bit offset for 1-byte copy
+
+                                ; 110: 7 bits offset + 1 bit length
+                                
+        mov     dl,[si]         ; read offset + length in dl
+        inc     si
+
+        inc     cx              ; prepare cx for length below
+        shr     dl,1            ; shift len bit into carry, and offset in place
+        je      .done           ; if zero offset: EOD
+        adc     cx,cx           ; len in cx: 1*2 + carry bit = 2 or 3
+
+        xor     dh,dh           ; clear high bits of offset
+        jmp     short .got_len
+
+        ; 4 bits offset / 1 byte copy
+
+.short_literal:
+        apl_get_bit             ; read 4 offset bits
+        adc     cl,cl
+        apl_get_bit
+        adc     cl,cl
+        apl_get_bit
+        adc     cl,cl
+        apl_get_bit
+        adc     cl,cl           ; last shift leaves ZF set when the offset is 0
+        xchg    ax,cx           ; preserve bit queue in cx, put offset in ax (xchg leaves flags untouched)
+        jz      .write_zero     ; if offset is 0, write a zero byte
+
+                                ; short offset 1-15
+        mov     bx,di           ; point to destination in es:di - offset in ax
+        sub     bx,ax           ; we trash bx, it will be reset to 3 when we loop
+        mov     al,[es:bx]      ; read byte from short offset
+.write_zero:
+        stosb                   ; copy matched byte
+        mov     ax,cx           ; restore bit queue in al
+        jmp     .next_command_after_literal
+
+.done:
+        pop     ax              ; retrieve the original decompression offset
+        xchg    di,ax           ; compute decompressed size
+        sub     ax,di           ; ax = end of output - start of output
+        ret
diff --git a/tools/apultra/asm/8088/aplib_8088_small.S b/tools/apultra/asm/8088/aplib_8088_small.S
new file mode 100644
index 0000000..542991e
--- /dev/null
+++ b/tools/apultra/asm/8088/aplib_8088_small.S
@@ -0,0 +1,177 @@
+;  aplib_8088_small.S - size-optimized aPLib decompressor for 8088 - 145 bytes
+;
+;  Copyright (C) 2019 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+        segment .text
+        bits 16
+
+;  ---------------------------------------------------------------------------
+;  Decompress aPLib data
+;  inputs:
+;  * ds:si: compressed aPLib data
+;  * es:di: output buffer
+;  output:
+;  * ax:    decompressed size
+;  ---------------------------------------------------------------------------
+
+apl_decompress:
+        push    di              ; remember decompression offset
+        cld                     ; make string operations go forward
+
+        ; === register map ===
+        ; al: bit queue
+        ; ah: unused, but value is trashed
+        ; bx: follows_literal
+        ; cx: scratch register for reading gamma2 codes and storing copy length
+        ; dx: match offset (and rep-offset)
+        ; si: input (compressed data) pointer
+        ; di: output (decompressed data) pointer
+        ; bp: offset of .get_bit
+
+        mov     al,080H         ; clear bit queue(al) and set high bit to move into carry
+        xor     dx,dx           ; invalidate rep offset
+        mov     bp,.get_bit     ; load offset of .get_bit, to be used with call bp
+
+.literal:
+        movsb                   ; read and write literal byte
+.next_command_after_literal:
+        mov     bx,03H          ; set follows_literal(bx) to 3
+
+.next_command:
+        call    bp              ; read 'literal or match' bit
+        jnc     .literal        ; if 0: literal
+                                
+                                ; 1x: match
+
+        call    bp              ; read '8+n bits or other type' bit
+        jc      .other          ; 11x: other type of match
+
+                                ; 10: 8+n bits match
+        call    .get_gamma2     ; read gamma2-coded high offset bits
+        sub     cx,bx           ; high offset bits == 2 when follows_literal == 3 ?
+                                ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+                                ; is == 2 will never result in a negative value)
+        jae     .not_repmatch   ; if not, not a rep-match
+
+        call    .get_gamma2     ; read match length
+        jmp     short .got_len  ; go copy, reusing the rep-offset still in dx
+
+.not_repmatch:
+        mov     dh,cl           ; transfer high offset bits to dh
+        mov     dl,[si]         ; read low offset byte in dl
+        inc     si
+
+        call    .get_gamma2     ; read match length
+        cmp     dh,07DH         ; offset >= 32000 ? (07DH is the high byte of 07D00H)
+        jae     .increase_len_by2 ; if so, increase match len by 2
+        cmp     dh,05H          ; offset >= 1280 ? (05H is the high byte of 0500H)
+        jae     .increase_len_by1 ; if so, increase match len by 1
+        cmp     dx,0080H        ; offset < 128 ?
+        jae     .got_len        ; if not, len is final; if so, fall through and add 2 (otherwise it would be a 7+1 copy)
+.increase_len_by2:
+        inc     cx              ; increase length
+.increase_len_by1:
+        inc     cx              ; increase length
+
+        ; copy cx bytes from match offset dx
+
+.got_len:
+        push    ds              ; save ds:si (current pointer to compressed data)
+        push    si
+
+        push    es
+        pop     ds
+        mov     si,di           ; point to destination in es:di - offset in dx
+        sub     si,dx
+        rep     movsb           ; copy matched bytes
+
+        pop     si              ; restore ds:si
+        pop     ds
+
+        mov     bl,02H          ; set follows_literal to 2 (bx is unmodified by match commands)
+        jmp     short .next_command
+
+        ; read gamma2-coded value into cx
+
+.get_gamma2:
+        xor     cx,cx           ; initialize to 1 so that value will start at 2
+        inc     cx              ; when shifted left in the adc below
+
+.gamma2_loop:
+        call    .get_dibits     ; read data bit, shift into cx, read continuation bit
+        jc      .gamma2_loop    ; loop until a zero continuation bit is read
+
+        ret
+
+        ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+
+.other:
+        xor     cx,cx           ; cx = 0: base for the length or the 4-bit offset built below
+        call    bp              ; read '7+1 match or short literal' bit
+        jc      .short_literal  ; 111: 4 bit offset for 1-byte copy
+
+                                ; 110: 7 bits offset + 1 bit length
+                                
+        mov     dl,[si]         ; read offset + length in dl
+        inc     si
+
+        inc     cx              ; prepare cx for length below
+        shr     dl,1            ; shift len bit into carry, and offset in place
+        je      .done           ; if zero offset: EOD
+        adc     cx,cx           ; len in cx: 1*2 + carry bit = 2 or 3
+
+        xor     dh,dh           ; clear high bits of offset
+        jmp     short .got_len
+
+        ; 4 bits offset / 1 byte copy
+
+.short_literal:
+        call    .get_dibits     ; read 2 offset bits
+        adc     cx,cx           ; shift the second bit of the pair into cx
+        call    .get_dibits     ; read 2 offset bits
+        adc     cx,cx           ; shift the last bit into cx; ZF is set when the offset is 0
+        xchg    ax,cx           ; preserve bit queue in cx, put offset in ax (xchg leaves flags untouched)
+        jz      .write_zero     ; if offset is 0, write a zero byte
+
+                                ; short offset 1-15
+        mov     bx,di           ; point to destination in es:di - offset in ax
+        sub     bx,ax           ; we trash bx, it will be reset to 3 when we loop
+        mov     al,[es:bx]      ; read byte from short offset
+.write_zero:
+        stosb                   ; copy matched byte
+        xchg    ax,cx           ; restore bit queue in al
+        jmp     .next_command_after_literal
+
+.done:
+        pop     ax              ; retrieve the original decompression offset
+        xchg    di,ax           ; compute decompressed size
+        sub     ax,di           ; ax = end of output - start of output
+        ret
+
+.get_dibits:
+        call    bp              ; read data bit
+        adc     cx,cx           ; shift into cx, then fall through to read one more bit into carry
+
+.get_bit:
+        add     al,al           ; shift bit queue, and high bit into carry
+        jnz     .got_bit        ; queue not empty, bits remain
+        lodsb                   ; read 8 new bits
+        adc     al,al           ; shift bit queue, and high bit into carry; carry-in becomes the new end marker
+.got_bit:
+        ret
diff --git a/tools/apultra/asm/ARM7TDMI/aplib_arm.s b/tools/apultra/asm/ARM7TDMI/aplib_arm.s
new file mode 100644
index 0000000..b6d0cef
--- /dev/null
+++ b/tools/apultra/asm/ARM7TDMI/aplib_arm.s
@@ -0,0 +1,150 @@
+@APlib ARM7 decompressor by Dan Weiss, based on the original C version
+@Takes in raw apacked data, NOT data created by the 'safe' compressor.
+@Code is from the PocketNES NES Emulator for GBA
+
+@Code is formatted for GNU Assembler
+
+ src .req r0		@ pointer to the next unread byte of compressed input
+ dest .req r1		@ pointer to the next output byte to be written
+ byte .req r2		@ byte of control bits currently being consumed by GETBIT
+ mask .req r3		@ rotating mask that selects the current control bit of 'byte'
+ gamma .req r4		@ accumulator for gamma codes, short offsets and match lengths
+ lwm .req r6		@ "last was match" flag, tested by the apbranch2 offset decoding
+ recentoff .req r7	@ offset of the most recent match, re-used by repeat matches
+ temp .req r8		@ scratch register holding the byte being copied
+
+.global depack
+.type   depack STT_FUNC
+
+@r0 = src: compressed input pointer, post-incremented as bytes are read
+@r1 = dest: output pointer, post-incremented as bytes are written
+@r2 = byte: current byte of control bits
+@r3 = rotating bit mask (starts as 0x01010101, rotated right by one per bit read)
+@r4 = increasing gamma
+@r6 = lwm: 0 after a literal or one-byte short copy, 1 after a 2+ byte match
+@r7 = recentoff
+@r8 = temp: scratch byte for copies (lr itself is saved on the stack by depack)
+
+	.macro GETBIT @3 instructions - reads one control bit, leaving NE if it is 1 and EQ if it is 0
+	movs mask,mask,ror #1	@ step to the next bit; carry is set when the mask wraps from bit 0 round to bit 7
+	ldrcsb byte,[src],#1	@ on wrap-around the previous control byte is used up, so fetch a new one
+	tst byte,mask	@ test the selected bit of 'byte' (bits are consumed most significant first)
+	.endm
+
+	.macro GETBITGAMMA @5 instructions - gamma = gamma*2 + next control bit
+	mov gamma,gamma,lsl #1	@ make room for the new bit (flags untouched)
+	GETBIT
+	addne gamma,gamma,#1	@ NE from GETBIT means the bit was 1
+	.endm
+
+@This initialization code can go into slow memory
+@depack: decompress raw aPLib data; on entry r0 = compressed source, r1 = destination buffer
+depack:
+	stmfd sp!,{r4-r10,lr}	@ save working registers and lr (the bl ap_getgamma calls overwrite lr)
+	ldrb temp,[src],#1	@ the first byte of the stream is copied verbatim as a literal
+	strb temp,[dest],#1
+	ldr mask,=0x01010101	@ low bit set, so the very first GETBIT wraps and loads a control byte
+	b aploop_nolwm	@ explicit branch so this part can be placed away from the inner loop
+
+@This inner-loop code should be placed into fast memory
+
+	@depack enters here; also the re-entry point after a literal or one-byte copy
+aploop_nolwm:
+	mov lwm,#0	@ the last command was not a match
+aploop:
+	GETBIT	@ first command bit: 0 = literal, 1 = some kind of match
+	bne apbranch1
+	ldrb temp,[src],#1	@ "0": copy one literal byte from the input
+	strb temp,[dest],#1
+	b aploop_nolwm
+apbranch1:
+	GETBIT	@ second command bit: 0 = "10" long match
+	beq apbranch2
+	GETBIT	@ third command bit: 0 = "110" short match, 1 = "111" one-byte copy
+	beq apbranch3
+	@get an offset: "111" is followed by a 4-bit offset, most significant bit first
+	mov gamma,#0
+	GETBIT
+	addne gamma,gamma,#1
+	GETBITGAMMA
+	GETBITGAMMA
+	GETBITGAMMA
+	cmp gamma,#0	@ offset 0 means "write a zero byte"
+	ldrneb gamma,[dest,-gamma]	@ otherwise fetch the byte 1..15 positions back in the output
+	strb gamma,[dest],#1	@ store either that byte or the zero already in gamma
+	b aploop_nolwm	@ a one-byte copy resets lwm just like a literal
+apbranch3:
+	@use 7 bit offset, length = 2 or 3
+	@if a zero is encountered here, it's EOF
+	ldrb gamma,[src],#1	@ byte layout: offset in bits 7..1, length selector in bit 0
+	movs recentoff,gamma,lsr #1	@ recentoff = offset; carry = length bit; Z set if offset is 0
+	beq done	@ zero offset marks the end of the compressed data
+	ldrcsb temp,[dest,-recentoff]	@ length bit set: copy a third byte (done first, conditionally)
+	strcsb temp,[dest],#1
+	ldrb temp,[dest,-recentoff]	@ the two bytes that are always copied
+	strb temp,[dest],#1
+	ldrb temp,[dest,-recentoff]
+	strb temp,[dest],#1
+	mov lwm,#1	@ remember that the last command was a match
+	b aploop
+apbranch2:
+	@use a gamma code * 256 for offset, another gamma code for length
+
+	bl ap_getgamma	@ gamma = biased high part of the offset (always >= 2)
+	sub gamma,gamma,#2
+	cmp lwm,#0
+	bne ap_is_lwm	@ previous command was a match: high offset = gamma - 2, no repeat match possible
+	mov lwm,#1	@ previous command was not a match; this one is
+	cmp gamma,#0
+	bne ap_not_zero_gamma
+
+	@if gamma code is 2, use old recent offset, and a new gamma code for length
+	bl ap_getgamma	@ gamma = number of bytes to copy
+copyloop1:
+	ldrb temp,[dest,-recentoff]	@ byte-by-byte copy, so the source may overlap the bytes being written
+	strb temp,[dest],#1
+	subs gamma,gamma,#1
+	bne copyloop1
+	b aploop	@ lwm stays 1
+	
+ap_not_zero_gamma:
+	sub gamma,gamma,#1	@ after a non-match the bias is 3 rather than 2
+ap_is_lwm:
+	ldrb temp,[src],#1	@ low 8 bits of the offset come from the next input byte
+	add recentoff,temp,gamma,lsl #8	@ recentoff = high*256 + low
+	bl ap_getgamma
+	@gamma=length, then adjusted according to the offset range
+	cmp recentoff,#32000
+	addge gamma,gamma,#1	@ offset >= 32000: +1 here and +1 below, i.e. +2 in total
+	cmp recentoff,#1280
+	addge gamma,gamma,#1	@ offset >= 1280: +1
+	cmp recentoff,#128
+	addlt gamma,gamma,#2	@ offset < 128: +2
+copyloop2:
+	ldrb temp,[dest,-recentoff]	@ byte-by-byte copy, so the source may overlap the bytes being written
+	strb temp,[dest],#1
+	subs gamma,gamma,#1
+	bne copyloop2
+	b aploop	@ lwm is 1 on every path that reaches here
+
+ap_getgamma:	@ read an interleaved gamma code into gamma (result >= 2); called with bl, so lr is overwritten
+	mov gamma,#1	@ implicit leading 1 bit
+ap_getgammaloop:
+	GETBITGAMMA	@ data bit: gamma = gamma*2 + bit
+	GETBIT	@ continuation bit: 1 = another data bit follows
+	bne ap_getgammaloop
+	bx lr
+
+done:	@ reached from apbranch3 when the end-of-data marker (zero offset) is read
+	ldmfd sp!,{r4-r10,lr}	@ restore the registers saved on entry to depack
+	bx lr
+
+.unreq src	@ release the register aliases defined at the top of the file
+.unreq dest
+.unreq byte
+.unreq mask
+.unreq gamma
+.unreq lwm
+.unreq recentoff
+.unreq temp
+
diff --git a/tools/apultra/asm/Z80/unaplib_fast.asm b/tools/apultra/asm/Z80/unaplib_fast.asm
new file mode 100644
index 0000000..c21eb5d
--- /dev/null
+++ b/tools/apultra/asm/Z80/unaplib_fast.asm
@@ -0,0 +1,339 @@
+;
+;  Speed-optimized ApLib decompressor by spke & uniabis (ver.06 01-05/06/2020, 235 bytes)
+;
+;  The original Z80 decompressors for ApLib were written by Dan Weiss (Dwedit),
+;  then tweaked by Francisco Javier Pena Pareja (utopian),
+;  and optimized by Jaime Tejedor Gomez (Metalbrain) and Antonio Villena.
+;
+;  This is a new "implicit state" decompressor heavily optimized for speed by spke.
+;  (It is 12 bytes shorter and 18% faster than the previously fastest
+;  247b decompressor by Metalbrain and Antonio Villena.)
+;
+;  ver.00 by spke (21/08/2018-01/09/2018, 244 bytes, an edit of the existing 247b decompressor);
+;  ver.01 by spke (12-13/11/2018, 234(-10) bytes, +3% speed using the state machine for LWM);
+;  ver.02 by spke (06/08/2019, +1% speed);
+;  ver.03 by spke (27/08/2019, 236(+2) bytes, +1% speed using partly expanded LDIR);
+;  ver.04 by spke (spring 2020, added full revision history and support for long offsets)
+;  ver.05 by spke (17-31/05/2020, 230(-6) bytes, +3% speed, added support for backward compression) <- BROKEN, DO NOT USE
+;  ver.06 by uniabis & spke (01-07/06/2020, 235(+5) bytes, +1% speed, added support for HD64180)
+;
+;  The data must be compressed using any compressor for ApLib capable of generating raw data.
+;  At present, two best available compressors are:
+;
+;  "APC" by Sven-Ake Dahl: https://github.com/svendahl/cap or
+;  "apultra" by Emmanuel Marty: https://github.com/emmanuel-marty/apultra
+;
+;  The compression can be done as follows:
+;
+;  apc.exe e <sourcefile> <outfile>
+;  or
+;  apultra.exe <sourcefile> <outfile>
+;
+;  A decent compressor was written by r57shell (although it is worse than compressors above):
+;  http://gendev.spritesmind.net/forum/viewtopic.php?p=32548#p32548
+;  The use of the official ApLib compressor by Joergen Ibsen is not recommended.
+;
+;  The decompression is done in the standard way:
+;
+;  ld hl,FirstByteOfCompressedData
+;  ld de,FirstByteOfMemoryForDecompressedData
+;  call Decompress
+;
+;  Backward decompression is also supported; you can compress files backward using:
+;
+;  apultra.exe -b <sourcefile> <outfile>
+;
+;  uncomment option "DEFINE BackwardDecompression" and decompress the resulting files using:
+;
+;  ld hl,LastByteOfCompressedData
+;  ld de,LastByteOfMemoryForDecompressedData
+;  call Decompress
+;
+;  The decompressor modifies AF, AF', BC, DE, HL, IX.
+;
+;  Of course, ApLib compression algorithms are (c) 1998-2014 Joergen Ibsen,
+;  see http://www.ibsensoftware.com/ for more information
+;
+;  Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+;	DEFINE SupportLongOffsets				; +4 bytes for long offset support. slows decompression down by 1%, but may be needed to decompress files >=32K
+;	DEFINE BackwardDecompression				; decompress data compressed backwards, -10 bytes, speeds decompression up by 3%
+;	DEFINE HD64180						; -2 bytes for HD64180/Z180 support, slows decompression down by 1%
+
+	IFNDEF BackwardDecompression		; forward decompression: pointers move up through memory
+
+		MACRO NEXT_HL
+		inc hl				; advance the source pointer by one byte
+		ENDM
+
+		MACRO COPY_1
+		ldi				; copy one byte (HL) -> (DE), step both pointers, decrement BC
+		ENDM
+
+		MACRO COPY_BC
+		ldir				; copy BC bytes (HL) -> (DE), stepping both pointers
+		ENDM
+
+	ELSE					; backward decompression: same macros, pointers move down
+
+		MACRO NEXT_HL
+		dec hl
+		ENDM
+
+		MACRO COPY_1
+		ldd
+		ENDM
+
+		MACRO COPY_BC
+		lddr
+		ENDM
+
+	ENDIF
+
+		MACRO RELOAD_A
+		ld a,(hl) : NEXT_HL : rla	; refill the bit queue: incoming carry becomes the end marker in bit 0, top bit goes to carry
+		ENDM
+
+@Decompress:		COPY_1 : scf					; entry point (HL = src, DE = dest): the first byte is a literal; SCF supplies the marker bit for the first RELOAD_A
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+LWM0:			;LWM = 0 (LWM stands for "Last Was Match"; a flag that we did not have a match)
+
+.ReloadByteC0		RELOAD_A : jr c,.Check2ndBit			; bit queue was empty: refill it; a 1 bit means "not a literal", a 0 bit falls into .CASE0
+
+;
+;  case "0"+BYTE: copy a single literal
+
+.CASE0:			COPY_1						; copy the literal byte from src to dest
+
+;
+;  main decompressor loop (LWM = 0): "add a" shifts the next bit into carry; carry together with Z means the end marker came out, so the queue must be reloaded
+
+.MainLoop:		add a : jr nc,.CASE0 : jr z,.ReloadByteC0	; "0"+BYTE = copy literal
+.Check2ndBit		add a : jr nc,.CASE10 : jr z,.ReloadByteC1	; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+.Check3rdBit		add a : call z,ReloadByte : jp c,LWM1.CASE111	; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+;  branch "110"+[oooooool]: copy two or three bytes (bit "l") with the offset -1..-127 (bits "ooooooo"), or stop
+
+.CASE110:		; "use 7 bit offset, length = 2 or 3"
+			; "if a zero is found here, it's EOF"
+			ld c,(hl) : rr c : ret z			; process EOF (C = offset, carry = length bit; carry is clear on entry, so bit 7 of C becomes 0)
+			NEXT_HL
+			ld b,0						; BC = offset
+
+	IFNDEF HD64180
+			ld ixl,c : ld ixh,b				; save offset for future LWMs
+	ELSE
+			push bc : pop ix				; same effect using only push/pop
+	ENDIF
+
+			push hl						; save src
+			ld h,d : ld l,e					; HL = dest
+			jr c,.LengthIs3					; carry still holds the length bit from "rr c"
+
+.LengthIs2		
+	IFNDEF BackwardDecompression
+			sbc hl,bc					; HL = dest-offset (carry is clear on this path)
+	ELSE
+			add hl,bc					; HL = dest+offset
+	ENDIF
+			COPY_1 : COPY_1
+			jr .PreMainLoop
+
+.LengthIs3
+	IFNDEF BackwardDecompression
+			or a : sbc hl,bc				; clear carry first, then HL = dest-offset
+	ELSE
+			add hl,bc					; HL = dest+offset
+	ENDIF
+			COPY_1 : COPY_1 : COPY_1
+			jr .PreMainLoop
+
+.ReloadByteC1		RELOAD_A : jr c,.Check3rdBit			; queue ran empty on the 2nd bit: refill; a 1 bit goes on to the 3rd-bit test, a 0 bit falls into .CASE10
+
+;
+;  branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+.CASE10:		; "use a gamma code * 256 for offset, another gamma code for length"
+			call GetGammaCoded
+
+			; the original decompressor contains
+			;
+			; if ((LWM == 0) && (offs == 2)) { ... }
+			; else {
+			;	if (LWM == 0) { offs -= 3; }
+			;	else { offs -= 2; }
+			; }
+			;
+			; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+			; and to split the first condition by noticing that C-1 can never be zero
+			dec c : dec c : jr z,LWM1.KickInLWM		; gamma == 2 with LWM = 0: re-use the previous offset
+
+.AfterLWM		dec c : ld b,c : ld c,(hl) : NEXT_HL	; BC = offset
+
+	IFNDEF HD64180
+			ld ixl,c : ld ixh,b : push bc		; IX = offset for future LWMs, (SP) = offset
+	ELSE
+			push bc : push bc : pop ix		; same effect using only push/pop
+	ENDIF
+
+			call GetGammaCoded			; BC = len*
+
+			ex (sp),hl				; HL = offset, (SP) = src
+
+			; interpretation of length value is offset-dependent:
+			; if (offs >= 32000) len++; if (offs >= 1280) len++; if (offs < 128) len+=2;
+			; in other words,
+			; (1 <= offs < 128) +=2
+			; (128 <= offs < 1280) +=0
+			; (1280 <= offs < 32000) +=1
+			; NB offsets of 32000 and above need one more check (+=2); it is only assembled when SupportLongOffsets is defined
+
+			; interpretation of length value is offset-dependent
+			exa : ld a,h				; park the bit queue in AF', A = high byte of the offset
+	IFDEF	SupportLongOffsets
+			; NB offsets over 32000 require an additional check, which is skipped in most
+			; Z80 decompressors (seemingly as a performance optimization)
+			cp 32000/256 : jr nc,.Add2
+	ENDIF
+			cp 5 : jr nc,.Add1			; offset >= 1280: len += 1
+			or a : jr nz,.Add0			; 256 <= offset < 1280: len unchanged
+			bit 7,l : jr nz,.Add0			; 128 <= offset < 256: len unchanged; otherwise fall through to len += 2
+.Add2			inc bc
+.Add1			inc bc
+.Add0			; for offs<128 : 4+4+7+7 + 4+7 + 8+7 + 6+6 = 60t
+			; for offs>=1280 : 4+4+7+12 + 6 = 33t
+			; for 128<=offs<1280 : 4+4+7+7 + 4+12 = 38t OR 4+4+7+7 + 4+7+8+12 = 53t
+
+.CopyMatch:		; this assumes that BC = len, DE = dest, HL = offset
+			; and also that (SP) = src, while having NC
+	IFNDEF BackwardDecompression
+			ld a,e : sub l : ld l,a			; HL = dest-offset, computed through A
+			ld a,d : sbc h
+			ld h,a : exa				; bring the bit queue back from AF'
+	ELSE
+			exa					; bring the bit queue back from AF'
+.CopyMatchLDH		add hl,de				; HL = dest+offset
+	ENDIF
+			COPY_1 : COPY_BC			; one single-byte copy, then the block copy for the remaining BC bytes
+.PreMainLoop		pop hl					; recover src
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+LWM1:			; LWM = 1: the previous command was a match, so a following "10" command never re-uses the last offset
+
+;
+;  main decompressor loop (LWM = 1 copy of the bit dispatch; literals and short matches share the LWM0 code)
+
+.MainLoop:		add a : jr nc,LWM0.CASE0 : jr z,.ReloadByteC0		; "0"+BYTE = copy literal
+.Check2ndBit		add a : jr nc,.CASE10 : jr z,.ReloadByteC1		; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+.Check3rdBit		add a : call z,ReloadByte : jr nc,LWM0.CASE110		; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+;  case "111"+"oooo": copy a byte with offset -1..-15, or write zero to dest
+
+.CASE111:		ld bc,%11100000				; B = 0; the bits preset in C are shifted out by the four RL C below
+			DUP 4
+			add a : call z,ReloadByte : rl c	; read short offset (4 bits)
+			EDUP
+			ex de,hl : jr z,.WriteZero		; zero offset means "write zero" (NB: B is zero here)
+
+			; "write a previous byte (1-15 away from dest)"
+			push hl					; BC = offset, DE = src, HL = dest
+	IFNDEF BackwardDecompression
+			sbc hl,bc				; HL = dest-offset (SBC works because the last RL C shifted out a 0 bit, leaving NC)
+	ELSE
+			add hl,bc				; HL = dest+offset
+	ENDIF
+			ld c,(hl)				; C = byte to repeat
+			pop hl					; HL = dest again
+
+.WriteZero		ld (hl),c : NEXT_HL			; store C (zero on the "write zero" path) and advance dest
+			ex de,hl : jp LWM0.MainLoop		; 10+4*(4+10+8)+4+7 + 11+15+7+10 + 7+4+6+10 = 179t
+
+.ReloadByteC0		RELOAD_A : jp nc,LWM0.CASE0		; queue ran empty on the 1st bit: refill; 0 = literal
+			jr .Check2ndBit
+
+.ReloadByteC1		RELOAD_A : jr c,.Check3rdBit		; queue ran empty on the 2nd bit: refill; 0 falls into .CASE10
+
+;
+;  branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+.CASE10:		; "use a gamma code * 256 for offset, another gamma code for length"
+			call GetGammaCoded
+
+			; the original decompressor contains
+			;
+			; if ((LWM == 0) && (offs == 2)) { ... }
+			; else {
+			;	if (LWM == 0) { offs -= 3; }
+			;	else { offs -= 2; }
+			; }
+			;
+			; here LWM = 1, so only "offs -= 2" applies:
+			; one DEC C below plus the DEC C at LWM0.AfterLWM
+			dec c : jr LWM0.AfterLWM
+
+;
+;  the re-use of the previous offset (LWM magic); entered from LWM0.CASE10 with BC = 0
+
+.KickInLWM:		; "and a new gamma code for length"
+			inc c : call GetGammaCoded.ReadGamma		; BC = len
+
+	IFNDEF BackwardDecompression
+			push ix : ex (sp),hl : exa			; HL = saved offset, (SP) = src; bit queue parked in AF' because .CopyMatch swaps it back
+			jr LWM0.CopyMatch
+	ELSE
+			push ix : ex (sp),hl				; HL = saved offset, (SP) = src
+			jr LWM0.CopyMatchLDH				; enters after the EXA of .CopyMatch, so no swap is needed here
+	ENDIF
+
+;==================================================================================================================
+;==================================================================================================================
+;==================================================================================================================
+
+;
+;  interlaced gamma code reader; returns the value in BC (always >= 2) with carry clear
+;  x0 -> 1x
+;  x1y0 -> 1xy
+;  x1y1z0 -> 1xyz etc
+;  (technically, this is a 2-based variation of Exp-Golomb-1)
+
+GetGammaCoded:		ld bc,1					; implicit leading 1 bit
+.ReadGamma		add a : jr z,.ReloadByteRG1		; data bit into carry; Z means the queue ran empty
+			rl c : rl b				; BC = BC*2 + data bit
+			add a : ret nc				; NB: flag NC immediately says we do not need to reload our byte...
+			jr nz,.ReadGamma			; ...even better, flag NZ then automatically means flag C :)
+
+.ReloadByteRG2		RELOAD_A : ret nc : jr .ReadGamma	; queue ran empty on a continuation bit: refill and re-test it
+
+.ReloadByteRG1		RELOAD_A : rl c : rl b			; queue ran empty on a data bit: refill, then shift the bit into BC
+			add a : ret nc : jr .ReadGamma		; continuation bit: 0 = done, 1 = read another pair
+
+;
+;  pretty usual getbit for mixed datastreams
+
+ReloadByte:		RELOAD_A : ret				; called when "add a" left Z: refill the queue and return the next bit in carry
+
diff --git a/tools/apultra/asm/Z80/unaplib_small.asm b/tools/apultra/asm/Z80/unaplib_small.asm
new file mode 100644
index 0000000..280de15
--- /dev/null
+++ b/tools/apultra/asm/Z80/unaplib_small.asm
@@ -0,0 +1,258 @@
+;
+;  Size-optimized ApLib decompressor by spke & uniabis (ver.04 01-07/06/2020, 139 bytes)
+;
+;  The original Z80 decompressor for ApLib was written by Dan Weiss (Dwedit),
+;  then tweaked by Francisco Javier Pena Pareja (utopian),
+;  and optimized by Jaime Tejedor Gomez (Metalbrain).
+;
+;  This version was heavily re-optimized for size by spke.
+;  (It is 17 bytes shorter and 22% faster than the 156b version by Metalbrain.)
+;
+;  ver.00 by spke (21/08/2018-01/09/2018, 141 bytes);
+;  ver.01 by spke (spring 2019, 140(-1) bytes, slightly faster);
+;  ver.02 by spke (05-07/01/2020, added full revision history, support for long offsets
+;                  and an option to use self-modifying code instead of IY)
+;  ver.03 by spke (18-29/05/2020, +0.5% speed, added support for backward compression)
+;  ver.04 by uniabis (01-07/06/2020, 139(-1) bytes, +1% speed, added support for HD64180)
+;
+;  The data must be compressed using any compressor for ApLib capable of generating raw data.
+;  At present, two best available compressors are:
+;
+;  "APC" by Sven-Ake Dahl: https://github.com/svendahl/cap or
+;  "apultra" by Emmanuel Marty: https://github.com/emmanuel-marty/apultra
+;
+;  The compression can be done as follows:
+;
+;  apc.exe e <sourcefile> <outfile>
+;  or
+;  apultra.exe <sourcefile> <outfile>
+;
+;  A decent compressor was written by r57shell (although it is worse than compressors above):
+;  http://gendev.spritesmind.net/forum/viewtopic.php?p=32548#p32548
+;  The use of the official ApLib compressor by Joergen Ibsen is not recommended.
+;
+;  The decompression is done in the standard way:
+;
+;  ld hl,FirstByteOfCompressedData
+;  ld de,FirstByteOfMemoryForDecompressedData
+;  call DecompressApLib
+;
+;  Backward decompression is also supported; you can compress files backward using:
+;
+;  apultra.exe -b <sourcefile> <outfile>
+;
+;  uncomment option "DEFINE BackwardDecompression" and decompress the resulting files using:
+;
+;  ld hl,LastByteOfCompressedData
+;  ld de,LastByteOfMemoryForDecompressedData
+;  call DecompressApLib
+;
+;  The decompressor modifies AF, AF', BC, DE, HL, IX.
+;
+;  Of course, ApLib compression algorithms are (c) 1998-2014 Joergen Ibsen,
+;  see http://www.ibsensoftware.com/ for more information
+;
+;  Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+;	DEFINE FasterGetBit					; 16% speed-up at the cost of extra 4 bytes
+;	DEFINE SupportLongOffsets				; +4 bytes for long offset support. slows decompression down by 1%, but may be needed to decompress files >=32K
+;	DEFINE BackwardDecompression				; decompress data compressed backwards, -5 bytes, speeds decompression up by 3%
+
+
+	IFDEF FasterGetBit				; inline the queue shift, call only when a refill is needed
+		MACRO	GET_BIT
+			add a : call z,ReloadByte	; next bit into carry; Z means the queue ran empty
+		ENDM
+	ELSE						; smallest code: every bit goes through a subroutine call
+		MACRO	GET_BIT
+			call GetOneBit			; next bit into carry
+		ENDM
+	ENDIF
+
+	IFNDEF BackwardDecompression			; forward decompression: pointers move up through memory
+
+		MACRO NEXT_HL
+		inc hl					; advance the source pointer by one byte
+		ENDM
+
+		MACRO COPY_1
+		ldi					; copy one byte (HL) -> (DE), step both pointers, decrement BC
+		ENDM
+
+		MACRO COPY_BC
+		ldir					; copy BC bytes (HL) -> (DE), stepping both pointers
+		ENDM
+
+	ELSE						; backward decompression: same macros, pointers move down
+
+		MACRO NEXT_HL
+		dec hl
+		ENDM
+
+		MACRO COPY_1
+		ldd
+		ENDM
+
+		MACRO COPY_BC
+		lddr
+		ENDM
+
+	ENDIF
+
+@DecompressApLib:	ld a,128				; entry point (HL = src, DE = dest): A = marker bit only, so the first GET_BIT reloads the queue
+
+;
+;  case "0"+BYTE: copy a single literal
+
+CASE0:			COPY_1					; first byte is always copied as literal
+ResetLWM:		ld b,-1					; LWM = 0 (LWM stands for "Last Was Match"; a flag that we did not have a match)
+
+;
+;  main decompressor loop: GET_BIT returns the next command bit in carry
+
+MainLoop:		GET_BIT : jr nc,CASE0			; "0"+BYTE = copy literal
+			GET_BIT : jr nc,CASE10			; "10"+gamma(offset/256)+BYTE+gamma(length) = the main matching mechanism
+
+			ld bc,%11100000				; B = 0 for both remaining cases; C is the 4-bit read counter used by CASE111
+			GET_BIT : jr nc,CASE110			; "110"+[oooooool] = matched 2-3 bytes with a small offset
+
+;
+;  case "111"+"oooo": copy a byte with offset -1..-15, or write zero to dest
+
+CASE111:
+ReadFourBits		GET_BIT					; read short offset (4 bits)
+			rl c : jr c,ReadFourBits		; the three 1 bits preset in C come out as carry, so the loop runs four times
+			ex de,hl : jr z,WriteZero		; zero offset means "write zero" (NB: B is zero here)
+
+			; "write a previous byte (1-15 away from dest)"
+			push hl					; BC = offset, DE = src, HL = dest
+	IFNDEF BackwardDecompression
+			sbc hl,bc				; HL = dest-offset (SBC works because branching above ensured NC)
+	ELSE
+			add hl,bc				; HL = dest+offset
+	ENDIF
+			ld c,(hl) : pop hl			; C = byte to repeat, HL = dest again
+
+WriteZero		ld (hl),c : NEXT_HL			; store C (zero on the "write zero" path) and advance dest
+			ex de,hl : jr ResetLWM			; write one byte, reset LWM
+
+;
+;  branch "110"+[oooooool]: copy two or three bytes (bit "l") with the offset -1..-127 (bits "ooooooo"), or stop
+
+CASE110:		; "use 7 bit offset, length = 2 or 3"
+			; "if a zero is found here, it's EOF"
+			ld c,(hl) : rr c : ret z		; process EOF (C = offset, carry = length bit; carry is clear on entry, so bit 7 of C becomes 0)
+			NEXT_HL
+
+			push hl					; save src
+			ld h,b : ld l,c				; HL = offset (B was zeroed in MainLoop)
+
+			; flag NC means len=2, flag C means len=3
+			ld c,1 : rl c : jr SaveLWMOffset	; BC = 2 + length bit; RL C also leaves NC for CopyMatch
+			
+			
+;
+;  branch "10"+gamma(offset/256)+BYTE+gamma(length): the main matching mechanism
+
+CASE10:			; save state of LWM into A'
+			exa : ld a,b : exa			; B is -1 after ResetLWM, or 0 after a block copy has counted BC down to zero
+
+			; "use a gamma code * 256 for offset, another gamma code for length"
+			call GetGammaCoded
+
+			; the original decompressor contains
+			;
+			; if ((LWM == 0) && (offs == 2)) { ... }
+			; else {
+			;	if (LWM == 0) { offs -= 3; }
+			;	else { offs -= 2; }
+			; }
+			;
+			; so, the idea here is to use the fact that GetGammaCoded returns (offset/256)+2,
+			; and to split the first condition by noticing that C-1 can never be zero
+			exa : add c : ld c,a : exa		; C = gamma + saved LWM state (adds -1 when LWM = 0)
+
+			; "if gamma code is 2, use old r0 offset"
+			dec c : jr z,KickInLWM			; only reachable when LWM = 0 and gamma = 2
+			dec c					; C = high byte of the offset
+			ld b,c : ld c,(hl) : NEXT_HL		; BC = offset
+
+			push bc					; (SP) = offset
+			call GetGammaCoded			; BC = len*
+			ex (sp),hl				; HL = offset, (SP) = src
+
+			; interpretation of length value is offset-dependent
+			exa : ld a,h				; park the bit queue (and the NC flag) in AF', A = high byte of the offset
+	IFDEF	SupportLongOffsets
+			; NB offsets over 32000 require an additional check, which is skipped in most
+			; Z80 decompressors (seemingly as a performance optimization)
+			cp 32000/256 : jr nc,.Add2
+	ENDIF
+			cp 5 : jr nc,.Add1			; offset >= 1280: len += 1
+			or a : jr nz,.Add0			; 256 <= offset < 1280: len unchanged
+			bit 7,l : jr nz,.Add0			; 128 <= offset < 256: len unchanged; otherwise fall through to len += 2
+.Add2			inc bc
+.Add1			inc bc
+.Add0			exa					; bring back the bit queue and the NC flag saved above
+
+SaveLWMOffset:
+			push hl : pop ix			; save offset for future LWMs
+
+CopyMatch:		; this assumes that BC = len, DE = dest, HL = offset
+			; and also that (SP) = src, while having NC
+	IFNDEF BackwardDecompression
+			push de					; keep dest while DE is borrowed for the subtraction
+			ex de,hl : sbc hl,de			; HL = dest-offset
+			pop de					; DE = dest
+	ELSE
+			add hl,de				; HL = dest+offset
+	ENDIF
+
+			COPY_BC					; copy the match; BC ends at 0, which CASE10 reads as LWM = 1
+			pop hl					; recover src
+			jr MainLoop				; B is left at 0, i.e. LWM is not reset
+
+;
+;  the re-use of the previous offset (LWM magic); IX holds the offset saved at SaveLWMOffset
+
+KickInLWM:		; "and a new gamma code for length"
+			call GetGammaCoded			; BC = len
+			push ix : ex (sp),hl			; DE = dest, HL = prev offset
+			jr CopyMatch				; skips SaveLWMOffset: IX already holds this offset
+
+;
+;  interlaced gamma code reader; returns the value in BC (always >= 2) with carry clear
+;  x0 -> 1x
+;  x1y0 -> 1xy
+;  x1y1z0 -> 1xyz etc
+;  (technically, this is a 2-based variation of Exp-Golomb-1)
+
+GetGammaCoded:		ld bc,1					; implicit leading 1 bit
+ReadGamma		GET_BIT : rl c : rl b			; data bit: BC = BC*2 + bit
+			GET_BIT : ret nc			; continuation bit: 0 ends the code
+			jr ReadGamma
+
+;
+;  pretty usual getbit for mixed datastreams: returns the next bit of the stream in carry
+
+	IFNDEF FasterGetBit
+GetOneBit:		add a : ret nz				; shift the queue; NZ means bits remain and carry holds the data bit
+	ENDIF
+ReloadByte:		ld a,(hl) : NEXT_HL			; queue empty (carry holds the end marker): fetch the next stream byte
+			rla : ret				; marker goes into bit 0, first data bit comes out in carry
+
diff --git a/tools/apultra/asm/x86/aplib_x86_fast.asm b/tools/apultra/asm/x86/aplib_x86_fast.asm
new file mode 100644
index 0000000..9e41d31
--- /dev/null
+++ b/tools/apultra/asm/x86/aplib_x86_fast.asm
@@ -0,0 +1,180 @@
+;  aplib_x86_fast.asm - speed-optimized aPLib decompressor for x86 - 188 bytes
+;
+;  Copyright (C) 2019 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+        segment .text
+        bits 32
+
+;  ---------------------------------------------------------------------------
+;  Decompress aPLib data
+;  inputs:
+;  * esi: compressed aPLib data
+;  * edi: output buffer
+;  output:
+;  * eax:    decompressed size
+;  ---------------------------------------------------------------------------
+        %ifndef BIN                       ; skip the exports when BIN is defined (presumably a raw binary build - TODO confirm)
+          global apl_decompress           ; plain symbol name
+          global _apl_decompress          ; same entry point with a leading underscore
+        %endif
+        
+        ; uint32_t apl_decompress(const void *Source, void *Destination);
+        
+%macro apl_get_bit 0            ; read the next stream bit into carry (uses al, may advance esi)
+        add     al,al           ; shift bit queue, and high bit into carry
+        jnz     %%gotbit        ; queue not empty, bits remain
+        lodsb                   ; queue empty, carry holds the end marker: read 8 new bits
+        adc     al,al           ; shift first new bit into carry, and the marker into bit 0
+%%gotbit:
+%endmacro
+
+apl_decompress:                 ; decompress aPLib data from esi to edi (or from the cdecl
+_apl_decompress:                ; stack arguments when CDECL is defined); returns size in eax
+        pushad                  ; save all registers; eax slot is overwritten with the size at .done
+        
+        %ifdef CDECL
+          mov    esi, [esp+32+4]  ; esi = aPLib compressed data
+          mov    edi, [esp+32+8]  ; edi = output
+        %endif
+        mov     ebp,edi         ; remember start of output (valid with or without CDECL)
+        ; === register map ===
+        ; al:  bit queue
+        ; ah:  unused, but value is trashed
+        ; ebx: follows_literal (3 after a literal, 2 after a match)
+        ; ecx: scratch register for reading gamma2 codes and storing copy length
+        ; edx: match offset (and rep-offset)
+        ; esi: input (compressed data) pointer
+        ; edi: output (decompressed data) pointer
+        ; ebp: start of the output buffer, used at .done to compute the size
+        
+        mov     al,080H         ; clear bit queue(al) and set high bit to move into carry
+        xor     edx, edx        ; invalidate rep offset
+.literal:
+        movsb                   ; read and write literal byte
+.next_command_after_literal:
+        mov     ebx,03H         ; set follows_literal(bx) to 3
+
+.next_command:
+        apl_get_bit             ; read 'literal or match' bit
+        jnc     .literal        ; if 0: literal
+                                
+                                ; 1x: match
+
+        apl_get_bit             ; read '8+n bits or other type' bit
+        jc      .other          ; 11x: other type of match
+
+                                ; 10: 8+n bits match
+        call    .get_gamma2     ; read gamma2-coded high offset bits
+        sub     ecx,ebx         ; high offset bits == 2 when follows_literal == 3 ?
+                                ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+                                ; is == 2 will never produce a borrow; only 2 - 3 does)
+        jae     .not_repmatch   ; if not, not a rep-match
+
+        call    .get_gamma2     ; read match length
+        jmp     .got_len        ; go copy
+
+.not_repmatch:
+        mov     edx,ecx         ; transfer high offset bits to dh
+        shl     edx, 8
+        mov     dl,[esi]        ; read low offset byte in dl
+        inc     esi
+
+        call    .get_gamma2     ; read match length
+        cmp     edx,07D00H        ; offset >= 32000 ?
+        jae     .increase_len_by2 ; if so, increase match len by 2
+        cmp     edx,0500H         ; offset >= 1280 ?
+        jae     .increase_len_by1 ; if so, increase match len by 1
+        cmp     edx,0080H       ; offset >= 128 ?
+        jae     .got_len        ; if so, keep the length as read; below 128, fall through and add 2
+.increase_len_by2:
+        inc     ecx             ; increase length
+.increase_len_by1:
+        inc     ecx             ; increase length
+
+        ; copy cx bytes from match offset dx
+
+.got_len:
+        push    esi
+        mov     esi,edi         ; point to destination in es:di - offset in dx
+        sub     esi,edx
+        rep     movsb           ; copy matched bytes
+        pop     esi
+        mov     bl,02H          ; set follows_literal to 2 (bx is unmodified by match commands)
+        jmp     .next_command
+
+        ; read gamma2-coded value into cx
+
+.get_gamma2:
+        xor     ecx,ecx         ; initialize to 1 so that value will start at 2
+        inc     ecx             ; when shifted left in the adc below
+
+.gamma2_loop:
+        apl_get_bit             ; read data bit
+        adc     ecx,ecx         ; shift into cx
+        apl_get_bit             ; read continuation bit
+        jc      .gamma2_loop    ; loop until a zero continuation bit is read
+
+        ret
+
+        ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+
+.other:
+        xor     ecx,ecx
+        apl_get_bit             ; read '7+1 match or short literal' bit
+        jc      .short_literal  ; 111: 4 bit offset for 1-byte copy
+
+                                ; 110: 7 bits offset + 1 bit length
+                                
+        movzx   edx,byte[esi]   ; read offset + length in dl
+        inc     esi
+
+        inc     ecx             ; prepare cx for length below
+        shr     dl,1            ; shift len bit into carry, and offset in place
+        je      .done           ; if zero offset: EOD
+        adc     ecx,ecx         ; len in cx: 1*2 + carry bit = 2 or 3
+        jmp     .got_len
+
+        ; 4 bits offset / 1 byte copy
+
+.short_literal:
+        apl_get_bit             ; read 4 offset bits
+        adc     ecx,ecx
+        apl_get_bit
+        adc     ecx,ecx
+        apl_get_bit
+        adc     ecx,ecx
+        apl_get_bit
+        adc     ecx,ecx
+        xchg    eax,ecx         ; preserve bit queue in cx, put offset in ax
+        jz      .write_zero     ; if offset is 0, write a zero byte
+
+                                ; short offset 1-15
+        mov     ebx,edi         ; point to destination in es:di - offset in ax
+        sub     ebx,eax         ; we trash bx, it will be reset to 3 when we loop
+        mov     al,[ebx]        ; read byte from short offset
+.write_zero:
+        stosb                   ; copy matched byte
+        mov     eax,ecx         ; restore bit queue in al
+        jmp     .next_command_after_literal
+
+.done:
+        sub     edi,ebp         ; decompressed size = final output pointer - start of output
+        mov     [esp+28], edi   ; store it in the eax slot of the pushad frame
+        popad                   ; restore all registers (ebp included); eax = decompressed size
+        ret
diff --git a/tools/apultra/asm/x86/aplib_x86_small.asm b/tools/apultra/asm/x86/aplib_x86_small.asm
new file mode 100644
index 0000000..ada00f6
--- /dev/null
+++ b/tools/apultra/asm/x86/aplib_x86_small.asm
@@ -0,0 +1,159 @@
+;  aplib_x86_small.asm - size-optimized aPLib decompressor for x86 - 185 bytes
+;
+;  Copyright (C) 2019 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+        segment .text
+        bits 32
+;  ---------------------------------------------------------------------------
+;  Decompress aPLib data
+;  inputs:
+;  * esi: compressed aPLib data
+;  * edi: output buffer
+;  output:
+;  * eax:    decompressed size
+;  ---------------------------------------------------------------------------
+        %ifndef BIN
+          global apl_decompress
+          global _apl_decompress
+        %endif
+        ; one entry point under two names (with and without a leading underscore)
+apl_decompress:
+_apl_decompress:
+        pushad                  ; save all 8 registers (32 bytes); popad at .done restores them, except the eax slot patched there
+
+        %ifdef CDECL
+          mov    esi, [esp+32+4]  ; esi = aPLib compressed data (first stack argument, above pushad data + return address)
+          mov    edi, [esp+32+8]  ; edi = output (second stack argument)
+        %endif
+        ; NOTE(review): no cld is executed below; lodsb/movsb/stosb assume the direction flag is clear on entry
+        ; === register map ===
+        ;  al: bit queue
+        ;  ah: unused, but value is trashed
+        ; ebx: follows_literal
+        ; ecx: scratch register for reading gamma2 codes and storing copy length
+        ; edx: match offset (and rep-offset)
+        ; esi: input (compressed data) pointer
+        ; edi: output (decompressed data) pointer
+        ; ebp: offset of .get_bit 
+               
+        mov     al,080H         ; empty bit queue: only the marker bit is set, so the first .get_bit loads a byte
+        xor     edx, edx        ; invalidate rep offset in edx
+
+        call    .init_get_bit   ; pushes the address of .get_dibits for .init_get_bit to pop; never returns here
+.get_dibits:                    ; shift one stream bit into ecx, then return the following bit in carry
+        call    ebp             ; read data bit
+        adc     ecx,ecx         ; shift into cx
+.get_bit:                       ; return the next bit of the stream in carry
+        add     al,al           ; shift bit queue, and high bit into carry
+        jnz     .got_bit        ; al != 0: data bits remain (al == 0 means only the marker bit was left)
+        lodsb                   ; read 8 new bits
+        adc     al,al           ; shift the marker (carry) in at the bottom, and the first data bit out into carry
+.got_bit:
+        ret
+.init_get_bit:
+        pop     ebp             ; ebp = return address of the call above, i.e. the address of .get_dibits
+        add     ebp, .get_bit - .get_dibits ; ebp now points at .get_bit, so 'call ebp' reads one bit
+.literal:                       ; also reached by fall-through right after init: the first input byte is copied verbatim
+        movsb                   ; read and write literal byte
+.next_command_after_literal:
+        push    03H
+        pop     ebx             ; set follows_literal(bx) to 3
+        
+.next_command:
+        call    ebp             ; read 'literal or match' bit
+        jnc     .literal        ; if 0: literal
+                                
+                                ; 1x: match
+        call    ebp             ; read '8+n bits or other type' bit
+        jc      .other          ; 11x: other type of match
+                                ; 10: 8+n bits match
+        call    .get_gamma2     ; read gamma2-coded high offset bits
+        sub     ecx,ebx         ; high offset bits == 2 when follows_literal == 3 ?
+                                ; (a gamma2 value is always >= 2, so subtracting follows_literal when it
+                                ; is == 2 will never result in a negative value)
+        jae     .not_repmatch   ; no borrow: not a rep-match
+        call    .get_gamma2     ; rep-match: read match length, edx still holds the previous offset
+        jmp     .got_len        ; go copy
+.not_repmatch:
+        mov     edx,ecx         ; transfer the (already reduced) high offset bits to edx
+        shl     edx,8           ; move them up one byte to make room for the low offset byte
+        mov     dl,[esi]        ; read low offset byte in dl
+        inc     esi
+        call    .get_gamma2     ; read match length
+        cmp     edx,7D00H       ; offset >= 32000 ?
+        jae     .increase_len_by2 ; if so, increase match len by 2
+        cmp     edx,0500H       ; offset >= 1280 ?
+        jae     .increase_len_by1 ; if so, increase match len by 1
+        cmp     edx,0080H       ; offset >= 128 ?
+        jae     .got_len        ; if so, use the length as read; below 128 fall through and add 2 (otherwise it would be a 7+1 copy)
+.increase_len_by2:
+        inc     ecx             ; increase length
+.increase_len_by1:
+        inc     ecx             ; increase length
+        ; copy ecx bytes from match offset edx
+.got_len:
+        push    esi             ; save esi (current pointer to compressed data)
+        mov     esi,edi         ; point to destination in edi - offset in edx
+        sub     esi,edx         ; esi = source of the copy, inside the output written so far
+        rep     movsb           ; copy matched bytes
+        pop     esi             ; restore esi
+        mov     bl,02H          ; set follows_literal to 2 (ebx is unmodified by match commands)
+        jmp     .next_command
+        ; read gamma2-coded value into ecx
+.get_gamma2:
+        xor     ecx,ecx         ; initialize to 1 so that value will start at 2
+        inc     ecx             ; when shifted left in the adc below
+.gamma2_loop:
+        call    .get_dibits     ; read data bit, shift into cx, read continuation bit
+        jc      .gamma2_loop    ; loop until a zero continuation bit is read
+        ret
+        ; handle 7 bits offset + 1 bit len or 4 bits offset / 1 byte copy
+.other:
+        xor     ecx,ecx         ; ecx = 0: receives the copy length or the 4-bit offset below
+        call    ebp             ; read '7+1 match or short literal' bit
+        jc      .short_literal  ; 111: 4 bit offset for 1-byte copy
+                                ; 110: 7 bits offset + 1 bit length
+                                
+        movzx   edx,byte[esi]   ; read offset + length in dl
+        inc     esi
+        inc     ecx             ; prepare cx for length below
+        shr     dl,1            ; shift len bit into carry, and offset in place
+        je      .done           ; if zero offset: EOD
+        adc     ecx,ecx         ; len in cx: 1*2 + carry bit = 2 or 3
+        jmp     .got_len
+        ; 4 bits offset / 1 byte copy
+.short_literal:
+        call    .get_dibits     ; read 2 offset bits
+        adc     ecx,ecx         ; shift in the second bit of the pair (returned in carry)
+        call    .get_dibits     ; read 2 offset bits
+        adc     ecx,ecx         ; shift in the second bit of the pair; ZF is set when the offset is 0
+        xchg    eax,ecx         ; park bit queue in ecx, put offset in eax (the flags still come from the adc above)
+        jz      .write_zero     ; if offset is 0, write a zero byte
+                                ; short offset 1-15
+        mov     ebx,edi         ; ebx = output pointer, turned into output pointer - offset below
+        sub     ebx,eax         ; we trash bx, it will be reset to 3 when we loop
+        mov     al,[ebx]        ; read byte from short offset
+.write_zero:
+        stosb                   ; write the fetched byte, or 0 when the offset was 0
+        xchg    eax,ecx         ; restore bit queue in al
+        jmp     .next_command_after_literal
+.done:                          ; NOTE(review): [esp+32+8] is only loaded into edi when CDECL is defined - confirm the size is right for register-argument callers
+        sub     edi, [esp+32+8] ; decompressed size = final output pointer - output argument on the stack
+        mov     [esp+28], edi   ; overwrite the eax slot saved by pushad, so popad hands the size back in eax
+        popad
+        ret
diff --git a/tools/apultra/src/apultra.c b/tools/apultra/src/apultra.c
new file mode 100644
index 0000000..1b30fbe
--- /dev/null
+++ b/tools/apultra/src/apultra.c
@@ -0,0 +1,1223 @@
+/*
+ * apultra.c - command line compression utility for the apultra library
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke <zxintrospec@gmail.com>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _WIN32
+#include <windows.h>
+#include <sys/timeb.h>
+#else
+#include <sys/time.h>
+#endif
+#include "libapultra.h"
+
+#define OPT_VERBOSE        1   /* bit flag: time the operation and print a summary line on stdout */
+#define OPT_STATS          2   /* bit flag: do_compress prints the token, offset and length statistics */
+#define OPT_BACKWARD       4   /* bit flag: buffers are reversed around the codec calls (see do_reverse_buffer) */
+
+#define TOOL_VERSION "1.4.0"
+
+/*---------------------------------------------------------------------------*/
+
+#ifdef _WIN32
+static LARGE_INTEGER hpc_frequency;   /* filled in by QueryPerformanceFrequency(); divisor used by do_get_time() */
+static BOOL hpc_available = FALSE;    /* result of QueryPerformanceFrequency(); selects the timer used by do_get_time() */
+#endif
+
+static void do_init_time(void) {   /* prepare the state read by do_get_time(); compiles to nothing outside Windows */
+#ifdef _WIN32
+   hpc_frequency.QuadPart = 0;
+   hpc_available = QueryPerformanceFrequency(&hpc_frequency);
+#endif
+}
+
+static long long do_get_time(void) {   /* current time in microseconds; the callers in this file only use the difference of two readings */
+   long long nTime;
+
+#ifdef _WIN32
+   if (hpc_available) {
+      LARGE_INTEGER nCurTime;
+
+      /* Use HPC hardware for best precision */
+      QueryPerformanceCounter(&nCurTime);
+      nTime = (long long)(nCurTime.QuadPart * 1000000LL / hpc_frequency.QuadPart);
+   }
+   else {
+      struct _timeb tb;
+      _ftime(&tb);
+
+      nTime = ((long long)tb.time * 1000LL + (long long)tb.millitm) * 1000LL;   /* milliseconds scaled to microseconds */
+   }
+#else
+   struct timeval tm;
+   gettimeofday(&tm, NULL);
+
+   nTime = (long long)tm.tv_sec * 1000000LL + (long long)tm.tv_usec;
+#endif
+   return nTime;
+}
+
+static void do_reverse_buffer(unsigned char *pBuffer, size_t nBufferSize) {
+   /* Mirror the nBufferSize bytes at pBuffer in place; buffers of 0 or 1 bytes are left untouched */
+   size_t nLow = 0, nHigh = nBufferSize;   /* [nLow, nHigh) is the part that still has to be mirrored */
+
+   while (nHigh - nLow > 1) {
+      const unsigned char nSaved = pBuffer[nLow];
+      pBuffer[nLow++] = pBuffer[--nHigh];
+      pBuffer[nHigh] = nSaved;
+   }
+}
+
+/*---------------------------------------------------------------------------*/
+
+static void compression_progress(long long nOriginalSize, long long nCompressedSize) {
+   /* Progress callback handed to apultra_compress(): silent below 512 KiB of input, otherwise redraws one status line */
+   if (nOriginalSize < 512 * 1024) return;
+   printf("\r%lld => %lld (%g %%)     \b\b\b\b\b", nOriginalSize, nCompressedSize, (double)(nCompressedSize * 100.0 / nOriginalSize));
+   fflush(stdout);
+}
+
+static int do_compress(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions, const unsigned int nMaxWindowSize) {
+   /* Compress pszInFilename into pszOutFilename (nothing is written when it is NULL), optionally preceded by a dictionary file; returns 0 on success, 100 on error */
+   long long nStartTime = 0LL, nEndTime = 0LL;
+   size_t nOriginalSize = 0L, nCompressedSize = 0L, nMaxCompressedSize;
+   int nFlags = 0;
+   apultra_stats stats;
+   unsigned char *pDecompressedData;
+   unsigned char *pCompressedData;
+
+   if (nOptions & OPT_VERBOSE) {
+      nStartTime = do_get_time();
+   }
+
+   FILE* f_dict = NULL;
+   size_t nDictionarySize = 0;
+   if (pszDictionaryFilename) {
+      /* Open the dictionary */
+      f_dict = fopen(pszDictionaryFilename, "rb");
+      if (!f_dict) {
+         fprintf(stderr, "error opening dictionary '%s' for reading\n", pszDictionaryFilename);
+         return 100;
+      }
+
+      /* Get dictionary size */
+      fseek(f_dict, 0, SEEK_END);
+      nDictionarySize = (size_t)ftell(f_dict);
+      fseek(f_dict, 0, SEEK_SET);
+
+      if (nDictionarySize > BLOCK_SIZE) nDictionarySize = BLOCK_SIZE;   /* only the first BLOCK_SIZE bytes are used */
+   }
+
+   /* Read the whole original file in memory */
+   FILE *f_in = fopen(pszInFilename, "rb");
+   if (!f_in) {
+      if (f_dict) fclose(f_dict);
+      fprintf(stderr, "error opening '%s' for reading\n", pszInFilename);
+      return 100;
+   }
+
+   fseek(f_in, 0, SEEK_END);
+   nOriginalSize = (size_t)ftell(f_in);
+   fseek(f_in, 0, SEEK_SET);
+
+   pDecompressedData = (unsigned char*)malloc(nDictionarySize + nOriginalSize);
+   if (!pDecompressedData) {
+      fclose(f_in);
+      if (f_dict) fclose(f_dict);
+      fprintf(stderr, "out of memory for reading '%s', %zu bytes needed\n", pszInFilename, nOriginalSize);
+      return 100;
+   }
+
+   if (f_dict) {
+      /* Read dictionary data: placed before the input, or after it when the buffer gets reversed below */
+      if (fread(pDecompressedData + ((nOptions & OPT_BACKWARD) ? nOriginalSize : 0), 1, nDictionarySize, f_dict) != nDictionarySize) {
+         free(pDecompressedData);
+         fclose(f_in);
+         fclose(f_dict);
+         fprintf(stderr, "I/O error while reading dictionary '%s'\n", pszDictionaryFilename);
+         return 100;
+      }
+
+      fclose(f_dict);
+      f_dict = NULL;
+   }
+
+   /* Read input file data */
+   if (fread(pDecompressedData + ((nOptions & OPT_BACKWARD) ? 0 : nDictionarySize), 1, nOriginalSize, f_in) != nOriginalSize) {
+      free(pDecompressedData);
+      fclose(f_in);
+      fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
+      return 100;
+   }
+
+   fclose(f_in);
+
+   if (nOptions & OPT_BACKWARD)
+      do_reverse_buffer(pDecompressedData, nDictionarySize + nOriginalSize);
+
+   /* Allocate max compressed size */
+   nMaxCompressedSize = apultra_get_max_compressed_size(nDictionarySize + nOriginalSize);
+
+   pCompressedData = (unsigned char*)malloc(nMaxCompressedSize);
+   if (!pCompressedData) {
+      free(pDecompressedData);
+      fprintf(stderr, "out of memory for compressing '%s', %zu bytes needed\n", pszInFilename, nMaxCompressedSize);
+      return 100;
+   }
+
+   memset(pCompressedData, 0, nMaxCompressedSize);
+
+   nCompressedSize = apultra_compress(pDecompressedData, pCompressedData, nDictionarySize + nOriginalSize, nMaxCompressedSize, nFlags, nMaxWindowSize, nDictionarySize, compression_progress, &stats);
+
+   if ((nOptions & OPT_VERBOSE)) {
+      nEndTime = do_get_time();
+   }
+
+   if (nCompressedSize == -1) {
+      free(pCompressedData);
+      free(pDecompressedData);
+      fprintf(stderr, "compression error for '%s'\n", pszInFilename);
+      return 100;
+   }
+
+   if (nOptions & OPT_BACKWARD)
+      do_reverse_buffer(pCompressedData, nCompressedSize);
+
+   if (pszOutFilename) {
+      /* Write whole compressed file out; a failed open, short write or failed close is an error */
+      FILE *f_out = fopen(pszOutFilename, "wb");
+      int nWriteOk = f_out && fwrite(pCompressedData, 1, nCompressedSize, f_out) == nCompressedSize;
+      if (f_out && fclose(f_out) != 0) nWriteOk = 0;
+      if (!nWriteOk) {
+         free(pCompressedData);
+         free(pDecompressedData);
+         fprintf(stderr, "error writing '%s'\n", pszOutFilename);
+         return 100;
+      }
+   }
+
+   free(pCompressedData);
+   free(pDecompressedData);
+
+   if ((nOptions & OPT_VERBOSE)) {
+      double fDelta = ((double)(nEndTime - nStartTime)) / 1000000.0;
+      double fSpeed = ((double)nOriginalSize / 1048576.0) / fDelta;
+      fprintf(stdout, "\rCompressed '%s' in %g seconds, %.02g Mb/s, %d tokens (%g bytes/token), %d into %d bytes ==> %g %%\n",
+         pszInFilename, fDelta, fSpeed, stats.commands_divisor, (double)nOriginalSize / (double)stats.commands_divisor,
+         (int)nOriginalSize, (int)nCompressedSize, (double)(nCompressedSize * 100.0 / nOriginalSize));
+   }
+
+   if (nOptions & OPT_STATS) {
+      fprintf(stdout, "Tokens: literals: %d short matches: %d normal matches: %d large matches: %d rep matches: %d EOD: %d\n",
+         stats.num_literals, stats.num_4bit_matches, stats.num_7bit_matches, stats.num_variable_matches, stats.num_rep_matches, stats.num_eod);
+      if (stats.match_divisor > 0) {
+         fprintf(stdout, "Offsets: min: %d avg: %d max: %d count: %d\n", stats.min_offset, (int)(stats.total_offsets / (long long)stats.match_divisor), stats.max_offset, stats.match_divisor);
+         fprintf(stdout, "Match lens: min: %d avg: %d max: %d count: %d\n", stats.min_match_len, stats.total_match_lens / stats.match_divisor, stats.max_match_len, stats.match_divisor);
+      }
+      else {
+         fprintf(stdout, "Offsets: none\n");
+         fprintf(stdout, "Match lens: none\n");
+      }
+      if (stats.rle1_divisor > 0) {
+         fprintf(stdout, "RLE1 lens: min: %d avg: %d max: %d count: %d\n", stats.min_rle1_len, stats.total_rle1_lens / stats.rle1_divisor, stats.max_rle1_len, stats.rle1_divisor);
+      }
+      else {
+         fprintf(stdout, "RLE1 lens: none\n");
+      }
+      if (stats.rle2_divisor > 0) {
+         fprintf(stdout, "RLE2 lens: min: %d avg: %d max: %d count: %d\n", stats.min_rle2_len, stats.total_rle2_lens / stats.rle2_divisor, stats.max_rle2_len, stats.rle2_divisor);
+      }
+      else {
+         fprintf(stdout, "RLE2 lens: none\n");
+      }
+      fprintf(stdout, "Safe distance: %d (0x%X)\n", stats.safe_dist, stats.safe_dist);
+   }
+   return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+
+static int do_decompress(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions) {
+   /* Decompress pszInFilename into pszOutFilename (nothing is written when it is NULL), optionally preceded by a dictionary file; returns 0 on success, 100 on error */
+   long long nStartTime = 0LL, nEndTime = 0LL;
+   size_t nCompressedSize, nMaxDecompressedSize, nOriginalSize;
+   unsigned char *pCompressedData;
+   unsigned char *pDecompressedData;
+   int nFlags = 0;
+
+   /* Read the whole compressed file in memory */
+   FILE *f_in = fopen(pszInFilename, "rb");
+   if (!f_in) {
+      fprintf(stderr, "error opening '%s' for reading\n", pszInFilename);
+      return 100;
+   }
+
+   fseek(f_in, 0, SEEK_END);
+   nCompressedSize = (size_t)ftell(f_in);
+   fseek(f_in, 0, SEEK_SET);
+
+   pCompressedData = (unsigned char*)malloc(nCompressedSize);
+   if (!pCompressedData) {
+      fclose(f_in);
+      fprintf(stderr, "out of memory for reading '%s', %zu bytes needed\n", pszInFilename, nCompressedSize);
+      return 100;
+   }
+
+   if (fread(pCompressedData, 1, nCompressedSize, f_in) != nCompressedSize) {
+      free(pCompressedData);
+      fclose(f_in);
+      fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
+      return 100;
+   }
+
+   fclose(f_in);
+
+   if (nOptions & OPT_BACKWARD)
+      do_reverse_buffer(pCompressedData, nCompressedSize);
+
+   /* Get max decompressed size */
+   nMaxDecompressedSize = apultra_get_max_decompressed_size(pCompressedData, nCompressedSize, nFlags);
+   if (nMaxDecompressedSize == -1) {
+      free(pCompressedData);
+      fprintf(stderr, "invalid compressed format for file '%s'\n", pszInFilename);
+      return 100;
+   }
+
+   FILE* f_dict = NULL;
+   size_t nDictionarySize = 0;
+   if (pszDictionaryFilename) {
+      /* Open the dictionary */
+      f_dict = fopen(pszDictionaryFilename, "rb");
+      if (!f_dict) {
+         free(pCompressedData);
+         fprintf(stderr, "error opening dictionary '%s' for reading\n", pszDictionaryFilename);
+         return 100;
+      }
+
+      /* Get dictionary size */
+      fseek(f_dict, 0, SEEK_END);
+      nDictionarySize = (size_t)ftell(f_dict);
+      fseek(f_dict, 0, SEEK_SET);
+
+      if (nDictionarySize > BLOCK_SIZE) nDictionarySize = BLOCK_SIZE;   /* only the first BLOCK_SIZE bytes are used */
+   }
+
+   /* Allocate max decompressed size, plus room for the dictionary in front of it */
+   pDecompressedData = (unsigned char*)malloc(nDictionarySize + nMaxDecompressedSize);
+   if (!pDecompressedData) {
+      free(pCompressedData);
+      if (f_dict) fclose(f_dict);
+      fprintf(stderr, "out of memory for decompressing '%s', %zu bytes needed\n", pszInFilename, nMaxDecompressedSize);
+      return 100;
+   }
+
+   memset(pDecompressedData, 0, nDictionarySize + nMaxDecompressedSize);
+
+   if (f_dict) {
+      /* Read dictionary data; f_in was closed above, so only the two buffers and f_dict remain to release */
+      if (fread(pDecompressedData, 1, nDictionarySize, f_dict) != nDictionarySize) {
+         free(pDecompressedData);
+         free(pCompressedData);
+         fclose(f_dict);
+         fprintf(stderr, "I/O error while reading dictionary '%s'\n", pszDictionaryFilename);
+         return 100;
+      }
+
+      fclose(f_dict);
+      f_dict = NULL;
+
+      if (nOptions & OPT_BACKWARD)
+         do_reverse_buffer(pDecompressedData, nDictionarySize);
+   }
+
+   if (nOptions & OPT_VERBOSE) {
+      nStartTime = do_get_time();
+   }
+
+   nOriginalSize = apultra_decompress(pCompressedData, pDecompressedData, nCompressedSize, nMaxDecompressedSize, nDictionarySize, nFlags);
+   if (nOriginalSize == -1) {
+      free(pDecompressedData);
+      free(pCompressedData);
+
+      fprintf(stderr, "decompression error for '%s'\n", pszInFilename);
+      return 100;
+   }
+
+   if (nOptions & OPT_BACKWARD)
+      do_reverse_buffer(pDecompressedData + nDictionarySize, nOriginalSize);
+
+   if (pszOutFilename) {
+      /* Write whole decompressed file out (dictionary excluded); a failed open, short write or failed close is an error */
+      FILE *f_out = fopen(pszOutFilename, "wb");
+      int nWriteOk = f_out && fwrite(pDecompressedData + nDictionarySize, 1, nOriginalSize, f_out) == nOriginalSize;
+      if (f_out && fclose(f_out) != 0) nWriteOk = 0;
+      if (!nWriteOk) {
+         free(pDecompressedData);
+         free(pCompressedData);
+         fprintf(stderr, "error writing '%s'\n", pszOutFilename);
+         return 100;
+      }
+   }
+
+   free(pDecompressedData);
+   free(pCompressedData);
+
+   if (nOptions & OPT_VERBOSE) {
+      nEndTime = do_get_time();
+      double fDelta = ((double)(nEndTime - nStartTime)) / 1000000.0;
+      double fSpeed = ((double)nOriginalSize / 1048576.0) / fDelta;
+      fprintf(stdout, "Decompressed '%s' in %g seconds, %g Mb/s\n",
+         pszInFilename, fDelta, fSpeed);
+   }
+
+   return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+
+static int do_compare(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions) {
+   /* Decompress pszInFilename in memory and compare it with pszOutFilename, which is the original file here and is only read; returns 0 when identical, 100 on mismatch or error */
+   long long nStartTime = 0LL, nEndTime = 0LL;
+   size_t nCompressedSize, nMaxDecompressedSize, nOriginalSize, nDecompressedSize;
+   unsigned char *pCompressedData = NULL;
+   unsigned char *pOriginalData = NULL;
+   unsigned char *pDecompressedData = NULL;
+   int nFlags = 0;
+
+   /* Read the whole compressed file in memory */
+   FILE *f_in = fopen(pszInFilename, "rb");
+   if (!f_in) {
+      fprintf(stderr, "error opening '%s' for reading\n", pszInFilename);
+      return 100;
+   }
+
+   fseek(f_in, 0, SEEK_END);
+   nCompressedSize = (size_t)ftell(f_in);
+   fseek(f_in, 0, SEEK_SET);
+
+   pCompressedData = (unsigned char*)malloc(nCompressedSize);
+   if (!pCompressedData) {
+      fclose(f_in);
+      fprintf(stderr, "out of memory for reading '%s', %zu bytes needed\n", pszInFilename, nCompressedSize);
+      return 100;
+   }
+
+   if (fread(pCompressedData, 1, nCompressedSize, f_in) != nCompressedSize) {
+      free(pCompressedData);
+      fclose(f_in);
+      fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
+      return 100;
+   }
+
+   fclose(f_in);
+
+   if (nOptions & OPT_BACKWARD)
+      do_reverse_buffer(pCompressedData, nCompressedSize);
+
+   /* Read the whole original file in memory; errors below name pszOutFilename, the file actually being read */
+   f_in = fopen(pszOutFilename, "rb");
+   if (!f_in) {
+      free(pCompressedData);
+      fprintf(stderr, "error opening '%s' for reading\n", pszOutFilename);
+      return 100;
+   }
+
+   fseek(f_in, 0, SEEK_END);
+   nOriginalSize = (size_t)ftell(f_in);
+   fseek(f_in, 0, SEEK_SET);
+
+   pOriginalData = (unsigned char*)malloc(nOriginalSize);
+   if (!pOriginalData) {
+      fclose(f_in);
+      free(pCompressedData);
+      fprintf(stderr, "out of memory for reading '%s', %zu bytes needed\n", pszOutFilename, nOriginalSize);
+      return 100;
+   }
+
+   if (fread(pOriginalData, 1, nOriginalSize, f_in) != nOriginalSize) {
+      free(pOriginalData);
+      fclose(f_in);
+      free(pCompressedData);
+      fprintf(stderr, "I/O error while reading '%s'\n", pszOutFilename);
+      return 100;
+   }
+
+   fclose(f_in);
+
+   /* Get max decompressed size */
+   nMaxDecompressedSize = apultra_get_max_decompressed_size(pCompressedData, nCompressedSize, nFlags);
+   if (nMaxDecompressedSize == -1) {
+      free(pOriginalData);
+      free(pCompressedData);
+      fprintf(stderr, "invalid compressed format for file '%s'\n", pszInFilename);
+      return 100;
+   }
+
+   FILE* f_dict = NULL;
+   size_t nDictionarySize = 0;
+   if (pszDictionaryFilename) {
+      /* Open the dictionary */
+      f_dict = fopen(pszDictionaryFilename, "rb");
+      if (!f_dict) {
+         free(pOriginalData);
+         free(pCompressedData);
+         fprintf(stderr, "error opening dictionary '%s' for reading\n", pszDictionaryFilename);
+         return 100;
+      }
+
+      /* Get dictionary size */
+      fseek(f_dict, 0, SEEK_END);
+      nDictionarySize = (size_t)ftell(f_dict);
+      fseek(f_dict, 0, SEEK_SET);
+
+      if (nDictionarySize > BLOCK_SIZE) nDictionarySize = BLOCK_SIZE;   /* only the first BLOCK_SIZE bytes are used */
+   }
+
+   /* Allocate max decompressed size, plus room for the dictionary in front of it */
+   pDecompressedData = (unsigned char*)malloc(nDictionarySize + nMaxDecompressedSize);
+   if (!pDecompressedData) {
+      free(pOriginalData);
+      free(pCompressedData);
+      if (f_dict) fclose(f_dict);
+      fprintf(stderr, "out of memory for decompressing '%s', %zu bytes needed\n", pszInFilename, nMaxDecompressedSize);
+      return 100;
+   }
+
+   memset(pDecompressedData, 0, nDictionarySize + nMaxDecompressedSize);
+
+   if (f_dict) {
+      /* Read dictionary data; f_in was closed above, so only the three buffers and f_dict remain to release */
+      if (fread(pDecompressedData, 1, nDictionarySize, f_dict) != nDictionarySize) {
+         free(pDecompressedData);
+         free(pOriginalData);
+         free(pCompressedData);
+         fclose(f_dict);
+         fprintf(stderr, "I/O error while reading dictionary '%s'\n", pszDictionaryFilename);
+         return 100;
+      }
+
+      fclose(f_dict);
+      f_dict = NULL;
+
+      if (nOptions & OPT_BACKWARD)
+         do_reverse_buffer(pDecompressedData, nDictionarySize);
+   }
+
+   if (nOptions & OPT_VERBOSE) {
+      nStartTime = do_get_time();
+   }
+
+   nDecompressedSize = apultra_decompress(pCompressedData, pDecompressedData, nCompressedSize, nMaxDecompressedSize, nDictionarySize, nFlags);
+   if (nDecompressedSize == -1) {
+      free(pDecompressedData);
+      free(pOriginalData);
+      free(pCompressedData);
+
+      fprintf(stderr, "decompression error for '%s'\n", pszInFilename);
+      return 100;
+   }
+
+   if (nOptions & OPT_BACKWARD)
+      do_reverse_buffer(pDecompressedData + nDictionarySize, nDecompressedSize);
+
+   const int nMismatch = (nDecompressedSize != nOriginalSize || memcmp(pDecompressedData + nDictionarySize, pOriginalData, nOriginalSize));   /* evaluated before the buffers are released */
+   free(pDecompressedData);
+   free(pOriginalData);
+   free(pCompressedData);
+   if (nMismatch) {
+      fprintf(stderr, "error comparing compressed file '%s' with original '%s'\n", pszInFilename, pszOutFilename);
+      return 100;
+   }
+
+   if (nOptions & OPT_VERBOSE) {
+      nEndTime = do_get_time();
+      double fDelta = ((double)(nEndTime - nStartTime)) / 1000000.0;
+      double fSpeed = ((double)nOriginalSize / 1048576.0) / fDelta;
+      fprintf(stdout, "Compared '%s' in %g seconds, %g Mb/s\n",
+         pszInFilename, fDelta, fSpeed);
+   }
+
+   return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+
+static void generate_compressible_data(unsigned char *pBuffer, size_t nBufferSize, unsigned int nSeed, int nNumLiteralValues, float fMatchProbability) {
+   /* Fill pBuffer with reproducible pseudo-random data: runs of random literals mixed with copies of earlier output */
+   const int nMatchThreshold = (int)(fMatchProbability * 1023.0f);   /* probability rescaled to the 10-bit draw used below */
+   size_t nPos = 0;
+
+   srand(nSeed);   /* reseed first, even for an empty buffer */
+   if (nBufferSize == 0) return;
+
+   /* The first byte is always a literal */
+   pBuffer[nPos++] = rand() % nNumLiteralValues;
+
+   while (nPos < nBufferSize) {
+      const size_t nRemaining = nBufferSize - nPos;
+
+      if ((rand() & 1023) < nMatchThreshold) {
+         /* Match run: MIN_MATCH_SIZE plus a 10-bit draw, clamped to the room left and to the amount already generated */
+         size_t nRunLength = MIN_MATCH_SIZE + (rand() & 1023);
+         size_t nDistance = 0;
+
+         if (nRunLength > nRemaining) nRunLength = nRemaining;
+         if (nRunLength > nPos) nRunLength = nPos;
+
+         /* A distance is only drawn when the run is shorter than what has been generated so far */
+         if (nRunLength < nPos) nDistance = rand() % (nPos - nRunLength);
+
+         for (; nRunLength > 0; nRunLength--, nPos++)
+            pBuffer[nPos] = pBuffer[nPos - nDistance];
+      }
+      else {
+         /* Literal run: a 7-bit draw (0..127 bytes), clamped to the room left */
+         size_t nRunLength = rand() & 127;
+
+         if (nRunLength > nRemaining) nRunLength = nRemaining;
+
+         while (nRunLength-- > 0)
+            pBuffer[nPos++] = rand() % nNumLiteralValues;
+      }
+   }
+}
+
+static void xor_data(unsigned char *pBuffer, size_t nBufferSize, unsigned int nSeed, float fXorProbability) {
+   /* Damage pBuffer reproducibly: every byte whose 10-bit draw falls below the scaled probability has all its bits inverted */
+   const int nFlipThreshold = (int)(fXorProbability * 1023.0f);
+   unsigned char *pCur = pBuffer;
+   size_t nLeft;
+
+   srand(nSeed);
+
+   /* One draw per byte, in buffer order, so a given seed always hits the same bytes */
+   for (nLeft = nBufferSize; nLeft > 0; nLeft--, pCur++) {
+      const int nDraw = rand() & 1023;
+
+      if (nDraw < nFlipThreshold) *pCur = (unsigned char)~*pCur;
+   }
+}
+
+static int do_self_test(const unsigned int nOptions, const unsigned int nMaxWindowSize, const int nIsQuickTest) {
+   unsigned char *pGeneratedData;
+   unsigned char *pCompressedData;
+   unsigned char *pTmpCompressedData;
+   unsigned char *pTmpDecompressedData;
+   size_t nGeneratedDataSize;
+   size_t nMaxCompressedDataSize;
+   unsigned int nSeed = 123;
+   int nFlags = 0;
+   int i;
+
+   pGeneratedData = (unsigned char*)malloc(4 * BLOCK_SIZE);
+   if (!pGeneratedData) {
+      fprintf(stderr, "out of memory, %d bytes needed\n", 4 * BLOCK_SIZE);
+      return 100;
+   }
+
+   nMaxCompressedDataSize = apultra_get_max_compressed_size(4 * BLOCK_SIZE);
+   pCompressedData = (unsigned char*)malloc(nMaxCompressedDataSize);
+   if (!pCompressedData) {
+      free(pGeneratedData);
+      pGeneratedData = NULL;
+
+      fprintf(stderr, "out of memory, %zd bytes needed\n", nMaxCompressedDataSize);
+      return 100;
+   }
+
+   pTmpCompressedData = (unsigned char*)malloc(nMaxCompressedDataSize);
+   if (!pTmpCompressedData) {
+      free(pCompressedData);
+      pCompressedData = NULL;
+      free(pGeneratedData);
+      pGeneratedData = NULL;
+
+      fprintf(stderr, "out of memory, %zd bytes needed\n", nMaxCompressedDataSize);
+      return 100;
+   }
+
+   pTmpDecompressedData = (unsigned char*)malloc(4 * BLOCK_SIZE);
+   if (!pTmpDecompressedData) {
+      free(pTmpCompressedData);
+      pTmpCompressedData = NULL;
+      free(pCompressedData);
+      pCompressedData = NULL;
+      free(pGeneratedData);
+      pGeneratedData = NULL;
+
+      fprintf(stderr, "out of memory, %d bytes needed\n", 4 * BLOCK_SIZE);
+      return 100;
+   }
+
+   memset(pGeneratedData, 0, 4 * BLOCK_SIZE);
+   memset(pCompressedData, 0, nMaxCompressedDataSize);
+   memset(pTmpCompressedData, 0, nMaxCompressedDataSize);
+
+   /* Test compressing with a too small buffer to do anything, expect to fail cleanly */
+   for (i = 0; i < 12; i++) {
+      generate_compressible_data(pGeneratedData, i, nSeed, 256, 0.5f);
+      apultra_compress(pGeneratedData, pCompressedData, i, i, nFlags, nMaxWindowSize, 0 /* dictionary size */, NULL, NULL);
+   }
+
+   size_t nDataSizeStep = 128;
+   float fProbabilitySizeStep = nIsQuickTest ? 0.005f : 0.0005f;
+
+   for (nGeneratedDataSize = 1024; nGeneratedDataSize <= (nIsQuickTest ? 1024U : (4U * BLOCK_SIZE)); nGeneratedDataSize += nDataSizeStep) {
+      float fMatchProbability;
+
+      fprintf(stdout, "size %zd", nGeneratedDataSize);
+      for (fMatchProbability = 0; fMatchProbability <= 0.995f; fMatchProbability += fProbabilitySizeStep) {
+         int nNumLiteralValues[12] = { 1, 2, 3, 15, 30, 56, 96, 137, 178, 191, 255, 256 };
+         float fXorProbability;
+
+         fputc('.', stdout);
+         fflush(stdout);
+
+         for (i = 0; i < 12; i++) {
+            /* Generate data to compress */
+            generate_compressible_data(pGeneratedData, nGeneratedDataSize, nSeed, nNumLiteralValues[i], fMatchProbability);
+
+            /* Try to compress it, expected to succeed */
+            size_t nActualCompressedSize = apultra_compress(pGeneratedData, pCompressedData, nGeneratedDataSize, apultra_get_max_compressed_size(nGeneratedDataSize),
+               nFlags, nMaxWindowSize, 0 /* dictionary size */, NULL, NULL);
+            if (nActualCompressedSize == -1 || nActualCompressedSize < (1 + 1 + 1 /* footer */)) {
+               free(pTmpDecompressedData);
+               pTmpDecompressedData = NULL;
+               free(pTmpCompressedData);
+               pTmpCompressedData = NULL;
+               free(pCompressedData);
+               pCompressedData = NULL;
+               free(pGeneratedData);
+               pGeneratedData = NULL;
+
+               fprintf(stderr, "\nself-test: error compressing size %zd, seed %d, match probability %f, literals range %d\n", nGeneratedDataSize, nSeed, fMatchProbability, nNumLiteralValues[i]);
+               return 100;
+            }
+
+            /* Try to decompress it, expected to succeed */
+            size_t nActualDecompressedSize;
+            nActualDecompressedSize = apultra_decompress(pCompressedData, pTmpDecompressedData, nActualCompressedSize, nGeneratedDataSize, 0 /* dictionary size */, nFlags);
+            if (nActualDecompressedSize == -1) {
+               free(pTmpDecompressedData);
+               pTmpDecompressedData = NULL;
+               free(pTmpCompressedData);
+               pTmpCompressedData = NULL;
+               free(pCompressedData);
+               pCompressedData = NULL;
+               free(pGeneratedData);
+               pGeneratedData = NULL;
+
+               fprintf(stderr, "\nself-test: error decompressing size %zd, seed %d, match probability %f, literals range %d\n", nGeneratedDataSize, nSeed, fMatchProbability, nNumLiteralValues[i]);
+               return 100;
+            }
+
+            if (memcmp(pGeneratedData, pTmpDecompressedData, nGeneratedDataSize)) {
+               free(pTmpDecompressedData);
+               pTmpDecompressedData = NULL;
+               free(pTmpCompressedData);
+               pTmpCompressedData = NULL;
+               free(pCompressedData);
+               pCompressedData = NULL;
+               free(pGeneratedData);
+               pGeneratedData = NULL;
+
+               fprintf(stderr, "\nself-test: error comparing decompressed and original data, size %zd, seed %d, match probability %f, literals range %d\n", nGeneratedDataSize, nSeed, fMatchProbability, nNumLiteralValues[i]);
+               return 100;
+            }
+
+            /* Try to decompress corrupted data, expected to fail cleanly, without crashing or corrupting memory outside the output buffer */
+            for (fXorProbability = 0.05f; fXorProbability <= 0.5f; fXorProbability += 0.05f) {
+               memcpy(pTmpCompressedData, pCompressedData, nActualCompressedSize);
+               xor_data(pTmpCompressedData, nActualCompressedSize, nSeed, fXorProbability);
+               apultra_decompress(pTmpCompressedData, pGeneratedData, nActualCompressedSize, nGeneratedDataSize, 0 /* dictionary size */, nFlags);
+            }
+         }
+
+         nSeed++;
+      }
+
+      fputc(10, stdout);
+      fflush(stdout);
+
+      nDataSizeStep <<= 1;
+      if (nDataSizeStep > (128 * 4096))
+         nDataSizeStep = 128 * 4096;
+      fProbabilitySizeStep *= 1.25;
+      if (fProbabilitySizeStep > (0.0005f * 4096))
+         fProbabilitySizeStep = 0.0005f * 4096;
+   }
+
+   free(pTmpDecompressedData);
+   pTmpDecompressedData = NULL;
+
+   free(pTmpCompressedData);
+   pTmpCompressedData = NULL;
+
+   free(pCompressedData);
+   pCompressedData = NULL;
+
+   free(pGeneratedData);
+   pGeneratedData = NULL;
+
+   fprintf(stdout, "All tests passed.\n");
+   return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/**
+ * Benchmark in-memory compression of a file
+ *
+ * Loads the whole input file, compresses it 5 times into an output area surrounded by guard bytes,
+ * keeps the best time, verifies that nothing was written outside of the output area, then writes
+ * the compressed data out (if an output filename is given) and prints the results.
+ *
+ * @param pszInFilename name of file to compress
+ * @param pszOutFilename name of file to write the compressed data to, or NULL to skip writing
+ * @param pszDictionaryFilename must be NULL, dictionaries are not supported when benchmarking
+ * @param nOptions OPT_* options; only OPT_BACKWARD is looked at here
+ * @param nMaxWindowSize maximum window size, handed as-is to apultra_compress()
+ *
+ * @return 0 for success, 100 for error
+ */
+static int do_compr_benchmark(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions, const unsigned int nMaxWindowSize) {
+   const int nGuardSize = 1024;
+   size_t nFileSize, nMaxCompressedSize;
+   unsigned char *pFileData;
+   unsigned char *pCompressedData;
+   long nFileEndPos;
+   int nFlags = 0;
+   int nResult = 0;
+   int i;
+
+   if (pszDictionaryFilename) {
+      fprintf(stderr, "in-memory benchmarking does not support dictionaries\n");
+      return 100;
+   }
+
+   /* Read the whole original file in memory */
+
+   FILE *f_in = fopen(pszInFilename, "rb");
+   if (!f_in) {
+      fprintf(stderr, "error opening '%s' for reading\n", pszInFilename);
+      return 100;
+   }
+
+   /* ftell() returns -1 on failure; catch it here instead of letting it become a huge size_t */
+   if (fseek(f_in, 0, SEEK_END) != 0 || (nFileEndPos = ftell(f_in)) < 0 || fseek(f_in, 0, SEEK_SET) != 0) {
+      fclose(f_in);
+      fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
+      return 100;
+   }
+   nFileSize = (size_t)nFileEndPos;
+
+   /* Never ask for 0 bytes: malloc(0) is allowed to return NULL, which would look like out of memory */
+   pFileData = (unsigned char*)malloc(nFileSize ? nFileSize : 1);
+   if (!pFileData) {
+      fclose(f_in);
+      fprintf(stderr, "out of memory for reading '%s', %zu bytes needed\n", pszInFilename, nFileSize);
+      return 100;
+   }
+
+   if (fread(pFileData, 1, nFileSize, f_in) != nFileSize) {
+      free(pFileData);
+      fclose(f_in);
+      fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
+      return 100;
+   }
+
+   fclose(f_in);
+
+   if (nOptions & OPT_BACKWARD)
+      do_reverse_buffer(pFileData, nFileSize);
+
+   /* Allocate max compressed size, plus one guard area on each side */
+
+   nMaxCompressedSize = apultra_get_max_compressed_size(nFileSize);
+
+   pCompressedData = (unsigned char*)malloc(nMaxCompressedSize + 2 * nGuardSize);
+   if (!pCompressedData) {
+      free(pFileData);
+      fprintf(stderr, "out of memory for compressing '%s', %zu bytes needed\n", pszInFilename, nMaxCompressedSize);
+      return 100;
+   }
+
+   memset(pCompressedData + nGuardSize, 0, nMaxCompressedSize);
+
+   long long nBestCompTime = -1;
+
+   size_t nActualCompressedSize = 0;
+   size_t nRightGuardPos = nMaxCompressedSize;
+
+   for (i = 0; i < 5; i++) {
+      /* Use a different guard value on every pass so that stale guard bytes can't hide an overwrite */
+      unsigned char nGuard = (unsigned char)(0x33 + i);
+      int j;
+
+      /* Write guard bytes around the output buffer, to help check for writes outside of it by the compressor */
+      memset(pCompressedData, nGuard, nGuardSize);
+      memset(pCompressedData + nGuardSize + nRightGuardPos, nGuard, nGuardSize);
+
+      long long t0 = do_get_time();
+      nActualCompressedSize = apultra_compress(pFileData, pCompressedData + nGuardSize, nFileSize, nRightGuardPos, nFlags, nMaxWindowSize, 0 /* dictionary size */, NULL, NULL);
+      long long t1 = do_get_time();
+      if (nActualCompressedSize == (size_t)-1) {
+         free(pCompressedData);
+         free(pFileData);
+         fprintf(stderr, "compression error\n");
+         return 100;
+      }
+
+      long long nCurCompTime = t1 - t0;
+      if (nBestCompTime == -1 || nBestCompTime > nCurCompTime)
+         nBestCompTime = nCurCompTime;
+
+      /* Check guard bytes before the output buffer */
+      for (j = 0; j < nGuardSize; j++) {
+         if (pCompressedData[j] != nGuard) {
+            free(pCompressedData);
+            free(pFileData);
+            fprintf(stderr, "error, wrote outside of output buffer at %d!\n", j - nGuardSize);
+            return 100;
+         }
+      }
+
+      /* Check guard bytes after the output buffer */
+      for (j = 0; j < nGuardSize; j++) {
+         if (pCompressedData[nGuardSize + nRightGuardPos + j] != nGuard) {
+            free(pCompressedData);
+            free(pFileData);
+            fprintf(stderr, "error, wrote outside of output buffer at %d!\n", j);
+            return 100;
+         }
+      }
+
+      /* The next pass gets exactly the space this pass used, so the right guard sits directly after the data */
+      nRightGuardPos = nActualCompressedSize;
+   }
+
+   if (nOptions & OPT_BACKWARD)
+      do_reverse_buffer(pCompressedData + nGuardSize, nActualCompressedSize);
+
+   if (pszOutFilename) {
+      FILE *f_out;
+
+      /* Write whole compressed file out, and report failures instead of dropping them */
+
+      f_out = fopen(pszOutFilename, "wb");
+      if (f_out) {
+         int nWriteFailed = (fwrite(pCompressedData + nGuardSize, 1, nActualCompressedSize, f_out) != nActualCompressedSize);
+
+         /* fclose() flushes, so it can be the call that reports the write error */
+         if (fclose(f_out) != 0)
+            nWriteFailed = 1;
+
+         if (nWriteFailed) {
+            fprintf(stderr, "I/O error while writing '%s'\n", pszOutFilename);
+            nResult = 100;
+         }
+      }
+      else {
+         fprintf(stderr, "error opening '%s' for writing\n", pszOutFilename);
+         nResult = 100;
+      }
+   }
+
+   free(pCompressedData);
+   free(pFileData);
+
+   /* NOTE(review): the rate below is derived from the compressed size, not from the input size - confirm this is intended */
+   fprintf(stdout, "compressed size: %zu bytes\n", nActualCompressedSize);
+   fprintf(stdout, "compression time: %lld microseconds (%g Mb/s)\n", nBestCompTime, ((double)nActualCompressedSize / 1024.0) / ((double)nBestCompTime / 1000.0));
+
+   return nResult;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/**
+ * Benchmark in-memory decompression of a file
+ *
+ * Loads the whole compressed file, decompresses it 50 times, keeps the best time, then writes the
+ * decompressed data out (if an output filename is given) and prints the results.
+ *
+ * @param pszInFilename name of compressed file to read
+ * @param pszOutFilename name of file to write the decompressed data to, or NULL to skip writing
+ * @param pszDictionaryFilename must be NULL, dictionaries are not supported when benchmarking
+ * @param nOptions OPT_* options; only OPT_BACKWARD is looked at here
+ *
+ * @return 0 for success, 100 for error
+ */
+static int do_dec_benchmark(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions) {
+   size_t nFileSize, nMaxDecompressedSize;
+   unsigned char *pFileData;
+   unsigned char *pDecompressedData;
+   long nFileEndPos;
+   int nFlags = 0;
+   int nResult = 0;
+   int i;
+
+   if (pszDictionaryFilename) {
+      fprintf(stderr, "in-memory benchmarking does not support dictionaries\n");
+      return 100;
+   }
+
+   /* Read the whole compressed file in memory */
+
+   FILE *f_in = fopen(pszInFilename, "rb");
+   if (!f_in) {
+      fprintf(stderr, "error opening '%s' for reading\n", pszInFilename);
+      return 100;
+   }
+
+   /* ftell() returns -1 on failure; catch it here instead of letting it become a huge size_t */
+   if (fseek(f_in, 0, SEEK_END) != 0 || (nFileEndPos = ftell(f_in)) < 0 || fseek(f_in, 0, SEEK_SET) != 0) {
+      fclose(f_in);
+      fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
+      return 100;
+   }
+   nFileSize = (size_t)nFileEndPos;
+
+   /* Never ask for 0 bytes: malloc(0) is allowed to return NULL, which would look like out of memory */
+   pFileData = (unsigned char*)malloc(nFileSize ? nFileSize : 1);
+   if (!pFileData) {
+      fclose(f_in);
+      fprintf(stderr, "out of memory for reading '%s', %zu bytes needed\n", pszInFilename, nFileSize);
+      return 100;
+   }
+
+   if (fread(pFileData, 1, nFileSize, f_in) != nFileSize) {
+      free(pFileData);
+      fclose(f_in);
+      fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
+      return 100;
+   }
+
+   fclose(f_in);
+
+   if (nOptions & OPT_BACKWARD)
+      do_reverse_buffer(pFileData, nFileSize);
+
+   /* Allocate max decompressed size */
+
+   nMaxDecompressedSize = apultra_get_max_decompressed_size(pFileData, nFileSize, nFlags);
+   if (nMaxDecompressedSize == (size_t)-1) {
+      free(pFileData);
+      fprintf(stderr, "invalid compressed format for file '%s'\n", pszInFilename);
+      return 100;
+   }
+
+   pDecompressedData = (unsigned char*)malloc(nMaxDecompressedSize);
+   if (!pDecompressedData) {
+      free(pFileData);
+      fprintf(stderr, "out of memory for decompressing '%s', %zu bytes needed\n", pszInFilename, nMaxDecompressedSize);
+      return 100;
+   }
+
+   memset(pDecompressedData, 0, nMaxDecompressedSize);
+
+   long long nBestDecTime = -1;
+
+   size_t nActualDecompressedSize = 0;
+   for (i = 0; i < 50; i++) {
+      long long t0 = do_get_time();
+      nActualDecompressedSize = apultra_decompress(pFileData, pDecompressedData, nFileSize, nMaxDecompressedSize, 0 /* dictionary size */, nFlags);
+      long long t1 = do_get_time();
+      if (nActualDecompressedSize == (size_t)-1) {
+         free(pDecompressedData);
+         free(pFileData);
+         fprintf(stderr, "decompression error\n");
+         return 100;
+      }
+
+      /* Keep the fastest of all passes */
+      long long nCurDecTime = t1 - t0;
+      if (nBestDecTime == -1 || nBestDecTime > nCurDecTime)
+         nBestDecTime = nCurDecTime;
+   }
+
+   if (nOptions & OPT_BACKWARD)
+      do_reverse_buffer(pDecompressedData, nActualDecompressedSize);
+
+   if (pszOutFilename) {
+      FILE *f_out;
+
+      /* Write whole decompressed file out, and report failures instead of dropping them */
+
+      f_out = fopen(pszOutFilename, "wb");
+      if (f_out) {
+         int nWriteFailed = (fwrite(pDecompressedData, 1, nActualDecompressedSize, f_out) != nActualDecompressedSize);
+
+         /* fclose() flushes, so it can be the call that reports the write error */
+         if (fclose(f_out) != 0)
+            nWriteFailed = 1;
+
+         if (nWriteFailed) {
+            fprintf(stderr, "I/O error while writing '%s'\n", pszOutFilename);
+            nResult = 100;
+         }
+      }
+      else {
+         fprintf(stderr, "error opening '%s' for writing\n", pszOutFilename);
+         nResult = 100;
+      }
+   }
+
+   free(pDecompressedData);
+   free(pFileData);
+
+   fprintf(stdout, "decompressed size: %zu bytes\n", nActualDecompressedSize);
+   fprintf(stdout, "decompression time: %lld microseconds (%g Mb/s)\n", nBestDecTime, ((double)nActualDecompressedSize / 1024.0) / ((double)nBestDecTime / 1000.0));
+
+   return nResult;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/**
+ * Parse the value given to the -w option
+ *
+ * @param pszValue text to parse as a base 10 number
+ * @param pWindowSize where to store the parsed window size; left untouched when parsing fails
+ *
+ * @return 1 if the whole text is a number within 16..0x200000, 0 otherwise
+ */
+static int parse_max_window_size_arg(const char *pszValue, unsigned int *pWindowSize) {
+   char *pEnd = NULL;
+   long nValue = strtol(pszValue, &pEnd, 10);
+
+   /* Range-check the long before narrowing it, so that out-of-range values can't wrap into the valid range */
+   if (!pEnd || pEnd == pszValue || *pEnd != '\0' || nValue < 16 || nValue > 0x200000)
+      return 0;
+
+   *pWindowSize = (unsigned int)nValue;
+   return 1;
+}
+
+/**
+ * Command-line entry point: parse the arguments and run the selected command
+ *
+ * @param argc number of command-line arguments
+ * @param argv command-line arguments
+ *
+ * @return result of the command that was run, or 100 for invalid arguments
+ */
+int main(int argc, char **argv) {
+   int i;
+   const char *pszInFilename = NULL;
+   const char *pszOutFilename = NULL;
+   const char *pszDictionaryFilename = NULL;
+   int nArgsError = 0;
+   int nCommandDefined = 0;
+   int nVerifyCompression = 0;
+   char cCommand = 'z';
+   unsigned int nOptions = 0;
+   unsigned int nMaxWindowSize = 0;
+
+   /* Each command and each option may only be given once; repeating one is an arguments error */
+   for (i = 1; i < argc; i++) {
+      if (!strcmp(argv[i], "-d")) {
+         if (!nCommandDefined) {
+            nCommandDefined = 1;
+            cCommand = 'd';
+         }
+         else
+            nArgsError = 1;
+      }
+      else if (!strcmp(argv[i], "-z")) {
+         if (!nCommandDefined) {
+            nCommandDefined = 1;
+            cCommand = 'z';
+         }
+         else
+            nArgsError = 1;
+      }
+      else if (!strcmp(argv[i], "-c")) {
+         if (!nVerifyCompression) {
+            nVerifyCompression = 1;
+         }
+         else
+            nArgsError = 1;
+      }
+      else if (!strcmp(argv[i], "-cbench")) {
+         if (!nCommandDefined) {
+            nCommandDefined = 1;
+            cCommand = 'B';
+         }
+         else
+            nArgsError = 1;
+      }
+      else if (!strcmp(argv[i], "-dbench")) {
+         if (!nCommandDefined) {
+            nCommandDefined = 1;
+            cCommand = 'b';
+         }
+         else
+            nArgsError = 1;
+      }
+      else if (!strcmp(argv[i], "-test")) {
+         if (!nCommandDefined) {
+            nCommandDefined = 1;
+            cCommand = 't';
+         }
+         else
+            nArgsError = 1;
+      }
+      else if (!strcmp(argv[i], "-quicktest")) {
+         if (!nCommandDefined) {
+            nCommandDefined = 1;
+            cCommand = 'T';
+         }
+         else
+            nArgsError = 1;
+      }
+      else if (!strcmp(argv[i], "-D")) {
+         /* "-D <file>": the filename is the next argument */
+         if (!pszDictionaryFilename && (i + 1) < argc) {
+            pszDictionaryFilename = argv[i + 1];
+            i++;
+         }
+         else
+            nArgsError = 1;
+      }
+      else if (!strncmp(argv[i], "-D", 2)) {
+         /* "-D<file>": the filename is glued to the option */
+         if (!pszDictionaryFilename) {
+            pszDictionaryFilename = argv[i] + 2;
+         }
+         else
+            nArgsError = 1;
+      }
+      else if (!strcmp(argv[i], "-v")) {
+         if ((nOptions & OPT_VERBOSE) == 0) {
+            nOptions |= OPT_VERBOSE;
+         }
+         else
+            nArgsError = 1;
+      }
+      else if (!strcmp(argv[i], "-w")) {
+         /* "-w <size>": the size is the next argument */
+         if (!nMaxWindowSize && (i + 1) < argc && parse_max_window_size_arg(argv[i + 1], &nMaxWindowSize))
+            i++;
+         else
+            nArgsError = 1;
+      }
+      else if (!strncmp(argv[i], "-w", 2)) {
+         /* "-w<size>": the size is glued to the option */
+         if (nMaxWindowSize || !parse_max_window_size_arg(argv[i] + 2, &nMaxWindowSize))
+            nArgsError = 1;
+      }
+      else if (!strcmp(argv[i], "-stats")) {
+         if ((nOptions & OPT_STATS) == 0) {
+            nOptions |= OPT_STATS;
+         }
+         else
+            nArgsError = 1;
+      }
+      else if (!strcmp(argv[i], "-b")) {
+         if ((nOptions & OPT_BACKWARD) == 0) {
+            nOptions |= OPT_BACKWARD;
+         }
+         else
+            nArgsError = 1;
+      }
+      else {
+         /* Anything else is a filename: input first, then output */
+         if (!pszInFilename)
+            pszInFilename = argv[i];
+         else {
+            if (!pszOutFilename)
+               pszOutFilename = argv[i];
+            else
+               nArgsError = 1;
+         }
+      }
+   }
+
+   /* The self-tests don't take filenames, so run them before checking that filenames were given */
+   if (!nArgsError && cCommand == 't') {
+      return do_self_test(nOptions, nMaxWindowSize, 0);
+   }
+   else if (!nArgsError && cCommand == 'T') {
+      return do_self_test(nOptions, nMaxWindowSize, 1);
+   }
+
+   if (nArgsError || !pszInFilename || !pszOutFilename) {
+      fprintf(stderr, "apultra command-line tool v" TOOL_VERSION " by Emmanuel Marty and spke\n");
+      fprintf(stderr, "usage: %s [-c] [-d] [-v] [-b] <infile> <outfile>\n", argv[0]);
+      fprintf(stderr, "        -c: check resulting stream after compressing\n");
+      fprintf(stderr, "        -d: decompress (default: compress)\n");
+      fprintf(stderr, "        -b: backwards compression or decompression\n");
+      fprintf(stderr, " -w <size>: maximum window size, in bytes (16..2097152), defaults to maximum\n");
+      fprintf(stderr, " -D <file>: use dictionary file\n");
+      fprintf(stderr, "   -cbench: benchmark in-memory compression\n");
+      fprintf(stderr, "   -dbench: benchmark in-memory decompression\n");
+      fprintf(stderr, "     -test: run full automated self-tests\n");
+      fprintf(stderr, "-quicktest: run quick automated self-tests\n");
+      fprintf(stderr, "    -stats: show compressed data stats\n");
+      fprintf(stderr, "        -v: be verbose\n");
+      return 100;
+   }
+
+   do_init_time();
+
+   if (cCommand == 'z') {
+      int nResult = do_compress(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions, nMaxWindowSize);
+      if (nResult == 0 && nVerifyCompression) {
+         /* -c: compare the freshly written output against the original input */
+         return do_compare(pszOutFilename, pszInFilename, pszDictionaryFilename, nOptions);
+      } else {
+         return nResult;
+      }
+   }
+   else if (cCommand == 'd') {
+      return do_decompress(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions);
+   }
+   else if (cCommand == 'B') {
+      return do_compr_benchmark(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions, nMaxWindowSize);
+   }
+   else if (cCommand == 'b') {
+      return do_dec_benchmark(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions);
+   }
+   else {
+      return 100;
+   }
+}
diff --git a/tools/apultra/src/expand.c b/tools/apultra/src/expand.c
new file mode 100644
index 0000000..76b3bf1
--- /dev/null
+++ b/tools/apultra/src/expand.c
@@ -0,0 +1,396 @@
+/*
+ * expand.c - decompressor implementation
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke <zxintrospec@gmail.com>
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "format.h"
+#include "expand.h"
+#include "libapultra.h"
+
+#ifdef _MSC_VER
+#define FORCE_INLINE __forceinline
+#else /* _MSC_VER */
+#define FORCE_INLINE __attribute__((always_inline))
+#endif /* _MSC_VER */
+
+/**
+ * Read the next bit of the bitstream, most significant bit of each byte first
+ *
+ * @param ppInBlock pointer to the current read position in the compressed data, advanced when a new byte is loaded
+ * @param pDataEnd end of the compressed data
+ * @param nCurBitMask pointer to the bit mask state; 0 means that a new byte must be loaded
+ * @param bits pointer to the bits left to hand out, with the next bit in bit 7
+ *
+ * @return 0 or 1, or -1 if a new byte is needed but the end of the data was reached
+ */
+static inline FORCE_INLINE int apultra_read_bit(const unsigned char **ppInBlock, const unsigned char *pDataEnd, int *nCurBitMask, unsigned char *bits) {
+   if (*nCurBitMask == 0) {
+      /* No bits left, pull in the next byte of input */
+      const unsigned char *pNextByte = *ppInBlock;
+
+      if (pNextByte >= pDataEnd)
+         return -1;
+
+      *bits = *pNextByte;
+      *ppInBlock = pNextByte + 1;
+      *nCurBitMask = 128;
+   }
+
+   /* Hand out the top bit, then move the next one into its place */
+   const int nTopBit = (*bits >> 7) & 1;
+
+   *bits = (unsigned char)(*bits << 1);
+   *nCurBitMask >>= 1;
+   return nTopBit;
+}
+
+/**
+ * Read a variable-length value stored as pairs of (data bit, continuation bit)
+ *
+ * The value starts at 1; every data bit is shifted in from the right, and reading goes on for as
+ * long as the continuation bit that follows it is set.
+ *
+ * @param ppInBlock pointer to the current read position in the compressed data
+ * @param pDataEnd end of the compressed data
+ * @param nCurBitMask pointer to the bit mask state used by apultra_read_bit()
+ * @param bits pointer to the bits state used by apultra_read_bit()
+ *
+ * @return decoded value, or a negative number if the end of the data was reached
+ */
+static inline FORCE_INLINE int apultra_read_gamma2(const unsigned char **ppInBlock, const unsigned char *pDataEnd, int *nCurBitMask, unsigned char *bits) {
+   unsigned int v = 1;
+   int bit;
+
+   do {
+      /* Data bit: check for the end of the data before folding it into the value */
+      bit = apultra_read_bit(ppInBlock, pDataEnd, nCurBitMask, bits);
+      if (bit < 0) return bit;
+      v = (v << 1) + (unsigned int)bit;
+
+      /* Continuation bit */
+      bit = apultra_read_bit(ppInBlock, pDataEnd, nCurBitMask, bits);
+      if (bit < 0) return bit;
+   } while (bit);
+
+   return (int)v;
+}
+
+/**
+ * Get maximum decompressed size of compressed data
+ *
+ * Walks through all of the commands in the compressed data and adds up the number of bytes that
+ * each of them produces, without writing any output.
+ *
+ * @param pInputData compressed data
+ * @param nInputSize compressed size in bytes
+ * @param nFlags compression flags (set to 0)
+ *
+ * @return maximum decompressed size, or -1 (converted to size_t) if the data is empty or ends
+ *         before the end of data marker
+ */
+size_t apultra_get_max_decompressed_size(const unsigned char *pInputData, size_t nInputSize, const unsigned int nFlags) {
+   const unsigned char *pInputDataEnd = pInputData + nInputSize;
+   int nCurBitMask = 0;
+   unsigned char bits = 0;
+   int nMatchOffset = -1;
+   int nFollowsLiteral = 3;
+   size_t nDecompressedSize = 0;
+
+   /* The first byte is always a literal, stored without a command in front of it */
+   if (pInputData >= pInputDataEnd)
+      return (size_t)-1;
+   pInputData++;
+   nDecompressedSize++;
+
+   while (1) {
+      /* Must be signed: the bit and gamma readers return a negative value when the data runs out */
+      int nResult;
+
+      nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+      if (nResult < 0) return (size_t)-1;
+
+      if (!nResult) {
+         /* '0': literal */
+         if (pInputData < pInputDataEnd) {
+            pInputData++;
+            nDecompressedSize++;
+            nFollowsLiteral = 3;
+         }
+         else {
+            return (size_t)-1;
+         }
+      }
+      else {
+         nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+         if (nResult < 0) return (size_t)-1;
+
+         if (nResult == 0) {
+            unsigned int nMatchLen;
+            int nMatchOffsetHi;
+
+            /* '10': 8+n bits offset */
+            nResult = apultra_read_gamma2(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+            if (nResult < 0) return (size_t)-1;
+
+            nMatchOffsetHi = nResult - nFollowsLiteral;
+            if (nMatchOffsetHi >= 0) {
+               /* The low 8 bits of the offset are stored as a whole byte */
+               if (pInputData >= pInputDataEnd) return (size_t)-1;
+               nMatchOffset = ((unsigned int) nMatchOffsetHi) << 8;
+               nMatchOffset |= (unsigned int)(*pInputData++);
+
+               nResult = apultra_read_gamma2(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+               if (nResult < 0) return (size_t)-1;
+               nMatchLen = (unsigned int)nResult;
+
+               /* The stored length is biased depending on the offset */
+               if (nMatchOffset < 128 || nMatchOffset >= MINMATCH4_OFFSET)
+                  nMatchLen += 2;
+               else if (nMatchOffset >= MINMATCH3_OFFSET)
+                  nMatchLen++;
+            }
+            else {
+               /* else rep-match */
+               nResult = apultra_read_gamma2(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+               if (nResult < 0) return (size_t)-1;
+               nMatchLen = (unsigned int)nResult;
+            }
+
+            nFollowsLiteral = 2;
+
+            nDecompressedSize += nMatchLen;
+         }
+         else {
+            nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+            if (nResult < 0) return (size_t)-1;
+
+            if (nResult == 0) {
+               unsigned int nCommand;
+               unsigned int nMatchLen;
+
+               /* '110': 7 bits offset + 1 bit length */
+               if (pInputData >= pInputDataEnd) return (size_t)-1;
+               nCommand = (unsigned int)(*pInputData++);
+               if (nCommand == 0x00) {
+                  /* EOD. No match len follows. */
+                  break;
+               }
+
+               /* Bits 7-1: offset; bit 0: length */
+               nMatchOffset = (nCommand >> 1);
+               nMatchLen = (nCommand & 1) + 2;
+
+               nFollowsLiteral = 2;
+               nDecompressedSize += nMatchLen;
+            }
+            else {
+               int nOffsetBit;
+
+               /* '111': 4 bit offset; its value doesn't change the size, the bits just have to be consumed */
+               for (nOffsetBit = 0; nOffsetBit < 4; nOffsetBit++) {
+                  nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+                  if (nResult < 0) return (size_t)-1;
+               }
+
+               nFollowsLiteral = 3;
+               nDecompressedSize++;
+            }
+         }
+      }
+   }
+
+   return nDecompressedSize;
+}
+
+/**
+ * Decompress data in memory
+ *
+ * @param pInputData compressed data
+ * @param pOutData buffer for decompressed data; output is written starting at pOutData + nDictionarySize,
+ *                 and matches may read back as far as pOutData
+ * @param nInputSize compressed size in bytes
+ * @param nMaxOutBufferSize maximum number of bytes that may be written after the dictionary
+ * @param nDictionarySize size of the dictionary at the start of pOutData (0 for none)
+ * @param nFlags compression flags (set to 0)
+ *
+ * @return actual decompressed size, or -1 (converted to size_t) for error
+ */
+size_t apultra_decompress(const unsigned char *pInputData, unsigned char *pOutData, size_t nInputSize, size_t nMaxOutBufferSize, size_t nDictionarySize, const unsigned int nFlags) {
+   const unsigned char *pInputDataEnd = pInputData + nInputSize;
+   unsigned char *pCurOutData = pOutData + nDictionarySize;
+   const unsigned char *pOutDataEnd = pCurOutData + nMaxOutBufferSize;
+   /* The fast copies write more bytes than asked for, so they are only used this far away from the end of the buffer.
+    * When the buffer is too small for them, the limits are set to the start of the output so that they are never taken. */
+   const unsigned char *pOutDataFastEnd = (nMaxOutBufferSize > 20) ? (pOutDataEnd - 20) : pCurOutData;
+   const unsigned char *pOutDataWideEnd = (nMaxOutBufferSize > 35) ? (pOutDataEnd - 35) : pCurOutData;
+   int nCurBitMask = 0;
+   unsigned char bits = 0;
+   int nMatchOffset = -1;
+   int nFollowsLiteral = 3;
+
+   /* The first byte is always a literal: there must be a byte to read, and room to write it */
+   if (pInputData >= pInputDataEnd || pCurOutData >= pOutDataEnd)
+      return (size_t)-1;
+   *pCurOutData++ = *pInputData++;
+
+   while (1) {
+      /* Must be signed: the bit and gamma readers return a negative value when the data runs out */
+      int nResult;
+
+      nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+      if (nResult < 0) return (size_t)-1;
+
+      if (!nResult) {
+         /* '0': literal */
+         if (pInputData < pInputDataEnd && pCurOutData < pOutDataEnd) {
+            *pCurOutData++ = *pInputData++;
+            nFollowsLiteral = 3;
+         }
+         else {
+            return (size_t)-1;
+         }
+      }
+      else {
+         nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+         if (nResult < 0) return (size_t)-1;
+
+         if (nResult == 0) {
+            unsigned int nMatchLen;
+            int nMatchOffsetHi;
+
+            /* '10': 8+n bits offset */
+            nResult = apultra_read_gamma2(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+            if (nResult < 0) return (size_t)-1;
+
+            nMatchOffsetHi = nResult - nFollowsLiteral;
+            if (nMatchOffsetHi >= 0) {
+               /* The low 8 bits of the offset are stored as a whole byte */
+               if (pInputData >= pInputDataEnd) return (size_t)-1;
+               nMatchOffset = ((unsigned int) nMatchOffsetHi) << 8;
+               nMatchOffset |= (unsigned int)(*pInputData++);
+
+               nResult = apultra_read_gamma2(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+               if (nResult < 0) return (size_t)-1;
+               nMatchLen = (unsigned int)nResult;
+
+               /* The stored length is biased depending on the offset */
+               if (nMatchOffset < 128 || nMatchOffset >= MINMATCH4_OFFSET)
+                  nMatchLen += 2;
+               else if (nMatchOffset >= MINMATCH3_OFFSET)
+                  nMatchLen++;
+            }
+            else {
+               /* else rep-match */
+               nResult = apultra_read_gamma2(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+               if (nResult < 0) return (size_t)-1;
+               nMatchLen = (unsigned int)nResult;
+            }
+
+            nFollowsLiteral = 2;
+
+            /* Reject matches that don't fit in the output before forming any pointer from their length */
+            if (nMatchLen > (size_t)(pOutDataEnd - pCurOutData))
+               return (size_t)-1;
+
+            const unsigned char *pSrc = pCurOutData - nMatchOffset;
+            if (pSrc < pOutData || (pSrc + nMatchLen) > pOutDataEnd)
+               return (size_t)-1;
+
+            if (nMatchLen < 11 && nMatchOffset >= 8 && pCurOutData < pOutDataFastEnd) {
+               /* Short match: always copy 10 bytes, only advance by the actual length */
+               memcpy(pCurOutData, pSrc, 8);
+               memcpy(pCurOutData + 8, pSrc + 8, 2);
+               pCurOutData += nMatchLen;
+            }
+            else if (nMatchOffset >= 16 && (pCurOutData + nMatchLen) < pOutDataWideEnd) {
+               /* Source and destination are at least 16 bytes apart: copy in blocks of 16 bytes */
+               const unsigned char *pCopySrc = pSrc;
+               unsigned char *pCopyDst = pCurOutData;
+               const unsigned char *pCopyEndDst = pCurOutData + nMatchLen;
+
+               do {
+                  memcpy(pCopyDst, pCopySrc, 16);
+                  pCopySrc += 16;
+                  pCopyDst += 16;
+               } while (pCopyDst < pCopyEndDst);
+
+               pCurOutData += nMatchLen;
+            }
+            else {
+               /* Do a deterministic, left to right byte copy instead of memcpy() so as to handle overlaps */
+               while (nMatchLen) {
+                  *pCurOutData++ = *pSrc++;
+                  nMatchLen--;
+               }
+            }
+         }
+         else {
+            nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+            if (nResult < 0) return (size_t)-1;
+
+            if (nResult == 0) {
+               unsigned int nCommand;
+               unsigned int nMatchLen;
+
+               /* '110': 7 bits offset + 1 bit length */
+               if (pInputData >= pInputDataEnd) return (size_t)-1;
+               nCommand = (unsigned int)(*pInputData++);
+               if (nCommand == 0x00) {
+                  /* EOD. No match len follows. */
+                  break;
+               }
+
+               /* Bits 7-1: offset; bit 0: length */
+               nMatchOffset = (nCommand >> 1);
+               nMatchLen = (nCommand & 1) + 2;
+
+               nFollowsLiteral = 2;
+               const unsigned char *pSrc = pCurOutData - nMatchOffset;
+               if (pSrc >= pOutData && (pSrc + nMatchLen) <= pOutDataEnd) {
+                  if (nMatchOffset >= 8 && pCurOutData < pOutDataFastEnd) {
+                     /* Always copy 10 bytes, only advance by the actual length */
+                     memcpy(pCurOutData, pSrc, 8);
+                     memcpy(pCurOutData + 8, pSrc + 8, 2);
+                     pCurOutData += nMatchLen;
+                  }
+                  else {
+                     if ((pCurOutData + nMatchLen) <= pOutDataEnd) {
+                        while (nMatchLen) {
+                           *pCurOutData++ = *pSrc++;
+                           nMatchLen--;
+                        }
+                     }
+                     else {
+                        return (size_t)-1;
+                     }
+                  }
+               }
+               else {
+                  return (size_t)-1;
+               }
+            }
+            else {
+               unsigned int nShortMatchOffset = 0;
+               int nOffsetBit;
+
+               /* '111': 4 bit offset, most significant bit first */
+               for (nOffsetBit = 0; nOffsetBit < 4; nOffsetBit++) {
+                  nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+                  if (nResult < 0) return (size_t)-1;
+                  nShortMatchOffset = (nShortMatchOffset << 1) | (unsigned int)nResult;
+               }
+
+               nFollowsLiteral = 3;
+               if (nShortMatchOffset) {
+                  /* Short offset, 1-15 */
+                  const unsigned char *pSrc = pCurOutData - nShortMatchOffset;
+                  if (pSrc >= pOutData && (pCurOutData + 1) <= pOutDataEnd && (pSrc + 1) <= pOutDataEnd) {
+                     *pCurOutData++ = *pSrc++;
+                  }
+                  else {
+                     return (size_t)-1;
+                  }
+               }
+               else {
+                  /* Write zero */
+                  if ((pCurOutData + 1) <= pOutDataEnd) {
+                     *pCurOutData++ = 0;
+                  }
+                  else {
+                     return (size_t)-1;
+                  }
+               }
+            }
+         }
+      }
+   }
+
+   return (size_t)(pCurOutData - pOutData) - nDictionarySize;
+}
diff --git a/tools/apultra/src/expand.h b/tools/apultra/src/expand.h
new file mode 100644
index 0000000..474660c
--- /dev/null
+++ b/tools/apultra/src/expand.h
@@ -0,0 +1,71 @@
+/*
+ * expand.h - decompressor definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke <zxintrospec@gmail.com>
+ *
+ */
+
+#ifndef _EXPAND_H
+#define _EXPAND_H
+
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Get maximum decompressed size of compressed data
+ *
+ * @param pInputData compressed data
+ * @param nInputSize compressed size in bytes
+ * @param nFlags compression flags (set to 0)
+ *
+ * @return maximum decompressed size
+ */
+size_t apultra_get_max_decompressed_size(const unsigned char *pInputData, size_t nInputSize, const unsigned int nFlags);
+
+/**
+ * Decompress data in memory
+ *
+ * @param pInputData compressed data
+ * @param pOutBuffer buffer for decompressed data
+ * @param nInputSize compressed size in bytes
+ * @param nMaxOutBufferSize maximum capacity of decompression buffer
+ * @param nDictionarySize size of dictionary in front of input data (0 for none)
+ * @param nFlags compression flags (set to 0)
+ *
+ * @return actual decompressed size, or (size_t)-1 on error (the return type is unsigned)
+ */
+size_t apultra_decompress(const unsigned char *pInputData, unsigned char *pOutBuffer, size_t nInputSize, size_t nMaxOutBufferSize, size_t nDictionarySize, const unsigned int nFlags);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _EXPAND_H */
diff --git a/tools/apultra/src/format.h b/tools/apultra/src/format.h
new file mode 100644
index 0000000..cf949b5
--- /dev/null
+++ b/tools/apultra/src/format.h
@@ -0,0 +1,47 @@
+/*
+ * format.h - byte stream format definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-�ke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke <zxintrospec@gmail.com>
+ *
+ */
+
+#ifndef _FORMAT_H
+#define _FORMAT_H
+
+#define MIN_OFFSET 1
+#define MAX_OFFSET 0x1fffff
+
+#define MAX_VARLEN 0x1fffff
+
+#define BLOCK_SIZE 0x100000
+
+#define MIN_MATCH_SIZE 1
+#define MINMATCH3_OFFSET 1280
+#define MINMATCH4_OFFSET 32000
+
+#endif /* _FORMAT_H */
diff --git a/tools/apultra/src/libapultra.h b/tools/apultra/src/libapultra.h
new file mode 100644
index 0000000..f97e336
--- /dev/null
+++ b/tools/apultra/src/libapultra.h
@@ -0,0 +1,40 @@
+/*
+ * libapultra.h - library definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke <zxintrospec@gmail.com>
+ *
+ */
+
+#ifndef _LIB_APULTRA_H
+#define _LIB_APULTRA_H
+
+#include "format.h"
+#include "shrink.h"
+#include "expand.h"
+
+#endif /* _LIB_APULTRA_H */
diff --git a/tools/apultra/src/libdivsufsort/CHANGELOG.md b/tools/apultra/src/libdivsufsort/CHANGELOG.md
new file mode 100644
index 0000000..fe9d004
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/CHANGELOG.md
@@ -0,0 +1,21 @@
+# libdivsufsort Change Log
+
+See full changelog at: https://github.com/y-256/libdivsufsort/commits
+
+## [2.0.1] - 2010-11-11
+### Fixed
+* Wrong variable used in `divbwt` function
+* Enclose some string variables with double quotation marks in include/CMakeLists.txt
+* Fix typo in include/CMakeLists.txt
+
+## 2.0.0 - 2008-08-23
+### Changed
+* Switch the build system to [CMake](http://www.cmake.org/)
+* Improve the performance of the suffix-sorting algorithm
+
+### Added
+* OpenMP support
+* 64-bit version of divsufsort
+
+[Unreleased]: https://github.com/y-256/libdivsufsort/compare/2.0.1...HEAD
+[2.0.1]: https://github.com/y-256/libdivsufsort/compare/2.0.0...2.0.1
diff --git a/tools/apultra/src/libdivsufsort/CMakeLists.txt b/tools/apultra/src/libdivsufsort/CMakeLists.txt
new file mode 100644
index 0000000..7859943
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/CMakeLists.txt
@@ -0,0 +1,99 @@
+### cmake file for building libdivsufsort Package ###
+cmake_minimum_required(VERSION 2.4.4)
+set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
+include(AppendCompilerFlags)
+
+## Project information ##
+project(libdivsufsort C)
+set(PROJECT_VENDOR "Yuta Mori")
+set(PROJECT_CONTACT "yuta.256@gmail.com")
+set(PROJECT_URL "https://github.com/y-256/libdivsufsort")
+set(PROJECT_DESCRIPTION "A lightweight suffix sorting library")
+include(VERSION.cmake)
+
+## CPack configuration ##
+set(CPACK_GENERATOR "TGZ;TBZ2;ZIP")
+set(CPACK_SOURCE_GENERATOR "TGZ;TBZ2;ZIP")
+include(ProjectCPack)
+
+## Project options ##
+option(BUILD_SHARED_LIBS "Set to OFF to build static libraries" ON)
+option(BUILD_EXAMPLES "Build examples" ON)
+option(BUILD_DIVSUFSORT64 "Build libdivsufsort64" OFF)
+option(USE_OPENMP "Use OpenMP for parallelization" OFF)
+option(WITH_LFS "Enable Large File Support" ON)
+
+## Installation directories ##
+set(LIB_SUFFIX "" CACHE STRING "Define suffix of directory name (32 or 64)")
+
+set(CMAKE_INSTALL_RUNTIMEDIR "" CACHE PATH "Specify the output directory for dll runtimes (default is bin)")
+if(NOT CMAKE_INSTALL_RUNTIMEDIR)
+  set(CMAKE_INSTALL_RUNTIMEDIR "${CMAKE_INSTALL_PREFIX}/bin")
+endif(NOT CMAKE_INSTALL_RUNTIMEDIR)
+
+set(CMAKE_INSTALL_LIBDIR "" CACHE PATH "Specify the output directory for libraries (default is lib)")
+if(NOT CMAKE_INSTALL_LIBDIR)
+  set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX}")
+endif(NOT CMAKE_INSTALL_LIBDIR)
+
+set(CMAKE_INSTALL_INCLUDEDIR "" CACHE PATH "Specify the output directory for header files (default is include)")
+if(NOT CMAKE_INSTALL_INCLUDEDIR)
+  set(CMAKE_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_PREFIX}/include")
+endif(NOT CMAKE_INSTALL_INCLUDEDIR)
+
+set(CMAKE_INSTALL_PKGCONFIGDIR "" CACHE PATH "Specify the output directory for pkgconfig files (default is lib/pkgconfig)")
+if(NOT CMAKE_INSTALL_PKGCONFIGDIR)
+  set(CMAKE_INSTALL_PKGCONFIGDIR "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+endif(NOT CMAKE_INSTALL_PKGCONFIGDIR)
+
+## Build type ##
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "Release")
+elseif(CMAKE_BUILD_TYPE STREQUAL "Debug")
+  set(CMAKE_VERBOSE_MAKEFILE ON)
+endif(NOT CMAKE_BUILD_TYPE)
+
+## Compiler options ##
+if(MSVC)
+  append_c_compiler_flags("/W4" "VC" CMAKE_C_FLAGS)
+  append_c_compiler_flags("/Oi;/Ot;/Ox;/Oy" "VC" CMAKE_C_FLAGS_RELEASE)
+  if(USE_OPENMP)
+    append_c_compiler_flags("/openmp" "VC" CMAKE_C_FLAGS)
+  endif(USE_OPENMP)
+elseif(BORLAND)
+  append_c_compiler_flags("-w" "BCC" CMAKE_C_FLAGS)
+  append_c_compiler_flags("-Oi;-Og;-Os;-Ov;-Ox" "BCC" CMAKE_C_FLAGS_RELEASE)
+else(MSVC)
+  if(CMAKE_COMPILER_IS_GNUCC)
+    append_c_compiler_flags("-Wall" "GCC" CMAKE_C_FLAGS)
+    append_c_compiler_flags("-fomit-frame-pointer" "GCC" CMAKE_C_FLAGS_RELEASE)
+    if(USE_OPENMP)
+      append_c_compiler_flags("-fopenmp" "GCC" CMAKE_C_FLAGS)
+    endif(USE_OPENMP)
+  else(CMAKE_COMPILER_IS_GNUCC)
+    append_c_compiler_flags("-Wall" "UNKNOWN" CMAKE_C_FLAGS)
+    append_c_compiler_flags("-fomit-frame-pointer" "UNKNOWN" CMAKE_C_FLAGS_RELEASE)
+    if(USE_OPENMP)
+      append_c_compiler_flags("-fopenmp;-openmp;-omp" "UNKNOWN" CMAKE_C_FLAGS)
+    endif(USE_OPENMP)
+  endif(CMAKE_COMPILER_IS_GNUCC)
+endif(MSVC)
+
+## Add definitions ##
+add_definitions(-DHAVE_CONFIG_H=1 -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS)
+
+## Add subdirectories ##
+add_subdirectory(pkgconfig)
+add_subdirectory(include)
+add_subdirectory(lib)
+if(BUILD_EXAMPLES)
+  add_subdirectory(examples)
+endif(BUILD_EXAMPLES)
+
+## Add 'uninstall' target ##
+CONFIGURE_FILE(
+  "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/cmake_uninstall.cmake.in"
+  "${CMAKE_CURRENT_BINARY_DIR}/CMakeModules/cmake_uninstall.cmake"
+  IMMEDIATE @ONLY)
+ADD_CUSTOM_TARGET(uninstall
+  "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/CMakeModules/cmake_uninstall.cmake")
diff --git a/tools/apultra/src/libdivsufsort/CMakeModules/AppendCompilerFlags.cmake b/tools/apultra/src/libdivsufsort/CMakeModules/AppendCompilerFlags.cmake
new file mode 100644
index 0000000..58d3f99
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/CMakeModules/AppendCompilerFlags.cmake
@@ -0,0 +1,38 @@
+include(CheckCSourceCompiles)
+include(CheckCXXSourceCompiles)
+
+macro(append_c_compiler_flags _flags _name _result)
+  set(SAFE_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+  string(REGEX REPLACE "[-+/ ]" "_" cname "${_name}")
+  string(TOUPPER "${cname}" cname)
+  foreach(flag ${_flags})
+    string(REGEX REPLACE "^[-+/ ]+(.*)[-+/ ]*$" "\\1" flagname "${flag}")
+    string(REGEX REPLACE "[-+/ ]" "_" flagname "${flagname}")
+    string(TOUPPER "${flagname}" flagname)
+    set(have_flag "HAVE_${cname}_${flagname}")
+    set(CMAKE_REQUIRED_FLAGS "${flag}")
+    check_c_source_compiles("int main() { return 0; }" ${have_flag})
+    if(${have_flag})
+      set(${_result} "${${_result}} ${flag}")
+    endif(${have_flag})
+  endforeach(flag)
+  set(CMAKE_REQUIRED_FLAGS ${SAFE_CMAKE_REQUIRED_FLAGS})
+endmacro(append_c_compiler_flags)
+
+macro(append_cxx_compiler_flags _flags _name _result)
+  set(SAFE_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+  string(REGEX REPLACE "[-+/ ]" "_" cname "${_name}")
+  string(TOUPPER "${cname}" cname)
+  foreach(flag ${_flags})
+    string(REGEX REPLACE "^[-+/ ]+(.*)[-+/ ]*$" "\\1" flagname "${flag}")
+    string(REGEX REPLACE "[-+/ ]" "_" flagname "${flagname}")
+    string(TOUPPER "${flagname}" flagname)
+    set(have_flag "HAVE_${cname}_${flagname}")
+    set(CMAKE_REQUIRED_FLAGS "${flag}")
+    check_cxx_source_compiles("int main() { return 0; }" ${have_flag})
+    if(${have_flag})
+      set(${_result} "${${_result}} ${flag}")
+    endif(${have_flag})
+  endforeach(flag)
+  set(CMAKE_REQUIRED_FLAGS ${SAFE_CMAKE_REQUIRED_FLAGS})
+endmacro(append_cxx_compiler_flags)
diff --git a/tools/apultra/src/libdivsufsort/CMakeModules/CheckFunctionKeywords.cmake b/tools/apultra/src/libdivsufsort/CMakeModules/CheckFunctionKeywords.cmake
new file mode 100644
index 0000000..44601fd
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/CMakeModules/CheckFunctionKeywords.cmake
@@ -0,0 +1,15 @@
+include(CheckCSourceCompiles)
+
+macro(check_function_keywords _wordlist)
+  set(${_result} "")
+  foreach(flag ${_wordlist})
+    string(REGEX REPLACE "[-+/ ()]" "_" flagname "${flag}")
+    string(TOUPPER "${flagname}" flagname)
+    set(have_flag "HAVE_${flagname}")
+    check_c_source_compiles("${flag} void func(); void func() { } int main() { func(); return 0; }" ${have_flag})
+    if(${have_flag} AND NOT ${_result})
+      set(${_result} "${flag}")
+#      break()
+    endif(${have_flag} AND NOT ${_result})
+  endforeach(flag)
+endmacro(check_function_keywords)
diff --git a/tools/apultra/src/libdivsufsort/CMakeModules/CheckLFS.cmake b/tools/apultra/src/libdivsufsort/CMakeModules/CheckLFS.cmake
new file mode 100644
index 0000000..e2b0099
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/CMakeModules/CheckLFS.cmake
@@ -0,0 +1,109 @@
+## Checks for large file support ##
+include(CheckIncludeFile)
+include(CheckSymbolExists)
+include(CheckTypeSize)
+
+macro(check_lfs _isenable)
+  set(LFS_OFF_T "")
+  set(LFS_FOPEN "")
+  set(LFS_FSEEK "")
+  set(LFS_FTELL "")
+  set(LFS_PRID "")
+
+  if(${_isenable})
+    set(SAFE_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}")
+    set(CMAKE_REQUIRED_DEFINITIONS ${CMAKE_REQUIRED_DEFINITIONS}
+        -D_LARGEFILE_SOURCE -D_LARGE_FILES -D_FILE_OFFSET_BITS=64
+        -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS)
+
+    check_include_file("sys/types.h" HAVE_SYS_TYPES_H)
+    check_include_file("inttypes.h" HAVE_INTTYPES_H)
+    check_include_file("stddef.h" HAVE_STDDEF_H)
+    check_include_file("stdint.h" HAVE_STDINT_H)
+
+    # LFS type1: 8 <= sizeof(off_t), fseeko, ftello
+    check_type_size("off_t" SIZEOF_OFF_T)
+    if(SIZEOF_OFF_T GREATER 7)
+      check_symbol_exists("fseeko" "stdio.h" HAVE_FSEEKO)
+      check_symbol_exists("ftello" "stdio.h" HAVE_FTELLO)
+      if(HAVE_FSEEKO AND HAVE_FTELLO)
+        set(LFS_OFF_T "off_t")
+        set(LFS_FOPEN "fopen")
+        set(LFS_FSEEK "fseeko")
+        set(LFS_FTELL "ftello")
+        check_symbol_exists("PRIdMAX" "inttypes.h" HAVE_PRIDMAX)
+        if(HAVE_PRIDMAX)
+          set(LFS_PRID "PRIdMAX")
+        else(HAVE_PRIDMAX)
+          check_type_size("long" SIZEOF_LONG)
+          check_type_size("int" SIZEOF_INT)
+          if(SIZEOF_OFF_T GREATER SIZEOF_LONG)
+            set(LFS_PRID "\"lld\"")
+          elseif(SIZEOF_LONG GREATER SIZEOF_INT)
+            set(LFS_PRID "\"ld\"")
+          else(SIZEOF_OFF_T GREATER SIZEOF_LONG)
+            set(LFS_PRID "\"d\"")
+          endif(SIZEOF_OFF_T GREATER SIZEOF_LONG)
+        endif(HAVE_PRIDMAX)
+      endif(HAVE_FSEEKO AND HAVE_FTELLO)
+    endif(SIZEOF_OFF_T GREATER 7)
+
+    # LFS type2: 8 <= sizeof(off64_t), fopen64, fseeko64, ftello64
+    if(NOT LFS_OFF_T)
+      check_type_size("off64_t" SIZEOF_OFF64_T)
+      if(SIZEOF_OFF64_T GREATER 7)
+        check_symbol_exists("fopen64" "stdio.h" HAVE_FOPEN64)
+        check_symbol_exists("fseeko64" "stdio.h" HAVE_FSEEKO64)
+        check_symbol_exists("ftello64" "stdio.h" HAVE_FTELLO64)
+        if(HAVE_FOPEN64 AND HAVE_FSEEKO64 AND HAVE_FTELLO64)
+          set(LFS_OFF_T "off64_t")
+          set(LFS_FOPEN "fopen64")
+          set(LFS_FSEEK "fseeko64")
+          set(LFS_FTELL "ftello64")
+          check_symbol_exists("PRIdMAX" "inttypes.h" HAVE_PRIDMAX)
+          if(HAVE_PRIDMAX)
+            set(LFS_PRID "PRIdMAX")
+          else(HAVE_PRIDMAX)
+            check_type_size("long" SIZEOF_LONG)
+            check_type_size("int" SIZEOF_INT)
+            if(SIZEOF_OFF64_T GREATER SIZEOF_LONG)
+              set(LFS_PRID "\"lld\"")
+            elseif(SIZEOF_LONG GREATER SIZEOF_INT)
+              set(LFS_PRID "\"ld\"")
+            else(SIZEOF_OFF64_T GREATER SIZEOF_LONG)
+              set(LFS_PRID "\"d\"")
+            endif(SIZEOF_OFF64_T GREATER SIZEOF_LONG)
+          endif(HAVE_PRIDMAX)
+        endif(HAVE_FOPEN64 AND HAVE_FSEEKO64 AND HAVE_FTELLO64)
+      endif(SIZEOF_OFF64_T GREATER 7)
+    endif(NOT LFS_OFF_T)
+
+    # LFS type3: 8 <= sizeof(__int64), _fseeki64, _ftelli64
+    if(NOT LFS_OFF_T)
+      check_type_size("__int64" SIZEOF___INT64)
+      if(SIZEOF___INT64 GREATER 7)
+        check_symbol_exists("_fseeki64" "stdio.h" HAVE__FSEEKI64)
+        check_symbol_exists("_ftelli64" "stdio.h" HAVE__FTELLI64)
+        if(HAVE__FSEEKI64 AND HAVE__FTELLI64)
+          set(LFS_OFF_T "__int64")
+          set(LFS_FOPEN "fopen")
+          set(LFS_FSEEK "_fseeki64")
+          set(LFS_FTELL "_ftelli64")
+          set(LFS_PRID  "\"I64d\"")
+        endif(HAVE__FSEEKI64 AND HAVE__FTELLI64)
+      endif(SIZEOF___INT64 GREATER 7)
+    endif(NOT LFS_OFF_T)
+
+    set(CMAKE_REQUIRED_DEFINITIONS "${SAFE_CMAKE_REQUIRED_DEFINITIONS}")
+  endif(${_isenable})
+
+  if(NOT LFS_OFF_T)
+    ## not found
+    set(LFS_OFF_T "long")
+    set(LFS_FOPEN "fopen")
+    set(LFS_FSEEK "fseek")
+    set(LFS_FTELL "ftell")
+    set(LFS_PRID  "\"ld\"")
+  endif(NOT LFS_OFF_T)
+
+endmacro(check_lfs)
diff --git a/tools/apultra/src/libdivsufsort/CMakeModules/ProjectCPack.cmake b/tools/apultra/src/libdivsufsort/CMakeModules/ProjectCPack.cmake
new file mode 100644
index 0000000..7c105f9
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/CMakeModules/ProjectCPack.cmake
@@ -0,0 +1,38 @@
+# If the cmake version includes cpack, use it
+IF(EXISTS "${CMAKE_ROOT}/Modules/CPack.cmake")
+  SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "${PROJECT_DESCRIPTION}")
+  SET(CPACK_PACKAGE_VENDOR "${PROJECT_VENDOR}")
+  SET(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/README.md")
+  SET(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
+  SET(CPACK_PACKAGE_VERSION_MAJOR "${PROJECT_VERSION_MAJOR}")
+  SET(CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}")
+  SET(CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}")
+#  SET(CPACK_PACKAGE_INSTALL_DIRECTORY "${PROJECT_NAME} ${PROJECT_VERSION}")
+  SET(CPACK_SOURCE_PACKAGE_FILE_NAME "${PROJECT_NAME}-${PROJECT_VERSION_FULL}")
+
+  IF(NOT DEFINED CPACK_SYSTEM_NAME)
+    SET(CPACK_SYSTEM_NAME "${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
+  ENDIF(NOT DEFINED CPACK_SYSTEM_NAME)
+
+  IF(${CPACK_SYSTEM_NAME} MATCHES Windows)
+    IF(CMAKE_CL_64)
+      SET(CPACK_SYSTEM_NAME win64-${CMAKE_SYSTEM_PROCESSOR})
+    ELSE(CMAKE_CL_64)
+      SET(CPACK_SYSTEM_NAME win32-${CMAKE_SYSTEM_PROCESSOR})
+    ENDIF(CMAKE_CL_64)
+  ENDIF(${CPACK_SYSTEM_NAME} MATCHES Windows)
+
+  IF(NOT DEFINED CPACK_PACKAGE_FILE_NAME)
+    SET(CPACK_PACKAGE_FILE_NAME "${CPACK_SOURCE_PACKAGE_FILE_NAME}-${CPACK_SYSTEM_NAME}")
+  ENDIF(NOT DEFINED CPACK_PACKAGE_FILE_NAME)
+
+  SET(CPACK_PACKAGE_CONTACT "${PROJECT_CONTACT}")
+  IF(UNIX)
+    SET(CPACK_STRIP_FILES "")
+    SET(CPACK_SOURCE_STRIP_FILES "")
+#    SET(CPACK_PACKAGE_EXECUTABLES "ccmake" "CMake")
+  ENDIF(UNIX)
+  SET(CPACK_SOURCE_IGNORE_FILES "/CVS/" "/build/" "/\\\\.build/" "/\\\\.svn/" "~$")
+  # include CPack module once all variables are set
+  INCLUDE(CPack)
+ENDIF(EXISTS "${CMAKE_ROOT}/Modules/CPack.cmake")
diff --git a/tools/apultra/src/libdivsufsort/CMakeModules/cmake_uninstall.cmake.in b/tools/apultra/src/libdivsufsort/CMakeModules/cmake_uninstall.cmake.in
new file mode 100644
index 0000000..8366a83
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/CMakeModules/cmake_uninstall.cmake.in
@@ -0,0 +1,36 @@
+IF(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+  MESSAGE(FATAL_ERROR "Cannot find install manifest: \"@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt\"")
+ENDIF(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+
+FILE(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
+STRING(REGEX REPLACE "\n" ";" files "${files}")
+
+SET(NUM 0)
+FOREACH(file ${files})
+  IF(EXISTS "$ENV{DESTDIR}${file}")
+    MESSAGE(STATUS "Looking for \"$ENV{DESTDIR}${file}\" - found")
+    SET(UNINSTALL_CHECK_${NUM} 1)
+  ELSE(EXISTS "$ENV{DESTDIR}${file}")
+    MESSAGE(STATUS "Looking for \"$ENV{DESTDIR}${file}\" - not found")
+    SET(UNINSTALL_CHECK_${NUM} 0)
+  ENDIF(EXISTS "$ENV{DESTDIR}${file}")
+  MATH(EXPR NUM "1 + ${NUM}")
+ENDFOREACH(file)
+
+SET(NUM 0)
+FOREACH(file ${files})
+  IF(${UNINSTALL_CHECK_${NUM}})
+    MESSAGE(STATUS "Uninstalling \"$ENV{DESTDIR}${file}\"")
+    EXEC_PROGRAM(
+      "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\""
+      OUTPUT_VARIABLE rm_out
+      RETURN_VALUE rm_retval
+      )
+    IF(NOT "${rm_retval}" STREQUAL 0)
+      MESSAGE(FATAL_ERROR "Problem when removing \"$ENV{DESTDIR}${file}\"")
+    ENDIF(NOT "${rm_retval}" STREQUAL 0)
+  ENDIF(${UNINSTALL_CHECK_${NUM}})
+  MATH(EXPR NUM "1 + ${NUM}")
+ENDFOREACH(file)
+
+FILE(REMOVE "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
diff --git a/tools/apultra/src/libdivsufsort/LICENSE b/tools/apultra/src/libdivsufsort/LICENSE
new file mode 100644
index 0000000..249efa4
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2003 Yuta Mori All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/tools/apultra/src/libdivsufsort/README.md b/tools/apultra/src/libdivsufsort/README.md
new file mode 100644
index 0000000..381a188
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/README.md
@@ -0,0 +1,140 @@
+# libdivsufsort
+
+libdivsufsort is a software library that implements a lightweight suffix array construction algorithm.
+
+## News
+* 2015-03-21: The project has moved from [Google Code](http://code.google.com/p/libdivsufsort/) to [GitHub](https://github.com/y-256/libdivsufsort)
+
+## Introduction
+This library provides a simple and efficient C API to construct a suffix array and a Burrows-Wheeler transformed string from a given string over a constant-size alphabet.
+The algorithm runs in O(n log n) worst-case time using only 5n+O(1) bytes of memory space, where n is the length of
+the string.
+
+## Build requirements
+* An ANSI C Compiler (e.g. GNU GCC)
+* [CMake](http://www.cmake.org/ "CMake") version 2.4.4 or newer
+* CMake-supported build tool
+
+## Building on GNU/Linux
+1. Get the source code from GitHub. You can either
+    * use git to clone the repository
+    ```
+    git clone https://github.com/y-256/libdivsufsort.git
+    ```
+    * or download a [zip file](../../archive/master.zip) directly
+2. Create a `build` directory in the package source directory.
+```shell
+$ cd libdivsufsort
+$ mkdir build
+$ cd build
+```
+3. Configure the package for your system.
+If you want to install to a different location, change the -DCMAKE_INSTALL_PREFIX option.
+```shell
+$ cmake -DCMAKE_BUILD_TYPE="Release" \
+-DCMAKE_INSTALL_PREFIX="/usr/local" ..
+```
+4. Compile the package.
+```shell
+$ make
+```
+5. (Optional) Install the library and header files.
+```shell
+$ sudo make install
+```
+
+## API
+```c
+/* Data types */
+typedef int32_t saint_t;
+typedef int32_t saidx_t;
+typedef uint8_t sauchar_t;
+
+/*
+ * Constructs the suffix array of a given string.
+ * @param T[0..n-1] The input string.
+ * @param SA[0..n-1] The output array of suffixes.
+ * @param n The length of the given string.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+saint_t
+divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n);
+
+/*
+ * Constructs the burrows-wheeler transformed string of a given string.
+ * @param T[0..n-1] The input string.
+ * @param U[0..n-1] The output string. (can be T)
+ * @param A[0..n-1] The temporary array. (can be NULL)
+ * @param n The length of the given string.
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+saidx_t
+divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n);
+```
+
+## Example Usage
+```c
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <divsufsort.h>
+
+int main() {
+    // input data
+    char *Text = "abracadabra";
+    int n = strlen(Text);
+    int i, j;
+
+    // allocate
+    int *SA = (int *)malloc(n * sizeof(int));
+
+    // sort
+    divsufsort((unsigned char *)Text, SA, n);
+
+    // output
+    for(i = 0; i < n; ++i) {
+        printf("SA[%2d] = %2d: ", i, SA[i]);
+        for(j = SA[i]; j < n; ++j) {
+            printf("%c", Text[j]);
+        }
+        printf("$\n");
+    }
+
+    // deallocate
+    free(SA);
+
+    return 0;
+}
+```
+See the [examples](examples) directory for a few other examples.
+
+## Benchmarks
+See [Benchmarks](https://github.com/y-256/libdivsufsort/blob/wiki/SACA_Benchmarks.md) page for details.
+
+## License
+libdivsufsort is released under the [MIT license](LICENSE "MIT license").
+> The MIT License (MIT)
+>
+> Copyright (c) 2003 Yuta Mori All rights reserved.
+>
+> Permission is hereby granted, free of charge, to any person obtaining a copy
+> of this software and associated documentation files (the "Software"), to deal
+> in the Software without restriction, including without limitation the rights
+> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+> copies of the Software, and to permit persons to whom the Software is
+> furnished to do so, subject to the following conditions:
+>
+> The above copyright notice and this permission notice shall be included in all
+> copies or substantial portions of the Software.
+>
+> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+> SOFTWARE.
+
+## Author
+* Yuta Mori
diff --git a/tools/apultra/src/libdivsufsort/VERSION.cmake b/tools/apultra/src/libdivsufsort/VERSION.cmake
new file mode 100644
index 0000000..3f11ac1
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/VERSION.cmake
@@ -0,0 +1,23 @@
+set(PROJECT_VERSION_MAJOR "2")
+set(PROJECT_VERSION_MINOR "0")
+set(PROJECT_VERSION_PATCH "2")
+set(PROJECT_VERSION_EXTRA "-1")
+set(PROJECT_VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}")
+set(PROJECT_VERSION_FULL "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}${PROJECT_VERSION_EXTRA}")
+
+set(LIBRARY_VERSION "3.0.1")
+set(LIBRARY_SOVERSION "3")
+
+## Git revision number ##
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
+  execute_process(COMMAND git describe --tags HEAD
+    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_DESCRIBE_TAGS ERROR_QUIET)
+  if(GIT_DESCRIBE_TAGS)
+    string(REGEX REPLACE "^v(.*)" "\\1" GIT_REVISION "${GIT_DESCRIBE_TAGS}")
+    string(STRIP "${GIT_REVISION}" GIT_REVISION)
+    if(GIT_REVISION)
+      set(PROJECT_VERSION_FULL "${GIT_REVISION}")
+    endif(GIT_REVISION)
+  endif(GIT_DESCRIBE_TAGS)
+endif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
diff --git a/tools/apultra/src/libdivsufsort/examples/CMakeLists.txt b/tools/apultra/src/libdivsufsort/examples/CMakeLists.txt
new file mode 100644
index 0000000..e801c81
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/examples/CMakeLists.txt
@@ -0,0 +1,11 @@
+## Add definitions ##
+add_definitions(-D_LARGEFILE_SOURCE -D_LARGE_FILES -D_FILE_OFFSET_BITS=64)
+
+## Targets ##
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../include"
+                    "${CMAKE_CURRENT_BINARY_DIR}/../include")
+link_directories("${CMAKE_CURRENT_BINARY_DIR}/../lib")
+foreach(src suftest mksary sasearch bwt unbwt)
+  add_executable(${src} ${src}.c)
+  target_link_libraries(${src} divsufsort)
+endforeach(src)
diff --git a/tools/apultra/src/libdivsufsort/examples/bwt.c b/tools/apultra/src/libdivsufsort/examples/bwt.c
new file mode 100644
index 0000000..5a362d0
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/examples/bwt.c
@@ -0,0 +1,220 @@
+/*
+ * bwt.c for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+#include <stdio.h>
+#if HAVE_STRING_H
+# include <string.h>
+#endif
+#if HAVE_STDLIB_H
+# include <stdlib.h>
+#endif
+#if HAVE_MEMORY_H
+# include <memory.h>
+#endif
+#if HAVE_STDDEF_H
+# include <stddef.h>
+#endif
+#if HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#if HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#if HAVE_IO_H && HAVE_FCNTL_H
+# include <io.h>
+# include <fcntl.h>
+#endif
+#include <time.h>
+#include <divsufsort.h>
+#include "lfs.h"
+
+
+/* Store the low 32 bits of n in little-endian order; returns bytes written. */
+static size_t
+write_int(FILE *fp, saidx_t n) {
+  unsigned char bytes[4];
+  int k;
+  for(k = 0; k < 4; ++k) { bytes[k] = (unsigned char)((n >> (8 * k)) & 0xff); }
+  return fwrite(bytes, 1, sizeof(bytes), fp);
+}
+
+/* Print the version banner and usage text to stderr, then exit with status. */
+static void
+print_help(const char *progname, int status) {
+  const char *version = divsufsort_version();
+  fprintf(stderr, "bwt, a burrows-wheeler transform program, version %s.\n",
+          version);
+  fprintf(stderr, "usage: %s [-b num] INFILE OUTFILE\n", progname);
+  fprintf(stderr, "  -b num    set block size to num MiB [1..512] (default: 32)\n\n");
+  exit(status);
+}
+
+int
+main(int argc, const char *argv[]) { /* bwt [-b num] INFILE OUTFILE; "-" selects stdin/stdout */
+  FILE *fp, *ofp;
+  const char *fname, *ofname;
+  sauchar_t *T;      /* block buffer; divbwt() is called with T as both input and output */
+  saidx_t *SA;       /* work array handed to divbwt() */
+  LFS_OFF_T n;       /* first the input size, then the running count of bytes read */
+  size_t m;          /* number of bytes in the current block */
+  saidx_t pidx;      /* divbwt() result, written in front of each block */
+  clock_t start,finish;
+  saint_t i, blocksize = 32, needclose = 3; /* needclose: bit 0 = fclose(fp), bit 1 = fclose(ofp) */
+
+  /* Check arguments: no arguments or -h/--help prints the usage and exits successfully. */
+  if((argc == 1) ||
+     (strcmp(argv[1], "-h") == 0) ||
+     (strcmp(argv[1], "--help") == 0)) { print_help(argv[0], EXIT_SUCCESS); }
+  if((argc != 3) && (argc != 5)) { print_help(argv[0], EXIT_FAILURE); }
+  i = 1;
+  if(argc == 5) {
+    if(strcmp(argv[i], "-b") != 0) { print_help(argv[0], EXIT_FAILURE); }
+    blocksize = atoi(argv[i + 1]);
+    if(blocksize < 0) { blocksize = 1; } /* NOTE(review): usage says [1..512], yet 0 passes and later means "use the file size" */
+    else if(512 < blocksize) { blocksize = 512; }
+    i += 2;
+  }
+  blocksize <<= 20; /* MiB -> bytes */
+
+  /* Open a file for reading. */
+  if(strcmp(argv[i], "-") != 0) {
+#if HAVE_FOPEN_S
+    if(fopen_s(&fp, fname = argv[i], "rb") != 0) {
+#else
+    if((fp = LFS_FOPEN(fname = argv[i], "rb")) == NULL) {
+#endif
+      fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], fname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+#if HAVE__SETMODE && HAVE__FILENO
+    if(_setmode(_fileno(stdin), _O_BINARY) == -1) {
+      fprintf(stderr, "%s: Cannot set mode: ", argv[0]);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+#endif
+    fp = stdin;
+    fname = "stdin";
+    needclose ^= 1; /* stdin is not closed at the end */
+  }
+  i += 1;
+
+  /* Open a file for writing. */
+  if(strcmp(argv[i], "-") != 0) {
+#if HAVE_FOPEN_S
+    if(fopen_s(&ofp, ofname = argv[i], "wb") != 0) {
+#else
+    if((ofp = LFS_FOPEN(ofname = argv[i], "wb")) == NULL) {
+#endif
+      fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], ofname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+#if HAVE__SETMODE && HAVE__FILENO
+    if(_setmode(_fileno(stdout), _O_BINARY) == -1) {
+      fprintf(stderr, "%s: Cannot set mode: ", argv[0]);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+#endif
+    ofp = stdout;
+    ofname = "stdout";
+    needclose ^= 2; /* stdout is not closed at the end */
+  }
+
+  /* Get the file size; if the input is not seekable, keep the requested blocksize. */
+  if(LFS_FSEEK(fp, 0, SEEK_END) == 0) {
+    n = LFS_FTELL(fp);
+    rewind(fp);
+    if(n < 0) {
+      fprintf(stderr, "%s: Cannot ftell `%s': ", argv[0], fname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+    if(0x20000000L < n) { n = 0x20000000L; } /* cap at 512 MiB */
+    if((blocksize == 0) || (n < blocksize)) { blocksize = (saidx_t)n; } /* never larger than the input */
+  } else if(blocksize == 0) { blocksize = 32 << 20; } /* fall back to 32 MiB */
+
+  /* Allocate blocksize bytes for T and blocksize saidx_t entries for SA. */
+  T = (sauchar_t *)malloc(blocksize * sizeof(sauchar_t));
+  SA = (saidx_t *)malloc(blocksize * sizeof(saidx_t));
+  if((T == NULL) || (SA == NULL)) {
+    fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  /* Write the blocksize as the 4-byte header of the output. */
+  if(write_int(ofp, blocksize) != 4) {
+    fprintf(stderr, "%s: Cannot write to `%s': ", argv[0], ofname);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+
+  fprintf(stderr, "  BWT (blocksize %" PRIdSAINT_T ") ... ", blocksize);
+  start = clock();
+  for(n = 0; 0 < (m = fread(T, sizeof(sauchar_t), blocksize, fp)); n += m) {
+    /* Burrows-Wheeler Transform of the m bytes just read. */
+    pidx = divbwt(T, T, SA, m);
+    if(pidx < 0) {
+      fprintf(stderr, "%s (bw_transform): %s.\n",
+        argv[0],
+        (pidx == -1) ? "Invalid arguments" : "Cannot allocate memory");
+      exit(EXIT_FAILURE);
+    }
+
+    /* Write the bwted data: 4-byte pidx followed by the m transformed bytes. */
+    if((write_int(ofp, pidx) != 4) ||
+       (fwrite(T, sizeof(sauchar_t), m, ofp) != m)) {
+      fprintf(stderr, "%s: Cannot write to `%s': ", argv[0], ofname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+  }
+  if(ferror(fp)) {
+    fprintf(stderr, "%s: Cannot read from `%s': ", argv[0], fname);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+  finish = clock();
+  fprintf(stderr, "%" PRIdOFF_T " bytes: %.4f sec\n",
+    n, (double)(finish - start) / (double)CLOCKS_PER_SEC);
+
+  /* Close files */
+  if(needclose & 1) { fclose(fp); }
+  if(needclose & 2) { fclose(ofp); }
+
+  /* Deallocate memory. */
+  free(SA);
+  free(T);
+
+  return 0;
+}
diff --git a/tools/apultra/src/libdivsufsort/examples/mksary.c b/tools/apultra/src/libdivsufsort/examples/mksary.c
new file mode 100644
index 0000000..b48177c
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/examples/mksary.c
@@ -0,0 +1,193 @@
+/*
+ * mksary.c for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+#include <stdio.h>
+#if HAVE_STRING_H
+# include <string.h>
+#endif
+#if HAVE_STDLIB_H
+# include <stdlib.h>
+#endif
+#if HAVE_MEMORY_H
+# include <memory.h>
+#endif
+#if HAVE_STDDEF_H
+# include <stddef.h>
+#endif
+#if HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#if HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#if HAVE_IO_H && HAVE_FCNTL_H
+# include <io.h>
+# include <fcntl.h>
+#endif
+#include <time.h>
+#include <divsufsort.h>
+#include "lfs.h"
+
+
+/* Print the version banner and usage text to stderr, then exit with status. */
+static void
+print_help(const char *progname, int status) {
+  const char *version = divsufsort_version();
+  fprintf(stderr, "mksary, a simple suffix array builder, version %s.\n",
+          version);
+  fprintf(stderr, "usage: %s INFILE OUTFILE\n\n", progname);
+  exit(status);
+}
+
+int
+main(int argc, const char *argv[]) { /* mksary INFILE OUTFILE; "-" selects stdin/stdout */
+  FILE *fp, *ofp;
+  const char *fname, *ofname;
+  sauchar_t *T;   /* the whole input */
+  saidx_t *SA;    /* suffix array filled by divsufsort(), n entries */
+  LFS_OFF_T n;    /* input size in bytes */
+  clock_t start, finish;
+  saint_t needclose = 3; /* bit 0 = fclose(fp), bit 1 = fclose(ofp) */
+
+  /* Check arguments: no arguments or -h/--help prints the usage and exits successfully. */
+  if((argc == 1) ||
+     (strcmp(argv[1], "-h") == 0) ||
+     (strcmp(argv[1], "--help") == 0)) { print_help(argv[0], EXIT_SUCCESS); }
+  if(argc != 3) { print_help(argv[0], EXIT_FAILURE); }
+
+  /* Open a file for reading. */
+  if(strcmp(argv[1], "-") != 0) {
+#if HAVE_FOPEN_S
+    if(fopen_s(&fp, fname = argv[1], "rb") != 0) {
+#else
+    if((fp = LFS_FOPEN(fname = argv[1], "rb")) == NULL) {
+#endif
+      fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], fname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+#if HAVE__SETMODE && HAVE__FILENO
+    if(_setmode(_fileno(stdin), _O_BINARY) == -1) {
+      fprintf(stderr, "%s: Cannot set mode: ", argv[0]);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+#endif
+    fp = stdin;
+    fname = "stdin";
+    needclose ^= 1; /* stdin is not closed */
+  }
+
+  /* Open a file for writing. */
+  if(strcmp(argv[2], "-") != 0) {
+#if HAVE_FOPEN_S
+    if(fopen_s(&ofp, ofname = argv[2], "wb") != 0) {
+#else
+    if((ofp = LFS_FOPEN(ofname = argv[2], "wb")) == NULL) {
+#endif
+      fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], ofname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+#if HAVE__SETMODE && HAVE__FILENO
+    if(_setmode(_fileno(stdout), _O_BINARY) == -1) {
+      fprintf(stderr, "%s: Cannot set mode: ", argv[0]);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+#endif
+    ofp = stdout;
+    ofname = "stdout";
+    needclose ^= 2; /* stdout is not closed */
+  }
+
+  /* Get the file size; the input must be seekable and smaller than 0x7fffffff bytes. */
+  if(LFS_FSEEK(fp, 0, SEEK_END) == 0) {
+    n = LFS_FTELL(fp);
+    rewind(fp);
+    if(n < 0) {
+      fprintf(stderr, "%s: Cannot ftell `%s': ", argv[0], fname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+    if(0x7fffffff <= n) {
+      fprintf(stderr, "%s: Input file `%s' is too big.\n", argv[0], fname);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    fprintf(stderr, "%s: Cannot fseek `%s': ", argv[0], fname);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+
+  /* Allocate n bytes for T and n saidx_t entries for SA. */
+  T = (sauchar_t *)malloc((size_t)n * sizeof(sauchar_t));
+  SA = (saidx_t *)malloc((size_t)n * sizeof(saidx_t));
+  if((T == NULL) || (SA == NULL)) {
+    fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  /* Read n bytes of data. */
+  if(fread(T, sizeof(sauchar_t), (size_t)n, fp) != (size_t)n) {
+    fprintf(stderr, "%s: %s `%s': ",
+      argv[0],
+      (ferror(fp) || !feof(fp)) ? "Cannot read from" : "Unexpected EOF in",
+      fname);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+  if(needclose & 1) { fclose(fp); }
+
+  /* Construct the suffix array and report the elapsed CPU time. */
+  fprintf(stderr, "%s: %" PRIdOFF_T " bytes ... ", fname, n);
+  start = clock();
+  if(divsufsort(T, SA, (saidx_t)n) != 0) {
+    fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]);
+    exit(EXIT_FAILURE);
+  }
+  finish = clock();
+  fprintf(stderr, "%.4f sec\n", (double)(finish - start) / (double)CLOCKS_PER_SEC);
+
+  /* Write the suffix array as n raw saidx_t values. */
+  if(fwrite(SA, sizeof(saidx_t), (size_t)n, ofp) != (size_t)n) {
+    fprintf(stderr, "%s: Cannot write to `%s': ", argv[0], ofname);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+  if(needclose & 2) { fclose(ofp); }
+
+  /* Deallocate memory. */
+  free(SA);
+  free(T);
+
+  return 0;
+}
diff --git a/tools/apultra/src/libdivsufsort/examples/sasearch.c b/tools/apultra/src/libdivsufsort/examples/sasearch.c
new file mode 100644
index 0000000..7e5ca4f
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/examples/sasearch.c
@@ -0,0 +1,165 @@
+/*
+ * sasearch.c for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+#include <stdio.h>
+#if HAVE_STRING_H
+# include <string.h>
+#endif
+#if HAVE_STDLIB_H
+# include <stdlib.h>
+#endif
+#if HAVE_MEMORY_H
+# include <memory.h>
+#endif
+#if HAVE_STDDEF_H
+# include <stddef.h>
+#endif
+#if HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#if HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#if HAVE_IO_H && HAVE_FCNTL_H
+# include <io.h>
+# include <fcntl.h>
+#endif
+#include <divsufsort.h>
+#include "lfs.h"
+
+
+/* Print the version banner and usage text to stderr, then exit with status. */
+static void
+print_help(const char *progname, int status) {
+  const char *version = divsufsort_version();
+  fprintf(stderr, "sasearch, a simple SA-based full-text search tool, version %s\n",
+          version);
+  fprintf(stderr, "usage: %s PATTERN FILE SAFILE\n\n", progname);
+  exit(status);
+}
+
+int
+main(int argc, const char *argv[]) { /* sasearch PATTERN FILE SAFILE */
+  FILE *fp;
+  const char *P;          /* search pattern (argv[1]) */
+  sauchar_t *T;           /* contents of FILE */
+  saidx_t *SA;            /* suffix array read from SAFILE, n entries */
+  LFS_OFF_T n;            /* size of FILE in bytes */
+  size_t Psize;
+  saidx_t i, size, left;  /* sa_search() result: count and first index into SA */
+
+  if((argc == 1) ||
+     (strcmp(argv[1], "-h") == 0) ||
+     (strcmp(argv[1], "--help") == 0)) { print_help(argv[0], EXIT_SUCCESS); }
+  if(argc != 4) { print_help(argv[0], EXIT_FAILURE); }
+
+  P = argv[1];
+  Psize = strlen(P);
+
+  /* Open a file for reading. */
+#if HAVE_FOPEN_S
+  if(fopen_s(&fp, argv[2], "rb") != 0) {
+#else
+  if((fp = LFS_FOPEN(argv[2], "rb")) == NULL) {
+#endif
+    fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], argv[2]);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+
+  /* Get the file size; reject sizes that the (saidx_t)n casts below would truncate. */
+  if(LFS_FSEEK(fp, 0, SEEK_END) == 0) {
+    n = LFS_FTELL(fp);
+    rewind(fp);
+    if(n < 0) {
+      fprintf(stderr, "%s: Cannot ftell `%s': ", argv[0], argv[2]);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+    if(0x7fffffff <= n) {
+      fprintf(stderr, "%s: Input file `%s' is too big.\n", argv[0], argv[2]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    fprintf(stderr, "%s: Cannot fseek `%s': ", argv[0], argv[2]);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+
+  /* Allocate n bytes for T and n saidx_t entries for SA. */
+  T = (sauchar_t *)malloc((size_t)n * sizeof(sauchar_t));
+  SA = (saidx_t *)malloc((size_t)n * sizeof(saidx_t));
+  if((T == NULL) || (SA == NULL)) {
+    fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  /* Read n bytes of data. */
+  if(fread(T, sizeof(sauchar_t), (size_t)n, fp) != (size_t)n) {
+    fprintf(stderr, "%s: %s `%s': ", argv[0],
+      (ferror(fp) || !feof(fp)) ? "Cannot read from" : "Unexpected EOF in", argv[2]);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+  fclose(fp);
+
+  /* Open the SA file for reading. */
+#if HAVE_FOPEN_S
+  if(fopen_s(&fp, argv[3], "rb") != 0) {
+#else
+  if((fp = LFS_FOPEN(argv[3], "rb")) == NULL) {
+#endif
+    fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], argv[3]);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+
+  /* Read n * sizeof(saidx_t) bytes of data. */
+  if(fread(SA, sizeof(saidx_t), (size_t)n, fp) != (size_t)n) {
+    fprintf(stderr, "%s: %s `%s': ", argv[0],
+      (ferror(fp) || !feof(fp)) ? "Cannot read from" : "Unexpected EOF in", argv[3]);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+  fclose(fp);
+
+  /* Search and print one SA entry per line for each of the size matches. */
+  size = sa_search(T, (saidx_t)n,
+                   (const sauchar_t *)P, (saidx_t)Psize,
+                   SA, (saidx_t)n, &left);
+  for(i = 0; i < size; ++i) {
+    fprintf(stdout, "%" PRIdSAIDX_T "\n", SA[left + i]);
+  }
+
+  /* Deallocate memory. */
+  free(SA);
+  free(T);
+
+  return 0;
+}
diff --git a/tools/apultra/src/libdivsufsort/examples/suftest.c b/tools/apultra/src/libdivsufsort/examples/suftest.c
new file mode 100644
index 0000000..71892ac
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/examples/suftest.c
@@ -0,0 +1,164 @@
+/*
+ * suftest.c for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+#include <stdio.h>
+#if HAVE_STRING_H
+# include <string.h>
+#endif
+#if HAVE_STDLIB_H
+# include <stdlib.h>
+#endif
+#if HAVE_MEMORY_H
+# include <memory.h>
+#endif
+#if HAVE_STDDEF_H
+# include <stddef.h>
+#endif
+#if HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#if HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#if HAVE_IO_H && HAVE_FCNTL_H
+# include <io.h>
+# include <fcntl.h>
+#endif
+#include <time.h>
+#include <divsufsort.h>
+#include "lfs.h"
+
+
+/* Print the version banner and usage text to stderr, then exit with status. */
+static void
+print_help(const char *progname, int status) {
+  const char *version = divsufsort_version();
+  fprintf(stderr, "suftest, a suffixsort tester, version %s.\n",
+          version);
+  fprintf(stderr, "usage: %s FILE\n\n", progname);
+  exit(status);
+}
+
+int
+main(int argc, const char *argv[]) { /* suftest FILE; "-" selects stdin */
+  FILE *fp;
+  const char *fname;   /* name used in messages: argv[1] or "stdin" */
+  sauchar_t *T;        /* the whole input */
+  saidx_t *SA;         /* suffix array filled by divsufsort(), n entries */
+  LFS_OFF_T n;         /* input size in bytes */
+  clock_t start, finish;
+  saint_t needclose = 1; /* non-zero when fp must be closed by us */
+
+  /* Check arguments: no arguments or -h/--help prints the usage and exits successfully. */
+  if((argc == 1) ||
+     (strcmp(argv[1], "-h") == 0) ||
+     (strcmp(argv[1], "--help") == 0)) { print_help(argv[0], EXIT_SUCCESS); }
+  if(argc != 2) { print_help(argv[0], EXIT_FAILURE); }
+
+  /* Open a file for reading. */
+  if(strcmp(argv[1], "-") != 0) {
+#if HAVE_FOPEN_S
+    if(fopen_s(&fp, fname = argv[1], "rb") != 0) {
+#else
+    if((fp = LFS_FOPEN(fname = argv[1], "rb")) == NULL) {
+#endif
+      fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], fname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+#if HAVE__SETMODE && HAVE__FILENO
+    if(_setmode(_fileno(stdin), _O_BINARY) == -1) {
+      fprintf(stderr, "%s: Cannot set mode: ", argv[0]);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+#endif
+    fp = stdin;
+    fname = "stdin";
+    needclose = 0;
+  }
+
+  /* Get the file size; the input must be seekable and smaller than 0x7fffffff bytes. */
+  if(LFS_FSEEK(fp, 0, SEEK_END) == 0) {
+    n = LFS_FTELL(fp);
+    rewind(fp);
+    if(n < 0) {
+      fprintf(stderr, "%s: Cannot ftell `%s': ", argv[0], fname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+    if(0x7fffffff <= n) {
+      fprintf(stderr, "%s: Input file `%s' is too big.\n", argv[0], fname);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    fprintf(stderr, "%s: Cannot fseek `%s': ", argv[0], fname);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+
+  /* Allocate n bytes for T and n saidx_t entries for SA. */
+  T = (sauchar_t *)malloc((size_t)n * sizeof(sauchar_t));
+  SA = (saidx_t *)malloc((size_t)n * sizeof(saidx_t));
+  if((T == NULL) || (SA == NULL)) {
+    fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  /* Read n bytes of data; report the input as fname so stdin is not shown as "-". */
+  if(fread(T, sizeof(sauchar_t), (size_t)n, fp) != (size_t)n) {
+    fprintf(stderr, "%s: %s `%s': ",
+      argv[0],
+      (ferror(fp) || !feof(fp)) ? "Cannot read from" : "Unexpected EOF in",
+      fname);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+  if(needclose & 1) { fclose(fp); }
+
+  /* Construct the suffix array and report the elapsed CPU time. */
+  fprintf(stderr, "%s: %" PRIdOFF_T " bytes ... ", fname, n);
+  start = clock();
+  if(divsufsort(T, SA, (saidx_t)n) != 0) {
+    fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]);
+    exit(EXIT_FAILURE);
+  }
+  finish = clock();
+  fprintf(stderr, "%.4f sec\n", (double)(finish - start) / (double)CLOCKS_PER_SEC);
+
+  /* Check the suffix array; a non-zero sufcheck() result ends the program with failure. */
+  if(sufcheck(T, SA, (saidx_t)n, 1) != 0) { exit(EXIT_FAILURE); }
+
+  /* Deallocate memory. */
+  free(SA);
+  free(T);
+
+  return 0;
+}
diff --git a/tools/apultra/src/libdivsufsort/examples/unbwt.c b/tools/apultra/src/libdivsufsort/examples/unbwt.c
new file mode 100644
index 0000000..c0f19e9
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/examples/unbwt.c
@@ -0,0 +1,207 @@
+/*
+ * unbwt.c for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+#include <stdio.h>
+#if HAVE_STRING_H
+# include <string.h>
+#endif
+#if HAVE_STDLIB_H
+# include <stdlib.h>
+#endif
+#if HAVE_MEMORY_H
+# include <memory.h>
+#endif
+#if HAVE_STDDEF_H
+# include <stddef.h>
+#endif
+#if HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#if HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#if HAVE_IO_H && HAVE_FCNTL_H
+# include <io.h>
+# include <fcntl.h>
+#endif
+#include <time.h>
+#include <divsufsort.h>
+#include "lfs.h"
+
+
+/* Read a 32-bit little-endian integer into *n; returns the fread() item count
+ * (4 on success). *n is left unmodified on a short read. */
+static size_t
+read_int(FILE *fp, saidx_t *n) {
+  unsigned char c[4];
+  size_t m = fread(c, sizeof(unsigned char), 4, fp);
+  /* Widen to unsigned before shifting: (int)c[3] << 24 overflows for c[3] >= 0x80. */
+  if(m == 4) { *n = (saidx_t)(((unsigned long)c[0] <<  0) | ((unsigned long)c[1] <<  8) |
+                              ((unsigned long)c[2] << 16) | ((unsigned long)c[3] << 24)); }
+  return m;
+}
+
+/* Print the version banner and usage text to stderr, then exit with status. */
+static void
+print_help(const char *progname, int status) {
+  const char *version = divsufsort_version();
+  fprintf(stderr, "unbwt, an inverse burrows-wheeler transform program, version %s.\n",
+          version);
+  fprintf(stderr, "usage: %s INFILE OUTFILE\n\n", progname);
+  exit(status);
+}
+
+int
+main(int argc, const char *argv[]) { /* unbwt INFILE OUTFILE; "-" selects stdin/stdout */
+  FILE *fp, *ofp;
+  const char *fname, *ofname;
+  sauchar_t *T;   /* block buffer; inverse_bw_transform() gets T as both input and output */
+  saidx_t *A;     /* work array handed to inverse_bw_transform() */
+  LFS_OFF_T n;    /* running count of bytes written */
+  size_t m;       /* read_int() result, then the length of the current block */
+  saidx_t pidx;   /* 4-byte value stored in front of each block */
+  clock_t start, finish;
+  saint_t err, blocksize, needclose = 3; /* needclose: bit 0 = fclose(fp), bit 1 = fclose(ofp) */
+
+  /* Check arguments: no arguments or -h/--help prints the usage and exits successfully. */
+  if((argc == 1) ||
+     (strcmp(argv[1], "-h") == 0) ||
+     (strcmp(argv[1], "--help") == 0)) { print_help(argv[0], EXIT_SUCCESS); }
+  if(argc != 3) { print_help(argv[0], EXIT_FAILURE); }
+
+  /* Open a file for reading. */
+  if(strcmp(argv[1], "-") != 0) {
+#if HAVE_FOPEN_S
+    if(fopen_s(&fp, fname = argv[1], "rb") != 0) {
+#else
+    if((fp = LFS_FOPEN(fname = argv[1], "rb")) == NULL) {
+#endif
+      fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], fname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+#if HAVE__SETMODE && HAVE__FILENO
+    if(_setmode(_fileno(stdin), _O_BINARY) == -1) {
+      fprintf(stderr, "%s: Cannot set mode: ", argv[0]);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+#endif
+    fp = stdin;
+    fname = "stdin";
+    needclose ^= 1; /* stdin is not closed at the end */
+  }
+
+  /* Open a file for writing. */
+  if(strcmp(argv[2], "-") != 0) {
+#if HAVE_FOPEN_S
+    if(fopen_s(&ofp, ofname = argv[2], "wb") != 0) {
+#else
+    if((ofp = LFS_FOPEN(ofname = argv[2], "wb")) == NULL) {
+#endif
+      fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], ofname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+#if HAVE__SETMODE && HAVE__FILENO
+    if(_setmode(_fileno(stdout), _O_BINARY) == -1) {
+      fprintf(stderr, "%s: Cannot set mode: ", argv[0]);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+#endif
+    ofp = stdout;
+    ofname = "stdout";
+    needclose ^= 2; /* stdout is not closed at the end */
+  }
+
+  /* Read the blocksize (the first 4 bytes of the input). */
+  if(read_int(fp, &blocksize) != 4) {
+    fprintf(stderr, "%s: Cannot read from `%s': ", argv[0], fname);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+
+  /* Allocate blocksize bytes for T and blocksize saidx_t entries for A. NOTE(review): blocksize is not range-checked. */
+  T = (sauchar_t *)malloc(blocksize * sizeof(sauchar_t));
+  A = (saidx_t *)malloc(blocksize * sizeof(saidx_t));
+  if((T == NULL) || (A == NULL)) {
+    fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  fprintf(stderr, "UnBWT (blocksize %" PRIdSAINT_T ") ... ", blocksize);
+  start = clock();
+  for(n = 0; (m = read_int(fp, &pidx)) != 0; n += m) {
+    /* Read up to blocksize bytes of data; m becomes the length of this block. */
+    if((m != 4) || ((m = fread(T, sizeof(sauchar_t), blocksize, fp)) == 0)) {
+      fprintf(stderr, "%s: %s `%s': ",
+        argv[0],
+        (ferror(fp) || !feof(fp)) ? "Cannot read from" : "Unexpected EOF in",
+        fname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+
+    /* Inverse Burrows-Wheeler Transform. */
+    if((err = inverse_bw_transform(T, T, A, m, pidx)) != 0) {
+      fprintf(stderr, "%s (reverseBWT): %s.\n",
+        argv[0],
+        (err == -1) ? "Invalid data" : "Cannot allocate memory");
+      exit(EXIT_FAILURE);
+    }
+
+    /* Write m bytes of data. */
+    if(fwrite(T, sizeof(sauchar_t), m, ofp) != m) {
+      fprintf(stderr, "%s: Cannot write to `%s': ", argv[0], ofname);
+      perror(NULL);
+      exit(EXIT_FAILURE);
+    }
+  }
+  if(ferror(fp)) {
+    fprintf(stderr, "%s: Cannot read from `%s': ", argv[0], fname);
+    perror(NULL);
+    exit(EXIT_FAILURE);
+  }
+  finish = clock();
+  fprintf(stderr, "%" PRIdOFF_T " bytes: %.4f sec\n",
+    n, (double)(finish - start) / (double)CLOCKS_PER_SEC);
+
+  /* Close files */
+  if(needclose & 1) { fclose(fp); }
+  if(needclose & 2) { fclose(ofp); }
+
+  /* Deallocate memory. */
+  free(A);
+  free(T);
+
+  return 0;
+}
diff --git a/tools/apultra/src/libdivsufsort/include/CMakeLists.txt b/tools/apultra/src/libdivsufsort/include/CMakeLists.txt
new file mode 100644
index 0000000..37781cc
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/include/CMakeLists.txt
@@ -0,0 +1,162 @@
+include(CheckIncludeFiles)
+include(CheckIncludeFile)
+include(CheckSymbolExists)
+include(CheckTypeSize)
+include(CheckFunctionKeywords)
+include(CheckLFS)
+
+## Checks for header files ##
+check_include_file("inttypes.h" HAVE_INTTYPES_H)
+check_include_file("memory.h" HAVE_MEMORY_H)
+check_include_file("stddef.h" HAVE_STDDEF_H)
+check_include_file("stdint.h" HAVE_STDINT_H)
+check_include_file("stdlib.h" HAVE_STDLIB_H)
+check_include_file("string.h" HAVE_STRING_H)
+check_include_file("strings.h" HAVE_STRINGS_H)
+check_include_file("sys/types.h" HAVE_SYS_TYPES_H)
+if(HAVE_INTTYPES_H)  # INCFILE: include line for the integer-type header; presumably substituted into a template - TODO confirm
+  set(INCFILE "#include <inttypes.h>")
+elseif(HAVE_STDINT_H)
+  set(INCFILE "#include <stdint.h>")
+else(HAVE_INTTYPES_H)
+  set(INCFILE "")
+endif(HAVE_INTTYPES_H)
+
+## Example-only checks: WinIO symbols, large file support, and lfs.h generation ##
+if(BUILD_EXAMPLES)
+  ## Checks for WinIO ##
+  if(WIN32)
+    check_include_file("io.h" HAVE_IO_H)
+    check_include_file("fcntl.h" HAVE_FCNTL_H)
+    check_symbol_exists("_setmode" "io.h;fcntl.h" HAVE__SETMODE)
+    if(NOT HAVE__SETMODE)
+      check_symbol_exists("setmode" "io.h;fcntl.h" HAVE_SETMODE)
+    endif(NOT HAVE__SETMODE)
+    check_symbol_exists("_fileno" "stdio.h" HAVE__FILENO)
+    check_symbol_exists("fopen_s" "stdio.h" HAVE_FOPEN_S)
+    check_symbol_exists("_O_BINARY" "fcntl.h" HAVE__O_BINARY)
+  endif(WIN32)
+
+  ## Checks for large file support ##
+  check_lfs(WITH_LFS)
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/lfs.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/lfs.h" @ONLY)
+endif(BUILD_EXAMPLES)
+
+## generate config.h: pick the first supported spelling of the inline keyword ##
+check_function_keywords("inline;__inline;__inline__;__declspec(dllexport);__declspec(dllimport)")
+if(HAVE_INLINE)
+  set(INLINE "inline")
+elseif(HAVE___INLINE)
+  set(INLINE "__inline")
+elseif(HAVE___INLINE__)
+  set(INLINE "__inline__")
+else(HAVE_INLINE)
+  set(INLINE "")
+endif(HAVE_INLINE)
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/config.h")
+
+## Checks for types ##
+# sauchar_t (8bit): uint8_t if available, else unsigned char when its size is 1
+check_type_size("uint8_t" UINT8_T)
+if(HAVE_UINT8_T)
+  set(SAUCHAR_TYPE "uint8_t")
+else(HAVE_UINT8_T)
+  check_type_size("unsigned char" SIZEOF_UNSIGNED_CHAR)
+  if("${SIZEOF_UNSIGNED_CHAR}" STREQUAL "1")
+    set(SAUCHAR_TYPE "unsigned char")
+  else("${SIZEOF_UNSIGNED_CHAR}" STREQUAL "1")
+    message(FATAL_ERROR "Cannot find unsigned 8-bit integer type")
+  endif("${SIZEOF_UNSIGNED_CHAR}" STREQUAL "1")
+endif(HAVE_UINT8_T)
+# saint_t (32bit): int32_t if available, else the first 4-byte type among int, long, short, __int32
+check_type_size("int32_t" INT32_T)
+if(HAVE_INT32_T)
+  set(SAINT32_TYPE "int32_t")
+  check_symbol_exists("PRId32" "inttypes.h" HAVE_PRID32)
+  if(HAVE_PRID32)
+    set(SAINT32_PRId "PRId32")
+  else(HAVE_PRID32)
+    set(SAINT32_PRId "\"d\"")
+  endif(HAVE_PRID32)
+else(HAVE_INT32_T)
+  check_type_size("int" SIZEOF_INT)
+  check_type_size("long" SIZEOF_LONG)
+  check_type_size("short" SIZEOF_SHORT)
+  check_type_size("__int32" SIZEOF___INT32)
+  if("${SIZEOF_INT}" STREQUAL "4")
+    set(SAINT32_TYPE "int")
+    set(SAINT32_PRId "\"d\"")
+  elseif("${SIZEOF_LONG}" STREQUAL "4")
+    set(SAINT32_TYPE "long")
+    set(SAINT32_PRId "\"ld\"")
+  elseif("${SIZEOF_SHORT}" STREQUAL "4")
+    set(SAINT32_TYPE "short")
+    set(SAINT32_PRId "\"d\"")
+  elseif("${SIZEOF___INT32}" STREQUAL "4")
+    set(SAINT32_TYPE "__int32")
+    set(SAINT32_PRId "\"d\"")
+  else("${SIZEOF_INT}" STREQUAL "4")
+    message(FATAL_ERROR "Cannot find 32-bit integer type")
+  endif("${SIZEOF_INT}" STREQUAL "4")
+endif(HAVE_INT32_T)
+# saint64_t (64bit): only probed when BUILD_DIVSUFSORT64 is enabled
+if(BUILD_DIVSUFSORT64)
+  check_type_size("int64_t" INT64_T)
+  if(HAVE_INT64_T)
+    set(SAINT64_TYPE "int64_t")
+    check_symbol_exists("PRId64" "inttypes.h" HAVE_PRID64)
+    if(HAVE_PRID64)
+      set(SAINT64_PRId "PRId64")
+    else(HAVE_PRID64)
+      set(SAINT64_PRId "\"lld\"")
+    endif(HAVE_PRID64)
+  else(HAVE_INT64_T)
+    check_type_size("int" SIZEOF_INT)
+    check_type_size("long" SIZEOF_LONG)
+    check_type_size("long long" SIZEOF_LONG_LONG)
+    check_type_size("__int64" SIZEOF___INT64)
+    if("${SIZEOF_INT}" STREQUAL "8")
+      set(SAINT64_TYPE "int")
+      set(SAINT64_PRId "\"d\"")
+    elseif("${SIZEOF_LONG}" STREQUAL "8")
+      set(SAINT64_TYPE "long")
+      set(SAINT64_PRId "\"ld\"")
+    elseif("${SIZEOF_LONG_LONG}" STREQUAL "8")
+      set(SAINT64_TYPE "long long")
+      set(SAINT64_PRId "\"lld\"")
+    elseif("${SIZEOF___INT64}" STREQUAL "8")
+      set(SAINT64_TYPE "__int64")
+      set(SAINT64_PRId "\"I64d\"")
+    else("${SIZEOF_INT}" STREQUAL "8")
+      message(SEND_ERROR "Cannot find 64-bit integer type")  # reports an error but keeps processing this file
+      set(BUILD_DIVSUFSORT64 OFF)  # skips the divsufsort64.h pass below
+    endif("${SIZEOF_INT}" STREQUAL "8")
+  endif(HAVE_INT64_T)
+endif(BUILD_DIVSUFSORT64)
+
+## generate divsufsort.h (32-bit index type) and, when enabled, divsufsort64.h ##
+set(DIVSUFSORT_IMPORT "")
+set(DIVSUFSORT_EXPORT "")
+if(BUILD_SHARED_LIBS)
+  if(HAVE___DECLSPEC_DLLIMPORT_)
+    set(DIVSUFSORT_IMPORT "__declspec(dllimport)")
+  endif(HAVE___DECLSPEC_DLLIMPORT_)
+  if(HAVE___DECLSPEC_DLLEXPORT_)
+    set(DIVSUFSORT_EXPORT "__declspec(dllexport)")
+  endif(HAVE___DECLSPEC_DLLEXPORT_)
+endif(BUILD_SHARED_LIBS)
+set(W64BIT "")
+set(SAINDEX_TYPE "${SAINT32_TYPE}")
+set(SAINDEX_PRId "${SAINT32_PRId}")
+set(SAINT_PRId "${SAINT32_PRId}")
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/divsufsort.h.cmake"
+               "${CMAKE_CURRENT_BINARY_DIR}/divsufsort.h" @ONLY)
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/divsufsort.h" DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+if(BUILD_DIVSUFSORT64)  # second pass over the same template with the 64-bit index type
+  set(W64BIT "64")
+  set(SAINDEX_TYPE "${SAINT64_TYPE}")
+  set(SAINDEX_PRId "${SAINT64_PRId}")
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/divsufsort.h.cmake"
+                 "${CMAKE_CURRENT_BINARY_DIR}/divsufsort64.h" @ONLY)
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/divsufsort64.h" DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+endif(BUILD_DIVSUFSORT64)
diff --git a/tools/apultra/src/libdivsufsort/include/config.h.cmake b/tools/apultra/src/libdivsufsort/include/config.h.cmake
new file mode 100644
index 0000000..6a1cf47
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/include/config.h.cmake
@@ -0,0 +1,81 @@
+/*
+ * config.h for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/** Define to the version of this package. **/
+#cmakedefine PROJECT_VERSION_FULL "${PROJECT_VERSION_FULL}"
+
+/** Define to 1 if you have the header files. **/
+#cmakedefine HAVE_INTTYPES_H 1
+#cmakedefine HAVE_STDDEF_H 1
+#cmakedefine HAVE_STDINT_H 1
+#cmakedefine HAVE_STDLIB_H 1
+#cmakedefine HAVE_STRING_H 1
+#cmakedefine HAVE_STRINGS_H 1
+#cmakedefine HAVE_MEMORY_H 1
+#cmakedefine HAVE_SYS_TYPES_H 1
+
+/** for WinIO **/
+#cmakedefine HAVE_IO_H 1
+#cmakedefine HAVE_FCNTL_H 1
+#cmakedefine HAVE__SETMODE 1
+#cmakedefine HAVE_SETMODE 1
+#cmakedefine HAVE__FILENO 1
+#cmakedefine HAVE_FOPEN_S 1
+#cmakedefine HAVE__O_BINARY 1
+#ifndef HAVE__SETMODE
+# if HAVE_SETMODE
+#  define _setmode setmode
+#  define HAVE__SETMODE 1
+# endif
+# if HAVE__SETMODE && !HAVE__O_BINARY
+#  define _O_BINARY 0
+#  define HAVE__O_BINARY 1
+# endif
+#endif
+
+/** for inline **/
+#ifndef INLINE
+# define INLINE @INLINE@
+#endif
+
+/** for VC++ warning **/
+#ifdef _MSC_VER
+#pragma warning(disable: 4127)
+#endif
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _CONFIG_H */
diff --git a/tools/apultra/src/libdivsufsort/include/divsufsort.h b/tools/apultra/src/libdivsufsort/include/divsufsort.h
new file mode 100644
index 0000000..7ebb412
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/include/divsufsort.h
@@ -0,0 +1,189 @@
+/*
+ * divsufsort.h for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DIVSUFSORT_H
+#define _DIVSUFSORT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#define DIVSUFSORT_API
+
+/*- Datatypes -*/
+#ifndef SAUCHAR_T
+#define SAUCHAR_T
+typedef unsigned char sauchar_t;
+#endif /* SAUCHAR_T */
+#ifndef SAINT_T
+#define SAINT_T
+typedef int saint_t;
+#endif /* SAINT_T */
+#ifndef SAIDX_T
+#define SAIDX_T
+typedef int saidx_t;
+#endif /* SAIDX_T */
+#ifndef PRIdSAIDX_T
+#define PRIdSAIDX_T "d"
+#endif
+
+/*- divsufsort context: passed to divsufsort_init(), divsufsort_build_array() and divsufsort_destroy() -*/
+typedef struct _divsufsort_ctx_t {
+   saidx_t *bucket_A;  /* NOTE(review): presumably BUCKET_A_SIZE entries (divsufsort_private.h) - confirm */
+   saidx_t *bucket_B;  /* NOTE(review): presumably BUCKET_B_SIZE entries (divsufsort_private.h) - confirm */
+} divsufsort_ctx_t;
+
+/*- Prototypes -*/
+
+/**
+ * Initialize suffix array context
+ * @param ctx suffix array context to initialize
+ * @return 0 for success, or non-zero in case of an error
+ */
+int divsufsort_init(divsufsort_ctx_t *ctx);
+
+/**
+ * Destroy suffix array context
+ *
+ * @param ctx suffix array context to destroy
+ */
+void divsufsort_destroy(divsufsort_ctx_t *ctx);
+
+/**
+ * Constructs the suffix array of a given string.
+ * @param ctx suffix array context
+ * @param T[0..n-1] The input string.
+ * @param SA[0..n-1] The output array of suffixes.
+ * @param n The length of the given string.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+DIVSUFSORT_API
+saint_t divsufsort_build_array(divsufsort_ctx_t *ctx, const sauchar_t *T, saidx_t *SA, saidx_t n);
+
+#if 0
+/**
+ * Constructs the burrows-wheeler transformed string of a given string.
+ * @param T[0..n-1] The input string.
+ * @param U[0..n-1] The output string. (can be T)
+ * @param A[0..n-1] The temporary array. (can be NULL)
+ * @param n The length of the given string.
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+DIVSUFSORT_API
+saidx_t
+divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n);
+
+/**
+ * Returns the version of the divsufsort library.
+ * @return The version number string.
+ */
+DIVSUFSORT_API
+const char *
+divsufsort_version(void);
+
+
+/**
+ * Constructs the Burrows-Wheeler transformed string of a given string and suffix array.
+ * @param T[0..n-1] The input string.
+ * @param U[0..n-1] The output string. (can be T)
+ * @param SA[0..n-1] The suffix array. (can be NULL)
+ * @param n The length of the given string.
+ * @param idx The output primary index.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+DIVSUFSORT_API
+saint_t
+bw_transform(const sauchar_t *T, sauchar_t *U,
+             saidx_t *SA /* can be NULL */,
+             saidx_t n, saidx_t *idx);
+
+/**
+ * Inverse BW-transforms a given BWTed string.
+ * @param T[0..n-1] The input string.
+ * @param U[0..n-1] The output string. (can be T)
+ * @param A[0..n-1] The temporary array. (can be NULL)
+ * @param n The length of the given string.
+ * @param idx The primary index.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+DIVSUFSORT_API
+saint_t
+inverse_bw_transform(const sauchar_t *T, sauchar_t *U,
+                     saidx_t *A /* can be NULL */,
+                     saidx_t n, saidx_t idx);
+
+/**
+ * Checks the correctness of a given suffix array.
+ * @param T[0..n-1] The input string.
+ * @param SA[0..n-1] The input suffix array.
+ * @param n The length of the given string.
+ * @param verbose The verbose mode.
+ * @return 0 if no error occurred.
+ */
+DIVSUFSORT_API
+saint_t
+sufcheck(const sauchar_t *T, const saidx_t *SA, saidx_t n, saint_t verbose);
+
+/**
+ * Search for the pattern P in the string T.
+ * @param T[0..Tsize-1] The input string.
+ * @param Tsize The length of the given string.
+ * @param P[0..Psize-1] The input pattern string.
+ * @param Psize The length of the given pattern string.
+ * @param SA[0..SAsize-1] The input suffix array.
+ * @param SAsize The length of the given suffix array.
+ * @param left The output index.
+ * @return The count of matches if no error occurred, -1 otherwise.
+ */
+DIVSUFSORT_API
+saidx_t
+sa_search(const sauchar_t *T, saidx_t Tsize,
+          const sauchar_t *P, saidx_t Psize,
+          const saidx_t *SA, saidx_t SAsize,
+          saidx_t *left);
+
+/**
+ * Search for the character c in the string T.
+ * @param T[0..Tsize-1] The input string.
+ * @param Tsize The length of the given string.
+ * @param SA[0..SAsize-1] The input suffix array.
+ * @param SAsize The length of the given suffix array.
+ * @param c The input character.
+ * @param left The output index.
+ * @return The count of matches if no error occurred, -1 otherwise.
+ */
+DIVSUFSORT_API
+saidx_t
+sa_simplesearch(const sauchar_t *T, saidx_t Tsize,
+                const saidx_t *SA, saidx_t SAsize,
+                saint_t c, saidx_t *left);
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _DIVSUFSORT_H */
diff --git a/tools/apultra/src/libdivsufsort/include/divsufsort.h.cmake b/tools/apultra/src/libdivsufsort/include/divsufsort.h.cmake
new file mode 100644
index 0000000..bcaba7c
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/include/divsufsort.h.cmake
@@ -0,0 +1,180 @@
+/*
+ * divsufsort@W64BIT@.h for libdivsufsort@W64BIT@
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DIVSUFSORT@W64BIT@_H
+#define _DIVSUFSORT@W64BIT@_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+@INCFILE@
+
+#ifndef DIVSUFSORT_API
+# ifdef DIVSUFSORT_BUILD_DLL
+#  define DIVSUFSORT_API @DIVSUFSORT_EXPORT@
+# else
+#  define DIVSUFSORT_API @DIVSUFSORT_IMPORT@
+# endif
+#endif
+
+/*- Datatypes -*/
+#ifndef SAUCHAR_T
+#define SAUCHAR_T
+typedef @SAUCHAR_TYPE@ sauchar_t;
+#endif /* SAUCHAR_T */
+#ifndef SAINT_T
+#define SAINT_T
+typedef @SAINT32_TYPE@ saint_t;
+#endif /* SAINT_T */
+#ifndef SAIDX@W64BIT@_T
+#define SAIDX@W64BIT@_T
+typedef @SAINDEX_TYPE@ saidx@W64BIT@_t;
+#endif /* SAIDX@W64BIT@_T */
+#ifndef PRIdSAINT_T
+#define PRIdSAINT_T @SAINT_PRId@
+#endif /* PRIdSAINT_T */
+#ifndef PRIdSAIDX@W64BIT@_T
+#define PRIdSAIDX@W64BIT@_T @SAINDEX_PRId@
+#endif /* PRIdSAIDX@W64BIT@_T */
+
+
+/*- Prototypes -*/
+
+/**
+ * Constructs the suffix array of a given string.
+ * @param T[0..n-1] The input string.
+ * @param SA[0..n-1] The output array of suffixes.
+ * @param n The length of the given string.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+DIVSUFSORT_API
+saint_t
+divsufsort@W64BIT@(const sauchar_t *T, saidx@W64BIT@_t *SA, saidx@W64BIT@_t n);
+
+/**
+ * Constructs the burrows-wheeler transformed string of a given string.
+ * @param T[0..n-1] The input string.
+ * @param U[0..n-1] The output string. (can be T)
+ * @param A[0..n-1] The temporary array. (can be NULL)
+ * @param n The length of the given string.
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+DIVSUFSORT_API
+saidx@W64BIT@_t
+divbwt@W64BIT@(const sauchar_t *T, sauchar_t *U, saidx@W64BIT@_t *A, saidx@W64BIT@_t n);
+
+/**
+ * Returns the version of the divsufsort library.
+ * @return The version number string.
+ */
+DIVSUFSORT_API
+const char *
+divsufsort@W64BIT@_version(void);
+
+
+/**
+ * Constructs the Burrows-Wheeler transformed string of a given string and suffix array.
+ * @param T[0..n-1] The input string.
+ * @param U[0..n-1] The output string. (can be T)
+ * @param SA[0..n-1] The suffix array. (can be NULL)
+ * @param n The length of the given string.
+ * @param idx The output primary index.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+DIVSUFSORT_API
+saint_t
+bw_transform@W64BIT@(const sauchar_t *T, sauchar_t *U,
+             saidx@W64BIT@_t *SA /* can be NULL */,
+             saidx@W64BIT@_t n, saidx@W64BIT@_t *idx);
+
+/**
+ * Inverse BW-transforms a given BWTed string.
+ * @param T[0..n-1] The input string.
+ * @param U[0..n-1] The output string. (can be T)
+ * @param A[0..n-1] The temporary array. (can be NULL)
+ * @param n The length of the given string.
+ * @param idx The primary index.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+DIVSUFSORT_API
+saint_t
+inverse_bw_transform@W64BIT@(const sauchar_t *T, sauchar_t *U,
+                     saidx@W64BIT@_t *A /* can be NULL */,
+                     saidx@W64BIT@_t n, saidx@W64BIT@_t idx);
+
+/**
+ * Checks the correctness of a given suffix array.
+ * @param T[0..n-1] The input string.
+ * @param SA[0..n-1] The input suffix array.
+ * @param n The length of the given string.
+ * @param verbose The verbose mode.
+ * @return 0 if no error occurred.
+ */
+DIVSUFSORT_API
+saint_t
+sufcheck@W64BIT@(const sauchar_t *T, const saidx@W64BIT@_t *SA, saidx@W64BIT@_t n, saint_t verbose);
+
+/**
+ * Search for the pattern P in the string T.
+ * @param T[0..Tsize-1] The input string.
+ * @param Tsize The length of the given string.
+ * @param P[0..Psize-1] The input pattern string.
+ * @param Psize The length of the given pattern string.
+ * @param SA[0..SAsize-1] The input suffix array.
+ * @param SAsize The length of the given suffix array.
+ * @param left The output index.
+ * @return The count of matches if no error occurred, -1 otherwise.
+ */
+DIVSUFSORT_API
+saidx@W64BIT@_t
+sa_search@W64BIT@(const sauchar_t *T, saidx@W64BIT@_t Tsize,
+          const sauchar_t *P, saidx@W64BIT@_t Psize,
+          const saidx@W64BIT@_t *SA, saidx@W64BIT@_t SAsize,
+          saidx@W64BIT@_t *left);
+
+/**
+ * Search for the character c in the string T.
+ * @param T[0..Tsize-1] The input string.
+ * @param Tsize The length of the given string.
+ * @param SA[0..SAsize-1] The input suffix array.
+ * @param SAsize The length of the given suffix array.
+ * @param c The input character.
+ * @param left The output index.
+ * @return The count of matches if no error occurred, -1 otherwise.
+ */
+DIVSUFSORT_API
+saidx@W64BIT@_t
+sa_simplesearch@W64BIT@(const sauchar_t *T, saidx@W64BIT@_t Tsize,
+                const saidx@W64BIT@_t *SA, saidx@W64BIT@_t SAsize,
+                saint_t c, saidx@W64BIT@_t *left);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _DIVSUFSORT@W64BIT@_H */
diff --git a/tools/apultra/src/libdivsufsort/include/divsufsort_config.h b/tools/apultra/src/libdivsufsort/include/divsufsort_config.h
new file mode 100644
index 0000000..4054a8a
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/include/divsufsort_config.h
@@ -0,0 +1,9 @@
+#define HAVE_STRING_H 1
+#define HAVE_STDLIB_H 1
+#define HAVE_MEMORY_H 1
+#define HAVE_STDINT_H 1
+#define INLINE inline
+
+#ifdef _MSC_VER
+#pragma warning( disable : 4244 )
+#endif /* _MSC_VER */
diff --git a/tools/apultra/src/libdivsufsort/include/divsufsort_private.h b/tools/apultra/src/libdivsufsort/include/divsufsort_private.h
new file mode 100644
index 0000000..b4d97ad
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/include/divsufsort_private.h
@@ -0,0 +1,205 @@
+/*
+ * divsufsort_private.h for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DIVSUFSORT_PRIVATE_H
+#define _DIVSUFSORT_PRIVATE_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include "divsufsort_config.h"
+#include <assert.h>
+#include <stdio.h>
+#if HAVE_STRING_H
+# include <string.h>
+#endif
+#if HAVE_STDLIB_H
+# include <stdlib.h>
+#endif
+#if HAVE_MEMORY_H
+# include <memory.h>
+#endif
+#if HAVE_STDDEF_H
+# include <stddef.h>
+#endif
+#if HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#else
+# if HAVE_STDINT_H
+#  include <stdint.h>
+# endif
+#endif
+#if defined(BUILD_DIVSUFSORT64)
+# include "divsufsort64.h"
+# ifndef SAIDX_T
+#  define SAIDX_T
+#  define saidx_t saidx64_t
+# endif /* SAIDX_T */
+# ifndef PRIdSAIDX_T
+#  define PRIdSAIDX_T PRIdSAIDX64_T
+# endif /* PRIdSAIDX_T */
+# define divsufsort divsufsort64
+# define divbwt divbwt64
+# define divsufsort_version divsufsort64_version
+# define bw_transform bw_transform64
+# define inverse_bw_transform inverse_bw_transform64
+# define sufcheck sufcheck64
+# define sa_search sa_search64
+# define sa_simplesearch sa_simplesearch64
+# define sssort sssort64
+# define trsort trsort64
+#else
+# include "divsufsort.h"
+#endif
+
+
+/*- Constants -*/
+#if !defined(UINT8_MAX)
+# define UINT8_MAX (255)
+#endif /* UINT8_MAX */
+#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1)
+# undef ALPHABET_SIZE
+#endif
+#if !defined(ALPHABET_SIZE)
+# define ALPHABET_SIZE (UINT8_MAX + 1)
+#endif
+/* for divsufsort.c */
+#define BUCKET_A_SIZE (ALPHABET_SIZE)
+#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE)
+/* for sssort.c */
+#if defined(SS_INSERTIONSORT_THRESHOLD)
+# if SS_INSERTIONSORT_THRESHOLD < 1
+#  undef SS_INSERTIONSORT_THRESHOLD
+#  define SS_INSERTIONSORT_THRESHOLD (1)
+# endif
+#else
+# define SS_INSERTIONSORT_THRESHOLD (8)
+#endif
+#if defined(SS_BLOCKSIZE)
+# if SS_BLOCKSIZE < 0
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (0)
+# elif 32768 <= SS_BLOCKSIZE
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (32767)
+# endif
+#else
+# define SS_BLOCKSIZE (1024)
+#endif
+/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */
+#if SS_BLOCKSIZE == 0
+# if defined(BUILD_DIVSUFSORT64)
+#  define SS_MISORT_STACKSIZE (96)
+# else
+#  define SS_MISORT_STACKSIZE (64)
+# endif
+#elif SS_BLOCKSIZE <= 4096
+# define SS_MISORT_STACKSIZE (16)
+#else
+# define SS_MISORT_STACKSIZE (24)
+#endif
+#if defined(BUILD_DIVSUFSORT64)
+# define SS_SMERGE_STACKSIZE (64)
+#else
+# define SS_SMERGE_STACKSIZE (32)
+#endif
+/* for trsort.c */
+#define TR_INSERTIONSORT_THRESHOLD (8)
+#if defined(BUILD_DIVSUFSORT64)
+# define TR_STACKSIZE (96)
+#else
+# define TR_STACKSIZE (64)
+#endif
+
+
+/*- Macros (several expand to code that uses names from the scope where they are invoked) -*/
+#ifndef SWAP
+# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0) /* needs a variable 't' declared by the user of the macro */
+#endif /* SWAP */
+#ifndef MIN
+# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b)) /* evaluates one argument twice */
+#endif /* MIN */
+#ifndef MAX
+# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b)) /* evaluates one argument twice */
+#endif /* MAX */
+#define STACK_PUSH(_a, _b, _c, _d) /* uses 'stack', 'ssize' and STACK_SIZE from the invoking scope */\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize++].d = (_d);\
+  } while(0)
+#define STACK_PUSH5(_a, _b, _c, _d, _e) /* five-field variant of STACK_PUSH */\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\
+  } while(0)
+#define STACK_POP(_a, _b, _c, _d) /* uses 'stack' and 'ssize' from the invoking scope */\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; } /* empty stack: returns from the function that invoked the macro */\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d;\
+  } while(0)
+#define STACK_POP5(_a, _b, _c, _d, _e) /* five-field variant of STACK_POP */\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; } /* empty stack: returns from the function that invoked the macro */\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\
+  } while(0)
+/* for divsufsort.c: these index 'bucket_A' / 'bucket_B' arrays visible in the invoking scope */
+#define BUCKET_A(_c0) bucket_A[(_c0)]
+#if ALPHABET_SIZE == 256 /* shift/or form of the generic indexing below, for arguments in 0..255 */
+#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)])
+#else
+#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)])
+#endif
+
+
+/*- Private Prototypes -*/
+/* sssort.c */
+void
+sssort(const sauchar_t *Td, const saidx_t *PA,
+       saidx_t *first, saidx_t *last,
+       saidx_t *buf, saidx_t bufsize,
+       saidx_t depth, saidx_t n, saint_t lastsuffix);
+/* trsort.c */
+void
+trsort(saidx_t *ISA, saidx_t *SA, saidx_t n, saidx_t depth);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _DIVSUFSORT_PRIVATE_H */
diff --git a/tools/apultra/src/libdivsufsort/include/lfs.h.cmake b/tools/apultra/src/libdivsufsort/include/lfs.h.cmake
new file mode 100644
index 0000000..d5b84a8
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/include/lfs.h.cmake
@@ -0,0 +1,56 @@
+/*
+ * lfs.h for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _LFS_H
+#define _LFS_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#ifndef __STRICT_ANSI__
+# define LFS_OFF_T @LFS_OFF_T@
+# define LFS_FOPEN @LFS_FOPEN@
+# define LFS_FTELL @LFS_FTELL@
+# define LFS_FSEEK @LFS_FSEEK@
+# define LFS_PRId  @LFS_PRID@
+#else
+# define LFS_OFF_T long
+# define LFS_FOPEN fopen
+# define LFS_FTELL ftell
+# define LFS_FSEEK fseek
+# define LFS_PRId "ld"
+#endif
+#ifndef PRIdOFF_T
+# define PRIdOFF_T LFS_PRId
+#endif
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _LFS_H */
diff --git a/tools/apultra/src/libdivsufsort/pkgconfig/CMakeLists.txt b/tools/apultra/src/libdivsufsort/pkgconfig/CMakeLists.txt
new file mode 100644
index 0000000..ee7063c
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/pkgconfig/CMakeLists.txt
@@ -0,0 +1,9 @@
+## generate libdivsufsort.pc (and libdivsufsort64.pc when BUILD_DIVSUFSORT64 is set) ##
+set(W64BIT "")  # @W64BIT@ suffixes the package and library names in the template; empty for the 32-bit build
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/libdivsufsort.pc.cmake" "${CMAKE_CURRENT_BINARY_DIR}/libdivsufsort.pc" @ONLY)  # @ONLY leaves the template's ${prefix}-style references untouched
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libdivsufsort.pc" DESTINATION ${CMAKE_INSTALL_PKGCONFIGDIR})
+if(BUILD_DIVSUFSORT64)
+  set(W64BIT "64")  # render the same template again with the "64" suffix
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/libdivsufsort.pc.cmake" "${CMAKE_CURRENT_BINARY_DIR}/libdivsufsort64.pc" @ONLY)
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libdivsufsort64.pc" DESTINATION ${CMAKE_INSTALL_PKGCONFIGDIR})
+endif(BUILD_DIVSUFSORT64)
diff --git a/tools/apultra/src/libdivsufsort/pkgconfig/libdivsufsort.pc.cmake b/tools/apultra/src/libdivsufsort/pkgconfig/libdivsufsort.pc.cmake
new file mode 100644
index 0000000..6419d1e
--- /dev/null
+++ b/tools/apultra/src/libdivsufsort/pkgconfig/libdivsufsort.pc.cmake
@@ -0,0 +1,11 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=@CMAKE_INSTALL_LIBDIR@
+includedir=@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: @PROJECT_NAME@@W64BIT@
+Description: @PROJECT_DESCRIPTION@
+Version: @PROJECT_VERSION_FULL@
+URL: @PROJECT_URL@
+Libs: -L${libdir} -ldivsufsort@W64BIT@
+Cflags: -I${includedir}
diff --git a/tools/apultra/src/matchfinder.c b/tools/apultra/src/matchfinder.c
new file mode 100644
index 0000000..a9987f9
--- /dev/null
+++ b/tools/apultra/src/matchfinder.c
@@ -0,0 +1,449 @@
+/*
+ * matchfinder.c - LZ match finder implementation
+ *
+ * The following copying information applies to this specific source code file:
+ *
+ * Written in 2019 by Emmanuel Marty <marty.emmanuel@gmail.com>
+ * Portions written in 2014-2015 by Eric Biggers <ebiggers3@gmail.com>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
+ * Dedication (the "CC0").
+ *
+ * This software is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
+ *
+ * You should have received a copy of the CC0 along with this software; if not
+ * see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke <zxintrospec@gmail.com>
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "matchfinder.h"
+#include "format.h"
+#include "libapultra.h"
+
+/**
+ * Hash an index down to TAG_BITS bits (multiplicative hashing)
+ *
+ * @param nIndex index value to hash
+ * @return the top TAG_BITS bits of the 64-bit product of nIndex and the hash multiplier
+ */
+static inline int apultra_get_index_tag(unsigned int nIndex) {
+   const unsigned long long nProduct = 11400714819323198485ULL * (unsigned long long)nIndex;
+   return (int)(nProduct >> (64ULL - TAG_BITS));
+}
+
+/**
+ * Parse input data, build suffix array and overlaid data structures to speed up match finding.
+ * Fills pCompressor->intervals and pCompressor->pos_data, using pCompressor->open_intervals as a stack.
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nInWindowSize total input size in bytes (previously compressed bytes + bytes to compress)
+ * @return 0 for success, non-zero for failure (100 if the suffix array could not be built)
+ */
+int apultra_build_suffix_array(apultra_compressor *pCompressor, const unsigned char *pInWindow, const int nInWindowSize) {
+   unsigned long long *intervals = pCompressor->intervals;
+
+   /* Build the suffix array from the input data; it is written as saidx_t entries over the start of intervals[] */
+   saidx_t *suffixArray = (saidx_t*)intervals;
+   if (divsufsort_build_array(&pCompressor->divsufsort_context, pInWindow, suffixArray, nInWindowSize) != 0) {
+      return 100;
+   }
+
+   int i, r;
+
+   for (i = nInWindowSize - 1; i >= 0; i--) {  /* Widen each entry in place, last to first (presumably so the wider stores only land on entries already read) */
+      intervals[i] = suffixArray[i];
+   }
+
+   int *PLCP = (int*)pCompressor->pos_data;  /* Borrowed as scratch; pos_data receives its final contents further down */
+   int *Phi = PLCP;  /* Same storage: PLCP[i] replaces Phi[i] once Phi[i] has been consumed */
+   int nCurLen = 0;
+
+   /* Compute the permuted LCP first (Kärkkäinen method) */
+   Phi[intervals[0]] = -1;  /* First suffix in sorted order has no predecessor. NOTE(review): intervals[0] is read even when nInWindowSize is 0 - confirm callers never pass an empty window */
+   for (i = 1; i < nInWindowSize; i++)
+      Phi[intervals[i]] = (unsigned int)intervals[i - 1];  /* Phi[pos] = position of the suffix sorted just before pos */
+   for (i = 0; i < nInWindowSize; i++) {
+      if (Phi[i] == -1) {
+         PLCP[i] = 0;
+         continue;
+      }
+      int nMaxLen = (i > Phi[i]) ? (nInWindowSize - i) : (nInWindowSize - Phi[i]);  /* Keep both suffixes inside the window */
+      while (nCurLen < nMaxLen && pInWindow[i + nCurLen] == pInWindow[Phi[i] + nCurLen]) nCurLen++;
+      PLCP[i] = nCurLen;
+      if (nCurLen > 0)
+         nCurLen--;  /* The comparison for position i + 1 resumes from this length */
+   }
+
+   /* Rotate permuted LCP into the LCP. This has better cache locality than the direct Kasai LCP method. This also
+    * saves us from having to build the inverse suffix array index, as the LCP is calculated without it using this method,
+    * and the interval builder below doesn't need it either. */
+   intervals[0] &= POS_MASK;
+
+   for (i = 1; i < nInWindowSize; i++) {
+      int nIndex = (int)(intervals[i] & POS_MASK);
+      int nLen = PLCP[nIndex];
+      if (nLen < MIN_MATCH_SIZE)  /* Lengths below the minimum match size are recorded as 0 */
+         nLen = 0;
+      if (nLen > LCP_MAX)  /* Clamp to the largest length that is stored */
+         nLen = LCP_MAX;
+      int nTaggedLen = 0;
+      if (nLen)
+         nTaggedLen = (nLen << TAG_BITS) | (apultra_get_index_tag((unsigned int)nIndex) & ((1 << TAG_BITS) - 1));  /* Length above, hash tag of the position below */
+      intervals[i] = ((unsigned long long)nIndex) | (((unsigned long long)nTaggedLen) << LCP_SHIFT);  /* Position in the low bits, tagged length from LCP_SHIFT up */
+   }
+
+   /**
+    * Build intervals for finding matches
+    *
+    * Methodology and code fragment taken from wimlib (CC0 license):
+    * https://wimlib.net/git/?p=wimlib;a=blob_plain;f=src/lcpit_matchfinder.c;h=a2d6a1e0cd95200d1f3a5464d8359d5736b14cbe;hb=HEAD
+    */
+   unsigned long long * const SA_and_LCP = intervals;
+   unsigned long long *pos_data = pCompressor->pos_data;
+   unsigned long long next_interval_idx;
+   unsigned long long *top = pCompressor->open_intervals;  /* Stack of the currently open intervals */
+   unsigned long long prev_pos = SA_and_LCP[0] & POS_MASK;
+
+   *top = 0;  /* Bottom stack entry: interval index 0 with lcp 0 */
+   intervals[0] = 0;
+   next_interval_idx = 1;
+
+   for (r = 1; r < nInWindowSize; r++) {
+      const unsigned long long next_pos = SA_and_LCP[r] & POS_MASK;
+      const unsigned long long next_lcp = SA_and_LCP[r] & LCP_MASK;
+      const unsigned long long top_lcp = *top & LCP_MASK;
+
+      if (next_lcp == top_lcp) {
+         /* Continuing the deepest open interval */
+         pos_data[prev_pos] = *top;
+      }
+      else if (next_lcp > top_lcp) {
+         /* Opening a new interval: push it with the next free interval index */
+         *++top = next_lcp | next_interval_idx++;
+         pos_data[prev_pos] = *top;
+      }
+      else {
+         /* Closing the deepest open interval */
+         pos_data[prev_pos] = *top;
+         for (;;) {
+            const unsigned long long closed_interval_idx = *top-- & POS_MASK;  /* Pop the interval being closed */
+            const unsigned long long superinterval_lcp = *top & LCP_MASK;
+
+            if (next_lcp == superinterval_lcp) {
+               /* Continuing the superinterval */
+               intervals[closed_interval_idx] = *top;
+               break;
+            }
+            else if (next_lcp > superinterval_lcp) {
+               /* Creating a new interval that is a
+                * superinterval of the one being
+                * closed, but still a subinterval of
+                * its superinterval */
+               *++top = next_lcp | next_interval_idx++;
+               intervals[closed_interval_idx] = *top;
+               break;
+            }
+            else {
+               /* Also closing the superinterval */
+               intervals[closed_interval_idx] = *top;
+            }
+         }
+      }
+      prev_pos = next_pos;
+   }
+
+   /* Close any still-open intervals, linking each one to the entry below it on the stack. */
+   pos_data[prev_pos] = *top;
+   for (; top > pCompressor->open_intervals; top--)
+      intervals[*top & POS_MASK] = *(top - 1);
+
+   /* Success */
+   return 0;
+}
+
+/**
+ * Find matches at the specified offset in the input window
+ *
+ * Besides collecting matches, every call relinks entries of the compressor's
+ * intervals[] and pos_data[] tables to the current offset. Those updates are
+ * not guarded by nMaxMatches, so the call has side effects on pCompressor even
+ * when nMaxMatches is 0 and nothing is stored in pMatches / pMatchDepth.
+ *
+ * @param pCompressor compression context
+ * @param nOffset offset to find matches at, in the input window
+ * @param pMatches pointer to returned matches
+ * @param pMatchDepth pointer to returned match depths
+ * @param pMatch1 pointer to 1-byte length, 4 bit offset match
+ * @param nMaxMatches maximum number of matches to return (0 for none)
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ *
+ * @return number of matches
+ */
+int apultra_find_matches_at(apultra_compressor *pCompressor, const int nOffset, apultra_match *pMatches, unsigned short *pMatchDepth, unsigned char *pMatch1, const int nMaxMatches, const int nBlockFlags) {
+   unsigned long long *intervals = pCompressor->intervals;
+   unsigned long long *pos_data = pCompressor->pos_data;
+   unsigned long long ref;
+   unsigned long long super_ref;
+   unsigned long long match_pos;
+   apultra_match *matchptr;
+   unsigned short *depthptr;
+   const int nMaxOffset = pCompressor->max_offset;
+
+   /* Default: no short match reported; only overwritten further down when an
+    * offset in the range 1..15 is seen. */
+   *pMatch1 = 0;
+
+   /**
+    * Find matches using intervals
+    *
+    * Taken from wimlib (CC0 license):
+    * https://wimlib.net/git/?p=wimlib;a=blob_plain;f=src/lcpit_matchfinder.c;h=a2d6a1e0cd95200d1f3a5464d8359d5736b14cbe;hb=HEAD
+    */
+
+    /* Get the deepest lcp-interval containing the current suffix. */
+   ref = pos_data[nOffset];
+
+   pos_data[nOffset] = 0;
+
+   /* Ascend until we reach a visited interval, the root, or a child of the
+    * root.  Link unvisited intervals to the current suffix as we go.  */
+   while ((super_ref = intervals[ref & POS_MASK]) & LCP_MASK) {
+      intervals[ref & POS_MASK] = nOffset | VISITED_FLAG;
+      ref = super_ref;
+   }
+
+   if (super_ref == 0) {
+      /* In this case, the current interval may be any of:
+       * (1) the root;
+       * (2) an unvisited child of the root */
+
+      if (ref != 0)  /* Not the root?  */
+         intervals[ref & POS_MASK] = nOffset | VISITED_FLAG;
+      /* Nothing was written to pMatches on this path. */
+      return 0;
+   }
+
+   /* Ascend indirectly via pos_data[] links.  */
+   match_pos = super_ref & EXCL_VISITED_MASK;
+   matchptr = pMatches;
+   depthptr = pMatchDepth;
+   /* State used to fold runs of candidates: offset/length of the last accepted
+    * candidate, the running depth counter, and the depth slot of the last
+    * entry that was actually stored. */
+   int nPrevOffset = 0;
+   int nPrevLen = 0;
+   int nCurDepth = 0;
+   unsigned short *cur_depth = NULL;
+   
+   /* Extra candidate taken from the first visited interval. This and the other
+    * (nBlockFlags & 3) == 3 branches only run when both block flag bits are set. */
+   if (nOffset >= match_pos && (nBlockFlags & 3) == 3) {
+      int nMatchOffset = (int)(nOffset - match_pos);
+      int nMatchLen = (int)(ref >> (LCP_SHIFT + TAG_BITS));
+
+      if ((matchptr - pMatches) < nMaxMatches) {
+         if (nMatchOffset <= nMaxOffset) {
+            /* nPrevOffset is still 0 at this point, so this condition cannot hold
+             * here and the else branch below always stores the entry. */
+            if (nPrevOffset && nPrevLen > 2 && nMatchOffset == (nPrevOffset - 1) && nMatchLen == (nPrevLen - 1) && cur_depth && nCurDepth < LCP_MAX) {
+               nCurDepth++;
+               *cur_depth = nCurDepth;
+            }
+            else {
+               nCurDepth = 0;
+
+               cur_depth = depthptr;
+               matchptr->length = nMatchLen;
+               matchptr->offset = nMatchOffset;
+               *depthptr = 0;
+               matchptr++;
+               depthptr++;
+            }
+
+            nPrevLen = nMatchLen;
+            nPrevOffset = nMatchOffset;
+         }
+      }
+   }
+
+   for (;;) {
+      /* First step along the pos_data[] chain for this interval. Candidates
+       * stored from the chain-walking branches get bit 15 (0x8000) set in their
+       * depth entry; NOTE(review): the meaning of that flag is defined by the
+       * consumers of match_depth, which are not visible here. */
+      if ((super_ref = pos_data[match_pos]) > ref) {
+         match_pos = intervals[super_ref & POS_MASK] & EXCL_VISITED_MASK;
+
+         if (nOffset >= match_pos && (nBlockFlags & 3) == 3) {
+            int nMatchOffset = (int)(nOffset - match_pos);
+            int nMatchLen = (int)(ref >> (LCP_SHIFT + TAG_BITS));
+
+            if ((matchptr - pMatches) < nMaxMatches) {
+               /* Only candidates at least 128 bytes away from the previously
+                * accepted offset are considered. */
+               if (nMatchOffset <= nMaxOffset && abs(nMatchOffset - nPrevOffset) >= 128) {
+                  /* A candidate whose offset and length are both exactly one less
+                   * than the previous one is not stored again: the depth counter
+                   * of the last stored entry is bumped instead. */
+                  if (nPrevOffset && nPrevLen > 2 && nMatchOffset == (nPrevOffset - 1) && nMatchLen == (nPrevLen - 1) && cur_depth && nCurDepth < LCP_MAX) {
+                     nCurDepth++;
+                     *cur_depth = nCurDepth | 0x8000;
+                  }
+                  else {
+                     nCurDepth = 0;
+
+                     cur_depth = depthptr;
+                     matchptr->length = nMatchLen;
+                     matchptr->offset = nMatchOffset;
+                     *depthptr = 0x8000;
+                     matchptr++;
+                     depthptr++;
+                  }
+
+                  nPrevLen = nMatchLen;
+                  nPrevOffset = nMatchOffset;
+               }
+            }
+         }
+      }
+
+      /* Keep following pos_data[] links while they lead to deeper intervals than
+       * ref. Unlike the single step above, this loop requires a strictly
+       * positive offset and applies additional length filters. */
+      while ((super_ref = pos_data[match_pos]) > ref) {
+         match_pos = intervals[super_ref & POS_MASK] & EXCL_VISITED_MASK;
+
+         if (nOffset > match_pos && (nBlockFlags & 3) == 3) {
+            int nMatchOffset = (int)(nOffset - match_pos);
+            int nMatchLen = (int)(ref >> (LCP_SHIFT + TAG_BITS));
+
+            if ((matchptr - pMatches) < nMaxMatches) {
+               /* Length-2 candidates are only taken while at least two free slots
+                * remain; candidates of length 1280 or more are skipped here. */
+               if (nMatchOffset <= nMaxOffset && (nMatchLen >= 3 || (nMatchLen >= 2 && (matchptr - pMatches) < (nMaxMatches - 1))) && nMatchLen < 1280 && abs(nMatchOffset - nPrevOffset) >= 128) {
+                  if (nPrevOffset && nPrevLen > 2 && nMatchOffset == (nPrevOffset - 1) && nMatchLen == (nPrevLen - 1) && cur_depth && nCurDepth < LCP_MAX) {
+                     nCurDepth++;
+                     *cur_depth = nCurDepth | 0x8000;
+                  }
+                  else {
+                     nCurDepth = 0;
+
+                     cur_depth = depthptr;
+                     matchptr->length = nMatchLen;
+                     matchptr->offset = nMatchOffset;
+                     *depthptr = 0x8000;
+                     matchptr++;
+                     depthptr++;
+                  }
+
+                  nPrevLen = nMatchLen;
+                  nPrevOffset = nMatchOffset;
+               }
+            }
+         }
+      }
+
+      /* Relink the interval to the current offset and remember ref at the
+       * matched position; both writes happen regardless of nMaxMatches. */
+      intervals[ref & POS_MASK] = nOffset | VISITED_FLAG;
+      pos_data[match_pos] = (unsigned long long)ref;
+
+      /* Main candidate for this interval; its depth entry is stored without the
+       * 0x8000 bit. */
+      int nMatchOffset = (int)(nOffset - match_pos);
+      int nMatchLen = (int)(ref >> (LCP_SHIFT + TAG_BITS));
+
+      if ((matchptr - pMatches) < nMaxMatches) {
+         if (nMatchOffset <= nMaxOffset && nMatchOffset != nPrevOffset) {
+            if (nPrevOffset && nPrevLen > 2 && nMatchOffset == (nPrevOffset - 1) && nMatchLen == (nPrevLen - 1) && cur_depth && nCurDepth < LCP_MAX) {
+               nCurDepth++;
+               *cur_depth = nCurDepth;
+            }
+            else {
+               nCurDepth = 0;
+
+               cur_depth = depthptr;
+               matchptr->length = nMatchLen;
+               matchptr->offset = nMatchOffset;
+               *depthptr = 0;
+               matchptr++;
+               depthptr++;
+            }
+
+            nPrevLen = nMatchLen;
+            nPrevOffset = nMatchOffset;
+         }
+      }
+
+      /* Report a short match when the offset is in 1..15 and the length is
+       * non-zero; a later interval may overwrite an earlier value. */
+      if (nMatchOffset && nMatchOffset < 16 && nMatchLen)
+         *pMatch1 = nMatchOffset;
+
+      /* super_ref == 0 means there is no further interval to ascend to. */
+      if (super_ref == 0)
+         break;
+      ref = super_ref;
+      match_pos = intervals[ref & POS_MASK] & EXCL_VISITED_MASK;
+
+      /* Extra candidate from the interval just ascended to (flagged 0x8000). */
+      if (nOffset > match_pos && (nBlockFlags & 3) == 3) {
+         int nMatchOffset = (int)(nOffset - match_pos);
+         int nMatchLen = (int)(ref >> (LCP_SHIFT + TAG_BITS));
+
+         if ((matchptr - pMatches) < nMaxMatches) {
+            if (nMatchOffset <= nMaxOffset && nMatchLen >= 2 && abs(nMatchOffset - nPrevOffset) >= 128) {
+               if (nPrevOffset && nPrevLen > 2 && nMatchOffset == (nPrevOffset - 1) && nMatchLen == (nPrevLen - 1) && cur_depth && nCurDepth < LCP_MAX) {
+                  nCurDepth++;
+                  *cur_depth = nCurDepth | 0x8000;
+               }
+               else {
+                  nCurDepth = 0;
+
+                  cur_depth = depthptr;
+                  matchptr->length = nMatchLen;
+                  matchptr->offset = nMatchOffset;
+                  *depthptr = 0x8000;
+                  matchptr++;
+                  depthptr++;
+               }
+
+               nPrevLen = nMatchLen;
+               nPrevOffset = nMatchOffset;
+            }
+         }
+      }
+   }
+
+   /* Number of entries actually written to pMatches (and pMatchDepth). */
+   return (int)(matchptr - pMatches);
+}
+
+/**
+ * Advance the match finder over bytes that were already compressed
+ *
+ * @param pCompressor compression context
+ * @param nStartOffset first offset to skip in the input window (typically 0)
+ * @param nEndOffset offset to stop skipping at, exclusive (typically the number of previously compressed bytes)
+ */
+void apultra_skip_matches(apultra_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
+   apultra_match discarded_match;
+   unsigned short discarded_depth;
+   unsigned char discarded_match1;
+   int nPos = nStartOffset;
+
+   /* The finder updates its interval links lazily as positions are visited, so
+    * each skipped position still has to be scanned. Asking for 0 matches with
+    * no block flags means no match is stored in the throwaway output. */
+   while (nPos < nEndOffset) {
+      apultra_find_matches_at(pCompressor, nPos, &discarded_match, &discarded_depth, &discarded_match1, 0, 0);
+      nPos++;
+   }
+}
+
+/**
+ * Collect the matches for every position of the data to be compressed
+ *
+ * Each position gets exactly nMatchesPerOffset consecutive slots in the
+ * compressor's match and match_depth arrays, plus one match1 byte; slots the
+ * finder did not fill are zeroed.
+ *
+ * @param pCompressor compression context
+ * @param nMatchesPerOffset maximum number of matches to store for each offset
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ */
+void apultra_find_all_matches(apultra_compressor *pCompressor, const int nMatchesPerOffset, const int nStartOffset, const int nEndOffset, const int nBlockFlags) {
+   apultra_match *pSlotMatches = pCompressor->match;
+   unsigned short *pSlotDepths = pCompressor->match_depth;
+   unsigned char *pSlotMatch1 = pCompressor->match1;
+   int nPos;
+
+   for (nPos = nStartOffset; nPos < nEndOffset; nPos++) {
+      int nSlot;
+
+      nSlot = apultra_find_matches_at(pCompressor, nPos, pSlotMatches, pSlotDepths, pSlotMatch1, nMatchesPerOffset, nBlockFlags);
+
+      /* Blank out the slots left unused for this position */
+      for (; nSlot < nMatchesPerOffset; nSlot++) {
+         pSlotMatches[nSlot].length = 0;
+         pSlotMatches[nSlot].offset = 0;
+         pSlotDepths[nSlot] = 0;
+      }
+
+      /* Move on to the next position's group of slots */
+      pSlotMatches += nMatchesPerOffset;
+      pSlotDepths += nMatchesPerOffset;
+      pSlotMatch1++;
+   }
+}
diff --git a/tools/apultra/src/matchfinder.h b/tools/apultra/src/matchfinder.h
new file mode 100644
index 0000000..7d68eaf
--- /dev/null
+++ b/tools/apultra/src/matchfinder.h
@@ -0,0 +1,94 @@
+/*
+ * matchfinder.h - LZ match finder definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke <zxintrospec@gmail.com>
+ *
+ */
+
+#ifndef _MATCHFINDER_H
+#define _MATCHFINDER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Forward declarations */
+typedef struct _apultra_match apultra_match;
+typedef struct _apultra_compressor apultra_compressor;
+
+/**
+ * Parse input data, build suffix array and overlaid data structures to speed up match finding
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nInWindowSize total input size in bytes (previously compressed bytes + bytes to compress)
+ *
+ * @return 0 for success, non-zero for failure
+ */
+int apultra_build_suffix_array(apultra_compressor *pCompressor, const unsigned char *pInWindow, const int nInWindowSize);
+
+/**
+ * Find matches at the specified offset in the input window
+ *
+ * @param pCompressor compression context
+ * @param nOffset offset to find matches at, in the input window
+ * @param pMatches pointer to returned matches
+ * @param pMatchDepth pointer to returned match depths
+ * @param pMatch1 pointer to 1-byte length, 4 bit offset match
+ * @param nMaxMatches maximum number of matches to return (0 for none)
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ *
+ * @return number of matches
+ */
+int apultra_find_matches_at(apultra_compressor *pCompressor, const int nOffset, apultra_match *pMatches, unsigned short *pMatchDepth, unsigned char *pMatch1, const int nMaxMatches, const int nBlockFlags);
+
+/**
+ * Skip previously compressed bytes
+ *
+ * @param pCompressor compression context
+ * @param nStartOffset current offset in input window (typically 0)
+ * @param nEndOffset offset to skip to in input window (typically the number of previously compressed bytes)
+ */
+void apultra_skip_matches(apultra_compressor *pCompressor, const int nStartOffset, const int nEndOffset);
+
+/**
+ * Find all matches for the data to be compressed
+ *
+ * @param pCompressor compression context
+ * @param nMatchesPerOffset maximum number of matches to store for each offset
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ */
+void apultra_find_all_matches(apultra_compressor *pCompressor, const int nMatchesPerOffset, const int nStartOffset, const int nEndOffset, const int nBlockFlags);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _MATCHFINDER_H */
diff --git a/tools/apultra/src/shrink.c b/tools/apultra/src/shrink.c
new file mode 100644
index 0000000..c6c9826
--- /dev/null
+++ b/tools/apultra/src/shrink.c
@@ -0,0 +1,1728 @@
+/*
+ * shrink.c - compressor implementation
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke <zxintrospec@gmail.com>
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "libapultra.h"
+#include "matchfinder.h"
+#include "shrink.h"
+#include "format.h"
+
+/* Command token values and their sizes in bits; the bit pattern of each code
+ * is given in the trailing comment. */
+#define TOKEN_CODE_LARGE_MATCH   2 /* 10 */
+#define TOKEN_SIZE_LARGE_MATCH   2
+
+#define TOKEN_CODE_7BIT_MATCH    6 /* 110 */
+#define TOKEN_SIZE_7BIT_MATCH    3
+
+#define TOKEN_CODE_4BIT_MATCH    7 /* 111 */
+#define TOKEN_SIZE_4BIT_MATCH    3
+
+/* One step of a binary search for the highest set bit: if N has any bit set at
+ * or above position 'bits', shift N right by 'bits' and add 'bits' to a counter.
+ * Caveats: the counter is NOT a macro parameter - a variable named 'n' must be
+ * in scope at the expansion site; N is modified in place and evaluated more
+ * than once; the expansion is a bare 'if' statement without a do/while(0)
+ * wrapper, so it must not be used as the body of an unbraced if/else. */
+#define CountShift(N,bits)  if ((N)>>(bits)) { (N)>>=(bits); (n) += (bits); }
+
+/** Gamma2 bit counts for common values, up to 255 */
+static char _gamma2_size[256] = {
+   0, 0, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+};
+
+/**
+ * Append the low bits of a value to the output (compressed) buffer, most significant bit first
+ *
+ * Bits are packed into a dedicated byte of the output, filled from bit 7 down to
+ * bit 0. That byte is reserved (and zeroed) at the current write index the
+ * first time a bit needs it; *nCurBitsOffset is INT_MIN whenever no partially
+ * filled byte is open.
+ *
+ * @param pOutData pointer to output buffer
+ * @param nOutOffset current write index into output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nValue value to write
+ * @param nBits number of least significant bits to write in value
+ * @param nCurBitsOffset write index into output buffer, of current byte being filled with bits
+ * @param nCurBitShift bit shift count
+ *
+ * @return updated write index into output buffer, or -1 in case of an error
+ */
+static int apultra_write_bits(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, const int nValue, const int nBits, int *nCurBitsOffset, int *nCurBitShift) {
+   int nBitsLeft = nBits;
+
+   /* Propagate an error reported by an earlier write */
+   if (nOutOffset < 0)
+      return -1;
+
+   while (nBitsLeft > 0) {
+      int nBit;
+
+      nBitsLeft--;
+
+      if (*nCurBitsOffset == INT_MIN) {
+         /* No open bit byte: reserve the next output byte for it */
+         if (nOutOffset >= nMaxOutDataSize)
+            return -1;
+
+         *nCurBitsOffset = nOutOffset;
+         *nCurBitShift = 7;
+         pOutData[nOutOffset] = 0;
+         nOutOffset++;
+      }
+
+      nBit = (nValue >> nBitsLeft) & 1;
+      pOutData[*nCurBitsOffset] |= nBit << *nCurBitShift;
+
+      *nCurBitShift -= 1;
+      if (*nCurBitShift == -1) {
+         /* All 8 bits used: the next bit will open a new byte */
+         *nCurBitsOffset = INT_MIN;
+      }
+   }
+
+   return nOutOffset;
+}
+
+/**
+ * Get size of gamma2 encoded value
+ *
+ * Values below 256 are looked up in the precomputed table; for larger values
+ * the result is twice the index of the highest set bit.
+ *
+ * @param nValue value to evaluate (2..n)
+ *
+ * @return number of bits required
+ */
+static int apultra_get_gamma2_size(int nValue) {
+   static const int nShiftSteps[5] = { 16, 8, 4, 2, 1 };
+   unsigned int nTopBit = 0;
+   int nStep;
+
+   if (nValue >= 0 && nValue < 256)
+      return _gamma2_size[nValue];
+
+   /* Binary search for the index of the highest set bit */
+   for (nStep = 0; nStep < 5; nStep++) {
+      if (nValue >> nShiftSteps[nStep]) {
+         nValue >>= nShiftSteps[nStep];
+         nTopBit += nShiftSteps[nStep];
+      }
+   }
+
+   return nTopBit << 1;
+}
+
+/**
+ * Write gamma2 encoded value to output (compressed) buffer
+ *
+ * The most significant set bit of the value is implicit. Every following bit
+ * except the last is written followed by a 1 ("more bits follow"); the least
+ * significant bit is written followed by a 0 (terminator).
+ *
+ * @param pOutData pointer to output buffer
+ * @param nOutOffset current write index into output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nValue value to write (2..n)
+ * @param nCurBitsOffset write index into output buffer, of current byte being filled with bits
+ * @param nCurBitShift bit shift count
+ *
+ * @return updated write index into output buffer, or -1 in case of an error
+ *         (including a value that is zero or negative)
+ */
+static int apultra_write_gamma2_value(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int nValue, int *nCurBitsOffset, int *nCurBitShift) {
+   int msb = 30;
+
+   /* A value without a set bit would make the search below run past bit 0 and
+    * shift by a negative count (undefined behaviour); negative values have no
+    * gamma2 representation either. Report both as errors. */
+   if (nValue <= 0)
+      return -1;
+
+   /* Locate the highest set bit, then step below it: that bit is implicit */
+   while ((nValue >> msb) == 0)
+      msb--;
+   msb--;
+
+   /* Errors from apultra_write_bits need no check between calls: it returns -1
+    * again when handed a negative write index. */
+   while (msb > 0) {
+      int bit = (nValue >> msb) & 1;
+
+      nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, bit, 1, nCurBitsOffset, nCurBitShift);
+      msb--;
+      nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, 1, 1, nCurBitsOffset, nCurBitShift);
+   }
+
+   nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, nValue & 1, 1, nCurBitsOffset, nCurBitShift);
+   nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, 0, 1, nCurBitsOffset, nCurBitShift);
+   return nOutOffset;
+}
+
+/**
+ * Get the number of bits required to encode the command token and offset of a match
+ *
+ * @param nLength match length
+ * @param nMatchOffset match offset
+ * @param nFollowsLiteral non-zero if the match follows a literal, zero if it immediately follows another match
+ *
+ * @return number of extra bits required
+ */
+static inline int apultra_get_offset_varlen_size(const int nLength, const int nMatchOffset, const int nFollowsLiteral) {
+   int nHighPartBias;
+
+   /* Length <= 3 with an offset below 128: 7-bit match token plus one byte */
+   if (nLength <= 3 && nMatchOffset < 128)
+      return 8 + TOKEN_SIZE_7BIT_MATCH;
+
+   /* Large match: token, gamma2-coded high part of the offset, plus one byte.
+    * The high part is biased by 3 after a literal and by 2 after a match. */
+   nHighPartBias = nFollowsLiteral ? 3 : 2;
+
+   return 8 + TOKEN_SIZE_LARGE_MATCH + apultra_get_gamma2_size((nMatchOffset >> 8) + nHighPartBias);
+}
+
+/**
+ * Get the number of extra bits required to represent a match length
+ *
+ * @param nLength match length
+ * @param nMatchOffset match offset
+ *
+ * @return number of extra bits required
+ */
+static inline int apultra_get_match_varlen_size(int nLength, const int nMatchOffset) {
+   int nLengthBias;
+
+   /* Length <= 3 with an offset below 128 carries no separate length field */
+   if (nLength <= 3 && nMatchOffset < 128)
+      return 0;
+
+   /* The amount subtracted from the length before gamma2 coding depends on the
+    * offset range */
+   if (nMatchOffset < 128 || nMatchOffset >= MINMATCH4_OFFSET)
+      nLengthBias = 2;
+   else if (nMatchOffset < MINMATCH3_OFFSET)
+      nLengthBias = 0;
+   else
+      nLengthBias = 1;
+
+   return apultra_get_gamma2_size(nLength - nLengthBias);
+}
+
+/**
+ * Insert forward rep candidate
+ *
+ * For each arrival at position i that follows a literal and carries a rep
+ * offset different from nMatchOffset, checks whether nMatchOffset also matches
+ * at that arrival's rep position and, if so, records it there as a match
+ * candidate (or lengthens an existing candidate with the same offset). Newly
+ * added candidates are propagated recursively, up to a fixed depth.
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param i input data window position whose matches are being considered
+ * @param nMatchOffset match offset to use as rep candidate
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param nArrivalsPerPosition maximum number of arrivals per input buffer position
+ * @param nDepth current insertion depth
+ */
+static void apultra_insert_forward_match(apultra_compressor *pCompressor, const unsigned char *pInWindow, const int i, const int nMatchOffset, const int nStartOffset, const int nEndOffset, const int nArrivalsPerPosition, int nDepth) {
+   const apultra_arrival *arrival = pCompressor->arrival + ((i - nStartOffset) * nArrivalsPerPosition);
+   /* The intervals buffer is reinterpreted as an int array indexed by window
+    * position. NOTE(review): presumably per-position run lengths filled in
+    * elsewhere - confirm against the code that writes it. */
+   const int *rle_len = (int*)pCompressor->intervals /* reuse */;
+   /* The pos_data buffer is reinterpreted as an int array; the pointer is
+    * biased by -nStartOffset so it can be indexed with absolute window
+    * positions in [nStartOffset, nEndOffset). */
+   int* visited = ((int*)pCompressor->pos_data) - nStartOffset /* reuse */;
+   int j;
+
+   /* Walk the arrivals for position i; a zero from_slot marks the first unused slot */
+   for (j = 0; j < nArrivalsPerPosition && arrival[j].from_slot; j++) {
+      if (arrival[j].follows_literal) {
+         int nRepOffset = arrival[j].rep_offset;
+
+         if (nMatchOffset != nRepOffset && nRepOffset) {
+            int nRepPos = arrival[j].rep_pos;
+
+            /* Skip positions outside the block, and positions where this same
+             * offset was the last one tried */
+            if (nRepPos >= nStartOffset &&
+               nRepPos < nEndOffset &&
+               visited[nRepPos] != nMatchOffset) {
+
+               visited[nRepPos] = nMatchOffset;
+
+               /* Require the source to lie inside the window, and the last match
+                * slot of nRepPos to still be empty so there is room to insert */
+               if (nRepPos >= nMatchOffset && pCompressor->match[((nRepPos - nStartOffset) << MATCHES_PER_INDEX_SHIFT) + NMATCHES_PER_INDEX - 1].length == 0) {
+                  const unsigned char* pInWindowAtRepOffset = pInWindow + nRepPos;
+
+                  if (pInWindowAtRepOffset[0] == pInWindowAtRepOffset[-nMatchOffset]) {
+                     /* The first nMinLen bytes are not compared below. NOTE(review):
+                      * this assumes rle_len guarantees they are equal - confirm. */
+                     int nLen0 = rle_len[nRepPos - nMatchOffset];
+                     int nLen1 = rle_len[nRepPos];
+                     int nMinLen = (nLen0 < nLen1) ? nLen0 : nLen1;
+
+                     /* Cap the match at the end of the data and at LCP_MAX */
+                     int nMaxRepLen = nEndOffset - nRepPos;
+                     if (nMaxRepLen > LCP_MAX)
+                        nMaxRepLen = LCP_MAX;
+
+                     if (nMinLen > nMaxRepLen)
+                        nMinLen = nMaxRepLen;
+
+                     const unsigned char* pInWindowMax = pInWindowAtRepOffset + nMaxRepLen;
+                     pInWindowAtRepOffset += nMinLen;
+
+                     /* Extend the match 8 bytes, then 4 bytes, then 1 byte at a time */
+                     while ((pInWindowAtRepOffset + 8) < pInWindowMax && !memcmp(pInWindowAtRepOffset, pInWindowAtRepOffset - nMatchOffset, 8))
+                        pInWindowAtRepOffset += 8;
+                     while ((pInWindowAtRepOffset + 4) < pInWindowMax && !memcmp(pInWindowAtRepOffset, pInWindowAtRepOffset - nMatchOffset, 4))
+                        pInWindowAtRepOffset += 4;
+                     while (pInWindowAtRepOffset < pInWindowMax && pInWindowAtRepOffset[0] == pInWindowAtRepOffset[-nMatchOffset])
+                        pInWindowAtRepOffset++;
+
+                     int nCurRepLen = (int)(pInWindowAtRepOffset - (pInWindow + nRepPos));
+
+                     if (nCurRepLen >= 2) {
+                        apultra_match* fwd_match = pCompressor->match + ((nRepPos - nStartOffset) << MATCHES_PER_INDEX_SHIFT);
+                        unsigned short* fwd_depth = pCompressor->match_depth + ((nRepPos - nStartOffset) << MATCHES_PER_INDEX_SHIFT);
+                        int r;
+
+                        /* Scan the occupied slots. The loop has no explicit bound on r:
+                         * it stops at the first slot whose length is below
+                         * MIN_MATCH_SIZE, and the last slot was checked to be empty above. */
+                        for (r = 0; fwd_match[r].length >= MIN_MATCH_SIZE; r++) {
+                           if (fwd_match[r].offset == nMatchOffset && (fwd_depth[r] & 0x3fff) == 0) {
+                              /* Same offset already present with a zero depth count:
+                               * lengthen it if the new match is longer */
+                              if ((int)fwd_match[r].length < nCurRepLen) {
+                                 fwd_match[r].length = nCurRepLen;
+                                 fwd_depth[r] = 0;
+                              }
+                              /* Sentinel value: suppresses the insertion below */
+                              r = NMATCHES_PER_INDEX;
+                              break;
+                           }
+                        }
+
+                        if (r < NMATCHES_PER_INDEX) {
+                           /* Not present yet: store it in the first free slot */
+                           fwd_match[r].offset = nMatchOffset;
+                           fwd_match[r].length = nCurRepLen;
+                           fwd_depth[r] = 0;
+
+                           /* Propagate from the position just updated; the recursion
+                            * stops once nDepth reaches 9 */
+                           if (nDepth < 9)
+                              apultra_insert_forward_match(pCompressor, pInWindow, nRepPos, nMatchOffset, nStartOffset, nEndOffset, nArrivalsPerPosition, nDepth + 1);
+                        }
+                     }
+                  }
+               }
+            }
+         }
+      }
+   }
+}
+
+/**
+ * Attempt to pick optimal matches, so as to produce the smallest possible output that decompresses to the same input
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param nInsertForwardReps non-zero to insert forward repmatch candidates, zero to use the previously inserted candidates
+ * @param nCurRepMatchOffset starting rep offset for this block
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ * @param nArrivalsPerPosition maximum number of arrivals per input buffer position
+ */
+static void apultra_optimize_forward(apultra_compressor *pCompressor, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset, const int nInsertForwardReps, const int *nCurRepMatchOffset, const int nBlockFlags, const int nArrivalsPerPosition) {
+   apultra_arrival *arrival = pCompressor->arrival - (nStartOffset * nArrivalsPerPosition);
+   const int* rle_len = (int*)pCompressor->intervals /* reuse */;
+   int* visited = ((int*)pCompressor->pos_data) - nStartOffset /* reuse */;
+   int i, j, n;
+
+   if ((nEndOffset - nStartOffset) > pCompressor->block_size) return;
+
+   memset(arrival + (nStartOffset * nArrivalsPerPosition), 0, sizeof(apultra_arrival) * ((nEndOffset - nStartOffset + 1) * nArrivalsPerPosition));
+
+   arrival[nStartOffset * nArrivalsPerPosition].from_slot = -1;
+   arrival[nStartOffset * nArrivalsPerPosition].rep_offset = *nCurRepMatchOffset;
+
+   for (i = (nStartOffset * nArrivalsPerPosition); i != ((nEndOffset+1) * nArrivalsPerPosition); i++) {
+      arrival[i].cost = 0x40000000;
+   }
+
+   if (nInsertForwardReps) {
+      memset(visited + nStartOffset, 0, (nEndOffset - nStartOffset) * sizeof(int));
+   }
+
+   for (i = nStartOffset; i != nEndOffset; i++) {
+      apultra_arrival *cur_arrival = &arrival[i * nArrivalsPerPosition];
+      int m;
+      
+      const unsigned char nMatch1Offs = pCompressor->match1[i - nStartOffset];
+      int nShortOffset;
+      int nShortLen;
+      int nLiteralScore;
+      int nLiteralCost;
+
+      if ((pInWindow[i] != 0 && nMatch1Offs == 0) || (i == nStartOffset && (nBlockFlags & 1))) {
+         nShortOffset = 0;
+         nShortLen = 0;
+         nLiteralCost = 9 /* literal bit + literal byte */;
+      }
+      else {
+         nShortOffset = (pInWindow[i] == 0) ? 0 : nMatch1Offs;
+         nShortLen = 1;
+         nLiteralCost = 4 + TOKEN_SIZE_4BIT_MATCH /* command and offset cost; no length cost */;
+      }
+
+      nLiteralScore = nShortOffset ? 3 : 1;
+
+      if (cur_arrival[nArrivalsPerPosition].from_slot) {
+         for (j = 0; j < nArrivalsPerPosition && cur_arrival[j].from_slot; j++) {
+            int nPrevCost = cur_arrival[j].cost & 0x3fffffff;
+            int nCodingChoiceCost = nPrevCost + nLiteralCost;
+            int nScore = cur_arrival[j].score + nLiteralScore;
+
+            apultra_arrival* pDestSlots = &cur_arrival[nArrivalsPerPosition];
+            if (nCodingChoiceCost < pDestSlots[nArrivalsPerPosition - 1].cost ||
+               (nCodingChoiceCost == pDestSlots[nArrivalsPerPosition - 1].cost && nScore < pDestSlots[nArrivalsPerPosition - 1].score)) {
+               int nRepOffset = cur_arrival[j].rep_offset;
+               int exists = 0;
+
+               for (n = 0;
+                  n < nArrivalsPerPosition && pDestSlots[n].cost < nCodingChoiceCost;
+                  n++) {
+                  if (pDestSlots[n].rep_offset == nRepOffset) {
+                     exists = 1;
+                     break;
+                  }
+               }
+
+               if (!exists) {
+                  for (;
+                     n < nArrivalsPerPosition && pDestSlots[n].cost == nCodingChoiceCost && nScore >= pDestSlots[n].score;
+                     n++) {
+                     if (pDestSlots[n].rep_offset == nRepOffset) {
+                        exists = 1;
+                        break;
+                     }
+                  }
+
+                  if (!exists) {
+                     if (n < nArrivalsPerPosition) {
+                        int nn;
+
+                        for (nn = n;
+                           nn < nArrivalsPerPosition && pDestSlots[nn].cost == nCodingChoiceCost;
+                           nn++) {
+                           if (pDestSlots[nn].rep_offset == nRepOffset) {
+                              exists = 1;
+                              break;
+                           }
+                        }
+
+                        if (!exists) {
+                           int z;
+
+                           for (z = n; z < nArrivalsPerPosition - 1 && pDestSlots[z].from_slot; z++) {
+                              if (pDestSlots[z].rep_offset == nRepOffset)
+                                 break;
+                           }
+
+                           apultra_arrival* pDestArrival = &pDestSlots[n];
+                           memmove(&pDestSlots[n + 1],
+                              &pDestSlots[n],
+                              sizeof(apultra_arrival) * (z - n));
+
+                           pDestArrival->cost = nCodingChoiceCost;
+                           pDestArrival->from_pos = i;
+                           pDestArrival->from_slot = j + 1;
+                           pDestArrival->follows_literal = 1;
+                           pDestArrival->rep_offset = nRepOffset;
+                           pDestArrival->short_offset = nShortOffset;
+                           pDestArrival->rep_pos = cur_arrival[j].rep_pos;
+                           pDestArrival->match_len = nShortLen;
+                           pDestArrival->score = nScore;
+                        }
+                     }
+                  }
+               }
+            }
+         }
+      }
+      else {
+         for (j = 0; j < nArrivalsPerPosition && cur_arrival[j].from_slot; j++) {
+            int nPrevCost = cur_arrival[j].cost & 0x3fffffff;
+            int nCodingChoiceCost = nPrevCost + nLiteralCost;
+            int nScore = cur_arrival[j].score + nLiteralScore;
+
+            apultra_arrival* pDestArrival = &cur_arrival[nArrivalsPerPosition + j];
+
+            pDestArrival->cost = nCodingChoiceCost;
+            pDestArrival->from_pos = i;
+            pDestArrival->from_slot = j + 1;
+            pDestArrival->follows_literal = 1;
+            pDestArrival->rep_offset = cur_arrival[j].rep_offset;
+            pDestArrival->short_offset = nShortOffset;
+            pDestArrival->rep_pos = cur_arrival[j].rep_pos;
+            pDestArrival->match_len = nShortLen;
+            pDestArrival->score = nScore;
+         }
+      }
+
+      if (i == nStartOffset && (nBlockFlags & 1)) continue;
+
+      const apultra_match *match = pCompressor->match + ((i - nStartOffset) << MATCHES_PER_INDEX_SHIFT);
+      const unsigned short *match_depth = pCompressor->match_depth + ((i - nStartOffset) << MATCHES_PER_INDEX_SHIFT);
+      int nNumArrivalsForThisPos = j, nOverallMinRepLen = 0, nOverallMaxRepLen = 0;
+
+      int nRepLenForArrival[NARRIVALS_PER_POSITION_MAX];
+      memset(nRepLenForArrival, 0, nArrivalsPerPosition * sizeof(int));
+
+      int nRepMatchArrivalIdx[NARRIVALS_PER_POSITION_MAX + 1];
+      int nNumRepMatchArrivals = 0;
+
+      int nMaxRepLenForPos = nEndOffset - i;
+      if (nMaxRepLenForPos > LCP_MAX)
+         nMaxRepLenForPos = LCP_MAX;
+      const unsigned char* pInWindowStart = pInWindow + i;
+      const unsigned char* pInWindowMax = pInWindowStart + nMaxRepLenForPos;
+      const int nLen1 = rle_len[i];
+
+      for (j = 0; j < nNumArrivalsForThisPos && (i + 2) <= nEndOffset; j++) {
+         if (cur_arrival[j].follows_literal) {
+            int nRepOffset = cur_arrival[j].rep_offset;
+
+            if (nRepOffset && i >= nRepOffset) {
+               if (pInWindowStart[0] == pInWindowStart[-nRepOffset]) {
+                  int nLen0 = rle_len[i - nRepOffset];
+                  int nMinLen = (nLen0 < nLen1) ? nLen0 : nLen1;
+
+                  if (nMinLen > nMaxRepLenForPos)
+                     nMinLen = nMaxRepLenForPos;
+
+                  const unsigned char* pInWindowAtRepOffset = pInWindowStart + nMinLen;
+                  while ((pInWindowAtRepOffset + 8) < pInWindowMax && !memcmp(pInWindowAtRepOffset, pInWindowAtRepOffset - nRepOffset, 8))
+                     pInWindowAtRepOffset += 8;
+                  while ((pInWindowAtRepOffset + 4) < pInWindowMax && !memcmp(pInWindowAtRepOffset, pInWindowAtRepOffset - nRepOffset, 4))
+                     pInWindowAtRepOffset += 4;
+                  while (pInWindowAtRepOffset < pInWindowMax && pInWindowAtRepOffset[0] == pInWindowAtRepOffset[-nRepOffset])
+                     pInWindowAtRepOffset++;
+
+                  int nCurMaxLen = (int)(pInWindowAtRepOffset - pInWindowStart);
+
+                  if (nCurMaxLen >= 2) {
+                     nRepLenForArrival[j] = nCurMaxLen;
+                     nRepMatchArrivalIdx[nNumRepMatchArrivals++] = j;
+
+                     if (nOverallMaxRepLen < nCurMaxLen)
+                        nOverallMaxRepLen = nCurMaxLen;
+                  }
+               }
+            }
+         }
+      }
+      nRepMatchArrivalIdx[nNumRepMatchArrivals] = -1;
+
+      for (m = 0; m < NMATCHES_PER_INDEX && match[m].length; m++) {
+         const int nOrigMatchLen = match[m].length;
+         const int nOrigMatchOffset = match[m].offset;
+         const unsigned int nOrigMatchDepth = match_depth[m] & 0x3fff;
+         const int nScorePenalty = 3 + ((match_depth[m] & 0x8000) >> 15);
+         unsigned int d;
+
+         for (d = 0; d <= nOrigMatchDepth; d += (nOrigMatchDepth ? nOrigMatchDepth : 1)) {
+            const int nMatchOffset = nOrigMatchOffset - d;
+            int nMatchLen = nOrigMatchLen - d;
+
+            if ((i + nMatchLen) > nEndOffset)
+               nMatchLen = nEndOffset - i;
+
+            if (nInsertForwardReps) {
+               apultra_insert_forward_match(pCompressor, pInWindow, i, nMatchOffset, nStartOffset, nEndOffset, nArrivalsPerPosition, 0);
+            }
+
+            if (nMatchLen >= 2) {
+               int nStartingMatchLen, nJumpMatchLen, k;
+               int nNoRepMatchOffsetCostForLit[2], nNoRepMatchOffsetCostDelta;
+               int nMinMatchLenForOffset;
+               int nNoRepCostAdjusment = (nMatchLen >= LCP_MAX) ? 1 : 0;
+
+               if (nMatchOffset < MINMATCH3_OFFSET)
+                  nMinMatchLenForOffset = 2;
+               else {
+                  if (nMatchOffset < MINMATCH4_OFFSET)
+                     nMinMatchLenForOffset = 3;
+                  else
+                     nMinMatchLenForOffset = 4;
+               }
+
+               if (nMatchLen >= LEAVE_ALONE_MATCH_SIZE && i >= nMatchLen)
+                  nStartingMatchLen = nMatchLen;
+               else
+                  nStartingMatchLen = 2;
+
+               if ((nBlockFlags & 3) == 3 && nMatchLen > 90 && i >= 90)
+                  nJumpMatchLen = 90;
+               else
+                  nJumpMatchLen = nMatchLen + 1;
+
+               if (nStartingMatchLen <= 3 && nMatchOffset < 128) {
+                  nNoRepMatchOffsetCostForLit[0] = 8 + TOKEN_SIZE_7BIT_MATCH;
+                  nNoRepMatchOffsetCostForLit[1] = 8 + TOKEN_SIZE_7BIT_MATCH;
+               }
+               else {
+                  nNoRepMatchOffsetCostForLit[0] = 8 + TOKEN_SIZE_LARGE_MATCH + apultra_get_gamma2_size((nMatchOffset >> 8) + 2);
+                  nNoRepMatchOffsetCostForLit[1] = 8 + TOKEN_SIZE_LARGE_MATCH + apultra_get_gamma2_size((nMatchOffset >> 8) + 3);
+               }
+               nNoRepMatchOffsetCostDelta = nNoRepMatchOffsetCostForLit[1] - nNoRepMatchOffsetCostForLit[0];
+
+               for (k = nStartingMatchLen; k <= nMatchLen; k++) {
+                  int nRepMatchMatchLenCost = apultra_get_gamma2_size(k);
+                  apultra_arrival *pDestSlots = &cur_arrival[k * nArrivalsPerPosition];
+ 
+                  /* Insert non-repmatch candidate */
+
+                  if (k >= nMinMatchLenForOffset) {
+                     int nNoRepMatchMatchLenCost;
+
+                     if (k <= 3 && nMatchOffset < 128)
+                        nNoRepMatchMatchLenCost = 0;
+                     else {
+                        if (nMatchOffset < 128 || nMatchOffset >= MINMATCH4_OFFSET)
+                           nNoRepMatchMatchLenCost = apultra_get_gamma2_size(k - 2);
+                        else if (nMatchOffset < MINMATCH3_OFFSET)
+                           nNoRepMatchMatchLenCost = nRepMatchMatchLenCost;
+                        else
+                           nNoRepMatchMatchLenCost = apultra_get_gamma2_size(k - 1);
+                     }
+
+                     for (j = 0; j < nNumArrivalsForThisPos; j++) {
+                        if (nMatchOffset != cur_arrival[j].rep_offset || cur_arrival[j].follows_literal == 0) {
+                           int nPrevCost = cur_arrival[j].cost & 0x3fffffff;
+                           int nMatchCmdCost = nNoRepMatchMatchLenCost + nNoRepMatchOffsetCostForLit[cur_arrival[j].follows_literal];
+                           int nCodingChoiceCost = nPrevCost + nMatchCmdCost;
+
+                           if (nCodingChoiceCost <= (pDestSlots[nArrivalsPerPosition - 1].cost + 1)) {
+                              int nScore = cur_arrival[j].score + nScorePenalty;
+
+                              if (nCodingChoiceCost < pDestSlots[nArrivalsPerPosition - 2].cost ||
+                                 (nCodingChoiceCost == pDestSlots[nArrivalsPerPosition - 2].cost && nScore < pDestSlots[nArrivalsPerPosition - 2].score)) {
+                                 int exists = 0;
+
+                                 for (n = 0;
+                                    n < nArrivalsPerPosition && pDestSlots[n].cost < nCodingChoiceCost;
+                                    n++) {
+                                    if (pDestSlots[n].rep_offset == nMatchOffset) {
+                                       exists = 1;
+                                       break;
+                                    }
+                                 }
+
+                                 if (!exists) {
+                                    int nRevisedCodingChoiceCost = nCodingChoiceCost - nNoRepCostAdjusment;
+
+                                    for (;
+                                       n < nArrivalsPerPosition - 1 && pDestSlots[n].cost == nRevisedCodingChoiceCost && nScore >= pDestSlots[n].score;
+                                       n++) {
+                                       if (pDestSlots[n].rep_offset == nMatchOffset) {
+                                          exists = 1;
+                                          break;
+                                       }
+                                    }
+
+                                    if (!exists) {
+                                       if (n < nArrivalsPerPosition - 1) {
+                                          int nn;
+
+                                          for (nn = n;
+                                             nn < nArrivalsPerPosition && pDestSlots[nn].cost == nCodingChoiceCost;
+                                             nn++) {
+                                             if (pDestSlots[nn].rep_offset == nMatchOffset) {
+                                                exists = 1;
+                                                break;
+                                             }
+                                          }
+
+                                          if (!exists) {
+                                             int z;
+
+                                             for (z = n; z < nArrivalsPerPosition - 1 && pDestSlots[z].from_slot; z++) {
+                                                if (pDestSlots[z].rep_offset == nMatchOffset)
+                                                   break;
+                                             }
+
+                                             apultra_arrival* pDestArrival = &pDestSlots[n];
+                                             memmove(&pDestSlots[n + 1],
+                                                &pDestSlots[n],
+                                                sizeof(apultra_arrival) * (z - n));
+
+                                             pDestArrival->cost = nRevisedCodingChoiceCost;
+                                             pDestArrival->from_pos = i;
+                                             pDestArrival->from_slot = j + 1;
+                                             pDestArrival->follows_literal = 0;
+                                             pDestArrival->rep_offset = nMatchOffset;
+                                             pDestArrival->short_offset = 0;
+                                             pDestArrival->rep_pos = i;
+                                             pDestArrival->match_len = k;
+                                             pDestArrival->score = nScore;
+                                          }
+                                       }
+                                    }
+                                 }
+                                 else {
+                                    if ((nCodingChoiceCost - pDestSlots[n].cost) >= nNoRepMatchOffsetCostDelta)
+                                       break;
+                                 }
+                              }
+                              if (cur_arrival[j].follows_literal == 0 || nNoRepMatchOffsetCostDelta == 0)
+                                 break;
+                           }
+                           else {
+                              break;
+                           }
+                        }
+                     }
+                  }
+
+                  /* Insert repmatch candidate */
+
+                  if (k > nOverallMinRepLen && k <= nOverallMaxRepLen) {
+                     int nRepMatchCmdCost = TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */ + nRepMatchMatchLenCost;
+                     int nCurRepMatchArrival;
+
+                     if (k <= 90)
+                        nOverallMinRepLen = k;
+                     else if (nOverallMaxRepLen == k)
+                        nOverallMaxRepLen--;
+                     
+                     for (nCurRepMatchArrival = 0; (j = nRepMatchArrivalIdx[nCurRepMatchArrival]) >= 0; nCurRepMatchArrival++) {
+                        if (nRepLenForArrival[j] >= k) {
+                           int nPrevCost = cur_arrival[j].cost & 0x3fffffff;
+                           int nRepCodingChoiceCost = nPrevCost + nRepMatchCmdCost;
+                           int nScore = cur_arrival[j].score + 2;
+
+                           if (nRepCodingChoiceCost < pDestSlots[nArrivalsPerPosition - 1].cost ||
+                              (nRepCodingChoiceCost == pDestSlots[nArrivalsPerPosition - 1].cost && nScore < pDestSlots[nArrivalsPerPosition - 1].score)) {
+                              int nRepOffset = cur_arrival[j].rep_offset;
+                              int exists = 0;
+
+                              for (n = 0;
+                                 n < nArrivalsPerPosition && pDestSlots[n].cost < nRepCodingChoiceCost;
+                                 n++) {
+                                 if (pDestSlots[n].rep_offset == nRepOffset) {
+                                    exists = 1;
+                                    break;
+                                 }
+                              }
+
+                              if (!exists) {
+                                 for (;
+                                    n < nArrivalsPerPosition && pDestSlots[n].cost == nRepCodingChoiceCost && nScore >= pDestSlots[n].score;
+                                    n++) {
+                                    if (pDestSlots[n].rep_offset == nRepOffset) {
+                                       exists = 1;
+                                       break;
+                                    }
+                                 }
+
+                                 if (!exists) {
+                                    if (n < nArrivalsPerPosition) {
+                                       int nn;
+
+                                       for (nn = n;
+                                          nn < nArrivalsPerPosition && pDestSlots[nn].cost == nRepCodingChoiceCost;
+                                          nn++) {
+                                          if (pDestSlots[nn].rep_offset == nRepOffset) {
+                                             exists = 1;
+                                             break;
+                                          }
+                                       }
+
+                                       if (!exists) {
+                                          int z;
+
+                                          for (z = n; z < nArrivalsPerPosition - 1 && pDestSlots[z].from_slot; z++) {
+                                             if (pDestSlots[z].rep_offset == nRepOffset)
+                                                break;
+                                          }
+
+                                          apultra_arrival* pDestArrival = &pDestSlots[n];
+                                          memmove(&pDestSlots[n + 1],
+                                             &pDestSlots[n],
+                                             sizeof(apultra_arrival) * (z - n));
+
+                                          pDestArrival->cost = nRepCodingChoiceCost;
+                                          pDestArrival->from_pos = i;
+                                          pDestArrival->from_slot = j + 1;
+                                          pDestArrival->follows_literal = 0;
+                                          pDestArrival->rep_offset = nRepOffset;
+                                          pDestArrival->short_offset = 0;
+                                          pDestArrival->rep_pos = i;
+                                          pDestArrival->match_len = k;
+                                          pDestArrival->score = nScore;
+                                       }
+                                    }
+                                 }
+                              }
+                           }
+                           else {
+                              break;
+                           }
+                        }
+                     }
+                  }
+
+                  if (k == 3 && nMatchOffset < 128) {
+                     nNoRepMatchOffsetCostForLit[0] = 8 + TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size((nMatchOffset >> 8) + 2) */;
+                     nNoRepMatchOffsetCostForLit[1] = 8 + TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size((nMatchOffset >> 8) + 3) */;
+                  }
+
+                  if (k == nJumpMatchLen)
+                     k = nMatchLen - 1;
+               }
+            }
+
+            if (nOrigMatchLen >= 512)
+               break;
+         }
+      }
+   }
+   
+   if (!nInsertForwardReps) {
+      const apultra_arrival* end_arrival = &arrival[(i * nArrivalsPerPosition) + 0];
+      apultra_final_match* pBestMatch = pCompressor->best_match - nStartOffset;
+
+      while (end_arrival->from_slot > 0 && end_arrival->from_pos >= 0 && (int)end_arrival->from_pos < nEndOffset) {
+         pBestMatch[end_arrival->from_pos].length = end_arrival->match_len;
+         if (end_arrival->match_len >= 2)
+            pBestMatch[end_arrival->from_pos].offset = end_arrival->rep_offset;
+         else
+            pBestMatch[end_arrival->from_pos].offset = end_arrival->short_offset;
+
+         end_arrival = &arrival[(end_arrival->from_pos * nArrivalsPerPosition) + (end_arrival->from_slot - 1)];
+      }
+   }
+}
+
+/**
+ * Attempt to replace matches by literals when it makes the final bitstream smaller, and merge large matches
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param pBestMatch optimal matches to evaluate and update
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param nCurRepMatchOffset pointer to the starting rep offset for this block (only read, never written back)
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise (only bit 0 is used here)
+ *
+ * @return non-zero if at least one command was changed, 0 if nothing was changed
+ */
+static int apultra_reduce_commands(apultra_compressor *pCompressor, const unsigned char *pInWindow, apultra_final_match *pBestMatch, const int nStartOffset, const int nEndOffset, const int *nCurRepMatchOffset, const int nBlockFlags) {
+   int i; /* absolute position in the input window of the command being examined */
+   int nRepMatchOffset = *nCurRepMatchOffset; /* offset of the last match walked over; seeded from the caller's value, which is never written back */
+   int nFollowsLiteral = 0; /* 1 when the command just before position i was a literal or a 1-byte match */
+   int nDidReduce = 0; /* set to 1 by every rewrite below; returned to the caller */
+   int nLastMatchLen = 0; /* length of the last match walked over, 0 after a literal or a 1-byte match */
+   const unsigned char *match1 = pCompressor->match1 - nStartOffset; /* rebased so that it can be indexed with absolute window positions */
+
+   for (i = nStartOffset + ((nBlockFlags & 1) ? 1 : 0); i < nEndOffset; ) { /* starts one byte in when bit 0 of nBlockFlags is set; i only advances at the bottom of the body, every rewrite uses 'continue' to re-examine position i */
+      apultra_final_match *pMatch = pBestMatch + i;
+
+      if (pMatch->length <= 1 && /* position i holds a literal or a 1-byte match... */
+         (i + 1) < nEndOffset &&
+         pBestMatch[i + 1].length >= 2 && /* ...directly followed by a match of 2+ bytes... */
+         pBestMatch[i + 1].length < MAX_VARLEN &&
+         pBestMatch[i + 1].offset &&
+         i >= pBestMatch[i + 1].offset &&
+         (i + pBestMatch[i + 1].length + 1) <= nEndOffset &&
+         !memcmp(pInWindow + i - (pBestMatch[i + 1].offset), pInWindow + i, pBestMatch[i + 1].length + 1)) { /* ...whose offset also reproduces the byte at i, so the match could start one byte earlier */
+         if ((pBestMatch[i + 1].offset < MINMATCH3_OFFSET || (pBestMatch[i + 1].length + 1) >= 3 || (pBestMatch[i + 1].offset == nRepMatchOffset && nFollowsLiteral)) &&
+            (pBestMatch[i + 1].offset < MINMATCH4_OFFSET || (pBestMatch[i + 1].length + 1) >= 4 || (pBestMatch[i + 1].offset == nRepMatchOffset && nFollowsLiteral))) { /* the extended length must be long enough for this offset range, unless the match reuses the rep offset after a literal */
+
+            int nCurPartialCommandSize = (pMatch->length == 1) ? (TOKEN_SIZE_4BIT_MATCH + 4) : (1 /* literal bit */ + 8 /* literal size */); /* bits spent today on the command at i... */
+            if (pBestMatch[i + 1].offset == nRepMatchOffset /* always follows a literal, the one at the current position */) {
+               nCurPartialCommandSize += TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */ + apultra_get_gamma2_size(pBestMatch[i + 1].length); /* ...plus the following match coded as a rep-match */
+            }
+            else {
+               nCurPartialCommandSize += apultra_get_offset_varlen_size(pBestMatch[i + 1].length, pBestMatch[i + 1].offset, 1) + apultra_get_match_varlen_size(pBestMatch[i + 1].length, pBestMatch[i + 1].offset); /* ...plus the following match coded with an explicit offset */
+            }
+
+            int nReducedPartialCommandSize; /* bits for the match when emitted at i instead; NOTE(review): sized with the un-extended length, not length + 1 - confirm this is intended */
+            if (pBestMatch[i + 1].offset == nRepMatchOffset && nFollowsLiteral) {
+               nReducedPartialCommandSize = TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */ + apultra_get_gamma2_size(pBestMatch[i + 1].length);
+            }
+            else {
+               nReducedPartialCommandSize = apultra_get_offset_varlen_size(pBestMatch[i + 1].length, pBestMatch[i + 1].offset, nFollowsLiteral) + apultra_get_match_varlen_size(pBestMatch[i + 1].length, pBestMatch[i + 1].offset);
+            }
+
+            if (nReducedPartialCommandSize < nCurPartialCommandSize || (nFollowsLiteral == 0 && nLastMatchLen >= LCP_MAX)) { /* merge when cheaper, or unconditionally right after a match of at least LCP_MAX bytes */
+               /* Merge: the match now starts at i and is one byte longer; the entry at i + 1 is cleared */
+               pBestMatch[i].length = pBestMatch[i + 1].length + 1;
+               pBestMatch[i].offset = pBestMatch[i + 1].offset;
+               pBestMatch[i + 1].length = 0;
+               pBestMatch[i + 1].offset = 0;
+               nDidReduce = 1;
+               continue;
+            }
+         }
+      }
+
+      if (pMatch->length >= 2) {
+         if (pMatch->length < 32 && /* Don't waste time considering large matches, they will always win over literals */
+             (i + pMatch->length) < nEndOffset /* Don't consider the last match in the block, we can only reduce a match inbetween other tokens */) {
+            int nNextIndex = i + pMatch->length; /* advanced below to the position of the next match of 2+ bytes, if there is one */
+            int nNextFollowsLiteral = 0; /* 1 if literals or 1-byte matches sit between this match and the next one */
+            int nCannotEncode = 0;
+
+            while (nNextIndex < nEndOffset && pBestMatch[nNextIndex].length < 2) { /* skip over literals and 1-byte matches */
+               nNextIndex++;
+               nNextFollowsLiteral = 1;
+            }
+
+            if (nNextIndex < nEndOffset && pBestMatch[nNextIndex].length >= 2) {
+               if (nRepMatchOffset && nRepMatchOffset != pMatch->offset && pBestMatch[nNextIndex].offset && pMatch->offset != pBestMatch[nNextIndex].offset &&
+                  nNextFollowsLiteral) {
+                  /* Try to gain a match forward: re-encode this match with the next match's offset, so that the next match can be coded as a rep-match */
+                  if (i >= pBestMatch[nNextIndex].offset && (i - pBestMatch[nNextIndex].offset + pMatch->length) <= nEndOffset) { /* the source range at the next match's offset must lie inside the window */
+                     if ((pBestMatch[nNextIndex].offset < MINMATCH3_OFFSET || pMatch->length >= 3) &&
+                        (pBestMatch[nNextIndex].offset < MINMATCH4_OFFSET || pMatch->length >= 4)) {
+                        int nMaxLen = 0;
+                        const unsigned char* pInWindowAtPos = pInWindow + i;
+                        while (nMaxLen < pMatch->length && pInWindowAtPos[nMaxLen - pBestMatch[nNextIndex].offset] == pInWindowAtPos[nMaxLen]) /* count how many leading bytes of this match are also found at the next match's offset */
+                           nMaxLen++;
+
+                        if (nMaxLen >= pMatch->length) {
+                           /* Replace: the whole match is also available at the next match's offset, so only the offset is switched */
+                           pMatch->offset = pBestMatch[nNextIndex].offset;
+                           nDidReduce = 1;
+                        }
+                        else if (nMaxLen >= 2) {
+                           if ((nFollowsLiteral && nRepMatchOffset == pBestMatch[nNextIndex].offset) ||
+                              ((pBestMatch[nNextIndex].offset < MINMATCH3_OFFSET || nMaxLen >= 3) &&
+                               (pBestMatch[nNextIndex].offset < MINMATCH4_OFFSET || nMaxLen >= 4))) {
+
+                              int nPartialSizeBefore, nPartialSizeAfter, j;
+
+                              nPartialSizeBefore = apultra_get_offset_varlen_size(pMatch->length, pMatch->offset, nFollowsLiteral); /* before: this match as it stands... */
+                              nPartialSizeBefore += apultra_get_match_varlen_size(pMatch->length, pMatch->offset);
+
+                              nPartialSizeBefore += apultra_get_offset_varlen_size(pBestMatch[nNextIndex].length, pBestMatch[nNextIndex].offset, 1); /* ...plus the next match coded with an explicit offset after literals */
+                              nPartialSizeBefore += apultra_get_match_varlen_size(pBestMatch[nNextIndex].length, pBestMatch[nNextIndex].offset);
+
+                              nPartialSizeAfter = apultra_get_offset_varlen_size(nMaxLen, pBestMatch[nNextIndex].offset, nFollowsLiteral); /* after: this match shortened to nMaxLen and using the next match's offset... */
+                              if (nFollowsLiteral && nRepMatchOffset == pBestMatch[nNextIndex].offset)
+                                 nPartialSizeAfter += apultra_get_gamma2_size(nMaxLen);
+                              else
+                                 nPartialSizeAfter += apultra_get_match_varlen_size(nMaxLen, pBestMatch[nNextIndex].offset);
+
+                              nPartialSizeAfter += TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */; /* ...plus the next match coded as a rep-match... */
+                              nPartialSizeAfter += apultra_get_gamma2_size(pBestMatch[nNextIndex].length);
+
+                              for (j = nMaxLen; j < pMatch->length; j++) { /* ...plus the bytes no longer covered: a 1-byte match when the byte is 0 or match1 is non-zero there, a literal otherwise */
+                                 if (pInWindow[i + j] == 0 || match1[i + j])
+                                    nPartialSizeAfter += TOKEN_SIZE_4BIT_MATCH + 4;
+                                 else
+                                    nPartialSizeAfter += 1 /* literal bit */ + 8 /* literal byte */;
+                              }
+
+                              if (nPartialSizeAfter < nPartialSizeBefore) {
+                                 /* We gain a repmatch that is shorter than the original match as this is the best we can do, so it is followed by extra literals, but
+                                  * we have calculated that this is shorter */
+
+                                 int nOrigLen = pMatch->length;
+                                 int j;
+
+                                 pMatch->offset = pBestMatch[nNextIndex].offset;
+                                 pMatch->length = nMaxLen;
+
+                                 for (j = nMaxLen; j < nOrigLen; j++) { /* bytes dropped from the match become single-byte commands */
+                                    pBestMatch[i + j].offset = match1[i + j];
+                                    pBestMatch[i + j].length = (pInWindow[i + j] && match1[i+j] == 0) ? 0 : 1; /* length 0 = literal, length 1 = 1-byte match using the offset taken from match1 */
+                                 }
+
+                                 nDidReduce = 1;
+                                 continue;
+                              }
+                           }
+                        }
+                     }
+                  }
+               }
+
+               /* Calculate this command's current cost */
+
+               int nCurCommandSize;
+               if (pMatch->offset == nRepMatchOffset && nFollowsLiteral) {
+                  nCurCommandSize = TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */ + apultra_get_gamma2_size(pMatch->length);
+               }
+               else {
+                  nCurCommandSize = apultra_get_offset_varlen_size(pMatch->length, pMatch->offset, nFollowsLiteral) + apultra_get_match_varlen_size(pMatch->length, pMatch->offset);
+               }
+
+               /* Calculate the next command's current cost */
+               int nNextCommandSize;
+               if (pBestMatch[nNextIndex].offset == pMatch->offset && nNextFollowsLiteral && pBestMatch[nNextIndex].length >= 2) {
+                  nNextCommandSize = TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */ + apultra_get_gamma2_size(pBestMatch[nNextIndex].length);
+               }
+               else {
+                  nNextCommandSize = apultra_get_offset_varlen_size(pBestMatch[nNextIndex].length, pBestMatch[nNextIndex].offset, nNextFollowsLiteral) + apultra_get_match_varlen_size(pBestMatch[nNextIndex].length, pBestMatch[nNextIndex].offset);
+               }
+
+               int nOriginalCombinedCommandSize = nCurCommandSize + nNextCommandSize;
+
+               /* Calculate the cost of replacing this match command by literals + the effect on the cost of the next command */
+               int nReducedCommandSize = 0;
+               int j;
+
+               for (j = 0; j < pMatch->length; j++) { /* each byte costs a 1-byte match when it is 0 or match1 is non-zero there, a literal otherwise */
+                  if (pInWindow[i + j] == 0 || match1[i + j])
+                     nReducedCommandSize += TOKEN_SIZE_4BIT_MATCH + 4;
+                  else
+                     nReducedCommandSize += 1 /* literal bit */ + 8;
+               }
+
+               if (pBestMatch[nNextIndex].offset == nRepMatchOffset /* the new command would always follow literals, the ones we create */ && pBestMatch[nNextIndex].length >= 2) {
+                  nReducedCommandSize += TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */ + apultra_get_gamma2_size(pBestMatch[nNextIndex].length);
+               }
+               else {
+                  if ((pBestMatch[nNextIndex].length < 3 && pBestMatch[nNextIndex].offset >= MINMATCH3_OFFSET) ||
+                     (pBestMatch[nNextIndex].length < 4 && pBestMatch[nNextIndex].offset >= MINMATCH4_OFFSET)) {
+                     /* This match length can only be encoded with a rep-match */
+                     nCannotEncode = 1;
+                  }
+                  else {
+                     nReducedCommandSize += apultra_get_offset_varlen_size(pBestMatch[nNextIndex].length, pBestMatch[nNextIndex].offset, 1 /* follows literals */) + apultra_get_match_varlen_size(pBestMatch[nNextIndex].length, pBestMatch[nNextIndex].offset);
+                  }
+               }
+
+               if (!nCannotEncode && nOriginalCombinedCommandSize > nReducedCommandSize) {
+                  /* Reduce: every byte of this match becomes a 1-byte match or a literal */
+                  int nMatchLen = pMatch->length;
+                  int j;
+
+                  for (j = 0; j < nMatchLen; j++) {
+                     pBestMatch[i + j].offset = match1[i + j];
+                     pBestMatch[i + j].length = (pInWindow[i + j] && match1[i + j] == 0) ? 0 : 1; /* length 0 = literal, length 1 = 1-byte match using the offset taken from match1 */
+                  }
+
+                  nDidReduce = 1;
+                  continue;
+               }
+            }
+         }
+
+         if ((i + pMatch->length) < nEndOffset && pMatch->offset > 0 &&
+            pBestMatch[i + pMatch->length].offset > 0 &&
+            pBestMatch[i + pMatch->length].length >= 2 && /* the command right after this match is itself a match of 2+ bytes */
+            (pMatch->length + pBestMatch[i + pMatch->length].length) >= LEAVE_ALONE_MATCH_SIZE &&
+            (pMatch->length + pBestMatch[i + pMatch->length].length) <= MAX_VARLEN && /* combined length must fall within [LEAVE_ALONE_MATCH_SIZE, MAX_VARLEN] */
+            (i + pMatch->length) >= pMatch->offset &&
+            (i + pMatch->length) >= pBestMatch[i + pMatch->length].offset &&
+            (i + pMatch->length + pBestMatch[i + pMatch->length].length) <= nEndOffset &&
+            !memcmp(pInWindow + i + pMatch->length - pMatch->offset,
+               pInWindow + i + pMatch->length - pBestMatch[i + pMatch->length].offset,
+               pBestMatch[i + pMatch->length].length)) { /* the bytes the following match copies are identical when read through this match's offset */
+            int nMatchLen = pMatch->length;
+
+            /* Join large matches: absorb the following match into this one, keeping this match's offset */
+
+            int nNextIndex = i + pMatch->length + pBestMatch[i + pMatch->length].length; /* first position after both matches */
+            int nNextFollowsLiteral = 0;
+            int nCannotEncode = 0;
+
+            while (nNextIndex < nEndOffset && pBestMatch[nNextIndex].length < 2) { /* skip over literals and 1-byte matches */
+               nNextIndex++;
+               nNextFollowsLiteral = 1;
+            }
+
+            if (nNextIndex < nEndOffset && nNextFollowsLiteral && pBestMatch[nNextIndex].length >= 2 &&
+               pBestMatch[nNextIndex].offset == pBestMatch[i + pMatch->length].offset) { /* the match after the pair follows literals and shares the offset of the match that would be absorbed */
+               if ((pBestMatch[nNextIndex].offset >= MINMATCH3_OFFSET && pBestMatch[nNextIndex].length < 3) ||
+                  (pBestMatch[nNextIndex].offset >= MINMATCH4_OFFSET && pBestMatch[nNextIndex].length < 4)) {
+                  nCannotEncode = 1; /* too short for an explicit offset; presumably it relies on being a rep-match of the absorbed match, which joining would take away */
+               }
+            }
+
+            if (!nCannotEncode) {
+               pMatch->length += pBestMatch[i + nMatchLen].length;
+               pBestMatch[i + nMatchLen].offset = 0;
+               pBestMatch[i + nMatchLen].length = -1; /* the absorbed entry is marked with offset 0 and length -1 */
+               nDidReduce = 1;
+               continue;
+            }
+         }
+
+         nRepMatchOffset = pMatch->offset; /* no rewrite applied: record this match and step over it */
+         nFollowsLiteral = 0;
+         nLastMatchLen = pMatch->length;
+
+         i += pMatch->length;
+      }
+      else {
+         /* 4 bits offset (1 byte match) or literal */
+         i++;
+         nFollowsLiteral = 1;
+         nLastMatchLen = 0;
+      }
+   }
+
+   return nDidReduce;
+}
+
+/**
+ * Emit a block of compressed data
+ *
+ * Walks the final matches for window positions nStartOffset to nEndOffset - 1 and writes one command per step:
+ * a rep-match or a regular match when the match length is 2 or more, a 4 bits offset command when the
+ * length is 1, and a literal otherwise. For the first block, the first input byte is stored verbatim before any
+ * command. For the last block, an end of data marker is appended. Compression statistics in pCompressor->stats
+ * are updated along the way.
+ *
+ * @param pCompressor compression context
+ * @param pBestMatch optimal matches to emit
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param pOutData pointer to output buffer
+ * @param nOutOffset write index into output buffer, at which the first byte of this block is stored
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nCurBitsOffset write index into output buffer, of current byte being filled with bits
+ * @param nCurBitShift bit shift count
+ * @param nFollowsLiteral non-zero if the next command to be issued follows a literal, 0 if not
+ * @param nCurRepMatchOffset starting rep offset for this block, updated after the block is compressed successfully
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ *
+ * @return write index just past the compressed data in the output buffer, or -1 if the output buffer is too small
+ *         or a match cannot be encoded
+ */
+static int apultra_write_block(apultra_compressor *pCompressor, apultra_final_match *pBestMatch, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset, unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int *nCurBitsOffset, int *nCurBitShift, int *nFollowsLiteral, int *nCurRepMatchOffset, const int nBlockFlags) {
+   int i;
+   int nRepMatchOffset = *nCurRepMatchOffset;
+   const int nMaxOffset = pCompressor->max_offset;
+
+   if (nBlockFlags & 1) {
+      /* First block: the first byte is stored as-is, without a command */
+      if (nOutOffset < 0 || nOutOffset >= nMaxOutDataSize)
+         return -1;
+      pOutData[nOutOffset++] = pInWindow[nStartOffset];
+      *nFollowsLiteral = 1;
+   }
+
+   for (i = nStartOffset + ((nBlockFlags & 1) ? 1 : 0); i < nEndOffset; ) {
+      const apultra_final_match *pMatch = pBestMatch + i;
+
+      if (pMatch->length >= 2) {
+         int nMatchOffset = pMatch->offset;
+         int nMatchLen = pMatch->length;
+
+         if (nMatchOffset < MIN_OFFSET || nMatchOffset > nMaxOffset)
+            return -1;
+
+         if (nMatchOffset == nRepMatchOffset && *nFollowsLiteral) {
+            /* Rep-match */
+            nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, TOKEN_CODE_LARGE_MATCH, TOKEN_SIZE_LARGE_MATCH, nCurBitsOffset, nCurBitShift);
+            nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, 0 /* length of 2 encoded as gamma 2 */, 2, nCurBitsOffset, nCurBitShift);
+
+            /* The match length isn't encoded in the command, emit elias gamma value */
+            nOutOffset = apultra_write_gamma2_value(pOutData, nOutOffset, nMaxOutDataSize, nMatchLen, nCurBitsOffset, nCurBitShift);
+            if (nOutOffset < 0) return -1;
+
+            *nFollowsLiteral = 0;
+
+            pCompressor->stats.num_rep_matches++;
+         }
+         else {
+            if (nMatchLen <= 3 && nMatchOffset < 128) {
+               /* 7 bits offset + 1 bit length */
+               nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, TOKEN_CODE_7BIT_MATCH, TOKEN_SIZE_7BIT_MATCH, nCurBitsOffset, nCurBitShift);
+
+               if (nOutOffset < 0 || nOutOffset >= nMaxOutDataSize)
+                  return -1;
+               pOutData[nOutOffset++] = ((nMatchOffset) & 0x7f) << 1 | (nMatchLen - 2);
+
+               *nFollowsLiteral = 0;
+               nRepMatchOffset = nMatchOffset;
+
+               pCompressor->stats.num_7bit_matches++;
+            }
+            else {
+               /* 8+n bits offset */
+               nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, TOKEN_CODE_LARGE_MATCH, TOKEN_SIZE_LARGE_MATCH, nCurBitsOffset, nCurBitShift);
+
+               if (nOutOffset < 0 || nOutOffset >= nMaxOutDataSize)
+                  return -1;
+
+               /* High offset bits, gamma coded. After a literal the value 2 is taken by the rep-match command
+                  written above, so the bias is 3 instead of 2 */
+               if (*nFollowsLiteral)
+                  nOutOffset = apultra_write_gamma2_value(pOutData, nOutOffset, nMaxOutDataSize, (nMatchOffset >> 8) + 3, nCurBitsOffset, nCurBitShift);
+               else
+                  nOutOffset = apultra_write_gamma2_value(pOutData, nOutOffset, nMaxOutDataSize, (nMatchOffset >> 8) + 2, nCurBitsOffset, nCurBitShift);
+
+               /* The gamma write may have failed or moved the write index to the end of the buffer:
+                  check again before storing the low offset byte */
+               if (nOutOffset < 0 || nOutOffset >= nMaxOutDataSize)
+                  return -1;
+               pOutData[nOutOffset++] = nMatchOffset & 0xff;
+
+               /* The match length isn't encoded in the command, emit elias gamma value */
+
+               if (nMatchOffset < 128 || nMatchOffset >= MINMATCH4_OFFSET)
+                  nOutOffset = apultra_write_gamma2_value(pOutData, nOutOffset, nMaxOutDataSize, nMatchLen - 2, nCurBitsOffset, nCurBitShift);
+               else if (nMatchOffset < MINMATCH3_OFFSET)
+                  nOutOffset = apultra_write_gamma2_value(pOutData, nOutOffset, nMaxOutDataSize, nMatchLen, nCurBitsOffset, nCurBitShift);
+               else
+                  nOutOffset = apultra_write_gamma2_value(pOutData, nOutOffset, nMaxOutDataSize, nMatchLen - 1, nCurBitsOffset, nCurBitShift);
+               if (nOutOffset < 0) return -1;
+
+               *nFollowsLiteral = 0;
+               nRepMatchOffset = nMatchOffset;
+
+               pCompressor->stats.num_variable_matches++;
+            }
+         }
+
+         /* Offset and length statistics; a minimum of -1 means that no value was recorded yet */
+         if (nMatchOffset < pCompressor->stats.min_offset || pCompressor->stats.min_offset == -1)
+            pCompressor->stats.min_offset = nMatchOffset;
+         if (nMatchOffset > pCompressor->stats.max_offset)
+            pCompressor->stats.max_offset = nMatchOffset;
+         pCompressor->stats.total_offsets += (long long)nMatchOffset;
+
+         if (nMatchLen < pCompressor->stats.min_match_len || pCompressor->stats.min_match_len == -1)
+            pCompressor->stats.min_match_len = nMatchLen;
+         if (nMatchLen > pCompressor->stats.max_match_len)
+            pCompressor->stats.max_match_len = nMatchLen;
+         pCompressor->stats.total_match_lens += nMatchLen;
+         pCompressor->stats.match_divisor++;
+
+         if (nMatchOffset == 1) {
+            if (nMatchLen < pCompressor->stats.min_rle1_len || pCompressor->stats.min_rle1_len == -1)
+               pCompressor->stats.min_rle1_len = nMatchLen;
+            if (nMatchLen > pCompressor->stats.max_rle1_len)
+               pCompressor->stats.max_rle1_len = nMatchLen;
+            pCompressor->stats.total_rle1_lens += nMatchLen;
+            pCompressor->stats.rle1_divisor++;
+         }
+         else if (nMatchOffset == 2) {
+            if (nMatchLen < pCompressor->stats.min_rle2_len || pCompressor->stats.min_rle2_len == -1)
+               pCompressor->stats.min_rle2_len = nMatchLen;
+            if (nMatchLen > pCompressor->stats.max_rle2_len)
+               pCompressor->stats.max_rle2_len = nMatchLen;
+            pCompressor->stats.total_rle2_lens += nMatchLen;
+            pCompressor->stats.rle2_divisor++;
+         }
+
+         i += nMatchLen;
+
+         pCompressor->stats.commands_divisor++;
+      }
+      else if (pMatch->length == 1) {
+         int nMatchOffset = pMatch->offset;
+
+         /* 4 bits offset */
+
+         if (nMatchOffset < 0 || nMatchOffset > 15)
+            return -1;
+
+         nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, TOKEN_CODE_4BIT_MATCH, TOKEN_SIZE_4BIT_MATCH, nCurBitsOffset, nCurBitShift);
+         nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, nMatchOffset, 4, nCurBitsOffset, nCurBitShift);
+         if (nOutOffset < 0) return -1;
+
+         pCompressor->stats.num_4bit_matches++;
+         pCompressor->stats.commands_divisor++;
+
+         i++;
+         *nFollowsLiteral = 1;
+      }
+      else {
+         /* Literal */
+
+         nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, 0 /* literal */, 1, nCurBitsOffset, nCurBitShift);
+
+         if (nOutOffset < 0 || nOutOffset >= nMaxOutDataSize)
+            return -1;
+         pOutData[nOutOffset++] = pInWindow[i];
+
+         pCompressor->stats.num_literals++;
+         pCompressor->stats.commands_divisor++;
+         i++;
+         *nFollowsLiteral = 1;
+      }
+
+      /* Keep the largest lead of consumed input bytes over produced output bytes */
+      int nCurSafeDist = (i - nStartOffset) - nOutOffset;
+      if (nCurSafeDist >= 0 && pCompressor->stats.safe_dist < nCurSafeDist)
+         pCompressor->stats.safe_dist = nCurSafeDist;
+   }
+
+   if (nBlockFlags & 2) {
+      /* Last block: 8 bits offset command with an offset of 0, marking the end of the data */
+
+      nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, TOKEN_CODE_7BIT_MATCH, TOKEN_SIZE_7BIT_MATCH, nCurBitsOffset, nCurBitShift);
+
+      if (nOutOffset < 0 || nOutOffset >= nMaxOutDataSize)
+         return -1;
+      pOutData[nOutOffset++] = 0x00;   /* Offset: EOD */
+      pCompressor->stats.num_eod++;
+      pCompressor->stats.commands_divisor++;
+
+      int nCurSafeDist = (i - nStartOffset) - nOutOffset;
+      if (nCurSafeDist >= 0 && pCompressor->stats.safe_dist < nCurSafeDist)
+         pCompressor->stats.safe_dist = nCurSafeDist;
+   }
+
+   /* Only publish the new rep offset once the whole block was written */
+   *nCurRepMatchOffset = nRepMatchOffset;
+   return nOutOffset;
+}
+
+/**
+ * Select the most optimal matches, reduce the token count if possible, and then emit a block of compressed data
+ *
+ * The work is done in stages: optionally supplement the match table with 2 and 3-byte matches, compute the
+ * remaining run length of identical bytes at every window position, run the forward optimizer with forward rep
+ * insertion enabled, optionally supplement the match table further, run the forward optimizer again to pick the
+ * final matches, apply up to 20 reduction and merge passes, and finally write the block out. The first
+ * supplement stage only runs when nBlockFlags marks the block as both the first and the last one; the second one
+ * additionally requires the compressor to use NARRIVALS_PER_POSITION_MAX arrivals per position.
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nPreviousBlockSize number of previously compressed bytes (or 0 for none)
+ * @param nInDataSize number of input bytes to compress
+ * @param pOutData pointer to output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nCurBitsOffset write index into output buffer, of current byte being filled with bits
+ * @param nCurBitShift bit shift count
+ * @param nCurFollowsLiteral non-zero if the next command to be issued follows a literal, 0 if not
+ * @param nCurRepMatchOffset starting rep offset for this block, updated after the block is compressed successfully
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ *
+ * @return size of compressed data in output buffer, or -1 if the data is uncompressible
+ */
+static int apultra_optimize_and_write_block(apultra_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize, int *nCurBitsOffset, int *nCurBitShift, int *nCurFollowsLiteral, int *nCurRepMatchOffset, const int nBlockFlags) {
+   int nOutOffset = 0;
+   const int nEndOffset = nPreviousBlockSize + nInDataSize;
+   const int nArrivalsPerPosition = pCompressor->max_arrivals;
+   int *rle_len = (int*)pCompressor->intervals /* reuse */;
+   int i, nPosition;
+
+   memset(pCompressor->best_match, 0, pCompressor->block_size * sizeof(apultra_final_match));
+
+   if ((nBlockFlags & 3) == 3) { /* this block is both the first and the last one */
+      int *first_offset_for_byte = pCompressor->first_offset_for_byte;
+      int *next_offset_for_pos = pCompressor->next_offset_for_pos;
+
+      /* Supplement 2 and 3-byte matches: link every position to the previous position that starts with the
+       * same two bytes, then offer the linked positions as extra match candidates */
+
+      memset(first_offset_for_byte, 0xff, sizeof(int) * 65536); /* all bits set: every entry reads as negative (no position) */
+      memset(next_offset_for_pos, 0xff, sizeof(int) * nInDataSize);
+
+      for (nPosition = nPreviousBlockSize; nPosition < (nEndOffset - 1); nPosition++) {
+         next_offset_for_pos[nPosition - nPreviousBlockSize] = first_offset_for_byte[((unsigned int)pInWindow[nPosition]) | (((unsigned int)pInWindow[nPosition + 1]) << 8)];
+         first_offset_for_byte[((unsigned int)pInWindow[nPosition]) | (((unsigned int)pInWindow[nPosition + 1]) << 8)] = nPosition;
+      }
+
+      for (nPosition = nPreviousBlockSize + 1; nPosition < (nEndOffset - 1); nPosition++) {
+         apultra_match *match = pCompressor->match + ((nPosition - nPreviousBlockSize) << MATCHES_PER_INDEX_SHIFT);
+         unsigned short *match_depth = pCompressor->match_depth + ((nPosition - nPreviousBlockSize) << MATCHES_PER_INDEX_SHIFT);
+         int m = 0, nInserted = 0;
+         int nMatchPos;
+         /* Find the first unused match slot; only the first 15 slots are considered here */
+         while (m < 15 && match[m].length)
+            m++;
+
+         for (nMatchPos = next_offset_for_pos[nPosition - nPreviousBlockSize]; m < 15 && nMatchPos >= 0; nMatchPos = next_offset_for_pos[nMatchPos - nPreviousBlockSize]) {
+            int nMatchOffset = nPosition - nMatchPos;
+
+            if (nMatchOffset <= pCompressor->max_offset) {
+               int nExistingMatchIdx;
+               int nAlreadyExists = 0;
+
+               for (nExistingMatchIdx = 0; nExistingMatchIdx < m; nExistingMatchIdx++) {
+                  if (match[nExistingMatchIdx].offset == nMatchOffset ||
+                     (match[nExistingMatchIdx].offset - (match_depth[nExistingMatchIdx] & 0x3fff)) == nMatchOffset) {
+                     nAlreadyExists = 1;
+                     break;
+                  }
+               }
+
+               if (!nAlreadyExists) { /* new offset: record a 2-byte match, or 3-byte if the following byte matches too */
+                  match[m].length = (nPosition < (nEndOffset - 2) && pInWindow[nMatchPos + 2] == pInWindow[nPosition + 2]) ? 3 : 2;
+                  match[m].offset = nMatchOffset;
+                  match_depth[m] = 0x4000; /* this exact value is tested for by the second supplement stage below */
+                  m++;
+                  nInserted++;
+                  if (nInserted >= 6) /* add at most 6 matches per position */
+                     break;
+               }
+            }
+            else {
+               break; /* the chain leads to ever earlier positions, so offsets only grow from here */
+            }
+         }
+      }
+   }
+   /* For every window position, store the number of identical bytes from that position to the end of its run */
+   i = 0;
+   while (i < nEndOffset) {
+      int nRangeStartIdx = i;
+      unsigned char c = pInWindow[nRangeStartIdx];
+      do {
+         i++;
+      }
+      while (i < nEndOffset && pInWindow[i] == c);
+      while (nRangeStartIdx < i) {
+         rle_len[nRangeStartIdx] = i - nRangeStartIdx;
+         nRangeStartIdx++;
+      }
+   }
+   /* First optimizer pass, with forward rep insertion enabled */
+   apultra_optimize_forward(pCompressor, pInWindow, nPreviousBlockSize, nEndOffset, 1 /* nInsertForwardReps */, nCurRepMatchOffset, nBlockFlags, nArrivalsPerPosition);
+
+   if ((nBlockFlags & 3) == 3 && nArrivalsPerPosition == NARRIVALS_PER_POSITION_MAX) {
+      const int* next_offset_for_pos = pCompressor->next_offset_for_pos;
+      int* offset_cache = pCompressor->offset_cache;
+
+      /* Supplement matches further: for positions whose first match is shorter than 8 bytes, walk the same
+       * 2-byte chains again, lengthen the matches added by the first supplement stage to up to 16 bytes, and
+       * add new offsets that also produce a 2-byte match at one of the next few positions */
+
+      memset(offset_cache, 0xff, sizeof(int) * 2048); /* all bits set: no entry equals a valid position */
+
+      for (nPosition = nPreviousBlockSize + 1; nPosition < (nEndOffset - 1); nPosition++) {
+         apultra_match* match = pCompressor->match + ((nPosition - nPreviousBlockSize) << MATCHES_PER_INDEX_SHIFT);
+
+         if (match[0].length < 8) {
+            unsigned short* match_depth = pCompressor->match_depth + ((nPosition - nPreviousBlockSize) << MATCHES_PER_INDEX_SHIFT);
+            int m = 0, nInserted = 0;
+            int nMatchPos;
+
+            while (m < 42 && match[m].length) { /* mark the offsets already listed for this position */
+               offset_cache[match[m].offset & 2047] = nPosition;
+               offset_cache[(match[m].offset - (match_depth[m] & 0x3fff)) & 2047] = nPosition;
+               m++;
+            }
+
+            for (nMatchPos = next_offset_for_pos[nPosition - nPreviousBlockSize]; m < 42 && nMatchPos >= 0; nMatchPos = next_offset_for_pos[nMatchPos - nPreviousBlockSize]) {
+               int nMatchOffset = nPosition - nMatchPos;
+
+               if (nMatchOffset <= pCompressor->max_offset) {
+                  int nAlreadyExists = 0;
+
+                  if (offset_cache[nMatchOffset & 2047] == nPosition) { /* possibly listed already (the cache keeps 11 offset bits): confirm by scanning */
+                     int nExistingMatchIdx;
+
+                     for (nExistingMatchIdx = 0; nExistingMatchIdx < m; nExistingMatchIdx++) {
+                        if (match[nExistingMatchIdx].offset == nMatchOffset ||
+                           (match[nExistingMatchIdx].offset - (match_depth[nExistingMatchIdx] & 0x3fff)) == nMatchOffset) {
+                           nAlreadyExists = 1;
+
+                           if (match_depth[nExistingMatchIdx] == 0x4000) { /* added by the first supplement stage: try to lengthen it */
+                              int nMatchLen = 2;
+                              while (nMatchLen < 16 && nPosition < (nEndOffset - nMatchLen) && pInWindow[nMatchPos + nMatchLen] == pInWindow[nPosition + nMatchLen])
+                                 nMatchLen++;
+                              if (nMatchLen > (int)match[nExistingMatchIdx].length)
+                                 match[nExistingMatchIdx].length = nMatchLen;
+                           }
+
+                           break;
+                        }
+                     }
+                  }
+
+                  if (!nAlreadyExists) {
+                     int nForwardPos = nPosition + 2 + 1;
+                     int nGotMatch = 0;
+
+                     while (nForwardPos >= nMatchOffset && (nForwardPos + 2) < nEndOffset && nForwardPos < (nPosition + 2 + 1 + 5)) { /* probe up to 5 positions past the 2-byte match */
+                        if (!memcmp(pInWindow + nForwardPos, pInWindow + nForwardPos - nMatchOffset, 2)) {
+                           nGotMatch = 1;
+                           break;
+                        }
+                        nForwardPos++;
+                     }
+
+                     if (nGotMatch) {
+                        int nMatchLen = 2;
+                        while (nMatchLen < 16 && nPosition < (nEndOffset - nMatchLen) && pInWindow[nMatchPos + nMatchLen] == pInWindow[nPosition + nMatchLen])
+                           nMatchLen++;
+                        match[m].length = nMatchLen;
+                        match[m].offset = nMatchOffset;
+                        match_depth[m] = 0;
+                        m++;
+
+                        apultra_insert_forward_match(pCompressor, pInWindow, nPosition, nMatchOffset, nPreviousBlockSize, nEndOffset, nArrivalsPerPosition, 8);
+
+                        nInserted++;
+                        if (nInserted >= 6) /* add at most 6 matches per position */
+                           break;
+                     }
+                  }
+               }
+               else {
+                  break; /* the chain leads to ever earlier positions, so offsets only grow from here */
+               }
+            }
+         }
+      }
+   }
+
+   /* Pick optimal matches (second optimizer pass, without forward rep insertion) */
+   apultra_optimize_forward(pCompressor, pInWindow, nPreviousBlockSize, nEndOffset, 0 /* nInsertForwardReps */, nCurRepMatchOffset, nBlockFlags, nArrivalsPerPosition);
+
+   /* Apply reduction and merge pass, repeated for as long as a pass changes something, up to 20 passes */
+   int nDidReduce;
+   int nPasses = 0;
+   do {
+      nDidReduce = apultra_reduce_commands(pCompressor, pInWindow, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nEndOffset, nCurRepMatchOffset, nBlockFlags);
+      nPasses++;
+   } while (nDidReduce && nPasses < 20);
+
+   /* Write compressed block. best_match is cleared above as block_size entries, so the pointer is moved back
+    * by nPreviousBlockSize to let the callee index it with window offsets */
+
+   return apultra_write_block(pCompressor, pCompressor->best_match - nPreviousBlockSize, pInWindow, nPreviousBlockSize, nEndOffset, pOutData, nOutOffset, nMaxOutDataSize, nCurBitsOffset, nCurBitShift, nCurFollowsLiteral, nCurRepMatchOffset, nBlockFlags);
+}
+
+/* Forward declaration */
+static void apultra_compressor_destroy(apultra_compressor *pCompressor);
+
+/**
+ * Initialize compression context
+ *
+ * Resets every buffer pointer and the statistics, then allocates the work buffers one after the other. As soon
+ * as the suffix sorter or one of the allocations fails, everything obtained so far is released again through
+ * apultra_compressor_destroy().
+ *
+ * @param pCompressor compression context to initialize
+ * @param nBlockSize maximum size of input data (bytes to compress only)
+ * @param nMaxWindowSize maximum size of input data window (previously compressed bytes + bytes to compress)
+ * @param nMaxArrivals maximum number of arrivals per position
+ * @param nFlags compression flags
+ *
+ * @return 0 for success, non-zero for failure
+ */
+static int apultra_compressor_init(apultra_compressor *pCompressor, const int nBlockSize, const int nMaxWindowSize, const int nMaxArrivals, const int nFlags) {
+   const int nSorterStatus = divsufsort_init(&pCompressor->divsufsort_context);
+
+   /* Put the context in a known state first, so that the cleanup path can run at any point */
+   pCompressor->intervals = NULL;
+   pCompressor->pos_data = NULL;
+   pCompressor->open_intervals = NULL;
+   pCompressor->match = NULL;
+   pCompressor->match_depth = NULL;
+   pCompressor->match1 = NULL;
+   pCompressor->best_match = NULL;
+   pCompressor->arrival = NULL;
+   pCompressor->first_offset_for_byte = NULL;
+   pCompressor->next_offset_for_pos = NULL;
+   pCompressor->offset_cache = NULL;
+   pCompressor->flags = nFlags;
+   pCompressor->block_size = nBlockSize;
+   pCompressor->max_arrivals = nMaxArrivals;
+
+   /* Minimums start at -1, meaning that nothing was recorded yet */
+   memset(&pCompressor->stats, 0, sizeof(pCompressor->stats));
+   pCompressor->stats.min_match_len = -1;
+   pCompressor->stats.min_offset = -1;
+   pCompressor->stats.min_rle1_len = -1;
+   pCompressor->stats.min_rle2_len = -1;
+
+   if (nSorterStatus)
+      goto init_failed;
+
+   /* Buffers sized by the window */
+   pCompressor->intervals = (unsigned long long *)malloc(nMaxWindowSize * sizeof(unsigned long long));
+   if (!pCompressor->intervals)
+      goto init_failed;
+
+   pCompressor->pos_data = (unsigned long long *)malloc(nMaxWindowSize * sizeof(unsigned long long));
+   if (!pCompressor->pos_data)
+      goto init_failed;
+
+   pCompressor->open_intervals = (unsigned long long *)malloc((LCP_AND_TAG_MAX + 1) * sizeof(unsigned long long));
+   if (!pCompressor->open_intervals)
+      goto init_failed;
+
+   /* Buffers sized by the block */
+   pCompressor->arrival = (apultra_arrival *)malloc((nBlockSize + 1) * nMaxArrivals * sizeof(apultra_arrival));
+   if (!pCompressor->arrival)
+      goto init_failed;
+
+   pCompressor->best_match = (apultra_final_match *)malloc(nBlockSize * sizeof(apultra_final_match));
+   if (!pCompressor->best_match)
+      goto init_failed;
+
+   pCompressor->match = (apultra_match *)malloc(nBlockSize * NMATCHES_PER_INDEX * sizeof(apultra_match));
+   if (!pCompressor->match)
+      goto init_failed;
+
+   pCompressor->match_depth = (unsigned short *)malloc(nBlockSize * NMATCHES_PER_INDEX * sizeof(unsigned short));
+   if (!pCompressor->match_depth)
+      goto init_failed;
+
+   pCompressor->match1 = (unsigned char *)malloc(nBlockSize * sizeof(unsigned char));
+   if (!pCompressor->match1)
+      goto init_failed;
+
+   pCompressor->first_offset_for_byte = (int*)malloc(65536 * sizeof(int));
+   if (!pCompressor->first_offset_for_byte)
+      goto init_failed;
+
+   pCompressor->next_offset_for_pos = (int*)malloc(nBlockSize * sizeof(int));
+   if (!pCompressor->next_offset_for_pos)
+      goto init_failed;
+
+   /* The offset cache is only allocated when the maximum number of arrivals is requested */
+   if (nMaxArrivals == NARRIVALS_PER_POSITION_MAX) {
+      pCompressor->offset_cache = (int*)malloc(2048 * sizeof(int));
+      if (!pCompressor->offset_cache)
+         goto init_failed;
+   }
+
+   return 0;
+
+init_failed:
+   apultra_compressor_destroy(pCompressor);
+   return 100;
+}
+
+/**
+ * Clean up compression context and free up any associated resources
+ *
+ * Every buffer pointer is reset to NULL after it is released, so the context can be cleaned up again safely.
+ *
+ * @param pCompressor compression context to clean up
+ */
+static void apultra_compressor_destroy(apultra_compressor *pCompressor) {
+   divsufsort_destroy(&pCompressor->divsufsort_context);
+
+   /* free() accepts NULL, so buffers that were never allocated need no special handling */
+   free(pCompressor->offset_cache);
+   pCompressor->offset_cache = NULL;
+
+   free(pCompressor->next_offset_for_pos);
+   pCompressor->next_offset_for_pos = NULL;
+
+   free(pCompressor->first_offset_for_byte);
+   pCompressor->first_offset_for_byte = NULL;
+
+   free(pCompressor->match1);
+   pCompressor->match1 = NULL;
+
+   free(pCompressor->match_depth);
+   pCompressor->match_depth = NULL;
+
+   free(pCompressor->match);
+   pCompressor->match = NULL;
+
+   free(pCompressor->arrival);
+   pCompressor->arrival = NULL;
+
+   free(pCompressor->best_match);
+   pCompressor->best_match = NULL;
+
+   free(pCompressor->open_intervals);
+   pCompressor->open_intervals = NULL;
+
+   free(pCompressor->pos_data);
+   pCompressor->pos_data = NULL;
+
+   free(pCompressor->intervals);
+   pCompressor->intervals = NULL;
+}
+
+/**
+ * Compress one block of data
+ *
+ * Builds the suffix array for the whole window, hands the previously compressed bytes (if any) to
+ * apultra_skip_matches(), collects the matches for the bytes to compress, and then optimizes and writes the block.
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nPreviousBlockSize number of previously compressed bytes (or 0 for none)
+ * @param nInDataSize number of input bytes to compress
+ * @param pOutData pointer to output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nCurBitsOffset write index into output buffer, of current byte being filled with bits
+ * @param nCurBitShift bit shift count
+ * @param nCurFollowsLiteral non-zero if the next command to be issued follows a literal, 0 if not
+ * @param nCurRepMatchOffset starting rep offset for this block, updated after the block is compressed successfully
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ *
+ * @return size of compressed data in output buffer, or -1 if the data is uncompressible
+ */
+static int apultra_compressor_shrink_block(apultra_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize, int *nCurBitsOffset, int *nCurBitShift, int *nCurFollowsLiteral, int *nCurRepMatchOffset, const int nBlockFlags) {
+   const int nWindowSize = nPreviousBlockSize + nInDataSize;
+
+   /* Nothing can be compressed without the suffix array */
+   if (apultra_build_suffix_array(pCompressor, pInWindow, nWindowSize) != 0)
+      return -1;
+
+   if (nPreviousBlockSize != 0)
+      apultra_skip_matches(pCompressor, 0, nPreviousBlockSize);
+
+   apultra_find_all_matches(pCompressor, NMATCHES_PER_INDEX, nPreviousBlockSize, nWindowSize, nBlockFlags);
+
+   return apultra_optimize_and_write_block(pCompressor, pInWindow, nPreviousBlockSize, nInDataSize, pOutData, nMaxOutDataSize,
+      nCurBitsOffset, nCurBitShift, nCurFollowsLiteral, nCurRepMatchOffset, nBlockFlags);
+}
+
+/**
+ * Get maximum compressed size of input(source) data
+ *
+ * The bound is computed in bits and rounded up to whole bytes.
+ *
+ * @param nInputSize input(source) size in bytes
+ *
+ * @return maximum compressed size
+ */
+size_t apultra_get_max_compressed_size(size_t nInputSize) {
+   /* Every input byte budgeted as a literal: 8 data bits + 1 literal bit */
+   const size_t nLiteralBits = nInputSize * 9;
+   /* Fixed cost: match bit + 7+1 command bits + EOD offset bits */
+   const size_t nFixedBits = 1 + 2 + 8;
+
+   return (nLiteralBits + nFixedBits + 7) >> 3;
+}
+
+/**
+ * Compress memory
+ *
+ * The input is compressed block by block. Each block is handed to the block compressor together with the
+ * preceding block (or the dictionary, for the first block) as its window. The number of arrivals per position
+ * is raised above the small default only when the data after the dictionary fits in a single block.
+ *
+ * @param pInputData pointer to input(source) data to compress
+ * @param pOutBuffer buffer for compressed data
+ * @param nInputSize input(source) size in bytes
+ * @param nMaxOutBufferSize maximum capacity of compression buffer
+ * @param nFlags compression flags (set to 0)
+ * @param nMaxWindowSize maximum window size to use (0 for default)
+ * @param nDictionarySize size of dictionary in front of input data (0 for none)
+ * @param progress progress function, called after compressing each block, or NULL for none
+ * @param pStats pointer to compression stats that are filled once the compressor was set up, or NULL
+ *
+ * @return actual compressed size, or -1 (converted to size_t) for error
+ */
+size_t apultra_compress(const unsigned char *pInputData, unsigned char *pOutBuffer, size_t nInputSize, size_t nMaxOutBufferSize,
+      const unsigned int nFlags, size_t nMaxWindowSize, size_t nDictionarySize, void(*progress)(long long nOriginalSize, long long nCompressedSize), apultra_stats *pStats) {
+   apultra_compressor compressor;
+   size_t nOriginalSize = 0;
+   size_t nCompressedSize = 0L;
+   int nResult;
+   int nMaxArrivals = NARRIVALS_PER_POSITION_SMALL;
+   int nError = 0;
+   const int nBlockSize = (nInputSize < BLOCK_SIZE) ? ((nInputSize < 1024) ? 1024 : (int)nInputSize) : BLOCK_SIZE;
+   const int nMaxOutBlockSize = (int)apultra_get_max_compressed_size(nBlockSize);
+
+   if (nDictionarySize < nInputSize) {
+      /* Clamp to the block size before narrowing to int, so that a very large input cannot wrap around */
+      const size_t nRemainingIn = nInputSize - nDictionarySize;
+      const int nInDataSize = (nRemainingIn > (size_t)nBlockSize) ? nBlockSize : (int)nRemainingIn;
+
+      /* Everything after the dictionary fits in one block: spend more effort on it */
+      if ((nDictionarySize + (size_t)nInDataSize) >= nInputSize) {
+         if (nInputSize <= 65536)
+            nMaxArrivals = NARRIVALS_PER_POSITION_MAX;
+         else
+            nMaxArrivals = NARRIVALS_PER_POSITION_NORMAL;
+      }
+   }
+
+   nResult = apultra_compressor_init(&compressor, nBlockSize, nBlockSize * 2, nMaxArrivals, nFlags);
+   if (nResult != 0) {
+      return (size_t)-1;
+   }
+
+   compressor.max_offset = nMaxWindowSize ? (int)nMaxWindowSize : MAX_OFFSET;
+
+   int nPreviousBlockSize = 0;
+   int nCurBitsOffset = INT_MIN, nCurBitShift = 0, nCurFollowsLiteral = 0;
+   int nBlockFlags = 1;
+   int nCurRepMatchOffset = 0;
+
+   if (nDictionarySize) {
+      /* The dictionary counts as already processed input and serves as the window of the first block */
+      nOriginalSize = (int)nDictionarySize;
+      nPreviousBlockSize = (int)nDictionarySize;
+   }
+
+   while (nOriginalSize < nInputSize && !nError) {
+      /* Both sizes are clamped while still size_t; narrowing first would yield zero or a negative value once
+         2 GiB or more remain, and the loop would then never advance */
+      const size_t nRemainingIn = nInputSize - nOriginalSize;
+      const size_t nRemainingOut = nMaxOutBufferSize - nCompressedSize;
+      const int nInDataSize = (nRemainingIn > (size_t)nBlockSize) ? nBlockSize : (int)nRemainingIn;
+      const int nOutDataEnd = (nRemainingOut > (size_t)nMaxOutBlockSize) ? nMaxOutBlockSize : (int)nRemainingOut;
+      int nOutDataSize;
+
+      if ((nOriginalSize + (size_t)nInDataSize) >= nInputSize)
+         nBlockFlags |= 2;
+      nOutDataSize = apultra_compressor_shrink_block(&compressor, pInputData + nOriginalSize - nPreviousBlockSize, nPreviousBlockSize, nInDataSize, pOutBuffer + nCompressedSize, nOutDataEnd,
+         &nCurBitsOffset, &nCurBitShift, &nCurFollowsLiteral, &nCurRepMatchOffset, nBlockFlags);
+      nBlockFlags &= (~1);
+
+      if (nOutDataSize >= 0) {
+         /* Write compressed block */
+         nOriginalSize += nInDataSize;
+         nCompressedSize += nOutDataSize;
+
+         /* The bits offset is relative to the output pointer handed to the block compressor, which moves
+            forward by nOutDataSize for the next block. INT_MIN is the initial value and is left untouched */
+         if (nCurBitsOffset != INT_MIN)
+            nCurBitsOffset -= nOutDataSize;
+      }
+      else {
+         nError = -1;
+      }
+
+      nPreviousBlockSize = nInDataSize;
+
+      if (!nError && nOriginalSize < nInputSize) {
+         if (progress)
+            progress(nOriginalSize, nCompressedSize);
+      }
+   }
+
+   if (progress)
+      progress(nOriginalSize, nCompressedSize);
+   if (pStats)
+      *pStats = compressor.stats;
+
+   apultra_compressor_destroy(&compressor);
+
+   if (nError) {
+      return (size_t)-1;
+   }
+   else {
+      return nCompressedSize;
+   }
+}
diff --git a/tools/apultra/src/shrink.h b/tools/apultra/src/shrink.h
new file mode 100644
index 0000000..0057d68
--- /dev/null
+++ b/tools/apultra/src/shrink.h
@@ -0,0 +1,174 @@
+/*
+ * shrink.h - compressor definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke <zxintrospec@gmail.com>
+ *
+ */
+
+#ifndef _SHRINK_H
+#define _SHRINK_H
+
+#include "divsufsort.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LCP_BITS 15 /* field width used by LCP_AND_TAG_MAX, LCP_SHIFT and LCP_MASK below */
+#define TAG_BITS 4 /* subtracted from LCP_BITS when sizing LCP_MAX */
+#define LCP_MAX ((1U<<(LCP_BITS - TAG_BITS)) - 1) /* 2047: largest value that fits in LCP_BITS - TAG_BITS = 11 bits */
+#define LCP_AND_TAG_MAX ((1U<<LCP_BITS) - 1) /* 32767: largest value that fits in LCP_BITS = 15 bits */
+#define LCP_SHIFT (63-LCP_BITS) /* 48: lowest bit of the LCP_BITS-wide field inside a 64-bit word */
+#define LCP_MASK (((1ULL<<LCP_BITS) - 1) << LCP_SHIFT) /* selects bits 48..62 */
+#define POS_MASK ((1ULL<<LCP_SHIFT) - 1) /* selects bits 0..47 */
+#define VISITED_FLAG 0x8000000000000000ULL /* bit 63 only */
+#define EXCL_VISITED_MASK  0x7fffffffffffffffULL /* every bit except bit 63, i.e. ~VISITED_FLAG */
+
+#define NARRIVALS_PER_POSITION_MAX 55 /* largest of the three NARRIVALS_* values */
+#define NARRIVALS_PER_POSITION_NORMAL 46
+#define NARRIVALS_PER_POSITION_SMALL 9 /* NOTE(review): presumably one of these is stored in apultra_compressor.max_arrivals -- confirm in shrink.c */
+
+#define NMATCHES_PER_INDEX 64 /* equals 1 << MATCHES_PER_INDEX_SHIFT; keep the two in sync */
+#define MATCHES_PER_INDEX_SHIFT 6 /* log2(NMATCHES_PER_INDEX) */
+
+#define LEAVE_ALONE_MATCH_SIZE 120
+
+/** One match option, packed into 11 + 21 = 32 bits of bit-fields */
+typedef struct _apultra_match {
+   unsigned int length:11; /* 0..2047, the same maximum as LCP_MAX */
+   unsigned int offset:21; /* 0..2097151 */
+} apultra_match;
+
+/** One finalized match, held in full-width int fields rather than the bit-fields of apultra_match */
+typedef struct _apultra_final_match {
+   int length;
+   int offset;
+} apultra_final_match;
+
+/** Forward arrival slot; the exact bit-field layout and struct size are chosen by the compiler */
+typedef struct {
+   int cost;
+
+   unsigned int from_pos:21; /* 0..2097151 */
+   int from_slot:7; /* plain int bit-field: whether it is signed is implementation-defined in C */
+   unsigned int follows_literal:1; /* single-bit flag */
+
+   unsigned int rep_offset:21; /* same width as apultra_match.offset */
+   unsigned int short_offset:4; /* 0..15 */
+   unsigned int rep_pos:21; /* same width as from_pos */
+   unsigned int match_len:11; /* same width as apultra_match.length */
+
+   int score;
+} apultra_arrival;
+
+/** Compression statistics; apultra_compress() hands these back through its pStats argument */
+typedef struct _apultra_stats {
+   int num_literals;
+   int num_4bit_matches;
+   int num_7bit_matches;
+   int num_variable_matches;
+   int num_rep_matches;
+   int num_eod;
+
+   int safe_dist;
+
+   int min_offset;
+   int max_offset;
+   long long total_offsets; /* the only total kept as long long; the other totals below are int */
+
+   int min_match_len;
+   int max_match_len;
+   int total_match_lens;
+
+   int min_rle1_len;
+   int max_rle1_len;
+   int total_rle1_lens;
+
+   int min_rle2_len;
+   int max_rle2_len;
+   int total_rle2_lens;
+
+   int commands_divisor; /* NOTE(review): the *_divisor fields presumably are the counts used to average the totals above -- confirm in shrink.c */
+   int match_divisor;
+   int rle1_divisor;
+   int rle2_divisor;
+} apultra_stats;
+
+/** Compression context: working buffers, settings and running statistics for one compressor instance */
+typedef struct _apultra_compressor {
+   divsufsort_ctx_t divsufsort_context; /* type is not declared in this header; presumably comes from divsufsort.h */
+   unsigned long long *intervals; /* 64-bit words; NOTE(review): presumably packed with the LCP_MASK / POS_MASK / VISITED_FLAG macros -- confirm */
+   unsigned long long *pos_data;
+   unsigned long long *open_intervals;
+   apultra_match *match;
+   unsigned short *match_depth;
+   unsigned char *match1;
+   apultra_final_match *best_match;
+   apultra_arrival *arrival;
+   int *first_offset_for_byte;
+   int *next_offset_for_pos;
+   int *offset_cache;
+   int flags;
+   int block_size;
+   int max_offset;
+   int max_arrivals; /* NOTE(review): presumably one of the NARRIVALS_PER_POSITION_* values -- confirm */
+   apultra_stats stats; /* statistics record, see apultra_stats */
+} apultra_compressor;
+
+/**
+ * Get the maximum compressed size for input (source) data of a given size
+ *
+ * @param nInputSize input (source) size in bytes
+ *
+ * @return maximum compressed size, in bytes
+ */
+size_t apultra_get_max_compressed_size(size_t nInputSize);
+
+/**
+ * Compress memory
+ *
+ * @param pInputData pointer to input (source) data to compress
+ * @param pOutBuffer buffer for compressed data
+ * @param nInputSize input (source) size in bytes
+ * @param nMaxOutBufferSize maximum capacity of compression buffer, in bytes
+ * @param nFlags compression flags (set to 0)
+ * @param nMaxWindowSize maximum window size to use (0 for default)
+ * @param nDictionarySize size of dictionary in front of input data (0 for none)
+ * @param progress progress function, called after compressing each block, or NULL for none
+ * @param pStats pointer to compression stats that are filled if this function is successful, or NULL
+ *
+ * @return actual compressed size, or (size_t)-1 for error (the return type is unsigned, so compare against (size_t)-1; a "< 0" test never fires)
+ */
+size_t apultra_compress(const unsigned char *pInputData, unsigned char *pOutBuffer, size_t nInputSize, size_t nMaxOutBufferSize,
+   const unsigned int nFlags, size_t nMaxWindowSize, size_t nDictionarySize, void(*progress)(long long nOriginalSize, long long nCompressedSize), apultra_stats *pStats);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SHRINK_H */
author	Juan J. Martinez <jjm@usebox.net>	2021-01-09 09:01:05 +0000
committer	Juan J. Martinez <jjm@usebox.net>	2021-01-09 09:01:05 +0000
commit	9bcf1e97960c0da7322a868efdbc07e2650716fe (patch)
tree	de6d32ad5b0e567991bd3eb262902c15a77074d9 /tools
parent	3b31adf01305e522f7e28c1435fb47418ce43267 (diff)
download	ubox-msx-lib-9bcf1e97960c0da7322a868efdbc07e2650716fe.tar.gz ubox-msx-lib-9bcf1e97960c0da7322a868efdbc07e2650716fe.zip