From ce07f842617e711eda130c89131dbcee3c5f2b79 Mon Sep 17 00:00:00 2001 From: Jonathon LeFaive Date: Sat, 14 Oct 2017 16:52:12 -0400 Subject: [PATCH] Alpha 3 Merge (#1) * Changes project name and adds namespace. * Adds gz reader and bgzf writer. * Updates readme. * Adds bgzf support. * Fixes test commands. * Adds ogzbuf and updates target names. * Minor changes. * Updates bgz seek test. * Example of actual bgzf. This commit is a temporary untested example of actual bgzf. I find the the encoding of compressed block size into the header a counterproductive solution to a problem that does not exist. So this code will be replaced with a simpler version of blocked gzip that only restricts the uncompressed data size as opposed to both uncompressed and compressed. * Reverting actual bgzf implementation. * Changes naming structure. * Checks in generic istream. * Adds generic istream tests. * Updates readme with new naming structure. * Fixes bgzf virtual offset bugs. * Adds initial zstd support. * Fixes build error. * Fixes zlib flush issue. * Adds zstd seek support. * Fixes move bug in zstd::ibuf. * Adds zstd.cmake. * Addresses gcc bug 54316. * Addresses gcc bug 54316. * Fixes premature end of file. * Removes unnecessary assert. * Adds cmake 3.2 support. * Attempts to fix cmake issues. * Attempts to fix cmake issues. * Attempts to fix cmake issues. * Attempts to fix cmake issues. * Attempts to fix cmake issues. * Adds optional compression level parameter to zstd::obuf. * Prevents zstd shared libs from being built since advanced api is required. --- .travis.yml | 12 +- CMakeLists.txt | 69 +++- README.md | 47 ++- dep/xz.cmake | 2 +- dep/zstd.cmake | 71 ++++ include/shrinkwrap/gz.hpp | 712 +++++++++++++++++++++++++++++++++ include/shrinkwrap/istream.hpp | 76 ++++ include/shrinkwrap/xz.hpp | 588 +++++++++++++++++++++++++++ include/shrinkwrap/zstd.hpp | 420 +++++++++++++++++++ include/xzbuf.hpp | 556 ------------------------- requirements.txt | 1 + src/test.cpp | 227 ++++++++++- 12 files changed, 2172 insertions(+), 609 deletions(-) create mode 100755 dep/zstd.cmake create mode 100644 include/shrinkwrap/gz.hpp create mode 100644 include/shrinkwrap/istream.hpp create mode 100644 include/shrinkwrap/xz.hpp create mode 100644 include/shrinkwrap/zstd.hpp delete mode 100644 include/xzbuf.hpp diff --git a/.travis.yml b/.travis.yml index 2e0c68c..9e0f7cb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,7 @@ language: cpp compiler: gcc dist: trusty sudo: required +group: edge branches: only: - develop @@ -15,12 +16,19 @@ addons: - cmake - gcc-5 - g++-5 -#install: + - python-dev + - python-pip +install: + - sudo pip install cget + - cget install -f ./requirements.txt +# - curl -L https://github.com/facebook/zstd/archive/v1.2.0.tar.gz > zstd-v1.2.0.tar.gz +# - tar -xzf zstd-v1.2.0.tar.gz +# - cd zstd-1.2.0/ && cmake -DCMAKE_BUILD_TYPE=Release -DZSTD_BUILD_STATIC=1 build/cmake && make && sudo make install && cd .. # - sudo apt-get install --yes --only-upgrade cmake-data # - sudo apt-get install --yes --only-upgrade cmake script: - cmake --version - mkdir build && cd build - - cmake -DCMAKE_C_COMPILER=/usr/bin/gcc-5 -DCMAKE_CXX_COMPILER=/usr/bin/g++-5 .. || echo "" + - cmake -DCMAKE_TOOLCHAIN_FILE=../cget/cget/cget.cmake -DCMAKE_C_COMPILER=/usr/bin/gcc-5 -DCMAKE_CXX_COMPILER=/usr/bin/g++-5 .. - make - make CTEST_OUTPUT_ON_FAILURE=1 test \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index ce992e1..b08c1c7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.2) include(ExternalProject) include(CMakePackageConfigHelpers) -project(xzbuf VERSION 1.0.0) +project(shrinkwrap VERSION 1.0.0) enable_testing() @@ -9,37 +9,68 @@ set(CMAKE_CXX_STANDARD 11) if (BUILD_SHARED_LIBS) set(LIBLZMA_LIB_NAME ${CMAKE_SHARED_LIBRARY_PREFIX}lzma${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(ZLIB_LIB_NAME ${CMAKE_SHARED_LIBRARY_PREFIX}z${CMAKE_SHARED_LIBRARY_SUFFIX}) else() set(LIBLZMA_LIB_NAME ${CMAKE_STATIC_LIBRARY_PREFIX}lzma${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(ZLIB_LIB_NAME ${CMAKE_STATIC_LIBRARY_PREFIX}z${CMAKE_STATIC_LIBRARY_SUFFIX}) endif() find_library(LIBLZMA_LIBRARIES NAMES ${LIBLZMA_LIB_NAME}) +find_library(ZLIB_LIBRARIES + NAMES ${ZLIB_LIB_NAME}) + +#ZSTD can only likn statically since experimental functions are being used. +find_library(ZSTD_LIBRARIES + NAMES ${CMAKE_STATIC_LIBRARY_PREFIX}zstd${CMAKE_STATIC_LIBRARY_SUFFIX}) + if (NOT LIBLZMA_LIBRARIES) message(FATAL_ERROR "lzma library not found") endif() -add_library(xzbuf INTERFACE) -target_sources(xzbuf INTERFACE $) -target_include_directories(xzbuf INTERFACE - $ - $) -target_link_libraries(xzbuf INTERFACE ${LIBLZMA_LIBRARIES}) +if (NOT ZLIB_LIBRARIES) + message(FATAL_ERROR "zlib library not found") +endif() -add_executable(xzbuf-test src/test.cpp) -target_link_libraries(xzbuf-test xzbuf) +if (NOT ZSTD_LIBRARIES) + message(FATAL_ERROR "zstd library not found") +endif() -add_test(seek_test xzbuf-test seek) -add_test(iterator_test xzbuf-test iterator) +add_library(shrinkwrap INTERFACE) +if (CMAKE_VERSION VERSION_GREATER 3.3) + target_sources(shrinkwrap INTERFACE $) + target_include_directories(shrinkwrap INTERFACE + $ + $) + target_link_libraries(shrinkwrap INTERFACE ${LIBLZMA_LIBRARIES} ${ZLIB_LIBRARIES} ${ZSTD_LIBRARIES}) + + add_executable(shrinkwrap-test src/test.cpp) + target_link_libraries(shrinkwrap-test shrinkwrap) +else() + add_executable(shrinkwrap-test src/test.cpp) + target_link_libraries(shrinkwrap-test ${LIBLZMA_LIBRARIES} ${ZLIB_LIBRARIES} ${ZSTD_LIBRARIES}) + target_include_directories(shrinkwrap-test PUBLIC include) +endif() -install(FILES $ DESTINATION include) -install(TARGETS xzbuf EXPORT xzbuf-config - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) +add_test(xz_seek_test shrinkwrap-test xz-seek) +add_test(xz_iterator_test shrinkwrap-test xz-iter) +add_test(gz_iterator_test shrinkwrap-test gz-iter) +add_test(bgz_seek_test shrinkwrap-test bgz-seek) +add_test(bgz_iterator_test shrinkwrap-test bgz-iter) +add_test(zstd_iterator_test shrinkwrap-test zstd-iter) +add_test(zstd_seek_test shrinkwrap-test zstd-seek) +add_test(generic_iterator_test shrinkwrap-test generic-iter) +add_test(generic_seek_test shrinkwrap-test generic-seek) -install(EXPORT ${PROJECT_NAME}-config DESTINATION share/${PROJECT_NAME}) -write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake COMPATIBILITY SameMajorVersion) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake DESTINATION share/${PROJECT_NAME}) +install(DIRECTORY include/shrinkwrap DESTINATION include) +if (CMAKE_VERSION VERSION_GREATER 3.3) + install(TARGETS shrinkwrap EXPORT ${PROJECT_NAME}-config + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) -export(EXPORT ${PROJECT_NAME}-config) \ No newline at end of file + install(EXPORT ${PROJECT_NAME}-config DESTINATION share/${PROJECT_NAME}) + write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake COMPATIBILITY SameMajorVersion) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake DESTINATION share/${PROJECT_NAME}) + export(EXPORT ${PROJECT_NAME}-config) +endif() \ No newline at end of file diff --git a/README.md b/README.md index 1012630..4a7683f 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ -# xzbuf -A streambuf for xz files. +# Shrink Wrap +A std::streambuf wrapper for compression formats. -## ixzbuf with std::istream +## XZ streambuf with std::istream ```c++ std::array buf; -ixzbuf sbuf("file.xz"); +shrinkwrap::xz::ibuf sbuf("file.xz"); std::istream is(&sbuf); is.seekg(-1024, std::ios::end); while (is) @@ -13,17 +13,17 @@ while (is) std::cout.write(buf.data(), is.gcount()); } ``` -## ixzbuf with std::istreambuf_iterator +## XZ streambuf with std::istreambuf_iterator ```c++ -ixzbuf sbuf("file.xz"); +shrinkwrap::xz::ibuf sbuf("file.xz"); for (std::istreambuf_iterator it(&sbuf); it != std::istreambuf_iterator{}; ++it) std::cout.put(*it); ``` -## ixzstream +## XZ input stream ```c++ std::array buf; -ixzstream is("file.xz"); +shrinkwrap::xz::ostream is("file.xz"); while (is) { is.read(buf.data(), buf.size()); @@ -31,10 +31,10 @@ while (is) } ``` -## oxzstream +## XZ output stream ```c++ -std::array buf; -oxzstream os("file.xz"); +std::vector buf(1024 * 1024); +shrinkwrap::xz::ostream os("file.xz"); while (std::cin) { std::cin.read(buf.data(), buf.size()); @@ -43,5 +43,28 @@ while (std::cin) } ``` +## BGZF (Blocked GNU Zip Format) +```c++ +std::array buf; +shrinkwrap::bgz::istream is("file.xz"); +is.read(buf.data(), buf.size()); + +// (gzip_block_position << 16) | relative_uncompressed_offset +auto virtual_offset = is.tellg(); +is.seekg(virtual_offset); +``` + +## Generic input stream +Generic istream detects file format. +```c++ +std::array buf; +shrinkwrap::istream is("file"); +while (is) +{ + is.read(buf.data(), buf.size()); + std::cout.write(buf.data(), is.gcount()); +} +``` + ## Caveats -* Does not support files with concatenated streams. +* Does not support files with concatenated xz streams. diff --git a/dep/xz.cmake b/dep/xz.cmake index 7b8e142..efd58f8 100644 --- a/dep/xz.cmake +++ b/dep/xz.cmake @@ -3,7 +3,7 @@ project(xz VERSION 5.2.3) execute_process(COMMAND ./configure --prefix=${CMAKE_INSTALL_PREFIX} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) -add_custom_target(xz ALL COMMAND make WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} BYPRODUCTS ${CMAKE_CURRENT_SOURCE_DIR}/src/liblzma/.libs/liblzma.a COMMENT "Builing xz ...") +add_custom_target(xz ALL COMMAND make WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Builing xz ...") install(DIRECTORY src/liblzma/api/ DESTINATION include FILES_MATCHING PATTERN "*.h") diff --git a/dep/zstd.cmake b/dep/zstd.cmake new file mode 100755 index 0000000..87fa080 --- /dev/null +++ b/dep/zstd.cmake @@ -0,0 +1,71 @@ +# ################################################################ +# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. +# ################################################################ + +PROJECT(zstd) +CMAKE_MINIMUM_REQUIRED(VERSION 2.8.9) +OPTION(ZSTD_BUILD_STATIC "must be static" ON) +SET(ZSTD_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") +LIST(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/build/cmake/CMakeModules") + +#----------------------------------------------------------------------------- +# Add extra compilation flags +#----------------------------------------------------------------------------- +INCLUDE(AddZstdCompilationFlags) +ADD_ZSTD_COMPILATION_FLAGS() + +#----------------------------------------------------------------------------- +# Options +#----------------------------------------------------------------------------- +OPTION(ZSTD_LEGACY_SUPPORT "LEGACY SUPPORT" OFF) +IF (UNIX) + OPTION(ZSTD_MULTITHREAD_SUPPORT "MULTITHREADING SUPPORT" ON) +ELSE (UNIX) + OPTION(ZSTD_MULTITHREAD_SUPPORT "MULTITHREADING SUPPORT" OFF) +ENDIF (UNIX) +OPTION(ZSTD_BUILD_PROGRAMS "BUILD PROGRAMS" ON) +OPTION(ZSTD_BUILD_CONTRIB "BUILD CONTRIB" OFF) +OPTION(ZSTD_BUILD_TESTS "BUILD TESTS" OFF) + +IF (ZSTD_LEGACY_SUPPORT) + MESSAGE(STATUS "ZSTD_LEGACY_SUPPORT defined!") + ADD_DEFINITIONS(-DZSTD_LEGACY_SUPPORT=4) +ELSE (ZSTD_LEGACY_SUPPORT) + MESSAGE(STATUS "ZSTD_LEGACY_SUPPORT not defined!") + ADD_DEFINITIONS(-DZSTD_LEGACY_SUPPORT=0) +ENDIF (ZSTD_LEGACY_SUPPORT) + +#----------------------------------------------------------------------------- +# Add source directories +#----------------------------------------------------------------------------- +SET(ZSTD_BUILD_SHARED OFF CACHE BOOL "must be static" FORCE) +ADD_SUBDIRECTORY(build/cmake/lib) + +IF (ZSTD_BUILD_PROGRAMS) + ADD_SUBDIRECTORY(build/cmake/programs) +ENDIF (ZSTD_BUILD_PROGRAMS) + +IF (ZSTD_BUILD_TESTS) + IF (NOT ZSTD_BUILD_STATIC) + MESSAGE(SEND_ERROR "You need to build static library to build tests") + ENDIF (NOT ZSTD_BUILD_STATIC) + + ADD_SUBDIRECTORY(build/cmake/tests) +ENDIF (ZSTD_BUILD_TESTS) + +IF (ZSTD_BUILD_CONTRIB) + ADD_SUBDIRECTORY(build/cmake/contrib) +ENDIF (ZSTD_BUILD_CONTRIB) + +#----------------------------------------------------------------------------- +# Add clean-all target +#----------------------------------------------------------------------------- +ADD_CUSTOM_TARGET(clean-all + COMMAND ${CMAKE_BUILD_TOOL} clean + COMMAND rm -rf ${CMAKE_BINARY_DIR}/ +) diff --git a/include/shrinkwrap/gz.hpp b/include/shrinkwrap/gz.hpp new file mode 100644 index 0000000..93185b4 --- /dev/null +++ b/include/shrinkwrap/gz.hpp @@ -0,0 +1,712 @@ +#ifndef SHRINKWRAP_GZ_HPP +#define SHRINKWRAP_GZ_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace shrinkwrap +{ + namespace gz + { + class ibuf : public std::streambuf + { + public: + ibuf(FILE* fp) + : + zstrm_({0}), + compressed_buffer_(default_block_size), + decompressed_buffer_(default_block_size), + discard_amount_(0), + current_block_position_(0), + uncompressed_block_offset_(0), + fp_(fp), + put_back_size_(0), + at_block_boundary_(false) + { + if (fp_) + { + zlib_res_ = inflateInit2(&zstrm_, 15 + 16); // 16 for GZIP only. + if (zlib_res_ != Z_OK) + { + // TODO: handle error. + } + } + char* end = ((char*) decompressed_buffer_.data()) + decompressed_buffer_.size(); + setg(end, end, end); + } + + ibuf(const std::string& file_path) : ibuf(fopen(file_path.c_str(), "rb")) {} +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + ibuf(ibuf&& src) + : + std::streambuf(std::move(src)) + { + this->move(std::move(src)); + } + + ibuf& operator=(ibuf&& src) + { + if (&src != this) + { + std::streambuf::operator=(std::move(src)); + this->destroy(); + this->move(std::move(src)); + } + + return *this; + } +#endif + virtual ~ibuf() + { + this->destroy(); + } + + private: + //ixzbuf(const ixzbuf& src) = delete; + //ixzbuf& operator=(const ixzbuf& src) = delete; + + void destroy() + { + if (fp_) + { + inflateEnd(&zstrm_); + fclose(fp_); + } + } + + void move(ibuf&& src) + { + zstrm_ = src.zstrm_; + src.zstrm_ = {0}; + compressed_buffer_ = std::move(src.compressed_buffer_); + decompressed_buffer_ = std::move(src.decompressed_buffer_); + discard_amount_ = src.discard_amount_; + current_block_position_ = src.current_block_position_; + uncompressed_block_offset_ = src.uncompressed_block_offset_; + at_block_boundary_ = src.at_block_boundary_; + fp_ = src.fp_; + src.fp_ = nullptr; + put_back_size_ = src.put_back_size_; + zlib_res_ = src.zlib_res_; + } + + void replenish_compressed_buffer() + { + zstrm_.next_in = compressed_buffer_.data(); + zstrm_.avail_in = fread(compressed_buffer_.data(), 1, compressed_buffer_.size(), fp_); + } + + protected: + + virtual std::streambuf::int_type underflow() + { + if (!fp_) + return traits_type::eof(); + if (gptr() < egptr()) // buffer not exhausted + return traits_type::to_int_type(*gptr()); + + while ((zlib_res_ == Z_OK || zlib_res_ == Z_STREAM_END) && gptr() >= egptr() && (zstrm_.avail_in > 0 || !feof(fp_))) + { + zstrm_.next_out = decompressed_buffer_.data(); + zstrm_.avail_out = static_cast(decompressed_buffer_.size()); + + if (zstrm_.avail_in == 0 && !feof(fp_)) + { + replenish_compressed_buffer(); + } + + + + //assert(zstrm_.avail_in > 0); + if (zlib_res_ == Z_STREAM_END && zstrm_.avail_in > 0) + { + zlib_res_ = inflateReset(&zstrm_); + uncompressed_block_offset_ = 0; + current_block_position_ = std::size_t(ftell(fp_)) - zstrm_.avail_in; + } + + zlib_res_ = inflate(&zstrm_, Z_NO_FLUSH); + + + char* start = ((char*) decompressed_buffer_.data()); + setg(start, start, start + (decompressed_buffer_.size() - zstrm_.avail_out)); + uncompressed_block_offset_ += (egptr() - gptr()); + + if (discard_amount_ > 0) + { + std::uint64_t advance_amount = discard_amount_; + if ((egptr() - gptr()) < advance_amount) + advance_amount = (egptr() - gptr()); + setg(start, gptr() + advance_amount, egptr()); + discard_amount_ -= advance_amount; + } + } + + if (zlib_res_ != Z_OK && zlib_res_ != Z_STREAM_END) + return traits_type::eof(); + else if (gptr() >= egptr()) + return traits_type::eof(); + + return traits_type::to_int_type(*gptr()); + } + + virtual std::streambuf::pos_type seekoff(std::streambuf::off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) + { + return pos_type(off_type(-1)); + } + + private: + std::vector compressed_buffer_; + std::vector decompressed_buffer_; + std::size_t put_back_size_; + bool at_block_boundary_; + protected: + static const std::size_t default_block_size = 64 * 1024; + int zlib_res_; + z_stream zstrm_; + std::uint16_t discard_amount_; + std::size_t current_block_position_; + std::size_t uncompressed_block_offset_; + FILE* fp_; + }; + + class obuf : public std::streambuf + { + public: + obuf(FILE* fp) + : + zstrm_({0}), + fp_(fp), + compressed_buffer_(default_block_size), + decompressed_buffer_(default_block_size) + { + if (!fp_) + { + char* end = ((char*) decompressed_buffer_.data()) + decompressed_buffer_.size(); + setp(end, end); + } + else + { + zlib_res_ = deflateInit2(&zstrm_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, (15 | 16), 8, Z_DEFAULT_STRATEGY); // |16 for GZIP + if (zlib_res_ != Z_OK) + { + // TODO: handle error. + } + + zstrm_.next_out = compressed_buffer_.data(); + zstrm_.avail_out = static_cast(compressed_buffer_.size()); + + char* end = ((char*) decompressed_buffer_.data()) + decompressed_buffer_.size(); + setp((char*) decompressed_buffer_.data(), end); + } + } + + obuf(const std::string& file_path) : obuf(fopen(file_path.c_str(), "wb")) {} +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + obuf(obuf&& src) + : + std::streambuf(std::move(src)) + { + this->move(std::move(src)); + } + + obuf& operator=(obuf&& src) + { + if (&src != this) + { + std::streambuf::operator=(std::move(src)); + this->close(); + this->move(std::move(src)); + } + + return *this; + } +#endif + virtual ~obuf() + { + this->close(); + } + + private: + void move(obuf&& src) + { + compressed_buffer_ = std::move(src.compressed_buffer_); + decompressed_buffer_ = std::move(src.decompressed_buffer_); + zstrm_ = src.zstrm_; + fp_ = src.fp_; + src.fp_ = nullptr; + zlib_res_ = src.zlib_res_; + } + + void close() + { + if (fp_) + { + sync(); + int res = deflateEnd(&zstrm_); + if (zlib_res_ == Z_OK) + zlib_res_ = res; + fclose(fp_); + fp_ = nullptr; + } + } + protected: + virtual int overflow(int c) + { + if (!fp_) + return traits_type::eof(); + + if ((epptr() - pptr()) > 0) + { + assert(!"Put buffer not empty, this should never happen"); + this->sputc(static_cast(0xFF & c)); + } + else + { + zstrm_.next_in = decompressed_buffer_.data(); + zstrm_.avail_in = static_cast(decompressed_buffer_.size()); + while (zlib_res_ == Z_OK && zstrm_.avail_in > 0) + { + zlib_res_ = deflate(&zstrm_, Z_SYNC_FLUSH); + + if ((compressed_buffer_.size() - zstrm_.avail_out) > 0 && !fwrite(compressed_buffer_.data(), compressed_buffer_.size() - zstrm_.avail_out, 1, fp_)) + { + // TODO: handle error. + return traits_type::eof(); + } + zstrm_.next_out = compressed_buffer_.data(); + zstrm_.avail_out = static_cast(compressed_buffer_.size()); + } + + if (zlib_res_ == Z_STREAM_END) + zlib_res_ = deflateReset(&zstrm_); + + assert(zstrm_.avail_in == 0); + decompressed_buffer_[0] = reinterpret_cast(c); + setp((char*) decompressed_buffer_.data() + 1, (char*) decompressed_buffer_.data() + decompressed_buffer_.size()); + } + + return (zlib_res_ == Z_OK ? traits_type::to_int_type(c) : traits_type::eof()); + } + + virtual int sync() + { + if (!fp_) + return -1; + + zstrm_.next_in = decompressed_buffer_.data(); + zstrm_.avail_in = static_cast(decompressed_buffer_.size() - (epptr() - pptr())); + if (zstrm_.avail_in) + { + while (zlib_res_ == Z_OK && zstrm_.avail_in > 0) + { + zlib_res_ = deflate(&zstrm_, Z_SYNC_FLUSH); + + if ((compressed_buffer_.size() - zstrm_.avail_out) > 0 && !fwrite(compressed_buffer_.data(), compressed_buffer_.size() - zstrm_.avail_out, 1, fp_)) + { + // TODO: handle error. + return -1; + } + zstrm_.next_out = compressed_buffer_.data(); + zstrm_.avail_out = static_cast(compressed_buffer_.size()); + + } + + if (zlib_res_ != Z_OK) + return -1; + + assert(zstrm_.avail_in == 0); + setp((char*) decompressed_buffer_.data(), (char*) decompressed_buffer_.data() + decompressed_buffer_.size()); + } + + return 0; + } + + private: + static const std::size_t default_block_size = 64 * 1024; + std::vector compressed_buffer_; + std::vector decompressed_buffer_; + z_stream zstrm_; + FILE* fp_; + int zlib_res_; + }; + + class istream : public std::istream + { + public: + istream(const std::string& file_path) + : + std::istream(&sbuf_), + sbuf_(file_path) + { + } +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + istream(istream&& src) + : + std::istream(&sbuf_), + sbuf_(std::move(src.sbuf_)) + { + } + + istream& operator=(istream&& src) + { + if (&src != this) + { + std::istream::operator=(std::move(src)); + sbuf_ = std::move(src.sbuf_); + } + return *this; + } +#endif + private: + ::shrinkwrap::gz::ibuf sbuf_; + }; + + + + class ostream : public std::ostream + { + public: + ostream(const std::string& file_path) + : + std::ostream(&sbuf_), + sbuf_(file_path) + { + } +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + ostream(ostream&& src) + : + std::ostream(&sbuf_), + sbuf_(std::move(src.sbuf_)) + { + } + + ostream& operator=(ostream&& src) + { + if (&src != this) + { + std::ostream::operator=(std::move(src)); + sbuf_ = std::move(src.sbuf_); + } + return *this; + } +#endif + private: + ::shrinkwrap::gz::obuf sbuf_; + }; + } + + namespace bgz + { + class ibuf : public gz::ibuf + { + public: + using gz::ibuf::ibuf; +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + ibuf(ibuf&& src) + : + gz::ibuf(std::move(src)) + { + } + + ibuf& operator=(ibuf&& src) + { + if (&src != this) + { + ibuf::operator=(std::move(src)); + } + + return *this; + } +#endif + + virtual ~ibuf() + { + } + + protected: + virtual std::streambuf::pos_type seekoff(std::streambuf::off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) // Supports tellg for virtual offset. + { + if (off == 0 && way == std::ios::cur) + { + if (egptr() - gptr() == 0 && zlib_res_ == Z_STREAM_END) + { + std::uint64_t compressed_offset = std::size_t(ftell(fp_)) - zstrm_.avail_in; + std::uint16_t uncompressed_offset = 0; + std::uint64_t virtual_offset = ((compressed_offset << 16) | uncompressed_offset); + return pos_type(off_type(virtual_offset)); + } + else + { + std::uint64_t compressed_offset = current_block_position_; + std::uint16_t uncompressed_offset = (std::uint16_t(uncompressed_block_offset_) - std::uint16_t(egptr() - gptr())) + discard_amount_; + std::uint64_t virtual_offset = ((compressed_offset << 16) | uncompressed_offset); + return pos_type(off_type(virtual_offset)); + } + } + return pos_type(off_type(-1)); + } + + //coffset << 16 | uoffset + virtual std::streambuf::pos_type seekpos(std::streambuf::pos_type pos, std::ios_base::openmode which) + { + std::uint64_t compressed_offset = ((static_cast(pos) >> 16) & 0x0000FFFFFFFFFFFF); + std::uint16_t uncompressed_offset = (std::uint16_t) (static_cast(pos) & 0x000000000000FFFF); + + if (fp_ == 0 || sync()) + return pos_type(off_type(-1)); + + long seek_amount = static_cast(compressed_offset); + if (fseek(fp_, seek_amount, SEEK_SET)) + return pos_type(off_type(-1)); + + current_block_position_ = compressed_offset; + discard_amount_ = uncompressed_offset; + + zstrm_.next_in = nullptr; + zstrm_.avail_in = 0; + zlib_res_ = inflateReset(&zstrm_); + char* end = egptr(); + setg(end, end, end); + + return pos; + } + }; + + class obuf : public std::streambuf + { + public: + obuf(FILE* fp) + : + zstrm_({0}), + fp_(fp), + compressed_buffer_(default_block_size), + decompressed_buffer_(default_block_size) + { + if (!fp_) + { + char* end = ((char*) decompressed_buffer_.data()) + decompressed_buffer_.size(); + setp(end, end); + } + else + { + zlib_res_ = deflateInit2(&zstrm_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, (15 | 16), 8, Z_DEFAULT_STRATEGY); // |16 for GZIP + if (zlib_res_ != Z_OK) + { + // TODO: handle error. + } + + zstrm_.next_out = compressed_buffer_.data(); + zstrm_.avail_out = static_cast(compressed_buffer_.size()); + + char* end = ((char*) decompressed_buffer_.data()) + decompressed_buffer_.size(); + setp((char*) decompressed_buffer_.data(), end); + } + } + + obuf(const std::string& file_path) : obuf(fopen(file_path.c_str(), "wb")) {} +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + obuf(obuf&& src) + : + std::streambuf(std::move(src)) + { + this->move(std::move(src)); + } + + obuf& operator=(obuf&& src) + { + if (&src != this) + { + std::streambuf::operator=(std::move(src)); + this->close(); + this->move(std::move(src)); + } + + return *this; + } +#endif + virtual ~obuf() + { + this->close(); + } + + private: + void move(obuf&& src) + { + compressed_buffer_ = std::move(src.compressed_buffer_); + decompressed_buffer_ = std::move(src.decompressed_buffer_); + zstrm_ = src.zstrm_; + fp_ = src.fp_; + src.fp_ = nullptr; + zlib_res_ = src.zlib_res_; + } + + void close() + { + if (fp_) + { + sync(); + int res = deflateEnd(&zstrm_); + if (zlib_res_ == Z_OK) + zlib_res_ = res; + fclose(fp_); + fp_ = nullptr; + } + } + protected: + virtual int overflow(int c) + { + if (!fp_) + return traits_type::eof(); + + if ((epptr() - pptr()) > 0) + { + assert(!"Put buffer not empty, this should never happen"); + this->sputc(static_cast(0xFF & c)); + } + else + { + zstrm_.next_in = decompressed_buffer_.data(); + zstrm_.avail_in = static_cast(decompressed_buffer_.size()); + while (zlib_res_ == Z_OK) + { + zlib_res_ = deflate(&zstrm_, Z_FINISH); + + if (!fwrite(compressed_buffer_.data(), compressed_buffer_.size() - zstrm_.avail_out, 1, fp_)) + { + // TODO: handle error. + return traits_type::eof(); + } + zstrm_.next_out = compressed_buffer_.data(); + zstrm_.avail_out = static_cast(compressed_buffer_.size()); + + } + + if (zlib_res_ == Z_STREAM_END) + zlib_res_ = deflateReset(&zstrm_); + + assert(zstrm_.avail_in == 0); + decompressed_buffer_[0] = reinterpret_cast(c); + setp((char*) decompressed_buffer_.data() + 1, (char*) decompressed_buffer_.data() + decompressed_buffer_.size()); + } + + return (zlib_res_ == Z_OK ? traits_type::to_int_type(c) : traits_type::eof()); + } + + virtual int sync() + { + if (!fp_) + return -1; + + zstrm_.next_in = decompressed_buffer_.data(); + zstrm_.avail_in = static_cast(decompressed_buffer_.size() - (epptr() - pptr())); + if (zstrm_.avail_in) + { + while (zlib_res_ == Z_OK) + { + zlib_res_ = deflate(&zstrm_, Z_FINISH); + + if (!fwrite(compressed_buffer_.data(), compressed_buffer_.size() - zstrm_.avail_out, 1, fp_)) + { + // TODO: handle error. + return -1; + } + zstrm_.next_out = compressed_buffer_.data(); + zstrm_.avail_out = static_cast(compressed_buffer_.size()); + + } + + if (zlib_res_ == Z_STREAM_END) + zlib_res_ = deflateReset(&zstrm_); + + if (zlib_res_ != Z_OK) + return -1; + + assert(zstrm_.avail_in == 0); + setp((char*) decompressed_buffer_.data(), (char*) decompressed_buffer_.data() + decompressed_buffer_.size()); + } + + return 0; + } + + private: + static const std::size_t default_block_size = 64 * 1024; + std::vector compressed_buffer_; + std::vector decompressed_buffer_; + z_stream zstrm_; + FILE* fp_; + int zlib_res_; + }; + + class istream : public std::istream + { + public: + istream(const std::string& file_path) + : + std::istream(&sbuf_), + sbuf_(file_path) + { + } +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + istream(istream&& src) + : + std::istream(&sbuf_), + sbuf_(std::move(src.sbuf_)) + { + } + + istream& operator=(istream&& src) + { + if (&src != this) + { + std::istream::operator=(std::move(src)); + sbuf_ = std::move(src.sbuf_); + } + return *this; + } +#endif + private: + ::shrinkwrap::bgz::ibuf sbuf_; + }; + + class ostream : public std::ostream + { + public: + ostream(const std::string& file_path) + : + std::ostream(&sbuf_), + sbuf_(file_path) + { + } +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + ostream(ostream&& src) + : + std::ostream(&sbuf_), + sbuf_(std::move(src.sbuf_)) + { + } + + ostream& operator=(ostream&& src) + { + if (&src != this) + { + std::ostream::operator=(std::move(src)); + sbuf_ = std::move(src.sbuf_); + } + return *this; + } +#endif + private: + ::shrinkwrap::bgz::obuf sbuf_; + }; + } +} + +#endif //SHRINKWRAP_GZ_HPP \ No newline at end of file diff --git a/include/shrinkwrap/istream.hpp b/include/shrinkwrap/istream.hpp new file mode 100644 index 0000000..5bacbba --- /dev/null +++ b/include/shrinkwrap/istream.hpp @@ -0,0 +1,76 @@ +#ifndef SHRINKWRAP_ISTREAM_HPP +#define SHRINKWRAP_ISTREAM_HPP + +#include "xz.hpp" +#include "gz.hpp" +#include "zstd.hpp" + +#include +#include + +namespace shrinkwrap +{ + namespace detail + { + template + std::unique_ptr make_unique(Args&& ...args) + { + return std::unique_ptr(new T(std::forward(args)...)); + } + } + + class istream : public std::istream + { + public: + istream(const std::string& file_path) + : + std::istream(nullptr) + { + FILE* fp = fopen(file_path.c_str(), "rb"); + + int first_byte = fgetc(fp); + ungetc(first_byte, fp); + + switch (char(first_byte)) + { + case '\x1F': + sbuf_ = detail::make_unique<::shrinkwrap::bgz::ibuf>(fp); + break; + case char('\xFD'): + sbuf_ = detail::make_unique<::shrinkwrap::xz::ibuf>(fp); + break; + case '\x28': + sbuf_ = detail::make_unique<::shrinkwrap::zstd::ibuf>(fp); + break; + default: + throw std::runtime_error("raw files not yet supported."); + + } + + this->rdbuf(sbuf_.get()); + } + +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + istream(istream&& src) + : + std::istream(src.sbuf_.get()), + sbuf_(std::move(src.sbuf_)) + { + } + + istream& operator=(istream&& src) + { + if (&src != this) + { + std::istream::operator=(std::move(src)); + sbuf_ = std::move(src.sbuf_); + } + return *this; + } +#endif + private: + std::unique_ptr sbuf_; + }; +} + +#endif //SHRINKWRAP_ISTREAM_HPP \ No newline at end of file diff --git a/include/shrinkwrap/xz.hpp b/include/shrinkwrap/xz.hpp new file mode 100644 index 0000000..1983573 --- /dev/null +++ b/include/shrinkwrap/xz.hpp @@ -0,0 +1,588 @@ +#ifndef SHRINKWRAP_XZ_HPP +#define SHRINKWRAP_XZ_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace shrinkwrap +{ + namespace xz + { + class ibuf : public std::streambuf + { + public: + ibuf(FILE* fp) + : + decoded_position_(0), + discard_amount_(0), + fp_(fp), + put_back_size_(0), + lzma_index_(nullptr), + at_block_boundary_(true), + lzma_block_decoder_(LZMA_STREAM_INIT) + { + if (fp_) + { + fread(stream_header_.data(), stream_header_.size(), 1, fp_); // TODO: handle error. + lzma_res_ = lzma_stream_header_decode(&stream_header_flags_, stream_header_.data()); + if (lzma_res_ != LZMA_OK) + { + // TODO: handle error. + } + } + char* end = ((char*) decompressed_buffer_.data()) + decompressed_buffer_.size(); + setg(end, end, end); + } + + ibuf(const std::string& file_path) :ibuf(fopen(file_path.c_str(), "rb")) {} + +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + ibuf(ibuf&& src) + : + std::streambuf(std::move(src)) + { + this->move(std::move(src)); + } + + ibuf& operator=(ibuf&& src) + { + if (&src != this) + { + std::streambuf::operator=(std::move(src)); + this->destroy(); + this->move(std::move(src)); + } + + return *this; + } +#endif + + virtual ~ibuf() + { + this->destroy(); + } + + protected: + virtual std::streambuf::int_type underflow() + { + if (!fp_) + return traits_type::eof(); + if (gptr() < egptr()) // buffer not exhausted + return traits_type::to_int_type(*gptr()); + + while (gptr() >= egptr() && lzma_res_ == LZMA_OK) + { + lzma_block_decoder_.next_out = decompressed_buffer_.data(); + lzma_block_decoder_.avail_out = decompressed_buffer_.size(); + + if (at_block_boundary_) + { + std::vector block_header(LZMA_BLOCK_HEADER_SIZE_MAX); + if (lzma_block_decoder_.avail_in == 0 && !feof(fp_)) + { + replenish_compressed_buffer(); + } + // TODO: make sure avail_in is greater than 0; + std::memcpy(block_header.data(), lzma_block_decoder_.next_in, 1); + ++(lzma_block_decoder_.next_in); + --(lzma_block_decoder_.avail_in); + + if (block_header[0] == 0x00) + { + // Index indicator found + lzma_res_ = LZMA_STREAM_END; + } + else + { + lzma_block_.version = 0; + lzma_block_.check = stream_header_flags_.check; + lzma_block_.filters = lzma_block_filters_buf_.data(); + lzma_block_.header_size = lzma_block_header_size_decode (block_header[0]); + + std::size_t bytes_already_copied = 0; + if (lzma_block_decoder_.avail_in < (lzma_block_.header_size - 1)) + { + bytes_already_copied = lzma_block_decoder_.avail_in; + std::memcpy(&block_header[1], lzma_block_decoder_.next_in, bytes_already_copied); + lzma_block_decoder_.avail_in -= bytes_already_copied; + lzma_block_decoder_.next_in += bytes_already_copied; + assert(lzma_block_decoder_.avail_in == 0); + replenish_compressed_buffer(); + } + + // TODO: make sure avail_in is greater than (lzma_block_.header_size - 1) - bytes_already_copied. + std::size_t bytes_left_to_copy = (lzma_block_.header_size - 1) - bytes_already_copied; + std::memcpy(&block_header[1 + bytes_already_copied], lzma_block_decoder_.next_in, bytes_left_to_copy); + lzma_block_decoder_.avail_in -= bytes_left_to_copy; + lzma_block_decoder_.next_in += bytes_left_to_copy; + + lzma_res_ = lzma_block_header_decode(&lzma_block_, nullptr, block_header.data()); + if (lzma_res_ != LZMA_OK) + { + // TODO: handle error. + } + else + { + lzma_res_ = lzma_block_decoder(&lzma_block_decoder_, &lzma_block_); + // TODO: handle error. + } + } + at_block_boundary_ = false; + } + + if (lzma_res_ == LZMA_OK) + { + if (lzma_block_decoder_.avail_in == 0 && !feof(fp_)) + { + replenish_compressed_buffer(); + } + + assert(lzma_block_decoder_.avail_in > 0); + + lzma_ret r = lzma_code(&lzma_block_decoder_, LZMA_RUN); + if (r == LZMA_STREAM_END) + { + // End of block. + at_block_boundary_ = true; + r = LZMA_OK; + } + lzma_res_ = r; + } + + char* start = ((char*) decompressed_buffer_.data()); + setg(start, start, start + (decompressed_buffer_.size() - lzma_block_decoder_.avail_out)); + decoded_position_ += (egptr() - gptr()); + + if (discard_amount_ > 0) + { + std::uint64_t advance_amount = discard_amount_; + if ((egptr() - gptr()) < advance_amount) + advance_amount = (egptr() - gptr()); + setg(start, gptr() + advance_amount, egptr()); + discard_amount_ -= advance_amount; + } + } + + if (lzma_res_ == LZMA_STREAM_END && gptr() >= egptr()) + return traits_type::eof(); + else if (lzma_res_ != LZMA_OK && lzma_res_ != LZMA_STREAM_END) + return traits_type::eof(); + + return traits_type::to_int_type(*gptr()); + } + + virtual std::streambuf::pos_type seekoff(std::streambuf::off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) + { + std::uint64_t current_position = decoded_position_ - (egptr() - gptr()); + current_position += discard_amount_; // TODO: overflow check. + + pos_type pos{off_type(current_position)}; + + if (off == 0 && way == std::ios::cur) + return pos; // Supports tellg for streams that can't seek. + + if (way == std::ios::cur) + { + pos = pos + off; + } + else if (way == std::ios::end) + { + if (!lzma_index_) + { + if (!init_index()) + return pos_type(off_type(-1)); + } + + pos = pos_type(lzma_index_uncompressed_size(lzma_index_)) + off; + } + else + { + pos = off; + } + + return seekpos(pos, which); + } + + virtual std::streambuf::pos_type seekpos(std::streambuf::pos_type pos, std::ios_base::openmode which) + { + if (fp_ == 0 || sync()) + return pos_type(off_type(-1)); + + if (!lzma_index_) //stream_flags_.backward_size == LZMA_VLI_UNKNOWN) + { + if (!init_index()) + return pos_type(off_type(-1)); + } + + if (lzma_index_iter_locate(&lzma_index_itr_, (std::uint64_t) off_type(pos))) // Returns true on failure. + return pos_type(off_type(-1)); + + long seek_amount = (lzma_index_itr_.block.compressed_file_offset > std::numeric_limits::max() ? std::numeric_limits::max() : static_cast(lzma_index_itr_.block.compressed_file_offset)); + if (fseek(fp_, seek_amount, SEEK_SET)) + return pos_type(off_type(-1)); + + discard_amount_ = off_type(pos) - lzma_index_itr_.block.uncompressed_file_offset; + decoded_position_ = lzma_index_itr_.block.uncompressed_file_offset; + + at_block_boundary_ = true; + lzma_block_decoder_.next_in = nullptr; + lzma_block_decoder_.avail_in = 0; + char* end = ((char*) decompressed_buffer_.data()) + decompressed_buffer_.size(); + setg(end, end, end); + + return pos; + } + + private: + //ixzbuf(const ixzbuf& src) = delete; + //ixzbuf& operator=(const ixzbuf& src) = delete; + + void destroy() + { + if (lzma_block_decoder_.internal) + lzma_end(&lzma_block_decoder_); + if (lzma_index_) + lzma_index_end(lzma_index_, nullptr); + if (fp_) + fclose(fp_); + } + + void move(ibuf&& src) + { + stream_header_flags_ = src.stream_header_flags_; + stream_footer_flags_ = src.stream_footer_flags_; + lzma_block_decoder_ = src.lzma_block_decoder_; + if (src.lzma_block_decoder_.internal) + src.lzma_block_decoder_.internal = nullptr; + lzma_block_ = src.lzma_block_; + lzma_block_filters_buf_ = src.lzma_block_filters_buf_; // TODO: handle filter.options + lzma_index_itr_ = src.lzma_index_itr_; // lzma_index_iter_init() doesn't allocate any memory, thus there is no lzma_index_iter_end(). + stream_header_ = src.stream_header_; + stream_footer_ = src.stream_footer_; + compressed_buffer_ = src.compressed_buffer_; + decompressed_buffer_ = src.decompressed_buffer_; + decoded_position_ = src.decoded_position_; + discard_amount_ = src.discard_amount_; + fp_ = src.fp_; + if (src.fp_) + src.fp_ = nullptr; + put_back_size_ = src.put_back_size_; + lzma_index_ = src.lzma_index_; + if (src.lzma_index_) + src.lzma_index_ = nullptr; + lzma_res_ = src.lzma_res_; + at_block_boundary_ = src.at_block_boundary_; + } + + void replenish_compressed_buffer() + { + lzma_block_decoder_.next_in = compressed_buffer_.data(); + lzma_block_decoder_.avail_in = fread(compressed_buffer_.data(), 1, compressed_buffer_.size(), fp_); + } + + bool init_index() + { + if (!fp_) + return false; + + if (fseek(fp_, -12, SEEK_END) || !fread(stream_footer_.data(), 12, 1, fp_)) + return false; + + if (lzma_stream_footer_decode(&stream_footer_flags_, stream_footer_.data()) != LZMA_OK) + return false; + + std::vector index_raw(stream_footer_flags_.backward_size); + if (fseek(fp_, -(stream_footer_flags_.backward_size + 12), SEEK_END) || !fread(index_raw.data(), index_raw.size(), 1, fp_)) + return false; + + std::uint64_t memlimit = UINT64_MAX; + size_t in_pos = 0; + auto res = lzma_index_buffer_decode(&lzma_index_, &memlimit, nullptr, index_raw.data(), &in_pos, index_raw.size()); + if (res != LZMA_OK) + return false; + + lzma_index_iter_init(&lzma_index_itr_, lzma_index_); + + return true; + } + + private: + lzma_stream_flags stream_header_flags_; + lzma_stream_flags stream_footer_flags_; + lzma_stream lzma_block_decoder_; + lzma_block lzma_block_; + std::array lzma_block_filters_buf_; + lzma_index_iter lzma_index_itr_; + std::array stream_header_; + std::array stream_footer_; + std::array= LZMA_BLOCK_HEADER_SIZE_MAX ? BUFSIZ : LZMA_BLOCK_HEADER_SIZE_MAX)> compressed_buffer_; + std::array= LZMA_BLOCK_HEADER_SIZE_MAX ? BUFSIZ : LZMA_BLOCK_HEADER_SIZE_MAX)> decompressed_buffer_; + std::uint64_t decoded_position_; + std::uint64_t discard_amount_; + FILE* fp_; + std::size_t put_back_size_; + lzma_index* lzma_index_; + lzma_ret lzma_res_; + bool at_block_boundary_; + }; + + class obuf : public std::streambuf + { + public: + obuf(FILE* fp) + : + fp_(fp) + { + if (!fp_) + { + char* end = ((char*) decompressed_buffer_.data()) + decompressed_buffer_.size(); + setp(end, end); + } + else + { + lzma_stream_encoder_ = LZMA_STREAM_INIT; + + lzma_res_ = lzma_easy_encoder(&lzma_stream_encoder_, LZMA_PRESET_DEFAULT, LZMA_CHECK_CRC64); + if (lzma_res_ != LZMA_OK) + { + // TODO: handle error. + } + + lzma_stream_encoder_.next_out = compressed_buffer_.data(); + lzma_stream_encoder_.avail_out = compressed_buffer_.size(); + + char* end = ((char*) decompressed_buffer_.data()) + decompressed_buffer_.size(); + setp((char*) decompressed_buffer_.data(), end); + } + } + + obuf(const std::string& file_path) : obuf(fopen(file_path.c_str(), "wb")) {} + +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + obuf(obuf&& src) + : + std::streambuf(std::move(src)) + { + this->move(std::move(src)); + } + + obuf& operator=(obuf&& src) + { + if (&src != this) + { + std::streambuf::operator=(std::move(src)); + this->close(); + this->move(std::move(src)); + } + + return *this; + } +#endif + virtual ~obuf() + { + this->close(); + } + + protected: + + virtual int overflow(int c) + { + if (!fp_) + return traits_type::eof(); + + if ((epptr() - pptr()) > 0) + { + assert(!"Put buffer not empty, this should never happen"); + this->sputc(static_cast(0xFF & c)); + } + else + { + lzma_stream_encoder_.next_in = decompressed_buffer_.data(); + lzma_stream_encoder_.avail_in = decompressed_buffer_.size(); + while (lzma_res_ == LZMA_OK && lzma_stream_encoder_.avail_in > 0) + { + lzma_res_ = lzma_code(&lzma_stream_encoder_, LZMA_RUN); + if (lzma_stream_encoder_.avail_out == 0 || lzma_res_ == LZMA_STREAM_END) + { + if (!fwrite(compressed_buffer_.data(), compressed_buffer_.size() - lzma_stream_encoder_.avail_out, 1, fp_)) + { + // TODO: handle error. + return traits_type::eof(); + } + lzma_stream_encoder_.next_out = compressed_buffer_.data(); + lzma_stream_encoder_.avail_out = compressed_buffer_.size(); + } + } + + if (lzma_res_ == LZMA_STREAM_END) + lzma_res_ = LZMA_OK; + + assert(lzma_stream_encoder_.avail_in == 0); + decompressed_buffer_[0] = reinterpret_cast(c); + setp((char*) decompressed_buffer_.data() + 1, (char*) decompressed_buffer_.data() + decompressed_buffer_.size()); + } + + return (lzma_res_ == LZMA_OK ? traits_type::to_int_type(c) : traits_type::eof()); + } + + virtual int sync() + { + if (!fp_) + return -1; + + lzma_stream_encoder_.next_in = decompressed_buffer_.data(); + lzma_stream_encoder_.avail_in = decompressed_buffer_.size() - (epptr() - pptr()); + if (lzma_stream_encoder_.avail_in) + { + while (lzma_res_ == LZMA_OK) + { + lzma_res_ = lzma_code(&lzma_stream_encoder_, LZMA_FULL_FLUSH); + if (lzma_stream_encoder_.avail_out == 0 || (lzma_res_ == LZMA_STREAM_END && compressed_buffer_.size() != lzma_stream_encoder_.avail_out)) + { + if (!fwrite(compressed_buffer_.data(), compressed_buffer_.size() - lzma_stream_encoder_.avail_out, 1, fp_)) + { + // TODO: handle error. + return -1; + } + lzma_stream_encoder_.next_out = compressed_buffer_.data(); + lzma_stream_encoder_.avail_out = compressed_buffer_.size(); + } + } + + if (lzma_res_ == LZMA_STREAM_END) + lzma_res_ = LZMA_OK; + + if (lzma_res_ != LZMA_OK) + return -1; + + assert(lzma_stream_encoder_.avail_in == 0); + setp((char*) decompressed_buffer_.data(), (char*) decompressed_buffer_.data() + decompressed_buffer_.size()); + } + + return 0; + } + + private: + + void move(obuf&& src) + { + compressed_buffer_ = src.compressed_buffer_; + decompressed_buffer_ = src.decompressed_buffer_; + lzma_stream_encoder_ = src.lzma_stream_encoder_; + if (src.lzma_stream_encoder_.internal) + src.lzma_stream_encoder_.internal = nullptr; + fp_ = src.fp_; + if (src.fp_) + src.fp_ = nullptr; + lzma_res_ = src.lzma_res_; + } + + void close() + { + if (lzma_stream_encoder_.internal) + { + lzma_stream_encoder_.next_in = decompressed_buffer_.data(); + lzma_stream_encoder_.avail_in = decompressed_buffer_.size() - (epptr() - pptr()); + while (lzma_res_ == LZMA_OK) + { + lzma_res_ = lzma_code(&lzma_stream_encoder_, LZMA_FINISH); + if (lzma_stream_encoder_.avail_out == 0 || (lzma_res_ == LZMA_STREAM_END && compressed_buffer_.size() != lzma_stream_encoder_.avail_out)) + { + if (!fwrite(compressed_buffer_.data(), compressed_buffer_.size() - lzma_stream_encoder_.avail_out, 1, fp_)) + { + break; + } + lzma_stream_encoder_.next_out = compressed_buffer_.data(); + lzma_stream_encoder_.avail_out = compressed_buffer_.size(); + } + } + lzma_end(&lzma_stream_encoder_); + } + + if (fp_) + fclose(fp_); + } + + private: + static const std::size_t default_block_size = 1024; //4 * 1024 * 1024; + std::array= LZMA_BLOCK_HEADER_SIZE_MAX ? default_block_size : LZMA_BLOCK_HEADER_SIZE_MAX)> compressed_buffer_; + std::array= LZMA_BLOCK_HEADER_SIZE_MAX ? default_block_size : LZMA_BLOCK_HEADER_SIZE_MAX)> decompressed_buffer_; + lzma_stream lzma_stream_encoder_; + FILE* fp_; + lzma_ret lzma_res_; + }; + + class istream : public std::istream + { + public: + istream(const std::string& file_path) + : + std::istream(&sbuf_), + sbuf_(file_path) + { + } + +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + istream(istream&& src) + : + std::istream(&sbuf_), + sbuf_(std::move(src.sbuf_)) + { + } + + istream& operator=(istream&& src) + { + if (&src != this) + { + std::istream::operator=(std::move(src)); + sbuf_ = std::move(src.sbuf_); + } + return *this; + } +#endif + private: + ::shrinkwrap::xz::ibuf sbuf_; + }; + + class ostream : public std::ostream + { + public: + ostream(const std::string& file_path) + : + std::ostream(&sbuf_), + sbuf_(file_path) + { + } + +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + ostream(ostream&& src) + : + std::ostream(&sbuf_), + sbuf_(std::move(src.sbuf_)) + { + } + + ostream& operator=(ostream&& src) + { + if (&src != this) + { + std::ostream::operator=(std::move(src)); + sbuf_ = std::move(src.sbuf_); + } + return *this; + } +#endif + private: + ::shrinkwrap::xz::obuf sbuf_; + }; + } +} + +#endif //SHRINKWRAP_XZ_HPP \ No newline at end of file diff --git a/include/shrinkwrap/zstd.hpp b/include/shrinkwrap/zstd.hpp new file mode 100644 index 0000000..854a4e0 --- /dev/null +++ b/include/shrinkwrap/zstd.hpp @@ -0,0 +1,420 @@ +#ifndef SHRINKWRAP_ZSTD_HPP +#define SHRINKWRAP_ZSTD_HPP + +#ifndef ZSTD_STATIC_LINKING_ONLY +#define ZSTD_STATIC_LINKING_ONLY +#endif + +#include + +#include +#include +#include + +namespace shrinkwrap +{ + namespace zstd + { + class ibuf : public std::streambuf + { + public: + ibuf(FILE* fp) + : + strm_(ZSTD_createDStream()), + input_({0}), + compressed_buffer_(ZSTD_DStreamInSize()), + decompressed_buffer_(ZSTD_DStreamOutSize()), + current_block_position_(0), + fp_(fp) + { + if (fp_) + { + res_ = ZSTD_initDStream(strm_); // 16 for GZIP only. + if (ZSTD_isError(res_)) + { + // TODO: handle error. + } + } + char* end = ((char*) decompressed_buffer_.data()) + decompressed_buffer_.size(); + setg(end, end, end); + } + + ibuf(const std::string& file_path) : ibuf(fopen(file_path.c_str(), "rb")) {} + +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + ibuf(ibuf&& src) + : + std::streambuf(std::move(src)) + { + this->move(std::move(src)); + } + + ibuf& operator=(ibuf&& src) + { + if (&src != this) + { + std::streambuf::operator=(std::move(src)); + this->destroy(); + this->move(std::move(src)); + } + + return *this; + } +#endif + + virtual ~ibuf() + { + this->destroy(); + } + + private: + //ixzbuf(const ixzbuf& src) = delete; + //ixzbuf& operator=(const ixzbuf& src) = delete; + + void destroy() + { + if (fp_) + { + ZSTD_freeDStream(strm_); + fclose(fp_); + } + } + + void move(ibuf&& src) + { + strm_ = src.strm_; + src.strm_ = nullptr; + compressed_buffer_ = std::move(src.compressed_buffer_); + decompressed_buffer_ = std::move(src.decompressed_buffer_); + current_block_position_ = src.current_block_position_; + fp_ = src.fp_; + src.fp_ = nullptr; + res_ = src.res_; + input_ = src.input_; + } + + void replenish_compressed_buffer() + { + input_ = {compressed_buffer_.data(), fread(compressed_buffer_.data(), 1, compressed_buffer_.size(), fp_), 0 }; + } + + protected: + + virtual std::streambuf::int_type underflow() + { + if (!fp_) + return traits_type::eof(); + if (gptr() < egptr()) // buffer not exhausted + return traits_type::to_int_type(*gptr()); + + while (!ZSTD_isError(res_) && gptr() >= egptr() && (input_.pos < input_.size || !feof(fp_))) + { + if (input_.pos == input_.size && !feof(fp_)) + { + replenish_compressed_buffer(); + } + + if (res_ == 0 && input_.pos < input_.size) + { + res_ = ZSTD_resetDStream(strm_); + current_block_position_ = std::size_t(ftell(fp_)) - (input_.size - input_.pos); + } + + ZSTD_outBuffer output = {decompressed_buffer_.data(), decompressed_buffer_.size(), 0}; + res_ = ZSTD_decompressStream(strm_, &output , &input_); + + if (!ZSTD_isError(res_)) + { + char* start = ((char*) decompressed_buffer_.data()); + setg(start, start, start + output.pos); + } + } + + if (ZSTD_isError(res_)) + return traits_type::eof(); + else if (gptr() >= egptr()) + return traits_type::eof(); + + return traits_type::to_int_type(*gptr()); + } + + virtual std::streambuf::pos_type seekoff(std::streambuf::off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) + { + if (off == 0 && way == std::ios::cur) + { + if (egptr() - gptr() == 0 && res_ == 0) + { + std::uint64_t compressed_offset = std::size_t(ftell(fp_)) - (input_.size - input_.pos); + return pos_type(off_type(compressed_offset)); + } + else + { + std::uint64_t compressed_offset = current_block_position_; + return pos_type(off_type(compressed_offset)); + } + } + return pos_type(off_type(-1)); + } + + virtual std::streambuf::pos_type seekpos(std::streambuf::pos_type pos, std::ios_base::openmode which) + { + std::uint64_t compressed_offset = static_cast(pos); + + if (fp_ == 0 || sync()) + return pos_type(off_type(-1)); + + long seek_amount = static_cast(compressed_offset); + if (fseek(fp_, seek_amount, SEEK_SET)) + return pos_type(off_type(-1)); + + input_.src = nullptr; + input_.pos = 0; + input_.size = 0; + res_ = 0; + char* end = egptr(); + setg(end, end, end); + + return pos; + } + + private: + std::vector compressed_buffer_; + std::vector decompressed_buffer_; + ZSTD_DStream* strm_; + ZSTD_inBuffer input_; + FILE* fp_; + std::size_t res_; + std::size_t current_block_position_; + }; + + class obuf : public std::streambuf + { + public: + obuf(FILE* fp, int compression_level) + : + strm_(ZSTD_createCStream()), + fp_(fp), + compressed_buffer_(ZSTD_CStreamOutSize()), + decompressed_buffer_(ZSTD_CStreamInSize()), + res_(0) + { + if (!fp_) + { + char* end = ((char*) decompressed_buffer_.data()) + decompressed_buffer_.size(); + setp(end, end); + } + else + { + res_ = ZSTD_initCStream(strm_, compression_level); + if (ZSTD_isError(res_)) + { + // TODO: handle error. + } + + char* end = ((char*) decompressed_buffer_.data()) + decompressed_buffer_.size(); + setp((char*) decompressed_buffer_.data(), end); + } + } + + obuf(const std::string& file_path, int compression_level = 3) : obuf(fopen(file_path.c_str(), "wb"), compression_level) {} + +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + obuf(obuf&& src) + : + std::streambuf(std::move(src)) + { + this->move(std::move(src)); + } + + obuf& operator=(obuf&& src) + { + if (&src != this) + { + std::streambuf::operator=(std::move(src)); + this->close(); + this->move(std::move(src)); + } + + return *this; + } +#endif + + virtual ~obuf() + { + this->close(); + } + + private: + void move(obuf&& src) + { + compressed_buffer_ = std::move(src.compressed_buffer_); + decompressed_buffer_ = std::move(src.decompressed_buffer_); + strm_ = src.strm_; + fp_ = src.fp_; + src.fp_ = nullptr; + res_ = src.res_; + } + + void close() + { + if (fp_) + { + sync(); + res_ = ZSTD_freeCStream(strm_); + fclose(fp_); + fp_ = nullptr; + } + } + protected: + virtual int overflow(int c) + { + if (!fp_) + return traits_type::eof(); + + if ((epptr() - pptr()) > 0) + { + assert(!"Put buffer not empty, this should never happen"); + this->sputc(static_cast(0xFF & c)); + } + else + { + ZSTD_inBuffer input = {decompressed_buffer_.data(), decompressed_buffer_.size(), 0}; + while (!ZSTD_isError(res_) && input.pos < input.size) + { + ZSTD_outBuffer output = {compressed_buffer_.data(), compressed_buffer_.size(), 0}; + res_ = ZSTD_compressStream(strm_, &output, &input); + + if (output.pos && !fwrite(compressed_buffer_.data(), output.pos, 1, fp_)) + { + // TODO: handle error. + return traits_type::eof(); + } + } + + decompressed_buffer_[0] = reinterpret_cast(c); + setp((char*) decompressed_buffer_.data() + 1, (char*) decompressed_buffer_.data() + decompressed_buffer_.size()); + } + + return (!ZSTD_isError(res_) ? traits_type::to_int_type(c) : traits_type::eof()); + } + + virtual int sync() + { + if (!fp_) + return -1; + + ZSTD_inBuffer input = {decompressed_buffer_.data(), (decompressed_buffer_.size() - (epptr() - pptr())), 0}; + + if (input.pos < input.size) + { + while (!ZSTD_isError(res_) && input.pos < input.size) + { + ZSTD_outBuffer output = {compressed_buffer_.data(), compressed_buffer_.size(), 0}; + res_ = ZSTD_compressStream(strm_, &output, &input); + + if (output.pos && !fwrite(compressed_buffer_.data(), output.pos, 1, fp_)) + { + // TODO: handle error. + return -1; + } + } + + while (!ZSTD_isError(res_) && res_ != 0) + { + ZSTD_outBuffer output = {compressed_buffer_.data(), compressed_buffer_.size(), 0}; + res_ = ZSTD_endStream(strm_, &output); + if (output.pos && !fwrite(compressed_buffer_.data(), output.pos, 1, fp_)) + { + // TODO: handle error. + return -1; + } + } + + if (ZSTD_isError(res_)) + return -1; + + res_ = ZSTD_resetCStream(strm_, 0); + + setp((char*) decompressed_buffer_.data(), (char*) decompressed_buffer_.data() + decompressed_buffer_.size()); + } + + return 0; + } + + private: + std::vector compressed_buffer_; + std::vector decompressed_buffer_; + ZSTD_CStream* strm_; + FILE* fp_; + std::size_t res_; + }; + + class istream : public std::istream + { + public: + istream(const std::string& file_path) + : + std::istream(&sbuf_), + sbuf_(file_path) + { + } + +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + istream(istream&& src) + : + std::istream(&sbuf_), + sbuf_(std::move(src.sbuf_)) + { + } + + istream& operator=(istream&& src) + { + if (&src != this) + { + std::istream::operator=(std::move(src)); + sbuf_ = std::move(src.sbuf_); + } + return *this; + } +#endif + private: + ::shrinkwrap::zstd::ibuf sbuf_; + }; + + + + class ostream : public std::ostream + { + public: + ostream(const std::string& file_path) + : + std::ostream(&sbuf_), + sbuf_(file_path) + { + } + +#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + ostream(ostream&& src) + : + std::ostream(&sbuf_), + sbuf_(std::move(src.sbuf_)) + { + } + + ostream& operator=(ostream&& src) + { + if (&src != this) + { + std::ostream::operator=(std::move(src)); + sbuf_ = std::move(src.sbuf_); + } + return *this; + } +#endif + private: + ::shrinkwrap::zstd::obuf sbuf_; + }; + } +} + +#endif //SHRINKWRAP_ZSTD_HPP \ No newline at end of file diff --git a/include/xzbuf.hpp b/include/xzbuf.hpp deleted file mode 100644 index a07fddf..0000000 --- a/include/xzbuf.hpp +++ /dev/null @@ -1,556 +0,0 @@ -#ifndef XZBUF_XZBUF_HPP -#define XZBUF_XZBUF_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -class ixzbuf : public std::streambuf -{ -public: - ixzbuf(const std::string& file_path) : - decoded_position_(0), - discard_amount_(0), - fp_(fopen(file_path.c_str(), "rb")), - put_back_size_(0), - lzma_index_(nullptr), - at_block_boundary_(true), - lzma_block_decoder_(LZMA_STREAM_INIT) - { - if (fp_) - { - fread(stream_header_.data(), stream_header_.size(), 1, fp_); // TODO: handle error. - lzma_res_ = lzma_stream_header_decode(&stream_header_flags_, stream_header_.data()); - if (lzma_res_ != LZMA_OK) - { - // TODO: handle error. - } - } - char* end = ((char*)decompressed_buffer_.data()) + decompressed_buffer_.size(); - setg(end, end, end); - } - - ixzbuf(ixzbuf&& src) : - std::streambuf(std::move(src)) - { - this->move(std::move(src)); - } - - ixzbuf& operator=(ixzbuf&& src) - { - if (&src != this) - { - std::streambuf::operator=(std::move(src)); - this->destroy(); - this->move(std::move(src)); - } - - return *this; - } - - - ~ixzbuf() - { - this->destroy(); - } -private: - //ixzbuf(const ixzbuf& src) = delete; - //ixzbuf& operator=(const ixzbuf& src) = delete; - - void destroy() - { - if (lzma_block_decoder_.internal) - lzma_end(&lzma_block_decoder_); - if (lzma_index_) - lzma_index_end(lzma_index_, nullptr); - if (fp_) - fclose(fp_); - } - - void move(ixzbuf&& src) - { - stream_header_flags_ = src.stream_header_flags_; - stream_footer_flags_ = src.stream_footer_flags_; - lzma_block_decoder_ = src.lzma_block_decoder_; - if (src.lzma_block_decoder_.internal) - src.lzma_block_decoder_.internal = nullptr; - lzma_block_ = src.lzma_block_; - lzma_block_filters_buf_ = src.lzma_block_filters_buf_; // TODO: handle filter.options - lzma_index_itr_ = src.lzma_index_itr_; // lzma_index_iter_init() doesn't allocate any memory, thus there is no lzma_index_iter_end(). - stream_header_ = src.stream_header_; - stream_footer_ = src.stream_footer_; - compressed_buffer_ = src.compressed_buffer_; - decompressed_buffer_ = src.decompressed_buffer_; - decoded_position_ = src.decoded_position_; - discard_amount_ = src.discard_amount_; - fp_ = src.fp_; - if (src.fp_) - src.fp_ = nullptr; - put_back_size_ = src.put_back_size_; - lzma_index_ = src.lzma_index_; - if (src.lzma_index_) - src.lzma_index_ = nullptr; - lzma_res_ = src.lzma_res_; - at_block_boundary_ = src.at_block_boundary_; - } - - std::streambuf::int_type underflow() - { - if (!fp_) - return traits_type::eof(); - if (gptr() < egptr()) // buffer not exhausted - return traits_type::to_int_type(*gptr()); - - while (gptr() >= egptr() && lzma_res_ == LZMA_OK) - { - lzma_block_decoder_.next_out = decompressed_buffer_.data(); - lzma_block_decoder_.avail_out = decompressed_buffer_.size(); - - if (at_block_boundary_) - { - std::vector block_header(LZMA_BLOCK_HEADER_SIZE_MAX); - if (lzma_block_decoder_.avail_in == 0 && !feof(fp_)) - { - replenish_compressed_buffer(); - } - // TODO: make sure avail_in is greater than 0; - std::memcpy(block_header.data(), lzma_block_decoder_.next_in, 1); - ++(lzma_block_decoder_.next_in); - --(lzma_block_decoder_.avail_in); - - if (block_header[0] == 0x00) - { - // Index indicator found - lzma_res_ = LZMA_STREAM_END; - } - else - { - lzma_block_.version = 0; - lzma_block_.check = stream_header_flags_.check; - lzma_block_.filters = lzma_block_filters_buf_.data(); - lzma_block_.header_size = lzma_block_header_size_decode (block_header[0]); - - std::size_t bytes_already_copied = 0; - if (lzma_block_decoder_.avail_in < (lzma_block_.header_size - 1)) - { - bytes_already_copied = lzma_block_decoder_.avail_in; - std::memcpy(&block_header[1], lzma_block_decoder_.next_in, bytes_already_copied); - lzma_block_decoder_.avail_in -= bytes_already_copied; - lzma_block_decoder_.next_in += bytes_already_copied; - assert(lzma_block_decoder_.avail_in == 0); - replenish_compressed_buffer(); - } - - // TODO: make sure avail_in is greater than (lzma_block_.header_size - 1) - bytes_already_copied. - std::size_t bytes_left_to_copy = (lzma_block_.header_size - 1) - bytes_already_copied; - std::memcpy(&block_header[1 + bytes_already_copied], lzma_block_decoder_.next_in, bytes_left_to_copy); - lzma_block_decoder_.avail_in -= bytes_left_to_copy; - lzma_block_decoder_.next_in += bytes_left_to_copy; - - lzma_res_ = lzma_block_header_decode(&lzma_block_, nullptr, block_header.data()); - if (lzma_res_ != LZMA_OK ) - { - // TODO: handle error. - } - else - { - lzma_res_ = lzma_block_decoder(&lzma_block_decoder_, &lzma_block_); - // TODO: handle error. - } - } - at_block_boundary_ = false; - } - - if (lzma_res_ == LZMA_OK) - { - if (lzma_block_decoder_.avail_in == 0 && !feof(fp_)) - { - replenish_compressed_buffer(); - } - - assert(lzma_block_decoder_.avail_in > 0); - - lzma_ret r = lzma_code(&lzma_block_decoder_, LZMA_RUN); - if (r == LZMA_STREAM_END) - { - // End of block. - at_block_boundary_ = true; - r = LZMA_OK; - } - lzma_res_ = r; - } - - char* start = ((char*)decompressed_buffer_.data()); - setg(start, start, start + (decompressed_buffer_.size() - lzma_block_decoder_.avail_out)); - decoded_position_ += (egptr() - gptr()); - - if (discard_amount_ > 0) - { - std::uint64_t advance_amount = discard_amount_; - if ((egptr() - gptr()) < advance_amount) - advance_amount = (egptr() - gptr()); - setg(start, gptr() + advance_amount, egptr()); - discard_amount_ -= advance_amount; - } - } - - if (lzma_res_ == LZMA_STREAM_END && gptr() >= egptr()) - return traits_type::eof(); - else if (lzma_res_ != LZMA_OK && lzma_res_ != LZMA_STREAM_END) - return traits_type::eof(); - - return traits_type::to_int_type(*gptr()); - } - - void replenish_compressed_buffer() - { - lzma_block_decoder_.next_in = compressed_buffer_.data(); - lzma_block_decoder_.avail_in = fread(compressed_buffer_.data(), 1, compressed_buffer_.size(), fp_); - } - - std::streambuf::pos_type seekoff(std::streambuf::off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) - { - std::uint64_t current_position = decoded_position_ - (egptr() - gptr()); - current_position += discard_amount_; // TODO: overflow check. - - pos_type pos{off_type(current_position)}; - - if (off == 0 && way == std::ios::cur) - return pos; // Supports tellg for streams that can't seek. - - if (way == std::ios::cur) - { - pos = pos + off; - } - else if (way == std::ios::end) - { - if (!lzma_index_) - { - if (!init_index()) - return pos_type(off_type(-1)); - } - - pos = pos_type(lzma_index_uncompressed_size(lzma_index_)) + off; - } - else - { - pos = off; - } - - return seekpos(pos, which); - } - - std::streambuf::pos_type seekpos(std::streambuf::pos_type pos, std::ios_base::openmode which) - { - if (fp_ == 0 || sync()) - return pos_type(off_type(-1)); - - if (!lzma_index_) //stream_flags_.backward_size == LZMA_VLI_UNKNOWN) - { - if (!init_index()) - return pos_type(off_type(-1)); - } - - if (lzma_index_iter_locate(&lzma_index_itr_, (std::uint64_t)off_type(pos))) // Returns true on failure. - return pos_type(off_type(-1)); - - long seek_amount = (lzma_index_itr_.block.compressed_file_offset > std::numeric_limits::max() ? std::numeric_limits::max() : static_cast(lzma_index_itr_.block.compressed_file_offset)); - if (fseek(fp_, seek_amount, SEEK_SET)) - return pos_type(off_type(-1)); - - discard_amount_ = off_type(pos) - lzma_index_itr_.block.uncompressed_file_offset; - decoded_position_ = lzma_index_itr_.block.uncompressed_file_offset; - - at_block_boundary_ = true; - lzma_block_decoder_.next_in = nullptr; - lzma_block_decoder_.avail_in = 0; - char* end = ((char*)decompressed_buffer_.data()) + decompressed_buffer_.size(); - setg(end, end, end); - - return pos; - } - - bool init_index() - { - if (!fp_) - return false; - - if (fseek(fp_, -12, SEEK_END) || !fread(stream_footer_.data(), 12, 1, fp_)) - return false; - - if (lzma_stream_footer_decode(&stream_footer_flags_, stream_footer_.data()) != LZMA_OK) - return false; - - std::vector index_raw(stream_footer_flags_.backward_size); - if (fseek(fp_, -(stream_footer_flags_.backward_size + 12), SEEK_END) || !fread(index_raw.data(), index_raw.size(), 1, fp_)) - return false; - - std::uint64_t memlimit = UINT64_MAX; - size_t in_pos = 0; - auto res = lzma_index_buffer_decode(&lzma_index_, &memlimit, nullptr, index_raw.data(), &in_pos, index_raw.size()); - if (res != LZMA_OK) - return false; - - lzma_index_iter_init(&lzma_index_itr_, lzma_index_); - - return true; - } -private: - lzma_stream_flags stream_header_flags_; - lzma_stream_flags stream_footer_flags_; - lzma_stream lzma_block_decoder_; - lzma_block lzma_block_; - std::array lzma_block_filters_buf_; - lzma_index_iter lzma_index_itr_; - std::array stream_header_; - std::array stream_footer_; - std::array= LZMA_BLOCK_HEADER_SIZE_MAX ? BUFSIZ : LZMA_BLOCK_HEADER_SIZE_MAX)> compressed_buffer_; - std::array= LZMA_BLOCK_HEADER_SIZE_MAX ? BUFSIZ : LZMA_BLOCK_HEADER_SIZE_MAX)> decompressed_buffer_; - std::uint64_t decoded_position_; - std::uint64_t discard_amount_; - FILE* fp_; - std::size_t put_back_size_; - lzma_index* lzma_index_; - lzma_ret lzma_res_; - bool at_block_boundary_; -}; - -class oxzbuf : public std::streambuf -{ -public: - oxzbuf(const std::string& file_path) : - fp_(fopen(file_path.c_str(), "wb")) - { - if (!fp_) - { - char* end = ((char*)decompressed_buffer_.data()) + decompressed_buffer_.size(); - setp(end, end); - } - else - { - lzma_stream_encoder_ = LZMA_STREAM_INIT; - - lzma_res_ = lzma_easy_encoder(&lzma_stream_encoder_, LZMA_PRESET_DEFAULT, LZMA_CHECK_CRC64); - if (lzma_res_ != LZMA_OK) - { - // TODO: handle error. - } - - lzma_stream_encoder_.next_out = compressed_buffer_.data(); - lzma_stream_encoder_.avail_out = compressed_buffer_.size(); - - char* end = ((char*)decompressed_buffer_.data()) + decompressed_buffer_.size(); - setp((char*)decompressed_buffer_.data(), end); - } - } - - oxzbuf(oxzbuf&& src) : - std::streambuf(std::move(src)) - { - this->move(std::move(src)); - } - - oxzbuf& operator=(oxzbuf&& src) - { - if (&src != this) - { - std::streambuf::operator=(std::move(src)); - this->close(); - this->move(std::move(src)); - } - - return *this; - } - - ~oxzbuf() - { - this->close(); - } -private: - void move(oxzbuf&& src) - { - compressed_buffer_ = src.compressed_buffer_; - decompressed_buffer_ = src.decompressed_buffer_; - lzma_stream_encoder_ = src.lzma_stream_encoder_; - if (src.lzma_stream_encoder_.internal) - src.lzma_stream_encoder_.internal = nullptr; - fp_ = src.fp_; - if (src.fp_) - src.fp_ = nullptr; - lzma_res_ = src.lzma_res_; - } - - void close() - { - if (lzma_stream_encoder_.internal) - { - lzma_stream_encoder_.next_in = decompressed_buffer_.data(); - lzma_stream_encoder_.avail_in = decompressed_buffer_.size() - (epptr() - pptr()); - while (lzma_res_ == LZMA_OK) - { - lzma_res_ = lzma_code(&lzma_stream_encoder_, LZMA_FINISH); - if (lzma_stream_encoder_.avail_out == 0 || (lzma_res_ == LZMA_STREAM_END && compressed_buffer_.size() != lzma_stream_encoder_.avail_out)) - { - if (!fwrite(compressed_buffer_.data(), compressed_buffer_.size() - lzma_stream_encoder_.avail_out, 1, fp_)) - { - break; - } - lzma_stream_encoder_.next_out = compressed_buffer_.data(); - lzma_stream_encoder_.avail_out = compressed_buffer_.size(); - } - } - lzma_end(&lzma_stream_encoder_); - } - - if (fp_) - fclose(fp_); - } - - int overflow(int c) - { - if (!fp_) - return traits_type::eof(); - - if ((epptr() - pptr()) > 0) - { - assert(!"Put buffer not empty, this should never happen"); - this->sputc(static_cast(0xFF & c)); - } - else - { - lzma_stream_encoder_.next_in = decompressed_buffer_.data(); - lzma_stream_encoder_.avail_in = decompressed_buffer_.size(); - while (lzma_res_ == LZMA_OK && lzma_stream_encoder_.avail_in > 0) - { - lzma_res_ = lzma_code(&lzma_stream_encoder_, LZMA_RUN); - if (lzma_stream_encoder_.avail_out == 0 || lzma_res_ == LZMA_STREAM_END) - { - if (!fwrite(compressed_buffer_.data(), compressed_buffer_.size() - lzma_stream_encoder_.avail_out, 1, fp_)) - { - // TODO: handle error. - return traits_type::eof(); - } - lzma_stream_encoder_.next_out = compressed_buffer_.data(); - lzma_stream_encoder_.avail_out = compressed_buffer_.size(); - } - } - - if (lzma_res_ == LZMA_STREAM_END) - lzma_res_ = LZMA_OK; - - assert(lzma_stream_encoder_.avail_in == 0); - decompressed_buffer_[0] = reinterpret_cast(c); - setp((char*)decompressed_buffer_.data() + 1, (char*)decompressed_buffer_.data() + decompressed_buffer_.size()); - } - - return (lzma_res_ == LZMA_OK ? traits_type::to_int_type(c) : traits_type::eof()); - } - - int sync() - { - if (!fp_) - return -1; - - lzma_stream_encoder_.next_in = decompressed_buffer_.data(); - lzma_stream_encoder_.avail_in = decompressed_buffer_.size() - (epptr() - pptr()); - if (lzma_stream_encoder_.avail_in) - { - while (lzma_res_ == LZMA_OK) - { - lzma_res_ = lzma_code(&lzma_stream_encoder_, LZMA_FULL_FLUSH); - if (lzma_stream_encoder_.avail_out == 0 || (lzma_res_ == LZMA_STREAM_END && compressed_buffer_.size() != lzma_stream_encoder_.avail_out)) - { - if (!fwrite(compressed_buffer_.data(), compressed_buffer_.size() - lzma_stream_encoder_.avail_out, 1, fp_)) - { - // TODO: handle error. - return -1; - } - lzma_stream_encoder_.next_out = compressed_buffer_.data(); - lzma_stream_encoder_.avail_out = compressed_buffer_.size(); - } - } - - if (lzma_res_ == LZMA_STREAM_END) - lzma_res_ = LZMA_OK; - - if (lzma_res_ != LZMA_OK) - return -1; - - assert(lzma_stream_encoder_.avail_in == 0); - setp((char*)decompressed_buffer_.data(), (char*)decompressed_buffer_.data() + decompressed_buffer_.size()); - } - - return 0; - } -private: - static const std::size_t default_block_size = 1024; //4 * 1024 * 1024; - std::array= LZMA_BLOCK_HEADER_SIZE_MAX ? default_block_size : LZMA_BLOCK_HEADER_SIZE_MAX)> compressed_buffer_; - std::array= LZMA_BLOCK_HEADER_SIZE_MAX ? default_block_size : LZMA_BLOCK_HEADER_SIZE_MAX)> decompressed_buffer_; - lzma_stream lzma_stream_encoder_; - FILE* fp_; - lzma_ret lzma_res_; -}; - -class ixzstream : public std::istream -{ -public: - ixzstream(const std::string& file_path) : - std::istream(&sbuf_), - sbuf_(file_path) - { - } - - ixzstream(ixzstream&& src) : - std::istream(&sbuf_), - sbuf_(std::move(src.sbuf_)) - { - } - - ixzstream& operator=(ixzstream&& src) - { - if (&src != this) - { - std::istream::operator=(std::move(src)); - sbuf_ = std::move(src.sbuf_); - } - return *this; - } -private: - ixzbuf sbuf_; -}; - -class oxzstream : public std::ostream -{ -public: - oxzstream(const std::string& file_path) : - std::ostream(&sbuf_), - sbuf_(file_path) - { - } - - oxzstream(oxzstream&& src) : - std::ostream(&sbuf_), - sbuf_(std::move(src.sbuf_)) - { - } - - oxzstream& operator=(oxzstream&& src) - { - if (&src != this) - { - std::ostream::operator=(std::move(src)); - sbuf_ = std::move(src.sbuf_); - } - return *this; - } -private: - oxzbuf sbuf_; -}; - -#endif //XZBUF_XZBUF_HPP \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 1a5730f..594e322 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ xz,http://tukaani.org/xz/xz-5.2.3.tar.bz2 --cmake dep/xz.cmake +zstd,facebook/zstd@v1.3.2 --cmake dep/zstd.cmake \ No newline at end of file diff --git a/src/test.cpp b/src/test.cpp index 245e14d..199b7c0 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -1,5 +1,7 @@ -#include "xzbuf.hpp" +#include "shrinkwrap/istream.hpp" + + #include #include @@ -12,10 +14,13 @@ #include +namespace sw = shrinkwrap; + +template class test_base { public: - test_base(const std::string& file_path, std::size_t block_size = std::numeric_limits::max()): + test_base(const std::string& file_path, std::size_t block_size = 2048): file_(file_path), block_size_(block_size) { @@ -29,7 +34,7 @@ class test_base static bool generate_test_file(const std::string& file_path, std::size_t block_size) { - oxzstream ofs(file_path); + OutT ofs(file_path); for (std::size_t i = 0; i < (2048 / 4) && ofs.good(); ++i) { if (((i * 4) % block_size) == 0) @@ -43,21 +48,22 @@ class test_base std::size_t block_size_; }; -class iterator_test: public test_base +template +class iterator_test: public test_base { public: - using test_base::test_base; + using test_base::test_base; bool operator()() { bool ret = false; - if ((file_exists(file_) && std::remove(file_.c_str()) != 0) || !generate_test_file(file_, block_size_)) + if ((test_base::file_exists(this->file_) && std::remove(this->file_.c_str()) != 0) || !test_base::generate_test_file(this->file_, this->block_size_)) { std::cerr << "FAILED to generate test file." << std::endl; } else { - if (!run(file_)) + if (!run(this->file_)) { std::cerr << "FAILED iterator test." << std::endl; } @@ -72,9 +78,9 @@ class iterator_test: public test_base private: bool run(const std::string& file_path) { - ixzbuf sbuf(file_path); - std::istreambuf_iterator it(&sbuf); - std::istreambuf_iterator end; + InT is(file_path); + std::istreambuf_iterator it(is.rdbuf()); + std::istreambuf_iterator end{}; std::size_t integer = 0; for (std::size_t i = 0; it != end; ++i) @@ -92,22 +98,23 @@ class iterator_test: public test_base } }; -class seek_test : public test_base +template +class seek_test : public test_base { public: - using test_base::test_base; + using test_base::test_base; bool operator()() { bool ret = false; - if ((file_exists(file_) && std::remove(file_.c_str()) != 0) || !generate_test_file(file_, block_size_)) + if ((test_base::file_exists(this->file_) && std::remove(this->file_.c_str()) != 0) || !test_base::generate_test_file(this->file_, this->block_size_)) { std::cerr << "FAILED to generate test file" << std::endl; } else { - if (!run(file_)) + if (!run(this->file_)) { std::cerr << "FAILED seek test." << std::endl; } @@ -122,7 +129,7 @@ class seek_test : public test_base private: static bool run(const std::string& file_path) { - ixzstream ifs(file_path); + InT ifs(file_path); std::vector pos_sequence; pos_sequence.reserve(128); std::mt19937 rg(std::uint32_t(std::chrono::system_clock::now().time_since_epoch().count())); @@ -150,6 +157,147 @@ class seek_test : public test_base } }; +template +class virtual_offset_seek_test : public test_base +{ +public: + using test_base::test_base; + + bool operator()() + { + bool ret = false; + + if ((test_base::file_exists(this->file_) && std::remove(this->file_.c_str()) != 0) || !test_base::generate_test_file(this->file_, this->block_size_)) + { + std::cerr << "FAILED to generate test file" << std::endl; + } + else + { + if (!run(this->file_)) + { + std::cerr << "FAILED seek test." << std::endl; + } + else + { + ret = true; + } + } + + return ret; + } +private: + static bool run(const std::string& file_path) + { + InT ifs(file_path); + std::vector pos_sequence; + pos_sequence.reserve(128); + std::mt19937 rg(std::uint32_t(std::chrono::system_clock::now().time_since_epoch().count())); + for (unsigned i = 0; i < 128 && ifs.good(); ++i) + { + int val = 2048 / 4; + int pos = rg() % val; + pos_sequence.push_back(pos); + + std::uint64_t virtual_offset{}; + { + InT idx_ifs(file_path); + virtual_offset = idx_ifs.tellg(); + int tmp; + while (idx_ifs >> tmp) + { + if (tmp == pos) + break; + virtual_offset = idx_ifs.tellg(); + } + } + + ifs.seekg(virtual_offset); + ifs >> val; + if (val != pos) + { + std::cerr << "Seek failure sequence:" << std::endl; + for (auto it = pos_sequence.begin(); it != pos_sequence.end(); ++it) + { + if (it != pos_sequence.begin()) + std::cerr << ","; + std::cerr << *it; + } + std::cerr << std::endl; + return false; + } + } + return ifs.good(); + } +}; + +template +class block_seek_test : public test_base +{ +public: + using test_base::test_base; + + bool operator()() + { + bool ret = false; + + if ((test_base::file_exists(this->file_) && std::remove(this->file_.c_str()) != 0) || !test_base::generate_test_file(this->file_, this->block_size_)) + { + std::cerr << "FAILED to generate test file" << std::endl; + } + else + { + if (!run(this->file_)) + { + std::cerr << "FAILED seek test." << std::endl; + } + else + { + ret = true; + } + } + + return ret; + } +private: + bool run(const std::string& file_path) + { + std::array buf; + std::vector pos_sequence = {0}; + + { + InT ifs(file_path); + while (ifs) + { + ifs.read(buf.data(), buf.size()); + long tmp = ifs.tellg(); + if (tmp >= 0 && tmp != pos_sequence.back()) + pos_sequence.push_back(tmp); + } + } + + for (auto pos = pos_sequence.rbegin(); pos != pos_sequence.rend(); ++pos) + { + std::size_t total_bytes_read = 0; + InT ifs(file_path); + ifs.seekg(*pos); + while (ifs) + { + ifs.read(buf.data(), buf.size()); + total_bytes_read += ifs.gcount(); + } + + std::size_t expected_bytes_to_read = std::distance(pos_sequence.rbegin(), pos) * this->block_size_; + if (total_bytes_read != expected_bytes_to_read) + { + std::cerr << "Seek Failure (pos: " << (*pos) << ")" << std::endl; + return false; + } + } + + return true; + } +}; + int main(int argc, char* argv[]) { int ret = -1; @@ -157,10 +305,51 @@ int main(int argc, char* argv[]) if (argc > 1) { std::string sub_command = argv[1]; - if (sub_command == "seek") - ret = !(seek_test("test_seek_file.txt.xz")()); - else if (sub_command == "iterator") - ret = !(iterator_test("test_iterator_file.txt.xz")() && iterator_test("test_iterator_file_512.txt.xz", 512)() && iterator_test("test_iterator_file_1024.txt.xz", 1024)()); + if (sub_command == "generic-iter") + ret = !(iterator_test("test_generic_iterator_file.txt.xz")() + && iterator_test("test_generic_iterator_file_512.txt.xz", 512)() + && iterator_test("test_generic_iterator_file_1024.txt.xz", 1024)() + && iterator_test("test_generic_iterator_file.txt.gz")() + && iterator_test("test_generic_iterator_file_512.txt.gz", 512)() + && iterator_test("test_generic_iterator_file_1024.txt.gz", 1024)() + && iterator_test("test_generic_iterator_file.txt.zst")() + && iterator_test("test_generic_iterator_file_512.txt.zst", 512)() + && iterator_test("test_generic_iterator_file_1024.txt.zst", 1024)()); + else if (sub_command == "generic-seek") + ret = !(seek_test("test_generic_seek_file.txt.xz")() + && seek_test("test_generic_seek_file_512.txt.xz", 512)() + && seek_test("test_generic_seek_file_1024.txt.xz", 1024)() + && virtual_offset_seek_test("test_generic_seek_file.txt.bgzf")() + && virtual_offset_seek_test("test_generic_seek_file_512.txt.bgzf", 512)() + && virtual_offset_seek_test("test_generic_seek_file_1024.txt.bgzf", 1024)()); + else if (sub_command == "xz-seek") + ret = !(seek_test("test_seek_file.txt.xz")() + && seek_test("test_seek_file_512.txt.xz", 512)() + && seek_test("test_seek_file_1024.txt.xz", 1024)()); + else if (sub_command == "xz-iter") + ret = !(iterator_test("test_iterator_file.txt.xz")() + && iterator_test("test_iterator_file_512.txt.xz", 512)() + && iterator_test("test_iterator_file_1024.txt.xz", 1024)()); + else if (sub_command == "gz-iter") + ret = !(iterator_test("test_iterator_file.txt.gz")() + && iterator_test("test_iterator_file_512.txt.gz", 512)() + && iterator_test("test_iterator_file_1024.txt.gz", 1024)()); + else if (sub_command == "bgz-seek") + ret = !(virtual_offset_seek_test("test_seek_file.txt.bgzf")() + && virtual_offset_seek_test("test_seek_file_512.txt.bgzf", 512)() + && virtual_offset_seek_test("test_seek_file_1024.txt.bgzf", 1024)()); + else if (sub_command == "bgz-iter") + ret = !(iterator_test("test_iterator_file.txt.bgzf")() + && iterator_test("test_iterator_file_512.txt.bgzf", 512)() + && iterator_test("test_iterator_file_1024.txt.bgzf", 1024)()); + else if (sub_command == "zstd-iter") + ret = !(iterator_test("test_iterator_file.txt.zst")() + && iterator_test("test_iterator_file_512.txt.zst", 512)() + && iterator_test("test_iterator_file_1024.txt.zst", 1024)()); + else if (sub_command == "zstd-seek") + ret = !(block_seek_test("test_seek_file.txt.zst")() + && block_seek_test("test_seek_file_512.txt.zst", 512)() + && block_seek_test("test_seek_file_1024.txt.zst", 1024)()); } return ret;