diff --git a/.travis.yml b/.travis.yml index 1e13cb2c..42d4fcce 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,22 +3,39 @@ sudo: required dist: trusty install: - #Install lcov and MPICH - - sudo apt-get -y install lcov mpich zlib1g-dev libssl-dev libgtest-dev + # Install lcov, MPICH, zlib, OpenSSL, Gtest and LZ4 + - sudo apt-get -y install lcov mpich zlib1g-dev libssl-dev libgtest-dev liblz4-dev + # Install Zstandard + - wget https://github.com/facebook/zstd/archive/v1.0.0.tar.gz + - tar xf v1.0.0.tar.gz + - cd zstd-1.0.0 + - sudo make install PREFIX='/usr' + - cd $TRAVIS_BUILD_DIR + # Install Blosc + - git clone https://github.com/Blosc/c-blosc + - cd c-blosc + - mkdir build + - cd build + - cmake -DCMAKE_INSTALL_PREFIX='/usr' .. + - cmake --build . + - sudo cmake --build . --target install + - cd $TRAVIS_BUILD_DIR + # Install Google Test - cd /usr/src/gtest - sudo cmake . - sudo make - sudo mv libgtest* /usr/lib/ - cd $TRAVIS_BUILD_DIR - # install lcov to coveralls conversion + upload tool + # Install lcov to coveralls conversion + upload tool - gem install coveralls-lcov before_script: - lcov --directory . --zerocounters script: - - make CXX=mpic++ INCLUDE_PATHS=-I/usr/include/mpich MPILIB=-lmpich -j 4 - - make test CXX=mpic++ INCLUDE_PATHS=-I/usr/include/mpich MPILIB=-lmpich + - make clean + - make -j 4 + - make test -j 4 - lcov --directory . --capture --output-file coverage.info - lcov --list coverage.info # debug before upload diff --git a/Makefile b/Makefile index 9dae1363..b36ba5c7 100644 --- a/Makefile +++ b/Makefile @@ -48,6 +48,28 @@ ifeq ($(VERBOSE),1) CPPFLAGS += -DVERBOSE endif +# --- MAC address interface --- # +MAC_ADDRESS_INTERFACE = +ifneq ($(MAC_ADDRESS_INTERFACE),) + CPPFLAGS += -DTILEDB_MAC_ADDRESS_INTERFACE=$(MAC_ADDRESS_INTERFACE) +endif + +# --- Compression levels --- # +COMPRESSION_LEVEL_GZIP = +ifneq ($(COMPRESSION_LEVEL_GZIP),) + CPPFLAGS += -DTILEDB_COMPRESSION_LEVEL_GZIP=$(COMPRESSION_LEVEL_GZIP) +endif + +COMPRESSION_LEVEL_ZSTD = +ifneq ($(COMPRESSION_LEVEL_ZSTD),) + CPPFLAGS += -DTILEDB_COMPRESSION_LEVEL_ZSTD=$(COMPRESSION_LEVEL_ZSTD) +endif + +COMPRESSION_LEVEL_BLOSC = +ifneq ($(COMPRESSION_LEVEL_BLOSC),) + CPPFLAGS += -DTILEDB_COMPRESSION_LEVEL_BLOSC=$(COMPRESSION_LEVEL_BLOSC) +endif + # --- Use parallel sort --- # USE_PARALLEL_SORT = ifeq ($(USE_PARALLEL_SORT),1) @@ -137,7 +159,10 @@ ifdef TRAVIS endif # --- Libraries --- # -ZLIB = -lz +ZLIB = -lz +ZSTD = -lzstd +LZ4 = -llz4 +BLOSC = -lblosc OPENSSLLIB = -lcrypto GTESTLIB = -lgtest -lgtest_main MPILIB = @@ -237,7 +262,8 @@ $(CORE_LIB_DIR)/libtiledb.$(SHLIB_EXT): $(CORE_OBJ) @mkdir -p $(CORE_LIB_DIR) @echo "Creating dynamic library libtiledb.$(SHLIB_EXT)" @$(CXX) $(SHLIB_FLAGS) $(SONAME) -o $@ $^ $(LIBRARY_PATHS) $(MPILIB) \ - $(PTHREADLIB) $(ZLIB) $(OPENSSLLIB) $(OPENMP_FLAG) + $(PTHREADLIB) $(ZLIB) $(ZSTD) $(LZ4) $(BLOSC) \ + $(OPENSSLLIB) $(OPENMP_FLAG) $(CORE_LIB_DIR)/libtiledb.a: $(CORE_OBJ) @mkdir -p $(CORE_LIB_DIR) @@ -275,8 +301,9 @@ $(EXAMPLES_OBJ_DIR)/%.o: $(EXAMPLES_SRC_DIR)/%.cc $(EXAMPLES_BIN_DIR)/%: $(EXAMPLES_OBJ_DIR)/%.o $(CORE_LIB_DIR)/libtiledb.a @mkdir -p $(EXAMPLES_BIN_DIR) @echo "Creating $@" - @$(CXX) -std=gnu++11 -o $@ $^ $(LIBRARY_PATHS) $(MPILIB) $(ZLIB) \ - $(PTHREADLIB) $(OPENSSLLIB) $(OPENMP_FLAG) + @$(CXX) -std=gnu++11 -o $@ $^ $(LIBRARY_PATHS) $(MPILIB) \ + $(ZLIB) $(LZ4) $(ZSTD) $(BLOSC) \ + $(PTHREADLIB) $(OPENSSLLIB) $(OPENMP_FLAG) # --- Cleaning --- # @@ -308,8 +335,9 @@ $(TEST_OBJ_DIR)/%.o: $(TEST_SRC_DIR)/%.cc $(TEST_BIN_DIR)/tiledb_test: $(TEST_OBJ) $(CORE_LIB_DIR)/libtiledb.a @mkdir -p $(TEST_BIN_DIR) - @echo "Creating test_cmd" - @$(CXX) -o $@ $^ $(LIBRARY_PATHS) $(MPILIB) $(ZLIB) \ + @echo "Creating tiledb_test" + @$(CXX) -o $@ $^ $(LIBRARY_PATHS) $(MPILIB) \ + $(ZLIB) $(ZSTD) $(LZ4) $(BLOSC) \ $(PTHREADLIB) $(OPENSSLLIB) $(GTESTLIB) $(OPENMP_FLAG) # --- Cleaning --- # diff --git a/core/include/array/array.h b/core/include/array/array.h index 682a1904..55437e52 100644 --- a/core/include/array/array.h +++ b/core/include/array/array.h @@ -39,7 +39,7 @@ #include "array_sorted_write_state.h" #include "array_schema.h" #include "book_keeping.h" -#include "config.h" +#include "storage_manager_config.h" #include "constants.h" #include "fragment.h" #include @@ -138,7 +138,7 @@ class Array { const std::vector& attribute_ids() const; /** Returns the configuration parameters. */ - const Config* config() const; + const StorageManagerConfig* config() const; /** Returns the number of fragments in this array. */ int fragment_num() const; @@ -307,7 +307,7 @@ class Array { const char** attributes, int attribute_num, const void* subarray, - const Config* config, + const StorageManagerConfig* config, Array* array_clone = NULL); /** @@ -471,7 +471,7 @@ class Array { */ std::vector attribute_ids_; /** Configuration parameters. */ - const Config* config_; + const StorageManagerConfig* config_; /** The array fragments. */ std::vector fragments_; /** diff --git a/core/include/array/array_schema.h b/core/include/array/array_schema.h index 3ccddc65..3435ed4b 100644 --- a/core/include/array/array_schema.h +++ b/core/include/array/array_schema.h @@ -294,6 +294,9 @@ class ArraySchema { /** Returns the type of the i-th attribute, or NULL if 'i' is invalid. */ int type(int i) const; + /** Returns the type size of the i-th attribute. */ + size_t type_size(int i) const; + /** Returns the number of attributes with variable-sized values. */ int var_attribute_num() const; @@ -707,7 +710,16 @@ class ArraySchema { * The compression type for each attribute (plus one extra at the end for the * coordinates. It can be one of the following: * - TILEDB_NO_COMPRESSION - * - TILEDB_GZIP. + * - TILEDB_GZIP + * - TILEDB_ZSTD + * - TILEDB_LZ4 + * - TILEDB_BLOSC + * - TILEDB_BLOSC_LZ4 + * - TILEDB_BLOSC_LZ4HC + * - TILEDB_BLOSC_SNAPPY + * - TILEDB_BLOSC_ZLIB + * - TILEDB_BLOSC_ZSTD + * - TILEDB_RLE */ std::vector compression_; /** Auxiliary variable used when calculating Hilbert ids. */ diff --git a/core/include/array/array_schema_c.h b/core/include/array/array_schema_c.h index b178cc98..d365236e 100644 --- a/core/include/array/array_schema_c.h +++ b/core/include/array/array_schema_c.h @@ -69,7 +69,16 @@ typedef struct ArraySchemaC { * The compression type for each attribute (plus one extra at the end for the * coordinates. It can be one of the following: * - TILEDB_NO_COMPRESSION - * - TILEDB_GZIP. + * - TILEDB_GZIP + * - TILEDB_ZSTD + * - TILEDB_LZ4 + * - TILEDB_BLOSC + * - TILEDB_BLOSC_LZ4 + * - TILEDB_BLOSC_LZ4HC + * - TILEDB_BLOSC_SNAPPY + * - TILEDB_BLOSC_ZLIB + * - TILEDB_BLOSC_ZSTD + * - TILEDB_RLE */ int* compression_; /** diff --git a/core/include/c_api/c_api.h b/core/include/c_api/c_api.h index 1e271d3e..711dc903 100755 --- a/core/include/c_api/c_api.h +++ b/core/include/c_api/c_api.h @@ -100,7 +100,7 @@ typedef struct TileDB_Config { /** * The method for reading data from a file. * It can be one of the following: - * - TILEDB_IO_MMAP + * - TILEDB_IO_MMAP (default) * TileDB will use mmap. * - TILEDB_IO_READ * TileDB will use standard OS read. @@ -111,7 +111,7 @@ typedef struct TileDB_Config { /** * The method for writing data to a file. * It can be one of the following: - * - TILEDB_IO_WRITE + * - TILEDB_IO_WRITE (default) * TileDB will use standard OS write. * - TILEDB_IO_MPI * TileDB will use MPI-IO write. @@ -233,6 +233,15 @@ typedef struct TileDB_ArraySchema { * coordinates). It can be one of the following: * - TILEDB_NO_COMPRESSION * - TILEDB_GZIP + * - TILEDB_ZSTD + * - TILEDB_LZ4 + * - TILEDB_BLOSC + * - TILEDB_BLOSC_LZ4 + * - TILEDB_BLOSC_LZ4HC + * - TILEDB_BLOSC_SNAPPY + * - TILEDB_BLOSC_ZLIB + * - TILEDB_BLOSC_ZSTD + * - TILEDB_RLE * * If it is *NULL*, then the default TILEDB_NO_COMPRESSION is used for all * attributes. @@ -737,6 +746,15 @@ typedef struct TileDB_MetadataSchema { * key). It can be one of the following: * - TILEDB_NO_COMPRESSION * - TILEDB_GZIP + * - TILEDB_ZSTD + * - TILEDB_LZ4 + * - TILEDB_BLOSC + * - TILEDB_BLOSC_LZ4 + * - TILEDB_BLOSC_LZ4HC + * - TILEDB_BLOSC_SNAPPY + * - TILEDB_BLOSC_ZLIB + * - TILEDB_BLOSC_ZSTD + * - TILEDB_RLE * * If it is *NULL*, then the default TILEDB_NO_COMPRESSION is used for all * attributes. diff --git a/core/include/c_api/constants.h b/core/include/c_api/constants.h index c7d1105c..24b926c5 100644 --- a/core/include/c_api/constants.h +++ b/core/include/c_api/constants.h @@ -37,7 +37,7 @@ #include /** Version. */ -#define TILEDB_VERSION "0.4.0" +#define TILEDB_VERSION "0.5.0" /**@{*/ /** Return code. */ @@ -134,6 +134,15 @@ /** Compression type. */ #define TILEDB_NO_COMPRESSION 0 #define TILEDB_GZIP 1 +#define TILEDB_ZSTD 2 +#define TILEDB_LZ4 3 +#define TILEDB_BLOSC 4 +#define TILEDB_BLOSC_LZ4 5 +#define TILEDB_BLOSC_LZ4HC 6 +#define TILEDB_BLOSC_SNAPPY 7 +#define TILEDB_BLOSC_ZLIB 8 +#define TILEDB_BLOSC_ZSTD 9 +#define TILEDB_RLE 10 /**@}*/ /**@{*/ @@ -167,7 +176,30 @@ #define TILEDB_SORTED_BUFFER_VAR_SIZE 10000000 // ~10MB /**@}*/ -/** The alignment to assist vectorization. */ -#define ALIGNMENT 64 +/**@{*/ +/** Compression levels. */ +#ifndef TILEDB_COMPRESSION_LEVEL_GZIP +# define TILEDB_COMPRESSION_LEVEL_GZIP Z_DEFAULT_COMPRESSION +#endif +#ifndef TILEDB_COMPRESSION_LEVEL_ZSTD +# define TILEDB_COMPRESSION_LEVEL_ZSTD 1 +#endif +#ifndef TILEDB_COMPRESSION_LEVEL_BLOSC +# define TILEDB_COMPRESSION_LEVEL_BLOSC 5 +#endif +/**@}*/ + +/**@{*/ +/** MAC address interface. */ +#if defined(__APPLE__) && defined(__MACH__) + #ifndef TILEDB_MAC_ADDRESS_INTERFACE + #define TILEDB_MAC_ADDRESS_INTERFACE en0 + #endif +#else + #ifndef TILEDB_MAC_ADDRESS_INTERFACE + #define TILEDB_MAC_ADDRESS_INTERFACE eth0 + #endif +#endif +/**@}*/ #endif diff --git a/core/include/fragment/read_state.h b/core/include/fragment/read_state.h index 303dd398..8993437a 100644 --- a/core/include/fragment/read_state.h +++ b/core/include/fragment/read_state.h @@ -576,6 +576,108 @@ class ReadState { template void compute_tile_search_range_hil(); + /** + * Decompresses a tile. + * + * @param attribute_id The id of the attribute the tile belongs to. + * @param tile_compressed The compressed tile to be decompressed. + * @param tile_compressed_size The size of the compressed tile. + * @param tile The resulting decompressed tile. + * @param tile_size The expected size of the decompressed tile (for checking + * for errors). + * @return TILEDB_RS_OK for success and TILEDB_RS_ERR for error. + */ + int decompress_tile( + int attribute_id, + unsigned char* tile_compressed, + size_t tile_compressed_size, + unsigned char* tile, + size_t tile_size); + + /** + * Decompresses a tile with GZIP. + * + * @param tile_compressed The compressed tile to be decompressed. + * @param tile_compressed_size The size of the compressed tile. + * @param tile The resulting decompressed tile. + * @param tile_size The expected size of the decompressed tile (for checking + * for errors). + * @return TILEDB_RS_OK for success and TILEDB_RS_ERR for error. + */ + int decompress_tile_gzip( + unsigned char* tile_compressed, + size_t tile_compressed_size, + unsigned char* tile, + size_t tile_size); + + /** + * Decompresses a tile with Zstandard. + * + * @param tile_compressed The compressed tile to be decompressed. + * @param tile_compressed_size The size of the compressed tile. + * @param tile The resulting decompressed tile. + * @param tile_size The expected size of the decompressed tile (for checking + * for errors). + * @return TILEDB_RS_OK for success and TILEDB_RS_ERR for error. + */ + int decompress_tile_zstd( + unsigned char* tile_compressed, + size_t tile_compressed_size, + unsigned char* tile, + size_t tile_size); + + /** + * Decompresses a tile with LZ4. + * + * @param tile_compressed The compressed tile to be decompressed. + * @param tile_compressed_size The size of the compressed tile. + * @param tile The resulting decompressed tile. + * @param tile_size The expected size of the decompressed tile (for checking + * for errors). + * @return TILEDB_RS_OK for success and TILEDB_RS_ERR for error. + */ + int decompress_tile_lz4( + unsigned char* tile_compressed, + size_t tile_compressed_size, + unsigned char* tile, + size_t tile_size); + + /** + * Decompresses a tile with Blosc. + * + * @param tile_compressed The compressed tile to be decompressed. + * @param tile_compressed_size The size of the compressed tile. + * @param tile The resulting decompressed tile. + * @param tile_size The expected size of the decompressed tile (for checking + * for errors). + * @param compressor The Blosc compressor to be used. + * @return TILEDB_RS_OK for success and TILEDB_RS_ERR for error. + */ + int decompress_tile_blosc( + unsigned char* tile_compressed, + size_t tile_compressed_size, + unsigned char* tile, + size_t tile_size, + const char* compressor); + + /** + * Decompresses a tile with RLE. + * + * @param attribute_id The id of the attribute the tile belongs to. + * @param tile_compressed The compressed tile to be decompressed. + * @param tile_compressed_size The size of the compressed tile. + * @param tile The resulting decompressed tile. + * @param tile_size The expected size of the decompressed tile (for checking + * for errors). + * @return TILEDB_RS_OK for success and TILEDB_RS_ERR for error. + */ + int decompress_tile_rle( + int attribute_id, + unsigned char* tile_compressed, + size_t tile_compressed_size, + unsigned char* tile, + size_t tile_size); + /** * Returns the cell position in the search tile that is after the * input coordinates. @@ -642,29 +744,28 @@ class ReadState { /** * Maps a tile from the disk for an attribute into a local buffer, using - * memory map (mmap). This function focuses on the case of GZIP compression. + * memory map (mmap). This function works with any compression. * * @param attribute_id The id of the attribute the read occurs for. * @param offset The offset at which the tile starts in the file. * @param tile_size The tile size. * @return TILEDB_RS_OK for success, and TILEDB_RS_ERR for error. */ - int map_tile_from_file_cmp_gzip( + int map_tile_from_file_cmp( int attribute_id, off_t offset, size_t tile_size); /** * Maps a variable-sized tile from the disk for an attribute into a local - * buffer, using memory map (mmap). This function focuses on the case of GZIP - * compression. + * buffer, using memory map (mmap). This function works with any compression. * * @param attribute_id The id of the attribute the read occurs for. * @param offset The offset at which the tile starts in the file. * @param tile_size The tile size. * @return TILEDB_RS_OK for success, and TILEDB_RS_ERR for error. */ - int map_tile_from_file_var_cmp_gzip( + int map_tile_from_file_var_cmp( int attribute_id, off_t offset, size_t tile_size); @@ -708,14 +809,14 @@ class ReadState { * @param tile_size The tile size. * @return TILEDB_RS_OK for success, and TILEDB_RS_ERR for error. */ - int mpi_io_read_tile_from_file_cmp_gzip( + int mpi_io_read_tile_from_file_cmp( int attribute_id, off_t offset, size_t tile_size); /** * Reads a variable-sized tile from the disk for an attribute into a local - * buffer, using MPI-IO. This function focuses on the case of GZIP + * buffer, using MPI-IO. This function focuses on the case of any * compression. * * @param attribute_id The id of the attribute the read occurs for. @@ -723,7 +824,7 @@ class ReadState { * @param tile_size The tile size. * @return TILEDB_RS_OK for success, and TILEDB_RS_ERR for error. */ - int mpi_io_read_tile_from_file_var_cmp_gzip( + int mpi_io_read_tile_from_file_var_cmp( int attribute_id, off_t offset, size_t tile_size); @@ -749,13 +850,13 @@ class ReadState { /** * Prepares a tile from the disk for reading for an attribute. - * This function focuses on the case there is GZIP compression. + * This function focuses on the case there is any compression. * * @param attribute_id The id of the attribute the tile is prepared for. * @param tile_i The tile position on the disk. * @return TILEDB_RS_OK for success and TILEDB_RS_ERR for error. */ - int prepare_tile_for_reading_cmp_gzip(int attribute_id, int64_t tile_i); + int prepare_tile_for_reading_cmp(int attribute_id, int64_t tile_i); /** * Prepares a tile from the disk for reading for an attribute. @@ -769,14 +870,14 @@ class ReadState { /** * Prepares a tile from the disk for reading for an attribute. - * This function focuses on the case of variable-sized tiles with GZIP + * This function focuses on the case of variable-sized tiles with any * compression. * * @param attribute_id The id of the attribute the tile is prepared for. * @param tile_i The tile position on the disk. * @return TILEDB_RS_OK for success and TILEDB_RS_ERR for error. */ - int prepare_tile_for_reading_var_cmp_gzip(int attribute_id, int64_t tile_i); + int prepare_tile_for_reading_var_cmp(int attribute_id, int64_t tile_i); /** * Prepares a tile from the disk for reading for an attribute. @@ -823,28 +924,28 @@ class ReadState { /** * Reads a tile from the disk for an attribute into a local buffer. This - * function focuses on the case there is GZIP compression. + * function focuses on the case there is any compression. * * @param attribute_id The id of the attribute the read occurs for. * @param offset The offset at which the tile starts in the file. * @param tile_size The tile size. * @return TILEDB_RS_OK for success, and TILEDB_RS_ERR for error. */ - int read_tile_from_file_cmp_gzip( + int read_tile_from_file_cmp( int attribute_id, off_t offset, size_t tile_size); /** * Reads a tile from the disk for an attribute into a local buffer. This - * function focuses on the case of variable-sized tiles and GZIP compression. + * function focuses on the case of variable-sized tiles and any compression. * * @param attribute_id The id of the attribute the read occurs for. * @param offset The offset at which the tile starts in the file. * @param tile_size The tile size. * @return TILEDB_RS_OK for success, and TILEDB_RS_ERR for error. */ - int read_tile_from_file_var_cmp_gzip( + int read_tile_from_file_var_cmp( int attribute_id, off_t offset, size_t tile_size); diff --git a/core/include/fragment/write_state.h b/core/include/fragment/write_state.h index 8282db47..a39b1b04 100644 --- a/core/include/fragment/write_state.h +++ b/core/include/fragment/write_state.h @@ -205,6 +205,98 @@ class WriteState { /* PRIVATE METHODS */ /* ********************************* */ + /** + * Compresses the input tile buffer, and stores it inside tile_compressed_ + * member attribute. + * + * @param attribute_id The id of the attribute the tile belongs to. + * @param tile The tile buffer to be compressed. + * @param tile_size The size of the tile buffer in bytes. + * @param tile_compressed_size The size of the resulting compressed tile. + * @return TILEDB_WS_OK on success and TILEDB_WS_ERR on error. + */ + int compress_tile( + int attribute_id, + unsigned char* tile, + size_t tile_size, + size_t& tile_compressed_size); + + /** + * Compresses with GZIP the input tile buffer, and stores it inside + * tile_compressed_ member attribute. + * + * @param tile The tile buffer to be compressed. + * @param tile_size The size of the tile buffer in bytes. + * @param tile_compressed_size The size of the resulting compressed tile. + * @return TILEDB_WS_OK on success and TILEDB_WS_ERR on error. + */ + int compress_tile_gzip( + unsigned char* tile, + size_t tile_size, + size_t& tile_compressed_size); + + /** + * Compresses with Zstandard the input tile buffer, and stores it inside + * tile_compressed_ member attribute. + * + * @param tile The tile buffer to be compressed. + * @param tile_size The size of the tile buffer in bytes. + * @param tile_compressed_size The size of the resulting compressed tile. + * @return TILEDB_WS_OK on success and TILEDB_WS_ERR on error. + */ + int compress_tile_zstd( + unsigned char* tile, + size_t tile_size, + size_t& tile_compressed_size); + + /** + * Compresses with LZ4 the input tile buffer, and stores it inside + * tile_compressed_ member attribute. + * + * @param tile The tile buffer to be compressed. + * @param tile_size The size of the tile buffer in bytes. + * @param tile_compressed_size The size of the resulting compressed tile. + * @return TILEDB_WS_OK on success and TILEDB_WS_ERR on error. + */ + int compress_tile_lz4( + unsigned char* tile, + size_t tile_size, + size_t& tile_compressed_size); + + /** + * Compresses with Blosc the input tile buffer, and stores it inside + * tile_compressed_ member attribute. + * + * @param attribute_id The id of the attribute the tile belongs to. + * @param tile The tile buffer to be compressed. + * @param tile_size The size of the tile buffer in bytes. + * @param tile_compressed_size The size of the resulting compressed tile. + * @param compressor The Blosc compressor. + * @return TILEDB_WS_OK on success and TILEDB_WS_ERR on error. + */ + int compress_tile_blosc( + int attribute_id, + unsigned char* tile, + size_t tile_size, + size_t& tile_compressed_size, + const char* compressor); + + /** + * Compresses with RLE the input tile buffer, and stores it inside + * tile_compressed_ member attribute. + * + * @param attribute_id The id of the attribute the tile belongs to. + * @param tile The tile buffer to be compressed. + * @param tile_size The size of the tile buffer in bytes. + * @param tile_compressed_size The size of the resulting compressed tile. + * @return TILEDB_WS_OK on success and TILEDB_WS_ERR on error. + */ + int compress_tile_rle( + int attribute_id, + unsigned char* tile, + size_t tile_size, + size_t& tile_compressed_size); + /** * Compresses the current tile for the input attribute, and writes (appends) * it to its corresponding file on the disk. @@ -359,14 +451,14 @@ class WriteState { /** * Performs the write operation for the case of a dense fragment, focusing - * on a single fixed-sized attribute and the case of GZIP compression. + * on a single fixed-sized attribute and the case of any compression. * * @param attribute_id The id of the attribute this operation focuses on. * @param buffer See write(). * @param buffer_size See write(). * @return TILEDB_WS_OK on success and TILEDB_WS_ERR on error. */ - int write_dense_attr_cmp_gzip( + int write_dense_attr_cmp( int attribute_id, const void* buffer, size_t buffer_size); @@ -409,7 +501,7 @@ class WriteState { /** * Performs the write operation for the case of a dense fragment, focusing - * on a single variable-sized attribute and the case of GZIP compression. + * on a single variable-sized attribute and the case of any compression. * * @param attribute_id The id of the attribute this operation focuses on. * @param buffer See write() - start offsets in *buffer_var*. @@ -418,7 +510,7 @@ class WriteState { * @param buffer_size See write(). * @return TILEDB_WS_OK on success and TILEDB_WS_ERR on error. */ - int write_dense_attr_var_cmp_gzip( + int write_dense_attr_var_cmp( int attribute_id, const void* buffer, size_t buffer_size, @@ -466,14 +558,14 @@ class WriteState { /** * Performs the write operation for the case of a sparse fragment, focusing - * on a single fixed-sized attribute and the case of GZIP compression. + * on a single fixed-sized attribute and the case of any compression. * * @param attribute_id The id of the attribute this operation focuses on. * @param buffer See write(). * @param buffer_size See write(). * @return TILEDB_WS_OK on success and TILEDB_WS_ERR on error. */ - int write_sparse_attr_cmp_gzip( + int write_sparse_attr_cmp( int attribute_id, const void* buffer, size_t buffer_size); @@ -516,7 +608,7 @@ class WriteState { /** * Performs the write operation for the case of a sparse fragment, focusing - * on a single variable-sized attribute and the case of GZIP compression. + * on a single variable-sized attribute and the case of any compression. * * @param attribute_id The id of the attribute this operation focuses on. * @param buffer See write() - start offsets in *buffer_var*. @@ -525,7 +617,7 @@ class WriteState { * @param buffer_size See write(). * @return TILEDB_WS_OK on success and TILEDB_WS_ERR on error. */ - int write_sparse_attr_var_cmp_gzip( + int write_sparse_attr_var_cmp( int attribute_id, const void* buffer, size_t buffer_size, @@ -580,7 +672,7 @@ class WriteState { /** * Performs the write operation for the case of a sparse fragment when the * coordinates are unsorted, focusing on a single fixed-sized attribute and - * the case of GZIP compression. + * the case of any compression. * * @param attribute_id The id of the attribute this operation focuses on. * @param buffer See write(). @@ -588,7 +680,7 @@ class WriteState { * @param cell_pos The sorted positions of the cells. * @return TILEDB_WS_OK on success and TILEDB_WS_ERR on error. */ - int write_sparse_unsorted_attr_cmp_gzip( + int write_sparse_unsorted_attr_cmp( int attribute_id, const void* buffer, size_t buffer_size, @@ -638,7 +730,7 @@ class WriteState { /** * Performs the write operation for the case of a sparse fragment when the * coordinates are unsorted, focusing on a single variable-sized attribute and - * the case of GZIP compression. + * the case of any compression. * * @param attribute_id The id of the attribute this operation focuses on. * @param buffer See write() - start offsets in *buffer_var*. @@ -648,7 +740,7 @@ class WriteState { * @param cell_pos The sorted positions of the cells. * @return TILEDB_WS_OK on success and TILEDB_WS_ERR on error. */ - int write_sparse_unsorted_attr_var_cmp_gzip( + int write_sparse_unsorted_attr_var_cmp( int attribute_id, const void* buffer, size_t buffer_size, diff --git a/core/include/metadata/metadata.h b/core/include/metadata/metadata.h index bb9bfada..c300fcaa 100644 --- a/core/include/metadata/metadata.h +++ b/core/include/metadata/metadata.h @@ -34,7 +34,7 @@ #define __METADATA_H__ #include "array.h" -#include "config.h" +#include "storage_manager_config.h" /* ********************************* */ /* CONSTANTS */ @@ -177,7 +177,7 @@ class Metadata { int mode, const char** attributes, int attribute_num, - const Config* config); + const StorageManagerConfig* config); /** * Resets the attributes used upon initialization of the metadata. diff --git a/core/include/metadata/metadata_schema_c.h b/core/include/metadata/metadata_schema_c.h index fb5d67e0..bef2f065 100644 --- a/core/include/metadata/metadata_schema_c.h +++ b/core/include/metadata/metadata_schema_c.h @@ -61,7 +61,16 @@ typedef struct MetadataSchemaC { * The compression type for each attribute (plus one extra at the end for the * key. It can be one of the following: * - TILEDB_NO_COMPRESSION - * - TILEDB_GZIP. + * - TILEDB_GZIP + * - TILEDB_ZSTD + * - TILEDB_LZ4 + * - TILEDB_BLOSC + * - TILEDB_BLOSC_LZ4 + * - TILEDB_BLOSC_LZ4HC + * - TILEDB_BLOSC_SNAPPY + * - TILEDB_BLOSC_ZLIB + * - TILEDB_BLOSC_ZSTD + * - TILEDB_RLE */ int* compression_; /** diff --git a/core/include/misc/utils.h b/core/include/misc/utils.h index 15bdf82b..a84e5ef7 100644 --- a/core/include/misc/utils.h +++ b/core/include/misc/utils.h @@ -600,6 +600,153 @@ int read_from_file_with_mmap( */ std::string real_dir(const std::string& dir); +/** + * Compresses with RLE. + * + * @param input The input buffer to be compressed. + * @param input_size The size of the input buffer. + * @param output The output buffer that results from compression. + * @param output_allocated_size The allocated size of the output buffer. + * @param value_size The size of each single value in the input buffer. + * @return The size of the result ouput buffer upon success, and TILEDB_UT_ERR + * on error. + */ +int64_t RLE_compress( + const unsigned char* input, + size_t input_size, + unsigned char* output, + size_t output_allocated_size, + size_t value_size); + +/** + * Returns the maximum size of the output of RLE compression. + * + * @param buffer_size The input buffer size. + * @param value_size The size of a sinlge value in the input buffer. + * @return The maximum size of the output after RLE-compressing the input with + * size input_size. + */ +size_t RLE_compress_bound( + size_t input_size, + size_t value_size); + +/** + * Returns the maximum size of the output of RLE compression on the coordinates. + * + * @param buffer_size The input buffer size. + * @param value_size The size of a sinlge value in the input buffer. + * @param dim_num The number of dimensions/coordinates in a single value. + * @return The maximum size of the output after RLE-compressing the input with + * size input_size. + */ +size_t RLE_compress_bound_coords( + size_t input_size, + size_t value_size, + int dim_num); + +/** + * Compresses the coordinates of a buffer with RLE, assuming that the cells in + * input buffer are sorted in column-major order. + * + * @param input The input buffer to be compressed. + * @param input_size The size of the input buffer. + * @param output The output buffer that results from compression. + * @param output_allocated_size The allocated size of the output buffer. + * @param value_size The size of each single value in the input buffer. + * @param dim_num The number of dimensions/coordinates of each cell in the + * input buffer. + * @return The size of the result ouput buffer upon success, and TILEDB_UT_ERR + * on error. + */ +int64_t RLE_compress_coords_col( + const unsigned char* input, + size_t input_size, + unsigned char* output, + size_t output_allocated_size, + size_t value_size, + int dim_num); + +/** + * Compresses the coordinates of a buffer with RLE, assuming that the cells in + * input buffer are sorted in row-major order. + * + * @param input The input buffer to be compressed. + * @param input_size The size of the input buffer. + * @param output The output buffer that results from compression. + * @param output_allocated_size The allocated size of the output buffer. + * @param value_size The size of each single value in the input buffer. + * @param dim_num The number of dimensions/coordinates of each cell in the + * input buffer. + * @return The size of the result ouput buffer upon success, and TILEDB_UT_ERR + * on error. + */ +int64_t RLE_compress_coords_row( + const unsigned char* input, + size_t input_size, + unsigned char* output, + size_t output_allocated_size, + size_t value_size, + int dim_num); + +/** + * Decompresses with RLE. + * + * @param input The input buffer to be decompressed. + * @param input_size The size of the input buffer. + * @param output The output buffer that results from decompression. + * @param output_allocated_size The allocated size of the output buffer. + * @param value_size The size of each single value in the input buffer. + * @return TILEDB_UT_OK on success and TILEDB_UT_ERR on error. + */ +int RLE_decompress( + const unsigned char* input, + size_t input_size, + unsigned char* output, + size_t output_allocated_size, + size_t value_size); + +/** + * Decompresses the coordinates of a buffer with RLE, assuming that the cells in + * input buffer are sorted in column-major order. + * + * @param input The input buffer to be decompressed. + * @param input_size The size of the input buffer. + * @param output The output buffer that results from decompression. + * @param output_allocated_size The allocated size of the output buffer. + * @param value_size The size of each single value in the output buffer. + * @param dim_num The number of dimensions/coordinates of each cell in the + * output buffer. + * @return TILEDB_UT_OK on success and TILEDB_UT_ERR on error. + */ +int RLE_decompress_coords_col( + const unsigned char* input, + size_t input_size, + unsigned char* output, + size_t output_allocated_size, + size_t value_size, + int dim_num); + +/** + * Decompresses the coordinates of a buffer with RLE, assuming that the cells in + * input buffer are sorted in row-major order. + * + * @param input The input buffer to be decompressed. + * @param input_size The size of the input buffer. + * @param output The output buffer that results from decompression. + * @param output_allocated_size The allocated size of the output buffer. + * @param value_size The size of each single value in the output buffer. + * @param dim_num The number of dimensions/coordinates of each cell in the + * output buffer. + * @return TILEDB_UT_OK on success and TILEDB_UT_ERR on error. + */ +int RLE_decompress_coords_row( + const unsigned char* input, + size_t input_size, + unsigned char* output, + size_t output_allocated_size, + size_t value_size, + int dim_num); + /** * Checks if a string starts with a certain prefix. * diff --git a/core/include/storage_manager/storage_manager.h b/core/include/storage_manager/storage_manager.h index c154eacf..e6eedfba 100755 --- a/core/include/storage_manager/storage_manager.h +++ b/core/include/storage_manager/storage_manager.h @@ -37,10 +37,10 @@ #include "array_iterator.h" #include "array_schema.h" #include "array_schema_c.h" -#include "config.h" #include "metadata.h" #include "metadata_iterator.h" #include "metadata_schema_c.h" +#include "storage_manager_config.h" #include #ifdef HAVE_OPENMP #include @@ -132,7 +132,7 @@ class StorageManager { * then the default TileDB parameters are used. * @return TILEDB_SM_OK for success and TILEDB_SM_ERR for error. */ - int init(Config* config); + int init(StorageManagerConfig* config); @@ -545,7 +545,7 @@ class StorageManager { /* ********************************* */ /** The TileDB configuration parameters. */ - Config* config_; + StorageManagerConfig* config_; /** The directory of the master catalog. */ std::string master_catalog_dir_; /** OpneMP mutex for creating/deleting an OpenArray object. */ @@ -676,7 +676,7 @@ class StorageManager { * @param config The configuration parameters. * @return TILEDB_SM_OK for success, and TILEDB_SM_ERR for error. */ - int config_set(Config* config); + int config_set(StorageManagerConfig* config); /** * Creates a special file that serves as lock needed for implementing diff --git a/core/include/storage_manager/config.h b/core/include/storage_manager/storage_manager_config.h similarity index 94% rename from core/include/storage_manager/config.h rename to core/include/storage_manager/storage_manager_config.h index c8a0b92f..a73080d9 100644 --- a/core/include/storage_manager/config.h +++ b/core/include/storage_manager/storage_manager_config.h @@ -1,5 +1,5 @@ /** - * @file config.h + * @file storage_manager_config.h * * @section LICENSE * @@ -27,7 +27,7 @@ * * @section DESCRIPTION * - * This file defines class Config. + * This file defines class StorageManagerConfig. */ #ifndef __CONFIG_H__ @@ -41,18 +41,21 @@ -/** This class is responsible for the TileDB configuration parameters. */ -class Config { +/** + * This class is responsible for the TileDB storage manager configuration + * parameters. + */ +class StorageManagerConfig { public: /* ********************************* */ /* CONSTRUCTORS & DESTRUCTORS */ /* ********************************* */ /** Constructor. */ - Config(); + StorageManagerConfig(); /** Destructor. */ - ~Config(); + ~StorageManagerConfig(); diff --git a/core/src/array/array.cc b/core/src/array/array.cc index 8e56125e..e02a7704 100644 --- a/core/src/array/array.cc +++ b/core/src/array/array.cc @@ -228,7 +228,7 @@ const std::vector& Array::attribute_ids() const { return attribute_ids_; } -const Config* Array::config() const { +const StorageManagerConfig* Array::config() const { return config_; } @@ -542,7 +542,7 @@ int Array::init( const char** attributes, int attribute_num, const void* subarray, - const Config* config, + const StorageManagerConfig* config, Array* array_clone) { // Set mode mode_ = mode; diff --git a/core/src/array/array_schema.cc b/core/src/array/array_schema.cc index 3de8ed09..db4430da 100644 --- a/core/src/array/array_schema.cc +++ b/core/src/array/array_schema.cc @@ -500,10 +500,46 @@ void ArraySchema::print() const { for(int i=0; i=0 && i <= attribute_num_); + + return type_sizes_[i]; +} + int ArraySchema::var_attribute_num() const { int var_attribute_num = 0; for(int i=0; i(type); + type_sizes_[i] = compute_type_size(i); } // Load cell_val_num_ cell_val_num_.resize(attribute_num_); @@ -1277,7 +1320,16 @@ int ArraySchema::set_compression(int* compression) { } else { for(int i=0; i #include #include @@ -87,7 +87,7 @@ int tiledb_ctx_init( } // Initialize a Config object - Config* config = new Config(); + StorageManagerConfig* config = new StorageManagerConfig(); if(tiledb_config != NULL) config->init( tiledb_config->home_, diff --git a/core/src/fragment/read_state.cc b/core/src/fragment/read_state.cc index 60417928..aba3c28f 100644 --- a/core/src/fragment/read_state.cc +++ b/core/src/fragment/read_state.cc @@ -32,15 +32,18 @@ #include "utils.h" #include "read_state.h" +#include #include #include #include #include #include #include +#include #include #include #include +#include @@ -1427,6 +1430,247 @@ void ReadState::compute_tile_search_range_hil() { } } +int ReadState::decompress_tile( + int attribute_id, + unsigned char* tile_compressed, + size_t tile_compressed_size, + unsigned char* tile, + size_t tile_size) { + // For easy reference + int compression = array_schema_->compression(attribute_id); + + if(compression == TILEDB_GZIP) + return decompress_tile_gzip( + tile_compressed, + tile_compressed_size, + tile, + tile_size); + else if(compression == TILEDB_ZSTD) + return decompress_tile_zstd( + tile_compressed, + tile_compressed_size, + tile, + tile_size); + else if(compression == TILEDB_LZ4) + return decompress_tile_lz4( + tile_compressed, + tile_compressed_size, + tile, + tile_size); + else if(compression == TILEDB_BLOSC) + return decompress_tile_blosc( + tile_compressed, + tile_compressed_size, + tile, + tile_size, + "blosclz"); + else if(compression == TILEDB_BLOSC_LZ4) + return decompress_tile_blosc( + tile_compressed, + tile_compressed_size, + tile, + tile_size, + "lz4"); + else if(compression == TILEDB_BLOSC_LZ4HC) + return decompress_tile_blosc( + tile_compressed, + tile_compressed_size, + tile, + tile_size, + "lz4hc"); + else if(compression == TILEDB_BLOSC_SNAPPY) + return decompress_tile_blosc( + tile_compressed, + tile_compressed_size, + tile, + tile_size, + "snappy"); + else if(compression == TILEDB_BLOSC_ZLIB) + return decompress_tile_blosc( + tile_compressed, + tile_compressed_size, + tile, + tile_size, + "zlib"); + else if(compression == TILEDB_BLOSC_ZSTD) + return decompress_tile_blosc( + tile_compressed, + tile_compressed_size, + tile, + tile_size, + "zstd"); + else if(compression == TILEDB_RLE) + return decompress_tile_rle( + attribute_id, + tile_compressed, + tile_compressed_size, + tile, + tile_size); + + // Error + assert(0); + return TILEDB_RS_ERR; +} + +int ReadState::decompress_tile_gzip( + unsigned char* tile_compressed, + size_t tile_compressed_size, + unsigned char* tile, + size_t tile_size) { + // Decompress tile + size_t gunzip_out_size; + if(gunzip( + tile_compressed, + tile_compressed_size, + tile, + tile_size, + gunzip_out_size) != TILEDB_UT_OK) { + tiledb_rs_errmsg = tiledb_ut_errmsg; + return TILEDB_RS_ERR; + } + + // Success + return TILEDB_RS_OK; +} + +int ReadState::decompress_tile_zstd( + unsigned char* tile_compressed, + size_t tile_compressed_size, + unsigned char* tile, + size_t tile_size) { + // Decompress tile + size_t zstd_size = + ZSTD_decompress( + tile, + tile_size, + tile_compressed, + tile_compressed_size); + if(ZSTD_isError(zstd_size)) { + std::string errmsg = "Zstandard decompression failed"; + PRINT_ERROR(errmsg); + tiledb_rs_errmsg = TILEDB_RS_ERRMSG + errmsg; + return TILEDB_RS_ERR; + } + + // Success + return TILEDB_RS_OK; +} + +int ReadState::decompress_tile_lz4( + unsigned char* tile_compressed, + size_t tile_compressed_size, + unsigned char* tile, + size_t tile_size) { + // Decompress tile + if(LZ4_decompress_safe( + (const char*) tile_compressed, + (char*) tile, + tile_compressed_size, + tile_size) < 0) { + std::string errmsg = "LZ4 decompression failed"; + PRINT_ERROR(errmsg); + tiledb_rs_errmsg = TILEDB_RS_ERRMSG + errmsg; + return TILEDB_RS_ERR; + } + + // Success + return TILEDB_RS_OK; +} + +int ReadState::decompress_tile_blosc( + unsigned char* tile_compressed, + size_t tile_compressed_size, + unsigned char* tile, + size_t tile_size, + const char* compressor) { + // Initialization + blosc_init(); + + // Decompress tile + if(blosc_decompress( + (const char*) tile_compressed, + (char*) tile, + tile_size) < 0) { + std::string errmsg = "Blosc decompression failed"; + PRINT_ERROR(errmsg); + tiledb_rs_errmsg = TILEDB_RS_ERRMSG + errmsg; + blosc_destroy(); + return TILEDB_RS_ERR; + } + + // Clean up + blosc_destroy(); + + // Success + return TILEDB_RS_OK; +} + +int ReadState::decompress_tile_rle( + int attribute_id, + unsigned char* tile_compressed, + size_t tile_compressed_size, + unsigned char* tile, + size_t tile_size) { + // Special case for search coordinate tiles + if(attribute_id == attribute_num_ + 1) + attribute_id = attribute_num_; + + // For easy reference + const ArraySchema* array_schema = fragment_->array()->array_schema(); + int dim_num = array_schema->dim_num(); + int order = array_schema->cell_order(); + bool is_coords = (attribute_id == attribute_num_); + size_t value_size = + (array_schema->var_size(attribute_id) || is_coords) ? + array_schema->type_size(attribute_id) : + array_schema->cell_size(attribute_id); + + // Decompress tile + int rc; + if(!is_coords) { + rc = RLE_decompress( + (unsigned char*) tile_compressed_, + tile_compressed_size, + tile, + tile_size, + value_size); + } else { + if(order == TILEDB_ROW_MAJOR) { + rc = RLE_decompress_coords_row( + (unsigned char*) tile_compressed_, + tile_compressed_size, + tile, + tile_size, + value_size, + dim_num); + } else if(order == TILEDB_COL_MAJOR) { + rc = RLE_compress_coords_col( + (unsigned char*) tile_compressed_, + tile_compressed_size, + tile, + tile_size, + value_size, + dim_num); + } else { // Error + assert(0); + std::string errmsg = + "Failed decompressing with RLE; unsupported cell order"; + PRINT_ERROR(errmsg); + tiledb_rs_errmsg = TILEDB_RS_ERRMSG + errmsg; + return TILEDB_RS_ERR; + } + } + + // Handle error + if(rc != TILEDB_UT_OK) { + tiledb_rs_errmsg = tiledb_ut_errmsg; + return TILEDB_RS_ERR; + } + + // Success + return TILEDB_RS_OK; +} + template int64_t ReadState::get_cell_pos_after(const T* coords) { // For easy reference @@ -1667,7 +1911,7 @@ bool ReadState::is_empty_attribute(int attribute_id) const { return is_empty_attribute_[attribute_id]; } -int ReadState::map_tile_from_file_cmp_gzip( +int ReadState::map_tile_from_file_cmp( int attribute_id, off_t offset, size_t tile_size) { @@ -1750,7 +1994,7 @@ int ReadState::map_tile_from_file_cmp_gzip( return TILEDB_RS_OK; } -int ReadState::map_tile_from_file_var_cmp_gzip( +int ReadState::map_tile_from_file_var_cmp( int attribute_id, off_t offset, size_t tile_size) { @@ -2008,7 +2252,7 @@ int ReadState::map_tile_from_file_var_cmp_none( } #ifdef HAVE_MPI -int ReadState::mpi_io_read_tile_from_file_cmp_gzip( +int ReadState::mpi_io_read_tile_from_file_cmp( int attribute_id, off_t offset, size_t tile_size) { @@ -2050,7 +2294,7 @@ int ReadState::mpi_io_read_tile_from_file_cmp_gzip( return TILEDB_RS_OK; } -int ReadState::mpi_io_read_tile_from_file_var_cmp_gzip( +int ReadState::mpi_io_read_tile_from_file_var_cmp( int attribute_id, off_t offset, size_t tile_size) { @@ -2098,10 +2342,10 @@ int ReadState::prepare_tile_for_reading( int compression = array_schema_->compression(attribute_id); // Invoke the proper function based on the compression type - if(compression == TILEDB_GZIP) - return prepare_tile_for_reading_cmp_gzip(attribute_id, tile_i); - else + if(compression == TILEDB_NO_COMPRESSION) return prepare_tile_for_reading_cmp_none(attribute_id, tile_i); + else // All compressions + return prepare_tile_for_reading_cmp(attribute_id, tile_i); } int ReadState::prepare_tile_for_reading_var( @@ -2111,13 +2355,13 @@ int ReadState::prepare_tile_for_reading_var( int compression = array_schema_->compression(attribute_id); // Invoke the proper function based on the compression type - if(compression == TILEDB_GZIP) - return prepare_tile_for_reading_var_cmp_gzip(attribute_id, tile_i); - else + if(compression == TILEDB_NO_COMPRESSION) return prepare_tile_for_reading_var_cmp_none(attribute_id, tile_i); + else // All compressions + return prepare_tile_for_reading_var_cmp(attribute_id, tile_i); } -int ReadState::prepare_tile_for_reading_cmp_gzip( +int ReadState::prepare_tile_for_reading_cmp( int attribute_id, int64_t tile_i) { // Return if the tile has already been fetched @@ -2160,18 +2404,18 @@ int ReadState::prepare_tile_for_reading_cmp_gzip( int rc = TILEDB_RS_OK; int read_method = array_->config()->read_method(); if(read_method == TILEDB_IO_READ) { - rc = read_tile_from_file_cmp_gzip( + rc = read_tile_from_file_cmp( attribute_id, file_offset, tile_compressed_size); } else if(read_method == TILEDB_IO_MMAP) { - rc = map_tile_from_file_cmp_gzip( + rc = map_tile_from_file_cmp( attribute_id, file_offset, tile_compressed_size); } else if(read_method == TILEDB_IO_MPI) { #ifdef HAVE_MPI - rc = mpi_io_read_tile_from_file_cmp_gzip( + rc = mpi_io_read_tile_from_file_cmp( attribute_id, file_offset, tile_compressed_size); @@ -2189,21 +2433,15 @@ int ReadState::prepare_tile_for_reading_cmp_gzip( if(rc != TILEDB_RS_OK) return TILEDB_RS_ERR; - // Decompress tile - size_t gunzip_out_size; - if(gunzip( + // Decompress tile + if(decompress_tile( + attribute_id, static_cast(tile_compressed_), tile_compressed_size, static_cast(tiles_[attribute_id]), - full_tile_size, - gunzip_out_size) != TILEDB_UT_OK) { - tiledb_rs_errmsg = tiledb_ut_errmsg; + full_tile_size) != TILEDB_RS_OK) return TILEDB_RS_ERR; - } - - // Sanity check - assert(gunzip_out_size == tile_size); - + // Set the tile size tiles_sizes_[attribute_id] = tile_size; @@ -2269,7 +2507,7 @@ int ReadState::prepare_tile_for_reading_cmp_none( return TILEDB_RS_OK; } -int ReadState::prepare_tile_for_reading_var_cmp_gzip( +int ReadState::prepare_tile_for_reading_var_cmp( int attribute_id, int64_t tile_i) { // Return if the tile has already been fetched @@ -2315,18 +2553,18 @@ int ReadState::prepare_tile_for_reading_var_cmp_gzip( int rc = TILEDB_RS_OK; int read_method = array_->config()->read_method(); if(read_method == TILEDB_IO_READ) { - rc = read_tile_from_file_cmp_gzip( + rc = read_tile_from_file_cmp( attribute_id, file_offset, tile_compressed_size); } else if(read_method == TILEDB_IO_MMAP) { - rc = map_tile_from_file_cmp_gzip( + rc = map_tile_from_file_cmp( attribute_id, file_offset, tile_compressed_size); } else if(read_method == TILEDB_IO_MPI) { #ifdef HAVE_MPI - rc = mpi_io_read_tile_from_file_cmp_gzip( + rc = mpi_io_read_tile_from_file_cmp( attribute_id, file_offset, tile_compressed_size); @@ -2344,20 +2582,14 @@ int ReadState::prepare_tile_for_reading_var_cmp_gzip( if(rc != TILEDB_RS_OK) return TILEDB_RS_ERR; - // Decompress tile - size_t gunzip_out_size; - if(gunzip( + // Decompress tile + if(decompress_tile( + attribute_id, static_cast(tile_compressed_), tile_compressed_size, static_cast(tiles_[attribute_id]), - tile_size, - gunzip_out_size) != TILEDB_UT_OK) { - tiledb_rs_errmsg = tiledb_ut_errmsg; + tile_size) != TILEDB_RS_OK) return TILEDB_RS_ERR; - } - - // Sanity check - assert(gunzip_out_size == tile_size); // Set the tile size tiles_sizes_[attribute_id] = tile_size; @@ -2398,52 +2630,47 @@ int ReadState::prepare_tile_for_reading_var_cmp_gzip( tiles_var_allocated_size_[attribute_id] = tile_var_size; } - // Read tile from file - int rc = TILEDB_RS_OK; - int read_method = array_->config()->read_method(); - if(read_method == TILEDB_IO_READ) { - rc = read_tile_from_file_var_cmp_gzip( - attribute_id, - file_offset, - tile_compressed_size); - } else if(read_method == TILEDB_IO_MMAP) { - rc = map_tile_from_file_var_cmp_gzip( - attribute_id, - file_offset, - tile_compressed_size); - } else if(read_method == TILEDB_IO_MPI) { + // Read tile from file + int rc = TILEDB_RS_OK; + int read_method = array_->config()->read_method(); + if(read_method == TILEDB_IO_READ) { + rc = read_tile_from_file_var_cmp( + attribute_id, + file_offset, + tile_compressed_size); + } else if(read_method == TILEDB_IO_MMAP) { + rc = map_tile_from_file_var_cmp( + attribute_id, + file_offset, + tile_compressed_size); + } else if(read_method == TILEDB_IO_MPI) { #ifdef HAVE_MPI - rc = mpi_io_read_tile_from_file_var_cmp_gzip( - attribute_id, - file_offset, - tile_compressed_size); + rc = mpi_io_read_tile_from_file_var_cmp( + attribute_id, + file_offset, + tile_compressed_size); #else - // Error: MPI not supported - std::string errmsg = - "Cannot prepare variable tile for reading (gzip); MPI not supported"; - PRINT_ERROR(errmsg); - tiledb_rs_errmsg = TILEDB_RS_ERRMSG + errmsg; - return TILEDB_RS_ERR; + // Error: MPI not supported + std::string errmsg = + "Cannot prepare variable tile for reading (gzip); MPI not supported"; + PRINT_ERROR(errmsg); + tiledb_rs_errmsg = TILEDB_RS_ERRMSG + errmsg; + return TILEDB_RS_ERR; #endif - } - - // Error - if(rc != TILEDB_RS_OK) - return TILEDB_RS_ERR; + } - // Decompress tile - if(gunzip( - static_cast(tile_compressed_), - tile_compressed_size, - static_cast(tiles_var_[attribute_id]), - tile_var_size, - gunzip_out_size) != TILEDB_UT_OK) { - tiledb_rs_errmsg = tiledb_ut_errmsg; + // Error + if(rc != TILEDB_RS_OK) return TILEDB_RS_ERR; - } - // Sanity check - assert(gunzip_out_size == tile_var_size); + // Decompress tile + if(decompress_tile( + attribute_id, + static_cast(tile_compressed_), + tile_compressed_size, + static_cast(tiles_var_[attribute_id]), + tile_var_size) != TILEDB_RS_OK) + return TILEDB_RS_ERR; } // Set the variable tile size @@ -2707,7 +2934,7 @@ int ReadState::READ_FROM_TILE_VAR( return TILEDB_RS_OK; } -int ReadState::read_tile_from_file_cmp_gzip( +int ReadState::read_tile_from_file_cmp( int attribute_id, off_t offset, size_t tile_size) { @@ -2718,11 +2945,14 @@ int ReadState::read_tile_from_file_cmp_gzip( // Potentially allocate compressed tile buffer if(tile_compressed_ == NULL) { - size_t full_tile_size = fragment_->tile_size(attribute_id_real); - size_t tile_max_size = - full_tile_size + 6 + 5*(ceil(full_tile_size/16834.0)); - tile_compressed_ = malloc(tile_max_size); - tile_compressed_allocated_size_ = tile_max_size; + tile_compressed_ = malloc(tile_size); + tile_compressed_allocated_size_ = tile_size; + } + + // Potentially expand compressed tile buffer + if(tile_compressed_allocated_size_ < tile_size) { + tile_compressed_ = realloc(tile_compressed_, tile_size); + tile_compressed_allocated_size_ = tile_size; } // Prepare attribute file name @@ -2742,7 +2972,7 @@ int ReadState::read_tile_from_file_cmp_gzip( return TILEDB_RS_OK; } -int ReadState::read_tile_from_file_var_cmp_gzip( +int ReadState::read_tile_from_file_var_cmp( int attribute_id, off_t offset, size_t tile_size) { diff --git a/core/src/fragment/write_state.cc b/core/src/fragment/write_state.cc index e127f224..2bc44b6f 100644 --- a/core/src/fragment/write_state.cc +++ b/core/src/fragment/write_state.cc @@ -35,12 +35,15 @@ #include "utils.h" #include "write_state.h" #include "utils.h" +#include #include #include #include #include +#include #include #include +#include @@ -450,16 +453,80 @@ int WriteState::write(const void** buffers, const size_t* buffer_sizes) { /* PRIVATE METHODS */ /* ****************************** */ -int WriteState::compress_and_write_tile(int attribute_id) { +int WriteState::compress_tile( + int attribute_id, + unsigned char* tile, + size_t tile_size, + size_t& tile_compressed_size) { // For easy reference const ArraySchema* array_schema = fragment_->array()->array_schema(); - unsigned char* tile = static_cast(tiles_[attribute_id]); - size_t tile_size = tile_offsets_[attribute_id]; + int compression = array_schema->compression(attribute_id); - // Trivial case - No in-memory tile - if(tile_size == 0) - return TILEDB_WS_OK; + // Handle different compression + if(compression == TILEDB_GZIP) + return compress_tile_gzip(tile, tile_size, tile_compressed_size); + else if(compression == TILEDB_ZSTD) + return compress_tile_zstd(tile, tile_size, tile_compressed_size); + else if(compression == TILEDB_LZ4) + return compress_tile_lz4(tile, tile_size, tile_compressed_size); + else if(compression == TILEDB_BLOSC) + return compress_tile_blosc( + attribute_id, + tile, + tile_size, + tile_compressed_size, + "blosclz"); + else if(compression == TILEDB_BLOSC_LZ4) + return compress_tile_blosc( + attribute_id, + tile, + tile_size, + tile_compressed_size, + "lz4"); + else if(compression == TILEDB_BLOSC_LZ4HC) + return compress_tile_blosc( + attribute_id, + tile, + tile_size, + tile_compressed_size, + "lz4hc"); + else if(compression == TILEDB_BLOSC_SNAPPY) + return compress_tile_blosc( + attribute_id, + tile, + tile_size, + tile_compressed_size, + "snappy"); + else if(compression == TILEDB_BLOSC_ZLIB) + return compress_tile_blosc( + attribute_id, + tile, + tile_size, + tile_compressed_size, + "zlib"); + else if(compression == TILEDB_BLOSC_ZSTD) + return compress_tile_blosc( + attribute_id, + tile, + tile_size, + tile_compressed_size, + "zstd"); + else if(compression == TILEDB_RLE) + return compress_tile_rle( + attribute_id, + tile, + tile_size, + tile_compressed_size); + // Error + assert(0); + return TILEDB_WS_ERR; +} + +int WriteState::compress_tile_gzip( + unsigned char* tile, + size_t tile_size, + size_t& tile_compressed_size) { // Allocate space to store the compressed tile if(tile_compressed_ == NULL) { tile_compressed_allocated_size_ = @@ -481,12 +548,258 @@ int WriteState::compress_and_write_tile(int attribute_id) { static_cast(tile_compressed_); // Compress tile - ssize_t tile_compressed_size = + ssize_t gzip_size = gzip(tile, tile_size, tile_compressed, tile_compressed_allocated_size_); - if(tile_compressed_size == static_cast(TILEDB_UT_ERR)) { + if(gzip_size == static_cast(TILEDB_UT_ERR)) { tiledb_ws_errmsg = tiledb_ut_errmsg; return TILEDB_WS_ERR; } + tile_compressed_size = (size_t) gzip_size; + + // Success + return TILEDB_WS_OK; +} + +int WriteState::compress_tile_zstd( + unsigned char* tile, + size_t tile_size, + size_t& tile_compressed_size) { + // Allocate space to store the compressed tile + size_t compress_bound = ZSTD_compressBound(tile_size); + if(tile_compressed_ == NULL) { + tile_compressed_allocated_size_ = compress_bound; + tile_compressed_ = malloc(compress_bound); + } + + // Expand comnpressed tile if necessary + if(compress_bound > tile_compressed_allocated_size_) { + tile_compressed_allocated_size_ = compress_bound; + tile_compressed_ = realloc(tile_compressed_, compress_bound); + } + + // For easy reference + unsigned char* tile_compressed = + static_cast(tile_compressed_); + + // Compress tile + size_t zstd_size = + ZSTD_compress( + tile_compressed, + tile_compressed_allocated_size_, + tile, + tile_size, + TILEDB_COMPRESSION_LEVEL_ZSTD); + if(ZSTD_isError(zstd_size)) { + std::string errmsg = "Failed compressing with Zstandard"; + PRINT_ERROR(errmsg); + tiledb_ws_errmsg = TILEDB_WS_ERRMSG + errmsg; + return TILEDB_WS_ERR; + } + tile_compressed_size = zstd_size; + + // Success + return TILEDB_WS_OK; +} + +int WriteState::compress_tile_lz4( + unsigned char* tile, + size_t tile_size, + size_t& tile_compressed_size) { + // Allocate space to store the compressed tile + size_t compress_bound = LZ4_compressBound(tile_size); + if(tile_compressed_ == NULL) { + tile_compressed_allocated_size_ = compress_bound; + tile_compressed_ = malloc(compress_bound); + } + + // Expand comnpressed tile if necessary + if(compress_bound > tile_compressed_allocated_size_) { + tile_compressed_allocated_size_ = compress_bound; + tile_compressed_ = realloc(tile_compressed_, compress_bound); + } + + // Compress tile + int lz4_size = + LZ4_compress( + (const char*) tile, + (char*) tile_compressed_, + tile_size); + if(lz4_size < 0) { + std::string errmsg = "Failed compressing with LZ4"; + PRINT_ERROR(errmsg); + tiledb_ws_errmsg = TILEDB_WS_ERRMSG + errmsg; + return TILEDB_WS_ERR; + } + tile_compressed_size = lz4_size; + + // Success + return TILEDB_WS_OK; +} + +int WriteState::compress_tile_blosc( + int attribute_id, + unsigned char* tile, + size_t tile_size, + size_t& tile_compressed_size, + const char* compressor) { + // For easy reference + const ArraySchema* array_schema = fragment_->array()->array_schema(); + + // Allocate space to store the compressed tile + size_t compress_bound = tile_size + BLOSC_MAX_OVERHEAD; + if(tile_compressed_ == NULL) { + tile_compressed_allocated_size_ = compress_bound; + tile_compressed_ = malloc(compress_bound); + } + + // Expand comnpressed tile if necessary + if(compress_bound > tile_compressed_allocated_size_) { + tile_compressed_allocated_size_ = compress_bound; + tile_compressed_ = realloc(tile_compressed_, compress_bound); + } + + // Initialize Blosc + blosc_init(); + + // Set the appropriate compressor + if(blosc_set_compressor(compressor) < 0) { + std::string errmsg = "Failed to set Blosc compressor"; + PRINT_ERROR(errmsg); + tiledb_ws_errmsg = TILEDB_WS_ERRMSG + errmsg; + blosc_destroy(); + return TILEDB_WS_ERR; + } + + // For easy reference + unsigned char* tile_compressed = + static_cast(tile_compressed_); + + // Compress tile + int blosc_size = + blosc_compress( + TILEDB_COMPRESSION_LEVEL_BLOSC, + 1, + array_schema->type_size(attribute_id), + tile_size, + tile, + tile_compressed, + tile_compressed_allocated_size_); + if(blosc_size < 0) { + std::string errmsg = "Failed compressing with Blosc"; + PRINT_ERROR(errmsg); + tiledb_ws_errmsg = TILEDB_WS_ERRMSG + errmsg; + blosc_destroy(); + return TILEDB_WS_ERR; + } + tile_compressed_size = blosc_size; + + // Clean up + blosc_destroy(); + + // Success + return TILEDB_WS_OK; +} + +int WriteState::compress_tile_rle( + int attribute_id, + unsigned char* tile, + size_t tile_size, + size_t& tile_compressed_size) { + // For easy reference + const ArraySchema* array_schema = fragment_->array()->array_schema(); + int attribute_num = array_schema->attribute_num(); + int dim_num = array_schema->dim_num(); + int order = array_schema->cell_order(); + bool is_coords = (attribute_id == attribute_num); + size_t value_size = + (array_schema->var_size(attribute_id) || is_coords) ? + array_schema->type_size(attribute_id) : + array_schema->cell_size(attribute_id); + + // Allocate space to store the compressed tile + size_t compress_bound; + if(!is_coords) + compress_bound = RLE_compress_bound(tile_size, value_size); + else + compress_bound = RLE_compress_bound_coords(tile_size, value_size, dim_num); + if(tile_compressed_ == NULL) { + tile_compressed_allocated_size_ = compress_bound; + tile_compressed_ = malloc(compress_bound); + } + + // Expand comnpressed tile if necessary + if(compress_bound > tile_compressed_allocated_size_) { + tile_compressed_allocated_size_ = compress_bound; + tile_compressed_ = realloc(tile_compressed_, compress_bound); + } + + // Compress tile + int64_t rle_size; + if(!is_coords) { + rle_size = RLE_compress( + tile, + tile_size, + (unsigned char*) tile_compressed_, + tile_compressed_allocated_size_, + value_size); + } else { + if(order == TILEDB_ROW_MAJOR) { + rle_size = RLE_compress_coords_row( + tile, + tile_size, + (unsigned char*) tile_compressed_, + tile_compressed_allocated_size_, + value_size, + dim_num); + } else if(order == TILEDB_COL_MAJOR) { + rle_size = RLE_compress_coords_col( + tile, + tile_size, + (unsigned char*) tile_compressed_, + tile_compressed_allocated_size_, + value_size, + dim_num); + } else { // Error + assert(0); + std::string errmsg = + "Failed compressing with RLE; unsupported cell order"; + PRINT_ERROR(errmsg); + tiledb_ws_errmsg = TILEDB_WS_ERRMSG + errmsg; + return TILEDB_WS_ERR; + } + } + + // Handle error + if(rle_size == TILEDB_UT_ERR) { + tiledb_ws_errmsg = tiledb_ut_errmsg; + return TILEDB_WS_ERR; + } + + // Set actual output size + tile_compressed_size = (size_t) rle_size; + + // Success + return TILEDB_WS_OK; +} + +int WriteState::compress_and_write_tile(int attribute_id) { + // For easy reference + const ArraySchema* array_schema = fragment_->array()->array_schema(); + unsigned char* tile = static_cast(tiles_[attribute_id]); + size_t tile_size = tile_offsets_[attribute_id]; + + // Trivial case - No in-memory tile + if(tile_size == 0) + return TILEDB_WS_OK; + + // Compress tile + size_t tile_compressed_size; + if(compress_tile( + attribute_id, + tile, + tile_size, + tile_compressed_size) != TILEDB_WS_OK) + return TILEDB_WS_ERR; // Get the attribute file name std::string filename = fragment_->fragment_name() + "/" + @@ -544,33 +857,14 @@ int WriteState::compress_and_write_tile_var(int attribute_id) { return TILEDB_WS_OK; } - // Allocate space to store the compressed tile - if(tile_compressed_ == NULL) { - tile_compressed_allocated_size_ = - tile_size + 6 + 5*(ceil(tile_size/16834.0)); - tile_compressed_ = malloc(tile_compressed_allocated_size_); - } - - // Expand comnpressed tile if necessary - if(tile_size + 6 + 5*(ceil(tile_size/16834.0)) > - tile_compressed_allocated_size_) { - tile_compressed_allocated_size_ = - tile_size + 6 + 5*(ceil(tile_size/16834.0)); - tile_compressed_ = - realloc(tile_compressed_, tile_compressed_allocated_size_); - } - - // For easy reference - unsigned char* tile_compressed = - static_cast(tile_compressed_); - // Compress tile - ssize_t tile_compressed_size = - gzip(tile, tile_size, tile_compressed, tile_compressed_allocated_size_); - if(tile_compressed_size == TILEDB_UT_ERR) { - tiledb_ws_errmsg = tiledb_ut_errmsg; + size_t tile_compressed_size; + if(compress_tile( + attribute_id, + tile, + tile_size, + tile_compressed_size) != TILEDB_WS_OK) return TILEDB_WS_ERR; - } // Get the attribute file name std::string filename = fragment_->fragment_name() + "/" + @@ -823,7 +1117,7 @@ int WriteState::write_last_tile() { // Flush the last tile for each compressed attribute (it is still in main // memory for(int i=0; icompression(i) == TILEDB_GZIP) { + if(array_schema->compression(i) != TILEDB_NO_COMPRESSION) { if(compress_and_write_tile(i) != TILEDB_WS_OK) return TILEDB_WS_ERR; if(array_schema->var_size(i)) { @@ -886,8 +1180,8 @@ int WriteState::write_dense_attr( // No compression if(compression == TILEDB_NO_COMPRESSION) return write_dense_attr_cmp_none(attribute_id, buffer, buffer_size); - else // GZIP - return write_dense_attr_cmp_gzip(attribute_id, buffer, buffer_size); + else // All compressions + return write_dense_attr_cmp(attribute_id, buffer, buffer_size); } int WriteState::write_dense_attr_cmp_none( @@ -935,7 +1229,7 @@ int WriteState::write_dense_attr_cmp_none( return TILEDB_WS_OK; } -int WriteState::write_dense_attr_cmp_gzip( +int WriteState::write_dense_attr_cmp( int attribute_id, const void* buffer, size_t buffer_size) { @@ -1023,13 +1317,19 @@ int WriteState::write_dense_attr_var( buffer_size, buffer_var, buffer_var_size); - else // GZIP - return write_dense_attr_var_cmp_gzip( + else // All compressions + return write_dense_attr_var_cmp( attribute_id, buffer, buffer_size, buffer_var, buffer_var_size); + + // Sanity check + assert(0); + + // Error + return TILEDB_WS_ERR; } int WriteState::write_dense_attr_var_cmp_none( @@ -1126,7 +1426,7 @@ int WriteState::write_dense_attr_var_cmp_none( return TILEDB_WS_OK; } -int WriteState::write_dense_attr_var_cmp_gzip( +int WriteState::write_dense_attr_var_cmp( int attribute_id, const void* buffer, size_t buffer_size, @@ -1363,8 +1663,8 @@ int WriteState::write_sparse_attr( // No compression if(compression == TILEDB_NO_COMPRESSION) return write_sparse_attr_cmp_none(attribute_id, buffer, buffer_size); - else // GZIP - return write_sparse_attr_cmp_gzip(attribute_id, buffer, buffer_size); + else // All compressions + return write_sparse_attr_cmp(attribute_id, buffer, buffer_size); } int WriteState::write_sparse_attr_cmp_none( @@ -1420,7 +1720,7 @@ int WriteState::write_sparse_attr_cmp_none( return TILEDB_WS_OK; } -int WriteState::write_sparse_attr_cmp_gzip( +int WriteState::write_sparse_attr_cmp( int attribute_id, const void* buffer, size_t buffer_size) { @@ -1514,8 +1814,8 @@ int WriteState::write_sparse_attr_var( buffer_size, buffer_var, buffer_var_size); - else // GZIP - return write_sparse_attr_var_cmp_gzip( + else // All compressions + return write_sparse_attr_var_cmp( attribute_id, buffer, buffer_size, @@ -1620,7 +1920,7 @@ int WriteState::write_sparse_attr_var_cmp_none( return TILEDB_WS_OK; } -int WriteState::write_sparse_attr_var_cmp_gzip( +int WriteState::write_sparse_attr_var_cmp( int attribute_id, const void* buffer, size_t buffer_size, @@ -1892,8 +2192,8 @@ int WriteState::write_sparse_unsorted_attr( buffer, buffer_size, cell_pos); - else // GZIP - return write_sparse_unsorted_attr_cmp_gzip( + else // All compressions + return write_sparse_unsorted_attr_cmp( attribute_id, buffer, buffer_size, @@ -1967,7 +2267,7 @@ int WriteState::write_sparse_unsorted_attr_cmp_none( return TILEDB_WS_OK; } -int WriteState::write_sparse_unsorted_attr_cmp_gzip( +int WriteState::write_sparse_unsorted_attr_cmp( int attribute_id, const void* buffer, size_t buffer_size, @@ -1997,7 +2297,7 @@ int WriteState::write_sparse_unsorted_attr_cmp_gzip( for(int64_t i=0; i TILEDB_SORTED_BUFFER_SIZE) { - if(write_sparse_attr_cmp_gzip( + if(write_sparse_attr_cmp( attribute_id, sorted_buffer, sorted_buffer_size) != TILEDB_WS_OK) { @@ -2018,7 +2318,7 @@ int WriteState::write_sparse_unsorted_attr_cmp_gzip( // Write final batch if(sorted_buffer_size != 0) { - if(write_sparse_attr_cmp_gzip( + if(write_sparse_attr_cmp( attribute_id, sorted_buffer, sorted_buffer_size) != TILEDB_WS_OK) { @@ -2054,8 +2354,8 @@ int WriteState::write_sparse_unsorted_attr_var( buffer_var, buffer_var_size, cell_pos); - else // GZIP - return write_sparse_unsorted_attr_var_cmp_gzip( + else // All compressions + return write_sparse_unsorted_attr_var_cmp( attribute_id, buffer, buffer_size, @@ -2158,7 +2458,7 @@ int WriteState::write_sparse_unsorted_attr_var_cmp_none( return TILEDB_WS_OK; } -int WriteState::write_sparse_unsorted_attr_var_cmp_gzip( +int WriteState::write_sparse_unsorted_attr_var_cmp( int attribute_id, const void* buffer, size_t buffer_size, @@ -2201,7 +2501,7 @@ int WriteState::write_sparse_unsorted_attr_var_cmp_gzip( // Write batch if(sorted_buffer_size + cell_size > TILEDB_SORTED_BUFFER_SIZE || sorted_buffer_var_size + cell_var_size > TILEDB_SORTED_BUFFER_VAR_SIZE) { - if(write_sparse_attr_var_cmp_gzip( + if(write_sparse_attr_var_cmp( attribute_id, sorted_buffer, sorted_buffer_size, @@ -2233,7 +2533,7 @@ int WriteState::write_sparse_unsorted_attr_var_cmp_gzip( // Write final batch if(sorted_buffer_size != 0) { - if(write_sparse_attr_var_cmp_gzip( + if(write_sparse_attr_var_cmp( attribute_id, sorted_buffer, sorted_buffer_size, diff --git a/core/src/metadata/metadata.cc b/core/src/metadata/metadata.cc index e249a689..3f5d29f1 100644 --- a/core/src/metadata/metadata.cc +++ b/core/src/metadata/metadata.cc @@ -167,7 +167,7 @@ int Metadata::init( int mode, const char** attributes, int attribute_num, - const Config* config) { + const StorageManagerConfig* config) { // Sanity check on mode if(mode != TILEDB_METADATA_READ && mode != TILEDB_METADATA_WRITE) { diff --git a/core/src/misc/utils.cc b/core/src/misc/utils.cc index 272a5336..4a8af75f 100644 --- a/core/src/misc/utils.cc +++ b/core/src/misc/utils.cc @@ -53,13 +53,15 @@ #include #include #else -#include + #include #endif #include #include #include +#define XSTR(s) STR(s) +#define STR(s) #s @@ -459,7 +461,7 @@ std::string get_mac_addr() { mib[2] = 0; mib[3] = AF_LINK; mib[4] = NET_RT_IFLIST; - if(((mib[5] = if_nametoindex("en0")) == 0) || + if(((mib[5] = if_nametoindex(XSTR(TILEDB_MAC_ADDRESS_INTERFACE))) == 0) || (sysctl(mib, 6, NULL, &len, NULL, 0) < 0)) { std::string errmsg = "Cannot get MAC address"; PRINT_ERROR(errmsg); @@ -491,7 +493,7 @@ std::string get_mac_addr() { int fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); char mac[13]; - strcpy(s.ifr_name, "eth0"); + strcpy(s.ifr_name, XSTR(TILEDB_MAC_ADDRESS_INTERFACE)); if (0 == ioctl(fd, SIOCGIFHWADDR, &s)) { for (int i = 0; i < 6; ++i) sprintf(mac + 2*i, "%02x", (unsigned char) s.ifr_addr.sa_data[i]); @@ -522,7 +524,7 @@ ssize_t gzip( strm.zalloc = Z_NULL; strm.zfree = Z_NULL; strm.opaque = Z_NULL; - ret = deflateInit(&strm, Z_DEFAULT_COMPRESSION); + ret = deflateInit(&strm, TILEDB_COMPRESSION_LEVEL_GZIP); if(ret != Z_OK) { std::string errmsg = "Cannot compress with GZIP"; @@ -1172,6 +1174,641 @@ std::string real_dir(const std::string& dir) { return ret_dir; } +int64_t RLE_compress( + const unsigned char* input, + size_t input_size, + unsigned char* output, + size_t output_allocated_size, + size_t value_size) { + // Initializations + int cur_run_len = 1; + int max_run_len = 65535; + const unsigned char* input_cur = input + value_size; + const unsigned char* input_prev = input; + unsigned char* output_cur = output; + int64_t value_num = input_size / value_size; + int64_t output_size = 0; + size_t run_size = value_size + 2*sizeof(char); + unsigned char byte; + + // Trivial case + if(value_num == 0) + return 0; + + // Sanity check on input buffer + if(input_size % value_size) { + std::string errmsg = + "Failed compressing with RLE; invalid input buffer format"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + + // Make runs + for(int64_t i=1; i output_allocated_size) { + std::string errmsg = + "Failed compressing with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + + // Copy to output buffer + memcpy(output_cur, input_prev, value_size); + output_cur += value_size; + byte = (unsigned char) (cur_run_len >> 8); + memcpy(output_cur, &byte, sizeof(char)); + output_cur += sizeof(char); + byte = (unsigned char) (cur_run_len % 256); + memcpy(output_cur, &byte, sizeof(char)); + output_cur += sizeof(char); + output_size += run_size; + + // Reset current run length + cur_run_len = 1; + } + + // Update run info + input_prev = input_cur; + input_cur = input_prev + value_size; + } + + // Save final run + // --- Sanity check on size + if(output_size + run_size > output_allocated_size) { + std::string errmsg = + "Failed compressing with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + + // --- Copy to output buffer + memcpy(output_cur, input_prev, value_size); + output_cur += value_size; + byte = (unsigned char) (cur_run_len >> 8); + memcpy(output_cur, &byte, sizeof(char)); + output_cur += sizeof(char); + byte = (unsigned char) (cur_run_len % 256); + memcpy(output_cur, &byte, sizeof(char)); + output_cur += sizeof(char); + output_size += run_size; + + // Success + return output_size; +} + +size_t RLE_compress_bound(size_t input_size, size_t value_size) { + // In the worst case, RLE adds two bytes per every value in the buffer. + int64_t value_num = input_size / value_size; + return input_size + value_num * 2; +} + +size_t RLE_compress_bound_coords( + size_t input_size, + size_t value_size, + int dim_num) { + // In the worst case, RLE adds two bytes per every value in the buffer for + // each of its dim_num-1 coordinates (one dimension is never compressed). + // The last sizeof(int64_t) is to record the number of cells compressed. + int64_t cell_num = input_size / (dim_num*value_size); + return input_size + cell_num * (dim_num-1) * 2 + sizeof(int64_t); +} + +int64_t RLE_compress_coords_col( + const unsigned char* input, + size_t input_size, + unsigned char* output, + size_t output_allocated_size, + size_t value_size, + int dim_num) { + // Initializations + int cur_run_len; + int max_run_len = 65535; + const unsigned char* input_cur; + const unsigned char* input_prev = input; + unsigned char* output_cur = output; + size_t coords_size = value_size*dim_num; + size_t run_size = value_size + 2*sizeof(char); + int64_t coords_num = input_size / coords_size; + int64_t output_size = 0; + unsigned char byte; + + // Sanity check on input buffer format + if(input_size % coords_size) { + std::string errmsg = + "failed compressing coordinates with RLE; invalid input buffer format"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = tiledb_ut_errmsg + errmsg; + return TILEDB_UT_ERR; + } + + // Trivial case + if(coords_num == 0) + return 0; + + // Copy the number of coordinates + if(output_size + sizeof(int64_t) > output_allocated_size) { + std::string errmsg = + "failed compressing coordinates with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = tiledb_ut_errmsg + errmsg; + return TILEDB_UT_ERR; + } + memcpy(output_cur, &coords_num, sizeof(int64_t)); + output_cur += sizeof(int64_t); + output_size += sizeof(int64_t); + + // Copy the first dimension intact + // --- Sanity check on size + if(output_size + coords_num*value_size > output_allocated_size) { + std::string errmsg = + "Failed compressing coordinates with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + // --- Copy to output buffer + for(int64_t i=0; i output_allocated_size) { + std::string errmsg = + "Failed compressing coordinates with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + + // Copy to output buffer + memcpy(output_cur, input_prev, value_size); + output_cur += value_size; + byte = (unsigned char) (cur_run_len >> 8); + memcpy(output_cur, &byte, sizeof(char)); + output_cur += sizeof(char); + byte = (unsigned char) (cur_run_len % 256); + memcpy(output_cur, &byte, sizeof(char)); + output_cur += sizeof(char); + output_size += value_size + 2*sizeof(char); + + // Update current run length + cur_run_len = 1; + } + + // Update run info + input_prev = input_cur; + input_cur = input_prev + coords_size; + } + + // Save final run + //--- Sanity check on ouput size + if(output_size + run_size > output_allocated_size) { + std::string errmsg = + "Failed compressing coordinates with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + + // --- Copy to output buffer + memcpy(output_cur, input_prev, value_size); + output_cur += value_size; + byte = (unsigned char) (cur_run_len >> 8); + memcpy(output_cur, &byte, sizeof(char)); + output_cur += sizeof(char); + byte = (unsigned char) (cur_run_len % 256); + memcpy(output_cur, &byte, sizeof(char)); + output_cur += sizeof(char); + output_size += value_size + 2*sizeof(char); + } + + // Success + return output_size; +} + +int64_t RLE_compress_coords_row( + const unsigned char* input, + size_t input_size, + unsigned char* output, + size_t output_allocated_size, + size_t value_size, + int dim_num) { + // Initializations + int cur_run_len; + int max_run_len = 65535; + const unsigned char* input_cur; + const unsigned char* input_prev; + unsigned char* output_cur = output; + size_t coords_size = value_size*dim_num; + int64_t coords_num = input_size / coords_size; + int64_t output_size = 0; + size_t run_size = value_size + 2*sizeof(char); + unsigned char byte; + + // Sanity check on input buffer format + if(input_size % coords_size) { + std::string errmsg = + "failed compressing coordinates with RLE; invalid input buffer format"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = tiledb_ut_errmsg + errmsg; + return TILEDB_UT_ERR; + } + + // Trivial case + if(coords_num == 0) + return 0; + + // Copy the number of coordinates + if(output_size + sizeof(int64_t) > output_allocated_size) { + std::string errmsg = + "failed compressing coordinates with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = tiledb_ut_errmsg + errmsg; + return TILEDB_UT_ERR; + } + memcpy(output_cur, &coords_num, sizeof(int64_t)); + output_cur += sizeof(int64_t); + output_size += sizeof(int64_t); + + // Make runs for each of the first (dim_num-1) dimensions + for(int d=0; d output_allocated_size) { + std::string errmsg = + "Failed compressing coordinates with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + + // Copy to output buffer + memcpy(output_cur, input_prev, value_size); + output_cur += value_size; + byte = (unsigned char) (cur_run_len >> 8); + memcpy(output_cur, &byte, sizeof(char)); + output_cur += sizeof(char); + byte = (unsigned char) (cur_run_len % 256); + memcpy(output_cur, &byte, sizeof(char)); + output_cur += sizeof(char); + output_size += value_size + 2*sizeof(char); + + // Update current run length + cur_run_len = 1; + } + + // Update run info + input_prev = input_cur; + input_cur = input_prev + coords_size; + } + + // Save final run + // --- Sanity check on size + if(output_size + run_size > output_allocated_size) { + std::string errmsg = + "Failed compressing coordinates with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + + // --- Copy to output buffer + memcpy(output_cur, input_prev, value_size); + output_cur += value_size; + byte = (unsigned char) (cur_run_len >> 8); + memcpy(output_cur, &byte, sizeof(char)); + output_cur += sizeof(char); + byte = (unsigned char) (cur_run_len % 256); + memcpy(output_cur, &byte, sizeof(char)); + output_cur += sizeof(char); + output_size += run_size; + } + + // Copy the final dimension intact + // --- Sanity check on size + if(output_size + coords_num*value_size > output_allocated_size) { + std::string errmsg = + "Failed compressing coordinates with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + // --- Copy to output buffer + input_prev = input + (dim_num-1)*value_size; + for(int64_t i=0; i output_allocated_size) { + std::string errmsg = + "Failed decompressing with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + + // Copy to output buffer + for(int64_t j=0; j input_size) { + std::string errmsg = + "Failed decompressing coordinates with RLE; input buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = tiledb_ut_errmsg + errmsg; + return TILEDB_UT_ERR; + } + // --- Copy number of coordinates + memcpy(&coords_num, input_cur, sizeof(int64_t)); + input_cur += sizeof(int64_t); + input_offset += sizeof(int64_t); + + // Trivial case + if(coords_num == 0) + return TILEDB_UT_OK; + + // Copy the first dimension intact + // --- Sanity check on output buffer size + if(coords_num * coords_size > output_allocated_size) { + std::string errmsg = + "Failed decompressing coordinates with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = tiledb_ut_errmsg + errmsg; + return TILEDB_UT_ERR; + } + // --- Sanity check on output buffer size + if(input_offset + coords_num * value_size > input_size) { + std::string errmsg = + "Failed decompressing coordinates with RLE; input buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + // --- Copy first dimension to output + for(int64_t i=0; i input_size) { + std::string errmsg = + "Failed decompressing coordinates with RLE; input buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = tiledb_ut_errmsg + errmsg; + return TILEDB_UT_ERR; + } + // --- Copy number of coordinates + memcpy(&coords_num, input_cur, sizeof(int64_t)); + input_cur += sizeof(int64_t); + input_offset += sizeof(int64_t); + + // Trivial case + if(coords_num == 0) + return TILEDB_UT_OK; + + // Sanity check on output buffer size + if(coords_num * coords_size > output_allocated_size) { + std::string errmsg = + "Failed decompressing coordinates with RLE; output buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = tiledb_ut_errmsg + errmsg; + return TILEDB_UT_ERR; + } + + // Get number of runs + int64_t run_num = + (input_size - input_offset - coords_num * value_size) / run_size; + + // Sanity check on input buffer format + if((input_size - input_offset - coords_num * value_size) % run_size) { + std::string errmsg = + "Failed decompressing coordinates with RLE; " + "invalid input buffer format"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + + // Decompress runs for each of the first (dim_num-1) dimensions + int64_t coords_i = 0; + int d = 0; + for(int64_t i=0; i input_size) { + std::string errmsg = + "Failed decompressing coordinates with RLE; input buffer overflow"; + PRINT_ERROR(errmsg); + tiledb_ut_errmsg = TILEDB_UT_ERRMSG + errmsg; + return TILEDB_UT_ERR; + } + // --- Copy to output buffer + for(int64_t i=0; i value.size()) return false; diff --git a/core/src/storage_manager/storage_manager.cc b/core/src/storage_manager/storage_manager.cc index 883d89fe..8b70d197 100755 --- a/core/src/storage_manager/storage_manager.cc +++ b/core/src/storage_manager/storage_manager.cc @@ -100,7 +100,7 @@ int StorageManager::finalize() { return open_array_mtx_destroy(); } -int StorageManager::init(Config* config) { +int StorageManager::init(StorageManagerConfig* config) { // Set configuration parameters if(config_set(config) != TILEDB_SM_OK) return TILEDB_SM_ERR; @@ -586,7 +586,8 @@ int StorageManager::array_init( delete array_schema; delete array_clone; array = NULL; - array_close(array_dir); + if(array_read_mode(mode)) + array_close(array_dir); return TILEDB_SM_ERR; } @@ -608,7 +609,8 @@ int StorageManager::array_init( delete array_schema; delete array; array = NULL; - array_close(array_dir); + if(array_read_mode(mode)) + array_close(array_dir); tiledb_sm_errmsg = tiledb_as_errmsg; return TILEDB_SM_ERR; } @@ -1695,7 +1697,7 @@ int StorageManager::array_store_schema( return TILEDB_SM_OK; } -int StorageManager::config_set(Config* config) { +int StorageManager::config_set(StorageManagerConfig* config) { // Store config locally config_ = config; @@ -1727,8 +1729,11 @@ int StorageManager::config_set(Config* config) { int StorageManager::consolidation_filelock_create( const std::string& dir) const { + // Create file std::string filename = dir + "/" + TILEDB_SM_CONSOLIDATION_FILELOCK_NAME; int fd = ::open(filename.c_str(), O_WRONLY | O_CREAT | O_SYNC, S_IRWXU); + + // Handle error if(fd == -1) { std::string errmsg = std::string("Cannot create consolidation filelock; ") + strerror(errno); @@ -1737,6 +1742,15 @@ int StorageManager::consolidation_filelock_create( return TILEDB_SM_ERR; } + // Close the file + if(::close(fd)) { + std::string errmsg = + std::string("Cannot close consolidation filelock; ") + strerror(errno); + PRINT_ERROR(errmsg); + tiledb_sm_errmsg = TILEDB_SM_ERRMSG + errmsg; + return TILEDB_SM_ERR; + } + // Success return TILEDB_SM_OK; } diff --git a/core/src/storage_manager/config.cc b/core/src/storage_manager/storage_manager_config.cc similarity index 84% rename from core/src/storage_manager/config.cc rename to core/src/storage_manager/storage_manager_config.cc index c4dc159d..257dd6f5 100644 --- a/core/src/storage_manager/config.cc +++ b/core/src/storage_manager/storage_manager_config.cc @@ -1,5 +1,5 @@ /** - * @file config.cc + * @file storage_manager_config.cc * * @section LICENSE * @@ -27,13 +27,13 @@ * * @section DESCRIPTION * - * This file implements the Config class. + * This file implements the StorageManagerConfig class. */ -#include "config.h" #include "constants.h" +#include "storage_manager_config.h" @@ -42,7 +42,7 @@ /* CONSTRUCTORS & DESTRUCTORS */ /* ****************************** */ -Config::Config() { +StorageManagerConfig::StorageManagerConfig() { // Default values home_ = ""; read_method_ = TILEDB_IO_MMAP; @@ -52,7 +52,7 @@ Config::Config() { #endif } -Config::~Config() { +StorageManagerConfig::~StorageManagerConfig() { } @@ -62,7 +62,7 @@ Config::~Config() { /* MUTATORS */ /* ****************************** */ -void Config::init( +void StorageManagerConfig::init( const char* home, #ifdef HAVE_MPI MPI_Comm* mpi_comm, @@ -101,20 +101,20 @@ void Config::init( /* ACCESSORS */ /* ****************************** */ -const std::string& Config::home() const { +const std::string& StorageManagerConfig::home() const { return home_; } #ifdef HAVE_MPI -MPI_Comm* Config::mpi_comm() const { +MPI_Comm* StorageManagerConfig::mpi_comm() const { return mpi_comm_; } #endif -int Config::read_method() const { +int StorageManagerConfig::read_method() const { return read_method_; } -int Config::write_method() const { +int StorageManagerConfig::write_method() const { return write_method_; } diff --git a/examples/src/tiledb_array_create_dense.cc b/examples/src/tiledb_array_create_dense.cc index a59864c6..e7816f65 100644 --- a/examples/src/tiledb_array_create_dense.cc +++ b/examples/src/tiledb_array_create_dense.cc @@ -55,8 +55,8 @@ int main() { const int compression[] = { TILEDB_GZIP, // a1 - TILEDB_GZIP, // a2 - TILEDB_NO_COMPRESSION, // a3 + TILEDB_ZSTD, // a2 + TILEDB_LZ4, // a3 TILEDB_NO_COMPRESSION // coordinates }; int64_t tile_extents[] = diff --git a/test/include/misc/utils_spec.h b/test/include/misc/utils_spec.h new file mode 100644 index 00000000..29fd2163 --- /dev/null +++ b/test/include/misc/utils_spec.h @@ -0,0 +1,56 @@ +/** + * @file utils_spec.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2016 MIT and Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * Declarations for testing various utility function specs. + */ + +#ifndef __C_UTILS_SPEC_H__ +#define __C_UTILS_SPEC_H__ + +#include "utils.h" +#include + + +/** Test fixture for the utility functions. */ +class UtilsTestFixture: public testing::Test { + + public: + + /* ********************************* */ + /* GTEST FUNCTIONS */ + /* ********************************* */ + + /** Test initialization. */ + virtual void SetUp(); + + /** Test finalization. */ + virtual void TearDown(); +}; + +#endif diff --git a/test/src/misc/utils_spec.cc b/test/src/misc/utils_spec.cc new file mode 100644 index 00000000..d20fe56c --- /dev/null +++ b/test/src/misc/utils_spec.cc @@ -0,0 +1,540 @@ +/** + * @file utils_spec.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2016 MIT and Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * Tests for the utility functions spec. + */ + +#include "utils_spec.h" + + +/* ****************************** */ +/* GTEST FUNCTIONS */ +/* ****************************** */ + +void UtilsTestFixture::SetUp() { +} + +void UtilsTestFixture::TearDown() { +} + + + + +/* ****************************** */ +/* TESTS */ +/* ****************************** */ + +/** Tests RLE compression (attribute). */ +TEST_F(UtilsTestFixture, test_RLE) { + // Initializations + unsigned char input[1000000]; + size_t input_size = 0; + unsigned char compressed[1000000]; + size_t compressed_size = 0; + unsigned char decompressed[1000000]; + size_t decompressed_size = 0; + size_t value_size; + size_t run_size = 6; + size_t compress_bound; + int64_t rc; + + // === Attribute compression (value_size = sizeof(int)) === // + value_size = sizeof(int); + + // Test empty bufffer + rc = RLE_compress(input, input_size, compressed, compressed_size, value_size); + ASSERT_EQ(rc, 0); + + // Test input buffer invalid format + input_size = 5; + rc = RLE_compress(input, input_size, compressed, compressed_size, value_size); + ASSERT_EQ(rc, TILEDB_UT_ERR); + + // Test output buffer overflow + input_size = 16; + rc = RLE_compress(input, input_size, compressed, compressed_size, value_size); + ASSERT_EQ(rc, TILEDB_UT_ERR); + + // Test compress bound + compress_bound = RLE_compress_bound(input_size, value_size); + ASSERT_EQ(compress_bound, input_size + ((input_size/value_size) * 2)); + + // Test all values unique (many unitary runs) + for(int i=0; i<100; ++i) + memcpy(input + i*value_size, &i, value_size); + input_size = 100*value_size; + compressed_size = RLE_compress_bound(input_size, value_size); + rc = RLE_compress(input, input_size, compressed, compressed_size, value_size); + ASSERT_EQ(rc, compressed_size); + decompressed_size = input_size; + rc = RLE_decompress( + compressed, + compressed_size, + decompressed, + decompressed_size, + value_size); + ASSERT_EQ(rc, TILEDB_UT_OK); + ASSERT_FALSE(memcmp(input, decompressed, input_size)); + + // Test all values the same (a single long run) + int v = 111; + for(int i=0; i<100; ++i) + memcpy(input + i*value_size, &v, value_size); + input_size = 100*value_size; + compressed_size = + RLE_compress( + input, + input_size, + compressed, + compressed_size, + value_size); + ASSERT_EQ(compressed_size, run_size); + decompressed_size = input_size; + rc = RLE_decompress( + compressed, + compressed_size, + decompressed, + decompressed_size, + value_size); + ASSERT_EQ(rc, TILEDB_UT_OK); + ASSERT_FALSE(memcmp(input, decompressed, input_size)); + + // Test a mix of short and long runs + for(int i=0; i<10; ++i) + memcpy(input + i*value_size, &i, value_size); + for(int i=10; i<100; ++i) + memcpy(input + i*value_size, &v, value_size); + for(int i=100; i<110; ++i) + memcpy(input + i*value_size, &i, value_size); + input_size = 110*value_size; + compressed_size = RLE_compress_bound(input_size, value_size); + compressed_size = + RLE_compress( + input, + input_size, + compressed, + compressed_size, + value_size); + ASSERT_EQ(compressed_size, 21*run_size); + decompressed_size = input_size; + rc = RLE_decompress( + compressed, + compressed_size, + decompressed, + decompressed_size, + value_size); + ASSERT_EQ(rc, TILEDB_UT_OK); + ASSERT_FALSE(memcmp(input, decompressed, input_size)); + + // Test when a run exceeds max run length + for(int i=0; i<10; ++i) + memcpy(input + i*value_size, &i, value_size); + for(int i=10; i<70010; ++i) + memcpy(input + i*value_size, &v, value_size); + for(int i=70010; i<70030; ++i) + memcpy(input + i*value_size, &i, value_size); + input_size = 70030*value_size; + compressed_size = RLE_compress_bound(input_size, value_size); + compressed_size = + RLE_compress( + input, + input_size, + compressed, + compressed_size, + value_size); + ASSERT_EQ(compressed_size, 32*run_size); + decompressed_size = input_size; + rc = RLE_decompress( + compressed, + compressed_size, + decompressed, + decompressed_size, + value_size); + ASSERT_EQ(rc, TILEDB_UT_OK); + ASSERT_FALSE(memcmp(input, decompressed, input_size)); + + // === Attribute compression (value_size = 2*sizeof(double)) === // + value_size = 2*sizeof(double); + run_size = value_size + 2; + + // Test a mix of short and long runs + double dv = 0.234; + double j = 0.1, k = 0.2; + for(int i=0; i<10; ++i) { + j+= 10000.12; + memcpy(input + 2*i*sizeof(double), &j, value_size); + k+= 1000.12; + memcpy(input + (2*i+1)*sizeof(double), &k, value_size); + } + j+= 10000.12; + k+= 1000.12; + for(int i=10; i<100; ++i) { + memcpy(input + 2*i*sizeof(double), &j, value_size); + memcpy(input + (2*i+1)*sizeof(double), &k, value_size); + } + for(int i=100; i<110; ++i) { + j+= 10000.12; + memcpy(input + 2*i*sizeof(double), &j, value_size); + k+= 1000.12; + memcpy(input + (2*i+1)*sizeof(double), &k, value_size); + } + input_size = 110*value_size; + compressed_size = RLE_compress_bound(input_size, value_size); + compressed_size = + RLE_compress( + input, + input_size, + compressed, + compressed_size, + value_size); + ASSERT_EQ(compressed_size, 21*run_size); + decompressed_size = input_size; + rc = RLE_decompress( + compressed, + compressed_size, + decompressed, + decompressed_size, + value_size); + ASSERT_EQ(rc, TILEDB_UT_OK); + ASSERT_FALSE(memcmp(input, decompressed, input_size)); +} + +/** Tests RLE compression (coordinates, row-major cell order). */ +TEST_F(UtilsTestFixture, test_RLE_coords_row) { + // Initializations + unsigned char input[1000000]; + unsigned char compressed[1000000]; + unsigned char decompressed[1000000]; + size_t input_size = 0; + size_t compressed_size = 0; + size_t decompressed_size = 0; + size_t value_size; + size_t coords_size; + size_t run_size; + size_t compress_bound; + int dim_num = 2; + int rc; + + // === Coordinates compression (row-major) === // + value_size = sizeof(int); + coords_size = dim_num*value_size; + run_size = sizeof(int) + 2*sizeof(char); + + // Test empty bufffer + rc = RLE_compress_coords_row( + input, + input_size, + compressed, + compressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, 0); + + // Test input buffer invalid format + input_size = 5; + rc = RLE_compress_coords_row( + input, + input_size, + compressed, + compressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, TILEDB_UT_ERR); + + // Test output buffer overflow + input_size = 16; + compressed_size = 0; + rc = RLE_compress_coords_row( + input, + input_size, + compressed, + compressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, TILEDB_UT_ERR); + + // Test compress bound + input_size = 64; + compress_bound = RLE_compress_bound_coords(input_size, value_size, dim_num); + int64_t cell_num = input_size / coords_size; + size_t compress_bound_expected = + input_size + cell_num*(dim_num-1)*2 + sizeof(int64_t); + ASSERT_EQ(compress_bound, compress_bound_expected); + + // Test all values unique (many unitary runs) + int v; + for(int i=0; i<100; ++i) { + v = i; + memcpy(input + 2*i*value_size, &v, value_size); + memcpy(input + (2*i+1)*value_size, &i, value_size); + } + input_size = 100*value_size*dim_num; + compressed_size = RLE_compress_bound_coords(input_size, value_size, dim_num); + rc = RLE_compress_coords_row( + input, + input_size, + compressed, + compressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, compressed_size); + decompressed_size = input_size; + rc = RLE_decompress_coords_row( + compressed, + compressed_size, + decompressed, + decompressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, TILEDB_UT_OK); + ASSERT_FALSE(memcmp(input, decompressed, input_size)); + + // Test all values the same (a single long run) + v = 111; + for(int i=0; i<100; ++i) { + memcpy(input + 2*i*value_size, &v, value_size); + memcpy(input + (2*i+1)*value_size, &i, value_size); + } + input_size = 100*value_size*dim_num; + compressed_size = RLE_compress_bound_coords(input_size, value_size, dim_num); + compressed_size = + RLE_compress_coords_row( + input, + input_size, + compressed, + compressed_size, + value_size, + dim_num); + ASSERT_EQ(compressed_size, 100*value_size + run_size + sizeof(int64_t)); + decompressed_size = input_size; + rc = RLE_decompress_coords_row( + compressed, + compressed_size, + decompressed, + decompressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, TILEDB_UT_OK); + ASSERT_FALSE(memcmp(input, decompressed, input_size)); + + // Test a mix of short and long runs + for(int i=0; i<10; ++i) { + v = i; + memcpy(input + 2*i*value_size, &v, value_size); + memcpy(input + (2*i+1)*value_size, &i, value_size); + } + v = 111; + for(int i=10; i<90; ++i) { + memcpy(input + 2*i*value_size, &v, value_size); + memcpy(input + (2*i+1)*value_size, &i, value_size); + } + for(int i=90; i<100; ++i) { + v = i; + memcpy(input + 2*i*value_size, &v, value_size); + memcpy(input + (2*i+1)*value_size, &i, value_size); + } + input_size = 100*value_size*dim_num; + compressed_size = RLE_compress_bound_coords(input_size, value_size, dim_num); + compressed_size = + RLE_compress_coords_row( + input, + input_size, + compressed, + compressed_size, + value_size, + dim_num); + ASSERT_EQ(compressed_size, 100*value_size + 21*run_size + sizeof(int64_t)); + decompressed_size = input_size; + rc = RLE_decompress_coords_row( + compressed, + compressed_size, + decompressed, + decompressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, TILEDB_UT_OK); + ASSERT_FALSE(memcmp(input, decompressed, input_size)); +} + +/** Tests RLE compression (coordinates, column-major cell order). */ +TEST_F(UtilsTestFixture, test_RLE_coords_col) { + // Initializations + unsigned char input[1000000]; + unsigned char compressed[1000000]; + unsigned char decompressed[1000000]; + size_t input_size = 0; + size_t compressed_size = 0; + size_t decompressed_size = 0; + size_t value_size; + size_t coords_size; + size_t run_size; + size_t compress_bound; + int dim_num = 2; + int rc; + + // === Coordinates compression (row-major) === // + value_size = sizeof(int); + coords_size = dim_num*value_size; + run_size = sizeof(int) + 2*sizeof(char); + + // Test empty bufffer + rc = RLE_compress_coords_col( + input, + input_size, + compressed, + compressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, 0); + + // Test input buffer invalid format + input_size = 5; + rc = RLE_compress_coords_col( + input, + input_size, + compressed, + compressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, TILEDB_UT_ERR); + + // Test output buffer overflow + input_size = 16; + compressed_size = 0; + rc = RLE_compress_coords_col( + input, + input_size, + compressed, + compressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, TILEDB_UT_ERR); + + // Test compress bound + input_size = 64; + compress_bound = RLE_compress_bound_coords(input_size, value_size, dim_num); + int64_t cell_num = input_size / coords_size; + size_t compress_bound_expected = + input_size + cell_num*(dim_num-1)*2 + sizeof(int64_t); + ASSERT_EQ(compress_bound, compress_bound_expected); + + // Test all values unique (many unitary runs) + int v; + for(int i=0; i<100; ++i) { + v = i; + memcpy(input + 2*i*value_size, &i, value_size); + memcpy(input + (2*i+1)*value_size, &v, value_size); + } + input_size = 100*value_size*dim_num; + compressed_size = RLE_compress_bound_coords(input_size, value_size, dim_num); + rc = RLE_compress_coords_col( + input, + input_size, + compressed, + compressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, compressed_size); + decompressed_size = input_size; + rc = RLE_decompress_coords_col( + compressed, + compressed_size, + decompressed, + decompressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, TILEDB_UT_OK); + ASSERT_FALSE(memcmp(input, decompressed, input_size)); + + // Test all values the same (a single long run) + v = 111; + for(int i=0; i<100; ++i) { + memcpy(input + 2*i*value_size, &i, value_size); + memcpy(input + (2*i+1)*value_size, &v, value_size); + } + input_size = 100*value_size*dim_num; + compressed_size = RLE_compress_bound_coords(input_size, value_size, dim_num); + compressed_size = + RLE_compress_coords_col( + input, + input_size, + compressed, + compressed_size, + value_size, + dim_num); + ASSERT_EQ(compressed_size, 100*value_size + run_size + sizeof(int64_t)); + decompressed_size = input_size; + rc = RLE_decompress_coords_col( + compressed, + compressed_size, + decompressed, + decompressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, TILEDB_UT_OK); + ASSERT_FALSE(memcmp(input, decompressed, input_size)); + + // Test a mix of short and long runs + for(int i=0; i<10; ++i) { + v = i; + memcpy(input + 2*i*value_size, &i, value_size); + memcpy(input + (2*i+1)*value_size, &v, value_size); + } + v = 111; + for(int i=10; i<90; ++i) { + memcpy(input + 2*i*value_size, &i, value_size); + memcpy(input + (2*i+1)*value_size, &v, value_size); + } + for(int i=90; i<100; ++i) { + v = i; + memcpy(input + 2*i*value_size, &i, value_size); + memcpy(input + (2*i+1)*value_size, &v, value_size); + } + input_size = 100*value_size*dim_num; + compressed_size = RLE_compress_bound_coords(input_size, value_size, dim_num); + compressed_size = + RLE_compress_coords_col( + input, + input_size, + compressed, + compressed_size, + value_size, + dim_num); + ASSERT_EQ(compressed_size, 100*value_size + 21*run_size + sizeof(int64_t)); + decompressed_size = input_size; + rc = RLE_decompress_coords_col( + compressed, + compressed_size, + decompressed, + decompressed_size, + value_size, + dim_num); + ASSERT_EQ(rc, TILEDB_UT_OK); + ASSERT_FALSE(memcmp(input, decompressed, input_size)); +}