diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6fc06284..489e051c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -696,6 +696,7 @@ add_library(
   ${T1HA_SRC}
   ${SHA_SRC}
   mum.cc
+  mum_v3.cc
   jody_hash32.c
   jody_hash64.c
   ${TSIP_SRC}
diff --git a/Hashes.h b/Hashes.h
index 4e8e20e6..0a52b4ff 100644
--- a/Hashes.h
+++ b/Hashes.h
@@ -462,6 +462,7 @@ inline void mum_low_test ( const void * key, int len, uint32_t seed, void * out
   *(uint32_t*)out = (uint32_t)result;
 }
 
+void mum_v3_hash_test(const void * key, int len, uint32_t seed, void * out);
 
 //-----------------------------------------------------------------------------
 
diff --git a/main.cpp b/main.cpp
index f483ec56..373cb556 100644
--- a/main.cpp
+++ b/main.cpp
@@ -630,6 +630,8 @@ HashInfo g_hashes[] =
   {0x0} /* !! and many more. too many */ },
 { mum_low_test, 32, MUMLOW_VERIF,"MUMlow", "github.com/vnmakarov/mum-hash", GOOD,
   {0x11fb062a, 0x3ca9411b, 0x3edd9a7d, 0x41f18860, 0x691457ba} /* !! */ },
+{ mum_v3_hash_test, 64, MUM_VERIF, "MUMv3", "github.com/vnmakarov/mum-hash", POOR,
+  {0x0}},
 { xmsx32_test, 32, 0x6B54E1D4, "xmsx32", "XMSX-32", GOOD, { 0x1505929f, 0xf0a6a74a } },
 #if defined(__GNUC__) && UINT_MAX != ULONG_MAX
 #define MIR_VERIF 0x00A393C8
diff --git a/mum.cc b/mum.cc
index 1dee018c..87139466 100644
--- a/mum.cc
+++ b/mum.cc
@@ -1,3 +1,4 @@
+#define MUM_V1
 #include "mum.h"
 
 void mum_hash_test(const void *key, int len, uint32_t seed, void *out) {
diff --git a/mum.h b/mum.h
index 4c613fbd..72d0be30 100644
--- a/mum.h
+++ b/mum.h
@@ -1,4 +1,5 @@
-/* Copyright (c) 2016 Vladimir Makarov
+/* Copyright (c) 2016, 2017, 2018
+   Vladimir Makarov
 
    Permission is hereby granted, free of charge, to any person
    obtaining a copy of this software and associated documentation
@@ -58,11 +59,7 @@ typedef unsigned __int64 uint64_t;
 
 #ifdef __GNUC__
 #define _MUM_ATTRIBUTE_UNUSED __attribute__((unused))
-# ifdef __clang__
-# define _MUM_OPTIMIZE(opts)
-# else
-# define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
-# endif
+#define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
 #define _MUM_TARGET(opts) __attribute__((__target__ (opts)))
 #else
 #define _MUM_ATTRIBUTE_UNUSED
@@ -83,10 +80,6 @@ typedef unsigned __int64 uint64_t;
 #endif
 #endif
 
-#if defined(__GNUC__) && ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) || (__GNUC__ > 4))
-#define _MUM_FRESH_GCC
-#endif
-
 /* Here are different primes randomly generated with the equal
    probability of their bit values.  They are used to randomize input
    values.  */
@@ -97,7 +90,7 @@ static uint64_t _mum_unroll_prime = 0x7b51ec3d22f7096fULL;
 static uint64_t _mum_tail_prime = 0xaf47d47c99b1461bULL;
 static uint64_t _mum_finish_prime1 = 0xa9a7ae7ceff79f3fULL;
 static uint64_t _mum_finish_prime2 = 0xaf47d47c99b1461bULL;
- 
+
 static uint64_t _mum_primes [] = {
   0X9ebdcae10d981691, 0X32b9b9b97a27ac7d, 0X29b5584d83d35bbd, 0X4b04e0e61401255f,
   0X25e8f7b1f1c9d027, 0X80d4c8c000f3e881, 0Xbd1255431904b9dd, 0X8a3bd4485eee6d81,
@@ -116,7 +109,7 @@ _mum (uint64_t v, uint64_t p) {
      multiplication.  If we use a generic code we actually call a
      function doing 128x128->128 bit multiplication.  The function is
      very slow.  */
-  lo = v * p, hi;
+  lo = v * p;
   asm ("umulh %0, %1, %2" : "=r" (hi) : "r" (v), "r" (p));
 #else
   __uint128_t r = (__uint128_t) v * (__uint128_t) p;
@@ -133,7 +126,7 @@ _mum (uint64_t v, uint64_t p) {
   uint64_t rm_1 = hp * lv;
   uint64_t rl = lv * lp;
   uint64_t t, carry = 0;
- 
+
   /* We could ignore a carry bit here if we did not care about the
      same hash for 32-bit and 64-bit targets.  */
   t = rl + (rm_0 << 32);
@@ -190,6 +183,17 @@ _mum_le32 (uint32_t v) {
 #endif
 }
 
+static inline uint64_t
+_mum_le16 (uint16_t v) {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
+  return v;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  return (v >> 8) | ((v & 0xff) << 8);
+#else
+#error "Unknown endianess"
+#endif
+}
+
 /* Macro defining how many times the most nested loop in
    _mum_hash_aligned will be unrolled by the compiler (although it can
    make an own decision:).  Use only a constant here to help a
@@ -203,8 +207,10 @@ _mum_le32 (uint32_t v) {
 #define _MUM_UNROLL_FACTOR_POWER 3
 #elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
 #define _MUM_UNROLL_FACTOR_POWER 4
-#else
+#elif defined (MUM_V1) || defined (MUM_V2)
 #define _MUM_UNROLL_FACTOR_POWER 2
+#else
+#define _MUM_UNROLL_FACTOR_POWER 3
 #endif
 #endif
 
@@ -216,21 +222,35 @@ _mum_le32 (uint32_t v) {
 
 #define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)
 
+/* Rotate V left by SH.  */
+static inline uint64_t _mum_rotl (uint64_t v, int sh) {
+  return v << sh | v >> (64 - sh);
+}
+
 static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
-_mum_hash_aligned (uint64_t start, const void* key, size_t len) {
+_mum_hash_aligned (uint64_t start, const void *key, size_t len) {
   uint64_t result = start;
-  const unsigned char* str = (const unsigned char*) key;
+  const unsigned char *str = (const unsigned char *) key;
   uint64_t u64;
   size_t i;
   size_t n;
- 
+
+#ifndef MUM_V2
   result = _mum (result, _mum_block_start_prime);
+#endif
   while (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
     /* This loop could be vectorized when we have vector insns for
-       64x64->128-bit multiplication.  AVX2 currently only have a
-       vector insn for 4 32x32->64-bit multiplication.  */
+       64x64->128-bit multiplication.  AVX2 currently only have vector
+       insns for 4 32x32->64-bit multiplication and for 1
+       64x64->128-bit multiplication (pclmulqdq).  */
+#if defined (MUM_V1) || defined (MUM_V2)
     for (i = 0; i < _MUM_UNROLL_FACTOR; i++)
       result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
+#else
+    for (i = 0; i < _MUM_UNROLL_FACTOR; i += 2)
+      result ^= _mum (_mum_le (((uint64_t *) str)[i]) ^ _mum_primes[i],
+                      _mum_le (((uint64_t *) str)[i + 1]) ^ _mum_primes[i + 1]);
+#endif
     len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t);
     str += _MUM_UNROLL_FACTOR * sizeof (uint64_t);
     /* We will use the same prime numbers on the next iterations --
@@ -244,14 +264,12 @@ _mum_hash_aligned (uint64_t start, const void* key, size_t len) {
   switch (len) {
   case 7:
     u64 = _mum_le32 (*(uint32_t *) str);
-    u64 |= (uint64_t) str[4] << 32;
-    u64 |= (uint64_t) str[5] << 40;
+    u64 |= _mum_le16 (*(uint16_t *) (str + 4)) << 32;
     u64 |= (uint64_t) str[6] << 48;
     return result ^ _mum (u64, _mum_tail_prime);
   case 6:
     u64 = _mum_le32 (*(uint32_t *) str);
-    u64 |= (uint64_t) str[4] << 32;
-    u64 |= (uint64_t) str[5] << 40;
+    u64 |= _mum_le16 (*(uint16_t *) (str + 4)) << 32;
     return result ^ _mum (u64, _mum_tail_prime);
   case 5:
     u64 = _mum_le32 (*(uint32_t *) str);
@@ -261,13 +279,11 @@ _mum_hash_aligned (uint64_t start, const void* key, size_t len) {
     u64 = _mum_le32 (*(uint32_t *) str);
     return result ^ _mum (u64, _mum_tail_prime);
   case 3:
-    u64 = str[0];
-    u64 |= (uint64_t) str[1] << 8;
+    u64 = _mum_le16 (*(uint16_t *) str);
     u64 |= (uint64_t) str[2] << 16;
     return result ^ _mum (u64, _mum_tail_prime);
   case 2:
-    u64 = str[0];
-    u64 |= (uint64_t) str[1] << 8;
+    u64 = _mum_le16 (*(uint16_t *) str);
     return result ^ _mum (u64, _mum_tail_prime);
   case 1:
     u64 = str[0];
@@ -279,23 +295,18 @@ _mum_hash_aligned (uint64_t start, const void* key, size_t len) {
 
 /* Final randomization of H.  */
 static inline uint64_t
 _mum_final (uint64_t h) {
+#if defined (MUM_V1)
   h ^= _mum (h, _mum_finish_prime1);
   h ^= _mum (h, _mum_finish_prime2);
+#elif defined (MUM_V2)
+  h ^= _mum_rotl (h, 33);
+  h ^= _mum (h, _mum_finish_prime1);
+#else
+  h = _mum (h, h);
+#endif
   return h;
 }
 
-#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
-
-/* We want to use AVX2 insn MULX instead of generic x86-64 MULQ where
-   it is possible.  Although on modern Intel processors MULQ takes
-   3-cycles vs. 4 for MULX, MULX permits more freedom in insn
-   scheduling as it uses less fixed registers.  */
-static inline uint64_t _MUM_TARGET("arch=haswell")
-_mum_hash_avx2 (const void * key, size_t len, uint64_t seed) {
-  return _mum_final (_mum_hash_aligned (seed + len, key, len));
-}
-#endif
-
 #ifndef _MUM_UNALIGNED_ACCESS
 #if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) \
     || defined(__s390__) || defined(__m32c__) || defined(cris) \
@@ -320,16 +331,16 @@ _mum_hash_avx2 (const void * key, size_t len, uint64_t seed) {
 
 static inline uint64_t
 #if defined(__x86_64__)
-//_MUM_TARGET("inline-all-stringops")
+_MUM_TARGET("inline-all-stringops")
 #endif
 _mum_hash_default (const void *key, size_t len, uint64_t seed) {
   uint64_t result;
   const unsigned char *str = (const unsigned char *) key;
   size_t block_len;
   uint64_t buf[_MUM_BLOCK_LEN / sizeof (uint64_t)];
- 
+
   result = seed + len;
-  if (_MUM_UNALIGNED_ACCESS || ((size_t) str & 0x7) == 0)
+  if (((size_t) str & 0x7) == 0)
     result = _mum_hash_aligned (result, key, len);
   else {
     while (len != 0) {
@@ -347,7 +358,7 @@ static inline uint64_t
 _mum_next_factor (void) {
   uint64_t start = 0;
   int i;
- 
+
   for (i = 0; i < 8; i++)
     start = (start << 8) | rand() % 256;
   return start;
@@ -380,8 +391,7 @@ mum_hash_init (uint64_t seed) {
 
 /* Process data KEY with the state H and return the updated state.  */
 static inline uint64_t
-mum_hash_step (uint64_t h, uint64_t key)
-{
+mum_hash_step (uint64_t h, uint64_t key) {
   return _mum (h, _mum_hash_step_prime) ^ _mum (key, _mum_key_step_prime);
 }
 
@@ -402,19 +412,11 @@ mum_hash64 (uint64_t key, uint64_t seed) {
    target endianess and the unroll factor.  */
 static inline uint64_t
 mum_hash (const void *key, size_t len, uint64_t seed) {
-#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
-  static int avx2_support = 0;
-
-  if (avx2_support > 0)
-    return _mum_hash_avx2 (key, len, seed);
-  else if (! avx2_support) {
-    __builtin_cpu_init ();
-    avx2_support = __builtin_cpu_supports ("avx2") ? 1 : -1;
-    if (avx2_support > 0)
-      return _mum_hash_avx2 (key, len, seed);
-  }
-#endif
+#if _MUM_UNALIGNED_ACCESS
+  return _mum_final (_mum_hash_aligned (seed + len, key, len));
+#else
   return _mum_hash_default (key, len, seed);
+#endif
 }
 
 #endif
diff --git a/mum_v3.cc b/mum_v3.cc
new file mode 100644
index 00000000..7d865d8d
--- /dev/null
+++ b/mum_v3.cc
@@ -0,0 +1,5 @@
+#include "mum.h"
+
+void mum_v3_hash_test(const void *key, int len, uint32_t seed, void *out) {
+  *(uint64_t *)out = mum_hash(key, len, seed);
+}
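
Note for reviewers: the algorithmic change above is concentrated in _mum_hash_aligned's inner loop and in _mum_final. The standalone sketch below is not part of the patch; it only paraphrases what the v1 and v3 variants compute. mum_fold, mum_v1_block, mum_v3_block and mum_v3_final are illustrative names, and the sketch assumes the generic __uint128_t path of mum.h's _mum, which combines the halves of the 128-bit product with an addition.

#include <stdint.h>
#include <stddef.h>

/* 64x64->128-bit multiply, then fold the two halves together.  mum.h's
   _mum uses addition for the fold on the 128-bit path (its comments note
   that addition beats XOR on some microarchitectures). */
static inline uint64_t mum_fold (uint64_t v, uint64_t p) {
  __uint128_t r = (__uint128_t) v * (__uint128_t) p;
  return (uint64_t) (r >> 64) + (uint64_t) r;
}

/* v1 inner loop: one multiply per 64-bit word, each word multiplied by
   its own prime.  n is the unroll factor. */
static uint64_t mum_v1_block (uint64_t result, const uint64_t *w, size_t n,
                              const uint64_t *primes) {
  for (size_t i = 0; i < n; i++)
    result ^= mum_fold (w[i], primes[i]);
  return result;
}

/* v3 inner loop: one multiply per *two* words, with the primes demoted
   to XOR masks on the multiplicands.  Half as many 128-bit multiplies
   per block (n, the unroll factor, is even), which is where v3's
   throughput gain over v1 comes from. */
static uint64_t mum_v3_block (uint64_t result, const uint64_t *w, size_t n,
                              const uint64_t *primes) {
  for (size_t i = 0; i < n; i += 2)
    result ^= mum_fold (w[i] ^ primes[i], w[i + 1] ^ primes[i + 1]);
  return result;
}

/* v3 finalization squares the state (h = _mum (h, h)), replacing v1's
   two multiplies by fixed finish primes and v2's rotate-plus-multiply. */
static uint64_t mum_v3_final (uint64_t h) {
  return mum_fold (h, h);
}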
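A second note on how the pieces fit together: everything in mum.h is static inline, so the version choice is made per translation unit by defining MUM_V1 or MUM_V2 before the include; with neither defined, the header now yields v3 (_MUM_UNROLL_FACTOR_POWER 3, the paired loop, the squaring finalizer). That is presumably why mum.cc gains "#define MUM_V1" -- to keep the existing "MUM" SMHasher entry producing the same values -- while the new mum_v3.cc includes the header bare. A minimal sketch of the pattern, with a hypothetical file and function name:

/* pin_v1.cc -- hypothetical TU that keeps the original algorithm */
#define MUM_V1   /* v1: _MUM_UNROLL_FACTOR_POWER 2, per-word loop,
                    two-prime finalization */
#include "mum.h"

uint64_t hash_v1 (const void *key, size_t len, uint64_t seed) {
  return mum_hash (key, len, seed);
}

Because the two .cc files pin different versions of purely static-inline code, both hashes link into the same SMHasher binary without symbol clashes.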