diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6fc06284..489e051c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -696,6 +696,7 @@ add_library(
   ${T1HA_SRC}
   ${SHA_SRC}
   mum.cc
+  mum_v3.cc
   jody_hash32.c
   jody_hash64.c
   ${TSIP_SRC}
diff --git a/Hashes.h b/Hashes.h
index 4e8e20e6..0a52b4ff 100644
--- a/Hashes.h
+++ b/Hashes.h
@@ -462,6 +462,7 @@ inline void mum_low_test ( const void * key, int len, uint32_t seed, void * out
   *(uint32_t*)out = (uint32_t)result;
 }
 
+void mum_v3_hash_test(const void * key, int len, uint32_t seed, void * out);
 
 //-----------------------------------------------------------------------------
 
diff --git a/main.cpp b/main.cpp
index f483ec56..373cb556 100644
--- a/main.cpp
+++ b/main.cpp
@@ -630,6 +630,8 @@ HashInfo g_hashes[] =
   {0x0} /* !! and many more. too many */ },
 { mum_low_test, 32, MUMLOW_VERIF,"MUMlow", "github.com/vnmakarov/mum-hash", GOOD,
   {0x11fb062a, 0x3ca9411b, 0x3edd9a7d, 0x41f18860, 0x691457ba} /* !! */ },
+{ mum_v3_hash_test, 64, MUM_VERIF, "MUMv3", "github.com/vnmakarov/mum-hash", POOR,
+  {0x0}},
 { xmsx32_test, 32, 0x6B54E1D4, "xmsx32", "XMSX-32", GOOD, { 0x1505929f, 0xf0a6a74a } },
 #if defined(__GNUC__) && UINT_MAX != ULONG_MAX
 #define MIR_VERIF 0x00A393C8
diff --git a/mum.cc b/mum.cc
index 1dee018c..87139466 100644
--- a/mum.cc
+++ b/mum.cc
@@ -1,3 +1,4 @@
+#define MUM_V1
 #include "mum.h"
 
 void mum_hash_test(const void *key, int len, uint32_t seed, void *out) {
diff --git a/mum.h b/mum.h
index 4c613fbd..72d0be30 100644
--- a/mum.h
+++ b/mum.h
@@ -1,4 +1,5 @@
-/* Copyright (c) 2016 Vladimir Makarov
+/* Copyright (c) 2016, 2017, 2018
+   Vladimir Makarov
 
    Permission is hereby granted, free of charge, to any person
    obtaining a copy of this software and associated documentation
@@ -58,11 +59,7 @@ typedef unsigned __int64 uint64_t;
 
 #ifdef __GNUC__
 #define _MUM_ATTRIBUTE_UNUSED __attribute__((unused))
-# ifdef __clang__
-# define _MUM_OPTIMIZE(opts)
-# else
-# define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
-# endif
+#define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
 #define _MUM_TARGET(opts) __attribute__((__target__ (opts)))
 #else
 #define _MUM_ATTRIBUTE_UNUSED
@@ -83,10 +80,6 @@ typedef unsigned __int64 uint64_t;
 #endif
 #endif
 
-#if defined(__GNUC__) && ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) || (__GNUC__ > 4))
-#define _MUM_FRESH_GCC
-#endif
-
 /* Here are different primes randomly generated with the equal
    probability of their bit values.  They are used to randomize input
    values.  */
@@ -97,7 +90,7 @@ static uint64_t _mum_unroll_prime = 0x7b51ec3d22f7096fULL;
 static uint64_t _mum_tail_prime = 0xaf47d47c99b1461bULL;
 static uint64_t _mum_finish_prime1 = 0xa9a7ae7ceff79f3fULL;
 static uint64_t _mum_finish_prime2 = 0xaf47d47c99b1461bULL;
- 
+
 static uint64_t _mum_primes [] = {
   0X9ebdcae10d981691, 0X32b9b9b97a27ac7d, 0X29b5584d83d35bbd, 0X4b04e0e61401255f,
   0X25e8f7b1f1c9d027, 0X80d4c8c000f3e881, 0Xbd1255431904b9dd, 0X8a3bd4485eee6d81,
@@ -116,7 +109,7 @@ _mum (uint64_t v, uint64_t p) {
      multiplication.  If we use a generic code we actually call a
      function doing 128x128->128 bit multiplication.  The function is
      very slow.  */
-  lo = v * p, hi;
+  lo = v * p;
   asm ("umulh %0, %1, %2" : "=r" (hi) : "r" (v), "r" (p));
 #else
   __uint128_t r = (__uint128_t) v * (__uint128_t) p;
@@ -133,7 +126,7 @@ _mum (uint64_t v, uint64_t p) {
   uint64_t rm_1 = hp * lv;
   uint64_t rl = lv * lp;
   uint64_t t, carry = 0;
- 
+
   /* We could ignore a carry bit here if we did not care about the
      same hash for 32-bit and 64-bit targets.  */
   t = rl + (rm_0 << 32);
@@ -190,6 +183,17 @@ _mum_le32 (uint32_t v) {
 #endif
 }
 
+static inline uint64_t
+_mum_le16 (uint16_t v) {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
+  return v;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  return (v >> 8) | ((v & 0xff) << 8);
+#else
+#error "Unknown endianess"
+#endif
+}
+
 /* Macro defining how many times the most nested loop in
    _mum_hash_aligned will be unrolled by the compiler (although it can
    make an own decision:).  Use only a constant here to help a
@@ -203,8 +207,10 @@ _mum_le32 (uint32_t v) {
 #define _MUM_UNROLL_FACTOR_POWER 3
 #elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
 #define _MUM_UNROLL_FACTOR_POWER 4
-#else
+#elif defined (MUM_V1) || defined (MUM_V2)
 #define _MUM_UNROLL_FACTOR_POWER 2
+#else
+#define _MUM_UNROLL_FACTOR_POWER 3
 #endif
 #endif
 
@@ -216,21 +222,35 @@ _mum_le32 (uint32_t v) {
 
 #define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)
 
+/* Rotate V left by SH.  */
+static inline uint64_t _mum_rotl (uint64_t v, int sh) {
+  return v << sh | v >> (64 - sh);
+}
+
 static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
-_mum_hash_aligned (uint64_t start, const void* key, size_t len) {
+_mum_hash_aligned (uint64_t start, const void *key, size_t len) {
   uint64_t result = start;
-  const unsigned char* str = (const unsigned char*) key;
+  const unsigned char *str = (const unsigned char *) key;
   uint64_t u64;
   size_t i;
   size_t n;
- 
+
+#ifndef MUM_V2
   result = _mum (result, _mum_block_start_prime);
+#endif
   while (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
     /* This loop could be vectorized when we have vector insns for
-       64x64->128-bit multiplication.  AVX2 currently only have a
-       vector insn for 4 32x32->64-bit multiplication.  */
+       64x64->128-bit multiplication.  AVX2 currently only have vector
+       insns for 4 32x32->64-bit multiplication and for 1
+       64x64->128-bit multiplication (pclmulqdq).  */
+#if defined (MUM_V1) || defined (MUM_V2)
     for (i = 0; i < _MUM_UNROLL_FACTOR; i++)
       result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
+#else
+    for (i = 0; i < _MUM_UNROLL_FACTOR; i += 2)
+      result ^= _mum (_mum_le (((uint64_t *) str)[i]) ^ _mum_primes[i],
+                      _mum_le (((uint64_t *) str)[i + 1]) ^ _mum_primes[i + 1]);
+#endif
     len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t);
     str += _MUM_UNROLL_FACTOR * sizeof (uint64_t);
     /* We will use the same prime numbers on the next iterations --
@@ -244,14 +264,12 @@ _mum_hash_aligned (uint64_t start, const void* key, size_t len) {
   switch (len) {
   case 7:
     u64 = _mum_le32 (*(uint32_t *) str);
-    u64 |= (uint64_t) str[4] << 32;
-    u64 |= (uint64_t) str[5] << 40;
+    u64 |= _mum_le16 (*(uint16_t *) (str + 4)) << 32;
     u64 |= (uint64_t) str[6] << 48;
     return result ^ _mum (u64, _mum_tail_prime);
   case 6:
     u64 = _mum_le32 (*(uint32_t *) str);
-    u64 |= (uint64_t) str[4] << 32;
-    u64 |= (uint64_t) str[5] << 40;
+    u64 |= _mum_le16 (*(uint16_t *) (str + 4)) << 32;
     return result ^ _mum (u64, _mum_tail_prime);
   case 5:
     u64 = _mum_le32 (*(uint32_t *) str);
@@ -261,13 +279,11 @@ _mum_hash_aligned (uint64_t start, const void* key, size_t len) {
     u64 = _mum_le32 (*(uint32_t *) str);
     return result ^ _mum (u64, _mum_tail_prime);
   case 3:
-    u64 = str[0];
-    u64 |= (uint64_t) str[1] << 8;
+    u64 = _mum_le16 (*(uint16_t *) str);
     u64 |= (uint64_t) str[2] << 16;
     return result ^ _mum (u64, _mum_tail_prime);
   case 2:
-    u64 = str[0];
-    u64 |= (uint64_t) str[1] << 8;
+    u64 = _mum_le16 (*(uint16_t *) str);
     return result ^ _mum (u64, _mum_tail_prime);
   case 1:
     u64 = str[0];
@@ -279,23 +295,18 @@ _mum_hash_aligned (uint64_t start, const void* key, size_t len) {
 
 /* Final randomization of H.  */
 static inline uint64_t
 _mum_final (uint64_t h) {
+#if defined (MUM_V1)
   h ^= _mum (h, _mum_finish_prime1);
   h ^= _mum (h, _mum_finish_prime2);
+#elif defined (MUM_V2)
+  h ^= _mum_rotl (h, 33);
+  h ^= _mum (h, _mum_finish_prime1);
+#else
+  h = _mum (h, h);
+#endif
   return h;
 }
 
-#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
-
-/* We want to use AVX2 insn MULX instead of generic x86-64 MULQ where
-   it is possible.  Although on modern Intel processors MULQ takes
-   3-cycles vs. 4 for MULX, MULX permits more freedom in insn
-   scheduling as it uses less fixed registers.  */
-static inline uint64_t _MUM_TARGET("arch=haswell")
-_mum_hash_avx2 (const void * key, size_t len, uint64_t seed) {
-  return _mum_final (_mum_hash_aligned (seed + len, key, len));
-}
-#endif
-
 #ifndef _MUM_UNALIGNED_ACCESS
 #if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) \
     || defined(__s390__) || defined(__m32c__) || defined(cris) \
@@ -320,16 +331,16 @@ _mum_hash_avx2 (const void * key, size_t len, uint64_t seed) {
 
 static inline uint64_t
 #if defined(__x86_64__)
-//_MUM_TARGET("inline-all-stringops")
+_MUM_TARGET("inline-all-stringops")
 #endif
 _mum_hash_default (const void *key, size_t len, uint64_t seed) {
   uint64_t result;
   const unsigned char *str = (const unsigned char *) key;
   size_t block_len;
   uint64_t buf[_MUM_BLOCK_LEN / sizeof (uint64_t)];
- 
+
   result = seed + len;
-  if (_MUM_UNALIGNED_ACCESS || ((size_t) str & 0x7) == 0)
+  if (((size_t) str & 0x7) == 0)
     result = _mum_hash_aligned (result, key, len);
   else {
     while (len != 0) {
@@ -347,7 +358,7 @@ static inline uint64_t
 _mum_next_factor (void) {
   uint64_t start = 0;
   int i;
- 
+
   for (i = 0; i < 8; i++)
     start = (start << 8) | rand() % 256;
   return start;
@@ -380,8 +391,7 @@ mum_hash_init (uint64_t seed) {
 
 /* Process data KEY with the state H and return the updated state.  */
 static inline uint64_t
-mum_hash_step (uint64_t h, uint64_t key)
-{
+mum_hash_step (uint64_t h, uint64_t key) {
   return _mum (h, _mum_hash_step_prime) ^ _mum (key, _mum_key_step_prime);
 }
 
@@ -402,19 +412,11 @@ mum_hash64 (uint64_t key, uint64_t seed) {
    target endianess and the unroll factor.  */
 static inline uint64_t
 mum_hash (const void *key, size_t len, uint64_t seed) {
-#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
-  static int avx2_support = 0;
-
-  if (avx2_support > 0)
-    return _mum_hash_avx2 (key, len, seed);
-  else if (! avx2_support) {
-    __builtin_cpu_init ();
-    avx2_support = __builtin_cpu_supports ("avx2") ? 1 : -1;
-    if (avx2_support > 0)
-      return _mum_hash_avx2 (key, len, seed);
-  }
-#endif
+#if _MUM_UNALIGNED_ACCESS
+  return _mum_final (_mum_hash_aligned (seed + len, key, len));
+#else
   return _mum_hash_default (key, len, seed);
+#endif
 }
 
 #endif
diff --git a/mum_v3.cc b/mum_v3.cc
new file mode 100644
index 00000000..7d865d8d
--- /dev/null
+++ b/mum_v3.cc
@@ -0,0 +1,5 @@
+#include "mum.h"
+
+void mum_v3_hash_test(const void *key, int len, uint32_t seed, void *out) {
+  *(uint64_t *)out = mum_hash(key, len, seed);
+}
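
Note for reviewers: the algorithmic change above is concentrated in _mum_hash_aligned's inner loop and in _mum_final. The standalone sketch below is not part of the patch; it only paraphrases what the v1 and v3 variants compute. mum_fold, mum_v1_block, mum_v3_block and mum_v3_final are illustrative names, and the sketch assumes the generic __uint128_t path of mum.h's _mum, which combines the halves of the 128-bit product with an addition.

#include <stdint.h>
#include <stddef.h>

/* 64x64->128-bit multiply, then fold the two halves together.  mum.h's
   _mum uses addition for the fold on the 128-bit path (its comments note
   that addition beats XOR on some microarchitectures). */
static inline uint64_t mum_fold (uint64_t v, uint64_t p) {
  __uint128_t r = (__uint128_t) v * (__uint128_t) p;
  return (uint64_t) (r >> 64) + (uint64_t) r;
}

/* v1 inner loop: one multiply per 64-bit word, each word multiplied by
   its own prime.  n is the unroll factor. */
static uint64_t mum_v1_block (uint64_t result, const uint64_t *w, size_t n,
                              const uint64_t *primes) {
  for (size_t i = 0; i < n; i++)
    result ^= mum_fold (w[i], primes[i]);
  return result;
}

/* v3 inner loop: one multiply per *two* words, with the primes demoted
   to XOR masks on the multiplicands.  Half as many 128-bit multiplies
   per block (n, the unroll factor, is even), which is where v3's
   throughput gain over v1 comes from. */
static uint64_t mum_v3_block (uint64_t result, const uint64_t *w, size_t n,
                              const uint64_t *primes) {
  for (size_t i = 0; i < n; i += 2)
    result ^= mum_fold (w[i] ^ primes[i], w[i + 1] ^ primes[i + 1]);
  return result;
}

/* v3 finalization squares the state (h = _mum (h, h)), replacing v1's
   two multiplies by fixed finish primes and v2's rotate-plus-multiply. */
static uint64_t mum_v3_final (uint64_t h) {
  return mum_fold (h, h);
}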
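A second note on how the pieces fit together: everything in mum.h is static inline, so the version choice is made per translation unit by defining MUM_V1 or MUM_V2 before the include; with neither defined, the header now yields v3 (_MUM_UNROLL_FACTOR_POWER 3, the paired loop, the squaring finalizer). That is presumably why mum.cc gains "#define MUM_V1" -- to keep the existing "MUM" SMHasher entry producing the same values -- while the new mum_v3.cc includes the header bare. A minimal sketch of the pattern, with a hypothetical file and function name:

/* pin_v1.cc -- hypothetical TU that keeps the original algorithm */
#define MUM_V1   /* v1: _MUM_UNROLL_FACTOR_POWER 2, per-word loop,
                    two-prime finalization */
#include "mum.h"

uint64_t hash_v1 (const void *key, size_t len, uint64_t seed) {
  return mum_hash (key, len, seed);
}

Because the two .cc files pin different versions of purely static-inline code, both hashes link into the same SMHasher binary without symbol clashes.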