From 31a475ab8595de84d75a94d1c43d08494103d017 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Mon, 9 Sep 2024 08:50:08 -0400
Subject: [PATCH] LibCompress: Speed up CanonicalCode::read_symbol() slow path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Symbols that need <= 8 bits hit a fast path as of #18075, but
the slow path has done a full binary search over all symbols
ever since this code was added in #2963. (#3405 even added a FIXME
for doing this, but #18075 removed it.)

Instead of doing a binary search over all codes for every single
bit read, this implements the Moffat-Turpin approach described at
https://www.hanshq.net/zip.html#huffdec, which only requires a
table read per bit.

    hyperfine 'Build/lagom/bin/unzip ~/Downloads/enwik8.zip'
    1.008 s ± 0.016 s  =>  957.7 ms ± 3.9 ms, 5% faster

Due to issue #25005, we can't peek the full 15 bits at once but
have to read them one-by-one. This makes the code look a bit
different than in the linked article.

I also tried not changing CanonicalCode::from_bytes() too much.
It does 16 passes over all symbols. I think it could do it in
a single pass instead. But that's for a future change.

No behavior change (other than slightly faster perf).
---
 Userland/Libraries/LibCompress/Deflate.cpp | 35 +++++++++++++++-------
 Userland/Libraries/LibCompress/Deflate.h   |  4 ++-
 2 files changed, 27 insertions(+), 12 deletions(-)
diff --git a/Userland/Libraries/LibCompress/Deflate.cpp b/Userland/Libraries/LibCompress/Deflate.cpp
index a952361443a2564..0554f4dc7a48860 100644
--- a/Userland/Libraries/LibCompress/Deflate.cpp
+++ b/Userland/Libraries/LibCompress/Deflate.cpp
@@ -86,11 +86,17 @@ ErrorOr<CanonicalCode> CanonicalCode::from_bytes(ReadonlyBytes bytes)
     Array<PrefixCode, 1 << CanonicalCode::max_allowed_prefixed_code_length> prefix_codes;
     size_t number_of_prefix_codes = 0;
 
+    code.m_first_symbol_of_length_after.append(0);
+    code.m_offset_to_first_symbol_index.append(0);
+
     auto next_code = 0;
     for (size_t code_length = 1; code_length <= 15; ++code_length) {
         next_code <<= 1;
         auto start_bit = 1 << code_length;
 
+        auto first_code_at_length = next_code;
+        auto first_symbol_index_at_length = code.m_symbol_values.size();
+
         for (size_t symbol = 0; symbol < bytes.size(); ++symbol) {
             if (bytes[symbol] != code_length)
                 continue;
@@ -98,6 +104,8 @@ ErrorOr<CanonicalCode> CanonicalCode::from_bytes(ReadonlyBytes bytes)
             if (next_code > start_bit)
                 return Error::from_string_literal("Failed to decode code lengths");
 
+            code.m_symbol_values.append(symbol);
+
             if (code_length <= CanonicalCode::max_allowed_prefixed_code_length) {
                 if (number_of_prefix_codes >= prefix_codes.size())
                     return Error::from_string_literal("Invalid canonical Huffman code");
@@ -108,9 +116,6 @@ ErrorOr<CanonicalCode> CanonicalCode::from_bytes(ReadonlyBytes bytes)
                 prefix_code.code_length = code_length;
 
                 code.m_max_prefixed_code_length = code_length;
-            } else {
-                code.m_symbol_codes.append(start_bit | next_code);
-                code.m_symbol_values.append(symbol);
             }
 
             if (code.m_bit_codes.size() < symbol + 1) {
@@ -122,6 +127,15 @@ ErrorOr<CanonicalCode> CanonicalCode::from_bytes(ReadonlyBytes bytes)
 
             next_code++;
         }
+
+        u32 sentinel = next_code;
+        code.m_first_symbol_of_length_after.append(sentinel);
+        VERIFY(code.m_first_symbol_of_length_after[code_length] == sentinel);
+
+        if (code.m_symbol_values.size() > first_symbol_index_at_length)
+            code.m_offset_to_first_symbol_index.append(first_symbol_index_at_length - first_code_at_length);
+        else
+            code.m_offset_to_first_symbol_index.append(0); // Never evaluated.
     }
 
     if (next_code != (1 << 15))
@@ -152,15 +166,14 @@ ErrorOr<u32> CanonicalCode::read_symbol(LittleEndianInputBitStream& stream) cons
         return symbol_value;
     }
 
-    auto code_bits = TRY(stream.read_bits<u16>(m_max_prefixed_code_length));
-    code_bits = fast_reverse16(code_bits, m_max_prefixed_code_length);
-    code_bits |= 1 << m_max_prefixed_code_length;
-
-    for (size_t i = m_max_prefixed_code_length; i < 16; ++i) {
-        size_t index;
-        if (binary_search(m_symbol_codes.span(), code_bits, &index))
-            return m_symbol_values[index];
+    auto code_bits = TRY(stream.read_bits<u16>(m_max_prefixed_code_length + 1));
+    code_bits = fast_reverse16(code_bits, m_max_prefixed_code_length + 1);
 
+    for (size_t i = m_max_prefixed_code_length + 1; i <= 15; ++i) {
+        if (code_bits < m_first_symbol_of_length_after[i]) {
+            auto symbol_index = (uint16_t)(m_offset_to_first_symbol_index[i] + code_bits);
+            return m_symbol_values[symbol_index];
+        }
         code_bits = code_bits << 1 | TRY(stream.read_bit());
     }
 
diff --git a/Userland/Libraries/LibCompress/Deflate.h b/Userland/Libraries/LibCompress/Deflate.h
index 20ad4aa4f585b34..4164f72244bdbb5 100644
--- a/Userland/Libraries/LibCompress/Deflate.h
+++ b/Userland/Libraries/LibCompress/Deflate.h
@@ -39,9 +39,11 @@ class CanonicalCode {
     };
 
     // Decompression - indexed by code
-    Vector<u16, 286> m_symbol_codes;
     Vector<u16, 286> m_symbol_values;
 
+    Vector<u32, 16> m_first_symbol_of_length_after;
+    Vector<u16, 16> m_offset_to_first_symbol_index;
+
     Array<PrefixTableEntry, 1 << max_allowed_prefixed_code_length> m_prefix_table {};
     size_t m_max_prefixed_code_length { 0 };