From 38f1737bcfb3943312b97fc09dd6f882e3dd8e9e Mon Sep 17 00:00:00 2001
From: Likith B <62029862+Likith101@users.noreply.github.com>
Date: Thu, 21 Nov 2024 10:45:05 +0530
Subject: [PATCH] MB-61640: Fuzzy Dynamic Scoring (#2056)

- Added levenshtein distance calculation for fuzzy and wildcard searchers
- Added new implementations of certain functions to allow passing of edit distances per term
- Multiplied boosts by inverse of edit distance for score calculation

---------

Co-authored-by: Abhinav Dangeti
---
 go.mod                               |  6 +--
 go.sum                               | 12 ++---
 index_test.go                        | 69 ++++++++++++++++++++++++++++
 search/searcher/search_fuzzy.go      | 16 +++++--
 search/searcher/search_multi_term.go | 64 ++++++++++++++++++++++++++
 5 files changed, 153 insertions(+), 14 deletions(-)

diff --git a/go.mod b/go.mod
index 19e826daa..b5b3a5507 100644
--- a/go.mod
+++ b/go.mod
@@ -5,7 +5,7 @@ go 1.21
 require (
 	github.com/RoaringBitmap/roaring v1.9.3
 	github.com/bits-and-blooms/bitset v1.12.0
-	github.com/blevesearch/bleve_index_api v1.1.12
+	github.com/blevesearch/bleve_index_api v1.1.13
 	github.com/blevesearch/geo v0.1.20
 	github.com/blevesearch/go-faiss v1.0.23
 	github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
@@ -18,13 +18,13 @@ require (
 	github.com/blevesearch/snowballstem v0.9.0
 	github.com/blevesearch/stempel v0.2.0
 	github.com/blevesearch/upsidedown_store_api v1.0.2
-	github.com/blevesearch/vellum v1.0.10
+	github.com/blevesearch/vellum v1.0.11
 	github.com/blevesearch/zapx/v11 v11.3.10
 	github.com/blevesearch/zapx/v12 v12.3.10
 	github.com/blevesearch/zapx/v13 v13.3.10
 	github.com/blevesearch/zapx/v14 v14.3.10
 	github.com/blevesearch/zapx/v15 v15.3.16
-	github.com/blevesearch/zapx/v16 v16.1.8
+	github.com/blevesearch/zapx/v16 v16.1.9-0.20241120170816-85db80035af2
 	github.com/couchbase/moss v0.2.0
 	github.com/golang/protobuf v1.3.2
 	github.com/spf13/cobra v1.7.0
diff --git a/go.sum b/go.sum
index bf94edfae..565d9a56c 100644
--- a/go.sum
+++ b/go.sum
@@ -2,8 +2,8 @@ github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4
 github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
 github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
 github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
-github.com/blevesearch/bleve_index_api v1.1.12 h1:P4bw9/G/5rulOF7SJ9l4FsDoo7UFJ+5kexNy1RXfegY=
-github.com/blevesearch/bleve_index_api v1.1.12/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
+github.com/blevesearch/bleve_index_api v1.1.13 h1:+nrA6oRJr85aCPyqaeZtsruObwKojutfonHJin/BP48=
+github.com/blevesearch/bleve_index_api v1.1.13/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
 github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
 github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
 github.com/blevesearch/go-faiss v1.0.23 h1:Wmc5AFwDLKGl2L6mjLX1Da3vCL0EKa2uHHSorcIS1Uc=
@@ -31,8 +31,8 @@ github.com/blevesearch/stempel v0.2.0 h1:CYzVPaScODMvgE9o+kf6D4RJ/VRomyi9uHF+PtB
 github.com/blevesearch/stempel v0.2.0/go.mod h1:wjeTHqQv+nQdbPuJ/YcvOjTInA2EIc6Ks1FoSUzSLvc=
 github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A=
 github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ=
-github.com/blevesearch/vellum v1.0.10 h1:HGPJDT2bTva12hrHepVT3rOyIKFFF4t7Gf6yMxyMIPI=
-github.com/blevesearch/vellum v1.0.10/go.mod h1:ul1oT0FhSMDIExNjIxHqJoGpVrBpKCdgDQNxfqgJt7k=
+github.com/blevesearch/vellum v1.0.11 h1:SJI97toEFTtA9WsDZxkyGTaBWFdWl1n2LEDCXLCq/AU=
+github.com/blevesearch/vellum v1.0.11/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
 github.com/blevesearch/zapx/v11 v11.3.10 h1:hvjgj9tZ9DeIqBCxKhi70TtSZYMdcFn7gDb71Xo/fvk=
 github.com/blevesearch/zapx/v11 v11.3.10/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ=
 github.com/blevesearch/zapx/v12 v12.3.10 h1:yHfj3vXLSYmmsBleJFROXuO08mS3L1qDCdDK81jDl8s=
@@ -43,8 +43,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7
 github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
 github.com/blevesearch/zapx/v15 v15.3.16 h1:Ct3rv7FUJPfPk99TI/OofdC+Kpb4IdyfdMH48sb+FmE=
 github.com/blevesearch/zapx/v15 v15.3.16/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg=
-github.com/blevesearch/zapx/v16 v16.1.8 h1:Bxzpw6YQpFs7UjoCV1+RvDw6fmAT2GZxldwX8b3wVBM=
-github.com/blevesearch/zapx/v16 v16.1.8/go.mod h1:JqQlOqlRVaYDkpLIl3JnKql8u4zKTNlVEa3nLsi0Gn8=
+github.com/blevesearch/zapx/v16 v16.1.9-0.20241120170816-85db80035af2 h1:+RX9SM7KO7q91E7rFj4NARSsAhKj2EbvdWfzX+ihg/w=
+github.com/blevesearch/zapx/v16 v16.1.9-0.20241120170816-85db80035af2/go.mod h1:zuxVgVaLZ0g4lZvrv06xDc24N6nLCOzXYHVkXI7LMHM=
 github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
 github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
 github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=
diff --git a/index_test.go b/index_test.go
index e5094440d..e75c5fd87 100644
--- a/index_test.go
+++ b/index_test.go
@@ -2953,3 +2953,72 @@ func TestCopyIndex(t *testing.T) {
 		}
 	}
 }
+
+func TestFuzzyScoring(t *testing.T) {
+	tmpIndexPath := createTmpIndexPath(t)
+	defer cleanupTmpIndexPath(t, tmpIndexPath)
+
+	mp := NewIndexMapping()
+	mp.DefaultAnalyzer = "simple"
+	idx, err := New(tmpIndexPath, mp)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Close the index even when a t.Fatal below aborts the test; this defer
+	// runs before the temporary index path is cleaned up.
+	defer func() {
+		err := idx.Close()
+		if err != nil {
+			t.Error(err)
+		}
+	}()
+
+	batch := idx.NewBatch()
+
+	// The indexed terms are at edit distance 0, 1 and 2 from the query "ab".
+	docs := []map[string]interface{}{
+		{
+			"textField": "ab",
+		},
+		{
+			"textField": "abc",
+		},
+		{
+			"textField": "abcd",
+		},
+	}
+
+	for _, doc := range docs {
+		err := batch.Index(fmt.Sprintf("%v", doc["textField"]), doc)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	err = idx.Batch(batch)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	query := NewFuzzyQuery("ab")
+	query.Fuzziness = 2
+	searchRequest := NewSearchRequestOptions(query, 10, 0, true)
+	res, err := idx.Search(searchRequest)
+	if err != nil {
+		// res must not be dereferenced after a failed search.
+		t.Fatal(err)
+	}
+	if len(res.Hits) != len(docs) {
+		t.Fatalf("expected %d hits, got %d", len(docs), len(res.Hits))
+	}
+
+	// The hit at rank i is expected to be the term at edit distance i, whose
+	// boost is scaled by 1/(i+1) relative to the exact match.
+	maxScore := res.Hits[0].Score
+
+	for i, hit := range res.Hits {
+		if maxScore/float64(i+1) != hit.Score {
+			t.Errorf("expected score - %f, got score - %f", maxScore/float64(i+1), hit.Score)
+		}
+	}
+}
diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go
index 35001874b..6c29f845d 100644
--- a/search/searcher/search_fuzzy.go
+++ b/search/searcher/search_fuzzy.go
@@ -78,9 +78,11 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s
 	}
 
 	var candidates []string
+	var editDistances []uint8
 	var dictBytesRead uint64
 	if fuzzyCandidates != nil {
 		candidates = fuzzyCandidates.candidates
+		editDistances = fuzzyCandidates.editDistances
 		dictBytesRead = fuzzyCandidates.bytesRead
 	}
 
@@ -93,8 +95,8 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s
 		}
 	}
 
-	return NewMultiTermSearcher(ctx, indexReader, candidates, field,
-		boost, options, true)
+	return NewMultiTermSearcherBoosted(ctx, indexReader, candidates, field,
+		boost, editDistances, options, true)
 }
 
 func getAutoFuzziness(term string) int {
@@ -113,8 +115,9 @@ func NewAutoFuzzySearcher(ctx context.Context, indexReader index.IndexReader, te
 }
 
 type fuzzyCandidates struct {
-	candidates []string
-	bytesRead  uint64
+	candidates    []string
+	editDistances []uint8
+	bytesRead     uint64
 }
 
 func reportIOStats(ctx context.Context, bytesRead uint64) {
@@ -132,7 +135,8 @@ func reportIOStats(ctx context.Context, bytesRead uint64) {
 func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
 	fuzziness int, field, prefixTerm string) (rv *fuzzyCandidates, err error) {
 	rv = &fuzzyCandidates{
-		candidates: make([]string, 0),
+		candidates:    make([]string, 0),
+		editDistances: make([]uint8, 0),
 	}
 
 	// in case of advanced reader implementations directly call
@@ -151,6 +155,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
 		tfd, err := fieldDict.Next()
 		for err == nil && tfd != nil {
 			rv.candidates = append(rv.candidates, tfd.Term)
+			rv.editDistances = append(rv.editDistances, tfd.EditDistance)
 			if tooManyClauses(len(rv.candidates)) {
 				return nil, tooManyClausesErr(field, len(rv.candidates))
 			}
@@ -185,6 +190,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
 		ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
 		if !exceeded && ld <= fuzziness {
 			rv.candidates = append(rv.candidates, tfd.Term)
+			rv.editDistances = append(rv.editDistances, uint8(ld))
 			if tooManyClauses(len(rv.candidates)) {
 				return nil, tooManyClausesErr(field, len(rv.candidates))
 			}
diff --git a/search/searcher/search_multi_term.go b/search/searcher/search_multi_term.go
index 913f99f55..98f8f92b8 100644
--- a/search/searcher/search_multi_term.go
+++ b/search/searcher/search_multi_term.go
@@ -45,6 +45,37 @@ func NewMultiTermSearcher(ctx context.Context, indexReader index.IndexReader, te
 		options, limit)
 }
 
+// NewMultiTermSearcherBoosted works like NewMultiTermSearcher, but scales the
+// boost of each term by 1/(editDistance+1), where editDistances[i] is the edit
+// distance of terms[i] from the query term, so closer matches score higher.
+// A nil editDistances leaves the boost of every term unchanged.
+//
+// NOTE: when the number of terms trips tooManyClauses and the options are
+// disjunction-optimizable, the optimized searcher is built from the plain
+// boost, so the per-term edit distance scaling is not applied on that path.
+func NewMultiTermSearcherBoosted(ctx context.Context, indexReader index.IndexReader, terms []string,
+	field string, boost float64, editDistances []uint8, options search.SearcherOptions, limit bool) (
+	search.Searcher, error) {
+
+	if tooManyClauses(len(terms)) {
+		if optionsDisjunctionOptimizable(options) {
+			return optimizeMultiTermSearcher(ctx, indexReader, terms, field, boost, options)
+		}
+		if limit {
+			return nil, tooManyClausesErr(field, len(terms))
+		}
+	}
+
+	qsearchers, err := makeBatchSearchersBoosted(ctx, indexReader, terms, field, boost, editDistances, options)
+	if err != nil {
+		return nil, err
+	}
+
+	// build disjunction searcher of these ranges
+	return newMultiTermSearcherInternal(ctx, indexReader, qsearchers, field, boost,
+		options, limit)
+}
+
 func NewMultiTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, terms [][]byte,
 	field string, boost float64, options search.SearcherOptions, limit bool) (
 	search.Searcher, error) {
@@ -151,6 +182,39 @@ func makeBatchSearchers(ctx context.Context, indexReader index.IndexReader, term
 	return qsearchers, nil
 }
 
+// makeBatchSearchersBoosted builds one term searcher per entry of terms. The
+// boost handed to the searcher of terms[i] is scaled by 1/(editDistances[i]+1),
+// so an exact match (distance 0) keeps the full boost. Terms without a matching
+// entry in editDistances (nil or short slice) keep the boost unscaled. On
+// error, every searcher created so far is closed before returning.
+func makeBatchSearchersBoosted(ctx context.Context, indexReader index.IndexReader, terms []string, field string,
+	boost float64, editDistances []uint8, options search.SearcherOptions) ([]search.Searcher, error) {
+
+	qsearchers := make([]search.Searcher, len(terms))
+	qsearchersClose := func() {
+		for _, searcher := range qsearchers {
+			if searcher != nil {
+				_ = searcher.Close()
+			}
+		}
+	}
+	for i, term := range terms {
+		var err error
+		// Default to 1 so that a missing edit distance never zeroes the boost.
+		editMultiplier := 1.0
+		if i < len(editDistances) {
+			// Widen before adding 1: uint8 arithmetic would wrap 255+1 to 0.
+			editMultiplier = 1 / (float64(editDistances[i]) + 1)
+		}
+		qsearchers[i], err = NewTermSearcher(ctx, indexReader, term, field, boost*editMultiplier, options)
+		if err != nil {
+			qsearchersClose()
+			return nil, err
+		}
+	}
+	return qsearchers, nil
+}
+
 func optimizeMultiTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, terms [][]byte,
 	field string, boost float64, options search.SearcherOptions) (
 	search.Searcher, error) {