-
Notifications
You must be signed in to change notification settings - Fork 687
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support vector data type, query + searcher (#1857)
+ Add support for indexing and querying vector data (array of float32) + Corresponding to each field of type vector, there will be associated properties like Dimensions and Similarity, which will be used to determine validity of a vector and a criteria for scoring search hits, respectively. + Supports vector reader interface (searcher) and the `knn` construct within the SearchRequest. Related PR: * blevesearch/bleve_index_api#34 --------- Co-authored-by: Abhi Dangeti <[email protected]> Co-authored-by: Aditi Ahuja <[email protected]>
- Loading branch information
1 parent
840fde5
commit 80d9b18
Showing
22 changed files
with
1,348 additions
and
171 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
// Copyright (c) 2023 Couchbase, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
//go:build vectors | ||
// +build vectors | ||
|
||
package document | ||
|
||
import ( | ||
"fmt" | ||
"reflect" | ||
|
||
"github.com/blevesearch/bleve/v2/size" | ||
index "github.com/blevesearch/bleve_index_api" | ||
) | ||
|
||
var reflectStaticSizeVectorField int | ||
|
||
func init() { | ||
var f VectorField | ||
reflectStaticSizeVectorField = int(reflect.TypeOf(f).Size()) | ||
} | ||
|
||
const DefaultVectorIndexingOptions = index.IndexField | ||
|
||
type VectorField struct { | ||
name string | ||
dims int // Dimensionality of the vector | ||
similarity string // Similarity metric to use for scoring | ||
options index.FieldIndexingOptions | ||
value []float32 | ||
numPlainTextBytes uint64 | ||
} | ||
|
||
func (n *VectorField) Size() int { | ||
return reflectStaticSizeVectorField + size.SizeOfPtr + | ||
len(n.name) + | ||
int(numBytesFloat32s(n.value)) | ||
} | ||
|
||
func (n *VectorField) Name() string { | ||
return n.name | ||
} | ||
|
||
func (n *VectorField) ArrayPositions() []uint64 { | ||
return nil | ||
} | ||
|
||
func (n *VectorField) Options() index.FieldIndexingOptions { | ||
return n.options | ||
} | ||
|
||
func (n *VectorField) NumPlainTextBytes() uint64 { | ||
return n.numPlainTextBytes | ||
} | ||
|
||
func (n *VectorField) AnalyzedLength() int { | ||
// vectors aren't analyzed | ||
return 0 | ||
} | ||
|
||
func (n *VectorField) EncodedFieldType() byte { | ||
return 'v' | ||
} | ||
|
||
func (n *VectorField) AnalyzedTokenFrequencies() index.TokenFrequencies { | ||
// vectors aren't analyzed | ||
return nil | ||
} | ||
|
||
func (n *VectorField) Analyze() { | ||
// vectors aren't analyzed | ||
} | ||
|
||
func (n *VectorField) Value() []byte { | ||
return nil | ||
} | ||
|
||
func (n *VectorField) GoString() string { | ||
return fmt.Sprintf("&document.VectorField{Name:%s, Options: %s, "+ | ||
"Value: %+v}", n.name, n.options, n.value) | ||
} | ||
|
||
// For the sake of not polluting the API, we are keeping arrayPositions as a | ||
// parameter, but it is not used. | ||
func NewVectorField(name string, arrayPositions []uint64, | ||
vector []float32, dims int, similarity string) *VectorField { | ||
return NewVectorFieldWithIndexingOptions(name, arrayPositions, | ||
vector, dims, similarity, DefaultVectorIndexingOptions) | ||
} | ||
|
||
// For the sake of not polluting the API, we are keeping arrayPositions as a | ||
// parameter, but it is not used. | ||
func NewVectorFieldWithIndexingOptions(name string, arrayPositions []uint64, | ||
vector []float32, dims int, similarity string, | ||
options index.FieldIndexingOptions) *VectorField { | ||
options = options | DefaultVectorIndexingOptions | ||
|
||
return &VectorField{ | ||
name: name, | ||
dims: dims, | ||
similarity: similarity, | ||
options: options, | ||
value: vector, | ||
numPlainTextBytes: numBytesFloat32s(vector), | ||
} | ||
} | ||
|
||
func numBytesFloat32s(value []float32) uint64 { | ||
return uint64(len(value) * size.SizeOfFloat32) | ||
} | ||
|
||
// ----------------------------------------------------------------------------- | ||
// Following methods help in implementing the bleve_index_api's VectorField | ||
// interface. | ||
|
||
func (n *VectorField) Vector() []float32 { | ||
return n.value | ||
} | ||
|
||
func (n *VectorField) Dims() int { | ||
return n.dims | ||
} | ||
|
||
func (n *VectorField) Similarity() string { | ||
return n.similarity | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
// Copyright (c) 2023 Couchbase, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
//go:build vectors | ||
// +build vectors | ||
|
||
package scorch | ||
|
||
import ( | ||
"bytes" | ||
"context" | ||
"fmt" | ||
"reflect" | ||
|
||
"github.com/blevesearch/bleve/v2/size" | ||
index "github.com/blevesearch/bleve_index_api" | ||
segment_api "github.com/blevesearch/scorch_segment_api/v2" | ||
) | ||
|
||
var reflectStaticSizeIndexSnapshotVectorReader int | ||
|
||
func init() { | ||
var istfr IndexSnapshotVectorReader | ||
reflectStaticSizeIndexSnapshotVectorReader = int(reflect.TypeOf(istfr).Size()) | ||
} | ||
|
||
type IndexSnapshotVectorReader struct { | ||
vector []float32 | ||
field string | ||
k int64 | ||
snapshot *IndexSnapshot | ||
postings []segment_api.VecPostingsList | ||
iterators []segment_api.VecPostingsIterator | ||
segmentOffset int | ||
currPosting segment_api.VecPosting | ||
currID index.IndexInternalID | ||
ctx context.Context | ||
} | ||
|
||
func (i *IndexSnapshotVectorReader) Size() int { | ||
sizeInBytes := reflectStaticSizeIndexSnapshotVectorReader + size.SizeOfPtr + | ||
len(i.vector) + len(i.field) + len(i.currID) | ||
|
||
for _, entry := range i.postings { | ||
sizeInBytes += entry.Size() | ||
} | ||
|
||
for _, entry := range i.iterators { | ||
sizeInBytes += entry.Size() | ||
} | ||
|
||
if i.currPosting != nil { | ||
sizeInBytes += i.currPosting.Size() | ||
} | ||
|
||
return sizeInBytes | ||
} | ||
|
||
func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) ( | ||
*index.VectorDoc, error) { | ||
rv := preAlloced | ||
if rv == nil { | ||
rv = &index.VectorDoc{} | ||
} | ||
|
||
for i.segmentOffset < len(i.iterators) { | ||
next, err := i.iterators[i.segmentOffset].Next() | ||
if err != nil { | ||
return nil, err | ||
} | ||
if next != nil { | ||
// make segment number into global number by adding offset | ||
globalOffset := i.snapshot.offsets[i.segmentOffset] | ||
nnum := next.Number() | ||
rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset) | ||
rv.Score = float64(next.Score()) | ||
|
||
i.currID = rv.ID | ||
i.currPosting = next | ||
|
||
return rv, nil | ||
} | ||
i.segmentOffset++ | ||
} | ||
|
||
return nil, nil | ||
} | ||
|
||
func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID, | ||
preAlloced *index.VectorDoc) (*index.VectorDoc, error) { | ||
|
||
if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { | ||
i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k) | ||
if err != nil { | ||
return nil, err | ||
} | ||
// close the current term field reader before replacing it with a new one | ||
_ = i.Close() | ||
*i = *(i2.(*IndexSnapshotVectorReader)) | ||
} | ||
|
||
num, err := docInternalToNumber(ID) | ||
if err != nil { | ||
return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err) | ||
} | ||
segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num) | ||
if segIndex >= len(i.snapshot.segment) { | ||
return nil, fmt.Errorf("computed segment index %d out of bounds %d", | ||
segIndex, len(i.snapshot.segment)) | ||
} | ||
// skip directly to the target segment | ||
i.segmentOffset = segIndex | ||
next, err := i.iterators[i.segmentOffset].Advance(ldocNum) | ||
if err != nil { | ||
return nil, err | ||
} | ||
if next == nil { | ||
// we jumped directly to the segment that should have contained it | ||
// but it wasn't there, so reuse Next() which should correctly | ||
// get the next hit after it (we moved i.segmentOffset) | ||
return i.Next(preAlloced) | ||
} | ||
|
||
if preAlloced == nil { | ||
preAlloced = &index.VectorDoc{} | ||
} | ||
preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ | ||
i.snapshot.offsets[segIndex]) | ||
i.currID = preAlloced.ID | ||
i.currPosting = next | ||
return preAlloced, nil | ||
} | ||
|
||
func (i *IndexSnapshotVectorReader) Count() uint64 { | ||
var rv uint64 | ||
for _, posting := range i.postings { | ||
rv += posting.Count() | ||
} | ||
return rv | ||
} | ||
|
||
func (i *IndexSnapshotVectorReader) Close() error { | ||
// TODO Consider if any scope of recycling here. | ||
return nil | ||
} |
Oops, something went wrong.