Skip to content

Commit

Permalink
Support vector data type, query + searcher (#1857)
Browse files Browse the repository at this point in the history
+ Add support for indexing and querying
  vector data (array of float32)
+ Corresponding to each field of type vector,
  there will be associated properties like
  Dimensions and Similarity, which will be used to
  determine validity of a vector and a criteria for
  scoring search hits, respectively.
+ Supports vector reader interface (searcher) and
  the `knn` construct within the SearchRequest.

Related PR: 
* blevesearch/bleve_index_api#34

---------

Co-authored-by: Abhi Dangeti <[email protected]>
Co-authored-by: Aditi Ahuja <[email protected]>
  • Loading branch information
3 people authored Nov 3, 2023
1 parent 840fde5 commit 80d9b18
Show file tree
Hide file tree
Showing 22 changed files with 1,348 additions and 171 deletions.
138 changes: 138 additions & 0 deletions document/field_vector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build vectors
// +build vectors

package document

import (
"fmt"
"reflect"

"github.com/blevesearch/bleve/v2/size"
index "github.com/blevesearch/bleve_index_api"
)

var reflectStaticSizeVectorField int

func init() {
var f VectorField
reflectStaticSizeVectorField = int(reflect.TypeOf(f).Size())
}

const DefaultVectorIndexingOptions = index.IndexField

type VectorField struct {
name string
dims int // Dimensionality of the vector
similarity string // Similarity metric to use for scoring
options index.FieldIndexingOptions
value []float32
numPlainTextBytes uint64
}

func (n *VectorField) Size() int {
return reflectStaticSizeVectorField + size.SizeOfPtr +
len(n.name) +
int(numBytesFloat32s(n.value))
}

func (n *VectorField) Name() string {
return n.name
}

func (n *VectorField) ArrayPositions() []uint64 {
return nil
}

func (n *VectorField) Options() index.FieldIndexingOptions {
return n.options
}

func (n *VectorField) NumPlainTextBytes() uint64 {
return n.numPlainTextBytes
}

func (n *VectorField) AnalyzedLength() int {
// vectors aren't analyzed
return 0
}

func (n *VectorField) EncodedFieldType() byte {
return 'v'
}

func (n *VectorField) AnalyzedTokenFrequencies() index.TokenFrequencies {
// vectors aren't analyzed
return nil
}

func (n *VectorField) Analyze() {
// vectors aren't analyzed
}

func (n *VectorField) Value() []byte {
return nil
}

func (n *VectorField) GoString() string {
return fmt.Sprintf("&document.VectorField{Name:%s, Options: %s, "+
"Value: %+v}", n.name, n.options, n.value)
}

// For the sake of not polluting the API, we are keeping arrayPositions as a
// parameter, but it is not used.
func NewVectorField(name string, arrayPositions []uint64,
vector []float32, dims int, similarity string) *VectorField {
return NewVectorFieldWithIndexingOptions(name, arrayPositions,
vector, dims, similarity, DefaultVectorIndexingOptions)
}

// For the sake of not polluting the API, we are keeping arrayPositions as a
// parameter, but it is not used.
func NewVectorFieldWithIndexingOptions(name string, arrayPositions []uint64,
vector []float32, dims int, similarity string,
options index.FieldIndexingOptions) *VectorField {
options = options | DefaultVectorIndexingOptions

return &VectorField{
name: name,
dims: dims,
similarity: similarity,
options: options,
value: vector,
numPlainTextBytes: numBytesFloat32s(vector),
}
}

func numBytesFloat32s(value []float32) uint64 {
return uint64(len(value) * size.SizeOfFloat32)
}

// -----------------------------------------------------------------------------
// Following methods help in implementing the bleve_index_api's VectorField
// interface.

func (n *VectorField) Vector() []float32 {
return n.value
}

func (n *VectorField) Dims() int {
return n.dims
}

func (n *VectorField) Similarity() string {
return n.similarity
}
41 changes: 12 additions & 29 deletions geo/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ import (
"reflect"
"strconv"
"strings"

"github.com/blevesearch/bleve/v2/util"
)

// ExtractGeoPoint takes an arbitrary interface{} and tries it's best to
Expand Down Expand Up @@ -61,12 +63,12 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
first := thingVal.Index(0)
if first.CanInterface() {
firstVal := first.Interface()
lon, foundLon = extractNumericVal(firstVal)
lon, foundLon = util.ExtractNumericValFloat64(firstVal)
}
second := thingVal.Index(1)
if second.CanInterface() {
secondVal := second.Interface()
lat, foundLat = extractNumericVal(secondVal)
lat, foundLat = util.ExtractNumericValFloat64(secondVal)
}
}
}
Expand Down Expand Up @@ -105,12 +107,12 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
// is it a map
if l, ok := thing.(map[string]interface{}); ok {
if lval, ok := l["lon"]; ok {
lon, foundLon = extractNumericVal(lval)
lon, foundLon = util.ExtractNumericValFloat64(lval)
} else if lval, ok := l["lng"]; ok {
lon, foundLon = extractNumericVal(lval)
lon, foundLon = util.ExtractNumericValFloat64(lval)
}
if lval, ok := l["lat"]; ok {
lat, foundLat = extractNumericVal(lval)
lat, foundLat = util.ExtractNumericValFloat64(lval)
}
}

Expand All @@ -121,19 +123,19 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
if strings.HasPrefix(strings.ToLower(fieldName), "lon") {
if thingVal.Field(i).CanInterface() {
fieldVal := thingVal.Field(i).Interface()
lon, foundLon = extractNumericVal(fieldVal)
lon, foundLon = util.ExtractNumericValFloat64(fieldVal)
}
}
if strings.HasPrefix(strings.ToLower(fieldName), "lng") {
if thingVal.Field(i).CanInterface() {
fieldVal := thingVal.Field(i).Interface()
lon, foundLon = extractNumericVal(fieldVal)
lon, foundLon = util.ExtractNumericValFloat64(fieldVal)
}
}
if strings.HasPrefix(strings.ToLower(fieldName), "lat") {
if thingVal.Field(i).CanInterface() {
fieldVal := thingVal.Field(i).Interface()
lat, foundLat = extractNumericVal(fieldVal)
lat, foundLat = util.ExtractNumericValFloat64(fieldVal)
}
}
}
Expand All @@ -157,25 +159,6 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
return lon, lat, foundLon && foundLat
}

// extract numeric value (if possible) and returns a float64
func extractNumericVal(v interface{}) (float64, bool) {
val := reflect.ValueOf(v)
if !val.IsValid() {
return 0, false
}
typ := val.Type()
switch typ.Kind() {
case reflect.Float32, reflect.Float64:
return val.Float(), true
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
return float64(val.Int()), true
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
return float64(val.Uint()), true
}

return 0, false
}

// various support interfaces which can be used to find lat/lon
type loner interface {
Lon() float64
Expand Down Expand Up @@ -209,12 +192,12 @@ func extractCoordinates(thing interface{}) []float64 {
first := thingVal.Index(0)
if first.CanInterface() {
firstVal := first.Interface()
lon, foundLon = extractNumericVal(firstVal)
lon, foundLon = util.ExtractNumericValFloat64(firstVal)
}
second := thingVal.Index(1)
if second.CanInterface() {
secondVal := second.Interface()
lat, foundLat = extractNumericVal(secondVal)
lat, foundLat = util.ExtractNumericValFloat64(secondVal)
}

if !foundLon || !foundLat {
Expand Down
156 changes: 156 additions & 0 deletions index/scorch/snapshot_index_vr.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build vectors
// +build vectors

package scorch

import (
"bytes"
"context"
"fmt"
"reflect"

"github.com/blevesearch/bleve/v2/size"
index "github.com/blevesearch/bleve_index_api"
segment_api "github.com/blevesearch/scorch_segment_api/v2"
)

var reflectStaticSizeIndexSnapshotVectorReader int

func init() {
var istfr IndexSnapshotVectorReader
reflectStaticSizeIndexSnapshotVectorReader = int(reflect.TypeOf(istfr).Size())
}

type IndexSnapshotVectorReader struct {
vector []float32
field string
k int64
snapshot *IndexSnapshot
postings []segment_api.VecPostingsList
iterators []segment_api.VecPostingsIterator
segmentOffset int
currPosting segment_api.VecPosting
currID index.IndexInternalID
ctx context.Context
}

func (i *IndexSnapshotVectorReader) Size() int {
sizeInBytes := reflectStaticSizeIndexSnapshotVectorReader + size.SizeOfPtr +
len(i.vector) + len(i.field) + len(i.currID)

for _, entry := range i.postings {
sizeInBytes += entry.Size()
}

for _, entry := range i.iterators {
sizeInBytes += entry.Size()
}

if i.currPosting != nil {
sizeInBytes += i.currPosting.Size()
}

return sizeInBytes
}

func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) (
*index.VectorDoc, error) {
rv := preAlloced
if rv == nil {
rv = &index.VectorDoc{}
}

for i.segmentOffset < len(i.iterators) {
next, err := i.iterators[i.segmentOffset].Next()
if err != nil {
return nil, err
}
if next != nil {
// make segment number into global number by adding offset
globalOffset := i.snapshot.offsets[i.segmentOffset]
nnum := next.Number()
rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset)
rv.Score = float64(next.Score())

i.currID = rv.ID
i.currPosting = next

return rv, nil
}
i.segmentOffset++
}

return nil, nil
}

func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID,
preAlloced *index.VectorDoc) (*index.VectorDoc, error) {

if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 {
i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k)
if err != nil {
return nil, err
}
// close the current term field reader before replacing it with a new one
_ = i.Close()
*i = *(i2.(*IndexSnapshotVectorReader))
}

num, err := docInternalToNumber(ID)
if err != nil {
return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err)
}
segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num)
if segIndex >= len(i.snapshot.segment) {
return nil, fmt.Errorf("computed segment index %d out of bounds %d",
segIndex, len(i.snapshot.segment))
}
// skip directly to the target segment
i.segmentOffset = segIndex
next, err := i.iterators[i.segmentOffset].Advance(ldocNum)
if err != nil {
return nil, err
}
if next == nil {
// we jumped directly to the segment that should have contained it
// but it wasn't there, so reuse Next() which should correctly
// get the next hit after it (we moved i.segmentOffset)
return i.Next(preAlloced)
}

if preAlloced == nil {
preAlloced = &index.VectorDoc{}
}
preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+
i.snapshot.offsets[segIndex])
i.currID = preAlloced.ID
i.currPosting = next
return preAlloced, nil
}

func (i *IndexSnapshotVectorReader) Count() uint64 {
var rv uint64
for _, posting := range i.postings {
rv += posting.Count()
}
return rv
}

func (i *IndexSnapshotVectorReader) Close() error {
// TODO Consider if any scope of recycling here.
return nil
}
Loading

0 comments on commit 80d9b18

Please sign in to comment.