Skip to content

Commit

Permalink
Redesign vector index
Browse files Browse the repository at this point in the history
This is an implementation of TuGraph-family#819

Signed-off-by: Junwang Zhao <[email protected]>
  • Loading branch information
zhjwpku committed Dec 16, 2024
1 parent 78e62fc commit 061f9b4
Show file tree
Hide file tree
Showing 24 changed files with 3,541 additions and 182 deletions.
14 changes: 14 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ add_library(graphdb STATIC
src/graphdb/graph_entity.cpp
src/graphdb/meta_info.cpp
src/graphdb/index.cpp
src/graphdb/uuid_generator.cpp
src/graphdb/embedding/delete_map.cpp
src/graphdb/embedding/delta.cpp
src/graphdb/embedding/faiss_index_chunk.cpp
src/graphdb/embedding/id_mapper.cpp
src/graphdb/embedding/index.cpp
src/graphdb/embedding/mem_index_chunk.cpp
src/graphdb/embedding/vsag_index_chunk.cpp
)

add_library(common STATIC
Expand Down Expand Up @@ -131,6 +139,10 @@ set(THIRD_STATIC_LIBS
date-tz.a
${FT_INDEX_LIB}
antlr4-runtime.a
faiss.a
openblas.a
gomp
gfortran
)

if (NOT ENABLE_ASAN)
Expand Down Expand Up @@ -226,5 +238,7 @@ install(FILES
/usr/local/lib64/libvsag.so
/lib64/libgfortran.so.5
/lib64/libgfortran.so.5.0.0
/lib64/libgomp.so.1
/lib64/libgomp.so.1.0.0
DESTINATION lib64/lgraph)

3 changes: 2 additions & 1 deletion src/common/exceptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ X(VectorIndexException, "Vector index exception.") \
X(VertexVectorIndexAlreadyExist, "Vertex vector index already exist.") \
X(VertexFullTextIndexAlreadyExist, "Vertex fulltext index already exist.") \
X(ConnectionDisconnected, "Connection has been disconnected.") \
X(Unimplemented, "Unimplemented.")
X(Unimplemented, "Unimplemented.") \
X(IOException, "IO exception.")

enum class ErrorCode {
#define X(code, msg) code,
Expand Down
105 changes: 105 additions & 0 deletions src/graphdb/embedding/delete_map.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/**
* Copyright 2024 AntGroup CO., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
* Author:
* Junwang Zhao <[email protected]>
*/

#include "delete_map.h"

#include <filesystem>
#include <fstream>

#include "common/exceptions.h"

namespace graphdb {

namespace embedding {

namespace {

// Deletion-count threshold: DeleteMap::SetDelete builds the bitmap once the
// number of deleted ids exceeds this value.
constexpr size_t SPARSE_FORMAT_MAX_DELETE = 2048;

// Number of bytes required to store `bit_count` bits (rounded up).
inline size_t BitCnt2Byte(size_t bit_count) {
    return (bit_count + 7) / 8;
}

// Index of the byte that contains bit `id`.
inline size_t Bit2Byte(size_t id) {
    return id / 8;
}

// Turn on bit `id` inside `bitmap`. The caller must make sure the bitmap is
// large enough; no bounds check is done here.
inline void BitmapSet(std::vector<uint8_t>& bitmap, size_t id) {
    const uint8_t mask = static_cast<uint8_t>(1u << (id % 8));
    bitmap[id / 8] |= mask;
}

// Report whether bit `id` is turned on inside `bitmap`. The caller must make
// sure the bitmap is large enough; no bounds check is done here.
inline bool BitmapTest(const std::vector<uint8_t>& bitmap, size_t id) {
    const uint8_t mask = static_cast<uint8_t>(1u << (id % 8));
    return (bitmap[id / 8] & mask) != 0;
}

}  // namespace

// Creates an empty delete map. `total_bit_count` is stored as the capacity:
// SetDelete/IsDeleted reject any id >= total_bit_count, and it determines the
// bitmap size when the bitmap is built.
DeleteMap::DeleteMap(int64_t total_bit_count)
: total_bit_count_(total_bit_count) {}

bool DeleteMap::SetDelete(int64_t id) {
if (id >= total_bit_count_) {
THROW_CODE(InvalidParameter, "overflow: id {}, capacity {}", id,
total_bit_count_);
}

if (format_ == SPARSE) {
if (deleted_labels_.count(id)) {
// already deleted
return false;
}
deleted_labels_.emplace(id);
deleted_count_++;
if (deleted_count_ > SPARSE_FORMAT_MAX_DELETE) {
auto bitmap_size = BitCnt2Byte(total_bit_count_);
bitmap_.resize(bitmap_size, 0);
for (auto label : deleted_labels_) {
BitmapSet(bitmap_, label);
}
deleted_labels_.clear();
}
} else {
if (BitmapTest(bitmap_, id)) {
// already deleted
return false;
}
BitmapSet(bitmap_, id);
deleted_count_++;
}

return true;
}

/**
 * Report whether label `id` has been marked as deleted.
 *
 * Throws InvalidParameter if `id` is outside [0, total_bit_count_) or if the
 * stored format is neither SPARSE nor DENSE.
 */
bool DeleteMap::IsDeleted(int64_t id) const {
    // A negative id would be converted to a huge unsigned index by BitmapTest
    // and read out of bounds, so reject it together with overflow.
    if (id < 0 || id >= total_bit_count_) {
        THROW_CODE(InvalidParameter, "overflow: id {}, maximum id {}", id,
                   total_bit_count_ - 1);
    }

    // Fast path: nothing has been deleted yet.
    if (deleted_count_ == 0) {
        return false;
    }

    if (format_ == SPARSE) {
        return deleted_labels_.count(id) != 0;
    }

    if (format_ == DENSE) {
        return BitmapTest(bitmap_, id);
    }

    THROW_CODE(InvalidParameter, "illegal format {}", (uint32_t)format_);
}

} // namespace embedding

} // namespace graphdb
70 changes: 70 additions & 0 deletions src/graphdb/embedding/delete_map.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/**
* Copyright 2024 AntGroup CO., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
* Author:
* Junwang Zhao <[email protected]>
*/

#pragma once

#include <cstdint>
#include <memory>
#include <shared_mutex>
#include <string>
#include <unordered_set>
#include <vector>

namespace graphdb {

namespace embedding {

/**
 * This structure is primarily used to mark the Embedding LabelId of a
 * Persistent Index Chunk as deleted.
 * For a sparse delete map, we simply insert the LabelId into a set.
 * For a dense delete map, we use one bit per LabelId in a bitmap.
 */
class DeleteMap {
   public:
    enum Format : uint32_t { SPARSE = 0, DENSE = 1 };

    /**
     * total_bit_count: capacity of the map; ids passed to SetDelete/IsDeleted
     * must be smaller than this value.
     * The constructor is explicit so that an integer is never silently
     * converted into a DeleteMap.
     */
    explicit DeleteMap(int64_t total_bit_count);
    ~DeleteMap() = default;

    /**
     * Set the label id as deleted
     * return:
     *   false if already set before
     *   true if set successfully
     */
    bool SetDelete(int64_t id);
    /**
     * Is the label id deleted?
     */
    bool IsDeleted(int64_t id) const;

    /** Number of label ids currently marked as deleted. */
    int64_t GetDeleteCount() const { return deleted_count_; }

   private:
    // Deleted label ids while in SPARSE format.
    std::unordered_set<int64_t> deleted_labels_;
    // Capacity given at construction.
    int64_t total_bit_count_;
    size_t deleted_count_{0};
    Format format_{SPARSE};
    // One bit per label id while in DENSE format.
    std::vector<uint8_t> bitmap_;
    // NOTE(review): not locked by any member declared in this class;
    // presumably reserved for future synchronization - confirm.
    std::shared_mutex mutex_;
};

} // namespace embedding

} // namespace graphdb
138 changes: 138 additions & 0 deletions src/graphdb/embedding/delta.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/**
* Copyright 2024 AntGroup CO., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
* Author:
* Junwang Zhao <[email protected]>
*/

#include "delta.h"

#include <faiss/impl/FaissException.h>
#include <faiss/index_factory.h>

#include <filesystem>
#include <fstream>

#include "common/exceptions.h"
#include "faiss_internal.h"
#include "index.h"

namespace graphdb {

namespace embedding {

namespace {

// Translate the project's vector distance type into the faiss metric used to
// build the delta index. Throws VectorIndexException for any distance type
// that has no mapping.
faiss::MetricType DistanceTypeToFaissMetricType(
    meta::VectorDistanceType distance_type) {
    switch (distance_type) {
        case meta::VectorDistanceType::L2:
            return faiss::MetricType::METRIC_L2;
        // Cosine can be converted to IP, so faiss does not have a COSINE
        // metric.
        // NOTE(review): presumably callers normalize vectors for COSINE -
        // confirm.
        case meta::VectorDistanceType::IP:
        case meta::VectorDistanceType::COSINE:
            return faiss::MetricType::METRIC_INNER_PRODUCT;
        default:
            // The value being reported is a distance type, not an index type.
            THROW_CODE(VectorIndexException, "distance type {} not supported",
                       meta::VectorDistanceType_Name(distance_type));
    }
}

// Id selector over the delta's label -> vid table: a label is selectable
// only while its slot does not hold the -1 tombstone.
// The selector stores a reference, so `vid_vec` must outlive it.
class DeltaIdSelector : public IdSelector {
   public:
    explicit DeltaIdSelector(const std::vector<int64_t>& vid_vec)
        : vid_vec_(vid_vec) {}
    ~DeltaIdSelector() override = default;

    // Returns true when `id` refers to a live (non-deleted) entry.
    // Labels outside the table are reported as non-members instead of
    // indexing out of bounds.
    bool is_member(int64_t id) const override {
        if (id < 0 || static_cast<size_t>(id) >= vid_vec_.size()) {
            return false;
        }
        return vid_vec_[id] != -1;
    }

   private:
    const std::vector<int64_t>& vid_vec_;
};

} // namespace

// Builds the in-memory delta as a faiss "Flat" index of dimension `dim`,
// using the faiss metric that corresponds to `distance_type`.
Delta::Delta(int64_t dim, meta::VectorDistanceType distance_type) {
    const faiss::MetricType metric =
        DistanceTypeToFaissMetricType(distance_type);
    delta_.reset(faiss::index_factory(dim, "Flat", metric));
}

// Adds a single vector (ds.n must be 1) identified by ds.vids[0].
// Throws InvalidParameter if that vid is already present and not deleted.
void Delta::Add(const DataSet& ds) {
    assert(ds.n == 1);
    const int64_t vid = ds.vids[0];

    // Validate BEFORE touching the faiss index: if the vector were added
    // first and the duplicate check threw afterwards, faiss would hold one
    // more row than vid_vec_, shifting every later label -> vid mapping.
    const auto it = vid_labelid_map_.find(vid);
    if (it != vid_labelid_map_.end() && vid_vec_[it->second] != -1) {
        THROW_CODE(InvalidParameter, "vid {} already exists", vid);
    }

    delta_->add(ds.n, reinterpret_cast<const float*>(ds.x));
    // The new row's label is its position in vid_vec_ (tracked by vid_cnt_).
    vid_vec_.push_back(vid);
    vid_labelid_map_[vid] = vid_cnt_++;
}

// Adds ds.n vectors at once. Each vid in ds.vids is appended to the
// label -> vid table and mapped to the next label; no duplicate check is
// performed here.
void Delta::AddBatch(const DataSet& ds) {
    const auto* vectors = reinterpret_cast<const float*>(ds.x);
    delta_->add(ds.n, vectors);

    int64_t* batch_vids = ds.vids;
    for (size_t pos = 0; pos < ds.n; ++pos) {
        const int64_t vid = batch_vids[pos];
        vid_vec_.push_back(vid);
        vid_labelid_map_[vid] = vid_cnt_;
        ++vid_cnt_;
    }
}

// Marks the single vid in ds.vids[0] (ds.n must be 1) as deleted by writing
// the -1 tombstone into its slot of the label -> vid table.
// Throws InvalidParameter if the vid is unknown or already deleted.
void Delta::Delete(const DataSet& ds) {
    assert(ds.n == 1);
    const int64_t vid = ds.vids[0];

    const auto entry = vid_labelid_map_.find(vid);
    const bool known = entry != vid_labelid_map_.end();
    if (!known || vid_vec_[entry->second] == -1) {
        THROW_CODE(InvalidParameter, "vid {} doesn't exist", vid);
    }

    vid_vec_[entry->second] = -1;
}

void Delta::Search(const DataSet& ds) {
assert(ds.n == 1);
std::unique_ptr<faiss::SearchParameters> faiss_params =
std::make_unique<faiss::SearchParameters>();

DeltaIdSelector id_selector(vid_vec_);
std::unique_ptr<FaissIDSelector> faiss_selector =
std::make_unique<FaissIDSelector>(id_selector);

faiss_params->sel = faiss_selector.get();

try {
delta_->search(ds.n, reinterpret_cast<const float*>(ds.x), ds.k,
(float*)ds.distances, ds.vids, faiss_params.get());
} catch (faiss::FaissException& e) {
THROW_CODE(VectorIndexException, "search failed with exception: {}",
e.msg);
}

// convert labelid to vid
for (size_t i = 0; i < ds.n * ds.k; i++) {
if (ds.vids[i] >= 0) {
ds.vids[i] = vid_vec_[ds.vids[i]];
}
}
}

// Range search is not implemented for the delta yet: this always throws
// VectorIndexException and never reads `ds`.
void Delta::RangeSearch(const DataSet& ds) {
THROW_CODE(VectorIndexException, "TODO: implement range search later");
}

} // namespace embedding

} // namespace graphdb
Loading

0 comments on commit 061f9b4

Please sign in to comment.