forked from TuGraph-family/tugraph-db
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This is an implementation of TuGraph-family#819 Signed-off-by: Junwang Zhao <[email protected]>
- Loading branch information
Showing
24 changed files
with
3,541 additions
and
182 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
/** | ||
* Copyright 2024 AntGroup CO., Ltd. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* | ||
* Author: | ||
* Junwang Zhao <[email protected]> | ||
*/ | ||
|
||
#include "delete_map.h" | ||
|
||
#include <filesystem> | ||
#include <fstream> | ||
|
||
#include "common/exceptions.h" | ||
|
||
namespace graphdb { | ||
|
||
namespace embedding { | ||
|
||
namespace {

// Number of deleted ids beyond which DeleteMap::SetDelete builds the bitmap
// from the sparse set.
constexpr size_t SPARSE_FORMAT_MAX_DELETE = 2048;

// Number of bytes needed to hold `bit_count` bits (rounded up).
inline size_t BitCnt2Byte(size_t bit_count) { return (bit_count + 7) / 8; }

// Index of the byte that contains bit `id`.
inline size_t Bit2Byte(size_t id) { return id / 8; }

// Turn on bit `id` in `bitmap`. No bounds checking is performed.
inline void BitmapSet(std::vector<uint8_t>& bitmap, size_t id) {
    const uint8_t mask = static_cast<uint8_t>(1u << (id % 8));
    bitmap[Bit2Byte(id)] |= mask;
}

// Report whether bit `id` is on in `bitmap`. No bounds checking is performed.
inline bool BitmapTest(const std::vector<uint8_t>& bitmap, size_t id) {
    const uint8_t mask = static_cast<uint8_t>(1u << (id % 8));
    return (bitmap[Bit2Byte(id)] & mask) != 0;
}

}  // namespace
|
||
DeleteMap::DeleteMap(int64_t total_bit_count) | ||
: total_bit_count_(total_bit_count) {} | ||
|
||
bool DeleteMap::SetDelete(int64_t id) { | ||
if (id >= total_bit_count_) { | ||
THROW_CODE(InvalidParameter, "overflow: id {}, capacity {}", id, | ||
total_bit_count_); | ||
} | ||
|
||
if (format_ == SPARSE) { | ||
if (deleted_labels_.count(id)) { | ||
// already deleted | ||
return false; | ||
} | ||
deleted_labels_.emplace(id); | ||
deleted_count_++; | ||
if (deleted_count_ > SPARSE_FORMAT_MAX_DELETE) { | ||
auto bitmap_size = BitCnt2Byte(total_bit_count_); | ||
bitmap_.resize(bitmap_size, 0); | ||
for (auto label : deleted_labels_) { | ||
BitmapSet(bitmap_, label); | ||
} | ||
deleted_labels_.clear(); | ||
} | ||
} else { | ||
if (BitmapTest(bitmap_, id)) { | ||
// already deleted | ||
return false; | ||
} | ||
BitmapSet(bitmap_, id); | ||
deleted_count_++; | ||
} | ||
|
||
return true; | ||
} | ||
|
||
bool DeleteMap::IsDeleted(int64_t id) const { | ||
if (id >= total_bit_count_) { | ||
THROW_CODE(InvalidParameter, "overflow: id {}, maximum id {}", id, | ||
total_bit_count_ - 1); | ||
} | ||
|
||
if (deleted_count_ == 0) { | ||
return false; | ||
} | ||
|
||
if (format_ == SPARSE) { | ||
return deleted_labels_.count(id); | ||
} | ||
|
||
if (format_ == DENSE) { | ||
return BitmapTest(bitmap_, id); | ||
} | ||
|
||
THROW_CODE(InvalidParameter, "illegal format {}", (uint32_t)format_); | ||
} | ||
|
||
} // namespace embedding | ||
|
||
} // namespace graphdb |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
/** | ||
* Copyright 2024 AntGroup CO., Ltd. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* | ||
* Author: | ||
* Junwang Zhao <[email protected]> | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <cstdint> | ||
#include <memory> | ||
#include <shared_mutex> | ||
#include <string> | ||
#include <unordered_set> | ||
#include <vector> | ||
|
||
namespace graphdb { | ||
|
||
namespace embedding { | ||
|
||
/**
 * Records which Embedding LabelIds of a Persistent Index Chunk have been
 * marked as deleted.
 *
 * Two storage formats are declared:
 *  - SPARSE: the deleted LabelIds are inserted into a hash set.
 *  - DENSE:  each LabelId is represented by a single bit.
 */
class DeleteMap {
   public:
    enum Format : uint32_t {
        SPARSE = 0,
        DENSE = 1,
    };

    DeleteMap(int64_t total_bit_count);
    ~DeleteMap() = default;

    /**
     * Mark the label id as deleted.
     *
     * @return false if the id had already been marked,
     *         true if it was marked by this call.
     */
    bool SetDelete(int64_t id);

    /**
     * Has the label id been marked as deleted?
     */
    bool IsDeleted(int64_t id) const;

    /** Number of label ids currently marked as deleted. */
    int64_t GetDeleteCount() const {
        return static_cast<int64_t>(deleted_count_);
    }

   private:
    // Deleted label ids while in SPARSE format.
    std::unordered_set<int64_t> deleted_labels_;
    // Exclusive upper bound on the ids this map accepts.
    int64_t total_bit_count_;
    // How many distinct ids have been marked as deleted.
    size_t deleted_count_{0};
    // Storage format currently in use.
    Format format_{SPARSE};
    // One bit per label id while in DENSE format.
    std::vector<uint8_t> bitmap_;
    // NOTE(review): declared here, but the inline code in this class never
    // locks it - confirm whether callers are expected to synchronize.
    std::shared_mutex mutex_;
};
|
||
} // namespace embedding | ||
|
||
} // namespace graphdb |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
/** | ||
* Copyright 2024 AntGroup CO., Ltd. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* | ||
* Author: | ||
* Junwang Zhao <[email protected]> | ||
*/ | ||
|
||
#include "delta.h" | ||
|
||
#include <faiss/impl/FaissException.h> | ||
#include <faiss/index_factory.h> | ||
|
||
#include <filesystem> | ||
#include <fstream> | ||
|
||
#include "common/exceptions.h" | ||
#include "faiss_internal.h" | ||
#include "index.h" | ||
|
||
namespace graphdb { | ||
|
||
namespace embedding { | ||
|
||
namespace { | ||
|
||
// Translate the project's distance type into the faiss metric used to build
// the index. Throws VectorIndexException for any other distance type.
faiss::MetricType DistanceTypeToFaissMetricType(
    meta::VectorDistanceType distance_type) {
    if (distance_type == meta::VectorDistanceType::L2) {
        return faiss::MetricType::METRIC_L2;
    }

    // cosine can be converted to IP, so faiss do not have COSINE metric
    const bool uses_inner_product =
        distance_type == meta::VectorDistanceType::IP ||
        distance_type == meta::VectorDistanceType::COSINE;
    if (uses_inner_product) {
        return faiss::MetricType::METRIC_INNER_PRODUCT;
    }

    THROW_CODE(VectorIndexException, "index type {} not supported",
               meta::VectorDistanceType_Name(distance_type));
}
|
||
class DeltaIdSelector : public IdSelector { | ||
public: | ||
DeltaIdSelector(const std::vector<int64_t>& vid_vec) : vid_vec_(vid_vec) {} | ||
~DeltaIdSelector() override = default; | ||
|
||
bool is_member(int64_t id) const override { return vid_vec_[id] != -1; } | ||
|
||
private: | ||
const std::vector<int64_t>& vid_vec_; | ||
}; | ||
|
||
} // namespace | ||
|
||
// Build the underlying faiss index from the "Flat" factory string, using the
// faiss metric that corresponds to `distance_type`.
Delta::Delta(int64_t dim, meta::VectorDistanceType distance_type) {
    const faiss::MetricType metric =
        DistanceTypeToFaissMetricType(distance_type);
    delta_.reset(faiss::index_factory(dim, "Flat", metric));
}
|
||
void Delta::Add(const DataSet& ds) { | ||
assert(ds.n == 1); | ||
delta_->add(ds.n, reinterpret_cast<const float*>(ds.x)); | ||
int64_t vid = ds.vids[0]; | ||
|
||
if (vid_labelid_map_.count(vid)) { | ||
if (vid_vec_[vid_labelid_map_[vid]] != -1) { | ||
THROW_CODE(InvalidParameter, "vid {} already exists", vid); | ||
} | ||
} | ||
vid_vec_.push_back(vid); | ||
vid_labelid_map_[vid] = vid_cnt_++; | ||
} | ||
|
||
void Delta::AddBatch(const DataSet& ds) { | ||
delta_->add(ds.n, reinterpret_cast<const float*>(ds.x)); | ||
int64_t* vids = ds.vids; | ||
|
||
for (size_t i = 0; i < ds.n; i++) { | ||
int64_t vid = vids[i]; | ||
vid_vec_.push_back(vid); | ||
vid_labelid_map_[vid] = vid_cnt_++; | ||
} | ||
} | ||
|
||
void Delta::Delete(const DataSet& ds) { | ||
assert(ds.n == 1); | ||
int64_t vid = ds.vids[0]; | ||
if (vid_labelid_map_.count(vid) == 0 || | ||
vid_vec_[vid_labelid_map_[vid]] == -1) { | ||
THROW_CODE(InvalidParameter, "vid {} doesn't exist", vid); | ||
} | ||
|
||
vid_vec_[vid_labelid_map_[vid]] = -1; | ||
} | ||
|
||
void Delta::Search(const DataSet& ds) { | ||
assert(ds.n == 1); | ||
std::unique_ptr<faiss::SearchParameters> faiss_params = | ||
std::make_unique<faiss::SearchParameters>(); | ||
|
||
DeltaIdSelector id_selector(vid_vec_); | ||
std::unique_ptr<FaissIDSelector> faiss_selector = | ||
std::make_unique<FaissIDSelector>(id_selector); | ||
|
||
faiss_params->sel = faiss_selector.get(); | ||
|
||
try { | ||
delta_->search(ds.n, reinterpret_cast<const float*>(ds.x), ds.k, | ||
(float*)ds.distances, ds.vids, faiss_params.get()); | ||
} catch (faiss::FaissException& e) { | ||
THROW_CODE(VectorIndexException, "search failed with exception: {}", | ||
e.msg); | ||
} | ||
|
||
// convert labelid to vid | ||
for (size_t i = 0; i < ds.n * ds.k; i++) { | ||
if (ds.vids[i] >= 0) { | ||
ds.vids[i] = vid_vec_[ds.vids[i]]; | ||
} | ||
} | ||
} | ||
|
||
// Range search is not implemented yet; every call throws
// VectorIndexException.
void Delta::RangeSearch(const DataSet& /*ds*/) {
    THROW_CODE(VectorIndexException, "TODO: implement range search later");
}
|
||
} // namespace embedding | ||
|
||
} // namespace graphdb |
Oops, something went wrong.