-
Notifications
You must be signed in to change notification settings - Fork 116
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
How to create QG. #176
Comments
Hello, |
Thank you for your response. I still have a few questions I need your help with. I wrote a piece of code to test the speed of QG as follows:
#include <chrono>
#include <cstdint>
#include <cstring>
#include <exception>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include "NGT/Index.h"
#include "NGT/GraphOptimizer.h"
#include "NGT/NGTQ/QuantizedGraph.h"
/**
 * @brief Loads labelled float vectors from a tab/comma separated text file.
 *
 * Each valid line has the form `label<TAB>f1,f2,...,fN`. The parsed floats are
 * appended to @p vecs as raw bytes (vector_size bytes per row) and the label is
 * appended to @p labels, so row i of @p vecs always corresponds to labels[i].
 *
 * Lines are counted as invalid and skipped when they do not contain exactly
 * one tab, or when the label or any consumed component cannot be parsed as a
 * number. Rows with fewer components than fit in vector_size are zero-padded;
 * surplus components are ignored.
 *
 * @param file_path   Path of the file to read.
 * @param labels      Output: one label per accepted row (appended).
 * @param vecs        Output: raw float bytes of every accepted row (appended).
 * @param vector_size Size of one vector in BYTES (dimension * sizeof(float)).
 *                    Only whole floats are written, so a trailing remainder of
 *                    vector_size % sizeof(float) bytes stays zero.
 * @param count       Maximum number of rows to accept.
 * @return 0 on success, 1 if the file could not be opened.
 */
int load_data_from_file(const std::string& file_path, std::vector<uint64_t>& labels, std::vector<uint8_t>& vecs, const size_t vector_size, const size_t count) {
    std::cout << "count is " << count << std::endl;
    std::cout << "load data from file: " << file_path << std::endl;

    std::ifstream data(file_path);
    if (!data.is_open()) {
        std::cout << "Unable to open file" << std::endl;
        return 1;
    }

    // Number of whole floats that fit into one row; bounding the parse loop by
    // this (instead of by a byte pointer) prevents a partial write past the
    // buffer when vector_size is not a multiple of sizeof(float).
    const size_t floats_per_vector = vector_size / sizeof(float);

    size_t loaded = 0;
    size_t invalid_count = 0;
    std::string line;
    std::string token;
    std::vector<uint8_t> row;  // reused for every line to avoid reallocations

    while (loaded < count && std::getline(data, line)) {
        // A valid line contains exactly one tab separating label and vector.
        const size_t tab = line.find('\t');
        if (tab == std::string::npos || line.find('\t', tab + 1) != std::string::npos) {
            invalid_count++;
            continue;
        }

        row.assign(vector_size, 0);
        uint64_t label = 0;
        try {
            label = std::stoull(line.substr(0, tab));
            std::stringstream ss(line.substr(tab + 1));
            size_t parsed = 0;
            while (parsed < floats_per_vector && std::getline(ss, token, ',')) {
                const float value = std::stof(token);
                // memcpy instead of writing through a casted float*: the byte
                // buffer is not a float object, so the cast would break
                // strict aliasing.
                std::memcpy(row.data() + parsed * sizeof(float), &value, sizeof(float));
                parsed++;
            }
        } catch (const std::exception&) {
            // std::stoull / std::stof throw on malformed or out-of-range
            // input; treat the whole line as invalid rather than aborting.
            invalid_count++;
            continue;
        }

        labels.push_back(label);
        vecs.insert(vecs.end(), row.begin(), row.end());
        loaded++;
    }

    std::cout << "Invalid count is " << invalid_count << std::endl;
    std::cout << "Load data size " << loaded << std::endl;
    return 0;
}
/**
 * @brief End-to-end QG timing test: builds an ANNG from a data file, converts
 *        it to an ONNG, quantizes it into a QG index and times 100-NN searches.
 *
 * Steps: create/populate/save the ANNG, run the graph optimizer to produce the
 * ONNG, quantize the ONNG in place, then search every loaded query and print
 * `rank<TAB>label<TAB>distance` for each result followed by the accumulated
 * search time in microseconds.
 *
 * @return 0 on success, 1 if input data cannot be loaded or quantization fails.
 */
int main(int argc, char **argv)
{
    // create anng index
    NGT::Property property;
    property.dimension = 128;
    property.objectType = NGT::ObjectSpace::ObjectType::Float;
    property.distanceType = NGT::Index::Property::DistanceType::L2;
    property.edgeSizeForCreation = 200;
    std::string anng_path("cpp_anng-100");
    NGT::Index::create(anng_path, property);
    NGT::Index index(anng_path);

    // loading data from file
    // id1\tf1,f2,f3,f4,...f128
    // id2\tf1,f2,f3,f4,...f128
    // ...
    std::string data_path = "../../data/deal/float_data_600k_deal.csv";
    std::vector<uint8_t> object;
    std::vector<uint64_t> labels;
    const size_t dimension = 128;
    size_t vec_size = dimension*sizeof(float);
    size_t vec_num = 600000;
    if (load_data_from_file(data_path, labels, object, vec_size, vec_num) != 0) {
        return 1;
    }
    // Iterate over what was actually loaded: the file may hold fewer than
    // vec_num valid rows, and reading vec_num rows would run past the buffer.
    const size_t loaded_num = labels.size();
    for (size_t i = 0; i < loaded_num; i++) {
        std::vector<float> vec(dimension);
        std::memcpy(vec.data(), object.data() + i*vec_size, vec_size);
        index.append(vec);
    }
    index.createIndex(16);
    index.save();

    // // optimizer search paramters
    // NGT::GraphOptimizer graphOptimizer1(false);
    // graphOptimizer1.setProcessingModes(false, true, true, true);
    // graphOptimizer1.optimizeSearchParameters(anng_path);
    // // refine
    // std::string ranng_path = "cpp_ranng-100";
    // NGT::Index index_refine(anng_path);
    // NGT::GraphReconstructor::refineANNG(index_refine, false);
    // index_refine.save(ranng_path);

    // trans anng to onng
    std::string onng_path = "cpp_onng-100";
    NGT::GraphOptimizer graphOptimizer(false);
    int numOfOutgoingEdges = 10;
    int numOfIncomingEdges = 200;
    int numOfQueries = 200;
    int numOfResultantObjects = 100; // k
    graphOptimizer.set(numOfOutgoingEdges, numOfIncomingEdges, numOfQueries, numOfResultantObjects);
    graphOptimizer.execute(anng_path, onng_path);

    // QG: quantize the ONNG in place
    size_t dimensionOfSubvector = 1;
    size_t maxNumberOfEdges = 50;
    try {
        std::cout << "quantizing index ..." << std::endl;
        NGTQG::Index::quantize(onng_path, dimensionOfSubvector, maxNumberOfEdges, true);
    } catch (NGT::Exception &err) {
        std::cout << "error: " << err.what() << std::endl;
        return 1;
    } catch (...) {
        std::cout << "error" << std::endl;
        return 1;
    }

    // search
    NGTQG::Index index_search(onng_path, true);
    NGT::Property property2;
    index_search.getProperty(property2);

    // load query data
    std::string query_path = "../../data/gt/float_query_emb_deal_part0.csv";
    std::vector<uint8_t> object2;
    std::vector<uint64_t> labels2;
    size_t query_vec_num = 1000;
    if (load_data_from_file(query_path, labels2, object2, vec_size, query_vec_num) != 0) {
        return 1;
    }
    // Build only as many queries as were actually loaded.
    const size_t loaded_queries = labels2.size();
    std::vector<std::vector<float>> querys;
    querys.reserve(loaded_queries);
    for (size_t i = 0; i < loaded_queries; i++) {
        std::vector<float> query(dimension);
        std::memcpy(query.data(), object2.data() + i*vec_size, vec_size);
        querys.push_back(query);
    }
    std::cout << "laod data success." << std::endl;

    uint64_t total = 0;
    std::cout << "ready to search" << std::endl;
    size_t repeat = 1;
    for (size_t r = 0; r < repeat; r++) {
        for (size_t q = 0; q < querys.size(); q++) {
            NGTQG::SearchQuery searchQuery(querys[q]);
            NGT::ObjectDistances results;
            searchQuery.setResults(&results);
            searchQuery.setSize(100);
            // searchQuery.setEpsilon(0.1);
            searchQuery.setExpectedAccuracy(0.9);
            // Only the search call itself is timed; printing is excluded.
            const auto start = std::chrono::high_resolution_clock::now();
            index_search.search(searchQuery);
            const auto end = std::chrono::high_resolution_clock::now();
            total += std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
            for (size_t k = 0; k < results.size(); k++) {
                // NGT object IDs start at 1, while labels is 0-based in
                // insertion order, so the label of object `id` is labels[id - 1].
                const size_t id = results[k].id;
                if (id == 0 || id > labels.size()) {
                    std::cerr << "unexpected object id " << id << std::endl;
                    continue;
                }
                std::cout << k + 1 << "\t" << labels[id - 1] << "\t" << results[k].distance << std::endl;
            }
        }
    }
    std::cout << "consume:\t" << total << std::endl;
    return 0;
} I have encountered a few issues:
I saw in the ann-benchmark tests that NGT-QG demonstrated very strong performance, but my results seem different. Is there any mistake in my code, or are there any parameters I can adjust? I hope you can provide some suggestions, thank you very much. |
I compile this code using g++ -o ./ngt_qg_test ./ngt_qg_test.cc -lngt -lblas -march=native -O3 -fopenmp and I compile ngt using yum install blas-devel lapack-devel
unzip NGT-x.x.x.zip
cd NGT-x.x.x
mkdir build
cd build
cmake ..
make
make install
ldconfig /usr/local/lib |
As for your questions 1 to 3, the issues might be caused by your dataset. First, I recommend trying the same operation using the NGT command-line tools (ngt and qbg). If you encounter the same situation, try adjusting the parameters. |
Thank you very much. I will retry using the command line. |
Hello, I am using NGT-qg for vector recall, and I found that there are two ways to convert ONNG to QG:
Command line:
qbg command [option] index [data]
C++ code:
NGTQG::Index::quantize(indexPath, dimensionOfSubvector, maxNumberOfEdges, true);
which is from samples/qg-l2-float
However, I noticed that when using the
qbg
command line to perform QG on ONNG, it involves two steps: qbg create-qg
qbg build-qg
When I checked the C++ source code corresponding to these two commands, I found that they perform the following operations:
It seems that when using the command line to perform QG on ONNG, the operations involved are more extensive and mainly include:
This differs from the method implemented in samples/qg-l2-float. What is the difference here?
I would appreciate any help or insights you can provide on this issue.
The text was updated successfully, but these errors were encountered: