Skip to content

Commit

Permalink
Normalize msmarco-v2-vector queries and switch to dot-product (elastic#578)
Browse files Browse the repository at this point in the history

* msmarco-v2-vector: add vector normalization to parse_documents.py and use dot-product on ingestion mappings

* msmarco-v2-vector: update dataset file sizes to match new normalised dataset files

* msmarco-v2-vector: normalize queries
  • Loading branch information
1stvamp authored Mar 14, 2024
1 parent c7d67a8 commit d17928c
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 97 deletions.
4 changes: 3 additions & 1 deletion msmarco-v2-vector/_tools/parse_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from os import environ

import ir_datasets
+ import numpy
+ import vg
from cohere import AsyncClient

DATASET_NAME: str = "msmarco-passage-v2/train"
Expand All @@ -12,7 +14,7 @@

async def retrieve_embed_for_query(co, text):
response = await co.embed(texts=[text], model="embed-english-v3.0", input_type="search_query")
- return response.embeddings[0]
+ return vg.normalize(numpy.array(response.embeddings[0])).tolist()


async def output_queries(queries_file):
Expand Down
1 change: 1 addition & 0 deletions msmarco-v2-vector/_tools/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
cohere
datasets
ir-datasets
+ vg
2 changes: 1 addition & 1 deletion msmarco-v2-vector/index-vectors-only-mapping.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"element_type": "float",
"dims": 1024,
"index": true,
- "similarity": "max_inner_product",
+ "similarity": "dot_product",
"index_options": {
"type": {{ vector_index_type | default("int8_hnsw") | tojson }}
}
Expand Down
2 changes: 1 addition & 1 deletion msmarco-v2-vector/index-vectors-with-text-mapping.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"element_type": "float",
"dims": 1024,
"index": true,
- "similarity": "max_inner_product",
+ "similarity": "dot_product",
"index_options": {
"type": {{ vector_index_type | default("int8_hnsw") |tojson }}
}
Expand Down
Binary file modified msmarco-v2-vector/queries.json.bz2
Binary file not shown.
Loading

0 comments on commit d17928c

Please sign in to comment.