Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Phraseandspeed #17

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions .env.crkeng

This file was deleted.

2 changes: 0 additions & 2 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ pytest-cov = "*"
pytest-datadir = "*"
pytest-mypy = "*"
hypothesis = {version = "~=4.34", extras = ["django"]}
codecov = "*"
pysnooper = "*"
python-levenshtein = "*"
django-debug-toolbar = "*"
Expand All @@ -21,7 +20,6 @@ mypy = "*"
pytest-env = "*"
jupyterlab = "*"
appnope = "*"
nb_black = "*"
statsmodels = "*"
pandas-stubs = "*"
pytest-pythonpath = "*"
Expand Down
4 changes: 2 additions & 2 deletions src/API/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ def search_with_affixes(
rw_index: str,
rw_domain: str,
wn_synset: str,
include_auto_definitions=False,
inflect_english_phrases=False,
include_auto_definitions=True,
inflect_english_phrases=True,
):
"""
Search for wordforms matching:
Expand Down
9 changes: 8 additions & 1 deletion src/API/search/cvd_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def do_cvd_search(search_run: SearchRun):

search_run.add_verbose_message(cvd_extracted_keys=keys)
query_vector = vector_for_keys(google_news_vectors(), keys)
print(query_vector)

try:
closest = definition_vectors().similar_by_vector(query_vector, 50)
Expand All @@ -41,20 +42,25 @@ def do_cvd_search(search_run: SearchRun):
wordform_queries = [
cvd_key_to_wordform_query(similarity) for similarity, weight in closest
]
print(wordform_queries)
similarities = [similarity for cvd_key, similarity in closest]

print(similarities)
# Get all possible wordforms in one big query. We will select more than we
# need, then filter it down later, but this will have to do until we get
# better homonym handling.
print(Wordform.objects.count())
wordform_results = Wordform.objects.filter(
text__in=set(wf["text"] for wf in wordform_queries)
)
print(wordform_results)

# Now match back up
wordforms_by_text = {
text: list(wordforms)
for text, wordforms in itertools.groupby(wordform_results, key=lambda x: x.text)
}

print(wordforms_by_text)

for similarity, wordform_query in zip(similarities, wordform_queries):
# gensim uses the terminology, similarity = 1 - distance. Its
Expand All @@ -64,6 +70,7 @@ def do_cvd_search(search_run: SearchRun):
distance = 1 - similarity

wordforms_for_query = wordforms_by_text.get(wordform_query["text"], None)
print(wordforms_for_query)
if wordforms_for_query is None:
logger.warning(
f"Wordform {wordform_query['text']} not found in CVD; mismatch between definition vector model file and definitions in database?"
Expand Down
Binary file removed src/arpeng/db/db.sqlite3
Binary file not shown.
Binary file removed src/arpeng/res/vector_models/definitions_v2.kv
Binary file not shown.
Binary file removed src/crkeng/res/vector_models/definitions_v2.kv
Binary file not shown.
Binary file not shown.
Binary file removed src/cwdeng/db/db.sqlite3
Binary file not shown.
Binary file removed src/hdneng/db/db.sqlite3
Binary file not shown.
Binary file removed src/srseng/db/db.sqlite3
Binary file not shown.
33 changes: 4 additions & 29 deletions src/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import json
import logging
from pathlib import Path

import time
from typing import Dict, Literal
from nltk.corpus import wordnet as wn

Expand Down Expand Up @@ -187,6 +187,7 @@ def search_api(request):
:param request:
:return:
"""
start = time.time()
query_string = request.GET.get("name")
rw_index = request.GET.get("rw_index")
rw_domain = request.GET.get("rw_domain")
Expand Down Expand Up @@ -227,10 +228,6 @@ def search_api(request):
search_run.verbose_messages, indent=2, ensure_ascii=False
)

context["search_results"] = fetch_single_recording(
context["search_results"], request
)

for result in context["search_results"]:
result["wordform_text"] = wordform_orth_text(result["wordform_text"])
result["lemma_wordform"]["wordform_text"] = wordform_orth_text(
Expand All @@ -246,7 +243,8 @@ def search_api(request):
result["relabelled_fst_analysis"] = relabelFSTAnalysis(
result["relabelled_fst_analysis"]
)

end = time.time()
print(end - start)
return Response(context)


Expand Down Expand Up @@ -300,29 +298,6 @@ def wordnet_api(request, classification):
return Response(context)


def fetch_single_recording(results, request):
    """Attach one recording URL to every search result, in place.

    Gathers the wordform text of each result, queries every configured
    speech database (``settings.SPEECH_DB_EQ``) in batches of 30 terms via
    the speech-db bulk-search endpoint, and sets ``result["recording"]`` to
    the matched recording URL, or ``""`` when no recording was found.
    Returns the same ``results`` list it mutated.
    """
    # All wordform texts we need recordings for.
    query_terms = [entry["wordform_text"] for entry in results]

    sources = settings.SPEECH_DB_EQ
    matched_recordings = {}

    # Bulk-search each speech database, 30 terms at a time, merging all
    # matches into one lookup table keyed by wordform text.
    for batch in divide_chunks(query_terms, 30):
        for source in sources:
            endpoint = f"https://speech-db.altlab.app/{source}/api/bulk_search"
            matched_recordings.update(get_recordings_from_url(batch, endpoint))

    # Write the matched URL (or an empty string) back onto every result.
    for entry in results:
        text = entry["wordform_text"]
        if text in matched_recordings:
            entry["recording"] = matched_recordings[text]["recording_url"]
        else:
            entry["recording"] = ""

    return results


def relabelInflectionalCategory(ic):
with open(Path(settings.RESOURCES_DIR / "altlabel.tsv")) as f:
Expand Down