From 73610023a8fb45215d3279619f79441f158f286e Mon Sep 17 00:00:00 2001 From: lizgzil Date: Tue, 6 Aug 2024 17:38:01 +0100 Subject: [PATCH 1/8] Add NLPLinker class and tests --- nlp_link/linker.py | 264 +++++++++++++++++++++++++++++++++++++++++-- tests/test_linker.py | 74 +++++++++++- 2 files changed, 324 insertions(+), 14 deletions(-) diff --git a/nlp_link/linker.py b/nlp_link/linker.py index 2b61fc8..3f6b849 100644 --- a/nlp_link/linker.py +++ b/nlp_link/linker.py @@ -1,12 +1,260 @@ +""" +Class to link two datasets. + +Example usage: + +from nlp_link.linker import NLPLinker + +nlp_link = NLPLinker() + +# dict inputs +comparison_data = {'a': 'cats', 'b': 'dogs', 'd': 'rats', 'e': 'birds'} +input_data = {'x': 'owls', 'y': 'feline', 'z': 'doggies', 'za': 'dogs', 'zb': 'chair'} +nlp_link.load(comparison_data) +matches = nlp_link.link_dataset(input_data) +# Top match output +print(matches) + +# list inputs +comparison_data = ['cats', 'dogs', 'rats', 'birds'] +input_data = ['owls', 'feline', 'doggies', 'dogs','chair'] +nlp_link.load(comparison_data) +matches = nlp_link.link_dataset(input_data) +# Top match output +print(matches) + +""" + +from sentence_transformers import SentenceTransformer +from tqdm import tqdm +from sklearn.metrics.pairwise import cosine_similarity +import numpy as np import pandas as pd -import random +from typing import Union, Optional +import logging + +from nlp_link.utils import chunk_list + +logger = logging.getLogger(__name__) + +# TO DO: cosine or euclidean? + + +class NLPLinker(object): + """docstring for NLPLinker""" + + def __init__(self, batch_size=32, embed_chunk_size=500, match_chunk_size=10000): + super(NLPLinker, self).__init__() + self.batch_size = batch_size + self.embed_chunk_size = embed_chunk_size + self.match_chunk_size = match_chunk_size + ## Cleaning? + + def _process_dataset( + self, + input_data: Union[list, dict, pd.DataFrame], + id_column: Optional[str] = None, + text_column: Optional[str] = None, + ) -> dict: + """Check and process a dataset according to the input type + Args: + input_data (Union[list, dict, pd.DataFrame]) + A list of texts or a dictionary of texts where the key is the unique id. + If a list is given then a unique id will be assigned with the index order. + + Returns: + dict: key is the id and the value is the text + """ + + if isinstance(input_data, list): + return {ix: text for ix, text in enumerate(input_data)} + elif isinstance(input_data, dict): + return input_data + elif isinstance(input_data, pd.DataFrame): + try: + return dict(zip(input_data[id_column], input_data[text_column])) + except: + logger.warning( + "Input is a dataframe, please specify id_column and text_column" + ) + else: + logger.warning( + "The input_data input must be a dictionary, a list or pandas dataframe" + ) + + if not isinstance(input_data[0], str): + logger.warning( + "The input_data input must be a list of texts, or a dictionary where the values are texts" + ) + + def load( + self, + comparison_data: Union[list, dict], + ): + """ + Load the embedding model and embed the comparison dataset + Args: + comparison_data (Union[list, dict]): The comparison texts to find links to. + A list of texts or a dictionary of texts where the key is the unique id. + If a list is given then a unique id will be assigned with the index order. 
+ """ + logger.info("Loading model") + self.bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") + self.bert_model.max_seq_length = 512 + + self.comparison_data = self._process_dataset(comparison_data) + self.comparison_data_texts = list(self.comparison_data.values()) + self.comparison_data_ids = list(self.comparison_data.keys()) + + self.comparison_embeddings = self._get_embeddings(self.comparison_data_texts) + + def _get_embeddings(self, text_list: list) -> np.array: + """ + Get embeddings for a list of texts + + Args: + text_list (list): A lists of texts + Returns: + np.array: The embeddings for the input list of texts + """ + + logger.info( + f"Finding embeddings for {len(text_list)} texts chunked into {round(len(text_list)/self.embed_chunk_size)} chunks" + ) + all_embeddings = [] + for batch_texts in tqdm(chunk_list(text_list, self.embed_chunk_size)): + all_embeddings.append( + self.bert_model.encode( + np.array(batch_texts), batch_size=self.batch_size + ) + ) + all_embeddings = np.concatenate(all_embeddings) + + return all_embeddings + + def get_matches( + self, + input_data_ids: list, + input_embeddings: np.array, + comparison_data_ids: list, + comparison_embeddings: np.array, + top_n: int, + drop_most_similar: bool = False, + ) -> dict: + """ + Find top matches across two datasets using their embeddings. + + Args: + input_data_ids (list): The ids of the input texts. + input_embeddings (np.array): Embeddings for the input texts. + comparison_data_ids (list): The ids of the comparison texts. + comparison_embeddings (np.array): Embeddings for the comparison texts. + top_n (int): The number of top links to return in the output. + drop_most_similar (bool, default = False): Whether to not output the most similar match, this would be set to True if you are matching a list with itself. + + Returns: + dict: The top matches for each input id. + """ + + logger.info( + f"Finding the top dataset matches for {len(input_data_ids)} input texts chunked into {round(len(input_data_ids)/self.match_chunk_size)}" + ) + + if drop_most_similar: + top_n = top_n + 1 + start_n = 1 + else: + start_n = 0 + + # We chunk up comparisons otherwise it can crash + matches_topn = {} + for batch_indices in tqdm( + chunk_list(range(len(input_data_ids)), n_chunks=self.match_chunk_size) + ): + batch_input_ids = [input_data_ids[i] for i in batch_indices] + batch_input_embeddings = [input_embeddings[i] for i in batch_indices] + + batch_similarities = cosine_similarity( + batch_input_embeddings, comparison_embeddings + ) + + # Top links for each input text + for input_ix, similarities in enumerate(batch_similarities): + top_links = [] + for comparison_ix in np.flip(np.argsort(similarities))[start_n:top_n]: + # comparison data id + cosine similarity score + top_links.append( + [ + comparison_data_ids[comparison_ix], + similarities[comparison_ix], + ] + ) + matches_topn[batch_input_ids[input_ix]] = top_links + return matches_topn + + def link_dataset( + self, + input_data: Union[list, dict], + top_n: int = 3, + format_output: bool = True, + drop_most_similar: bool = False, + ) -> dict: + """ + Link a dataset to the comparison dataset. + + Args: + input_data (Union[list, dict]): The main dictionary to be linked to texts in the loaded comparison_data. + A list of texts or a dictionary of texts where the key is the unique id. + If a list is given then a unique id will be assigned with the index order. + top_n (int, default = 3): The number of top links to return in the output. 
+ format_output (bool, default = True): If you'd like the output to be formatted to include the texts of + the matched datasets or not (will just give the indices). + drop_most_similar (bool, default = False): Whether to not output the most similar match, this would be set to True if you are matching a list with itself. + Returns: + dict: The keys are the ids of the input_data and the values are a list of lists of the top_n most similar + ids from the comparison_data and a probability score. + e.g. {'x': [['a', 0.75], ['c', 0.7]], 'y': [...]} + """ + + try: + logger.info( + f"Comparing {len(input_data)} input texts to {len(self.comparison_embeddings)} comparison texts" + ) + except: + logger.warning( + "self.comparison_embeddings does not exist - you may have not run load()" + ) + + input_data = self._process_dataset(input_data) + input_data_texts = list(input_data.values()) + input_data_ids = list(input_data.keys()) + + input_embeddings = self._get_embeddings(input_data_texts) -def link_lists(list_1, list_2): - """ - Mock linker - """ - list_1_index = list(range(len(list_1))) - list_2_index = list(range(len(list_2))) + self.matches_topn = self.get_matches( + input_data_ids, + input_embeddings, + self.comparison_data_ids, + self.comparison_embeddings, + top_n, + drop_most_similar, + ) - return [(i, random.choice(list_1_index)) for i in list_2_index] + if format_output: + # Format the output into a user friendly pandas format with the top link only + df_output = pd.DataFrame( + [ + { + "input_id": input_id, + "input_text": input_data[input_id], + "link_id": link_data[0][0], + "link_text": self.comparison_data[link_data[0][0]], + "similarity": link_data[0][1], + } + for input_id, link_data in self.matches_topn.items() + ] + ) + return df_output + else: + return self.matches_topn diff --git a/tests/test_linker.py b/tests/test_linker.py index 87526dc..6274dbf 100644 --- a/tests/test_linker.py +++ b/tests/test_linker.py @@ -1,10 +1,72 @@ -from nlp_link.linker import link_lists +from nlp_link.linker import NLPLinker +import numpy as np -def test_link_lists(): - list_1 = ["dog", "cat"] - list_2 = ["kitten", "puppy"] - linked = link_lists(list_1, list_2) +def test_NLPLinker_dict_input(): - assert len(linked) == len(list_1) + nlp_link = NLPLinker() + + comparison_data = {"a": "cats", "b": "dogs", "c": "rats", "d": "birds"} + input_data = { + "x": "owls", + "y": "feline", + "z": "doggies", + "za": "dogs", + "zb": "chair", + } + nlp_link.load(comparison_data) + matches = nlp_link.link_dataset(input_data) + + assert len(matches) == len(input_data) + assert len(set(matches["link_id"]).difference(set(comparison_data.keys()))) == 0 + + +def test_NLPLinker_list_input(): + + nlp_link = NLPLinker() + + comparison_data = ["cats", "dogs", "rats", "birds"] + input_data = ["owls", "feline", "doggies", "dogs", "chair"] + nlp_link.load(comparison_data) + matches = nlp_link.link_dataset(input_data) + + assert len(matches) == len(input_data) + assert ( + len(set(matches["link_id"]).difference(set(range(len(comparison_data))))) == 0 + ) + + +def test_get_matches(): + + nlp_link = NLPLinker() + + matches_topn = nlp_link.get_matches( + input_data_ids=["x", "y", "z"], + input_embeddings=np.array( + [[0.1, 0.13, 0.14], [0.12, 0.18, 0.15], [0.5, 0.9, 0.91]] + ), + comparison_data_ids=["a", "b"], + comparison_embeddings=np.array([[0.51, 0.99, 0.9], [0.1, 0.13, 0.14]]), + top_n=1, + ) + + assert matches_topn["x"][0][0] == "b" + assert matches_topn["y"][0][0] == "b" + assert matches_topn["z"][0][0] == "a" + + +def 
test_same_input(): + + nlp_link = NLPLinker() + + comparison_data = {"a": "cats", "b": "dogs", "c": "rats", "d": "birds"} + input_data = comparison_data + nlp_link.load(comparison_data) + matches = nlp_link.link_dataset(input_data, drop_most_similar=False) + + assert all(matches["input_id"] == matches["link_id"]) + + matches = nlp_link.link_dataset(input_data, drop_most_similar=True) + + assert all(matches["input_id"] != matches["link_id"]) From ac7b9f737ac88f5b4bde8a9c856f2ffd07223714 Mon Sep 17 00:00:00 2001 From: lizgzil Date: Tue, 6 Aug 2024 17:40:55 +0100 Subject: [PATCH 2/8] add utils --- nlp_link/linker_utils.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 nlp_link/linker_utils.py diff --git a/nlp_link/linker_utils.py b/nlp_link/linker_utils.py new file mode 100644 index 0000000..3618a54 --- /dev/null +++ b/nlp_link/linker_utils.py @@ -0,0 +1,3 @@ +def chunk_list(orig_list, n_chunks): + for i in range(0, len(orig_list), n_chunks): + yield orig_list[i : i + n_chunks] From afd6f2923ff65a6dfffd01b8f1d28b66155e0eb9 Mon Sep 17 00:00:00 2001 From: lizgzil Date: Tue, 6 Aug 2024 17:43:15 +0100 Subject: [PATCH 3/8] Correct utils file name --- nlp_link/linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nlp_link/linker.py b/nlp_link/linker.py index 3f6b849..1feabc9 100644 --- a/nlp_link/linker.py +++ b/nlp_link/linker.py @@ -34,7 +34,7 @@ from typing import Union, Optional import logging -from nlp_link.utils import chunk_list +from nlp_link.linker_utils import chunk_list logger = logging.getLogger(__name__) From 57ece3fa62fac6bbedc1b0e79e982e766e0d47b8 Mon Sep 17 00:00:00 2001 From: lizgzil Date: Tue, 6 Aug 2024 18:01:21 +0100 Subject: [PATCH 4/8] Update versions --- pyproject.toml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index db799b5..f2c05fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,17 +4,17 @@ version = "0.1.0" description = "A python package to semantically link two lists of texts." authors = ["Nesta "] readme = "README.md" -packages = [{include = "nlp_link"}] +packages = [{include = "nlp_link", "soc_mapper"}] [tool.poetry.dependencies] python = "^3.9" -scikit-learn = "^1.4.2" +scikit-learn = "^1.5.1" pandas = "^2.2.2" -sentence-transformers = "^2.1.0" -torch = "^1.10.0" -pytest = "^8.2.0" -tqdm = "^4.64.1" -numpy = "^1.24.1" +sentence-transformers = "^2.7.0" +torch = "^1.13.1" +pytest = "^8.3.2" +tqdm = "^4.66.4" +numpy = "^1.26.4" [build-system] requires = ["poetry-core"] From 5b890a02042e6dd869bb8a736970352603cace48 Mon Sep 17 00:00:00 2001 From: lizgzil Date: Tue, 6 Aug 2024 18:05:23 +0100 Subject: [PATCH 5/8] Add future package inclusion| --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f2c05fd..6e56ec5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "A python package to semantically link two lists of texts." 
authors = ["Nesta "] readme = "README.md" -packages = [{include = "nlp_link", "soc_mapper"}] +packages = [{include = "nlp_link"}, {include = "soc_mapper"}] [tool.poetry.dependencies] python = "^3.9" From 7e209aeeb45fc9c1e4f596c0727d7a1e3f839e7a Mon Sep 17 00:00:00 2001 From: lizgzil Date: Tue, 6 Aug 2024 18:07:29 +0100 Subject: [PATCH 6/8] remove socmapper for now --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6e56ec5..44deb8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "A python package to semantically link two lists of texts." authors = ["Nesta "] readme = "README.md" -packages = [{include = "nlp_link"}, {include = "soc_mapper"}] +packages = [{include = "nlp_link"}] [tool.poetry.dependencies] python = "^3.9" From eab5a1656810dcaeb15922d2bc5fa2d786c7c335 Mon Sep 17 00:00:00 2001 From: lizgzil Date: Tue, 3 Sep 2024 10:12:49 +0100 Subject: [PATCH 7/8] set torch threads to try to fix github actions test runs --- tests/test_linker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_linker.py b/tests/test_linker.py index 6274dbf..89551a3 100644 --- a/tests/test_linker.py +++ b/tests/test_linker.py @@ -2,6 +2,11 @@ import numpy as np +# Needed for Github Actions to not fail (see torch bug https://github.com/pytorch/pytorch/issues/121101) +import torch + +torch.set_num_threads(1) + def test_NLPLinker_dict_input(): From 1b446b05a786525b0f0bb927bb3b509092e07345 Mon Sep 17 00:00:00 2001 From: lizgzil Date: Tue, 3 Sep 2024 10:30:14 +0100 Subject: [PATCH 8/8] set torch threads to try to fix github actions test runs - 2 --- nlp_link/linker.py | 6 +++++- tests/test_linker.py | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/nlp_link/linker.py b/nlp_link/linker.py index 1feabc9..6a60dd5 100644 --- a/nlp_link/linker.py +++ b/nlp_link/linker.py @@ -26,6 +26,7 @@ """ from sentence_transformers import SentenceTransformer +import torch from tqdm import tqdm from sklearn.metrics.pairwise import cosine_similarity import numpy as np @@ -100,7 +101,10 @@ def load( If a list is given then a unique id will be assigned with the index order. """ logger.info("Loading model") - self.bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") + device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu") + self.bert_model = SentenceTransformer( + "sentence-transformers/all-MiniLM-L6-v2", device=device + ) self.bert_model.max_seq_length = 512 self.comparison_data = self._process_dataset(comparison_data) diff --git a/tests/test_linker.py b/tests/test_linker.py index 89551a3..d7b35cf 100644 --- a/tests/test_linker.py +++ b/tests/test_linker.py @@ -1,12 +1,12 @@ -from nlp_link.linker import NLPLinker - -import numpy as np - # Needed for Github Actions to not fail (see torch bug https://github.com/pytorch/pytorch/issues/121101) import torch torch.set_num_threads(1) +from nlp_link.linker import NLPLinker + +import numpy as np + def test_NLPLinker_dict_input():