Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for Flair Embeddings: hero.embed(s, flair_embedding) function #146

Draft
wants to merge 17 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,36 @@ jobs:
include:
- name: "Python 3.6.0 on Xenial Linux"
python: 3.6

- name: "Python 3.7.0 on Xenial Linux"
python: 3.7

- name: "Python 3.8.0 on Xenial Linux"
python: 3.8 # this works for Linux but is ignored on macOS or Windows

- name: "Python 3.7.4 on macOS"
os: osx
osx_image: xcode11.2 # Python 3.7.4 running on macOS 10.14.4
language: shell # 'language: python' is an error on Travis CI macOS
install:
- pip3 install --upgrade pip # all three OSes agree about 'pip3'
- pip3 install black
- pip3 install --user ".[dev]" .

- name: "Python 3.8.0 on Windows"
os: windows # Windows 10.0.17134 N/A Build 17134
language: shell # 'language: python' is an error on Travis CI Windows
before_install:
- choco install python --version 3.8.0
- python -m pip install --upgrade pip
- pip3 install --no-deps torch===1.6.0 -f https://download.pytorch.org/whl/torch_stable.html
env: PATH=/c/Python38:/c/Python38/Scripts:$PATH

install:
- pip3 install --upgrade pip # all three OSes agree about 'pip3'
- pip3 install black==19.10b0
- pip3 install ".[dev]" .

# 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows
# 'python3' is a 'command not found' error on Windows but 'py' works on Windows only
henrifroese marked this conversation as resolved.
Show resolved Hide resolved
script:
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,6 @@ dev =
nbsphinx
parameterized>=0.7.4
coverage
flair>=0.5.1
pre-commit
pandas>=1.1.0
94 changes: 94 additions & 0 deletions tests/test_representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import numpy as np
from texthero import representation
from texthero import preprocessing
from texthero import _types

from . import PandasTestCase

Expand All @@ -12,6 +13,13 @@
import warnings
from parameterized import parameterized

from flair.embeddings import (
WordEmbeddings,
DocumentPoolEmbeddings,
DocumentRNNEmbeddings,
TransformerDocumentEmbeddings,
SentenceTransformerDocumentEmbeddings,
)

"""
Test doctest
Expand Down Expand Up @@ -268,3 +276,89 @@ def test_normalize_DataFrame_also_as_output(self):
pd.testing.assert_frame_equal(
result, correct_output, check_dtype=False, rtol=0.1, atol=0.1,
)


"""
Test embeddings functions.
"""


# Fixture for the embedding tests: a tokenized Series (one list of string
# tokens per document) with a non-default integer index, so the tests can
# verify that `embed` preserves the caller's index.
s_tokenized_for_embeddings = pd.Series(
    data=[
        ["Test", "Test2", "!", "yes", "hä", "^°"],
        ["Test3", "wow ", "aha", "super"],
    ],
    index=[5, 7],
)


class TestEmbeddings(PandasTestCase):
    """
    Test embed function.

    There are three types of Document Embeddings that
    don't require additional dependencies
    (the SentenceTransformerDocumentEmbeddings requires extra
    dependencies), see
    `here <https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md>`_.
    We test all of them here.
    """

    def _assert_valid_embedding_output(self, s_return):
        # Shared assertions for all embedding tests: every document is
        # embedded to a non-empty list, all embeddings have the same
        # dimensionality, the input index is preserved, and the output
        # is a valid VectorSeries.
        self.assertTrue(isinstance(s_return.iloc[0], list))
        self.assertTrue(isinstance(s_return.iloc[1], list))
        self.assertTrue(len(s_return.iloc[0]) == len(s_return.iloc[1]) > 0)

        # BUG FIX: previously compared s_return.index to itself,
        # which always passes; compare against the input's index.
        pd.testing.assert_index_equal(
            s_return.index, s_tokenized_for_embeddings.index
        )

        # check if output is valid VectorSeries
        try:
            _types.VectorSeries.check_type(s_return)
        except Exception:  # was a bare except; keep failure message explicit
            self.fail("Output is not a valid VectorSeries.")

    def test_embed_document_pool_embedding(self):
        word_embedding = WordEmbeddings("turian")
        document_embedding = DocumentPoolEmbeddings([word_embedding])

        s_return = representation.embed(s_tokenized_for_embeddings, document_embedding)

        self._assert_valid_embedding_output(s_return)

        del word_embedding

    def test_embed_document_rnn_embedding(self):
        word_embedding = WordEmbeddings("turian")
        # BUG FIX: this test previously constructed a DocumentPoolEmbeddings,
        # duplicating the pool test above and leaving the RNN embedding
        # untested; it now actually builds a DocumentRNNEmbeddings.
        document_embedding = DocumentRNNEmbeddings([word_embedding])

        s_return = representation.embed(s_tokenized_for_embeddings, document_embedding)

        self._assert_valid_embedding_output(s_return)

        del word_embedding

    def test_embed_transformer_document_embedding(self):
        # load smallest available transformer model
        document_embedding = TransformerDocumentEmbeddings(
            "google/reformer-crime-and-punishment"
        )

        s_return = representation.embed(s_tokenized_for_embeddings, document_embedding)

        self._assert_valid_embedding_output(s_return)

        del document_embedding
2 changes: 1 addition & 1 deletion texthero/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def is_numeric(x):
return True

def is_list_of_numbers(cell):
return isinstance(cell, (list, tuple)) and all(is_numeric(x) for x in cell)
return all(is_numeric(x) for x in cell)

try:
first_non_nan_value = s.loc[s.first_valid_index()]
Expand Down
145 changes: 145 additions & 0 deletions texthero/representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
TF-IDF, word2vec or GloVe.
"""

from texthero._types import InputSeries, TokenSeries, VectorSeries
from typing import List, Union
import flair
import pandas as pd
import numpy as np

Expand Down Expand Up @@ -975,3 +978,145 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser
)
else:
return pd.Series(list(result), index=input_matrix.index)

return s_result


"""
Flair Embeddings
----------------

Embed documents and use the embeddings to find similar documents,
discover topics, and more.

There are many different ways to transform text data into vectors to gain
insights from them. The resulting vectors are called _embeddings_.
A _word embedding_ assigns a vector to each word, a _document embedding_
(sometimes called _thought vector_)
assigns a vector to each document.

One way to get embeddings is to use a function such as tfidf,
count, or term_frequency that creates vectors depending
on which words occur how frequently.

Another option is to use embeddings that try to directly capture
the semantic relationship between words or sentences. For example,
the vector of 'biochemistry' minus the vector of 'chemistry' is
close to the vector of 'biology'.

In Texthero, both options are supported. The second option is
implemented through the `Flair library <https://github.com/flairNLP/flair>`_
"""


# We only `import flair` and
# call everything from flair directly,
# e.g. flair.data.Tokens, to not pollute our
# namespace with similarly sounding names.


"""
Helper functions.
"""


def _texthero_init_for_flair_sentence(self, already_tokenized_text: List[str]):
    """
    Replacement ``__init__`` for ``flair.data.Sentence``.

    To use Flair embeddings, Flair needs as input a
    'flair.Sentence' object. Creating such an object
    only works from strings in flair. However, we want
    our embeddings to work on TokenSeries, so we
    overwrite the 'flair Sentence' __init__ method with
    this method to create Sentence objects from already
    tokenized text.
    """

    super(flair.data.Sentence, self).__init__()

    # Initialize the same attributes flair's own __init__ sets up.
    self.tokens: List[flair.data.Token] = []
    # BUG FIX: this was annotated `Dict`, which is never imported in this
    # module (only List and Union are). Annotations on attribute targets
    # are evaluated at runtime (PEP 526), so constructing a Sentence
    # raised NameError. Use the builtin `dict` instead.
    self._embeddings: dict = {}
    self.language_code: str = None
    self.tokenized = None

    # already tokenized -> simply add the tokens
    for token in already_tokenized_text:
        self.add_token(token)


def _flair_setup_for_texthero():
    """
    Monkey-patch flair so Sentence objects accept pre-tokenized input.

    Replaces ``flair.data.Sentence.__init__`` with
    ``_texthero_init_for_flair_sentence`` so the rest of this module can
    build Sentence objects directly from lists of tokens.
    """
    setattr(flair.data.Sentence, "__init__", _texthero_init_for_flair_sentence)


"""
Support for flair embeddings.
"""


@InputSeries(TokenSeries)
def embed(
    s: TokenSeries, flair_embedding: flair.embeddings.DocumentEmbeddings
) -> VectorSeries:
    """
    Generate a vector for each document using the given flair_embedding.

    Given a tokenized Series and a
    `Flair Document Embedding <https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md>`_,
    return the document embedding for every
    document in the tokenized Series.

    Parameters
    ----------
    s : :class:`texthero._types.TokenSeries`
        The series we want to calculate embeddings for.

    flair_embedding : flair.embeddings.DocumentEmbeddings
        The document embedding we want to use.

    Raises
    ------
    ImportError
        If the flair library is not installed.
    ValueError
        If flair_embedding is not a flair DocumentEmbeddings instance.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> from flair.embeddings import TransformerDocumentEmbeddings
    >>> embedding = TransformerDocumentEmbeddings("google/reformer-crime-and-punishment")
    >>> s = pd.Series(["Text of doc 1", "Text of doc 2"]).pipe(hero.tokenize)
    >>> hero.embed(s, embedding) # doctest: +SKIP
    0    [-0.6618074, -0.20467158, -0.05876905, -0.3482...
    1    [-0.5505255, -0.21915795, -0.0913163, -0.26856...
    dtype: object

    See Also
    --------
    `Flair Document Embedding <https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md>`_

    TODO: Add a link to the Flair/Texthero article that explains how
    to flair with Texthero and how to install flair+torch
    """
    try:
        import flair
    except ImportError:  # was a bare except; only an import failure matters here
        raise ImportError(
            "To use this function, you need to have"
            " the flair library (https://github.com/flairNLP/flair)"
            " installed and imported!"
        )

    # Validate the argument up front, BEFORE monkey-patching flair's
    # Sentence class (previously the patch was applied even when the
    # call was then rejected).
    if not isinstance(flair_embedding, flair.embeddings.DocumentEmbeddings):
        raise ValueError(
            "Unknown embedding type. Texthero only works with"
            " flair DocumentEmbeddings."
        )

    _flair_setup_for_texthero()

    def _embed_and_return_embedding(tokens):
        # flair embeddings need a 'flair Sentence' object as input.
        sentence = flair.data.Sentence(tokens)
        # Calculate the embedding; flair writes it to sentence.embedding
        flair_embedding.embed(sentence)
        # Return it as list.
        return sentence.embedding.detach().tolist()

    return s.apply(_embed_and_return_embedding)