Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for Flair Embeddings: hero.embed(s, flair_embedding) function #146

Draft
wants to merge 17 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,36 @@ jobs:
include:
- name: "Python 3.6.0 on Xenial Linux"
python: 3.6

- name: "Python 3.7.0 on Xenial Linux"
python: 3.7

- name: "Python 3.8.0 on Xenial Linux"
python: 3.8 # this works for Linux but is ignored on macOS or Windows

- name: "Python 3.7.4 on macOS"
os: osx
osx_image: xcode11.2 # Python 3.7.4 running on macOS 10.14.4
language: shell # 'language: python' is an error on Travis CI macOS
install:
- pip3 install --upgrade pip # all three OSes agree about 'pip3'
- pip3 install black
- pip3 install --user ".[dev]" .

- name: "Python 3.8.0 on Windows"
os: windows # Windows 10.0.17134 N/A Build 17134
language: shell # 'language: python' is an error on Travis CI Windows
before_install:
- choco install python --version 3.8.0
- python -m pip install --upgrade pip
- pip3 install --no-deps torch===1.6.0 -f https://download.pytorch.org/whl/torch_stable.html
env: PATH=/c/Python38:/c/Python38/Scripts:$PATH

install:
- pip3 install --upgrade pip # all three OSes agree about 'pip3'
- pip3 install black==19.10b0
- pip3 install ".[dev]" .

# 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows
# 'python3' is a 'command not found' error on Windows but 'py' works on Windows only
henrifroese marked this conversation as resolved.
Show resolved Hide resolved
script:
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,6 @@ dev =
nbsphinx
parameterized>=0.7.4
coverage
flair>=0.5.1
pre-commit
pandas>=1.1.0
94 changes: 94 additions & 0 deletions tests/test_representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import numpy as np
from texthero import representation
from texthero import preprocessing
from texthero import _types

from . import PandasTestCase

Expand All @@ -12,6 +13,13 @@
import warnings
from parameterized import parameterized

from flair.embeddings import (
WordEmbeddings,
DocumentPoolEmbeddings,
DocumentRNNEmbeddings,
TransformerDocumentEmbeddings,
SentenceTransformerDocumentEmbeddings,
)

"""
Test doctest
Expand Down Expand Up @@ -268,3 +276,89 @@ def test_normalize_DataFrame_also_as_output(self):
pd.testing.assert_frame_equal(
result, correct_output, check_dtype=False, rtol=0.1, atol=0.1,
)


"""
Test embeddings functions.
"""


# Fixture for the embedding tests: a tokenized Series (one list of string
# tokens per document) with a non-default integer index, so the tests can
# verify that `embed` preserves the caller's index.
s_tokenized_for_embeddings = pd.Series(
    data=[
        ["Test", "Test2", "!", "yes", "hä", "^°"],
        ["Test3", "wow ", "aha", "super"],
    ],
    index=[5, 7],
)


class TestEmbeddings(PandasTestCase):
    """
    Test embed function.

    There are three types of Document Embeddings that
    don't require additional dependencies
    (the SentenceTransformerDocumentEmbeddings requires extra
    dependencies), see
    `here <https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md>`_.
    We test all of them here.
    """

    def _assert_valid_embedding_output(self, s_return):
        # Shared assertions for all embedding tests: every document is
        # embedded to a non-empty list, all embeddings have the same
        # dimensionality, the input index is preserved, and the output
        # is a valid VectorSeries.
        self.assertTrue(isinstance(s_return.iloc[0], list))
        self.assertTrue(isinstance(s_return.iloc[1], list))
        self.assertTrue(len(s_return.iloc[0]) == len(s_return.iloc[1]) > 0)

        # BUG FIX: previously compared s_return.index to itself,
        # which always passes; compare against the input's index.
        pd.testing.assert_index_equal(
            s_return.index, s_tokenized_for_embeddings.index
        )

        # check if output is valid VectorSeries
        try:
            _types.VectorSeries.check_type(s_return)
        except Exception:  # was a bare except; keep failure message explicit
            self.fail("Output is not a valid VectorSeries.")

    def test_embed_document_pool_embedding(self):
        word_embedding = WordEmbeddings("turian")
        document_embedding = DocumentPoolEmbeddings([word_embedding])

        s_return = representation.embed(s_tokenized_for_embeddings, document_embedding)

        self._assert_valid_embedding_output(s_return)

        del word_embedding

    def test_embed_document_rnn_embedding(self):
        word_embedding = WordEmbeddings("turian")
        # BUG FIX: this test previously constructed a DocumentPoolEmbeddings,
        # duplicating the pool test above and leaving the RNN embedding
        # untested; it now actually builds a DocumentRNNEmbeddings.
        document_embedding = DocumentRNNEmbeddings([word_embedding])

        s_return = representation.embed(s_tokenized_for_embeddings, document_embedding)

        self._assert_valid_embedding_output(s_return)

        del word_embedding

    def test_embed_transformer_document_embedding(self):
        # load smallest available transformer model
        document_embedding = TransformerDocumentEmbeddings(
            "google/reformer-crime-and-punishment"
        )

        s_return = representation.embed(s_tokenized_for_embeddings, document_embedding)

        self._assert_valid_embedding_output(s_return)

        del document_embedding
2 changes: 1 addition & 1 deletion texthero/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def is_numeric(x):
return True

def is_list_of_numbers(cell):
return isinstance(cell, (list, tuple)) and all(is_numeric(x) for x in cell)
return all(is_numeric(x) for x in cell)

try:
first_non_nan_value = s.loc[s.first_valid_index()]
Expand Down
145 changes: 145 additions & 0 deletions texthero/representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
TF-IDF, word2vec or GloVe.
"""

from texthero._types import InputSeries, TokenSeries, VectorSeries
from typing import List, Union
import flair
import pandas as pd
import numpy as np

Expand Down Expand Up @@ -975,3 +978,145 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser
)
else:
return pd.Series(list(result), index=input_matrix.index)

return s_result


"""
Flair Embeddings
----------------

Embed documents and use the embeddings to find similar documents,
discover topics, and more.

There are many different ways to transform text data into vectors to gain
insights from them. The resulting vectors are called _embeddings_.
A _word embedding_ assigns a vector to each word, a _document embedding_
(sometimes called _thought vector_)
assigns a vector to each document.

One way to get embeddings is to use a function such as tfidf,
count, or term_frequency that creates vectors depending
on which words occur how frequently.

Another option is to use embeddings that try to directly capture
the semantic relationship between words or sentences. For example,
the vector of 'biochemistry' minus the vector of 'chemistry' is
close to the vector of 'biology'.

In Texthero, both options are supported. The second option is
implemented through the `Flair library <https://github.com/flairNLP/flair>`_
"""


# We only `import flair` and
# call everything from flair directly,
# e.g. flair.data.Tokens, to not pollute our
# namespace with similarly sounding names.


"""
Helper functions.
"""


def _texthero_init_for_flair_sentence(self, already_tokenized_text: List[str]):
    """
    Replacement ``__init__`` for ``flair.data.Sentence``.

    To use Flair embeddings, Flair needs as input a
    'flair.Sentence' object. Creating such an object
    only works from strings in flair. However, we want
    our embeddings to work on TokenSeries, so we
    overwrite the 'flair Sentence' __init__ method with
    this method to create Sentence objects from already
    tokenized text.
    """

    super(flair.data.Sentence, self).__init__()

    # Initialize the same attributes flair's own __init__ sets up.
    self.tokens: List[flair.data.Token] = []
    # BUG FIX: this was annotated `Dict`, which is never imported in this
    # module (only List and Union are). Annotations on attribute targets
    # are evaluated at runtime (PEP 526), so constructing a Sentence
    # raised NameError. Use the builtin `dict` instead.
    self._embeddings: dict = {}
    self.language_code: str = None
    self.tokenized = None

    # already tokenized -> simply add the tokens
    for token in already_tokenized_text:
        self.add_token(token)


def _flair_setup_for_texthero():
    """
    Monkey-patch flair so Sentence objects accept pre-tokenized input.

    Replaces ``flair.data.Sentence.__init__`` with
    ``_texthero_init_for_flair_sentence`` so the rest of this module can
    build Sentence objects directly from lists of tokens.
    """
    setattr(flair.data.Sentence, "__init__", _texthero_init_for_flair_sentence)


"""
Support for flair embeddings.
"""


@InputSeries(TokenSeries)
def embed(
    s: TokenSeries, flair_embedding: flair.embeddings.DocumentEmbeddings
) -> VectorSeries:
    """
    Generate a vector for each document using the given flair_embedding.

    Given a tokenized Series and a
    `Flair Document Embedding <https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md>`_,
    return the document embedding for every
    document in the tokenized Series.

    Parameters
    ----------
    s : :class:`texthero._types.TokenSeries`
        The series we want to calculate embeddings for.

    flair_embedding : flair.embeddings.DocumentEmbeddings
        The document embedding we want to use.

    Raises
    ------
    ImportError
        If the flair library is not installed.
    ValueError
        If flair_embedding is not a flair DocumentEmbeddings instance.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> from flair.embeddings import TransformerDocumentEmbeddings
    >>> embedding = TransformerDocumentEmbeddings("google/reformer-crime-and-punishment")
    >>> s = pd.Series(["Text of doc 1", "Text of doc 2"]).pipe(hero.tokenize)
    >>> hero.embed(s, embedding) # doctest: +SKIP
    0    [-0.6618074, -0.20467158, -0.05876905, -0.3482...
    1    [-0.5505255, -0.21915795, -0.0913163, -0.26856...
    dtype: object

    See Also
    --------
    `Flair Document Embedding <https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md>`_

    TODO: Add a link to the Flair/Texthero article that explains how
    to flair with Texthero and how to install flair+torch
    """
    try:
        import flair
    except ImportError:  # was a bare except; only an import failure matters here
        raise ImportError(
            "To use this function, you need to have"
            " the flair library (https://github.com/flairNLP/flair)"
            " installed and imported!"
        )

    # Validate the argument up front, BEFORE monkey-patching flair's
    # Sentence class (previously the patch was applied even when the
    # call was then rejected).
    if not isinstance(flair_embedding, flair.embeddings.DocumentEmbeddings):
        raise ValueError(
            "Unknown embedding type. Texthero only works with"
            " flair DocumentEmbeddings."
        )

    _flair_setup_for_texthero()

    def _embed_and_return_embedding(tokens):
        # flair embeddings need a 'flair Sentence' object as input.
        sentence = flair.data.Sentence(tokens)
        # Calculate the embedding; flair writes it to sentence.embedding
        flair_embedding.embed(sentence)
        # Return it as list.
        return sentence.embedding.detach().tolist()

    return s.apply(_embed_and_return_embedding)