Merge pull request #644 from VesnaT/keywords
[ENH] Extract Keywords: New widget
PrimozGodec authored Apr 22, 2021
2 parents 97e3a1b + de02b4f commit 0986f20
Showing 8 changed files with 1,157 additions and 11 deletions.
1 change: 1 addition & 0 deletions doc/index.rst
@@ -29,6 +29,7 @@ Widgets
widgets/duplicatedetection
widgets/statistics
widgets/corpustonetwork
widgets/keywords

Scripting
---------
15 changes: 15 additions & 0 deletions doc/widgets/keywords.md
@@ -0,0 +1,15 @@
Extract Keywords
================

Infers characteristic words from the input corpus.

**Inputs**

- Corpus: A collection of documents.
- Words: A table of words.

**Outputs**

- Words: A table of words.

**Extract Keywords** infers characteristic words from the corpus.
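
The scoring functions behind the widget can also be used from scripts. A minimal sketch, assuming the add-on is installed and using the bundled book-excerpts corpus:

```python
from orangecontrib.text import Corpus
from orangecontrib.text.keywords import tfidf_keywords

corpus = Corpus.from_file("book-excerpts")
# corpus.tokens runs default preprocessing if the corpus was not preprocessed yet
keywords = tfidf_keywords(corpus.tokens)
print(keywords[0][:5])  # first five (word, score) pairs of the first document
```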
288 changes: 277 additions & 11 deletions orangecontrib/text/keywords/__init__.py
@@ -1,43 +1,309 @@
"""
Module for keyword extraction.
"""
from collections import defaultdict
from itertools import chain
from typing import List, Tuple, Callable

import yake
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from Orange.util import dummy_callback

from orangecontrib.text.keywords.rake import Rake
from orangecontrib.text.preprocess import StopwordsFilter


# all available languages for RAKE
RAKE_LANGUAGES = StopwordsFilter.supported_languages()
# all available languages for YAKE!
YAKE_LANGUAGE_MAPPING = {
"Arabic": "ar",
"Armenian": "hy",
"Breton": "br",
"Bulgarian": "bg",
"Chinese": "zh",
"Croatian": "hr",
"Czech": "cz",
"Danish": "da",
"Dutch": "nl",
"English": "en",
"Estonian": "et",
"Finnish": "fi",
"French": "fr",
"German": "de",
"Greek": "el",
"Hindi": "hi",
"Hungarian": "hu",
"Indonesian": "id",
"Italian": "it",
"Japanese": "ja",
"Latvian": "lv",
"Lithuanian": "lt",
"Norwegian": "no",
"Persian": "fa",
"Polish": "pl",
"Portuguese": "pt",
"Romanian": "ro",
"Russian": "ru",
"Slovak": "sk",
"Slovenian": "sl",
"Spanish": "es",
"Swedish": "sv",
"Turkish": "tr",
"Ukrainian": "uk"
}


def tfidf_keywords(
tokens: List[List[str]],
progress_callback: Callable = None
) -> List[List[Tuple[str, float]]]:
"""
Extract keywords using TF-IDF.
Parameters
----------
tokens : list
Lists of tokens.
progress_callback : callable
Function for reporting progress.
Returns
-------
keywords : list
"""
if progress_callback is None:
progress_callback = dummy_callback

vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
X = vectorizer.fit_transform(tokens)
words = vectorizer.get_feature_names()

keywords = []
n_docs = X.shape[0]
for i, row in enumerate(X):
progress_callback(i / n_docs)
# (word, score) pairs for the nonzero entries of this document's row
keywords.append([(words[j], row[0, j]) for j in row.nonzero()[1]])
return keywords
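
For reference, a quick sketch of calling it directly on token lists (the tokens are made up); the callback receives the fraction of processed documents:

```python
from orangecontrib.text.keywords import tfidf_keywords

tokens = [["human", "machine", "interface"],
          ["machine", "learning", "survey"]]

result = tfidf_keywords(tokens, progress_callback=lambda f: print(f"{f:.0%}"))
for doc_keywords in result:
    # (word, TF-IDF score) pairs, highest-scoring first
    print(sorted(doc_keywords, key=lambda ws: -ws[1]))
```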


def yake_keywords(
texts: List[str],
language: str = "English",
max_len: int = 1,
progress_callback: Callable = None
) -> List[List[Tuple[str, float]]]:
"""
Extract keywords using YAKE!.
Parameters
----------
texts : list
List of documents.
language : str
Selected language.
max_len : int
Maximum number of tokens.
progress_callback : callable
Function for reporting progress.
Returns
-------
keywords : list
"""
if progress_callback is None:
progress_callback = dummy_callback

language = YAKE_LANGUAGE_MAPPING[language]
extractor = yake.KeywordExtractor(lan=language, n=max_len)

keywords = []
n_docs = len(texts)
for i, text in enumerate(texts):
progress_callback(i / n_docs)
keywords.append(extractor.extract_keywords(text))
return keywords
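
A similar sketch for direct use (sample sentence made up); note that YAKE! scores are inverted, so lower values mark more relevant keywords:

```python
from orangecontrib.text.keywords import yake_keywords

texts = ["Orange is a component-based framework for data mining."]
for doc_keywords in yake_keywords(texts, language="English", max_len=2):
    print(doc_keywords)  # (keyword, score) pairs; lower score = more relevant
```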


def rake_keywords(
texts: List[str],
language: str = "English",
max_len: int = 1,
progress_callback: Callable = None
) -> List[List[Tuple[str, float]]]:
"""
Extract keywords from text with RAKE method.
Parameters
----------
texts
texts : list
List of texts from which keywords are extracted
language
language : str
The language of texts
max_len
max_len : int
Maximal length of keywords/keyphrases extracted
progress_callback : callable
Function for reporting progress.
Returns
-------
List which contains list of keywords for each of the documents in texts.
For each keyword function returns a tuple with keyword and it's score.
"""
if progress_callback is None:
progress_callback = dummy_callback

if language.lower() not in [l.lower() for l in RAKE_LANGUAGES]:
raise ValueError(f"Language must be one of: {RAKE_LANGUAGES}")

stop_words_ = [x.strip() for x in stopwords.words(language.lower())]
rake_object = Rake(stop_words_, max_words_length=max_len)

keywords = []
n_docs = len(texts)
for i, text in enumerate(texts):
progress_callback(i / n_docs)
keywords.append(rake_object.run(text))
return keywords
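
Direct use needs the NLTK stopword corpus; a minimal sketch with a made-up sentence:

```python
import nltk
nltk.download("stopwords", quiet=True)  # stop words for the chosen language

from orangecontrib.text.keywords import rake_keywords

texts = ["Keyword extraction finds the most characteristic terms in a document."]
print(rake_keywords(texts, language="English", max_len=3))
```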


class ScoringMethods:
"""
Scoring methods enum.
"""
TF_IDF, RAKE, YAKE, EMBEDDING = "TF-IDF", "Rake", "YAKE!", "Embedding"
ITEMS = list(zip((TF_IDF, YAKE, RAKE),
(tfidf_keywords, yake_keywords, rake_keywords)))

TOKEN_METHODS = TF_IDF, EMBEDDING
DOCUMENT_METHODS = RAKE, YAKE
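
ITEMS pairs each method name with its scoring function (Embedding has no entry here), so callers can dispatch by name. A small sketch:

```python
from orangecontrib.text.keywords import ScoringMethods

scorers = dict(ScoringMethods.ITEMS)
tokens = [["orange", "data", "mining"]]
print(scorers[ScoringMethods.TF_IDF](tokens))
```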


class AggregationMethods:
"""
Aggregation methods enum and helper functions.
"""
MEAN, MEDIAN, MIN, MAX = range(4)
ITEMS = "Mean", "Median", "Min", "Max"

@staticmethod
def aggregate(
keywords: List[List[Tuple[str, float]]],
agg_method: int
) -> List[Tuple[str, float]]:
"""
Aggregate scores.
Parameters
----------
keywords : list
List of keywords for each document.
agg_method : int
Method type. One of: MEAN, MEDIAN, MIN, MAX.
Returns
-------
Aggregated keyword scores.
"""
return [AggregationMethods.mean,
AggregationMethods.median,
AggregationMethods.min,
AggregationMethods.max][agg_method](keywords)

@staticmethod
def mean(
keywords: List[List[Tuple[str, float]]]
) -> List[Tuple[str, float]]:
"""
'mean' aggregation function.
Parameters
----------
keywords : list
List of keywords for each document.
Returns
-------
Aggregated keyword scores.
"""
scores = list(chain.from_iterable(keywords))
unique_scores = defaultdict(lambda: 0.)
for word, score in scores:
unique_scores[word] += score
for word, score in unique_scores.items():
unique_scores[word] = score / len(keywords)
return list(unique_scores.items())

@staticmethod
def median(
keywords: List[List[Tuple[str, float]]]
) -> List[Tuple[str, float]]:
"""
'median' aggregation function.
Parameters
----------
keywords : list
List of keywords for each document.
Returns
-------
Aggregated keyword scores.
"""
scores = list(chain.from_iterable(keywords))
unique_scores = defaultdict(lambda: [])
for word, score in scores:
unique_scores[word].append(score)
for word, score in unique_scores.items():
unique_scores[word] = np.median(score)
return list(unique_scores.items())

@staticmethod
def min(
keywords: List[List[Tuple[str, float]]]
) -> List[Tuple[str, float]]:
"""
'min' aggregation function.
Parameters
----------
keywords : list
List of keywords for each document.
Returns
-------
Aggregated keyword scores.
"""
scores = list(chain.from_iterable(keywords))
unique_scores = defaultdict(lambda: [])
for word, score in scores:
unique_scores[word].append(score)
for word, score in unique_scores.items():
unique_scores[word] = np.min(score)
return list(unique_scores.items())

@staticmethod
def max(
keywords: List[List[Tuple[str, float]]]
) -> List[Tuple[str, float]]:
"""
'max' aggregation function.
Parameters
----------
keywords : list
List of keywords for each document.
Returns
-------
Aggregated keyword scores.
"""
scores = list(chain.from_iterable(keywords))
unique_scores = defaultdict(lambda: [])
for word, score in scores:
unique_scores[word].append(score)
for word, score in unique_scores.items():
unique_scores[word] = np.max(score)
return list(unique_scores.items())
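
To illustrate the aggregation semantics: mean divides each word's summed score by the total number of documents, while median, min and max only consider the documents in which the word appears. A small check with made-up scores:

```python
from orangecontrib.text.keywords import AggregationMethods

per_doc = [[("data", 0.8), ("mining", 0.5)],
           [("data", 0.4)]]

print(AggregationMethods.aggregate(per_doc, AggregationMethods.MEAN))
# ~[('data', 0.6), ('mining', 0.25)] -- "mining" is averaged over both documents
print(AggregationMethods.aggregate(per_doc, AggregationMethods.MAX))
# [('data', 0.8), ('mining', 0.5)]
```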
