Skip to content

Commit

Permalink
Guardian - infer language from corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Jan 11, 2023
1 parent 25952ea commit c177895
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 31 deletions.
68 changes: 41 additions & 27 deletions orangecontrib/text/guardian.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,21 @@
10
"""

import requests
import math
import json
import os
from functools import partial

from Orange import data

from orangecontrib.text.corpus import Corpus
import requests
from Orange.data import (
StringVariable,
DiscreteVariable,
ContinuousVariable,
TimeVariable,
)
from dateutil.parser import isoparse

from orangecontrib.text.util import create_corpus

BASE_URL = 'http://content.guardianapis.com/search'
ARTICLES_PER_PAGE = 10
Expand Down Expand Up @@ -53,29 +58,33 @@ def __eq__(self, other):


class TheGuardianAPI:
attributes = []

class_vars = [
(data.DiscreteVariable('Section'), lambda doc: doc['sectionName']),
(partial(DiscreteVariable, "Section"), lambda doc: doc["sectionName"]),
]

tv = data.TimeVariable('Publication Date')
metas = [
(data.StringVariable('Headline'), lambda doc: doc['fields']['headline']),
(data.StringVariable('Content'), lambda doc: doc['fields']['bodyText']),
(data.StringVariable('Trail Text'), lambda doc: doc['fields']['trailText']),
(data.StringVariable('HTML'), lambda doc: doc['fields']['body']),
(tv, lambda doc: TheGuardianAPI.tv.parse(doc['webPublicationDate'])),
(data.DiscreteVariable('Type'), lambda doc: doc['type']),
(data.DiscreteVariable('Language'), lambda doc: doc['fields']['lang']),
(data.StringVariable('Tags'),
lambda doc: ', '.join(tag['webTitle'] for tag in doc['tags'])),
(data.StringVariable('URL'), lambda doc: doc['webUrl']),
(data.ContinuousVariable('Word Count', number_of_decimals=0),
lambda doc: doc['fields']['wordcount']),
(partial(StringVariable, "Headline"), lambda doc: doc["fields"]["headline"]),
(partial(StringVariable, "Content"), lambda doc: doc["fields"]["bodyText"]),
(partial(StringVariable, "Trail Text"), lambda doc: doc["fields"]["trailText"]),
(partial(StringVariable, "HTML"), lambda doc: doc["fields"]["body"]),
(
partial(TimeVariable, "Publication Date"),
lambda doc: isoparse(doc["webPublicationDate"]).timestamp(),
),
(partial(DiscreteVariable, "Type"), lambda doc: doc["type"]),
(partial(DiscreteVariable, "Language"), lambda doc: doc["fields"]["lang"]),
(
partial(StringVariable, "Tags"),
lambda doc: ", ".join(tag["webTitle"] for tag in doc["tags"]),
),
(partial(StringVariable, "URL"), lambda doc: doc["webUrl"]),
(
partial(ContinuousVariable, "Word Count", number_of_decimals=0),
lambda doc: doc["fields"]["wordcount"],
),
]

text_features = [metas[0][0], metas[1][0]] # Headline + Content
text_features = ["Headline", "Content"] #
title_indices = [-1] # Headline

def __init__(self, credentials, on_progress=None, should_break=None):
Expand Down Expand Up @@ -156,11 +165,16 @@ def search(self, query, from_date=None, to_date=None, max_documents=None,
self._search(query, from_date, to_date, p)
self.on_progress(p*self.per_page, pages * self.per_page)

c = Corpus.from_documents(
self.results, 'The Guardian', self.attributes, self.class_vars,
self.metas, title_indices=self.title_indices)
c.text_features = self.text_features
return c
return create_corpus(
self.results,
[],
self.class_vars,
self.metas,
self.title_indices,
self.text_features,
"The Guardian",
"Language",
)


if __name__ == '__main__':
Expand Down
4 changes: 4 additions & 0 deletions orangecontrib/text/tests/test_guardian.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,7 @@ def test_search_mock_data(self, mock_get):
"""
corp = self.api.search('Slovenia')
self.assertEqual(len(corp), 2)


if __name__ == "__main__":
unittest.main()
85 changes: 84 additions & 1 deletion orangecontrib/text/util.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from functools import wraps
from math import ceil
from typing import Union, List
from typing import Union, List, Callable, Any, Tuple

import numpy as np
import scipy.sparse as sp
from Orange.data import Domain, DiscreteVariable
from gensim.matutils import Sparse2Corpus

from orangecontrib.text import Corpus
from orangecontrib.text.language import infer_language_from_variable


def chunks(iterable, chunk_size):
""" Splits iterable objects into chunk of fixed size.
Expand Down Expand Up @@ -88,3 +92,82 @@ def __getitem__(
"""
sparse = self.sparse.__getitem__((slice(None, None, None), key))
return Sparse2CorpusSliceable(sparse)


def create_corpus(
    documents: List[Any],
    attributes: List[Tuple[Callable, Callable]],
    class_vars: List[Tuple[Callable, Callable]],
    metas: List[Tuple[Callable, Callable]],
    title_indices: List[int],
    text_features: List[str],
    name: str,
    language_attribute: str,
) -> "Corpus":
    """
    Create a corpus from a list of documents produced by modules such as
    Guardian/NYT.

    Parameters
    ----------
    documents
        List with values downloaded from the API.
    attributes
        List of (variable factory, extractor) pairs for X attributes.
    class_vars
        List of (variable factory, extractor) pairs for class attributes.
    metas
        List of (variable factory, extractor) pairs for meta attributes.
    title_indices
        Indices (into the domain) of the title attribute(s).
    text_features
        Names of text features.
    name
        The name of the Corpus.
    language_attribute
        Name of the variable from which the corpus language is inferred.

    Returns
    -------
    Corpus with documents.
    """
    # Each factory is called with no arguments to instantiate a fresh
    # variable (factories are typically functools.partial(Variable, name)).
    domain = Domain(
        attributes=[attr() for attr, _ in attributes],
        class_vars=[attr() for attr, _ in class_vars],
        metas=[attr() for attr, _ in metas],
    )
    for ind in title_indices:
        domain[ind].attributes["title"] = True

    def to_val(attr, val):
        # Discrete values must be registered with the variable before they
        # can be converted to their numeric index.
        if isinstance(attr, DiscreteVariable):
            attr.val_from_str_add(val)
        return attr.to_val(val)

    # BUG FIX: X must zip domain.attributes with the attribute extractors.
    # The original zipped domain.class_vars here (copy-paste from the Y
    # construction below), which silently produced wrong/truncated rows
    # whenever the attribute and class-var lists differed in length.
    X = [
        [to_val(a, f(doc)) for a, (_, f) in zip(domain.attributes, attributes)]
        for doc in documents
    ]
    Y = [
        [to_val(a, f(doc)) for a, (_, f) in zip(domain.class_vars, class_vars)]
        for doc in documents
    ]
    metas = [
        [to_val(a, f(doc)) for a, (_, f) in zip(domain.metas, metas)]
        for doc in documents
    ]
    X = np.array(X, dtype=np.float64)
    Y = np.array(Y, dtype=np.float64)
    metas = np.array(metas, dtype=object)

    # NOTE(review): assumes the language variable's values are codes that
    # infer_language_from_variable understands — confirm against callers.
    language = infer_language_from_variable(domain[language_attribute])
    corpus = Corpus.from_numpy(
        domain=domain,
        X=X,
        Y=Y,
        metas=metas,
        text_features=[domain[f] for f in text_features],
        language=language,
    )
    corpus.name = name
    return corpus
7 changes: 4 additions & 3 deletions orangecontrib/text/widgets/owguardian.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,10 @@ class Outputs:
recent_queries = Setting([])
date_from = Setting((datetime.now().date() - timedelta(365)))
date_to = Setting(datetime.now().date())
attributes = [feat.name for feat, _ in TheGuardianAPI.metas if
isinstance(feat, StringVariable)]
text_includes = Setting([feat.name for feat in TheGuardianAPI.text_features])
attributes = [
part.args[0] for part, _ in TheGuardianAPI.metas if part.func is StringVariable
]
text_includes = Setting([feat for feat in TheGuardianAPI.text_features])

class Warning(OWWidget.Warning):
no_text_fields = Msg('Text features are inferred when none are selected.')
Expand Down

0 comments on commit c177895

Please sign in to comment.