Skip to content

Commit

Permalink
Merge pull request #925 from PrimozGodec/lagdetect-guardian
Browse files Browse the repository at this point in the history
[ENH] Guardian - infer language and add to corpus
  • Loading branch information
VesnaT authored Mar 14, 2023
2 parents 9bd6012 + 1de1617 commit 20dc93f
Show file tree
Hide file tree
Showing 4 changed files with 215 additions and 60 deletions.
68 changes: 41 additions & 27 deletions orangecontrib/text/guardian.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,21 @@
10
"""

import requests
import math
import json
import os
from functools import partial

from Orange import data

from orangecontrib.text.corpus import Corpus
import requests
from Orange.data import (
StringVariable,
DiscreteVariable,
ContinuousVariable,
TimeVariable,
)
from dateutil.parser import isoparse

from orangecontrib.text.util import create_corpus

BASE_URL = 'http://content.guardianapis.com/search'
ARTICLES_PER_PAGE = 10
Expand Down Expand Up @@ -53,29 +58,33 @@ def __eq__(self, other):


class TheGuardianAPI:
attributes = []

class_vars = [
(data.DiscreteVariable('Section'), lambda doc: doc['sectionName']),
(partial(DiscreteVariable, "Section"), lambda doc: doc["sectionName"]),
]

tv = data.TimeVariable('Publication Date')
metas = [
(data.StringVariable('Headline'), lambda doc: doc['fields']['headline']),
(data.StringVariable('Content'), lambda doc: doc['fields']['bodyText']),
(data.StringVariable('Trail Text'), lambda doc: doc['fields']['trailText']),
(data.StringVariable('HTML'), lambda doc: doc['fields']['body']),
(tv, lambda doc: TheGuardianAPI.tv.parse(doc['webPublicationDate'])),
(data.DiscreteVariable('Type'), lambda doc: doc['type']),
(data.DiscreteVariable('Language'), lambda doc: doc['fields']['lang']),
(data.StringVariable('Tags'),
lambda doc: ', '.join(tag['webTitle'] for tag in doc['tags'])),
(data.StringVariable('URL'), lambda doc: doc['webUrl']),
(data.ContinuousVariable('Word Count', number_of_decimals=0),
lambda doc: doc['fields']['wordcount']),
(partial(StringVariable, "Headline"), lambda doc: doc["fields"]["headline"]),
(partial(StringVariable, "Content"), lambda doc: doc["fields"]["bodyText"]),
(partial(StringVariable, "Trail Text"), lambda doc: doc["fields"]["trailText"]),
(partial(StringVariable, "HTML"), lambda doc: doc["fields"]["body"]),
(
partial(TimeVariable, "Publication Date", have_time=1, have_date=1),
lambda doc: isoparse(doc["webPublicationDate"]).timestamp(),
),
(partial(DiscreteVariable, "Type"), lambda doc: doc["type"]),
(partial(DiscreteVariable, "Language"), lambda doc: doc["fields"]["lang"]),
(
partial(StringVariable, "Tags"),
lambda doc: ", ".join(tag["webTitle"] for tag in doc["tags"]),
),
(partial(StringVariable, "URL"), lambda doc: doc["webUrl"]),
(
partial(ContinuousVariable, "Word Count", number_of_decimals=0),
lambda doc: doc["fields"]["wordcount"],
),
]

text_features = [metas[0][0], metas[1][0]] # Headline + Content
text_features = ["Headline", "Content"] #
title_indices = [-1] # Headline

def __init__(self, credentials, on_progress=None, should_break=None):
Expand Down Expand Up @@ -156,11 +165,16 @@ def search(self, query, from_date=None, to_date=None, max_documents=None,
self._search(query, from_date, to_date, p)
self.on_progress(p*self.per_page, pages * self.per_page)

c = Corpus.from_documents(
self.results, 'The Guardian', self.attributes, self.class_vars,
self.metas, title_indices=self.title_indices)
c.text_features = self.text_features
return c
return create_corpus(
self.results,
[],
self.class_vars,
self.metas,
self.title_indices,
self.text_features,
"The Guardian",
"Language",
)


if __name__ == '__main__':
Expand Down
115 changes: 86 additions & 29 deletions orangecontrib/text/tests/test_guardian.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,70 @@

from datetime import date, datetime
from unittest import mock
from unittest.mock import Mock

from orangecontrib.text import guardian


API_KEY = os.getenv('THE_GUARDIAN_API_KEY', 'test')
API_KEY = os.getenv("THE_GUARDIAN_API_KEY", "test")
responses = [
"""
{
"response": {
"pages": 2,
"results": [
{
"type": "article",
"sectionName": "World news",
"webPublicationDate": "2018-07-05T23:27:25Z",
"webUrl": "https://www.theguardian.com/world/2018/jul/06",
"fields": {
"headline": "Rohingya refugees reject UN-Myanmar repatriati",
"trailText": "Leaders say agreement does not address concer",
"body": "<p><strong><strong><strong></strong></strong></str",
"wordcount": "512",
"lang": "en",
"bodyText": "Rohingya community leaders have rejected an."
},
"tags": [
{
"webTitle": "Myanmar"
}
]
}
]
}
}
""",
"""
{
"response": {
"pages": 2,
"results": [
{
"type": "article",
"sectionName": "World news",
"webPublicationDate": "2018-07-05T23:27:25Z",
"webUrl": "https://www.theguardian.com/world/2018/jul/06",
"fields": {
"headline": "Rohingya refugees reject UN-Myanmar repatriati",
"trailText": "Leaders say agreement does not address concer",
"body": "<p><strong><strong><strong></strong></strong></str",
"wordcount": "512",
"lang": "fr",
"bodyText": "Rohingya community leaders have rejected an."
},
"tags": [
{
"webTitle": "Myanmar"
}
]
}
]
}
}
""",
]


class TestCredentials(unittest.TestCase):
Expand Down Expand Up @@ -79,33 +138,31 @@ def test_api_limit_error(self, mock_get):

@mock.patch('requests.get')
def test_search_mock_data(self, mock_get):
mock_get().text = """
{
"response": {
"pages": 2,
"results": [
{
"type": "article",
"sectionName": "World news",
"webPublicationDate": "2018-07-05T23:27:25Z",
"webUrl": "https://www.theguardian.com/world/2018/jul/06",
"fields": {
"headline": "Rohingya refugees reject UN-Myanmar repatriati",
"trailText": "Leaders say agreement does not address concer",
"body": "<p><strong><strong><strong></strong></strong></str",
"wordcount": "512",
"lang": "en",
"bodyText": "Rohingya community leaders have rejected an."
},
"tags": [
{
"webTitle": "Myanmar"
}
]
}
]
}
}
"""
mock_get.return_value.text = responses[0]
corp = self.api.search('Slovenia')
self.assertEqual(len(corp), 2)

@mock.patch("requests.get")
def test_tweets_language(self, mock_get):
    # NOTE(review): name says "tweets" but this tests Guardian articles —
    # presumably copied from the Twitter tests; consider renaming.
    """Corpus.language is inferred from the articles' 'lang' field:
    set when all articles agree, None when they differ."""
    # Wrap each canned JSON payload in a Mock exposing it via `.text`,
    # mimicking the `requests.get(...).text` access in the API client.
    mms = []
    for r in responses:
        mms.append(Mock())
        mms[-1].text = r

    mock_get.side_effect = mms
    # language should be None since returned articles have different languages
    corpus = self.api.search("Slovenia")
    self.assertIsNone(corpus.language)

    mock_get.side_effect = [mms[0], mms[0]]
    # corpus language should be set since articles have same language
    corpus = self.api.search("Slovenia")
    self.assertEqual("en", corpus.language)

    # Same check for a non-English single-language result set.
    mock_get.side_effect = [mms[1], mms[1]]
    corpus = self.api.search("Slovenia")
    self.assertEqual("fr", corpus.language)


if __name__ == "__main__":
unittest.main()
85 changes: 84 additions & 1 deletion orangecontrib/text/util.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from functools import wraps
from math import ceil
from typing import Union, List
from typing import Union, List, Callable, Any, Tuple

import numpy as np
import scipy.sparse as sp
from Orange.data import Domain, DiscreteVariable
from gensim.matutils import Sparse2Corpus

from orangecontrib.text import Corpus
from orangecontrib.text.language import infer_language_from_variable


def chunks(iterable, chunk_size):
""" Splits iterable objects into chunk of fixed size.
Expand Down Expand Up @@ -88,3 +92,82 @@ def __getitem__(
"""
sparse = self.sparse.__getitem__((slice(None, None, None), key))
return Sparse2CorpusSliceable(sparse)


def create_corpus(
    documents: List[Any],
    attributes: List[Tuple[Callable, Callable]],
    class_vars: List[Tuple[Callable, Callable]],
    metas: List[Tuple[Callable, Callable]],
    title_indices: List[int],
    text_features: List[str],
    name: str,
    language_attribute: str,
) -> Corpus:
    """
    Create a corpus from a list of documents produced by a module such as
    Guardian/NYT.

    Parameters
    ----------
    documents
        List with values downloaded from the API.
    attributes
        List of (variable constructor, extractor) pairs: the constructor
        builds the attribute variable, the extractor pulls its value out of
        a document.
    class_vars
        List of (variable constructor, extractor) pairs for class variables.
    metas
        List of (variable constructor, extractor) pairs for meta variables.
    title_indices
        Domain indices of the title attribute(s).
    text_features
        Names of the text features.
    name
        The name of the Corpus.
    language_attribute
        Name of the variable whose values the corpus language is inferred
        from.

    Returns
    -------
    Corpus with documents.
    """
    domain = Domain(
        attributes=[attr() for attr, _ in attributes],
        class_vars=[attr() for attr, _ in class_vars],
        metas=[attr() for attr, _ in metas],
    )
    for ind in title_indices:
        domain[ind].attributes["title"] = True

    def to_val(attr, val):
        # Discrete values must be registered on the variable before they
        # can be converted to their numeric index.
        if isinstance(attr, DiscreteVariable):
            attr.val_from_str_add(val)
        return attr.to_val(val)

    # BUG FIX: X was previously built with zip(domain.class_vars, attributes),
    # pairing attribute extractors with the wrong variables; it must zip
    # domain.attributes with the attribute recipes.
    X = [
        [to_val(a, f(doc)) for a, (_, f) in zip(domain.attributes, attributes)]
        for doc in documents
    ]
    Y = [
        [to_val(a, f(doc)) for a, (_, f) in zip(domain.class_vars, class_vars)]
        for doc in documents
    ]
    # Renamed from `metas` to avoid shadowing the parameter of the same name.
    meta_values = [
        [to_val(a, f(doc)) for a, (_, f) in zip(domain.metas, metas)]
        for doc in documents
    ]

    language = infer_language_from_variable(domain[language_attribute])
    corpus = Corpus.from_numpy(
        domain=domain,
        X=np.array(X, dtype=np.float64),
        Y=np.array(Y, dtype=np.float64),
        metas=np.array(meta_values, dtype=object),
        text_features=[domain[f] for f in text_features],
        language=language,
    )
    corpus.name = name
    return corpus
7 changes: 4 additions & 3 deletions orangecontrib/text/widgets/owguardian.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,10 @@ class Outputs:
recent_queries = Setting([])
date_from = Setting((datetime.now().date() - timedelta(365)))
date_to = Setting(datetime.now().date())
attributes = [feat.name for feat, _ in TheGuardianAPI.metas if
isinstance(feat, StringVariable)]
text_includes = Setting([feat.name for feat in TheGuardianAPI.text_features])
attributes = [
part.args[0] for part, _ in TheGuardianAPI.metas if part.func is StringVariable
]
text_includes = Setting([feat for feat in TheGuardianAPI.text_features])

class Warning(OWWidget.Warning):
no_text_fields = Msg('Text features are inferred when none are selected.')
Expand Down

0 comments on commit 20dc93f

Please sign in to comment.