diff --git a/orangecontrib/text/nyt.py b/orangecontrib/text/nyt.py
index 87cb1756b..fc08ed247 100644
--- a/orangecontrib/text/nyt.py
+++ b/orangecontrib/text/nyt.py
@@ -4,19 +4,24 @@
 import shelve
 import warnings
 from datetime import date
-from time import sleep
-from urllib import request, parse
+from functools import partial
 from http.client import HTTPException
+from time import sleep
+from urllib import parse, request
 from urllib.error import HTTPError, URLError
 
+import numpy as np
+from dateutil.parser import isoparse
+from Orange.data import (
+    ContinuousVariable,
+    DiscreteVariable,
+    StringVariable,
+    TimeVariable,
+)
+from Orange.misc import environ
 
-from Orange import data
 from orangecontrib.text.corpus import Corpus
-
-try:
-    from Orange.misc import environ
-except ImportError:
-    from Orange.canvas.utils import environ
+from orangecontrib.text.util import create_corpus
 
 SLEEP = 1
 TIMEOUT = 10
@@ -26,39 +31,74 @@
 BASE_URL = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'
 
 
-class NYT:
-    """ Class for fetching records from the NYT API. """
+def keywords(doc, name):
+    """Return the comma-separated values of `doc`'s keywords of kind `name`."""
+    kws = doc.get("keywords") or []
+    return ", ".join(
+        kw["value"] for kw in kws if kw.get("name") == name and kw.get("value")
+    )
 
-    @staticmethod
-    def keywords(doc, name):
-        return ', '.join([kw.get('value')
-                          for kw in doc.get('keywords', [])
-                          if kw['name'] == name])
 
-    attributes = []
+def parse_date(doc):
+    """Return `doc`'s publication date as a POSIX timestamp, or NaN if absent."""
+    pub_date = doc.get("pub_date")
+    return isoparse(pub_date).timestamp() if pub_date else np.nan
+
+
+class NYT:
+    """ Class for fetching records from the NYT API. """
 
     class_vars = [
-        (data.DiscreteVariable('Section'), lambda doc: doc.get('section_name', None)),
+        (
+            partial(DiscreteVariable, "Section"),
+            lambda doc: doc.get("section_name", None),
+        ),
     ]
 
-    tv = data.TimeVariable('Publication Date')
     metas = [
-        (data.StringVariable('Headline'), lambda doc: doc.get('headline', {}).get('main') or ''),
-        (data.StringVariable('Abstract'), lambda doc: doc.get('abstract') or ''),
-        (data.StringVariable('Snippet'), lambda doc: doc.get('snippet') or ''),
-        (data.StringVariable('Lead Paragraph'), lambda doc: doc.get('lead_paragraph') or ''),
-        (data.StringVariable('Subject Keywords'), lambda doc: NYT.keywords(doc, 'subject')),
-        (data.StringVariable('URL'), lambda doc: doc.get('web_url') or ''),
-        (data.StringVariable('Locations'), lambda doc: NYT.keywords(doc, 'glocations')),
-        (data.StringVariable('Persons'), lambda doc: NYT.keywords(doc, 'persons')),
-        (data.StringVariable('Organizations'), lambda doc: NYT.keywords(doc, 'organizations')),
-        (data.StringVariable('Creative Works'), lambda doc: NYT.keywords(doc, 'creative_works')),
-        (tv, lambda doc: NYT.tv.parse(doc.get('pub_date'))),
-        (data.DiscreteVariable('Article Type'), lambda doc: doc.get('type_of_material', None)),
-        (data.ContinuousVariable('Word Count', number_of_decimals=0), lambda doc: doc.get('word_count', None)),
+        (
+            partial(StringVariable, "Headline"),
+            lambda doc: doc.get("headline", {}).get("main") or "",
+        ),
+        (partial(StringVariable, "Abstract"), lambda doc: doc.get("abstract") or ""),
+        (partial(StringVariable, "Snippet"), lambda doc: doc.get("snippet") or ""),
+        (
+            partial(StringVariable, "Lead Paragraph"),
+            lambda doc: doc.get("lead_paragraph") or "",
+        ),
+        (
+            partial(StringVariable, "Subject Keywords"),
+            partial(keywords, name="subject"),
+        ),
+        (partial(StringVariable, "URL"), lambda doc: doc.get("web_url") or ""),
+        (
+            partial(StringVariable, "Locations"),
+            partial(keywords, name="glocations"),
+        ),
+        (partial(StringVariable, "Persons"), partial(keywords, name="persons")),
+        (
+            partial(StringVariable, "Organizations"),
+            partial(keywords, name="organizations"),
+        ),
+        (
+            partial(StringVariable, "Creative Works"),
+            partial(keywords, name="creative_works"),
+        ),
+        (
+            partial(TimeVariable, "Publication Date", have_time=1, have_date=1),
+            parse_date,
+        ),
+        (
+            partial(DiscreteVariable, "Article Type"),
+            lambda doc: doc.get("type_of_material", None),
+        ),
+        (
+            partial(ContinuousVariable, "Word Count", number_of_decimals=0),
+            lambda doc: doc.get("word_count", None),
+        ),
     ]
 
-    text_features = [metas[0][0], metas[1][0]]  # headline + abstract
+    text_features = ["Headline", "Abstract"]
 
     def __init__(self, api_key):
         """
@@ -129,8 +169,17 @@ def search(self, query, date_from=None, date_to=None, max_docs=None,
         if len(records) > max_docs:
             records = records[:max_docs]
 
-        return Corpus.from_documents(records, 'NY Times', self.attributes,
-                                     self.class_vars, self.metas, title_indices=[-1])
+        corpus = create_corpus(
+            documents=records,
+            attributes=[],
+            class_vars=self.class_vars,
+            metas=self.metas,
+            title_indices=[-1],
+            text_features=self.text_features,
+            name="NY Times",
+        )
+        corpus.attributes["language"] = "en"  # NYT publishes only in English
+        return corpus
 
     def _cache_init(self):
         """ Initialize cache in Orange environment buffer dir. """
diff --git a/orangecontrib/text/tests/test_nyt.py b/orangecontrib/text/tests/test_nyt.py
index 4a92013fc..56dd0989d 100644
--- a/orangecontrib/text/tests/test_nyt.py
+++ b/orangecontrib/text/tests/test_nyt.py
@@ -7,7 +7,6 @@
 from http.client import HTTPException
 from urllib.error import HTTPError, URLError
 
-from Orange.data import TimeVariable
 from orangecontrib.text import Corpus
 from orangecontrib.text.nyt import NYT, BATCH_SIZE
 
@@ -87,6 +86,7 @@ def test_nyt_key(self):
 
     def test_nyt_query_keywords(self):
         c = self.nyt.search('slovenia', max_docs=10)
+        self.assertEqual(c.language, "en")
         self.assertIsInstance(c, Corpus)
         self.assertEqual(len(c), 10)
 
@@ -96,8 +96,8 @@ def test_nyt_query_date_range(self):
         corpus = self.nyt.search('slovenia', from_date, to_date, max_docs=10)
         self.assertEqual(len(corpus), 10)
 
-        time_index = next(i for i, (var, _) in enumerate(NYT.metas) if isinstance(var, TimeVariable))
-        tv = corpus.domain.metas[time_index]
+        tv = corpus.domain["Publication Date"]
+        time_index = corpus.domain.metas.index(tv)
         for doc in corpus:
             date = tv.repr_val(doc.metas[time_index])
             date = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S').date()
@@ -108,15 +108,20 @@ def test_nyt_query_date_range(self):
     def test_nyt_query_max_records(self):
         c = self.nyt.search('slovenia', max_docs=25)
         self.assertEqual(len(c), 25)
+        self.assertEqual(c.language, "en")
 
     def test_nyt_corpus_domain_generation(self):
         corpus = self.nyt.search('slovenia', max_docs=10)
-        for var, _ in NYT.attributes:
-            self.assertIn(var, corpus.domain.attributes)
-        for var, _ in NYT.class_vars:
-            self.assertIn(var, corpus.domain.class_vars)
-        for var, _ in NYT.metas:
-            self.assertIn(var, corpus.domain.metas)
+        self.assertTupleEqual((), corpus.domain.attributes)
+        self.assertListEqual(
+            [var.args[0] for var, _ in NYT.class_vars],
+            [var.name for var in corpus.domain.class_vars]
+        )
+        self.assertListEqual(
+            [var.args[0] for var, _ in NYT.metas],
+            [var.name for var in corpus.domain.metas]
+        )
+        self.assertEqual(corpus.language, "en")
 
     def test_nyt_result_caching(self):
         self.nyt._fetch_page('slovenia', None, None, 0)  # assure in cache
diff --git a/orangecontrib/text/util.py b/orangecontrib/text/util.py
index 92c1ee9f9..1fd640ceb 100644
--- a/orangecontrib/text/util.py
+++ b/orangecontrib/text/util.py
@@ -1,6 +1,6 @@
 from functools import wraps
 from math import ceil
-from typing import Union, List, Callable, Any, Tuple
+from typing import Union, List, Callable, Any, Tuple, Optional
 
 import numpy as np
 import scipy.sparse as sp
@@ -102,10 +102,10 @@ def create_corpus(
     title_indices: List[int],
     text_features: List[str],
     name: str,
-    language_attribute: str,
+    language_attribute: Optional[str] = None,
 ):
     """
-    Create a corpus from list of features/documents produced by modelu such as
+    Create a corpus from list of features/documents produced by modules such as
     Guardian/NYT
 
     Parameters
@@ -160,7 +160,9 @@ def to_val(attr, val):
     Y = np.array(Y, dtype=np.float64)
     metas = np.array(metas, dtype=object)
 
-    language = infer_language_from_variable(domain[language_attribute])
+    language = None
+    if language_attribute is not None:
+        language = infer_language_from_variable(domain[language_attribute])
     corpus = Corpus.from_numpy(
         domain=domain,
         X=X,
diff --git a/orangecontrib/text/widgets/ownyt.py b/orangecontrib/text/widgets/ownyt.py
index ed470d84b..0524e4210 100644
--- a/orangecontrib/text/widgets/ownyt.py
+++ b/orangecontrib/text/widgets/ownyt.py
@@ -79,7 +79,9 @@ class Outputs:
     date_to = Setting(datetime.now().date())
 
-    attributes = [feat.name for feat, _ in NYT.metas if isinstance(feat, StringVariable)]
-    text_includes = Setting([feat.name for feat in NYT.text_features])
+    # NYT.metas holds (partial(VariableClass, name, ...), getter) pairs, so the
+    # name is partial.args[0] and the variable class is partial.func.
+    attributes = [feat.args[0] for feat, _ in NYT.metas if feat.func is StringVariable]
+    text_includes = Setting(list(NYT.text_features))
 
     class Warning(OWWidget.Warning):
         no_text_fields = Msg('Text features are inferred when none are selected.')