Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] NYTimes - add language to corpus #926

Merged
merged 1 commit into from
Apr 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 79 additions & 34 deletions orangecontrib/text/nyt.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,24 @@
import shelve
import warnings
from datetime import date
from time import sleep
from urllib import request, parse
from functools import partial
from http.client import HTTPException
from time import sleep
from urllib import parse, request
from urllib.error import HTTPError, URLError

import numpy as np
from dateutil.parser import isoparse
from Orange.data import (
ContinuousVariable,
DiscreteVariable,
StringVariable,
TimeVariable,
)
from Orange.misc import environ

from Orange import data
from orangecontrib.text.corpus import Corpus

try:
from Orange.misc import environ
except ImportError:
from Orange.canvas.utils import environ
from orangecontrib.text.util import create_corpus

SLEEP = 1
TIMEOUT = 10
Expand All @@ -26,39 +31,70 @@
BASE_URL = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'


class NYT:
""" Class for fetching records from the NYT API. """
def keywords(doc, name):
    """
    Join the values of a record's keywords of one kind into a single string.

    Parameters
    ----------
    doc : dict
        Article record; its optional "keywords" entry is a list of dicts
        carrying "name" and "value" keys.
    name : str
        Keyword kind to select (e.g. "subject" or "persons").

    Returns
    -------
    str
        Matching values joined with ", "; an empty string when nothing matches.
    """
    # `or []` also covers a record with an explicit "keywords": None entry.
    kws = doc.get("keywords") or []
    # Entries without a "name" or without a "value" are skipped instead of
    # raising KeyError (lookup) or TypeError (joining None).
    return ", ".join(
        kw["value"]
        for kw in kws
        if kw.get("name") == name and kw.get("value") is not None
    )

@staticmethod
def keywords(doc, name):
return ', '.join([kw.get('value')
for kw in doc.get('keywords', [])
if kw['name'] == name])

attributes = []
def parse_date(doc):
    """
    Convert a record's "pub_date" entry into a POSIX timestamp.

    Returns ``np.nan`` when the record has no publication date (the key is
    missing or its value is None); otherwise the ISO-8601 string is parsed
    with ``isoparse`` and its ``timestamp()`` is returned.
    """
    published = doc.get("pub_date")
    if published is None:
        return np.nan
    return isoparse(published).timestamp()


class NYT:
""" Class for fetching records from the NYT API. """

class_vars = [
(data.DiscreteVariable('Section'), lambda doc: doc.get('section_name', None)),
(
partial(DiscreteVariable, "Section"),
lambda doc: doc.get("section_name", None),
),
]

tv = data.TimeVariable('Publication Date')
metas = [
(data.StringVariable('Headline'), lambda doc: doc.get('headline', {}).get('main') or ''),
(data.StringVariable('Abstract'), lambda doc: doc.get('abstract') or ''),
(data.StringVariable('Snippet'), lambda doc: doc.get('snippet') or ''),
(data.StringVariable('Lead Paragraph'), lambda doc: doc.get('lead_paragraph') or ''),
(data.StringVariable('Subject Keywords'), lambda doc: NYT.keywords(doc, 'subject')),
(data.StringVariable('URL'), lambda doc: doc.get('web_url') or ''),
(data.StringVariable('Locations'), lambda doc: NYT.keywords(doc, 'glocations')),
(data.StringVariable('Persons'), lambda doc: NYT.keywords(doc, 'persons')),
(data.StringVariable('Organizations'), lambda doc: NYT.keywords(doc, 'organizations')),
(data.StringVariable('Creative Works'), lambda doc: NYT.keywords(doc, 'creative_works')),
(tv, lambda doc: NYT.tv.parse(doc.get('pub_date'))),
(data.DiscreteVariable('Article Type'), lambda doc: doc.get('type_of_material', None)),
(data.ContinuousVariable('Word Count', number_of_decimals=0), lambda doc: doc.get('word_count', None)),
(
partial(StringVariable, "Headline"),
lambda doc: doc.get("headline", {}).get("main") or "",
),
(partial(StringVariable, "Abstract"), lambda doc: doc.get("abstract") or ""),
(partial(StringVariable, "Snippet"), lambda doc: doc.get("snippet") or ""),
(
partial(StringVariable, "Lead Paragraph"),
lambda doc: doc.get("lead_paragraph") or "",
),
(
partial(StringVariable, "Subject Keywords"),
partial(keywords, name="subject"),
),
(partial(StringVariable, "URL"), lambda doc: doc.get("web_url") or ""),
(
partial(StringVariable, "Locations"),
partial(keywords, name="glocations"),
),
(partial(StringVariable, "Persons"), partial(keywords, name="persons")),
(
partial(StringVariable, "Organizations"),
partial(keywords, name="organizations"),
),
(
partial(StringVariable, "Creative Works"),
partial(keywords, name="creative_works"),
),
(
partial(TimeVariable, "Publication Date", have_time=1, have_date=1),
parse_date,
),
(
partial(DiscreteVariable, "Article Type"),
lambda doc: doc.get("type_of_material", None),
),
(
partial(ContinuousVariable, "Word Count", number_of_decimals=0),
lambda doc: doc.get("word_count", None),
),
]

text_features = [metas[0][0], metas[1][0]] # headline + abstract
text_features = ["Headline", "Abstract"]

def __init__(self, api_key):
"""
Expand Down Expand Up @@ -129,8 +165,17 @@ def search(self, query, date_from=None, date_to=None, max_docs=None,
if len(records) > max_docs:
records = records[:max_docs]

return Corpus.from_documents(records, 'NY Times', self.attributes,
self.class_vars, self.metas, title_indices=[-1])
corpus = create_corpus(
documents=records,
attributes=[],
class_vars=self.class_vars,
metas=self.metas,
title_indices=[-1],
text_features=self.text_features,
name="NY Times",
)
corpus.attributes["language"] = "en" # NYT publishes only in English
return corpus

def _cache_init(self):
""" Initialize cache in Orange environment buffer dir. """
Expand Down
23 changes: 14 additions & 9 deletions orangecontrib/text/tests/test_nyt.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from http.client import HTTPException
from urllib.error import HTTPError, URLError

from Orange.data import TimeVariable
from orangecontrib.text import Corpus
from orangecontrib.text.nyt import NYT, BATCH_SIZE

Expand Down Expand Up @@ -87,6 +86,7 @@ def test_nyt_key(self):

def test_nyt_query_keywords(self):
c = self.nyt.search('slovenia', max_docs=10)
self.assertEqual(c.language, "en")
self.assertIsInstance(c, Corpus)
self.assertEqual(len(c), 10)

Expand All @@ -96,8 +96,8 @@ def test_nyt_query_date_range(self):
corpus = self.nyt.search('slovenia', from_date, to_date, max_docs=10)
self.assertEqual(len(corpus), 10)

time_index = next(i for i, (var, _) in enumerate(NYT.metas) if isinstance(var, TimeVariable))
tv = corpus.domain.metas[time_index]
tv = corpus.domain["Publication Date"]
time_index = corpus.domain.metas.index(tv)
for doc in corpus:
date = tv.repr_val(doc.metas[time_index])
date = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S').date()
Expand All @@ -108,15 +108,20 @@ def test_nyt_query_date_range(self):
def test_nyt_query_max_records(self):
    """A search capped at 25 documents returns 25 records tagged as English."""
    limit = 25
    corpus = self.nyt.search('slovenia', max_docs=limit)
    self.assertEqual(len(corpus), limit)
    self.assertEqual(corpus.language, "en")

def test_nyt_corpus_domain_generation(self):
corpus = self.nyt.search('slovenia', max_docs=10)
for var, _ in NYT.attributes:
self.assertIn(var, corpus.domain.attributes)
for var, _ in NYT.class_vars:
self.assertIn(var, corpus.domain.class_vars)
for var, _ in NYT.metas:
self.assertIn(var, corpus.domain.metas)
self.assertTupleEqual((), corpus.domain.attributes)
self.assertListEqual(
[var.args[0] for var, _ in NYT.class_vars],
[var.name for var in corpus.domain.class_vars]
)
self.assertListEqual(
[var.args[0] for var, _ in NYT.metas],
[var.name for var in corpus.domain.metas]
)
self.assertEqual(corpus.language, "en")

def test_nyt_result_caching(self):
self.nyt._fetch_page('slovenia', None, None, 0) # assure in cache
Expand Down
10 changes: 6 additions & 4 deletions orangecontrib/text/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from functools import wraps
from math import ceil
from typing import Union, List, Callable, Any, Tuple
from typing import Union, List, Callable, Any, Tuple, Optional

import numpy as np
import scipy.sparse as sp
Expand Down Expand Up @@ -102,10 +102,10 @@ def create_corpus(
title_indices: List[int],
text_features: List[str],
name: str,
language_attribute: str,
language_attribute: Optional[str] = None,
):
"""
Create a corpus from list of features/documents produced by modelu such as
Create a corpus from a list of features/documents produced by modules such as
Guardian/NYT

Parameters
Expand Down Expand Up @@ -160,7 +160,9 @@ def to_val(attr, val):
Y = np.array(Y, dtype=np.float64)
metas = np.array(metas, dtype=object)

language = infer_language_from_variable(domain[language_attribute])
language = None
if language_attribute is not None:
language = infer_language_from_variable(domain[language_attribute])
corpus = Corpus.from_numpy(
domain=domain,
X=X,
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/widgets/ownyt.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class Outputs:
date_to = Setting(datetime.now().date())

# NYT.metas pairs are (functools.partial(VariableClass, name, ...), getter), so
# the variable class is the partial's `.func` and the column name is its first
# bound positional argument; `isinstance`/`.name` do not work on a partial.
attributes = [
    feat.args[0] for feat, _ in NYT.metas if feat.func is StringVariable
]
text_includes = Setting([feat.name for feat in NYT.text_features])
# NYT.text_features is already a list of feature names; the default must be a
# flat list of names (not a list wrapped in a list). Copy it so the setting
# never aliases the class-level list.
text_includes = Setting(list(NYT.text_features))

class Warning(OWWidget.Warning):
no_text_fields = Msg('Text features are inferred when none are selected.')
Expand Down