This repository has been archived by the owner on Feb 8, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
google_news.py
97 lines (75 loc) · 2.91 KB
/
google_news.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from urllib.parse import urlparse
import sys
import time
from lxml import etree
import requests
from storage import Session, Article, GoogleStory, Source
from util import printWithPid
GOOGLE_NEWS_URL = "https://news.google.com/news/rss/?ned=us&hl=en"
GOOGLE_STORY_URL = "https://news.google.com/news/rss/story/{cluster}?ned=us&hl=en"
CLUSTER_PREFIX = "cluster="
REQUEST_WAIT_TIME = 2 # seconds
SCRAPE_WAIT_TIME = 300 # seconds
def _getOrCreateArticle(session, xmlArticle, story):
    """Fetch or create the Article row for one RSS ``<item>`` element.

    Ensures a Source row exists for the article's hostname (creating a
    placeholder with an empty name if not), then fetches or creates the
    Article keyed by its URL and links it to *story*.

    Args:
        session: SQLAlchemy session used for all lookups/inserts.
        xmlArticle: lxml element for one RSS <item>; must contain
            <link> and <title> children.
        story: GoogleStory row the article belongs to.

    Returns:
        The (possibly newly created, not yet committed) Article row.
    """
    url = xmlArticle.find("link").text
    hostname = urlparse(url).hostname.lower()
    # Normalize away a leading "www." so the same outlet isn't stored
    # as two different sources. (Was `hostname.find("www.") == 0` with a
    # magic slice of 4 — startswith/len is the idiomatic, self-documenting form.)
    if hostname.startswith("www."):
        hostname = hostname[len("www."):]
    source = session.query(Source).get(hostname)
    if source is None:
        source = Source(hostname=hostname, name="")
        session.add(source)
    article = session.query(Article).get(url)
    if article is None:
        title = xmlArticle.find("title").text
        article = Article(url=url, title=title, story_id=story.id,
                          source_hostname=source.hostname)
        session.add(article)
    else:
        # Article already known: re-point it at the current story cluster.
        article.story_id = story.id
    return article
def _scrape():
    """Run one scrape pass over the Google News front-page RSS feed.

    For every <item> in the feed: extract its story cluster id from the
    <guid>, upsert the GoogleStory and Article rows, then fetch that
    story's own RSS feed and upsert every related article. Commits after
    each story so partial progress survives a mid-pass failure.

    Raises:
        requests.RequestException: on network failure/timeout (handled by
            the caller's retry loop in scrapeProcess).
    """
    printWithPid("Scraping Google News at " + GOOGLE_NEWS_URL)
    session = Session()
    # timeout added: requests.get without one can block forever and
    # silently wedge the scraper process.
    googleNewsRequest = requests.get(GOOGLE_NEWS_URL, timeout=30)
    googleNewsXml = etree.XML(googleNewsRequest.content)
    googleNewsArticles = googleNewsXml.findall(".//item")
    for xmlArticle in googleNewsArticles:
        articleGuid = xmlArticle.find("guid").text
        # The guid embeds the cluster id after "cluster=". Previously a
        # missing prefix (find() == -1) silently sliced a garbage id;
        # skip such items instead.
        clusterIndex = articleGuid.find(CLUSTER_PREFIX)
        if clusterIndex == -1:
            printWithPid("No cluster id in guid: " + articleGuid)
            continue
        cluster = articleGuid[clusterIndex + len(CLUSTER_PREFIX):]
        story = session.query(GoogleStory).get(cluster)
        if story is None:
            story = GoogleStory(id=cluster)
            session.add(story)
        printWithPid(story)
        article = _getOrCreateArticle(session, xmlArticle, story)
        printWithPid(article)
        # Rate-limit the per-story requests to be polite to Google.
        time.sleep(REQUEST_WAIT_TIME)
        relatedArticlesRequest = requests.get(
            GOOGLE_STORY_URL.format(cluster=article.story_id), timeout=30)
        try:
            relatedArticlesXml = etree.XML(relatedArticlesRequest.content)
        except etree.XMLSyntaxError as error:
            printWithPid("XML syntax error:", error)
            printWithPid(relatedArticlesRequest.content)
            continue
        except Exception:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; narrowed to Exception.
            printWithPid("Unexpected error:", sys.exc_info()[0])
            continue
        relatedArticles = relatedArticlesXml.findall(".//item")
        for relatedXmlArticle in relatedArticles:
            relatedArticle = _getOrCreateArticle(session, relatedXmlArticle, story)
            printWithPid(relatedArticle)
        # Commit per story so one bad story later doesn't lose this one.
        session.commit()
    session.commit()
def scrapeProcess():
    """Run the scraper forever, sleeping SCRAPE_WAIT_TIME between passes.

    Each pass is isolated: any exception from _scrape() is logged and the
    loop simply waits for the next cycle rather than killing the process.
    """
    printWithPid("-- Starting Google News scraper --")
    while True:
        try:
            _scrape()
        except Exception as err:
            # Log the failed pass; the next iteration retries from scratch.
            printWithPid(err)
        time.sleep(SCRAPE_WAIT_TIME)
# Entry point: run the scrape loop only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    scrapeProcess()