article_scraper.py
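"""Scrape articles for every Source in the database.

For each source hostname, collect same-host links from the homepage, then run
each unretrieved Article URL through the readability-scrape CLI and store the
extracted title and text via the SQLAlchemy models in storage.
"""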
import json
import multiprocessing
import subprocess
import random
import time
from urllib.parse import urljoin

from lxml import html
import requests
import sqlalchemy

from storage import Session, Article, Source
from util import printWithPid

READABILITY_PATH = "/usr/bin/readability-scrape"
REQUEST_TIMEOUT = 30  # seconds
RENDER_TIMEOUT = 120  # seconds
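
# READABILITY_PATH is expected to name a CLI that, given "--json" and a URL,
# fetches the page and prints a JSON object with at least "title" and
# "textContent" keys (that is how its output is consumed below); which tool
# actually lives at /usr/bin/readability-scrape is deployment-specific.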


def _requestWait():
    # Sleep a random 1-5 seconds between requests to spread load on sources
    time.sleep(random.uniform(1, 5))


# Scraping is mostly I/O-bound right now, so running more worker processes
# than CPU cores is fine
SCRAPE_PROCESSES = max(4, multiprocessing.cpu_count() * 2)


def _scrapeArticlesFromSource(hostname):
    printWithPid("Scraping " + hostname)
    session = Session()

    # Find new articles from the homepage of the source
    # TODO: Recurse?
    try:
        # requests will follow redirects, including upgrades to https
        sourceRequest = requests.get("http://" + hostname,
                                     timeout=REQUEST_TIMEOUT)
        sourceTree = html.fromstring(sourceRequest.content)
        for link in sourceTree.findall(".//a"):
            if "href" not in link.attrib:
                continue
            url = urljoin(sourceRequest.url, link.attrib["href"])
            article = session.query(Article).get(url)
            if article is None and hostname in url:
                printWithPid("Found " + url)
                session.add(Article(url=url, source_hostname=hostname))
    except Exception as e:
        printWithPid(e)
    session.commit()
    _requestWait()

    # Actually retrieve the articles
    for article in session.query(Article).filter(
            Article.source_hostname == hostname,
            Article.text.is_(None),
            Article.retrieved.is_(None)):
        printWithPid("Retrieving " + article.url)
        try:
            readabilityString = subprocess.check_output(
                [READABILITY_PATH, "--json", article.url],
                timeout=RENDER_TIMEOUT).decode("utf8")
            readabilityOutput = json.loads(readabilityString)
            if readabilityOutput is not None:
                article.title = readabilityOutput["title"]
                article.text = readabilityOutput["textContent"]
        except subprocess.CalledProcessError as e:
            printWithPid(e)
        except Exception as e:
            printWithPid(e)
        article.retrieved = sqlalchemy.sql.functions.current_timestamp()
        session.commit()
        _requestWait()


def scrapeProcess():
    while True:
        printWithPid("-- Starting article scraper --")
        # Each process should be responsible for exactly one source to load
        # balance scraping a little better
        with multiprocessing.Pool(SCRAPE_PROCESSES, maxtasksperchild=1) as pool:
            try:
                session = Session()
                # Pass just the hostname so it can be serialized for the child
                pool.map(_scrapeArticlesFromSource,
                         (s.hostname for s in session.query(Source.hostname)), 1)
            except sqlalchemy.exc.ResourceClosedError as e:
                printWithPid(e)
            except Exception as e:
                printWithPid(e)


if __name__ == "__main__":
    # Spawn, don't fork, so that each child gets its own database connection
    multiprocessing.set_start_method("spawn")
    scrapeProcess()
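
# A minimal sketch of what the storage module is assumed to provide; the
# column names come from how they are used above, while everything else here
# (table names, engine URL) is illustrative, not the project's actual schema.
#
#   from sqlalchemy import Column, DateTime, ForeignKey, String, Text, create_engine
#   from sqlalchemy.orm import declarative_base, sessionmaker
#
#   Base = declarative_base()
#   engine = create_engine("sqlite:///articles.db")
#   Session = sessionmaker(bind=engine)
#
#   class Source(Base):
#       __tablename__ = "sources"
#       hostname = Column(String, primary_key=True)
#
#   class Article(Base):
#       __tablename__ = "articles"
#       url = Column(String, primary_key=True)
#       source_hostname = Column(String, ForeignKey("sources.hostname"))
#       title = Column(Text)
#       text = Column(Text)
#       retrieved = Column(DateTime)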