This repository has been archived by the owner on Oct 8, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
light_scrapper.py
215 lines (189 loc) · 7.62 KB
/
light_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# coding=utf-8
import urllib2
import sys
import os
import re
from collections import OrderedDict
from readability.readability import Document
from bs4 import BeautifulSoup
from ebooklib import epub
reload(sys)
sys.setdefaultencoding('utf-8') # Needed fore websites that use Unicode
class TableOfContentsError(Exception):
"""
Useful to raise if TOC not found
"""
pass
class LightScrap(object):
"""
Scrapper object which can walk through chapters and grab relevant content
"""
def __init__(self, title, start_chapter_number, end_chapter_number, url, header=None):
"""
Instantiates the scrapper with the relevant information like the start URL (url) and how far to walk for
all the chapters (end_chapter_number)
:param title: str
:param start_chapter_number: int
:param end_chapter_number: int
:param url: str
:param header: dict
:return:
"""
self.title = title
self.start_chapter_number = int(start_chapter_number)
self.end_chapter_number = int(end_chapter_number)
self.start_url = self.url = url
self.main_content_div = 'entry-content'
self.toc = OrderedDict()
if header is None:
self.header = {'User-agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) '
'Gecko/20091102 Firefox/3.5.5'}
else:
self.header = header
if not os.path.exists(self.title):
os.makedirs(title)
def visit_url(self, url):
"""
http://stackoverflow.com/questions/7933417/how-do-i-set-headers-using-pythons-urllib
:param url: URL to visit
:return: str
"""
request = urllib2.Request(url=url, headers=self.header)
return urllib2.urlopen(request).read()
def strip_chapter(self, html):
"""
Strips chapter and gets relevant HTML using Readability
:param html: str
:return:
"""
doc = Document(html)
if len(doc.summary()) <= 20:
print 'This page has errors, returning entry-content div raw HTML.'
content = str(BeautifulSoup(html, 'html.parser').find_all('div', class_=self.main_content_div)[0])
content = '<html><head><meta charset="utf-8"></head>' + content + '</html>'
return doc.short_title(), content
return (doc.short_title(),
str(doc.summary()).replace('<html>', '<html><head><meta charset="utf-8"></head>'))
def find_from_toc(self, chapter_number, url):
"""
Grabs link from table of contents provided the chapter number and TOC URL
:param chapter_number: int
:param url: str
:return:
"""
chapter_number = str(chapter_number)
soup = BeautifulSoup(self.visit_url(url), 'html.parser')
chapter = 'chapter ' + chapter_number
for link in soup.find_all('a'):
if chapter in link.text.lower():
return link.get('href')
def make_html_toc(self):
"""
Generates a HTML table of contents (to use with Calibre)
:return: str
"""
toc = """<html><html><body><h1>Table of Contents</h1>
<p style="text-indent:0pt">"""
chapter_html = '<a href="{0}.html">Chapter {0}</a><br/>'
for chapter_number in self.toc.keys():
toc += chapter_html.format(chapter_number)
toc += '</p></body></html>'
with open(os.path.join(self.title, self.title + '-toc.html'), 'w+') as f:
f.write(toc)
return toc
def find_toc(self):
"""
Locate, from the start URL, and return URL of table of contents
:return:
"""
soup = BeautifulSoup(self.visit_url(self.start_url), 'html.parser')
for link in soup.find_all('a'):
if 'table of contents' in link.text.lower():
return link.get('href')
raise TableOfContentsError('Table of contents not found, please specify it.')
def chapters_walk(self):
"""
Recursive method to walk from of URL to end
:return:
"""
if self.start_chapter_number > self.end_chapter_number:
return
self.toc[self.start_chapter_number] = self.url
# Check if walk is hitting the same url
if self.url != self.start_url:
if self.url == self.toc[self.start_chapter_number]:
# Cycle detected
self.url = self.find_from_toc(self.start_chapter_number, self.find_toc())
print 'Fetching chapter ' + str(self.start_chapter_number) + ' ' + self.url
html = self.visit_url(self.url)
chapter = self.strip_chapter(html)
with open(os.path.join(self.title, str(self.start_chapter_number) + '.html'), 'w+') as f:
f.write(chapter[1])
# Start walking
soup = BeautifulSoup(html, 'html.parser')
toc = ''
# Find next chapter
for link in soup.find_all('a'):
if 'next chapter' in link.text.lower():
self.start_chapter_number += 1
self.url = link.get('href')
return self.chapters_walk()
if 'table of contents' in link.text.lower():
toc = link.get('href')
self.start_chapter_number += 1
return self.chapters_walk()
def generate_epub(self):
"""
Generates a ePub with contents from the chapters_walk()
:return:
"""
book = epub.EpubBook()
book.set_title(self.title)
chapters = []
if len(self.toc) < 1:
self.chapters_walk()
for chapter in self.toc.keys():
chapter = str(chapter)
with open(os.path.join(self.title, chapter + '.html')) as f:
content = f.read()
chapter = epub.EpubHtml(title='Chapter ' + chapter,
file_name=chapter + '.xhtml',
content=content)
book.add_item(chapter)
chapters.append(chapter)
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
book.spine = ['nav']
for chapter in chapters:
book.spine.append(chapter)
epub.write_epub(os.path.join(self.title, self.title + '.epub'), book, {})
def toc_walk(self, toc_url):
"""
Grabs links from table of contents
:param toc_url: str
:return:
"""
self.toc = OrderedDict()
watered_soup = BeautifulSoup(self.visit_url(toc_url), 'html.parser')
for i in range(self.start_chapter_number, self.end_chapter_number + 1):
self.toc[i] = None
chapter_regex = re.compile(r'[0-9]*(c|C)hapter(\s|\S):(?P<chap_no>[0-9]*)')
for link in watered_soup.find_all('a'):
if 'chapter' in link.text.lower():
found = chapter_regex.search(str(link.text))
if found is not None:
found = found.group('chap_no')
if found and int(found) in self.toc.keys():
self.toc[int(found)] = link.get('href')
content = self.strip_chapter(self.visit_url(link.get('href')))
with open(os.path.join(self.title, found + '.html'), 'w+') as f:
f.write(content[1])
if __name__ == '__main__':
ls = LightScrap(title='Arcane Emperor',
start_chapter_number=1,
end_chapter_number=68,
url='https://royalroadl.com/fiction/chapter/94620')
ls.toc_walk('https://royalroadl.com/fiction/8463')
#ls.chapters_walk()
# ls.make_html_toc()
ls.generate_epub()