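"""zhihu.py -- download every image posted under a Zhihu question.

Walks the question's answers through Zhihu's v4 API page by page, fetches
each answer's HTML, and saves all embedded images to <root>/<question_id>/.
"""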
import io
import json
import logging
import os
import sys
import time

import requests
from bs4 import BeautifulSoup

# Re-wrap stdout so Chinese text (e.g. author names) prints cleanly on
# GB18030 consoles such as the Windows command prompt.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

_LOGGER = logging.getLogger(__file__)

# Example endpoints:
#   https://www.zhihu.com/api/v4/questions/60908883/answers?sort_by=default&limit=20&offset=23
#   https://www.zhihu.com/question/60908883/answer/183125520
QUESTION_API = 'https://www.zhihu.com/api/v4/questions/{0}'
ANSWERS_OFFSET = '/answers?sort_by=default&limit=20&offset={0}'
ANSWER_URL = 'https://www.zhihu.com/question/{0}/answer/{1}'
ANSWER_LIMIT = 20


class ZhiHuSpider():

    def __init__(self, question_id, root):
        self.question_id = question_id
        self.question_url = QUESTION_API.format(self.question_id)
        self.root = root
        self.directory = os.path.join(self.root, self.question_id)
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)
        self._session = requests.session()
        self._headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Host': 'www.zhihu.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
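
    # Every request carries the browser-like headers above; a logged-in
    # Cookie string (copied from a browser session) is attached via
    # set_cookie() below.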
    def set_cookie(self, cookie):
        self._headers['Cookie'] = cookie

    def download_imgs(self, answer_count=100):
        self._print('Start to download images under question %s...' % self.question_id)
        # Page through the answers API in ANSWER_LIMIT-sized chunks until
        # the server reports there is nothing left.
        offset = 0
        url = self.question_url + ANSWERS_OFFSET.format(offset)
        has_more, total_count, answer_list = self._get_answers(url)
        self._print('Start to download answers from %s to %s (total: %s)...' % (offset, offset + ANSWER_LIMIT, total_count))
        self._download_imgs_by_answer_group(answer_list)
        while has_more:
            offset = offset + ANSWER_LIMIT
            self._print('Start to download answers from %s to %s (total: %s)...' % (offset, offset + ANSWER_LIMIT, total_count))
            url = self.question_url + ANSWERS_OFFSET.format(offset)
            has_more, total_count, answer_list = self._get_answers(url)
            self._download_imgs_by_answer_group(answer_list)
            # Pause between pages to stay under Zhihu's rate limiting.
            time.sleep(3)

    def _download_imgs_by_answer_group(self, answer_list):
        for answer_id, author_name, author_url in answer_list:
            self._print('Answer %s' % answer_id)
            self._download_imgs_by_answer(answer_id, author_name, author_url)
            self._print('-' * 24)

    def _download_imgs_by_answer(self, answer_id, author_name, author_url):
        answer_url = ANSWER_URL.format(self.question_id, answer_id)
        rep = self._session.get(answer_url, headers=self._headers, verify=True)
        soup = BeautifulSoup(rep.text, 'html.parser')
        # Zhihu wraps the full-resolution <img> tags in <noscript> blocks.
        tag_list = soup.find_all('noscript')
        index = 1
        if len(tag_list) == 0:
            self._print('No image in answer %s' % answer_id)
            return
        for tag in tag_list:
            img = tag.find('img')
            if img is None:
                continue
            # `in` on a bs4 Tag searches children rather than attributes,
            # so use has_attr() to prefer the full-size 'data-original' URL.
            if img.has_attr('data-original'):
                img_url = img['data-original']
            else:
                img_url = img['src']
            fmt = img_url[-4:]  # extension with its dot, e.g. '.jpg'
            img_name = '{0}_{1}{2}'.format(answer_id, index, fmt)
            index = index + 1
            self.download_img(img_url, img_name)

    def _get_answers(self, offset_url):
        rep = self._session.get(offset_url, headers=self._headers, verify=True)
        if rep.status_code != 200:
            _LOGGER.error(rep.text)
            # Match the normal return shape so callers can still unpack.
            return False, 0, []
        answer_json = json.loads(rep.text)
        total_count = answer_json['paging']['totals']
        has_more = not answer_json['paging']['is_end']
        answer_list = []
        for item in answer_json['data']:
            author_name = item['author']['name']
            author_url = item['author']['url'].replace('/api/v4', '')
            answer_id = item['url'].replace('http://www.zhihu.com/api/v4/answers/', '')
            answer_list.append((answer_id, author_name, author_url))
        return has_more, total_count, answer_list
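
    # The v4 answers payload consumed above looks roughly like this
    # (fields trimmed; the answer id comes from the sample URLs at the top):
    #   {"paging": {"is_end": false, "totals": <count>},
    #    "data": [{"url": "http://www.zhihu.com/api/v4/answers/183125520",
    #              "author": {"name": "...", "url": ".../api/v4/people/..."}}]}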
    def download_img(self, url, name):
        try:
            rep = self._session.get(url)
            with open(os.path.join(self.directory, name), 'wb') as f:
                f.write(rep.content)
            self._print('%s saved' % name)
        except Exception:
            _LOGGER.error('Failed to download img: %s', url)

    def _record(self, msg):
        pass  # TODO: persist progress, e.g. fp = open('test.txt', 'w')

    def _print(self, msg):
        print(msg, flush=True)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Question ID is required.')
    else:
        question_id = sys.argv[1]
        spider = ZhiHuSpider(question_id, r'/samba/zhihu')
        spider.set_cookie('d_c0="AJDCS--zEwyPTvyxJKnCZ2X2o_Waf99sjYE=|1500260667"; _zap=5367fc7a-055c-432f-b5d6-8b07b7203565; q_c1=2979b7f3e756448cbbaff523dd6bc382|1506307958000|1496998492000; q_c1=2979b7f3e756448cbbaff523dd6bc382|1506321963000|1496998492000; capsion_ticket="2|1:0|10:1506322340|14:capsion_ticket|44:MmNkZjczZmE5ODA2NDZhNzgzZGU1YWY0Zjk1Zjg1NDE=|358289c363a386fc8b08c2cfb88587a798435f57ad67493f083aa8234db6d0a7"; _ga=GA1.2.89326784.1503368122; _gid=GA1.2.1126089039.1506592707; r_cap_id="MjYyODE4YzY4NGYxNDk5Y2IwNzI5OGE2MDA0M2IxMzU=|1506658206|7244c0622bc3a3c8d52d3e564d1f4931f5c997e0"; cap_id="NmU5ZWQwNzEwOGI5NDI4MWEyMTc0Yzc2Njg1N2IzNjA=|1506658206|d261b6a8b092034ca83f4aec5eeb5661bd525992"; z_c0=Mi4xRFdNc0FBQUFBQUFBa01KTDc3TVREQmNBQUFCaEFsVk53VlQxV1FCX2xvVXhKTlVOZkpoZGJjRzlhN2xHM1lUa0xR|1506658241|82b6a3486fe20de0e904389a586cab6c1e775d9f; s-q=nba; s-i=1; sid=4fg7gsp8; __utma=155987696.89326784.1503368122.1506667724.1506667724.1; __utmb=155987696.0.10.1506667724; __utmc=155987696; __utmz=155987696.1506667724.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _xsrf=6a7b0738-5a1d-438d-aaed-c7e5480c6a55')
        spider.download_imgs()
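
# Example run (question id taken from the sample URLs above):
#   python zhihu.py 60908883
# Images are saved under /samba/zhihu/60908883/ with names like 183125520_1.jpg.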