-
Notifications
You must be signed in to change notification settings - Fork 79
/
百度文库.py
118 lines (97 loc) · 3.46 KB
/
百度文库.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#encoding:utf-8
import requests
from bs4 import BeautifulSoup
import re
import json
import os
#url='https://wenku.baidu.com/view/0571dbdf6f1aff00bed51e62.html?sxts=1539958717044'
# Module-level accumulators for discovered resource URLs.
# (Kept for backward compatibility; the parser functions below no longer
# depend on them being pre-populated.)
jsList = []
picList = []

# Ensure the image output directory exists before any download starts.
# exist_ok=True replaces the original exists()-then-makedirs pair, which
# was racy and more verbose for the same effect.
os.makedirs('./img/', exist_ok=True)
def parserJS(url):
    """Fetch a Baidu Wenku document page and dispatch to the right parser.

    Scrapes the page's inline ``<script type="text/javascript">`` blocks for
    the ``WkInfo.htmlUrls`` payload (per-page content URLs) and the
    ``WkInfo.Urls`` payload (metadata: title, docId, docType), then calls
    parserDoc / parserPPt / parserPDF according to the document type.

    Args:
        url: Full URL of a wenku.baidu.com document page.

    Raises:
        ValueError: if the expected WkInfo script blocks are not found
            (the original code hit a NameError on unbound locals here).
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    scripts = soup.find_all('script', attrs={'type': 'text/javascript'})

    add = None   # raw JS text containing the WkInfo.htmlUrls assignment
    doc = None   # raw JS text containing the WkInfo.Urls metadata
    for tag in scripts:
        if 'WkInfo.htmlUrls' in tag.text:
            add = tag.text
        if 'WkInfo.Urls' in tag.text:
            doc = tag.text
    if add is None or doc is None:
        # Fail loudly with a clear message instead of the original's
        # NameError when Baidu changes the page layout.
        raise ValueError('page layout not recognized: WkInfo script blocks missing')

    # Everything before 'WkInfo.DocInfo=' holds the fields we need.
    doc_info = doc.split('WkInfo.DocInfo=')[0]
    docType = re.findall(r'\'docType\': \'\w+\'', doc_info)[0]
    docType = docType.split(':')[1].replace('\'', '').strip(' ')
    title = re.findall(r'\'title\': \'.*\'', doc_info)[0]
    docId = re.findall(r'\'docId\': \'.*\'', doc_info)[0]
    docId = docId.split(':')[1].replace('\'', '').strip(' ')
    title = title.split(':')[1].replace('\'', '').strip(' ')
    print(title)
    print('文档的类型为%s' % docType)

    # Dispatch on document type; doc/txt share the HTML-page pipeline.
    if docType == 'doc' or docType == 'txt':
        parserDoc(add, title)
    if docType == 'ppt':
        parserPPt(docId, title)
    if docType == 'pdf':
        parserPDF(add, title)
def parserDoc(add, file_name):
    """Extract per-page JSON URLs from the htmlUrls JS blob and fetch the text.

    Args:
        add: Raw JavaScript text containing the ``WkInfo.htmlUrls`` assignment
            (as captured by parserJS).
        file_name: Document title; page text is appended to <file_name>.txt
            by parserPage.
    """
    # Isolate the htmlUrls literal and strip JS escaping
    # (\x22 encodes a double quote; backslashes escape the URL slashes).
    add = add.split(' WkInfo.htmlUrls =')[1].split(';')[0]
    add = add.replace('\\x22', '').replace('\\', '')
    add = re.findall(r'pageLoadUrl:.*\w', add)[0].split(',')

    # BUG FIX: collect into a local list instead of the module-global
    # jsList, which accumulated stale URLs across calls and would
    # re-download previous documents' pages on a second invocation.
    pages = []
    for part in add:
        if 'json' in part:
            pages.append(part.split(':', 1)[1].replace('}', '').strip(']'))
    print('共有%d页' % len(pages))
    for page_url in pages:
        parserPage(page_url, file_name)
def parserPPt(docId, file_name):
    """Download every slide of a PPT document as a JPG image.

    Queries Baidu Wenku's getbcsurl endpoint for the full slide list, then
    saves each slide's 'zoom' image to ./img/<file_name><page>.jpg.

    Args:
        docId: Wenku document id extracted by parserJS.
        file_name: Document title, used as the image file-name prefix.
    """
    print('Downloading pictures······')
    listing = requests.get(
        'https://wenku.baidu.com/browse/getbcsurl?doc_id=%s&pn=1&rn=99999&type=ppt' % docId)
    print(listing.url)
    slides = listing.json()
    print('共有%d页' % len(slides))
    for slide in slides:
        image = requests.get(slide['zoom'])
        print('Downloading page %d' % slide['page'])
        with open('./img/' + file_name + '%d.jpg' % slide['page'], 'wb') as out:
            out.write(image.content)
def parserPDF(add, file_name):
    """Extract page-image (PNG) URLs from the htmlUrls JS blob and save them.

    Args:
        add: Raw JavaScript text containing the ``WkInfo.htmlUrls`` assignment
            (as captured by parserJS).
        file_name: Document title, used as the image file-name prefix.
    """
    # Same unescaping dance as parserDoc: strip \x22 quotes and backslashes.
    add = add.split(' WkInfo.htmlUrls =')[1].split(';')[0]
    add = add.replace('\\x22', '').replace('\\', '')
    add = re.findall(r'pageLoadUrl:.*\w', add)[0].split(',')

    # BUG FIX: local list instead of the module-global picList, which
    # accumulated URLs across calls.
    pics = [p.split(':', 1)[1].replace('}', '').strip(']') for p in add if 'png' in p]
    # The original always discarded the first entry (presumably not a real
    # page image — TODO confirm against the endpoint); guard against an
    # empty list, where remove(picList[0]) used to raise IndexError.
    if pics:
        pics.pop(0)
    print('共有%d页' % len(pics))
    for index, pic_url in enumerate(pics):
        r = requests.get(pic_url)
        # BUG FIX: the original ignored file_name and wrote 'data%d.png',
        # so two different documents overwrote each other's images; use the
        # title as a prefix, consistent with parserPPt.
        with open('./img/%s%d.png' % (file_name, index), 'wb') as fd:
            fd.write(r.content)
def parserPage(url, file_name):
    """Download one page's JSONP payload and append its text to <file_name>.txt.

    Args:
        url: pageLoadUrl for a single page (a JSONP endpoint), possibly still
            containing escaping backslashes.
        file_name: Output file-name stem; extracted text is appended to
            <file_name>.txt.
    """
    r = requests.get(url.replace('\\', ''))
    # The endpoint returns JSONP: wrapper(...json...). Strip the wrapper
    # before parsing.
    result = json.loads(r.text.split('(', 1)[1].strip(')'))
    # PERF/BUG FIX: open the output file once per page instead of reopening
    # it in append mode for every single token; write with an explicit
    # utf-8 encoding so Chinese text survives on platforms whose default
    # encoding is not UTF-8.
    with open(file_name + '.txt', 'a', encoding='utf-8') as fd:
        for item in result['body']:
            if item['t'] != 'word':
                continue
            text = item['c']
            # Tokens tagged with '_enter' mark a line break rather than text.
            if item['ps'] is not None and '_enter' in item['ps'].keys():
                text = '\n'
            print(text, end='')
            fd.write(text)
if __name__ == '__main__':
    # Prompt for a wenku.baidu.com document URL and run the scraper on it.
    target_url = input('请输入网址:')
    parserJS(target_url)