任务爬虫.py
# coding: utf-8
"""Scrape product data from endclothing.com.

Walks a paginated category listing, refreshing cookies through a real
Firefox session (Selenium), then fetches each product page with requests
and extracts gallery images, name, description, price and colour.
"""
import json

import requests
from lxml import etree
from selenium import webdriver
from urllib.parse import urlsplit
class T3spider(object):
    def __init__(self, start_url=None, headless=True):
        options = webdriver.FirefoxOptions()
        if headless:
            options.add_argument('-headless')
        self.web = webdriver.Firefox(options=options)
        if start_url:
            self.start_url = start_url
        else:
            self.start_url = 'https://www.endclothing.com/us/neighborhood-drizzler-jacket-181spnh-jkm01-grn.html'
        # Mimic the header set a real browser sends; the authority/method/
        # path/scheme entries are kept from the original browser capture.
        self.head = {
            'authority': 'www.endclothing.com',
            'method': 'GET',
            'path': urlsplit(self.start_url).path,
            'scheme': 'https',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
    def get_urls(self, htmls):
        """Parse a listing page: collect product links and advance to the next page."""
        html = etree.HTML(htmls)
        href = html.xpath('//a[@class="c-pagination__next"]/@href')
        if href:
            print(href[0])
            self.start_url = href[0]
        else:
            self.start_url = ''  # no next page, so the crawl loop stops
        urls = html.xpath('//div[@class="product-item-info"]/a/@href')
        return urls
    def get_imgs(self, imgjson):
        """Collect every 'data' list from the nested gallery-config JSON."""
        ls = []
        img = json.loads(imgjson)
        for i in img:
            for j in img[i]:
                for k in img[i][j]:
                    if k == 'data':
                        ls.extend(img[i][j][k])
        return ls
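    # A minimal sketch of the gallery JSON shape this traversal assumes; only
    # the 'data' key is confirmed by the code above, the outer keys are
    # Magento-style placeholders that may differ on the live page:
    #   {"[data-gallery-role=gallery-placeholder]":
    #       {"mage/gallery/gallery": {"data": [{"img": "...jpg"}, ...]}}}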
    def get_desc(self, descjson):
        """Pull name, description, currency and price from the ld+json product data."""
        ls = []
        desc = json.loads(descjson)
        ds = ['name', 'description', ['offers', 'priceCurrency'], ['offers', 'price']]
        for i in ds:
            if isinstance(i, list):
                ls.append(desc[i[0]][i[1]])
            else:
                ls.append(desc[i])
        return ls
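    # Hedged example of the schema.org/Product ld+json this expects; the
    # values are invented for illustration:
    #   {"@type": "Product", "name": "Drizzler Jacket", "description": "...",
    #    "offers": {"priceCurrency": "USD", "price": "399"}}
    # get_desc would return ['Drizzler Jacket', '...', 'USD', '399'].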
    def get_data(self, htmls):
        """Extract gallery images, product fields and colour from a product page."""
        data = []
        html = etree.HTML(htmls)
        imgs = html.xpath('//script[contains(text(),"[data-gallery-role=gallery-placeholder]")]')
        if imgs:
            img_text = imgs[0].text.strip()
            data.append(self.get_imgs(img_text))
        product_data = html.xpath('//script[@type="application/ld+json"]')
        if product_data:
            product_text = product_data[0].text.strip()
            data.extend(self.get_desc(product_text))
        title = html.xpath('//meta[@name="WT.z_pcolour"]')
        if title:
            data.append(title[0].attrib['content'])
        return data
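    # When every selector matches, get_data returns, in order:
    #   [gallery image list, name, description, priceCurrency, price, colour]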
    def get_content(self, url):
        """Fetch one product page; a short body usually means the cookies expired."""
        self.head['path'] = urlsplit(url).path
        req = requests.get(url, headers=self.head)
        ps = len(req.content)
        print('Fetched page length: {}'.format(ps))
        if ps < 10000:
            print('Fetch failed, the cookies need refreshing')
        return self.get_data(req.content)
    def cookieiter(self, url):
        """Load the page in the browser and copy its cookies into the request headers."""
        self.web.get(url)
        cookies = {}
        # Selenium returns one dict per cookie with 'name' and 'value' keys.
        for c in self.web.get_cookies():
            cookies[c['name']] = c['value']
        # The Cookie header must be a single 'name=value; ...' string.
        self.head['cookie'] = '; '.join('{}={}'.format(k, v) for k, v in cookies.items())
        print(self.head['cookie'])
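    # Round-trip sketch with invented values: get_cookies() yields e.g.
    # [{'name': 'session', 'value': 'abc123', 'domain': '...'}, ...], which
    # becomes the header string 'session=abc123; ...'.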
def main():
    spider = T3spider('https://www.endclothing.com/row/brands/nike')
    times = 0
    try:
        while spider.start_url:
            spider.cookieiter(spider.start_url)
            req = requests.get(spider.start_url, headers=spider.head)
            urls = spider.get_urls(req.content)
            print('Number of product URLs: {}'.format(len(urls)))
            for i in urls:
                times += 1
                if times % 2 == 0:
                    # Refresh the cookies every other request so they stay valid.
                    spider.cookieiter(spider.start_url)
                print('Fetching data from: {}'.format(i))
                print('Fields extracted: {}'.format(len(spider.get_content(i))))
    except Exception as e:
        print(e)
    finally:
        spider.web.quit()
if __name__ == '__main__':
    main()
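# Assumed setup, not stated in the source: requests, lxml and selenium
# installed, plus Firefox with a matching geckodriver on PATH.
#   pip install requests lxml selenium
#   python 任务爬虫.py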