-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipelines.py
156 lines (132 loc) · 4.43 KB
/
pipelines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from itemadapter import ItemAdapter
class BookscraperPipeline:
def process_item(self, item, spider):
adapter = ItemAdapter(item)
## Strip all whitespaces from strings
field_names = adapter.field_names()
for field_name in field_names:
if field_name != 'description':
value = adapter.get(field_name)
adapter[field_name] = value.strip()
## Category & Product Type --> switch to lowercase
lowercase_keys = ['category', 'product_type']
for lowercase_key in lowercase_keys:
value = adapter.get(lowercase_key)
adapter[lowercase_key] = value.lower()
## Price --> convert to float
price_keys = ['price', 'price_excl_tax', 'price_incl_tax', 'tax']
for price_key in price_keys:
value = adapter.get(price_key)
value = value.replace('£', '')
adapter[price_key] = float(value)
## Availability --> extract number of books in stock
availability_string = adapter.get('availability')
split_string_array = availability_string.split('(')
if len(split_string_array) < 2:
adapter['availability'] = 0
else:
availability_array = split_string_array[1].split(' ')
adapter['availability'] = int(availability_array[0])
## Reviews --> convert string to number
num_reviews_string = adapter.get('num_reviews')
adapter['num_reviews'] = int(num_reviews_string)
## Stars --> convert text to number
stars_string = adapter.get('stars')
split_stars_array = stars_string.split(' ')
stars_text_value = split_stars_array[1].lower()
if stars_text_value == "zero":
adapter['stars'] = 0
elif stars_text_value == "one":
adapter['stars'] = 1
elif stars_text_value == "two":
adapter['stars'] = 2
elif stars_text_value == "three":
adapter['stars'] = 3
elif stars_text_value == "four":
adapter['stars'] = 4
elif stars_text_value == "five":
adapter['stars'] = 5
return item
import mysql.connector
class SaveToMySQLPipeline:
def __init__(self):
self.conn = mysql.connector.connect(
host = 'localhost',
user = 'root',
# password = '',
database = 'books'
)
## Create cursor, used to execute commands
self.cur = self.conn.cursor()
## Create books table if none exists
self.cur.execute("""
CREATE TABLE IF NOT EXISTS books(
id int NOT NULL auto_increment,
url VARCHAR(255),
title text,
upc VARCHAR(255),
product_type VARCHAR(255),
price_excl_tax DECIMAL,
price_incl_tax DECIMAL,
tax DECIMAL,
price DECIMAL,
availability INTEGER,
num_reviews INTEGER,
stars INTEGER,
category VARCHAR(255),
description text,
PRIMARY KEY (id)
)
""")
def process_item(self, item, spider):
## Define insert statement
self.cur.execute(""" insert into books (
url,
title,
upc,
product_type,
price_excl_tax,
price_incl_tax,
tax,
price,
availability,
num_reviews,
stars,
category,
description
) values (
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s
)""", (
item["url"],
item["title"],
item["upc"],
item["product_type"],
item["price_excl_tax"],
item["price_incl_tax"],
item["tax"],
item["price"],
item["availability"],
item["num_reviews"],
item["stars"],
item["category"],
str(item["description"])
))
## Execute insert of data into database
self.conn.commit()
return item
def close_spider(self, spider):
## Close cursor & connection to database
self.cur.close()
self.conn.close()