-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
45 lines (38 loc) · 1.47 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import time
import ujson
import spacy
from collections import defaultdict
from rank_dishes import rank_dishes
def get_restaurants(business_path):
    """Map the business_id of every restaurant in the dataset to its name.

    The file at ``business_path`` is read as UTF-8 text, one JSON object per
    line. A record is kept when its ``categories`` value is truthy and
    ``'Restaurants' in categories`` holds (a substring test if the value is a
    string, a membership test if it is a list).

    Args:
        business_path: Path to the line-delimited JSON business file.

    Returns:
        A dict mapping ``business_id`` to ``name`` for each kept record.

    Raises:
        KeyError: If a kept record has no ``business_id`` or ``name`` key.
    """
    restaurants = {}
    with open(business_path, 'r', encoding='utf8') as f:
        for line in f:
            # A blank line (for instance a trailing empty line at the end of
            # the file) is not a JSON document; skip it rather than let the
            # parser fail on it.
            if not line.strip():
                continue
            jso = ujson.loads(line)
            # .get() treats a record without a 'categories' key the same way
            # as one whose categories are null: it is not a restaurant.
            categories = jso.get('categories')
            if categories and 'Restaurants' in categories:
                restaurants[jso['business_id']] = jso['name']
    return restaurants
def get_reviews(restaurants, review_path):
    """Collect the review texts belonging to the given restaurants.

    The file at ``review_path`` is read as UTF-8 text, one JSON object per
    line. Reviews whose ``business_id`` is not in ``restaurants`` are ignored.
    Newlines inside a review's ``text`` are replaced with single spaces.

    Args:
        restaurants: Container of business ids, tested with ``in``.
        review_path: Path to the line-delimited JSON review file.

    Returns:
        A ``defaultdict(list)`` mapping each ``business_id`` that has at
        least one review to its review texts, in file order.

    Raises:
        KeyError: If a record has no ``business_id`` key, or a matching
            record has no ``text`` key.
    """
    reviews = defaultdict(list)
    with open(review_path, 'r', encoding='utf8') as f:
        for line in f:
            # A blank line (for instance a trailing empty line at the end of
            # the file) is not a JSON document; skip it rather than let the
            # parser fail on it.
            if not line.strip():
                continue
            jso = ujson.loads(line)
            business_id = jso['business_id']
            if business_id in restaurants:
                reviews[business_id].append(jso['text'].replace('\n', ' '))
    return reviews
def main():
    """Rank the dishes of every reviewed restaurant and save the rankings.

    Loads the restaurants from ``yelp_dataset/business.json`` and their
    reviews from ``yelp_dataset/review.json``, calls
    ``rank_dishes('models', reviews)`` once per restaurant that has at least
    one review, and writes a JSON object mapping each business id to its list
    of dishes to ``best_dishes.json``. The elapsed time is printed at the end.
    """
    # perf_counter is the clock meant for measuring elapsed intervals.
    start_time = time.perf_counter()

    restaurants = get_restaurants('yelp_dataset/business.json')
    reviews_dict = get_reviews(restaurants, 'yelp_dataset/review.json')

    # rank_dishes returns pairs; only the first item (the dish) is kept, in
    # the order rank_dishes produced them.
    best_dishes = {
        bid: [dish for dish, _ in rank_dishes('models', reviews)]
        for bid, reviews in reviews_dict.items()
    }

    # Write rankings to a file.
    # NOTE(review): the dump is written in slices of at most 2**31 - 1
    # characters, presumably to stay under a per-write size limit on some
    # platforms -- confirm before simplifying to a single write.
    max_bytes = 2**31 - 1
    dump = ujson.dumps(best_dishes)
    # Explicit encoding so the output does not depend on the platform
    # default, matching how the input files are opened.
    with open('best_dishes.json', 'w', encoding='utf8') as f:
        for i in range(0, len(dump), max_bytes):
            f.write(dump[i:i + max_bytes])

    # The timer was previously started but never read.
    elapsed = time.perf_counter() - start_time
    print('Finished in {:.2f} seconds'.format(elapsed))


if __name__ == '__main__':
    main()