scrape_foodndtv.py
#!/usr/bin/env python
import urllib2
import re

from bs4 import BeautifulSoup


def char_range(c1, c2, lower=False):
    """Generates the characters from `c1` to `c2`, inclusive."""
    for c in xrange(ord(c1), ord(c2) + 1):
        if lower:
            yield chr(c).lower()
        else:
            yield chr(c)


def get_category_urls(base_url):
    """Collects the ingredient-category URLs linked from the index page."""
    html = urllib2.urlopen(base_url).read()
    soup = BeautifulSoup(html, 'html5lib')

    # Gather the category URLs (dots escaped so the pattern matches them literally)
    categories = soup.find_all('a', href=re.compile(r'http://food\.ndtv\.com/ingredient/\w+'))
    urls = list(set(category['href'] for category in categories))
    return urls


def urljoin(first, second):
    """Joins two URL fragments with a single slash."""
    return '/'.join([first, second])


def scraper(urls):
    """Walks each category's a-z index pages and collects ingredient names."""
    all_ingredients = []
    for url in urls:
        for ch in char_range('a', 'z', lower=True):
            current = urljoin(url, ch)
            print("Currently parsing URL: %s" % current)
            html = urllib2.urlopen(current).read()
            soup = BeautifulSoup(html, 'html5lib')
            container = soup.find('div', class_='vdo_lst')
            if container is None:
                # Some letters have no listing page; skip them instead of crashing.
                continue
            # Ingredient names live in the title attribute of each link.
            links = container.find_all(lambda tag: tag.name == 'a' and tag.has_attr('title'))
            ingredients = list(set(link['title'] for link in links))
            print("Adding %d new ingredients" % len(ingredients))
            all_ingredients.extend(ingredients)
    return all_ingredients


if __name__ == '__main__':
    base_url = 'http://food.ndtv.com/ingredient'
    urls = get_category_urls(base_url)
    ingredients = scraper(urls)
    print("Scraped %d unique ingredients" % len(set(ingredients)))
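
Note that the script above targets Python 2 only (urllib2 and xrange do not exist in Python 3). A minimal Python 3 sketch of the same flow might look like the following; it is untested against the live site, and the URL structure and the vdo_lst container class are assumed to be unchanged.

#!/usr/bin/env python3
import re
import string
from urllib.request import urlopen

from bs4 import BeautifulSoup


def get_category_urls(base_url):
    """Collects the ingredient-category URLs linked from the index page."""
    soup = BeautifulSoup(urlopen(base_url).read(), 'html5lib')
    links = soup.find_all('a', href=re.compile(r'http://food\.ndtv\.com/ingredient/\w+'))
    return list({a['href'] for a in links})


def scrape(urls):
    """Walks each category's a-z index pages and collects ingredient names."""
    all_ingredients = []
    for url in urls:
        # string.ascii_lowercase replaces the char_range helper.
        for ch in string.ascii_lowercase:
            soup = BeautifulSoup(urlopen('%s/%s' % (url, ch)).read(), 'html5lib')
            container = soup.find('div', class_='vdo_lst')
            if container is None:
                continue
            all_ingredients.extend(
                a['title'] for a in container.find_all('a') if a.has_attr('title'))
    return all_ingredients


if __name__ == '__main__':
    ingredients = scrape(get_category_urls('http://food.ndtv.com/ingredient'))
    print("Scraped %d unique ingredients" % len(set(ingredients)))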