-
Notifications
You must be signed in to change notification settings - Fork 0
/
uoft_scrape.py
41 lines (29 loc) · 1.08 KB
/
uoft_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from uoft_public_health import *
import pprint
# URL path fragments, newest period first; each one is appended to
# UofTPublicHealthListing.listing_url() to address one listing page.
suffixes = [
    "phd-final-oral-examination-" + period
    for period in ("2011-current", "2006-2010", "2000-2005")
]
def test_first_page():
    """Fetch every listing page and pretty-print each defence's visit() result.

    For each entry in the module-level ``suffixes`` list, the listing URL is
    built from ``UofTPublicHealthListing.listing_url()`` plus the suffix. The
    page is downloaded, parsed by ``BeautifulSoup`` with the ``"html5lib"``
    parser, and wrapped in a ``UofTPublicHealthListing``. Every object yielded
    by ``defences(listing_url)`` is then visited and the result is printed.
    Nothing is returned and nothing is written to disk.

    Raises:
        requests.RequestException: if a listing page cannot be fetched within
            the timeout or the server answers with an HTTP error status.
    """
    # The printer carries no per-page state, so build it once up front.
    pp = pprint.PrettyPrinter(indent=2)
    for suffix in suffixes:
        listing_url = UofTPublicHealthListing.listing_url() + suffix
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled connection.
        response = requests.get(listing_url, timeout=30)
        # Fail loudly instead of parsing an error page as if it were a listing.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html5lib")
        directory = UofTPublicHealthListing(soup)
        for defence in directory.defences(listing_url):
            pp.pprint(defence.visit())
def scrape():
    """Write one row per defence from every listing page via ``TabWriter``.

    Opens a ``TabWriter`` on ``output/uoft_public_health.txt`` and writes its
    headers once. For each entry in the module-level ``suffixes`` list, the
    listing URL is built from ``UofTPublicHealthListing.listing_url()`` plus
    the suffix, the page is downloaded and parsed by ``BeautifulSoup`` with
    the ``"html5lib"`` parser, and every object yielded by
    ``defences(listing_url)`` is passed to ``write_row``. Nothing is returned.

    Raises:
        requests.RequestException: if a listing page cannot be fetched within
            the timeout or the server answers with an HTTP error status.
    """
    writer = TabWriter("output/uoft_public_health.txt")
    writer.write_headers()
    # NOTE(review): the writer is never explicitly closed or flushed here;
    # confirm that TabWriter takes care of that itself.
    for suffix in suffixes:
        listing_url = UofTPublicHealthListing.listing_url() + suffix
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled connection.
        response = requests.get(listing_url, timeout=30)
        # An unchecked 4xx/5xx page would be parsed as a listing and silently
        # produce missing or bogus rows in the output file.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html5lib")
        directory = UofTPublicHealthListing(soup)
        for defence in directory.defences(listing_url):
            writer.write_row(defence)
# Debug entry point: uncomment to pretty-print each defence's visit() result
# rather than writing the output file.
#test_first_page()
# Runs the full scrape at module level, i.e. on execution and on import alike.
scrape()