web_util.py

from BeautifulSoup import BeautifulSoup
import cookielib
import re
import urllib
import urllib2


# Some fake headers so we can log in properly.
HTTP_HEADERS = {
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5;Windows NT) AppleWebKit/537.6+ (KHTML, like Gecko) WebKitGTK+/1.10.1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}


def send_get_request(url, params={}):
    '''
    Gets the page with a GET request and returns the response of said page;
    see the docs for urllib and urllib2 to read more about this response
    object.

    TODO: Add GET parameters in case we can't simply send them as a POST for
    some reason (look up examples of this sort of thing occurring as well,
    just in case any valid GET request with parameters ends up being okay as
    a POST request as well).
    '''
    get_data_encoded = urllib.urlencode(params)
    if get_data_encoded:
        url = "?".join([url, get_data_encoded])
    request = urllib2.Request(url=url, data=None, headers=HTTP_HEADERS)
    response = urllib2.urlopen(request)
    return response
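

# Illustrative usage sketch (not part of the original module): fetching a
# page with query parameters via send_get_request. The URL and parameter
# names here are placeholders, not ones this module actually targets.
def _example_get_request():
    response = send_get_request(
        "https://example.com/courses",
        params={"quarter": "spring", "year": "2013"},
    )
    return response.read()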


def parse_hidden_params(html_str):
    '''
    This is to deal with all of the hidden form parameters (which are likely
    there for CSRF safety). These are returned as a dictionary of names to
    values (intended to be used as parameters for 'send_post_request').

    If no hidden parameters are found, an empty dictionary will be returned.
    '''
    params = {}
    page = BeautifulSoup(html_str)
    # Use .get() so inputs without a 'type' attribute are skipped rather than
    # raising a KeyError.
    inputs = [p for p in page.findAll('input') if p.get('type') == u'hidden']
    for input_ in inputs:
        params[input_['name']] = input_['value']
    return params
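

# Illustrative sketch (not part of the original module): what
# parse_hidden_params returns for a small hard-coded form with a hidden
# CSRF-style token. The field names are made up for the example.
def _example_hidden_params():
    html = (
        '<form action="/login" method="post">'
        '<input type="hidden" name="csrf_token" value="abc123" />'
        '<input type="text" name="username" />'
        '</form>'
    )
    # Expected: {u'csrf_token': u'abc123'}
    return parse_hidden_params(html)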


def parse_redirect_action(html_str):
    '''
    This is meant to handle the case when the user has to be redirected by one
    of those silly 'document.onload' functions. It's usually just sending a
    form with a bunch of hidden parameters.

    Note this will only work if there is one form on the page. It will refuse
    to work if there is no 'onload' function atop the page that ends in a
    submit call.
    '''
    # This should probably make sure the page is a link to UW weblogin,
    # or maybe even simpler, like checking to see if it links to a different
    # page, or a set of pages which happens to include UW weblogin.
    page = BeautifulSoup(html_str)
    form = page.form
    body = page.body
    try:
        submit_func = body['onload']
    except KeyError:
        return None
    if not re.match(r'^.*?submit\(\)$', submit_func):
        return None
    if form is not None:
        try:
            link = form['action']
            return link
        except KeyError:
            return None
    return None
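

# Illustrative sketch (not part of the original module): a page that bounces
# the browser along with a body 'onload' submit, the case
# parse_redirect_action handles. The URL and field name are placeholders.
def _example_redirect_action():
    html = (
        '<html><body onload="document.forms[0].submit()">'
        '<form action="https://login.example.edu/relay" method="post">'
        '<input type="hidden" name="token" value="abc123" />'
        '</form></body></html>'
    )
    action = parse_redirect_action(html)  # 'https://login.example.edu/relay'
    hidden = parse_hidden_params(html)    # {u'token': u'abc123'}
    return action, hidden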


def make_url_opener():
    '''
    Builds a cookie-lovin', URL-openin' machine (pretty simple, but the
    implementation may change later, so it's a function).

    Returns the CookieJar instance that is tied to the URL opener, just in
    case we wish to peer into the cookie jar later.
    '''
    cookies = cookielib.CookieJar()
    cookie_handler = urllib2.HTTPCookieProcessor(cookies)
    url_opener = urllib2.build_opener(cookie_handler)
    # Install globally so later urllib2.urlopen() calls reuse the same cookies.
    urllib2.install_opener(url_opener)
    return cookies


def send_post_request(url, params={}, headers={}):
    '''
    Attempts to open the link using a POST request with the passed dictionary
    of params.

    TODO: Document exceptions.
    '''
    post_data_encoded = urllib.urlencode(params)
    request = urllib2.Request(url, post_data_encoded, HTTP_HEADERS)
    for k in headers.keys():
        request.add_header(k, headers[k])
    return urllib2.urlopen(request)
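

# Illustrative sketch (not part of the original module): a rough login flow
# tying make_url_opener, send_get_request, parse_hidden_params and
# send_post_request together. The URL and credential field names are
# placeholders; a real login form would dictate its own.
def _example_login(username, password):
    cookies = make_url_opener()  # cookie-aware opener is now installed
    login_page = send_get_request("https://weblogin.example.edu/")
    params = parse_hidden_params(login_page.read())
    params.update({"user": username, "pass": password})
    response = send_post_request("https://weblogin.example.edu/", params)
    return response, cookies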


def unwrap_html_contents(elmnt):
    '''
    Recursively tries to unwrap the data from within an element until there
    are no more layers to unwrap. This shouldn't run into any infinite loops,
    as eventually an element will be empty or will contain some sort of
    contents.

    This will return the first non-None object it sees, so this is more for
    taking elements out of nested tags, like:

        <tt>
          <a>
            <i>
              <strong>
                Foober Bazzle-Snazz
              </strong>
            </i>
          </a>
        </tt>

    which would be a pain to hard-code.
    '''
    contents = elmnt.contents
    while contents:
        for c in contents:
            if c.string is None:
                # Still a nested tag; descend another level.
                return unwrap_html_contents(c)
            else:
                return c.string
    return None
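

# Illustrative sketch (not part of the original module): unwrapping the
# nested-tag example shown in the docstring above.
def _example_unwrap():
    cell = BeautifulSoup(
        '<tt><a><i><strong>Foober Bazzle-Snazz</strong></i></a></tt>'
    )
    # Expected: u'Foober Bazzle-Snazz'
    return unwrap_html_contents(cell.tt)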


def parse_table_headers(tags, html_str):
    '''
    This is for getting the data directly under the set of table headers on a
    page (for now). By that I mean if a table passed was rendered such as
    this:

         _____ _____
        | Foo | Bar |
         ----- -----
        | 22  | 64  |
         ----- -----

    then the data under 'Foo' would be 22, and the data under 'Bar' would be
    64.

    The arguments required are a list of table header names (case
    insensitive) that the caller intends to find in at most one of the table
    headers (the last matching table header will have its value stored in the
    dictionary for now).

    For example, if we simply want the data under the headers 'Foo' and
    'Bar', we would pass ['Foo', 'Bar'] as the first parameter. Then, if 'Foo'
    and 'Bar' were found anywhere within the table headers of the HTML string
    that renders the example above, and the value under each matched header
    is not None, the string that led to the match will be a key within the
    returned dictionary, with the value as the string directly below the
    header.

    From the above example, calling parse_table_headers(['Foo', 'Bar'], html),
    where html would render something similar to the above, the returned
    dictionary would be:

        { u'Foo': u'22', u'Bar': u'64' }
    '''
    regex = re.compile(
        "^.*(?P<header>{0})".format('|'.join(tags)),
        re.IGNORECASE,
    )
    page = BeautifulSoup(html_str, convertEntities=BeautifulSoup.HTML_ENTITIES)
    # Remove all <br /> tags, because they'll only screw things up.
    for br in page.findAll('br'):
        br.extract()
    # This is a bit of a hack, but if we parse the page again, all the removed
    # <br /> tags will leave contiguous strings in their wake. This will allow
    # us to parse things like the current enrollment and room capacity.
    page = BeautifulSoup(str(page))
    info = {}
    # Go through the tables and find any class info (this loop is why I hate
    # tables....). We'll iterate through all of the rows and columns, keeping
    # track of where we are so we can access other sections of the rows and
    # columns if we encounter the types of elements we're looking for.
    tables = page.findAll('table')
    for table in tables:
        rows = table.findAll('tr')
        row_index = 0
        for row in rows:
            headers = row.findAll('th')
            column_index = 0
            for header in headers:
                # This will only match after converting the unicode to a
                # regular string. There's likely a far better way to do this.
                m = re.match(regex, str(header.string))
                # If m was a match, then we'll simply pluck the element
                # directly under the row and column we were looking for. After
                # that, if the element under the header is not None, then we
                # have a key and value that can be stored in the info
                # dictionary.
                if m is not None:
                    next_row_elmnt = rows[
                        row_index + 1
                    ].findAll('td')[column_index]
                    string = unwrap_html_contents(next_row_elmnt)
                    if string is not None:
                        info[m.group('header')] = string.strip()
                column_index += 1
            row_index += 1
    return info
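

# Illustrative sketch (not part of the original module): the Foo/Bar table
# from the parse_table_headers docstring, expressed as minimal HTML.
def _example_table_headers():
    html = (
        '<table>'
        '<tr><th>Foo</th><th>Bar</th></tr>'
        '<tr><td>22</td><td>64</td></tr>'
        '</table>'
    )
    # Expected: {'Foo': u'22', 'Bar': u'64'}
    return parse_table_headers(['Foo', 'Bar'], html)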