-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
119 lines (78 loc) · 3.05 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Standard library
import os
import re
import time

# Third-party
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# FIX: the Chrome-specific Service subclass lives in selenium.webdriver.chrome.service;
# selenium.webdriver.common.service only provides the abstract base class, which
# cannot be used directly as a ChromeService.
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.os_manager import ChromeType
def get_towns_by_state(state: str, driver: webdriver.Chrome) -> pd.DataFrame:
    """
    Get a list of towns by state from municode.com.

    Scrapes https://library.municode.com/<state> for per-town links,
    writes them to '<state>_town_urls.csv', and returns them.

    Args:
        state (str): the state to get towns from (lowercase abbreviation, e.g. 'ga')
        driver (webdriver.Chrome): the webdriver to use (left open for reuse)

    Returns:
        data_out (pd.DataFrame): columns ['state', 'town', 'url']

    Raises:
        TypeError: if driver is not a webdriver.Chrome
        ValueError: if the town-listing element is not found on the page
    """
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`.
    if not isinstance(driver, webdriver.Chrome):
        raise TypeError("driver must be a webdriver.Chrome instance")
    driver.refresh()
    url = f"https://library.municode.com/{state}"
    # Container element holding the per-town links on the state page.
    xpath_init = '/html/body/div[1]/div[2]/ui-view/div[2]/section/div/div'
    driver.get(url)
    # Wait for the Angular page to render BEFORE querying the DOM; the original
    # slept after find_elements, so the lookup could race the page load.
    time.sleep(2)
    data = driver.find_elements(by=By.XPATH, value=xpath_init)
    if not data:
        raise ValueError(f"town listing element not found at {url}")
    html = data[0].get_attribute("outerHTML")
    # Raw string for the regex tail: '\w' / '\.' in a plain string are invalid
    # escapes (SyntaxWarning now, error in future Python versions).
    url_str = url + r'/[\w\.-]+'
    r1 = re.findall(url_str, html)
    towns_list = pd.DataFrame(r1)
    towns_list = towns_list.drop_duplicates().reset_index(drop=True)
    towns_list = towns_list.rename(columns={0: 'url'})
    # add state column (uppercase)
    towns_list['state'] = state.upper()
    # add town column: last path segment of each URL
    towns_list['town'] = towns_list['url'].str.split('/').str[-1]
    # re-order columns
    data_out = towns_list[['state', 'town', 'url']]
    # save to csv
    csv_filename = f'{state}_town_urls.csv'
    data_out.to_csv(csv_filename, index=False)
    # NOTE: the driver is intentionally NOT quit here; the caller reuses it.
    return data_out
def identify_tbl(url: str, driver: webdriver.Chrome) -> str:
    """
    Extract the last hyperlink embedded in a town page's code section.

    Args:
        url (str): the town page URL to load
        driver (webdriver.Chrome): the webdriver to use

    Returns:
        str: the last http(s) URL found in the section markup, with its final
            character stripped (presumably a trailing quote captured by the
            greedy pattern — confirm against live markup)
    """
    driver.get(url)
    time.sleep(2)
    # Section of the page that contains the code-of-ordinances links.
    section_xpath = '/html/body/div[1]/div[2]/ui-view/mcc-codes/div[2]/section[1]/div[2]'
    elements = driver.find_elements(by=By.XPATH, value=section_xpath)
    time.sleep(2)
    markup = elements[0].get_attribute("outerHTML")
    links = re.findall(r'(https?://[^\s]+)', markup)
    # Take the last match and drop its trailing character.
    return links[-1][:-1]
def main() -> None:
    """Scrape Georgia town URLs from municode and resolve the first town's link."""
    driver = webdriver.Chrome()
    try:
        town_urls_tbl = get_towns_by_state('ga', driver)
        town_urls = town_urls_tbl['url'].tolist()
        identify_tbl(town_urls[0], driver)
    finally:
        # Always release the Chrome process, even if scraping fails —
        # the original script leaked the browser on any exception.
        driver.quit()


# Guard so importing this module does not launch a browser.
if __name__ == "__main__":
    main()
# DEPRECATIONS
# chrome_options = Options()
# determine if local or docker
# if os.getenv('DOCKER'):
# chrome_options.add_argument('--no-sandbox')
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-dev-shm-usage')
# chrome_options.binary_location = '/usr/bin/google-chrome'
# chrome_options.add_argument('--disable-gpu')
# chrome_options.add_argument('--no-sandbox')
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-dev-shm-usage')
# elif os.getenv('LOCAL') == 'True':
# chrome_options.add_experimental_option("detach", True)
# chrome_service = ChromeService(ChromeDriverManager().install())
# chrome_service = ChromeService('/usr/bin/chromedriver')