- rename output routeFareList.min.json to routeFareList.mergeRoutes.min.json in previous workflow (mergeRoutes.py)

- add mergeStopList stop grouping to alpha json for testing
- update Python 3.8 to 3.12
- update pyproj 3.3.0 to 3.6.1
- update outdated actions/checkout@v2 to actions/checkout@v4
- update outdated actions/setup-python@v4 to actions/setup-python@v5
- update outdated actions/upload-artifact@v3 to actions/upload-artifact@v4
chengkeith committed Sep 23, 2024
1 parent 6cd056f commit e005877
Showing 4 changed files with 277 additions and 24 deletions.
26 changes: 20 additions & 6 deletions .github/workflows/fetch-data.yml
@@ -11,11 +11,11 @@ jobs:
 
     steps:
       - name: Check out repository code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
       - name: Setup Python environment
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
-          python-version: '3.8'
+          python-version: '3.12'
           architecture: 'x64'
           cache: 'pip'
           cache-dependency-path: crawling/requirements.txt
@@ -45,11 +45,12 @@ jobs:
           python ./crawling/matchGtfs.py
           python ./crawling/cleansing.py
           python ./crawling/mergeRoutes.py
+          python ./crawling/mergeStopList.py
           python ./crawling/routeCompare.py
           python ./crawling/mtrExits.py
       - name: Archive crawling outputs
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         if: always()
         with:
           name: Crawled Files
@@ -59,6 +60,7 @@ jobs:
             routeFareList*
             routeList*
             stopList*
+            stopMap*
             routeTime.json
             gtfs
             gtfs.json
@@ -68,11 +70,23 @@ jobs:
             route-ts/
             exits.mtr.json
       - name: Update MD5
-        run: md5sum routeFareList.min.json | cut -f1 -d ' ' | tr -d $'\n' > routeFareList.md5
+        run: |
+          md5sum routeFareList.min.json | cut -f1 -d ' ' | tr -d $'\n' > routeFareList.md5
+          md5sum routeFareList.alpha.min.json | cut -f1 -d ' ' | tr -d $'\n' > routeFareList.alpha.md5
       - name: create deployment folder
         run: mkdir -p build
       - name: cp files into deployment folder
-        run: cp -r routeFareList.json routeFareList.min.json routeFareList.md5 CNAME exits.mtr.json route-ts build/
+        run: |
+          cp \
+            routeFareList.json \
+            routeFareList.min.json \
+            routeFareList.alpha.json \
+            routeFareList.alpha.min.json \
+            routeFareList.md5 \
+            routeFareList.alpha.md5 \
+            CNAME \
+            exits.mtr.json \
+            build/
       - name: cp route-ts into deployment folder
         run: cp -r route-ts build
       - name: Update resources
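For reference, the two md5sum pipelines in the Update MD5 step write each hex digest with no trailing newline. A minimal Python sketch of the same operation, assuming the minified JSON files exist in the working directory (the write_md5 helper is hypothetical, not part of the repository):

import hashlib

def write_md5(src: str, dest: str) -> None:
    # hash the file bytes and write the hex digest with no trailing newline,
    # mirroring `md5sum <src> | cut -f1 -d ' ' | tr -d $'\n' > <dest>`
    with open(src, 'rb') as f:
        digest = hashlib.md5(f.read()).hexdigest()
    with open(dest, 'w') as f:
        f.write(digest)

write_md5('routeFareList.min.json', 'routeFareList.md5')
write_md5('routeFareList.alpha.min.json', 'routeFareList.alpha.md5')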
35 changes: 19 additions & 16 deletions crawling/mergeRoutes.py
@@ -68,25 +68,28 @@ def importRouteListJson( co ):
                 stop_b = stopList[stop_b]
                 dist = haversine(
                     (stop_a['location']['lat'], stop_a['location']['lng']),
-                    (stop_b['location']['lat'], stop_b['location']['lng'])
-                ) * 1000 # in meter
+                    (stop_b['location']['lat'], stop_b['location']['lng']),
+                    unit=Unit.METERS # specify that we want distance in metres, default unit is km
+                )
                 merge = merge and dist < 300
             if merge:
                 found = True
                 route['stops'].append((co, _route['stops']))
                 route['bound'][co] = _route['bound']
-                for i in range(0, route['seq']):
-                    if route['stops'][0][0] == co:
-                        # skip if same company
-                        continue
-                    if route['stops'][0][1][i] not in stopMap:
-                        stopMap[route['stops'][0][1][i]] = [(co, _route['stops'][i])]
-                    elif (co, _route['stops'][i]) not in stopMap[route['stops'][0][1][i]]:
-                        stopMap[route['stops'][0][1][i]].append( (co, _route['stops'][i]) )
-                    if _route['stops'][i] not in stopMap:
-                        stopMap[_route['stops'][i]] = [(route['stops'][0][0], route['stops'][0][1][i])]
-                    elif (route['stops'][0][0], route['stops'][0][1][i]) not in stopMap[_route['stops'][i]]:
-                        stopMap[_route['stops'][i]].append( (route['stops'][0][0], route['stops'][0][1][i]) )
+                #### stopMap will be generated by mergeStopList.py, hence commented below ####
+                # for i in range(0, route['seq']):
+                #     if route['stops'][0][0] == co:
+                #         # skip if same company
+                #         continue
+                #     if route['stops'][0][1][i] not in stopMap:
+                #         stopMap[route['stops'][0][1][i]] = [(co, _route['stops'][i])]
+                #     elif (co, _route['stops'][i]) not in stopMap[route['stops'][0][1][i]]:
+                #         stopMap[route['stops'][0][1][i]].append( (co, _route['stops'][i]) )
+                #     if _route['stops'][i] not in stopMap:
+                #         stopMap[_route['stops'][i]] = [(route['stops'][0][0], route['stops'][0][1][i])]
+                #     elif (route['stops'][0][0], route['stops'][0][1][i]) not in stopMap[_route['stops'][i]]:
+                #         stopMap[_route['stops'][i]].append( (route['stops'][0][0], route['stops'][0][1][i]) )
+                ####
 
         if not found:
             routeList.append(
@@ -177,8 +180,8 @@ def standardizeDict(d):
     'serviceDayMap': serviceDayMap,
 })
 
-with open( 'routeFareList.json', 'w', encoding='UTF-8' ) as f:
+with open( 'routeFareList.mergeRoutes.json', 'w', encoding='UTF-8' ) as f:
     f.write(json.dumps(db, ensure_ascii=False, indent=4))
 
-with open( 'routeFareList.min.json', 'w', encoding='UTF-8' ) as f:
+with open( 'routeFareList.mergeRoutes.min.json', 'w', encoding='UTF-8' ) as f:
     f.write(json.dumps(db, ensure_ascii=False, separators=(',', ':')))
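The haversine change above is behaviour-preserving: the library returns kilometres by default, so the old `* 1000` and the new `unit=Unit.METERS` yield the same value. A minimal sketch, with illustrative coordinates:

from haversine import haversine, Unit

a = (22.302711, 114.177216)  # (lat, lng), two points a few dozen metres apart
b = (22.303000, 114.177500)

km = haversine(a, b)                   # default unit: kilometres
m = haversine(a, b, unit=Unit.METERS)  # same distance expressed in metres

assert abs(km * 1000 - m) < 1e-6       # old `* 1000` and new `unit=` agree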
236 changes: 236 additions & 0 deletions crawling/mergeStopList.py
@@ -0,0 +1,236 @@
import logging
import math
import json
import time
from haversine import haversine, Unit

def get_stop_group(route_list, stop_list, stop_seq_mapping, stop_list_grid, stop_id):
    DISTANCE_THRESHOLD = 50 # in metres
    BEARING_THRESHOLD = 45 # in degrees
    STOP_LIST_LIMIT = 50 # max number of stops in a group

    def get_stops_haversine_distance(stop_a, stop_b):
        return haversine(
            (stop_a['location']['lat'], stop_a['location']['lng']),
            (stop_b['location']['lat'], stop_b['location']['lng']),
            unit=Unit.METERS # specify that we want distance in meter, default is km
        )

    bearing_targets = stop_seq_mapping.get(stop_id, {}).get('bearings', [])

    def is_bearing_in_range(bearing):
        if BEARING_THRESHOLD >= 180 or not bearing_targets:
            return True
        for target in bearing_targets:
            bearing_min = target - BEARING_THRESHOLD
            bearing_max = target + BEARING_THRESHOLD
            if bearing_min < 0:
                bearing_min += 360
            if bearing_max > 360:
                bearing_max -= 360
            if (bearing_min <= bearing <= bearing_max or
                    (bearing_min > bearing_max and (bearing <= bearing_max or bearing >= bearing_min))):
                return True
        return False
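    # Wraparound note: after clamping into [0, 360), a range that crosses north has
    # bearing_min > bearing_max (e.g. a target of 10° yields [325, 55]), and the
    # second clause in is_bearing_in_range above accepts both 350° and 40°.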

    def search_nearby_stops(target_stop_id, excluded_stop_id_list):
        target_stop = stop_list[target_stop_id]
        # take lat/lng up to 3 decimal places, that's about 100m x 100m square
        lat = int(target_stop['location']['lat'] * 1000)
        lng = int(target_stop['location']['lng'] * 1000)

        nearby_stops = []
        for stop_id in stop_list_grid.get(f"{lat}_{lng}", []):
            if (stop_id not in excluded_stop_id_list and get_stops_haversine_distance(target_stop, stop_list[stop_id]) <= DISTANCE_THRESHOLD):
                bearings = stop_seq_mapping.get(stop_id, {}).get('bearings', [])
                if any(is_bearing_in_range(b) for b in bearings):
                    nearby_stops.append({
                        'id': stop_id,
                        'co': stop_seq_mapping.get(stop_id, {}).get('co', '')
                    })
        return nearby_stops

    stop_group = []
    stop_list_entries = search_nearby_stops(stop_id, [])

    # recursively search for nearby stops within thresholds (distance and bearing)
    # stop searching when no new stops are found within range, or when stop list is getting too large
    i = 0
    while i < len(stop_list_entries):
        entry = stop_list_entries[i]
        stop_group.append([entry['co'], entry['id']])
        i += 1
        if len(stop_list_entries) < STOP_LIST_LIMIT:
            stop_list_entries.extend(search_nearby_stops(entry['id'], [e['id'] for e in stop_list_entries]))

    # to reduce size of routeFareList.min.json, excl current stop_id from final output stopMap
    return [stop for stop in stop_group if stop[1] != stop_id]
    # return stop_group

def get_bearing(a, b):
    φ1 = math.radians(a['lat'])
    φ2 = math.radians(b['lat'])
    λ1 = math.radians(a['lng'])
    λ2 = math.radians(b['lng'])

    y = math.sin(λ2 - λ1) * math.cos(φ2)
    x = (math.cos(φ1) * math.sin(φ2) -
         math.sin(φ1) * math.cos(φ2) * math.cos(λ2 - λ1))
    θ = math.atan2(y, x)
    brng = (math.degrees(θ) + 360) % 360 # in degrees
    return brng
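# Sanity check for get_bearing: with a shared longitude, λ2 - λ1 = 0, so y = 0 and
# x = sin(φ2 - φ1); a point due north of `a` gives atan2(0, x > 0) = 0° and a point
# due south gives 180°, matching the compass convention (0° = north, 90° = east).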

def get_stop_bearings(route_stops):
    unique_routes = []
    bearings = []
    for route_stop in route_stops:
        if route_stop['bearing'] != -1:
            unique_route = f"{route_stop['co']}_{route_stop['routeKey'].split('+')[0]}_{route_stop['bearing']}"
            if unique_route not in unique_routes:
                unique_routes.append(unique_route)
                bearings.append(route_stop['bearing'])

    if not bearings:
        return []

    BEARING_THRESHOLD = 45 # in degrees
    BEARING_EPSILON = 10e-6 # very small number
    bearing_groups = []

    for bearing in bearings:
        if bearing == -1:
            continue
        if not bearing_groups:
            bearing_groups.append([bearing])
            continue

        for group in bearing_groups:
            if any(abs(b - bearing) < BEARING_EPSILON for b in group):
                break
            if any(abs(b - bearing) <= BEARING_THRESHOLD or abs(b - bearing) >= 360 - BEARING_THRESHOLD for b in group):
                group.append(bearing)
                break
        else:
            bearing_groups.append([bearing])

    if len(bearing_groups) == 1:
        return bearing_groups[0]

    longest_length = max(len(group) for group in bearing_groups)
    return [b for group in bearing_groups if len(group) == longest_length for b in group]

# Main function to process stops
def merge_stop_list():
    # Read the result from previous pipeline
    with open('routeFareList.mergeRoutes.min.json', 'r', encoding='UTF-8') as f:
        db = json.load(f)

    route_list = db['routeList']
    stop_list = db['stopList']
    start_time = time.time()
    stop_seq_mapping = {}

    # Preprocess the list of bearings for each stop
    for route_key, route_list_entry in route_list.items():
        stops = route_list_entry.get('stops', {})
        for co, co_stops in stops.items():
            for stop_pos, stop_id in enumerate(co_stops):
                if stop_id not in stop_seq_mapping:
                    stop_seq_mapping[stop_id] = {"routeStops": [], "co": co, "bearings": []}
                if stop_pos == len(co_stops) - 1:
                    stop_seq_mapping[stop_id]['routeStops'].append({
                        'routeKey': route_key,
                        'co': co,
                        'seq': stop_pos,
                        'bearing': -1
                    })
                else:
                    bearing = get_bearing(stop_list[stop_id]['location'], stop_list[co_stops[stop_pos + 1]]['location'])
                    stop_seq_mapping[stop_id]['routeStops'].append({
                        'routeKey': route_key,
                        'co': co,
                        'seq': stop_pos,
                        'bearing': bearing
                    })

    for stop_id in stop_seq_mapping.keys():
        stop_seq_mapping[stop_id]['bearings'] = get_stop_bearings(stop_seq_mapping[stop_id]['routeStops'])

    # Just dump the json in case of a need for trouble-shooting, but otherwise we do not need this file
    with open('stopMap.routeStopsSequence.json', 'w', encoding='UTF-8') as f:
        json.dump(stop_seq_mapping, f)

    logger.info(f'Processed routeStopsSequence in {(time.time() - start_time) * 1000:.2f}ms')

    # Preprocess stopList, organise stops into ~100m x ~100m squares to reduce size of nested loop later
    stop_list_grid = {}
    for stop_id, stop in stop_list.items():
        # take lat/lng up to 3 decimal places, that's about 100m x 100m square
        lat = int(stop['location']['lat'] * 1000)
        lng = int(stop['location']['lng'] * 1000)
        # add stop into the 9 grid boxes surrounding this stop
        grid = [
            f"{lat - 1}_{lng - 1}",
            f"{lat }_{lng - 1}",
            f"{lat + 1}_{lng - 1}",
            f"{lat - 1}_{lng }",
            f"{lat }_{lng }",
            f"{lat + 1}_{lng }",
            f"{lat - 1}_{lng + 1}",
            f"{lat }_{lng + 1}",
            f"{lat + 1}_{lng + 1}",
        ]
        for grid_id in grid:
            if grid_id not in stop_list_grid:
                stop_list_grid[grid_id] = []
            stop_list_grid[grid_id].append(stop_id)
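    # Each 0.001° cell is ~111 m tall and ~100 m wide at Hong Kong's latitude, so
    # registering every stop in its own cell plus the 8 neighbours means a single-cell
    # lookup in search_nearby_stops still sees all stops within DISTANCE_THRESHOLD (50 m).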

    target_stop_list = list(stop_list.items())
    stop_map = {}
    count = 0
    group_count = 0

    for stop_id, stop in target_stop_list:
        count += 1
        # if count % 1000 == 0:
        #     logger.info(f"Processed {count} stops ({group_count} groups) at {(time.time() - start_time) * 1000:.2f}ms")

        stop_group = get_stop_group(route_list, stop_list, stop_seq_mapping, stop_list_grid, stop_id)
        if len(stop_group) > 0:
            group_count += 1
            stop_map[stop_id] = stop_group

    logger.info(f"Processed {count} stops ({group_count} groups) at {(time.time() - start_time) * 1000:.2f}ms")

    with open('stopMap.json', 'w', encoding='UTF-8') as f:
        json.dump(stop_map, f, indent=4)

    db['stopMap'] = stop_map

    with open('routeFareList.json', 'w', encoding='UTF-8') as f:
        json.dump(db, f, indent=4)

    # reduce size of routeFareList.min.json by rounding lat/lng values to 5 decimal places
    # 5 d.p. is roughly one-metre accuracy, it is good enough for this project
    # saves around 50kb in size for 14,000 stops
    for stop_id, stop in target_stop_list:
        stop_list[stop_id]['location']['lat'] = float('%.5f' % (stop_list[stop_id]['location']['lat']))
        stop_list[stop_id]['location']['lng'] = float('%.5f' % (stop_list[stop_id]['location']['lng']))

    db['stopList'] = stop_list

    logger.info(f"Reduced location lat/lng to 5 d.p. at {(time.time() - start_time) * 1000:.2f}ms")

    with open('routeFareList.alpha.json', 'w', encoding='UTF-8') as f:
        json.dump(db, f, indent=4)

    with open('routeFareList.min.json', 'w', encoding='UTF-8') as f:
        json.dump(db, f)

    with open('routeFareList.alpha.min.json', 'w', encoding='UTF-8') as f:
        json.dump(db, f)

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    merge_stop_list()
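For context, the stopMap written above maps each stop ID to the nearby same-direction stops in its group as [company, stopId] pairs; a hypothetical entry (made-up IDs, for illustration only) looks like:

{
    "F7EC6A7D5B0E44C1": [
        ["ctb", "001234"],
        ["nlb", "5678"]
    ]
}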
4 changes: 2 additions & 2 deletions crawling/requirements.txt
@@ -3,7 +3,7 @@ certifi==2020.12.5
 cffi==1.15.0
 chardet==4.0.0
 cryptography==3.4.7
-haversine==2.3.0
+haversine>=2.3.0
 idna==2.10
 pycparser==2.20
 pyOpenSSL==20.0.1
@@ -12,7 +12,7 @@ PySocks==1.7.1
 six==1.15.0
 urllib3==1.26.4
 wheel==0.36.2
-pyproj==3.3.0
+pyproj>=3.6.1
 httpx==0.25.2
 xxhash==3.2.0
 -e .
