chore: Update Nashville MTA URLs for schedule / realtime (#314) #354

Workflow file for this run

name: Export catalogs to CSV
on:
  push:
    branches: [ main ]
  workflow_dispatch:
jobs:
  export-to-csv:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.9
        uses: actions/setup-python@v2
        with:
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest wheel numpy
          sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable
          sudo apt-get update
          sudo apt-get install gdal-bin python3-gdal
          sudo apt-get install libgdal-dev
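          # Build the GDAL Python bindings against the system libgdal installed above,
          # pinning to the version reported by gdal-config so the bindings match the library.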
          pip install GDAL==$(gdal-config --version) --global-option=build_ext --global-option="-I/usr/include/gdal"
          sudo apt-get install libspatialindex-dev
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Export the catalog of sources as CSV
        uses: jannekem/run-python-script-action@v1
        with:
          script: |
            import pandas as pd
            import os
            import json
            CSV_PATH = "./sources.csv"
            CSV_COLUMNS = [
                'mdb_source_id',
                'data_type',
                'entity_type',
                'location.country_code',
                'location.subdivision_name',
                'location.municipality',
                'provider',
                'name',
                'note',
                'static_reference',
                'urls.direct_download',
                'urls.authentication_type',
                'urls.authentication_info',
                'urls.api_key_parameter_name',
                'urls.latest',
                'urls.license',
                'location.bounding_box.minimum_latitude',
                'location.bounding_box.maximum_latitude',
                'location.bounding_box.minimum_longitude',
                'location.bounding_box.maximum_longitude',
                'location.bounding_box.extracted_on',
                'status',
                'features',
                'redirect.id',
                'redirect.comment'
            ]
            # tools.constants
            GTFS = "gtfs"
            GTFS_RT = "gtfs-rt"
            MDB_SOURCE_ID = "mdb_source_id"
            DATA_TYPE = "data_type"
            LOCATION = "location"
            COUNTRY_CODE = "country_code"
            SUBDIVISION_NAME = "subdivision_name"
            MUNICIPALITY = "municipality"
            STATIC_REFERENCE = "static_reference"
            ENTITY_TYPE = "entity_type"
            UNKNOWN = "unknown"
            URLS_AUTHENTICATION_TYPE = "urls.authentication_type"
            FEATURES = "features"
            REDIRECTS = "redirect"
            REDIRECT_ID = "redirect.id"
            REDIRECT_COMMENT = "redirect.comment"
            # tools.constants.GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT
            GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT = "catalogs/sources/gtfs/schedule"
            # tools.constants.GTFS_REALTIME_CATALOG_PATH_FROM_ROOT
            GTFS_REALTIME_CATALOG_PATH_FROM_ROOT = "catalogs/sources/gtfs/realtime"
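            # Each source is stored as its own JSON file under these directories,
            # so the walk below loads one entity per file.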
            # tools.operations.get_sources
            gtfs_schedule_catalog_path = os.path.join(".", GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT)
            gtfs_realtime_catalog_path = os.path.join(".", GTFS_REALTIME_CATALOG_PATH_FROM_ROOT)
            catalog = {}
            for catalog_path in [gtfs_schedule_catalog_path, gtfs_realtime_catalog_path]:
                for path, sub_dirs, files in os.walk(catalog_path):
                    for file in files:
                        with open(os.path.join(path, file)) as fp:
                            entity_json = json.load(fp)
                            entity_id = entity_json[MDB_SOURCE_ID]
                            catalog[entity_id] = entity_json
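            # catalog now maps each mdb_source_id to its raw JSON entry,
            # covering both the schedule and realtime catalogs.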
            # Complete the GTFS Realtime sources with location information from their
            # static reference, and pipe-delimit the static reference and entity type lists.
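            # For example, a realtime source with static_reference [123, 456] (hypothetical
            # ids) inherits the location of source 123 and is flattened to "123|456".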
            for source_id, source in catalog.items():
                if source.get(DATA_TYPE) == GTFS_RT:
                    if len(source.get(STATIC_REFERENCE, [])) > 0:
                        if catalog.get(source.get(STATIC_REFERENCE)[0], {}).get(LOCATION) is not None:
                            source[LOCATION] = catalog.get(source.get(STATIC_REFERENCE)[0], {}).get(LOCATION)
                        source[STATIC_REFERENCE] = "|".join([str(ref_id) for ref_id in source.get(STATIC_REFERENCE)])
                    else:
                        source[LOCATION] = {COUNTRY_CODE: UNKNOWN, SUBDIVISION_NAME: UNKNOWN, MUNICIPALITY: UNKNOWN}
                    source[ENTITY_TYPE] = "|".join(source.get(ENTITY_TYPE))
                if len(source.get(FEATURES, [])) > 0:
                    source[FEATURES] = "|".join(source.get(FEATURES))
                # For redirects, allow strings or integers
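                # e.g. "redirect": [{"id": 42, "comment": "merged"}] (hypothetical values)
                # becomes redirect.id "42" and redirect.comment "merged".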
                redirects = source.pop(REDIRECTS, [])
                # Extract ids and comments
                ids = []
                comments = []
                for item in redirects:
                    ids.append(str(item["id"]))  # Convert to string since we allow integer ids
                    comments.append(item.get("comment", ""))  # Default to an empty string if "comment" is missing
                # Join the ids and comments with '|' as separator
                ids_str = "|".join(ids)
                comments_str = "|".join(comments) if comments else ""
                source[REDIRECT_ID] = ids_str
                source[REDIRECT_COMMENT] = comments_str
                catalog[source_id] = source
            # Sort the catalog by source id and convert it to a list
            catalog = list(dict(sorted(catalog.items())).values())
            # tools.helpers.to_csv
            path = CSV_PATH
            columns = CSV_COLUMNS
            catalog = pd.json_normalize(catalog)
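            # json_normalize flattens nested objects into dotted column names,
            # e.g. location.country_code, matching CSV_COLUMNS above.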
            tmp = pd.DataFrame()
            for column in columns:
                if column in catalog:
                    tmp[column] = catalog[column]
            catalog = tmp
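            # 'Int64' (capital I) is pandas' nullable integer dtype; it keeps missing
            # authentication types as <NA> instead of coercing the column to float.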
            if URLS_AUTHENTICATION_TYPE in catalog:
                catalog[URLS_AUTHENTICATION_TYPE] = catalog[URLS_AUTHENTICATION_TYPE].astype('Int64')
            catalog.to_csv(path, sep=",", index=False)
      - name: Upload the catalog of sources CSV artifact
        uses: actions/upload-artifact@v1
        with:
          name: sources.csv
          path: sources.csv
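  # This job downloads the artifact produced by export-to-csv and uploads it to Cloud Storage.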
  store-csv:
    needs: [ export-to-csv ]
    runs-on: ubuntu-latest
    # Don't upload to Google Cloud if it was triggered by hand.
    # In that case the resulting sources.csv can be found in the build artifacts.
    if: ${{ github.event_name != 'workflow_dispatch' }}
    steps:
      - uses: actions/checkout@v2
      - name: Download the catalog of sources CSV artifact
        uses: actions/download-artifact@v1
        with:
          name: sources.csv
          path: sources.csv
      - name: Set up and authorize Cloud
        uses: google-github-actions/auth@v0
        with:
          credentials_json: ${{ secrets.ARCHIVE_DATASET_SA_KEY }}
      - name: Upload csv to Google Cloud Storage
        id: upload-csv
        uses: google-github-actions/upload-cloud-storage@v0
        with:
          path: sources.csv
          destination: mdb-csv
          parent: false