chore: Update Nashville MTA URLs for schedule / realtime (#314) #354

Workflow file for this run

name: Export catalogs to CSV
on:
  push:
    branches: [ main ]
  workflow_dispatch:
jobs:
  export-to-csv:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.9
        uses: actions/setup-python@v2
        with:
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest wheel numpy
          sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable
          sudo apt-get update
          sudo apt-get install gdal-bin python3-gdal
          sudo apt-get install libgdal-dev
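          # Build the GDAL Python bindings against the system libgdal installed above,
          # pinning to the version reported by gdal-config so the bindings match the library.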
          pip install GDAL==$(gdal-config --version) --global-option=build_ext --global-option="-I/usr/include/gdal"
          sudo apt-get install libspatialindex-dev
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Export the catalog of sources as CSV
        uses: jannekem/run-python-script-action@v1
        with:
          script: |
            import pandas as pd
            import os
            import json
            CSV_PATH = "./sources.csv"
            CSV_COLUMNS = [
                'mdb_source_id',
                'data_type',
                'entity_type',
                'location.country_code',
                'location.subdivision_name',
                'location.municipality',
                'provider',
                'name',
                'note',
                'static_reference',
                'urls.direct_download',
                'urls.authentication_type',
                'urls.authentication_info',
                'urls.api_key_parameter_name',
                'urls.latest',
                'urls.license',
                'location.bounding_box.minimum_latitude',
                'location.bounding_box.maximum_latitude',
                'location.bounding_box.minimum_longitude',
                'location.bounding_box.maximum_longitude',
                'location.bounding_box.extracted_on',
                'status',
                'features',
                'redirect.id',
                'redirect.comment'
            ]
            # tools.constants
            GTFS = "gtfs"
            GTFS_RT = "gtfs-rt"
            MDB_SOURCE_ID = "mdb_source_id"
            DATA_TYPE = "data_type"
            LOCATION = "location"
            COUNTRY_CODE = "country_code"
            SUBDIVISION_NAME = "subdivision_name"
            MUNICIPALITY = "municipality"
            STATIC_REFERENCE = "static_reference"
            ENTITY_TYPE = "entity_type"
            UNKNOWN = "unknown"
            URLS_AUTHENTICATION_TYPE = "urls.authentication_type"
            FEATURES = "features"
            REDIRECTS = "redirect"
            REDIRECT_ID = "redirect.id"
            REDIRECT_COMMENT = "redirect.comment"
            # tools.constants.GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT
            GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT = "catalogs/sources/gtfs/schedule"
            # tools.constants.GTFS_REALTIME_CATALOG_PATH_FROM_ROOT
            GTFS_REALTIME_CATALOG_PATH_FROM_ROOT = "catalogs/sources/gtfs/realtime"
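            # Each source is stored as its own JSON file under these directories,
            # so the walk below loads one entity per file.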
            # tools.operations.get_sources
            gtfs_schedule_catalog_path = os.path.join(".", GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT)
            gtfs_realtime_catalog_path = os.path.join(".", GTFS_REALTIME_CATALOG_PATH_FROM_ROOT)
            catalog = {}
            for catalog_path in [gtfs_schedule_catalog_path, gtfs_realtime_catalog_path]:
                for path, sub_dirs, files in os.walk(catalog_path):
                    for file in files:
                        with open(os.path.join(path, file)) as fp:
                            entity_json = json.load(fp)
                            entity_id = entity_json[MDB_SOURCE_ID]
                            catalog[entity_id] = entity_json
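            # catalog now maps each mdb_source_id to its raw JSON entry,
            # covering both the schedule and realtime catalogs.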
            # Complete the GTFS Realtime sources with location information from their
            # static reference, and pipe-delimit the static reference and entity type lists.
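            # For example, a realtime source with static_reference [123, 456] (hypothetical
            # ids) inherits the location of source 123 and is flattened to "123|456".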
            for source_id, source in catalog.items():
                if source.get(DATA_TYPE) == GTFS_RT:
                    if len(source.get(STATIC_REFERENCE, [])) > 0:
                        if catalog.get(source.get(STATIC_REFERENCE)[0], {}).get(LOCATION) is not None:
                            source[LOCATION] = catalog.get(source.get(STATIC_REFERENCE)[0], {}).get(LOCATION)
                        source[STATIC_REFERENCE] = "|".join([str(ref_id) for ref_id in source.get(STATIC_REFERENCE)])
                    else:
                        source[LOCATION] = {COUNTRY_CODE: UNKNOWN, SUBDIVISION_NAME: UNKNOWN, MUNICIPALITY: UNKNOWN}
                    source[ENTITY_TYPE] = "|".join(source.get(ENTITY_TYPE))
                if len(source.get(FEATURES, [])) > 0:
                    source[FEATURES] = "|".join(source.get(FEATURES))
                # For redirects, allow strings or integers
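                # e.g. "redirect": [{"id": 42, "comment": "merged"}] (hypothetical values)
                # becomes redirect.id "42" and redirect.comment "merged".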
                redirects = source.pop(REDIRECTS, [])
                # Extract ids and comments
                ids = []
                comments = []
                for item in redirects:
                    ids.append(str(item["id"]))  # Convert to string since we allow integer ids
                    comments.append(item.get("comment", ""))  # Default to an empty string if "comment" is missing
                # Join the ids and comments with '|' as separator
                ids_str = "|".join(ids)
                comments_str = "|".join(comments) if comments else ""
                source[REDIRECT_ID] = ids_str
                source[REDIRECT_COMMENT] = comments_str
                catalog[source_id] = source
            # Sort the catalog by source id and convert it to a list
            catalog = list(dict(sorted(catalog.items())).values())
            # tools.helpers.to_csv
            path = CSV_PATH
            columns = CSV_COLUMNS
            catalog = pd.json_normalize(catalog)
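            # json_normalize flattens nested objects into dotted column names,
            # e.g. location.country_code, matching CSV_COLUMNS above.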
            tmp = pd.DataFrame()
            for column in columns:
                if column in catalog:
                    tmp[column] = catalog[column]
            catalog = tmp
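            # 'Int64' (capital I) is pandas' nullable integer dtype; it keeps missing
            # authentication types as <NA> instead of coercing the column to float.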
            if URLS_AUTHENTICATION_TYPE in catalog:
                catalog[URLS_AUTHENTICATION_TYPE] = catalog[URLS_AUTHENTICATION_TYPE].astype('Int64')
            catalog.to_csv(path, sep=",", index=False)
      - name: Upload the catalog of sources CSV artifact
        uses: actions/upload-artifact@v1
        with:
          name: sources.csv
          path: sources.csv
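  # This job downloads the artifact produced by export-to-csv and uploads it to Cloud Storage.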
  store-csv:
    needs: [ export-to-csv ]
    runs-on: ubuntu-latest
    # Don't upload to Google Cloud if it was triggered by hand.
    # In that case the resulting sources.csv can be found in the build artifacts.
    if: ${{ github.event_name != 'workflow_dispatch' }}
    steps:
      - uses: actions/checkout@v2
      - name: Download the catalog of sources CSV artifact
        uses: actions/download-artifact@v1
        with:
          name: sources.csv
          path: sources.csv
      - name: Set up and authorize Cloud
        uses: google-github-actions/auth@v0
        with:
          credentials_json: ${{ secrets.ARCHIVE_DATASET_SA_KEY }}
      - name: Upload csv to Google Cloud Storage
        id: upload-csv
        uses: google-github-actions/upload-cloud-storage@v0
        with:
          path: sources.csv
          destination: mdb-csv
          parent: false