Flex status update [SOURCES] #1011
Workflow file for this run
name: Direct download URLs test for sources
on:
  pull_request:
    branches: [ main ]
    paths-ignore:
      - 'scripts/**'
      - '.github/workflows/add_new_or_updated_feeds.yml'
      - '.github/workflows/**'
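# Two-stage pipeline: get-urls builds a job matrix from the changed GTFS
# schedule source files, then download-datasets fans out over that matrix
# to fetch and validate each feed.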
jobs:
  get-urls:
    if: contains(github.event.pull_request.title, '[SOURCES]')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install wheel numpy
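          # numpy is used by the matrix step below (np.array_split).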
      - name: Get added and modified files
        id: files
        uses: jitterbit/get-changed-files@v1
      - name: Create the direct download URLs matrix
        shell: python
        run: |
          import os
          import json
          import numpy as np

          # OS constants
          ROOT = os.getcwd()
          GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT = "catalogs/sources/gtfs/schedule"
          MATRIX_FILE = "urls_matrix.json"

          # File constants
          URLS = "urls"
          DIRECT_DOWNLOAD = "direct_download"
          AUTHENTICATION_TYPE = "authentication_type"
          API_KEY_PARAMETER_NAME = "api_key_parameter_name"

          # GitHub constants
          MAX_JOB_NUMBER = 256

          # Matrix constants
          INCLUDE = "include"
          DATA = "data"
          BASE = "base"
changed_files = "${{ steps.files.outputs.added_modified }}".split() | |
changed_files = [file for file in changed_files if GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT in file] | |
urls = {} | |
for file in changed_files: | |
base = os.path.splitext(os.path.basename(file))[0] | |
with open(os.path.join(ROOT, file), "r") as fp: | |
file_json = json.load(fp) | |
direct_download_url = file_json.get(URLS, {}).get(DIRECT_DOWNLOAD) | |
if direct_download_url is None: | |
raise ValueError(f"{base}: Missing direct download url.") | |
authentication_type = file_json.get(URLS, {}).get(AUTHENTICATION_TYPE) | |
api_key_parameter_name = file_json.get(URLS, {}).get(API_KEY_PARAMETER_NAME) | |
urls[base] = { | |
DIRECT_DOWNLOAD: direct_download_url, | |
AUTHENTICATION_TYPE: authentication_type, | |
API_KEY_PARAMETER_NAME: api_key_parameter_name | |
} | |
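          # Split the sources into at most MAX_JOB_NUMBER chunks; GitHub Actions
          # caps a matrix at 256 jobs, so a chunk may carry more than one source.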
          urls_data = []
          jobs = np.array_split(list(urls.keys()), min(MAX_JOB_NUMBER, len(urls)))
          jobs = [list(job) for job in jobs]
          for job in jobs:
              urls_data_string = ""
              while len(job) > 0:
                  file_base = job.pop()
                  file_information = {
                      BASE: file_base,
                      DIRECT_DOWNLOAD: urls[file_base][DIRECT_DOWNLOAD],
                      AUTHENTICATION_TYPE: urls[file_base][AUTHENTICATION_TYPE],
                      API_KEY_PARAMETER_NAME: urls[file_base][API_KEY_PARAMETER_NAME],
                  }
                  urls_data_string = urls_data_string + json.dumps(
                      file_information, separators=(",", ":")
                  )
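              # The compact JSON objects are concatenated back to back; turn each
              # "}{" boundary into "} {" so the download job can split the string
              # back into individual objects with str.split().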
              job_data = {DATA: urls_data_string.replace("}{", "} {")}
              urls_data.append(job_data)
          matrix_data = {INCLUDE: urls_data}
          with open(os.path.join(ROOT, MATRIX_FILE), "w") as fp:
              json.dump(matrix_data, fp)
      - name: Set URLs matrix
        id: set-matrix
        run: |
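          # Compact the matrix JSON onto a single line; a value written to
          # $GITHUB_OUTPUT with the simple key=value form must not contain newlines.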
          DATASETS=$(jq -c . ./urls_matrix.json)
          echo "$DATASETS"
          echo "matrix=$DATASETS" >> "$GITHUB_OUTPUT"
      - name: Persist URLs matrix artifact
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: urls_matrix
          path: ./urls_matrix.json
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
  download-datasets:
    name: Validate and download - ${{ fromJson(matrix.data).base }}
    needs: [ get-urls ]
    runs-on: ubuntu-latest
    strategy:
      matrix: ${{ fromJson(needs.get-urls.outputs.matrix) }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest wheel numpy
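          # GDAL and libspatialindex are system-level dependencies of the
          # geospatial Python packages (e.g. geopandas, rtree) that the
          # requirements install below pulls in.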
          sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable
          sudo apt-get update
          sudo apt-get install gdal-bin python3-gdal
          sudo apt-get install libgdal-dev
          pip install GDAL==$(gdal-config --version) --global-option=build_ext --global-option="-I/usr/include/gdal"
          sudo apt-get install libspatialindex-dev
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Validate and download the datasets
        shell: python
        env:
          API_SOURCE_SECRETS: ${{ secrets.API_SOURCE_SECRETS }}
        run: |
          import os
          import json
          import gtfs_kit
          import requests

          # OS constants
          ROOT = os.getcwd()
          DATASETS = "datasets"

          # Jobs constants
          BASE = "base"
          DIRECT_DOWNLOAD = "direct_download"
          AUTHENTICATION_TYPE = "authentication_type"
          API_KEY_PARAMETER_NAME = "api_key_parameter_name"

          # Secrets constants
          API_SOURCE_SECRETS = "API_SOURCE_SECRETS"

          # Requests constants
          USER_AGENT = "User-Agent"
          # Use a fixed default user agent instead of a library like fake-useragent
          # so that every run sends the same user agent, for reproducibility.
          DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
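          # matrix.data is the space-separated string of compact JSON objects
          # built by the get-urls job; split it back into one entry per source.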
jobs = """${{ matrix.data }}""".split() | |
for job in jobs: | |
job_json = json.loads(job) | |
base = job_json[BASE] | |
url = job_json[DIRECT_DOWNLOAD] | |
authentication_type = job_json[AUTHENTICATION_TYPE] | |
api_key_parameter_name = job_json[API_KEY_PARAMETER_NAME] | |
params = {} | |
headers = {USER_AGENT: DEFAULT_USER_AGENT} | |
if authentication_type in [1, 2]: | |
# Load API source secrets only if authentication is required | |
# This will allow users to add or update | |
# sources that do not require authentication via forks. | |
api_source_secrets = json.loads(os.environ[API_SOURCE_SECRETS]) | |
api_key_parameter_value = api_source_secrets.get(base) | |
if authentication_type == 1: | |
params[api_key_parameter_name] = api_key_parameter_value | |
elif authentication_type == 2: | |
headers[api_key_parameter_name] = api_key_parameter_value | |
# Download the dataset | |
zip_path = os.path.join(ROOT, DATASETS, f"{base}.zip") | |
os.makedirs(os.path.dirname(zip_path), exist_ok=True) | |
try: | |
zip_file_req = requests.get(url, params=params, headers=headers, allow_redirects=True) | |
zip_file_req.raise_for_status() | |
except Exception as e: | |
raise Exception( | |
f"{base}: Exception {e} occurred when downloading the URL {url}.\n" | |
) | |
zip_file = zip_file_req.content | |
with open(zip_path, "wb") as f: | |
f.write(zip_file) | |
# Make sure that the dataset is a readable GTFS Schedule dataset | |
try: | |
gtfs_kit.read_feed(zip_path, dist_units="km") | |
except Exception as e: | |
# Delete the zip file if an exception occurs | |
os.remove(zip_path) | |
raise Exception( | |
f"{base}: Exception {e} found while parsing the GTFS Schedule dataset with the GTFS Kit library." | |
f"The dataset for the source must be a valid GTFS Schedule zip file or URL.\n" | |
) | |
      - name: Persist datasets artifact
        uses: actions/upload-artifact@v4
        with:
          name: datasets-${{ fromJson(matrix.data).base }}
          path: datasets
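
For reference, a minimal sketch of the source file shape the get-urls script
reads (the field names match the constants above; the file name, URL, and
parameter name are hypothetical):

    catalogs/sources/gtfs/schedule/example-transit-agency.json
    {
      "urls": {
        "direct_download": "https://example.com/gtfs/feed.zip",
        "authentication_type": 1,
        "api_key_parameter_name": "api_key"
      }
    }

With that single changed source, get-urls would write a urls_matrix.json along
the lines of:

    {"include": [{"data": "{\"base\":\"example-transit-agency\",\"direct_download\":\"https://example.com/gtfs/feed.zip\",\"authentication_type\":1,\"api_key_parameter_name\":\"api_key\"}"}]}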