# export_to_csv.yml
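# Exports the GTFS Schedule and GTFS Realtime source catalogs to a single
# sources.csv on every push to main (or on manual dispatch), publishes it as
# a build artifact, and, for pushes only, copies it to the mdb-csv Google
# Cloud Storage bucket.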
name: Export catalogs to CSV
on:
  push:
    branches: [ main ]
  workflow_dispatch:
jobs:
  export-to-csv:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.9
        uses: actions/setup-python@v2
        with:
          python-version: 3.9
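      # GDAL is installed from the ubuntugis-unstable PPA so the system
      # library, gdal-config, and the pip-built Python bindings all match.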
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest wheel numpy
          sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable
          sudo apt-get update
          sudo apt-get install gdal-bin python3-gdal
          sudo apt-get install libgdal-dev
          pip install GDAL==$(gdal-config --version) --global-option=build_ext --global-option="-I/usr/include/gdal"
          sudo apt-get install libspatialindex-dev
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
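      # The script below inlines logic from the repo's tools.constants,
      # tools.operations.get_sources, and tools.helpers.to_csv modules (see
      # the section comments) so it can run as a standalone snippet.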
      - name: Export the catalog of sources as CSV
        uses: jannekem/run-python-script-action@v1
        with:
          script: |
            import pandas as pd
            import os
            import json
            CSV_PATH = "./sources.csv"
            CSV_COLUMNS = [
                'mdb_source_id',
                'data_type',
                'entity_type',
                'location.country_code',
                'location.subdivision_name',
                'location.municipality',
                'provider',
                'name',
                'note',
                'static_reference',
                'urls.direct_download',
                'urls.authentication_type',
                'urls.authentication_info',
                'urls.api_key_parameter_name',
                'urls.latest',
                'urls.license',
                'location.bounding_box.minimum_latitude',
                'location.bounding_box.maximum_latitude',
                'location.bounding_box.minimum_longitude',
                'location.bounding_box.maximum_longitude',
                'location.bounding_box.extracted_on',
                'status',
                'features',
                'redirect.id',
                'redirect.comment'
            ]
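            # Column names use the dot notation that pd.json_normalize
            # produces when flattening the nested source JSON,
            # e.g. location.country_code.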
            # tools.constants
            GTFS = "gtfs"
            GTFS_RT = "gtfs-rt"
            MDB_SOURCE_ID = "mdb_source_id"
            DATA_TYPE = "data_type"
            LOCATION = "location"
            COUNTRY_CODE = "country_code"
            SUBDIVISION_NAME = "subdivision_name"
            MUNICIPALITY = "municipality"
            STATIC_REFERENCE = "static_reference"
            ENTITY_TYPE = "entity_type"
            UNKNOWN = "unknown"
            URLS_AUTHENTICATION_TYPE = "urls.authentication_type"
            FEATURES = "features"
            REDIRECTS = "redirect"
            REDIRECT_ID = "redirect.id"
            REDIRECT_COMMENT = "redirect.comment"
            # tools.constants.GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT
            GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT = "catalogs/sources/gtfs/schedule"
            # tools.constants.GTFS_REALTIME_CATALOG_PATH_FROM_ROOT
            GTFS_REALTIME_CATALOG_PATH_FROM_ROOT = "catalogs/sources/gtfs/realtime"
            # tools.operations.get_sources
            gtfs_schedule_catalog_path = os.path.join(".", GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT)
            gtfs_realtime_catalog_path = os.path.join(".", GTFS_REALTIME_CATALOG_PATH_FROM_ROOT)
            catalog = {}
            for catalog_path in [gtfs_schedule_catalog_path, gtfs_realtime_catalog_path]:
                for path, sub_dirs, files in os.walk(catalog_path):
                    for file in files:
                        with open(os.path.join(path, file)) as fp:
                            entity_json = json.load(fp)
                            entity_id = entity_json[MDB_SOURCE_ID]
                            catalog[entity_id] = entity_json
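            # Each source lives in its own JSON file under the catalog
            # directories, e.g. (hypothetical values)
            #   {"mdb_source_id": 1, "data_type": "gtfs", "location": {...}, ...}
            # so catalog maps mdb_source_id -> parsed source JSON.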
            # Complete the GTFS Realtime Sources: location information from their static reference
            # and pipe delimited static reference and entity type
            for source_id, source in catalog.items():
                if source.get(DATA_TYPE) == GTFS_RT:
                    if len(source.get(STATIC_REFERENCE, [])) > 0:
                        if catalog.get(source.get(STATIC_REFERENCE)[0], {}).get(LOCATION) is not None:
                            source[LOCATION] = catalog.get(source.get(STATIC_REFERENCE)[0], {}).get(LOCATION)
                        source[STATIC_REFERENCE] = "|".join([str(ref_id) for ref_id in source.get(STATIC_REFERENCE)])
                    else:
                        source[LOCATION] = {COUNTRY_CODE: UNKNOWN, SUBDIVISION_NAME: UNKNOWN, MUNICIPALITY: UNKNOWN}
                    source[ENTITY_TYPE] = "|".join(source.get(ENTITY_TYPE))
                if len(source.get(FEATURES, [])) > 0:
                    source[FEATURES] = "|".join(source.get(FEATURES))
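                # Example (hypothetical values): redirect = [{"id": 10, "comment": "split"}, {"id": "11"}]
                # flattens below to redirect.id "10|11" and redirect.comment "split|".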
                # For redirects, allow strings or integers
                redirects = source.pop(REDIRECTS, [])
                # Extract ids and comments
                ids = []
                comments = []
                for item in redirects:
                    ids.append(str(item["id"]))  # Convert to string since we allow integer ids
                    comments.append(item.get("comment", ""))  # Default to an empty string if "comment" is missing
                # Join the ids and comments with '|' as separator
                ids_str = "|".join(ids)
                comments_str = "|".join(comments) if comments else ""
                source[REDIRECT_ID] = ids_str
                source[REDIRECT_COMMENT] = comments_str
                catalog[source_id] = source
            # Sort the catalog by source id and convert it to a list
            catalog = list(dict(sorted(catalog.items())).values())
            # tools.helpers.to_csv
            path = CSV_PATH
            columns = CSV_COLUMNS
            catalog = pd.json_normalize(catalog)
            tmp = pd.DataFrame()
            for column in columns:
                if column in catalog:
                    tmp[column] = catalog[column]
            catalog = tmp
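            # 'Int64' is the pandas nullable integer dtype: sources without an
            # authentication type stay blank in the CSV instead of coercing
            # the whole column to floats (e.g. 1.0).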
            if URLS_AUTHENTICATION_TYPE in catalog:
                catalog[URLS_AUTHENTICATION_TYPE] = catalog[URLS_AUTHENTICATION_TYPE].astype('Int64')
            catalog.to_csv(path, sep=",", index=False)
      - name: Upload the catalog of sources CSV artifact
        uses: actions/upload-artifact@v1
        with:
          name: sources.csv
          path: sources.csv
  store-csv:
    needs: [ export-to-csv ]
    runs-on: ubuntu-latest
    # Don't upload to Google Cloud if it was triggered by hand.
    # In that case the resulting sources.csv can be found in the build artifacts.
    if: ${{ github.event_name != 'workflow_dispatch' }}
    steps:
      - uses: actions/checkout@v2
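      # download-artifact@v1 treats `path` as a destination directory, so the
      # file lands at sources.csv/sources.csv; `parent: false` on the upload
      # step below keeps that directory name out of the bucket path.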
      - name: Download the catalog of sources CSV artifact
        uses: actions/download-artifact@v1
        with:
          name: sources.csv
          path: sources.csv
      - name: Set up and authorize Cloud
        uses: google-github-actions/auth@v0
        with:
          credentials_json: ${{ secrets.ARCHIVE_DATASET_SA_KEY }}
      - name: Upload csv to Google Cloud Storage
        id: upload-csv
        uses: google-github-actions/upload-cloud-storage@v0
        with:
          path: sources.csv
          destination: mdb-csv
          parent: false