From 83dbf1e52ad9db184fb14e8874b2bf398c218474 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Simard?=
Date: Mon, 2 Oct 2023 21:16:35 -0400
Subject: [PATCH] feat : adding GH action to automatically fetch feeds sent by providers (#304)

* First commit

* Create add_new_or_updated_feeds.yml
---
Reviewer note: the file contents below were amended during review, so the blob
ids on the `index` lines no longer describe them. Regenerate the patch with
`git format-patch` if exact ids are needed.

 .../workflows/add_new_or_updated_feeds.yml    | 135 ++++++++++++
 scripts/process_csv_in_github_action.swift    | 205 ++++++++++++++++++
 2 files changed, 340 insertions(+)
 create mode 100644 .github/workflows/add_new_or_updated_feeds.yml
 create mode 100644 scripts/process_csv_in_github_action.swift

diff --git a/.github/workflows/add_new_or_updated_feeds.yml b/.github/workflows/add_new_or_updated_feeds.yml
new file mode 100644
index 00000000..53815632
--- /dev/null
+++ b/.github/workflows/add_new_or_updated_feeds.yml
@@ -0,0 +1,135 @@
+# Daily job: runs scripts/process_csv_in_github_action.swift on the feed-request CSV, executes the
+# tools.operations calls it prints and commits the result on a branch named after today's date.
+name: Add new or updated feeds from Google Sheets/Form
+
+on:
+  schedule:
+    # NOTE(review): the Swift script only keeps rows whose timestamp date equals the runner's current
+    # date; confirm the sheet's time zone and this run time actually let submitted rows match.
+    - cron: '0 4 * * *' # 04:00 UTC every day, i.e. 00:00 EDT (UTC-4)
+
+env:
+  DATE_FORMAT: "[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}|[0-9]{4}-[0-9]{2}-[0-9]{2}" # this is the format we need to compare dates between the CSV and the local system.
+  DATE_FORMAT_DESIRED: "MM/dd/yyyy"
+
+  USERNAME: "github-actions[bot]" # GitHub username that will create the PR
+  USERNAME_EMAIL: "41898282+github-actions[bot]@users.noreply.github.com"
+
+  ORGANIZATION: MobilityData # organization name
+  REPO_NAME: mobility-database-catalogs # repository name
+  BASE: "main"
+
+  REVIEWERS_JSON: "[\"emmambd\"]" # List of GitHub usernames of the reviewers, in a JSON array : ["username1", "username2"]
+
+  GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT: "catalogs/sources/gtfs/schedule/"
+  GTFS_REALTIME_CATALOG_PATH_FROM_ROOT: "catalogs/sources/gtfs/realtime/"
+
+jobs:
+  add-new-updated-feeds:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Setup global variables
+        id: global_vars
+        run: |
+          echo "TODAYS_DATE=$(date +%m/%d/%Y)" >> $GITHUB_ENV # Ex.: 07/27/2023
+          echo "TODAYS_DAY=$(date '+%d')" >> $GITHUB_ENV # Ex.: 27
+          echo "TODAYS_MONTH=$(date '+%m')" >> $GITHUB_ENV # Ex.: 07
+          echo "TODAYS_YEAR=$(date '+%Y')" >> $GITHUB_ENV # Ex.: 2023
+
+      - name: Create branch name
+        id: create_branch_name
+        run: |
+          echo "BRANCH=${{ env.TODAYS_YEAR }}-${{ env.TODAYS_MONTH }}-${{ env.TODAYS_DAY }}" >> $GITHUB_OUTPUT # Branch name
+
+      - name: Load secrets from 1Password
+        id: onepw_secrets
+        uses: 1password/load-secrets-action@v1.3.1
+        with:
+          export-env: true # Export loaded secrets as environment variables
+        env:
+          OP_SERVICE_ACCOUNT_TOKEN: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}
+          CREDENTIALS: "op://rbiv7rvkkrsdlpcrz3bmv7nmcu/ifkeehu5gzi7wy5ub5qvwkaire/credential"
+
+      # - name: Get swift version # just for verification, can leave commented out
+      #   run: swift --version
+      # Swift is installed by default on github runners, tested with v 5.8.1
+
+      - name: Checkout repo
+        id: checkout_repo
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ env.BASE }}
+          fetch-depth: 0
+          token: ${{ env.CREDENTIALS }}
+
+      - name: Create new branch
+        shell: bash
+        run: |
+          git checkout -b ${{ steps.create_branch_name.outputs.BRANCH }}
+          git reset --hard ${{ env.BASE }}
+
+      - name: Download CSV and process each lines
+        id: process-csv
+        run: |
+          cd ${{ github.workspace }}/scripts
+          OUTPUT=$(swift process_csv_in_github_action.swift "${{ secrets.CSV_URL }}" "${{ env.DATE_FORMAT }}" "${{ env.DATE_FORMAT_DESIRED }}")
+          echo "PYTHON_SCRIPT_ARGS=${OUTPUT}" >> $GITHUB_OUTPUT
+
+      - name: Setup Python
+        if: steps.process-csv.outputs.PYTHON_SCRIPT_ARGS != ''
+        uses: actions/setup-python@v4.7.0
+        with:
+          python-version: '3.11' # install the python version needed
+
+      - name: Create + activate a Python virtual env & run script
+        if: steps.process-csv.outputs.PYTHON_SCRIPT_ARGS != ''
+        env:
+          PYTHONPATH: ${{ github.workspace }}/tools
+          PYTHONIOENCODING: "utf8" #ascii
+          # Passed through the environment instead of being pasted into the script text: the value is
+          # built from form-submitted CSV cells, so a quote in a cell must not end a shell string.
+          PYTHON_SCRIPT_ARGS: ${{ steps.process-csv.outputs.PYTHON_SCRIPT_ARGS }}
+        shell: bash
+        run: |
+          python -m venv env
+          source env/bin/activate
+          pip install virtualenv --quiet
+          pip install gtfs_kit --quiet
+          pip install unidecode --quiet
+
+          # The calls are joined with § because a plain KEY=value line in $GITHUB_OUTPUT cannot hold a newline.
+          # Split them back into a real bash array, one Python call per element.
+          mapfile -t sections < <(printf '%s\n' "$PYTHON_SCRIPT_ARGS" | awk -F'§' '{for (i=1; i<=NF; i++) print $i}')
+          for section in "${sections[@]}"; do
+              [ -n "$section" ] || continue
+              # No eval: the call text reaches Python as one argument and is never re-parsed by the shell.
+              python -c "from tools.operations import *; ${section}"
+          done
+
+      - name: Commit & push
+        if: steps.process-csv.outputs.PYTHON_SCRIPT_ARGS != ''
+        uses: EndBug/add-and-commit@v9.1.3
+        with:
+          github_token: ${{ env.CREDENTIALS }}
+          new_branch: ${{ steps.create_branch_name.outputs.BRANCH }}
+          author_name: ${{ env.USERNAME }}
+          author_email: ${{ env.USERNAME_EMAIL }}
+          committer_name: ${{ env.USERNAME }}
+          committer_email: ${{ env.USERNAME_EMAIL }}
+          message: "Automated commit — New/Updated feed"
+
+      # - name: Create Pull Request
+      #   if: steps.process-csv.outputs.PYTHON_SCRIPT_ARGS != ''
+      #   uses: peter-evans/create-pull-request@v5.0.2
+      #   with:
+      #     token: ${{ env.CREDENTIALS }}
+      #     title: "Automated commit — New/Updated feed"
+      #     commit-message: "Automated commit — New/Updated feed"
+      #     body: "New feed(s) were found, and added as a PR for you to review."
+      #     author: "${{ env.USERNAME }} <${{ env.USERNAME_EMAIL }}>"
+      #     reviewers: ${{ env.REVIEWERS_JSON }}
+      #     branch: ${{ steps.create_branch_name.outputs.BRANCH }}
+      #     base: ${{ env.BASE }}
+      #     add-paths: |
+      #         ${{ env.GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT }}
+      #         ${{ env.GTFS_REALTIME_CATALOG_PATH_FROM_ROOT }}
\ No newline at end of file
diff --git a/scripts/process_csv_in_github_action.swift b/scripts/process_csv_in_github_action.swift
new file mode 100644
index 00000000..fc4d89df
--- /dev/null
+++ b/scripts/process_csv_in_github_action.swift
@@ -0,0 +1,205 @@
+import Foundation
+#if canImport(FoundationNetworking)
+    import FoundationNetworking
+#endif
+
+// Reads the feed-request CSV, keeps the rows whose timestamp carries today's date and prints the
+// matching tools.operations calls on a single line, joined with "§", for the workflow to execute.
+//
+// Usage: swift process_csv_in_github_action.swift <csv-url> <date-regex> <date-format>
+
+/// Zero-based position of each column in the CSV.
+enum column : Int, CaseIterable {
+    case timestamp = 0
+    case provider = 1
+    case regioncity = 2
+    case currenturl = 3
+    case updatednewsourceurl = 4
+    case datatype1 = 5
+    case request = 6
+    case downloadurl = 7
+    case country = 8
+    case subdivision_name = 9
+    case municipality = 10
+    case name = 11
+    case yournameorg = 12
+    case license_url = 13
+    case tripupdatesurl = 14
+    case servicealertsurl = 15
+    case genunknownrturl = 16
+    case authentication_type = 17
+    case authentication_info_url = 18
+    case api_key_parameter_name = 19
+    case note = 20
+    case gtfsschedulefeatures = 21
+    case gtfsschedulestatus = 22
+    case gtfsrealtimestatus = 23
+    case youremail = 24
+    case dataproduceremail = 25
+    case realtimefeatures = 26
+    case isocountrycode = 27
+    case feedupdatestatus = 28
+}
+
+/// Fallback values.
+enum defaults: String {
+    case date = "01/01/1970" // what extractDate(...) returns when it finds no usable date
+}
+
+/// Markers searched for in the `request` column.
+enum requestType: String {
+    case isAddNewFeed = "New source"
+    case isUpdateExistingFeed = "Source update"
+    case isToRemoveFeed = "removed"
+}
+
+/// Markers searched for in the `datatype1` column.
+enum dataType: String {
+    case schedule = "Schedule"
+    case realtime = "Realtime"
+}
+
+let arguments : [String] = CommandLine.arguments
+
+if CommandLine.argc == 4 {
+
+    let csvLineSeparator : String = "\n"
+    let csvColumnSeparator : String = ","
+
+    let csvURLStringArg : String = arguments[1] // the first argument [0] is the name of the script, we can ignore in this context.
+    let dateFormatGREPArg : String = arguments[2]
+    let dateFormatDesiredArg : String = arguments[3]
+
+    guard let csvURLasURL : URL = URL(string: csvURLStringArg) else {
+        print("\n ERROR: The specified URL does not appear to exist :\n \(csvURLStringArg)\n")
+        exit(1)
+    }
+
+    // Fixed locale, so today's date is rendered exactly the way extractDate(...) renders row dates.
+    let dateFormatter : DateFormatter = DateFormatter()
+    dateFormatter.locale = Locale(identifier: "en_US_POSIX")
+    dateFormatter.dateFormat = dateFormatDesiredArg
+    let todayDate : String = dateFormatter.string(from: Date()) // Ex.: 07/27/2023
+
+    let csvData : String = try String(contentsOf: csvURLasURL, encoding: .utf8)
+
+    // NOTE(review): rows and cells are cut on every "\n" and ","; a quoted cell containing either
+    // one is split as well. Confirm the sheet never produces such cells.
+    let csvArray : [[String]] = csvData
+        .components(separatedBy: csvLineSeparator)
+        .map { $0.components(separatedBy: csvColumnSeparator) }
+
+    var pythonCalls : [String] = []
+    let dateFormatAsRegex : Regex<AnyRegexOutput> = try Regex(dateFormatGREPArg)
+
+    // Rows with fewer cells than `column` has cases (blank or truncated lines) are skipped.
+    for line : [String] in csvArray where line.count >= column.allCases.count {
+
+        let timestamp : String = line[column.timestamp.rawValue].trimmingCharacters(in: .whitespacesAndNewlines)
+        let provider : String = line[column.provider.rawValue]
+        let datatype1 : String = line[column.datatype1.rawValue]
+        let request : String = line[column.request.rawValue]
+        let country : String = line[column.country.rawValue]
+        let subdivision_name : String = line[column.subdivision_name.rawValue]
+        let municipality : String = line[column.municipality.rawValue]
+        let name : String = line[column.name.rawValue]
+        let license_url : String = line[column.license_url.rawValue]
+        let downloadURL : String = line[column.downloadurl.rawValue]
+        let authentication_type : String = line[column.authentication_type.rawValue]
+        let authentication_info_url : String = line[column.authentication_info_url.rawValue]
+        let api_key_parameter_name : String = line[column.api_key_parameter_name.rawValue]
+        let note : String = line[column.note.rawValue]
+        let gtfsschedulefeatures : String = line[column.gtfsschedulefeatures.rawValue]
+        let gtfsschedulestatus : String = line[column.gtfsschedulestatus.rawValue]
+        let gtfsrealtimestatus : String = line[column.gtfsrealtimestatus.rawValue]
+        let realtimefeatures : String = line[column.realtimefeatures.rawValue]
+
+        let dateFromCurrentLine : String = extractDate(from: timestamp, usingGREP: dateFormatAsRegex, desiredDateFormat: dateFormatDesiredArg)
+
+        // Only rows added today are processed.
+        guard dateFromCurrentLine == todayDate else { continue }
+
+        // NOTE(review): cell values are pasted into the call text as-is (no quoting or escaping) and
+        // the update calls leave `mdb_source_id=` without a value. Confirm the cells already hold
+        // Python literals and how the id is meant to be supplied.
+        let addScheduleCall : String = "add_gtfs_schedule_source(provider=\(provider), country_code=\(country), direct_download_url=\(downloadURL), authentication_type=\(authentication_type), authentication_info_url=\(authentication_info_url), api_key_parameter_name=\(api_key_parameter_name), subdivision_name=\(subdivision_name), municipality=\(municipality), license_url=\(license_url), name=\(name), status=\(gtfsschedulestatus), features=\(gtfsschedulefeatures))"
+
+        // Emma: entity_type matches the realtime Data type options of Vehicle Positions, Trip Updates, or Service Alerts. If one of those three are selected, add it. If not, omit it.
+        let addRealtimeCall : String = "add_gtfs_realtime_source(entity_type=\(datatype1), provider=\(provider), direct_download_url=\(downloadURL), authentication_type=\(authentication_type), authentication_info_url=\(authentication_info_url), api_key_parameter_name=\(api_key_parameter_name), license_url=\(license_url), name=\(name), static_reference=\"TO_BE_PROVIDED\", note=\(note), status=\(gtfsrealtimestatus), features=\(realtimefeatures))"
+
+        // `shownName` is written after `name=`: the row's own name for an update, a marker for a removal.
+        func updateScheduleCall(name shownName: String) -> String {
+            "update_gtfs_schedule_source(mdb_source_id=, provider=\(provider), name=\(shownName), country_code=\(country), subdivision_name=\(subdivision_name), municipality=\(municipality), direct_download_url=\(downloadURL), authentication_type=\(authentication_type), authentication_info_url=\(authentication_info_url), api_key_parameter_name=\(api_key_parameter_name), license_url=\(license_url), status=\(gtfsschedulestatus), features=\(gtfsschedulefeatures))"
+        }
+
+        func updateRealtimeCall(name shownName: String) -> String {
+            "update_gtfs_realtime_source(mdb_source_id=, entity_type=\(datatype1), provider=\(provider), direct_download_url=\(downloadURL), authentication_type=\(authentication_type), authentication_info_url=\(authentication_info_url), api_key_parameter_name=\(api_key_parameter_name), license_url=\(license_url), name=\(shownName), static_reference=\"TO_BE_PROVIDED\", note=\(note), status=\(gtfsrealtimestatus), features=\(realtimefeatures))"
+        }
+
+        let removalName : String = "\"**** Requested for removal ****\""
+        let isSchedule : Bool = datatype1.contains(dataType.schedule.rawValue)
+        let isRealtime : Bool = datatype1.contains(dataType.realtime.rawValue)
+
+        if request.contains(requestType.isAddNewFeed.rawValue) { // add new feed
+            if isSchedule {
+                pythonCalls.append(addScheduleCall)
+            } else if isRealtime {
+                pythonCalls.append(addRealtimeCall)
+            }
+        } else if request.contains(requestType.isUpdateExistingFeed.rawValue) { // update existing feed
+            if isSchedule {
+                pythonCalls.append(updateScheduleCall(name: name))
+            } else if isRealtime {
+                pythonCalls.append(updateRealtimeCall(name: name))
+            }
+        } else if request.contains(requestType.isToRemoveFeed.rawValue) { // remove feed
+            if isSchedule {
+                pythonCalls.append(updateScheduleCall(name: removalName))
+            } else if isRealtime {
+                pythonCalls.append(updateRealtimeCall(name: removalName))
+            }
+        } else { // ... assume this is a new feed by default :: add_gtfs_schedule_source
+            pythonCalls.append(addScheduleCall)
+        }
+
+    } // END FOR LOOP
+
+    // return final output so the action can grab it and pass it on to the Python script.
+    print(pythonCalls.joined(separator: "§"))
+
+} else {
+    print("Incorrect number of arguments provided to the script. Expected 3: a string with the URL, a date format and the date format desired.")
+    exit(1)
+}
+
+// MARK: - FUNCTIONS
+
+/// Finds the first date in `theDateToConvert` and rewrites it in `desiredFormat`.
+///
+/// - Parameters:
+///   - theDateToConvert: Text that may contain a date, e.g. the timestamp cell of a row.
+///   - dateFormatAsGREP: Regex that locates the date inside the text.
+///   - desiredFormat: `DateFormatter` pattern of the returned string; also tried first when reading the match.
+/// - Returns: The date in `desiredFormat`, or `defaults.date` when nothing matches or the match cannot be read.
+func extractDate(from theDateToConvert: String, usingGREP dateFormatAsGREP: Regex<AnyRegexOutput>, desiredDateFormat desiredFormat: String) -> String {
+    guard let match = theDateToConvert.firstMatch(of: dateFormatAsGREP) else {
+        return defaults.date.rawValue
+    }
+    let matchedText = String(theDateToConvert[match.range])
+
+    // Fixed locale: the patterns are machine formats and must not follow the user's regional settings.
+    let dateFormatter = DateFormatter()
+    dateFormatter.locale = Locale(identifier: "en_US_POSIX")
+
+    // The workflow's pattern accepts both M/d/yyyy and yyyy-MM-dd, so read the match with either layout.
+    for inputFormat in [desiredFormat, "yyyy-MM-dd"] {
+        dateFormatter.dateFormat = inputFormat
+        if let date = dateFormatter.date(from: matchedText) {
+            dateFormatter.dateFormat = desiredFormat
+            return dateFormatter.string(from: date)
+        }
+    }
+
+    // default date if the match could not be read with any layout
+    return defaults.date.rawValue
+}
\ No newline at end of file