feat: Create Python script to track usage of best practices in Mobili…

…ty Database feeds #72 (#275) * feat: script to generate report * fix: changed name to respect constraints * feat: updated report --------- Co-authored-by: Jingsi Lu <[email protected]>
MobilityData · Sep 26, 2023 · 07b58e2 · 07b58e2
1 parent eb4cbdd
commit 07b58e2
Show file tree

Hide file tree

Showing 7 changed files with 436 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -134,4 +134,6 @@ dmypy.json
 
 # Mac
 */.DS_Store
-.DS_Store
+.DS_Store
+
+cebc62a4-ed30-4d1b-9816-53b3376baabc/
diff --git a/compliance_track/README.md b/compliance_track/README.md
@@ -0,0 +1,38 @@
+# Best practices and bad practices tracking
+
+This is a Python script to track usage of best practices and bad practices in the Mobility Database feeds.
+
+## Table of Contents
+
+* [Installation](#installation)
+* [Run the script](#run-the-script)
+
+## Installation
+### Gcloud installation
+Install the `gcloud CLI` following the instructions in the [official documentation](https://cloud.google.com/sdk/docs/install) and authenticate yourself.
+
+Once it's completed, make sure you can access the [mobilitydata-gtfs-validation-results bucket](https://console.cloud.google.com/storage/browser/mobilitydata-gtfs-validation-results;tab=objects?forceOnBucketsSortingFiltering=true&project=md-poc-playground&supportedpurview=project&prefix=&forceOnObjectsSortingFiltering=false).
+
+### Python environment
+Create a Python virtual environment as described [here](https://github.com/MobilityData/mobility-database-catalogs/blob/main/README.md#installation).
+Once the installation steps described there are completed successfully, install `xlsxwriter`:
+```sh
+$ pip install xlsxwriter
+```
+
+### Retrieve reports from the Google Cloud bucket
+After activating the virtual environment, go to the root directory of the mobility-database-catalogs repository and run the following commands:
+```sh
+$ pip install gsutil
+$ gsutil -m cp -r "gs://mobilitydata-gtfs-validation-results/reports/2023-06-06T02:45/cebc62a4-ed30-4d1b-9816-53b3376baabc" .
+```
+
+## Run the script
+Simply run:
+```sh
+$ python3 -m compliance_track.main 
+```
+To produce the report containing details about the practices under discussion, run:
+```sh
+$ python3 -m compliance_track.details
+```
diff --git a/compliance_track/__init__.py b/compliance_track/__init__.py
diff --git a/compliance_track/constants.py b/compliance_track/constants.py
@@ -0,0 +1,121 @@
+from zipfile import ZipFile
+
+import pandas as pd
+
+from compliance_track.validation import has_defined_values, extension_file_has_columns, cross_validate_blocks, \
+    current_validator, validate_sub_directory_exists, validate_shape_dist_traveled
+
+# Column names used as keys of the rule tables built below.
+FILE = "file"
+FIELD = "field"
+RULE_TO_COUNT = "rule to count instance"
+VALIDATOR = "validator"
+RULE_ID = "rule_id"
+# Name of the local folder holding the validation reports; main.py checks
+# that it exists and lists its content to locate each report.json.
+GC_COPY_PATH = "cebc62a4-ed30-4d1b-9816-53b3376baabc"
+
+# Table of the best-practice rules that are tracked, one row per rule.
+#   FILE          - file the rule applies to; it is also the value passed to the
+#                   validator as `extension_file`.
+#   FIELD         - field the rule applies to ("" when the rule targets the file).
+#   RULE_TO_COUNT - human-readable description, written as-is to the report.
+#   VALIDATOR     - callable invoked as
+#                   validator(file_path, report_folder_path, extension_file);
+#                   its result is stored in the results table for the source.
+BEST_PRACTICES_RULES = pd.DataFrame([
+    {
+        FILE: "routes.txt",
+        FIELD: "route_short_name",
+        RULE_TO_COUNT: "route_short_name is !empty AND routes.route_long_name is empty",
+        VALIDATOR: lambda file_path, report_folder_path, extension_file:
+        has_defined_values(file_path, extension_file, "route_short_name") and
+        has_defined_values(file_path, extension_file, "route_long_name", check_undefined=True)
+    },
+    {
+        FILE: "routes.txt",
+        FIELD: "agency_id",
+        RULE_TO_COUNT: "agency_id is !empty AND there is only one agency_id in agency.txt",
+        VALIDATOR: lambda file_path, report_folder_path, extension_file:
+        current_validator(report_folder_path, 'missing_recommended_field', extension_file, 'agency_id')
+    },
+    {
+        FILE: "agency.txt",
+        FIELD: "agency_id",
+        RULE_TO_COUNT: "agency_id is !empty AND there is only one agency_id in agency.txt",
+        VALIDATOR: lambda file_path, report_folder_path, extension_file:
+        current_validator(report_folder_path, 'missing_recommended_field', extension_file, 'agency_id')
+    },
+    {
+        FILE: "fare_attributes.txt",
+        FIELD: "agency_id",
+        RULE_TO_COUNT: "agency_id is !empty AND there is only one agency_id in agency.txt",
+        VALIDATOR: lambda file_path, report_folder_path, extension_file:
+        current_validator(report_folder_path, 'missing_recommended_field', extension_file, 'agency_id')
+    },
+    {
+        FILE: "feed_info.txt",
+        FIELD: "",
+        RULE_TO_COUNT: "feed_info.txt is !empty AND there is no translations.txt file",
+        VALIDATOR: lambda file_path, report_folder_path, extension_file:
+        current_validator(report_folder_path, 'missing_recommended_file', extension_file, None)
+    },
+    {
+        FILE: "feed_info.txt",
+        FIELD: "feed_start_date",
+        RULE_TO_COUNT: "field is !empty",
+        VALIDATOR: lambda file_path, report_folder_path, extension_file:
+        current_validator(report_folder_path, 'missing_recommended_field', extension_file, 'feed_start_date')
+    },
+    {
+        FILE: "feed_info.txt",
+        FIELD: "feed_end_date",
+        RULE_TO_COUNT: "field is !empty",
+        VALIDATOR: lambda file_path, report_folder_path, extension_file:
+        current_validator(report_folder_path, 'missing_recommended_field', extension_file, 'feed_end_date')
+    },
+    {
+        FILE: "feed_info.txt",
+        FIELD: "feed_version",
+        RULE_TO_COUNT: "field is !empty",
+        VALIDATOR: lambda file_path, report_folder_path, extension_file:
+        current_validator(report_folder_path, 'missing_recommended_field', extension_file, 'feed_version')
+    },
+    {
+        FILE: "feed_info.txt",
+        FIELD: "feed_contact_email",
+        RULE_TO_COUNT: "feed_contact_email is !empty AND there is no feed_contact_url",
+        VALIDATOR: lambda file_path, report_folder_path, extension_file:
+        has_defined_values(file_path, extension_file, "feed_contact_url", check_undefined=True)
+        and has_defined_values(file_path, extension_file, "feed_contact_email")
+    },
+    {
+        FILE: "feed_info.txt",
+        FIELD: "feed_contact_url",
+        RULE_TO_COUNT: "feed_contact_url is !empty AND there is no feed_contact_email",
+        VALIDATOR: lambda file_path, report_folder_path, extension_file:
+        has_defined_values(file_path, extension_file, "feed_contact_email", check_undefined=True)
+        and has_defined_values(file_path, extension_file, "feed_contact_url")
+    },
+    {
+        FILE: "stop_times.txt",
+        FIELD: "timepoint",
+        RULE_TO_COUNT: "column exists",
+        VALIDATOR: lambda file_path, report_folder_path, extension_file:
+        current_validator(report_folder_path, 'missing_timepoint_column', extension_file, None)
+    },
+    {
+        FILE: "trips.txt",
+        FIELD: "block_id",
+        # The GTFS file is named frequencies.txt (the description used to say "frequences.txt").
+        RULE_TO_COUNT: "block_id is !empty AND the row with block_id has a trip_id that is included in frequencies.txt",
+        VALIDATOR: lambda file_path, report_folder_path, extension_file: cross_validate_blocks(file_path)
+    }
+])
+
+# Rule identifiers are sequential and 1-based (rule_1, rule_2, ...), following the row order above.
+BEST_PRACTICES_RULES[RULE_ID] = [f"rule_{i}" for i in range(1, len(BEST_PRACTICES_RULES) + 1)]
+
+
+# Table of the practices under review, one row per rule. Each validator is
+# called with three positional arguments but only uses the first one (the
+# path that is forwarded to the validation helper).
+BAD_PRACTICES_RULES = pd.DataFrame([
+    {
+        FILE: "zip subfolder within feed",
+        FIELD: "",
+        RULE_TO_COUNT: "zip subfolder exists",
+        VALIDATOR: lambda file_path, _report_folder_path, _extension_file:
+        validate_sub_directory_exists(file_path),
+    },
+    {
+        FILE: "",
+        FIELD: "shape_dist_traveled",
+        RULE_TO_COUNT: "stop_times.shape_dist_traveled exceeds maximum shapes.shape_dist_traveled",
+        VALIDATOR: lambda file_path, _report_folder_path, _extension_file:
+        validate_shape_dist_traveled(file_path),
+    },
+])
+# Rule identifiers are sequential and 1-based (rule_1, rule_2, ...), following the row order above.
+BAD_PRACTICES_RULES[RULE_ID] = [
+    f"rule_{number}" for number in range(1, BAD_PRACTICES_RULES.shape[0] + 1)
+]
diff --git a/compliance_track/details.py b/compliance_track/details.py
@@ -0,0 +1,54 @@
+import os
+
+import pandas as pd
+from requests import RequestException
+
+from compliance_track.validation import download_latest_dataset, get_sub_directories, get_exceeded_shape_dist
+from tools.constants import GTFS
+from tools.operations import get_sources
+
+pd.options.mode.chained_assignment = None
+
+# script to get details for rules under review
+if __name__ == '__main__':
+    # retrieve data
+    dataset = get_sources(GTFS)
+    # one row per source whose archive contains sub folders
+    rule_1_results = pd.DataFrame({
+        'mdb_id': [],
+        'sub_folders_titles': []
+    })
+    # rows returned by get_exceeded_shape_dist, tagged with the source id
+    rule_2_results = pd.DataFrame({
+        'mdb_id': [],
+        'trip_id': [],
+        'shape_id': [],
+        'max_stop_times': [],
+        'max_shapes': [],
+        'relative_diff': [],
+    })
+
+    for data in dataset.values():
+        mdb_id = data['mdb_source_id']
+        try:
+            dataset_path = download_latest_dataset(data)
+        except RequestException:
+            # best effort: skip the sources whose dataset cannot be downloaded
+            continue
+
+        try:
+            # get results for subfolders
+            sub_folders_names = get_sub_directories(dataset_path)
+            if len(sub_folders_names) > 0:
+                # DataFrame.append was removed in pandas 2.0: build a one-row
+                # frame and concatenate it instead.
+                new_row = pd.DataFrame(
+                    [[mdb_id, ", ".join(sub_folders_names)]],
+                    columns=rule_1_results.columns
+                )
+                rule_1_results = pd.concat([rule_1_results, new_row], ignore_index=True)
+
+            # get results for exceeded max distance
+            exceeded_max_dist = get_exceeded_shape_dist(dataset_path)
+            if exceeded_max_dist is not None and len(exceeded_max_dist) > 0:
+                exceeded_max_dist['mdb_id'] = mdb_id
+                rule_2_results = pd.concat([rule_2_results, exceeded_max_dist], axis=0)
+        finally:
+            # clean up, even when one of the analysis steps above raises
+            os.remove(dataset_path)
+        # progress indicator
+        print(mdb_id)
+    with pd.ExcelWriter('details.xlsx', engine='xlsxwriter') as writer:
+        rule_1_results.to_excel(writer, sheet_name='Subfolders Details', index=False)
+        rule_2_results.to_excel(writer, sheet_name='Max Dist Details', index=False)
+    print('Completed. Check details.xlsx file.')
diff --git a/compliance_track/main.py b/compliance_track/main.py
@@ -0,0 +1,96 @@
+import os
+
+import numpy as np
+import pandas as pd
+from requests import RequestException
+
+from compliance_track.constants import BEST_PRACTICES_RULES, GC_COPY_PATH, BAD_PRACTICES_RULES, VALIDATOR
+from compliance_track.validation import download_latest_dataset
+from tools.constants import GTFS
+from tools.operations import get_sources
+
+pd.options.mode.chained_assignment = None
+
+
+def validate_practices(practices, results):
+    """Run every rule of `practices` for the source currently being processed.
+
+    Relies on the module-level `mdb_id`, `dataset_path` and
+    `report_folder_path` assigned by the `__main__` loop, so it has to be
+    called from inside that loop. Each rule's validator receives the dataset
+    path, the report path and the rule's `file` value; what it returns is
+    stored in `results`, in the rule's column, on the row(s) whose `mdb_id`
+    matches the current source.
+    """
+    for _, rule in practices.iterrows():
+        outcome = rule.validator(dataset_path, report_folder_path, rule.file)
+        current_source_rows = results.mdb_id == mdb_id
+        results.loc[current_source_rows, rule.rule_id] = outcome
+
+
+def init_results_container(practices):
+    """Build the results table for `practices`, with every rule set to False.
+
+    The table has one `mdb_id` column filled with the keys of the module-level
+    `dataset` mapping, followed by one column per rule id.
+    """
+    rule_columns = list(practices.rule_id)
+    container = pd.DataFrame(columns=["mdb_id", *rule_columns])
+    container["mdb_id"] = list(dataset.keys())
+    # no rule is considered satisfied until its validator has run
+    container[practices.rule_id] = False
+    return container
+
+
+def write_results(file_writer, practices, results, prefix):
+    """Write the three sheets of one practice family to `file_writer`.
+
+    Sheets, all named after `prefix` and written without the index:
+    "<prefix> Rules" (the rule table without its validator column),
+    "<prefix> Results" (`results` as given) and "<prefix> Count" (a single
+    row with the number of non-empty cells of each `results` column).
+    """
+    rules_sheet = practices.drop(columns=[VALIDATOR])
+    rules_sheet.to_excel(file_writer, sheet_name=f'{prefix} Rules', index=False)
+
+    results.to_excel(file_writer, sheet_name=f'{prefix} Results', index=False)
+
+    counts_row = pd.DataFrame(results.count()).T
+    counts_row.to_excel(file_writer, sheet_name=f'{prefix} Count', index=False)
+
+
+def format_results(practices, results):
+    """Turn the boolean results table into one column of source ids per rule.
+
+    For each rule id of `practices`, the column lists the `mdb_id` values of
+    the `results` rows where that rule is truthy. Columns shorter than the
+    longest one are padded with NaN; a rule without any match keeps an empty
+    (all-NaN) column.
+    """
+    listing = pd.DataFrame(columns=practices.rule_id)
+    for rule_id in practices.rule_id:
+        matching_ids = list(results[results[rule_id]].mdb_id)
+        if not matching_ids:
+            continue
+        missing_rows = len(listing) - len(matching_ids)
+        if missing_rows < 0:
+            # grow the table so that it can hold the longest column seen so far
+            listing = listing.reindex(index=range(len(matching_ids)))
+        else:
+            # pad the shorter column up to the current table height
+            matching_ids += [np.nan] * missing_rows
+        listing[rule_id] = matching_ids
+    return listing
+
+
+if __name__ == '__main__':
+    # local import: only this entry point needs it
+    import sys
+
+    # retrieve report folders available
+    if not os.path.exists(GC_COPY_PATH):
+        print('Please import report data using gsutil as described in README.md. Make sure the data is included in the'
+              ' root of mobility-database-catalogs.')
+        # sys.exit instead of the interactive-only exit() helper injected by the site module
+        sys.exit(1)
+    report_results_folders = os.listdir(GC_COPY_PATH)
+    report_results_folders = [f'{GC_COPY_PATH}/{folder_name}/report-output-4.1.0/report.json'
+                              for folder_name in report_results_folders]
+
+    # retrieve data
+    # NOTE: `dataset`, `mdb_id`, `dataset_path` and `report_folder_path` are read as
+    # module globals by init_results_container and validate_practices; keep these names.
+    dataset = get_sources(GTFS)
+
+    best_practice_results = init_results_container(BEST_PRACTICES_RULES)
+    bad_practice_results = init_results_container(BAD_PRACTICES_RULES)
+
+    for data in dataset.values():
+        mdb_id = data['mdb_source_id']
+
+        # the report of a source lives in the folder whose name ends with "-<mdb_id>"
+        matching_reports = []
+        for report_path in report_results_folders:
+            path_parts = report_path.split('/')
+            if len(path_parts) > 1 and path_parts[1].endswith(f'-{mdb_id}'):
+                matching_reports.append(report_path)
+        # skip the sources without a report, or with an ambiguous match
+        if len(matching_reports) != 1:
+            continue
+
+        report_folder_path = matching_reports[0]
+
+        # retrieve data
+        try:
+            dataset_path = download_latest_dataset(data)
+        except RequestException:
+            # best effort: skip the sources whose dataset cannot be downloaded
+            continue
+
+        try:
+            # validate compliance
+            validate_practices(BEST_PRACTICES_RULES, best_practice_results)
+            validate_practices(BAD_PRACTICES_RULES, bad_practice_results)
+        finally:
+            # clean up, even when a validator raises
+            os.remove(dataset_path)
+        # progress indicator
+        print(mdb_id)
+
+    # formatting and saving the results
+    final_results_best_practices = format_results(BEST_PRACTICES_RULES, best_practice_results)
+    final_results_bad_practices = format_results(BAD_PRACTICES_RULES, bad_practice_results)
+
+    # write results
+    with pd.ExcelWriter('output.xlsx', engine='xlsxwriter') as writer:
+        write_results(writer, BEST_PRACTICES_RULES, final_results_best_practices, 'Best Practices')
+        write_results(writer, BAD_PRACTICES_RULES, final_results_bad_practices, 'Practice Review')
+    print('Completed. Check output.xlsx file.')