diff --git a/.github/workflows/project_feedback.yml b/.github/workflows/project_feedback.yml
new file mode 100644
index 000000000..bfa1ec4f0
--- /dev/null
+++ b/.github/workflows/project_feedback.yml
@@ -0,0 +1,31 @@
+name: Run Tests
+run-name: ${{ github.actor }} is running tests
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      # project and exercise feedback
+      - name: Make test script executable
+        run: chmod +x ./project/tests.sh
+
+      - name: Run project tests
+        run: ./project/tests.sh
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index d6d425f68..cc52fe553 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,8 @@
 .DS_Store
 /data/*
-!/data/.gitkeep
\ No newline at end of file
+!/data/.gitkeep
+.mypy_cache/
+temp_dir/
+project/__pycache__/
+*.pyc
+*.sqlite
\ No newline at end of file
diff --git a/exercises/exercise1.jv b/exercises/exercise1.jv
new file mode 100644
index 000000000..069d53922
--- /dev/null
+++ b/exercises/exercise1.jv
@@ -0,0 +1,74 @@
+pipeline FlugHafen{
+
+    //1. The FlugHafen pipeline connects the blocks via pipes to extract data from a CSV file
+    //   on the web and load it into a SQLite file sink.
+    FlugHafenHttpExtractor
+        -> FlugHafenTextFileInterpreter;
+
+    //2. The FlugHafenTextFileInterpreter output feeds the FlugHafenCsvFileInterpreter,
+    //   whose output is in turn passed to the FlugHafenDataSelector.
+    FlugHafenTextFileInterpreter
+        -> FlugHafenCsvFileInterpreter
+        // -> FlugHafenDatabaseWriter
+        -> FlugHafenDataSelector
+        -> FlugHafenTableInterpreter
+        -> FlugHafenLoader;
+
+    //3. The FlugHafenHttpExtractor block is of type HttpExtractor and specifies the URL.
+    block FlugHafenHttpExtractor oftype HttpExtractor {
+        // URL of the data source
+        url: "https://opendata.rhein-kreis-neuss.de/api/explore/v2.1/catalog/datasets/rhein-kreis-neuss-flughafen-weltweit/exports/csv?lang=en&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B";
+    }
+
+    //4. The FlugHafenTextFileInterpreter block is of type TextFileInterpreter.
+    block FlugHafenTextFileInterpreter oftype TextFileInterpreter { }
+
+    //5. Since we only need a specific range of the data, we use the CellRangeSelector block.
+    block FlugHafenDataSelector oftype CellRangeSelector {
+        // The cell range to keep: columns A to I, all rows
+        select: range A1:I*;
+    }
+
+    //6. The FlugHafenCsvFileInterpreter block is of type CSVInterpreter and specifies the delimiter.
+    block FlugHafenCsvFileInterpreter oftype CSVInterpreter {
+        // The CSV is semicolon-separated
+        delimiter: ';';
+    }
+
+    // block FlugHafenDatabaseWriter oftype DatabaseWriter {
+    //     // The name of the database
+    //     database: "flughafen.db";
+    //     // The name of the table
+    //     table: "flughafen";
+    // }
+
+    //7. The FlugHafenTableInterpreter block is of type TableInterpreter and lists the required columns.
+    block FlugHafenTableInterpreter oftype TableInterpreter {
+        // The first row contains the header
+        header: true;
+        // The columns of the table (headers as they appear in the source CSV)
+        columns: [
+            "Lfd. Nummmer" oftype integer,
+            "Name des Flughafens" oftype text,
+            "Ort" oftype text,
+            "Land" oftype text,
+            "IATA" oftype text,
+            "ICAO" oftype text,
+            "Latitude" oftype decimal,
+            "Longitude" oftype decimal,
+            "Altitude" oftype integer,
+        ];
+    }
+
+    //8. Finally, the FlugHafenLoader block is of type SQLiteLoader and specifies the table and file names.
+    block FlugHafenLoader oftype SQLiteLoader {
+        // The name of the table
+        table: "airports";
+        // The name of the file
+        file: "airports.sqlite";
+    }
+
+}
diff --git a/exercises/exercise2.jv b/exercises/exercise2.jv
new file mode 100644
index 000000000..f46db1c3c
--- /dev/null
+++ b/exercises/exercise2.jv
@@ -0,0 +1,85 @@
+pipeline TreePlanting{
+
+    //1. The TreePlanting pipeline connects the blocks via pipes to extract data from a CSV file
+    //   on the web and load it into a SQLite file sink.
+    TreePlantingHttpExtractor
+        -> TreePlantingTextFileInterpreter;
+
+    //2. The TreePlantingTextFileInterpreter output feeds the TreePlantingCsvFileInterpreter,
+    //   whose output is in turn passed to the TreePlantingBaumartDeutschDeleter.
+    TreePlantingTextFileInterpreter
+        -> TreePlantingCsvFileInterpreter
+        // -> TreePlantingDatabaseWriter
+        -> TreePlantingBaumartDeutschDeleter
+        -> TreePlantingTableInterpreter
+        -> TreePlantingLoader;
+
+    //3. The TreePlantingHttpExtractor block is of type HttpExtractor and specifies the URL.
+    block TreePlantingHttpExtractor oftype HttpExtractor {
+        // URL of the data source
+        url: "https://opendata.rhein-kreis-neuss.de/api/v2/catalog/datasets/stadt-neuss-herbstpflanzung-2023/exports/csv";
+    }
+
+    //4. The TreePlantingTextFileInterpreter block is of type TextFileInterpreter.
+    block TreePlantingTextFileInterpreter oftype TextFileInterpreter { }
+
+    //5. The TreePlantingCsvFileInterpreter block is of type CSVInterpreter and specifies the delimiter.
+    block TreePlantingCsvFileInterpreter oftype CSVInterpreter {
+        // The CSV is semicolon-separated
+        delimiter: ';';
+    }
+
+    //6. The TreePlantingBaumartDeutschDeleter block is of type ColumnDeleter and names the column to drop.
+    block TreePlantingBaumartDeutschDeleter oftype ColumnDeleter {
+        // Drop column E ("baumart_deutsch")
+        delete: [column E];
+    }
+
+    //7. The TreePlantingTableInterpreter block is of type TableInterpreter and lists the required columns.
+    block TreePlantingTableInterpreter oftype TableInterpreter {
+        // The first row contains the header
+        header: true;
+        // The columns of the table
+        columns: [
+            "lfd_nr" oftype integer,
+            "stadtteil" oftype Vogelsang,
+            "standort" oftype text,
+            "baumart_botanisch" oftype text,
+            "id" oftype GeoCoordinate,
+            "baumfamilie" oftype text,
+        ];
+    }
+
+    //8. Finally, the TreePlantingLoader block is of type SQLiteLoader and specifies the table and file names.
+    block TreePlantingLoader oftype SQLiteLoader {
+        // The name of the table
+        table: "trees";
+        // The name of the file
+        file: "trees.sqlite";
+    }
+
+    valuetype Vogelsang oftype text {
+        constraints: [
+            // only allow column values that start with "Vogelsang"
+            VogelsangStadtteil
+        ];
+    }
+
+    valuetype GeoCoordinate oftype text {
+        constraints: [
+            // only allow column values that match the pattern of a geo coordinate
+            Geopoints
+        ];
+    }
+
+    constraint VogelsangStadtteil on text: value matches(/^Vogelsang.*/);
+
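+    // Accepts "latitude, longitude" pairs such as "51.19, 6.69":
+    // 1-3 digits, a decimal point, then fractional digits for each value.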
+    constraint Geopoints on text: value matches(/^\d{1,3}\.\d+,\s*\d{1,3}\.\d+$/);
+
+}
diff --git a/exercises/exercise3.jv b/exercises/exercise3.jv
new file mode 100644
index 000000000..90d443400
--- /dev/null
+++ b/exercises/exercise3.jv
@@ -0,0 +1,92 @@
+pipeline WorldBank {
+
+    // 1. Block to extract an XLSX file from the web
+    block WorldBankHttpExtractor oftype HttpExtractor {
+        // URL of the source file containing the data
+        url: "https://thedocs.worldbank.org/en/doc/7d852628d96b9411d43e5d36d5dff941-0050062022/original/Graphs-Chapter-5-02082022.xlsx";
+    }
+
+    // 2. Block to interpret the downloaded file as an XLSX workbook
+    block WorldBankTextXLSXInterpreter oftype XLSXInterpreter { }
+
+    // 3. Block to select the specific sheet "Figure S5.1.2" from the workbook
+    block DataCellSheetpicker oftype SheetPicker {
+        sheetName: "Figure S5.1.2";
+    }
+
+    // 4. Block to specify the cell range of interest in the selected sheet
+    block WorldBankRangeSelector oftype CellRangeSelector {
+        select: range P2:S45;
+    }
+
+    // 5. Block to rename columns for clarity and standardization
+    block NameHeaderWriter oftype CellWriter {
+        at: range A1:D1;
+        write: ["Country Code", "Economy", "GDP per Capita", "Bond Issuance Share"];
+    }
+
+    // 6. Block to interpret and filter the "Bond Issuance Share" column data
+    block BondIssuanceTableInterpreter oftype TableInterpreter {
+        columns: [
+            "Country Code" oftype CountryCodeAlpha3,
+            "Bond Issuance Share" oftype BetweenZeroAndOne,
+        ];
+    }
+
+    // 7. Block to interpret and filter the "GDP per Capita" column data
+    block GDPTableInterpreter oftype TableInterpreter {
+        columns: [
+            "Country Code" oftype CountryCodeAlpha3,
+            "GDP per Capita" oftype PositiveDecimal,
+        ];
+    }
+
+    // 8. Block to load "Bond Issuance Share" data into a SQLite table
+    block BondIssuanceTableLoader oftype SQLiteLoader {
+        table: "bondIssuance"; // Table name in the SQLite database
+        file: "country-stats.sqlite"; // SQLite database file
+    }
+
+    // 9. Block to load "GDP per Capita" data into a SQLite table
+    block GDPTableLoader oftype SQLiteLoader {
+        table: "gdpPerCapita"; // Table name in the SQLite database
+        file: "country-stats.sqlite"; // SQLite database file
+    }
+
+    // 10. Value type to enforce positive decimal values
+    valuetype PositiveDecimal oftype decimal {
+        constraints: [
+            OnlyPositiveDecimal // Constraint: Values must be > 0
+        ];
+    }
+
+    // 11. Value type to enforce decimal values between 0 and 1
+    valuetype BetweenZeroAndOne oftype decimal {
+        constraints: [
+            BetweenZeroAndOneConstraint // Constraint: 0 <= value <= 1
+        ];
+    }
+
+    // 12. Constraint definitions
+    constraint OnlyPositiveDecimal on decimal: value > 0;
+    constraint BetweenZeroAndOneConstraint on decimal: value >= 0 and value <= 1;
+
+    // Pipeline connections
+    WorldBankHttpExtractor
+        -> WorldBankTextXLSXInterpreter;
+
+    WorldBankTextXLSXInterpreter
+        -> DataCellSheetpicker
+        -> WorldBankRangeSelector
+        -> NameHeaderWriter;
+
+    // Bond Issuance pipeline
+    NameHeaderWriter
+        -> BondIssuanceTableInterpreter
+        -> BondIssuanceTableLoader;
+
+    // GDP per Capita pipeline
+    NameHeaderWriter
+        -> GDPTableInterpreter
+        -> GDPTableLoader;
+}
diff --git a/exercises/exercise4.jv b/exercises/exercise4.jv
new file mode 100644
index 000000000..579294ee7
--- /dev/null
+++ b/exercises/exercise4.jv
@@ -0,0 +1,111 @@
+pipeline MobiliThek{
+
+    // Download the zipped measurement data
+    block DataExtractor oftype HttpExtractor {
+        url: "https://www.mowesta.com/data/measure/mowesta-dataset-20221107.zip";
+    }
+
+    // Unpack the zip archive
+    block DataInterpreter oftype ArchiveInterpreter {
+        archiveType: "zip";
+    }
+
+    // Pick the CSV file of interest from the archive
+    block FilePicker oftype FilePicker {
+        path: "/data.csv";
+    }
+
+    block TextFileInterpreter oftype TextFileInterpreter { }
+
+    // The CSV is semicolon-separated
+    block CSVInterpreter oftype CSVInterpreter {
+        delimiter: ';';
+    }
+
+    // block CellRangeSelector oftype CellRangeSelector {
+    //     select: range A1:J4873;
+    // }
+
+    // Keep only columns A-E and J; everything else is dropped
+    block DeleteSomeColumns oftype ColumnDeleter {
+        delete: [column F, column G, column H, column I, column K, column L, column M, column N, column O, column P, column Q,
+            column R, column S, column T, column U, column V, column W, column X, column Y, column Z,
+            column AA, column AB, column AC, column AD, column AE, column AF, column AG, column AH, column AI,
+            column AJ, column AK, column AL, column AM, column AN, column AO, column AP, column AQ, column AR,
+            column AS, column AT, column AU, column AV, column AW, column AX, column AY, column AZ, column BA,
+            column BB, column BC, column BD, column BE, column BF, column BG, column BH, column BI, column BJ,
+            column BK, column BL, column BM, column BN, column BO, column BP, column BQ, column BR, column BS,
+            column BT, column BU, column BV, column BW, column BX, column BY, column BZ, column CA, column CB,
+            column CC, column CD, column CE, column CF, column CG, column CH, column CI, column CJ, column CK,
+            column CL, column CM, column CN, column CO, column CP, column CQ, column CR, column CS, column CT,
+            column CU, column CV, column CW, column CX, column CY, column CZ, column DA, column DB, column DC,
+            column DD, column DE, column DF, column DG, column DH, column DI, column DJ, column DK, column DL,
+            column DM, column DN, column DO, column DP, column DQ, column DR, column DS, column DT, column DU,
+            column DV, column DW, column DX, column DY, column DZ, column EA, column EB, column EC, column ED,
+            column EE, column EF, column EG, column EH, column EI, column EJ, column EK, column EL, column EM,
+            column EN, column EO, column EP, column EQ, column ER, column ES, column ET, column EU, column EV,
+            column EW, column EX, column EY];
+    }
+
+    // Write clean English headers over the remaining six columns
+    block RenameColumns oftype CellWriter {
+        at: range A1:F1;
+        write: ["id", "producer", "model", "month", "temperature", "battery_temperature"];
+    }
+
+    block TempMeasurementTableInterpreter oftype TableInterpreter {
+        header: true;
+        columns: [
+            "id" oftype integer,
+            "producer" oftype text,
+            "model" oftype text,
+            "month" oftype BetweenOneAndTwelve,
+            "temperature" oftype decimal,
+            "battery_temperature" oftype decimal,
+        ];
+    }
+
+    // Convert the device temperature from Celsius to Fahrenheit in place
+    block TempMeasurementTransformer oftype TableTransformer {
+        inputColumns: [
+            "temperature",
+        ];
+        outputColumn: "temperature";
+        uses: FahrenheitUnit;
+    }
+
+    // Convert the battery temperature from Celsius to Fahrenheit in place
+    block TempMeasurementBatteryTransformer oftype TableTransformer {
+        inputColumns: [
+            "battery_temperature",
+        ];
+        outputColumn: "battery_temperature";
+        uses: FahrenheitUnit;
+    }
+
+    // Standard Celsius-to-Fahrenheit conversion
+    transform FahrenheitUnit {
+        from Celsius oftype decimal;
+        to Fahrenheit oftype decimal;
+        Fahrenheit: Celsius * 9/5 + 32;
+    }
+
+    // Months must lie between 1 and 12
+    valuetype BetweenOneAndTwelve oftype integer {
+        constraints: [
+            BetweenOneAndTwelveConstraint
+        ];
+    }
+
+    constraint BetweenOneAndTwelveConstraint on integer: value >= 1 and value <= 12;
+
+    block DataLoader oftype SQLiteLoader {
+        table: "temperatures";
+        file: "temperatures.sqlite";
+    }
+
+    DataExtractor
+        -> DataInterpreter
+        -> FilePicker
+        -> TextFileInterpreter
+        -> CSVInterpreter
+        // -> CommaSeparator
+        -> DeleteSomeColumns
+        -> RenameColumns
+        -> TempMeasurementTableInterpreter
+        -> TempMeasurementTransformer
+        -> TempMeasurementBatteryTransformer
+        -> DataLoader;
+}
\ No newline at end of file
diff --git a/exercises/exercise5.jv b/exercises/exercise5.jv
new file mode 100644
index 000000000..73335fbec
--- /dev/null
+++ b/exercises/exercise5.jv
@@ -0,0 +1,85 @@
+pipeline GTFS {
+
+    // Download the zipped GTFS feed
+    block GTFSDataExtractor oftype HttpExtractor{
+        url: "https://gtfs.rhoenenergie-bus.de/GTFS.zip";
+    }
+
+    block GTFSDataInterpreter oftype ArchiveInterpreter{
+        archiveType: "zip";
+    }
+
+    // Only stops.txt is needed from the archive
+    block GTFSFilePicker oftype FilePicker{
+        path: "/stops.txt";
+    }
+
+    // Interpret the file as UTF-8 so German umlauts survive
+    block GTFSTextFileInterpreter oftype TextFileInterpreter{
+        encoding: "utf8";
+    }
+
+    // GTFS uses comma-separated values with double-quote enclosures
+    block GTFSCSVInterpreter oftype CSVInterpreter{
+        enclosing: '"';
+        enclosingEscape: '"';
+    }
+
+    block GTFSTableInterpreter oftype TableInterpreter{
+        header: true;
+        columns: [
+            "stop_id" oftype integer,
+            "stop_name" oftype text,
+            "stop_lat" oftype BetweenPlusMinusNinety,
+            "stop_lon" oftype BetweenPlusMinusNinety,
+            "zone_id" oftype ValidZone,
+        ];
+    }
+
+    // valuetype GermanUmlauts oftype text{
+    //     constraints: [
+    //         GermanUmlautsConstraint
+    //     ];
+    // }
+    // // /^[A-Za-z0-9äöüÄÖÜß\s\-\.(),]*$/;
+    // // /^(?=.*[äöüÄÖÜß])[A-Za-z0-9äöüÄÖÜß]*$/;
+    // constraint GermanUmlautsConstraint on text: value matches /^[A-Za-z0-9äöüÄÖÜß\s\-\.(),]*$/;
+
+    // Latitude/longitude must lie within [-90, 90]
+    valuetype BetweenPlusMinusNinety oftype decimal{
+        constraints: [
+            BetweenPlusMinusNinetyConstraint
+        ];
+    }
+
+    constraint BetweenPlusMinusNinetyConstraint oftype RangeConstraint{
+        lowerBound: -90;
+        lowerBoundInclusive: true;
+        upperBound: 90;
+        upperBoundInclusive: true;
+    }
+
+    // Only keep stops that belong to zone 1925
+    valuetype ValidZone oftype integer{
+        constraints: [
+            ValidZoneConstraint
+        ];
+    }
+
+    // constraint ValidZoneConstraint oftype RangeConstraint{
+    //     lowerBound: 1925;
+    //     // upperBound: 1925;
+    // }
+
+    constraint ValidZoneConstraint on integer: value == 1925;
+
+    block GTFSDataLoader oftype SQLiteLoader{
+        table: "stops";
+        file: "gtfs.sqlite";
+    }
+
+    GTFSDataExtractor
+        -> GTFSDataInterpreter
+        -> GTFSFilePicker
+        -> GTFSTextFileInterpreter
+        -> GTFSCSVInterpreter
+        -> GTFSTableInterpreter
+        -> GTFSDataLoader;
+}
\ No newline at end of file
diff --git a/project/.__init__.py b/project/.__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/project/__pycache__/prepare_data.cpython-311.pyc b/project/__pycache__/prepare_data.cpython-311.pyc
new file mode 100644
index 000000000..8b6946d4c
Binary files /dev/null and b/project/__pycache__/prepare_data.cpython-311.pyc differ
diff --git a/project/__pycache__/run_tests.cpython-311.pyc b/project/__pycache__/run_tests.cpython-311.pyc
new file mode 100644
index 000000000..1bcdccc41
Binary files /dev/null and b/project/__pycache__/run_tests.cpython-311.pyc differ
diff --git a/project/data-report.pdf b/project/data-report.pdf
new file mode 100644
index 000000000..58a9e5426
Binary files /dev/null and b/project/data-report.pdf differ
diff --git a/project/pipeline.sh b/project/pipeline.sh
new file mode 100755
index 000000000..02b31d3ef
--- /dev/null
+++ b/project/pipeline.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+python3 ./project/prepare_data.py
diff --git a/project/prepare_data.py b/project/prepare_data.py
new file mode 100644
index 000000000..83b2a711b
--- /dev/null
+++ b/project/prepare_data.py
@@ -0,0 +1,183 @@
+import requests
+import zipfile
+import io
+import os
+import pandas as pd
+import shutil
+from sqlalchemy import create_engine, Table, Column, String, Float, MetaData
+
+
+def create_temp_directory(path="project/temp_dir"):
+    """Creates a temporary directory for storing extracted files."""
+    os.makedirs(path, mode=0o777, exist_ok=True)
+    return path
+
+
+def create_database_engine(db_path="data/world_data.sqlite"):
+    """Creates and returns an SQLAlchemy engine."""
+    return create_engine(f"sqlite:///{db_path}")
+
+
+def create_tables(metadata, year_columns):
+    """Defines and returns identical table structures for the LCN and NAC regions."""
+    tables = {}
+    for table_name in ("LCN", "NAC"):
+        tables[table_name] = Table(
+            table_name,
+            metadata,
+            Column("CountryName", String),
+            Column("CountryCode", String),
+            Column("IndicatorName", String),
+            Column("IndicatorCode", String),
+            Column("SourceNote", String),
+            Column("SourceOrganization", String),
+            *[Column(year, Float) for year in year_columns],
+        )
+    return tables
+
+
+def download_and_extract_data(urls, temp_dir):
+    """Downloads and extracts CSV files from the given URLs."""
+    for region, url in urls.items():
+        response = requests.get(url)
+        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
+            zip_ref.extractall(temp_dir)
+        print(f"Downloaded and extracted data for {region}")
+
+
+def load_main_and_metadata_files(temp_dir):
+    """Loads main and metadata file names from the temporary directory."""
+    # Sorting both lists pairs each API file with the Metadata_Indicator file
+    # of the same region, since os.listdir returns names in arbitrary order.
+    main_files = sorted(
+        f for f in os.listdir(temp_dir) if f.startswith("API") and f.endswith(".csv")
+    )
+    metadata_files = sorted(
+        f
+        for f in os.listdir(temp_dir)
+        if f.startswith("Metadata_Indicator") and f.endswith(".csv")
+    )
+    return main_files, metadata_files
+
+
+def clean_and_merge_data(main_file, metadata_file, temp_dir, year_columns):
+    """Cleans and merges the main data with metadata."""
+    # Load main data
+    main_df = pd.read_csv(
+        os.path.join(temp_dir, main_file), encoding="utf-8-sig", skiprows=4
+    )
+    main_df.fillna(0, inplace=True)
+    main_df = main_df.loc[:, ~main_df.columns.str.contains("^Unnamed")]
+
+    # Load metadata
+    metadata_df = pd.read_csv(
+        os.path.join(temp_dir, metadata_file), encoding="utf-8-sig"
+    )
+    metadata_df = metadata_df.loc[:, ~metadata_df.columns.str.contains("^Unnamed")]
+    metadata_df.rename(columns={"INDICATOR_CODE": "Indicator Code"}, inplace=True)
+    metadata_df = metadata_df.drop(columns=["INDICATOR_NAME"])
+
+    # Merge data and metadata on 'Indicator Code'
+    merged_df = main_df.merge(metadata_df, on="Indicator Code", how="outer")
+
+    # Rename columns to match the database table schema
+    merged_df.rename(
+        columns={
+            "Country Name": "CountryName",
+            "Country Code": "CountryCode",
+            "Indicator Name": "IndicatorName",
+            "Indicator Code": "IndicatorCode",
+            "SOURCE_NOTE": "SourceNote",
+            "SOURCE_ORGANIZATION": "SourceOrganization",
+        },
+        inplace=True,
+    )
+
+    # Rearrange columns
+    merged_df = merged_df[
+        [
+            "CountryName",
+            "CountryCode",
+            "IndicatorName",
+            "IndicatorCode",
+            "SourceNote",
+            "SourceOrganization",
+            *year_columns,
+        ]
+    ]
+
+    return merged_df
+
+
+def save_to_database(merged_df, table_name, tables, engine):
+    """Saves the merged data to the appropriate SQLite table."""
+    if table_name in tables:
+        merged_df.to_sql(table_name, engine, if_exists="append", index=False)
+        print(f"Merged data and metadata stored in SQLite table '{table_name}'")
+    else:
+        print(f"Table '{table_name}' not found in the database")
+
+
+def clean_up(temp_dir):
+    """Removes the temporary directory."""
+    shutil.rmtree(temp_dir)
+    print("Cleaned up temporary directory.")
+
+
+def main():
+    """Main function that orchestrates the entire process."""
+    urls = {
+        "north_america": "https://api.worldbank.org/v2/en/country/NAC?downloadformat=csv",
+        "latin_america_caribbean": "https://api.worldbank.org/v2/en/country/LCN?downloadformat=csv",
+    }
+
+    # Create the temp directory and database engine
+    temp_dir = create_temp_directory()
+    engine = create_database_engine()
+    print("Created temporary directory and database engine.")
+
+    # Define year columns (1960 to 2023)
+    year_columns = [str(year) for year in range(1960, 2024)]
+
+    # Create tables if they don't exist
+    metadata = MetaData()
+    tables = create_tables(metadata, year_columns)
+    metadata.create_all(engine)
+
+    # Download and extract data
+    download_and_extract_data(urls, temp_dir)
+
+    # Load main and metadata files
+    main_files, metadata_files = load_main_and_metadata_files(temp_dir)
+
+    # Process each file pair
+    for main_file, metadata_file in zip(main_files, metadata_files):
+        merged_df = clean_and_merge_data(
+            main_file, metadata_file, temp_dir, year_columns
+        )
+
+        # The region code (e.g. "LCN" or "NAC") embedded in the file name
+        # doubles as the table name
+        region = main_file.split("_")[1].split(".")[0]
+
+        # Save the merged data to the database
+        save_to_database(merged_df, region, tables, engine)
+
+    # Clean up the temporary directory
+    clean_up(temp_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/project/project-plan-example.md b/project/project-plan-example.md
new file mode 100644
index 000000000..109487d1e
--- /dev/null
+++ b/project/project-plan-example.md
@@ -0,0 +1,50 @@
+# Project Plan
+
+## A Visual Inspection of Regional Economic Trends: North America & Latin America Analysis
+
+This analysis uses World Bank economic and development indicator data to carry out a comprehensive visual inspection of how the economies of North America and Latin America & the Caribbean have developed. Key aspects to be covered include GDP, poverty, education, and investment trends.
+
+## Main Question
+
+* How have economic indicators (such as GDP, poverty rate, or education level) evolved over time across different countries in North America and Latin America & the Caribbean, and what trends or patterns can be identified in their development?
+
+## Description
+
+For this project, I am integrating and analyzing global economic and development indicators from the World Bank for two regions. The goal is to clean, merge, and store these indicators in an SQLite database to facilitate future analysis. The project begins by downloading zipped CSV files containing a wide range of economic indicators for North America and for Latin America & the Caribbean. These files are extracted from the compressed archives, cleaned by removing unnecessary columns, and merged with metadata that provides a description for each indicator. The data is then reshaped to a predefined schema with columns for country names, indicator codes, and values for each year from 1960 to 2023, and stored in the tables 'LCN' and 'NAC' of the world_data.sqlite database.
+
+## Datasources
+
+#### Datasource 1: North America Economic Data [World Bank](https://data.worldbank.org/region/north-america?view=chart)
+This dataset contains various economic and development indicators for countries in North America, including the United States, Canada, and Mexico. It covers GDP, poverty rates, education, healthcare, and other socio-economic indicators over time (1960 to 2023). The data is provided by the World Bank in CSV format and will be used to examine economic development and trends in the North American region. It is available for download through a World Bank API, which delivers the data as a zip file.
+
+#### Datasource 2: Latin America & Caribbean Economic Data [World Bank](https://data.worldbank.org/region/latin-america-and-caribbean?view=chart)
+Like the North American dataset, this dataset provides economic and development indicators for countries in Latin America and the Caribbean, covering a wide range of data on economic growth, social development, and health. It spans the years 1960 to 2023 and will be used to analyze trends in economic and social development for the region. The data is also provided in CSV format and can be accessed through the World Bank API as a compressed zip file.
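+The snippet below is a minimal sketch of how one of these zip exports can be fetched and unpacked; it mirrors the `download_and_extract_data` step in `prepare_data.py`. The URL is the same one used there; the target directory is just an example:
+
+```python
+import io
+import zipfile
+
+import requests
+
+# Example: Latin America & Caribbean bundle (LCN) as a zipped CSV export
+url = "https://api.worldbank.org/v2/en/country/LCN?downloadformat=csv"
+response = requests.get(url)
+response.raise_for_status()
+
+# The API returns a zip archive in the response body; extract it in memory
+with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
+    archive.extractall("project/temp_dir")
+    print(archive.namelist())
+```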
+Both regional bundles are linked from the [World Bank website](https://data.worldbank.org/).
+
+### Datasource: World Bank Dataset
+* Metadata URL: https://data.worldbank.org/country
+* Data source 1: https://data.worldbank.org/region/north-america?view=chart
+* Data source 2: https://data.worldbank.org/region/latin-america-and-caribbean?view=chart
+* Data Type: CSV
+
+### License
+[Creative Commons Attribution 4.0 International License (CC BY 4.0)](https://www.worldbank.org/en/about/legal/terms-of-use-for-datasets)
+
+## Work Packages
+
+- [x] Data Collection and Cleaning [#1][i1]
+- [ ] Exploratory Data Analysis [#2][i1]
+- [ ] Visualization Development [#3][i1]
+- [ ] Comparison of Investment Trends [#4][i1]
+- [ ] Final Report and Presentation [#5][i1]
+
+[i1]: https://github.com/jvalue/made-template/issues/1
diff --git a/project/run_tests.py b/project/run_tests.py
new file mode 100644
index 000000000..8ac2c1828
--- /dev/null
+++ b/project/run_tests.py
@@ -0,0 +1,75 @@
+import unittest
+from unittest.mock import patch, MagicMock
+
+import pandas as pd
+
+from prepare_data import (
+    create_temp_directory,
+    create_database_engine,
+    create_tables,
+    download_and_extract_data,
+    load_main_and_metadata_files,
+    save_to_database,
+    clean_up,
+)
+
+
+class TestDataProcessing(unittest.TestCase):
+
+    @patch("os.makedirs")
+    def test_create_temp_directory(self, mock_makedirs):
+        path = create_temp_directory()
+        self.assertTrue(mock_makedirs.called)
+        self.assertEqual(path, "project/temp_dir")
+
+    @patch("prepare_data.create_engine")
+    def test_create_database_engine(self, mock_create_engine):
+        db_path = "data/world_data.sqlite"
+        engine = create_database_engine()
+        mock_create_engine.assert_called_with(f"sqlite:///{db_path}")
+        self.assertIsNotNone(engine)
+
+    @patch("requests.get")
+    @patch("zipfile.ZipFile")
+    def test_download_and_extract_data(self, mock_zipfile, mock_requests):
+        mock_response = MagicMock()
+        mock_response.content = b"dummy content"
+        mock_requests.return_value = mock_response
+
+        urls = {
+            "north_america": "https://api.worldbank.org/v2/en/country/NAC?downloadformat=csv",
+            "latin_america_caribbean": "https://api.worldbank.org/v2/en/country/LCN?downloadformat=csv",
+        }
+        temp_dir = "temp_dir"
+
+        download_and_extract_data(urls, temp_dir)
+        self.assertTrue(mock_zipfile.called)
+
+    @patch("os.listdir")
+    def test_load_main_and_metadata_files(self, mock_listdir):
+        mock_listdir.return_value = ["API_data.csv", "Metadata_Indicator.csv"]
+        main_files, metadata_files = load_main_and_metadata_files("temp_dir")
+
+        self.assertEqual(main_files, ["API_data.csv"])
+        self.assertEqual(metadata_files, ["Metadata_Indicator.csv"])
+
+    @patch("pandas.DataFrame.to_sql")
+    def test_save_to_database(self, mock_to_sql):
+        merged_df = pd.DataFrame()
+        tables = {
+            "NAC": MagicMock(),  # Mock for North America table
+            "LCN": MagicMock(),  # Mock for Latin America/Caribbean table
+        }
+
+        save_to_database(merged_df, "NAC", tables, None)
+        self.assertTrue(mock_to_sql.called)
+
+    @patch("shutil.rmtree")
+    def test_clean_up(self, mock_rmtree):
+        clean_up("temp_dir")
+        self.assertTrue(mock_rmtree.called)
+
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
diff --git a/project/tests.sh b/project/tests.sh
new file mode 100755
index 000000000..d059f1546
--- /dev/null
+++ b/project/tests.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+python3 ./project/run_tests.py
\ No newline at end of file
diff --git a/project/version_check.py b/project/version_check.py
new file mode 100644
index 000000000..723123ea5
--- /dev/null
+++ b/project/version_check.py
@@ -0,0 +1,14 @@
+import pkg_resources
+
+def check_versions(package_list):
+    installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set}
+    for package in package_list:
+        version = installed_packages.get(package.lower(), "Not installed")
+        print(f"{package}: {version}")
+
+# Only pip-installed distributions can be reported; standard-library modules
+# such as zipfile and shutil never appear in pkg_resources.working_set.
+selected_packages = ["numpy", "pandas", "scikit-learn", "matplotlib", "sqlalchemy", "requests"]
+
+if __name__ == "__main__":
+    check_versions(selected_packages)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..b36213ddf
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+numpy==1.26.4
+pandas==1.5.3
+scikit-learn==1.2.2
+matplotlib==3.8.0
+sqlalchemy==2.0.25
+seaborn
+pytest
+requests
\ No newline at end of file